diff --git a/lib/Db/LoginAddressAggregatedMapper.php b/lib/Db/LoginAddressAggregatedMapper.php index 332f2b18..a7918ae9 100644 --- a/lib/Db/LoginAddressAggregatedMapper.php +++ b/lib/Db/LoginAddressAggregatedMapper.php @@ -70,7 +70,9 @@ private function findHistoricIpv4(int $threshold, int $maxAge): array { $qb->expr()->like('ip', $qb->createNamedParameter('_%._%._%._%')), $qb->expr()->gte('last_seen', $qb->createNamedParameter($maxAge)), $qb->expr()->lte('first_seen', $qb->createNamedParameter($threshold)) - )); + )) + ->orderBy('last_seen', 'DESC') // Keep the most recent rows when the limit truncates the result + ->setMaxResults(15_000); // More data will likely exhaust memory return $this->findEntities($query); } @@ -84,7 +86,9 @@ private function findRecentIpV4(int $threshold): array { ->where($qb->expr()->andX( $qb->expr()->like('ip', $qb->createNamedParameter('_%._%._%._%')), $qb->expr()->gt('last_seen', $qb->createNamedParameter($threshold)) - )); + )) + ->orderBy('last_seen', 'DESC') // Keep the most recent rows when the limit truncates the result + ->setMaxResults(3_000); // More data will likely exhaust memory return $this->findEntities($query); } @@ -148,7 +152,9 @@ private function findHistoricIpv6(int $threshold, int $maxAge): array { $qb->expr()->notLike('ip', $qb->createNamedParameter('_%._%._%._%')), $qb->expr()->gte('last_seen', $qb->createNamedParameter($maxAge)), $qb->expr()->lte('first_seen', $qb->createNamedParameter($threshold)) - )); + )) + ->orderBy('last_seen', 'DESC') // Keep the most recent rows when the limit truncates the result + ->setMaxResults(15_000); // More data will likely exhaust memory return $this->findEntities($query); } @@ -162,7 +168,9 @@ private function findRecentIpV6(int $threshold): array { ->where($qb->expr()->andX( $qb->expr()->notLike('ip', $qb->createNamedParameter('_%._%._%._%')), $qb->expr()->gt('last_seen', $qb->createNamedParameter($threshold)) - )); + )) + ->orderBy('last_seen', 'DESC') // Keep the most recent rows when the limit truncates the result + ->setMaxResults(3_000); // More data will 
likely exhaust memory return $this->findEntities($query); } diff --git a/lib/Service/DataLoader.php b/lib/Service/DataLoader.php index 9b99509f..70d9ce71 100644 --- a/lib/Service/DataLoader.php +++ b/lib/Service/DataLoader.php @@ -22,8 +22,11 @@ use function floor; use function log; use function max; +use function random_int; class DataLoader { + private const MAX_SAMPLES_POSITIVES = 15_000; + private const MAX_SAMPLES_VALIDATE_POSITIVES = 3_000; /** @var LoginAddressAggregatedMapper */ private $loginAddressMapper; @@ -65,6 +68,14 @@ public function loadTrainingAndValidationData(TrainingDataConfig $dataConfig, $positives = $this->addressesToDataSet($historyRaw, $strategy); $validationPositives = $this->addressesToDataSet($recentRaw, $strategy); + if ($positives->count() > self::MAX_SAMPLES_POSITIVES) { /* Over the cap: randomly down-sample; $threshold is the percentage of samples to keep (cap / count * 100) */ + $threshold = (self::MAX_SAMPLES_POSITIVES / $positives->count()) * 100; + $positives = $positives->filter(fn () => random_int(0, 100) <= $threshold); /* NOTE(review): random_int(0, 100) has 101 possible outcomes, so the keep rate is slightly above $threshold percent; presumably filter() runs the callback per sample, in which case the result may still exceed the cap - confirm */ + } + if ($validationPositives->count() > self::MAX_SAMPLES_VALIDATE_POSITIVES) { /* Same random down-sampling for the validation positives */ + $threshold = (self::MAX_SAMPLES_VALIDATE_POSITIVES / $validationPositives->count()) * 100; + $validationPositives = $validationPositives->filter(fn () => random_int(0, 100) <= $threshold); /* NOTE(review): same 0..100 inclusive range caveat as above - confirm */ + } return new CollectedData( $positives,