diff --git a/lib/Service/NegativeSampleGenerator.php b/lib/Service/NegativeSampleGenerator.php index 8286f90a..6081bd13 100644 --- a/lib/Service/NegativeSampleGenerator.php +++ b/lib/Service/NegativeSampleGenerator.php @@ -22,6 +22,10 @@ use function str_split; class NegativeSampleGenerator { + /** + * Get IP vectors exclusively used by one user. + * Includes the user vector in the second dimension of the returned array. + */ private function getUniqueIPsPerUser(Dataset $positives): array { $map = []; @@ -35,7 +39,7 @@ private function getUniqueIPsPerUser(Dataset $positives): array { $map[$ipVecStr] = [ $uidVecStr, ]; - } else { + } elseif (!in_array($uidVecStr, $map[$ipVecStr], true)) { $map[$ipVecStr][] = $uidVecStr; } } diff --git a/tests/Unit/Service/NegativeSampleGeneratorTest.php b/tests/Unit/Service/NegativeSampleGeneratorTest.php index 246f53e9..2b1cd611 100644 --- a/tests/Unit/Service/NegativeSampleGeneratorTest.php +++ b/tests/Unit/Service/NegativeSampleGeneratorTest.php @@ -128,6 +128,37 @@ public function testGenerateMultipleShuffledFromLimitedUnique(): void { self::assertCount(5, $result); } + /** + * DataSet can consist of multiple unique entries only. If not handled correctly, + * this will result in an array without any IP. This tests the + * correct handling. See GitHub issue #860 for more. 
+ * @return void + */ + public function testGenerateMultipleShuffledFromUniquesOnly(): void { + $positives = new Unlabeled([ + array_merge(self::decToBitArray(1, 16), self::decToBitArray(1, 32)), + array_merge(self::decToBitArray(1, 16), self::decToBitArray(1, 32)), + array_merge(self::decToBitArray(1, 16), self::decToBitArray(1, 32)), + + array_merge(self::decToBitArray(2, 16), self::decToBitArray(2, 32)), + array_merge(self::decToBitArray(2, 16), self::decToBitArray(2, 32)), + array_merge(self::decToBitArray(2, 16), self::decToBitArray(2, 32)), + ]); + + $result = $this->generator->generateShuffledFromPositiveSamples($positives, 2); + + self::assertCount(2, $result); + foreach ($result as $sample) { + $ipVec = array_slice($sample, 16, 32); + + self::assertTrue( + $ipVec === self::decToBitArray(1, 32) || + $ipVec === self::decToBitArray(2, 32), + 'Sample has an unique IP' + ); + } + } + /** * @return int[] */