Merged
17 changes: 9 additions & 8 deletions backend/entityservice/api_def/swagger.yaml
@@ -509,25 +509,26 @@ paths:

### result_type = "similarity_scores"

The list of the indices of potential matches and their similarity score
The list of candidate pairs (potential matches) and their similarity scores,
where the similarity score is greater than the mapping threshold.
Data is returned as a `json` object, e.g.,

{
"similarity_scores":
[
[5, 27, 1.0],
[14, 10, 1.0]
{"group": [[0, 5], [1, 27]], "sim": 1.0},
{"group": [[1, 10], [0, 14]], "sim": 1.0}
]
}


The element in the list is of the following format `[indexA, indexB, score]`,
where `indexA` refers to the index of entity from data provider 1, `indexB` is the index of entity
from data provider 2 that is a potential match to entity in `indexA`, and `score` is the similarity score
representing the likelihood that entity in `indexA` and entity in `indexB` is a match.
Each element in the list has the format `{"group": [[party_id_0, row_index_0], [party_id_1, row_index_1]], "sim": score}`,
where the value of `group` is a candidate pair: `[party_id_0, row_index_0]` refers to the record at row `row_index_0`
of the dataset uploaded by data provider `party_id_0` (similarly for `[party_id_1, row_index_1]`), and `score` is the
similarity score representing the likelihood that the two records are a match.

`indexA` and `indexB` starts from 0.
`party_id_0`, `row_index_0`, `party_id_1` and `row_index_1` all start from 0. The two party ids always differ
(`party_id_0 != party_id_1`) but are not necessarily ordered.

The value of `score` is between 0.0 and 1.0, where 0.0 corresponds to no match
and 1.0 corresponds to total match.
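
As an illustration only (not part of the API definition), a client could consume
this output as in the following sketch, where `response_json` is assumed to be
the parsed response body shown above:

    for candidate_pair in response_json["similarity_scores"]:
        (party_a, row_a), (party_b, row_b) = candidate_pair["group"]
        sim = candidate_pair["sim"]
        # record `row_a` of data provider `party_a` is a candidate match for
        # record `row_b` of data provider `party_b`, with similarity `sim`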
7 changes: 4 additions & 3 deletions backend/entityservice/serialization.py
@@ -128,7 +128,8 @@ def generate_scores(candidate_pair_stream: typing.BinaryIO):
"""
sims, (dset_is0, dset_is1), (rec_is0, rec_is1) = anonlink.serialization.load_candidate_pairs(candidate_pair_stream)

cs_sims_iter = (f'{rec_i0}, {rec_i1}, {sim}' for sim, rec_i0, rec_i1 in zip(sims, rec_is0, rec_is1))
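# Render each candidate pair as the members of a JSON object:
# "group": [[dataset_index, record_index], [dataset_index, record_index]], "sim": score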
cs_sims_iter = (f'"group": [[{dset_i0}, {rec_i0}], [{dset_i1}, {rec_i1}]], "sim": {sim}'
for sim, dset_i0, dset_i1, rec_i0, rec_i1 in zip(sims, dset_is0, dset_is1, rec_is0, rec_is1))
yield '{"similarity_scores": ['
line_iter = iter(cs_sims_iter)

@@ -140,11 +141,11 @@ def generate_scores(candidate_pair_stream: typing.BinaryIO):
return

for line in line_iter:
yield '[{}],'.format(prev_line.strip())
yield '{{{}}},'.format(prev_line.strip())
prev_line = line

# Yield the last line without a trailing comma, then close the JSON object
yield '[{}]'.format(prev_line.strip())
yield '{{{}}}'.format(prev_line.strip())
yield ']}'


8 changes: 4 additions & 4 deletions backend/entityservice/tests/test_project_run_results.py
@@ -5,10 +5,10 @@ def test_run_similarity_score_results(requests, similarity_scores_project, thres
run_id = post_run(requests, similarity_scores_project, threshold)
result = get_run_result(requests, similarity_scores_project, run_id, timeout=120)
assert 'similarity_scores' in result
for index1, index2, score in result['similarity_scores']:
assert 0.0 <= score >= 1.0
assert 0 <= index1
assert 0 <= index2
for candidate_pair in result['similarity_scores']:
assert 0.0 <= candidate_pair['sim'] <= 1.0
for _, index in candidate_pair['group']:
assert 0 <= index


def test_run_permutations_results(requests, permutations_project, threshold):
3 changes: 2 additions & 1 deletion backend/entityservice/tests/test_results_correctness.py
@@ -48,7 +48,8 @@ def test_similarity_scores(requests, the_truth):
result = get_run_result(requests, project_data, run, timeout=60)

true_scores = the_truth['similarity_scores']
result_scores = {(a, b): sim for a, b, sim in result['similarity_scores']}
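# Key each candidate pair by its row indices (ordered by party id), mapping to its similarity score.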
result_scores = {tuple(index for _, index in sorted(candidate_pair['group'])): candidate_pair['sim']
for candidate_pair in result['similarity_scores']}

# Anonlink is more strict on enforcing the k parameter. Hence the
# subset.
9 changes: 7 additions & 2 deletions backend/entityservice/tests/test_serialization.py
@@ -49,8 +49,13 @@ def test_generate_scores_produces_json(self):
json_obj = json.loads(json_str)
self.assertIn('similarity_scores', json_obj)
assert len(json_obj["similarity_scores"]) == 3
for score in json_obj["similarity_scores"]:
self.assertEqual(len(score), 3)
for candidate_pair in json_obj["similarity_scores"]:
self.assertIn('group', candidate_pair)
self.assertIn('sim', candidate_pair)
self.assertEqual(len(candidate_pair), 2)
self.assertEqual(len(candidate_pair['group']), 2)
for record in candidate_pair['group']:
self.assertEqual(len(record), 2)

def test_sims_to_json_empty(self):
sims_iter = (
12 changes: 10 additions & 2 deletions docs/changelog.rst
@@ -7,12 +7,21 @@ Changelog
Next Version
------------

- fixed a bug where a data provider could upload their CLKs multiple times in a project using the same upload token (#463)
- modified the ``similarity_scores`` output to follow the group format, which simplifies extending this output type to more parties (#464)

Breaking Change
~~~~~~~~~~~~~~~

- the ``dataproviders`` table `uploaded` field has been modified from a BOOL to an ENUM type (#463)
- the ``similarity_scores`` output type has been modified; it now returns a JSON array of JSON objects, where each object
  looks like ``{"group": [[party_id_0, row_index_0], [party_id_1, row_index_1]], "sim": score}`` (see the sketch below).
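
  A minimal sketch of adapting client code from the old triple format to the new group format
  (illustrative only; ``result`` is assumed to hold the parsed run result)::

      # old: each element was [index_a, index_b, score]
      # new: each element is {"group": [[party_a, index_a], [party_b, index_b]], "sim": score}
      for candidate_pair in result['similarity_scores']:
          (party_a, index_a), (party_b, index_b) = candidate_pair['group']
          score = candidate_pair['sim']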

Version 1.13.0-alpha
--------------------

- fixed bug where invalid state changes could occur when starting a run (#459)
- ``matching`` output type has been removed as redundant with the ``groups`` output with 2 parties. (#458)
- fixed a bug where a dataprovider could upload her clks multiple time in a project using the same upload token (#463)

- Update dependencies:

@@ -22,7 +31,6 @@ Breaking Change
~~~~~~~~~~~~~~~

- ``matching`` output type is not available anymore. (#458)
- the ``dataproviders`` table `uploaded` field has been modified from a BOOL to an ENUM type (#463)


Version 1.12.0
6 changes: 3 additions & 3 deletions docs/concepts.rst
@@ -106,14 +106,14 @@ relationships.
The ``result_token`` (generated when creating the mapping) is required. The ``result_type`` should
be set to ``"similarity_scores"``.

Results are a simple JSON array of arrays::
Results are a JSON array of JSON objects::

[
[index_a, index_b, score],
{"group": [[party_id_0, row_index_0], [party_id_1, row_index_1]], "sim": score},
...
]

Where the index values will be the 0 based row index from the uploaded CLKs, and
Where ``party_id_0``/``party_id_1`` are the 0-based indices of the data providers' datasets,
``row_index_0``/``row_index_1`` are the 0-based row indices into the uploaded CLKs, and
the score will be a Number between the provided threshold and ``1.0``.
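
For example, a two-party client could collect these results into a lookup keyed by the
row indices of the matched records (a minimal sketch, assuming ``results`` holds the
parsed JSON array shown above)::

    scores = {}
    for candidate_pair in results:
        # order the two (party_id, row_index) entries by party id
        (_, row_a), (_, row_b) = sorted(candidate_pair['group'])
        scores[(row_a, row_b)] = candidate_pair['sim']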

A score of ``1.0`` means the CLKs were identical. Threshold values are usually between