Merged
24 changes: 13 additions & 11 deletions backend/entityservice/api_def/swagger.yaml
@@ -509,28 +509,30 @@ paths:

### result_type = "similarity_scores"

The list of the indices of potential matches and their similarity score
where the similarity score is greater than the mapping threshold.
The list of the pairwise similarity scores where the score
is greater than the similarity threshold.
Data is returned as a `json` object, e.g.,

{
"similarity_scores":
[
[5, 27, 1.0],
[14, 10, 1.0]
[[0, 5], [1, 27], 1.0],
[[1, 10], [0, 14], 1.0]
]
}


The element in the list is of the following format `[indexA, indexB, score]`,
where `indexA` refers to the index of entity from data provider 1, `indexB` is the index of entity
from data provider 2 that is a potential match to entity in `indexA`, and `score` is the similarity score
representing the likelihood that entity in `indexA` and entity in `indexB` is a match.
Each element in the list is a list of three elements of the following format
`[[party_id_0, row_index_0], [party_id_1, row_index_1], score]`, where `[party_id_0, row_index_0]`
refers to the record at index `row_index_0` in the dataset of party `party_id_0`, similarly for
`[party_id_1, row_index_1]`, and `score` is the similarity score representing the likelihood
that this pair of records is a match.

`indexA` and `indexB` starts from 0.
`party_id_0`, `row_index_0`, `party_id_1` and `row_index_1` all start from 0; `party_id_0 != party_id_1`,
but the two pairs are not necessarily ordered.

The value of `score` is between 0.0 and 1.0, where 0.0 corresponds to no match
and 1.0 corresponds to total match.
The value of `score` is between 0.0 and 1.0. The higher the score, the higher the similarity between
the compared CLKs.
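
For illustration, a client might consume this structure as follows (a sketch only; `response_text` is a
stand-in for a fetched result body):

    import json

    # Hypothetical response body in the format described above.
    response_text = '{"similarity_scores": [[[0, 5], [1, 27], 1.0], [[1, 10], [0, 14], 1.0]]}'

    for pair_0, pair_1, score in json.loads(response_text)['similarity_scores']:
        # Each entry pairs one record from each of two distinct parties.
        party_id_0, row_index_0 = pair_0
        party_id_1, row_index_1 = pair_1
        print(party_id_0, row_index_0, party_id_1, row_index_1, score)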

### result_type = "permutations"

3 changes: 2 additions & 1 deletion backend/entityservice/serialization.py
@@ -128,7 +128,8 @@ def generate_scores(candidate_pair_stream: typing.BinaryIO):
"""
sims, (dset_is0, dset_is1), (rec_is0, rec_is1) = anonlink.serialization.load_candidate_pairs(candidate_pair_stream)

cs_sims_iter = (f'{rec_i0}, {rec_i1}, {sim}' for sim, rec_i0, rec_i1 in zip(sims, rec_is0, rec_is1))
cs_sims_iter = (f'[{dset_i0}, {rec_i0}], [{dset_i1}, {rec_i1}], {sim}'
for sim, dset_i0, dset_i1, rec_i0, rec_i1 in zip(sims, dset_is0, dset_is1, rec_is0, rec_is1))
yield '{"similarity_scores": ['
line_iter = iter(cs_sims_iter)

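A minimal sketch of what the updated generator yields, using invented candidate-pair data rather than a real
anonlink stream:

    sims = [1.0, 0.97]
    dset_is0, dset_is1 = [0, 1], [1, 0]
    rec_is0, rec_is1 = [5, 10], [27, 14]

    # Mirrors cs_sims_iter above: dataset indices now accompany record indices.
    fragments = [f'[{d0}, {r0}], [{d1}, {r1}], {s}'
                 for s, d0, d1, r0, r1 in zip(sims, dset_is0, dset_is1, rec_is0, rec_is1)]
    assert fragments[0] == '[0, 5], [1, 27], 1.0'
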
9 changes: 6 additions & 3 deletions backend/entityservice/tests/test_project_run_results.py
@@ -5,10 +5,13 @@ def test_run_similarity_score_results(requests, similarity_scores_project, threshold):
run_id = post_run(requests, similarity_scores_project, threshold)
result = get_run_result(requests, similarity_scores_project, run_id, timeout=120)
assert 'similarity_scores' in result
for index1, index2, score in result['similarity_scores']:
for (party_id_1, rec_id_1), (party_id_2, rec_id_2), score in result['similarity_scores']:
assert 0.0 <= score <= 1.0
assert 0 <= index1
assert 0 <= index2
assert 0 <= party_id_1
assert 0 <= party_id_2
assert party_id_1 != party_id_2
assert 0 <= rec_id_1
assert 0 <= rec_id_2


def test_run_permutations_results(requests, permutations_project, threshold):
3 changes: 2 additions & 1 deletion backend/entityservice/tests/test_results_correctness.py
@@ -48,7 +48,8 @@ def test_similarity_scores(requests, the_truth):
result = get_run_result(requests, project_data, run, timeout=60)

true_scores = the_truth['similarity_scores']
result_scores = {(a, b): sim for a, b, sim in result['similarity_scores']}
result_scores = {tuple(index for _, index in sorted([a, b])): score
for a, b, score in result['similarity_scores']}

# Anonlink is more strict on enforcing the k parameter. Hence the
# subset.
7 changes: 5 additions & 2 deletions backend/entityservice/tests/test_serialization.py
@@ -49,8 +49,11 @@ def test_generate_scores_produces_json(self):
json_obj = json.loads(json_str)
self.assertIn('similarity_scores', json_obj)
assert len(json_obj["similarity_scores"]) == 3
for score in json_obj["similarity_scores"]:
self.assertEqual(len(score), 3)
for pair_and_score in json_obj["similarity_scores"]:
self.assertEqual(len(pair_and_score), 3)
a, b, score = pair_and_score
self.assertEqual(len(a), 2)
self.assertEqual(len(b), 2)

def test_sims_to_json_empty(self):
sims_iter = (
12 changes: 10 additions & 2 deletions docs/changelog.rst
@@ -7,12 +7,21 @@ Changelog
Next Version
------------

- fixed a bug where a dataprovider could upload her CLKs multiple times in a project using the same upload token (#463)
- modified ``similarity_scores`` output to follow the group format, which will simplify extending this output type to more parties (#464)

Breaking Change
~~~~~~~~~~~~~~~

- the ``dataproviders`` table `uploaded` field has been modified from a BOOL to an ENUM type (#463)
- the ``similarity_scores`` output type has been modified; it now returns a JSON array of JSON arrays, where each
  inner array looks like `[[party_id_0, row_index_0], [party_id_1, row_index_1], score]`. (#464)

Version 1.13.0-alpha
--------------------

- fixed bug where invalid state changes could occur when starting a run (#459)
- ``matching`` output type has been removed, as it is redundant with the ``groups`` output for 2 parties. (#458)
- fixed a bug where a dataprovider could upload her CLKs multiple times in a project using the same upload token (#463)

- Update dependencies:

@@ -22,7 +31,6 @@ Breaking Change
~~~~~~~~~~~~~~~

- ``matching`` output type is no longer available. (#458)
- the ``dataproviders`` table `uploaded` field has been modified from a BOOL to an ENUM type (#463)


Version 1.12.0
6 changes: 3 additions & 3 deletions docs/concepts.rst
@@ -106,14 +106,14 @@ relationships.
The ``result_token`` (generated when creating the mapping) is required. The ``result_type`` should
be set to ``"similarity_scores"``.

Results are a simple JSON array of arrays::
Results are a JSON array of JSON arrays of three elements::

[
[index_a, index_b, score],
[[party_id_0, row_index_0], [party_id_1, row_index_1], score],
...
]

Where the index values will be the 0 based row index from the uploaded CLKs, and
Where the index values will be the 0-based dataset index and row index from the uploaded CLKs, and
the score will be a Number between the provided threshold and ``1.0``.
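
For example, since the two ``[party_id, row_index]`` pairs in an entry are not ordered, a client may want to
normalise them before any lookup (a sketch with invented values)::

    # One entry from the results array; the pairs are unordered.
    entry = [[1, 10], [0, 14], 0.97]
    pair_a, pair_b, score = entry
    # Sort by party id so the record of party 0 comes first.
    (party_0, row_0), (party_1, row_1) = sorted([pair_a, pair_b])
    assert (party_0, row_0) == (0, 14) and (party_1, row_1) == (1, 10)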

A score of ``1.0`` means the CLKs were identical. Threshold values are usually between
50 changes: 26 additions & 24 deletions docs/tutorial/Similarity Scores.ipynb
@@ -78,7 +78,7 @@
"name": "stdout",
"output_type": "stream",
"text": [
"Testing anonlink-entity-service hosted at https://testing.es.data61.xyz\n"
"Testing anonlink-entity-service hosted at http://0.0.0.0:8851\n"
]
}
],
@@ -100,7 +100,7 @@
"name": "stdout",
"output_type": "stream",
"text": [
"{\"project_count\": 6536, \"rate\": 2530484, \"status\": \"ok\"}\r\n"
"{\"project_count\": 1689, \"rate\": 2267284, \"status\": \"ok\"}\r\n"
]
}
],
@@ -312,7 +312,7 @@
"name": "stdout",
"output_type": "stream",
"text": [
"Overwriting /tmp/tmpp5kob1ay\n"
"Overwriting /tmp/tmpw_n8wu8g\n"
]
}
],
@@ -554,17 +554,17 @@
"name": "stdout",
"output_type": "stream",
"text": [
"Credentials will be saved in /tmp/tmp8pi2emsl\n",
"Credentials will be saved in /tmp/tmp2eppf_dc\n",
"\u001b[31mProject created\u001b[0m\n"
]
},
{
"data": {
"text/plain": [
"{'project_id': '500db47fcfed842b47f0ae20f6ba82a66dddc5d4d6e956a7',\n",
" 'result_token': '7c161ffe7873683fd8102a635815d7e7a577612458147c32',\n",
" 'update_tokens': ['1e50b588283e191f79769fc925949baded7c704bca28060d',\n",
" '3b95f9e2a51429738c3ea9338b2c3f05cda6cfcef0c8918c']}"
"{'project_id': 'fc8f8216e33a7b8ffd4b967c27f8ce8e5d7371cf8f52bcdb',\n",
" 'result_token': '6423ccee1e634a390a12e3de1a57e7bd322621111c119351',\n",
" 'update_tokens': ['ef0404a7c23ea25c9f922f4c254f80dd6fa644d7d906efa9',\n",
" '46a71922c19a75eae2dd75ec59db0eac453842123514c22a']}"
]
},
"execution_count": 8,
@@ -612,8 +612,8 @@
"name": "stdout",
"output_type": "stream",
"text": [
"\u001b[31mCLK data written to /tmp/tmp2_h66ds2.json\u001b[0m\n",
"\u001b[31mCLK data written to /tmp/tmpiyu3o3vv.json\u001b[0m\n"
"\u001b[31mCLK data written to /tmp/tmpjlx4bxil.json\u001b[0m\n",
"\u001b[31mCLK data written to /tmp/tmpz2ykuhep.json\u001b[0m\n"
]
}
],
@@ -812,16 +812,16 @@
"name": "stdout",
"output_type": "stream",
"text": [
"[76, 2345, 1.0]\n",
"[83, 3439, 1.0]\n",
"[103, 863, 1.0]\n",
"[154, 2391, 1.0]\n",
"[177, 4247, 1.0]\n",
"[192, 1176, 1.0]\n",
"[270, 4516, 1.0]\n",
"[312, 1253, 1.0]\n",
"[407, 3743, 1.0]\n",
"[670, 3550, 1.0]\n"
"[[0, 76], [1, 2345], 1.0]\n",
"[[0, 83], [1, 3439], 1.0]\n",
"[[0, 103], [1, 863], 1.0]\n",
"[[0, 154], [1, 2391], 1.0]\n",
"[[0, 177], [1, 4247], 1.0]\n",
"[[0, 192], [1, 1176], 1.0]\n",
"[[0, 270], [1, 4516], 1.0]\n",
"[[0, 312], [1, 1253], 1.0]\n",
"[[0, 407], [1, 3743], 1.0]\n",
"[[0, 670], [1, 3550], 1.0]\n"
]
}
],
@@ -895,7 +895,7 @@
}
],
"source": [
"plt.hist([_[2] for _ in data[::100]], bins=50);"
"plt.hist([score for _, _, score in data[::100]], bins=50);"
]
},
{
@@ -930,7 +930,7 @@
}
],
"source": [
"plt.hist([_[2] for _ in data[::1] if _[2] > 0.94], bins=50);"
"plt.hist([score for _, _, score in data[::1] if score > 0.94], bins=50);"
]
},
{
@@ -962,8 +962,10 @@
" return samples\n",
"\n",
"def lookup_originals(candidate_pair):\n",
" a = dfA.iloc[candidate_pair[0]]\n",
" b = dfB.iloc[candidate_pair[1]]\n",
" a, b, score = candidate_pair\n",
" a_index, b_index = [x[1] for x in sorted([a, b])]\n",
" a = dfA.iloc[a_index]\n",
" b = dfB.iloc[b_index]\n",
" return a, b"
]
},
12 changes: 12 additions & 0 deletions docs/tutorial/index.rst
@@ -12,6 +12,18 @@ Tutorials
multiparty-linkage-in-entity-service.ipynb


Usage
-----

The code is constantly evolving and may include breaking changes not yet deployed to our testing deployment (at the
URL https://testing.es.data61.xyz ). To run the tutorials, you can either:

- use the tutorials from the ``master`` branch of this repository, which work with the currently deployed testing service,
- or build and deploy the service from the same branch as the tutorials you would like to run, providing its URL to
  the tutorials via the environment variable ``SERVER`` (e.g. ``SERVER=http://0.0.0.0:8851`` if deployed locally), as
  sketched below.
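
For instance, a notebook can pick up the target service like this (a sketch; the fallback URL is the testing
deployment)::

    import os

    # Use SERVER when provided, otherwise fall back to the public testing deployment.
    server = os.getenv('SERVER', 'https://testing.es.data61.xyz')
    print(f'Testing anonlink-entity-service hosted at {server}')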

Other use-cases are not supported and may fail for non-obvious reasons.

External Tutorials
------------------
