diff --git a/backend/entityservice/api_def/swagger.yaml b/backend/entityservice/api_def/swagger.yaml index 6293dff3..4955a8f2 100644 --- a/backend/entityservice/api_def/swagger.yaml +++ b/backend/entityservice/api_def/swagger.yaml @@ -509,28 +509,30 @@ paths: ### result_type = "similarity_scores" - The list of the indices of potential matches and their similarity score - where the similarity score is greater than the mapping threshold. + The list of the pairwise similarity scores where the score + is greater than the similarity threshold. Data is returned as `json` object e.g., { "similarity_scores": [ - [5, 27, 1.0], - [14, 10, 1.0] + [[0, 5], [1, 27], 1.0], + [[1, 10], [0, 14], 1.0] ] } - The element in the list is of the following format `[indexA, indexB, score]`, - where `indexA` refers to the index of entity from data provider 1, `indexB` is the index of entity - from data provider 2 that is a potential match to entity in `indexA`, and `score` is the similarity score - representing the likelihood that entity in `indexA` and entity in `indexB` is a match. + The element in the list is a list of three elements of the following format + `[[party_id_0, row_index_0], [party_id_1, row_index_1], score]`, where `[party_id_0, row_index_0]` + refers to the record at the index `row_index_0` from the dataset `party_id_0`, similarly for + `[party_id_1, row_index_1]`, and `score` is the similarity score representing the likelihood + that this pair of records is a match. - `indexA` and `indexB` starts from 0. + `party_id_0`, `row_index_0`, `party_id_1` and `row_index_1` start from 0, and `party_id_0 != party_id_1`, but + the two pairs are not necessarily ordered. - The value of `score` is between 0.0 and 1.0, where 0.0 corresponds to no match - and 1.0 corresponds to total match. + The value of `score` is between 0.0 and 1.0. The higher the score, the higher the similarity between + the compared CLKs. 
### result_type = "permutations" diff --git a/backend/entityservice/serialization.py b/backend/entityservice/serialization.py index eb328aa3..571490e7 100644 --- a/backend/entityservice/serialization.py +++ b/backend/entityservice/serialization.py @@ -128,7 +128,8 @@ def generate_scores(candidate_pair_stream: typing.BinaryIO): """ sims, (dset_is0, dset_is1), (rec_is0, rec_is1) = anonlink.serialization.load_candidate_pairs(candidate_pair_stream) - cs_sims_iter = (f'{rec_i0}, {rec_i1}, {sim}' for sim, rec_i0, rec_i1 in zip(sims, rec_is0, rec_is1)) + cs_sims_iter = (f'[{dset_i0}, {rec_i0}], [{dset_i1}, {rec_i1}], {sim}' + for sim, dset_i0, dset_i1, rec_i0, rec_i1 in zip(sims, dset_is0, dset_is1, rec_is0, rec_is1)) yield '{"similarity_scores": [' line_iter = iter(cs_sims_iter) diff --git a/backend/entityservice/tests/test_project_run_results.py b/backend/entityservice/tests/test_project_run_results.py index d0606500..1b08b10f 100644 --- a/backend/entityservice/tests/test_project_run_results.py +++ b/backend/entityservice/tests/test_project_run_results.py @@ -5,10 +5,13 @@ def test_run_similarity_score_results(requests, similarity_scores_project, thres run_id = post_run(requests, similarity_scores_project, threshold) result = get_run_result(requests, similarity_scores_project, run_id, timeout=120) assert 'similarity_scores' in result - for index1, index2, score in result['similarity_scores']: + for (party_id_1, rec_id_1), (party_id_2, rec_id_2), score in result['similarity_scores']: - assert 0.0 <= score >= 1.0 + assert 0.0 <= score <= 1.0 - assert 0 <= index1 - assert 0 <= index2 + assert 0 <= party_id_1 + assert 0 <= party_id_2 + assert party_id_1 != party_id_2 + assert 0 <= rec_id_1 + assert 0 <= rec_id_2 def test_run_permutations_results(requests, permutations_project, threshold): diff --git a/backend/entityservice/tests/test_results_correctness.py b/backend/entityservice/tests/test_results_correctness.py index f435f3ed..37da082e 100644 --- 
a/backend/entityservice/tests/test_results_correctness.py +++ b/backend/entityservice/tests/test_results_correctness.py @@ -48,7 +48,8 @@ def test_similarity_scores(requests, the_truth): result = get_run_result(requests, project_data, run, timeout=60) true_scores = the_truth['similarity_scores'] - result_scores = {(a, b): sim for a, b, sim in result['similarity_scores']} + result_scores = {tuple(index for _, index in sorted([a, b])): score + for a, b, score in result['similarity_scores']} # Anonlink is more strict on enforcing the k parameter. Hence the # subset. diff --git a/backend/entityservice/tests/test_serialization.py b/backend/entityservice/tests/test_serialization.py index e1a0c314..8dc0e0ef 100644 --- a/backend/entityservice/tests/test_serialization.py +++ b/backend/entityservice/tests/test_serialization.py @@ -49,8 +49,11 @@ def test_generate_scores_produces_json(self): json_obj = json.loads(json_str) self.assertIn('similarity_scores', json_obj) assert len(json_obj["similarity_scores"]) == 3 - for score in json_obj["similarity_scores"]: - self.assertEqual(len(score), 3) + for pair_and_score in json_obj["similarity_scores"]: + self.assertEqual(len(pair_and_score), 3) + a, b, score = pair_and_score + self.assertEqual(len(a), 2) + self.assertEqual(len(b), 2) def test_sims_to_json_empty(self): sims_iter = ( diff --git a/docs/changelog.rst b/docs/changelog.rst index 605a456c..3e6be439 100644 --- a/docs/changelog.rst +++ b/docs/changelog.rst @@ -7,12 +7,21 @@ Changelog Next Version ------------ +- fixed a bug where a dataprovider could upload her clks multiple times in a project using the same upload token (#463) +- modify ``similarity_scores`` output to follow the group format, which will simplify extending this output type to more parties (#464) + +Breaking Change +~~~~~~~~~~~~~~~ +
+- the ``dataproviders`` table `uploaded` field has been modified from a BOOL to an ENUM type (#463) +- the ``similarity_scores`` output type has been modified, it now returns a 
JSON array of JSON arrays, where each such array + looks like `[[party_id_0, row_index_0], [party_id_1, row_index_1], score]`. (#464) + Version 1.13.0-alpha -------------------- - fixed bug where invalid state changes could occur when starting a run (#459) - ``matching`` output type has been removed as redundant with the ``groups`` output with 2 parties. (#458) -- fixed a bug where a dataprovider could upload her clks multiple time in a project using the same upload token (#463) - Update dependencies: @@ -22,7 +31,6 @@ Breaking Change ~~~~~~~~~~~~~~~ - ``matching`` output type is not available anymore. (#458) -- the ``dataproviders`` table `uploaded` field has been modified from a BOOL to an ENUM type (#463) Version 1.12.0 diff --git a/docs/concepts.rst b/docs/concepts.rst index 336d2b98..b4bb232f 100644 --- a/docs/concepts.rst +++ b/docs/concepts.rst @@ -106,14 +106,14 @@ relationships. The ``result_token`` (generated when creating the mapping) is required. The ``result_type`` should be set to ``"similarity_scores"``. -Results are a simple JSON array of arrays:: +Results are a JSON array of JSON arrays of three elements:: [ - [index_a, index_b, score], + [[party_id_0, row_index_0], [party_id_1, row_index_1], score], ... ] -Where the index values will be the 0 based row index from the uploaded CLKs, and +Where the index values will be the 0 based dataset index and row index from the uploaded CLKs, and the score will be a Number between the provided threshold and ``1.0``. A score of ``1.0`` means the CLKs were identical. 
Threshold values are usually between diff --git a/docs/tutorial/Similarity Scores.ipynb b/docs/tutorial/Similarity Scores.ipynb index 702bd111..583718df 100644 --- a/docs/tutorial/Similarity Scores.ipynb +++ b/docs/tutorial/Similarity Scores.ipynb @@ -78,7 +78,7 @@ "name": "stdout", "output_type": "stream", "text": [ - "Testing anonlink-entity-service hosted at https://testing.es.data61.xyz\n" + "Testing anonlink-entity-service hosted at http://0.0.0.0:8851\n" ] } ], @@ -100,7 +100,7 @@ "name": "stdout", "output_type": "stream", "text": [ - "{\"project_count\": 6536, \"rate\": 2530484, \"status\": \"ok\"}\r\n" + "{\"project_count\": 1689, \"rate\": 2267284, \"status\": \"ok\"}\r\n" ] } ], @@ -312,7 +312,7 @@ "name": "stdout", "output_type": "stream", "text": [ - "Overwriting /tmp/tmpp5kob1ay\n" + "Overwriting /tmp/tmpw_n8wu8g\n" ] } ], @@ -554,17 +554,17 @@ "name": "stdout", "output_type": "stream", "text": [ - "Credentials will be saved in /tmp/tmp8pi2emsl\n", + "Credentials will be saved in /tmp/tmp2eppf_dc\n", "\u001b[31mProject created\u001b[0m\n" ] }, { "data": { "text/plain": [ - "{'project_id': '500db47fcfed842b47f0ae20f6ba82a66dddc5d4d6e956a7',\n", - " 'result_token': '7c161ffe7873683fd8102a635815d7e7a577612458147c32',\n", - " 'update_tokens': ['1e50b588283e191f79769fc925949baded7c704bca28060d',\n", - " '3b95f9e2a51429738c3ea9338b2c3f05cda6cfcef0c8918c']}" + "{'project_id': 'fc8f8216e33a7b8ffd4b967c27f8ce8e5d7371cf8f52bcdb',\n", + " 'result_token': '6423ccee1e634a390a12e3de1a57e7bd322621111c119351',\n", + " 'update_tokens': ['ef0404a7c23ea25c9f922f4c254f80dd6fa644d7d906efa9',\n", + " '46a71922c19a75eae2dd75ec59db0eac453842123514c22a']}" ] }, "execution_count": 8, @@ -612,8 +612,8 @@ "name": "stdout", "output_type": "stream", "text": [ - "\u001b[31mCLK data written to /tmp/tmp2_h66ds2.json\u001b[0m\n", - "\u001b[31mCLK data written to /tmp/tmpiyu3o3vv.json\u001b[0m\n" + "\u001b[31mCLK data written to /tmp/tmpjlx4bxil.json\u001b[0m\n", + "\u001b[31mCLK data 
written to /tmp/tmpz2ykuhep.json\u001b[0m\n" ] } ], @@ -812,16 +812,16 @@ "name": "stdout", "output_type": "stream", "text": [ - "[76, 2345, 1.0]\n", - "[83, 3439, 1.0]\n", - "[103, 863, 1.0]\n", - "[154, 2391, 1.0]\n", - "[177, 4247, 1.0]\n", - "[192, 1176, 1.0]\n", - "[270, 4516, 1.0]\n", - "[312, 1253, 1.0]\n", - "[407, 3743, 1.0]\n", - "[670, 3550, 1.0]\n" + "[[0, 76], [1, 2345], 1.0]\n", + "[[0, 83], [1, 3439], 1.0]\n", + "[[0, 103], [1, 863], 1.0]\n", + "[[0, 154], [1, 2391], 1.0]\n", + "[[0, 177], [1, 4247], 1.0]\n", + "[[0, 192], [1, 1176], 1.0]\n", + "[[0, 270], [1, 4516], 1.0]\n", + "[[0, 312], [1, 1253], 1.0]\n", + "[[0, 407], [1, 3743], 1.0]\n", + "[[0, 670], [1, 3550], 1.0]\n" ] } ], @@ -895,7 +895,7 @@ } ], "source": [ - "plt.hist([_[2] for _ in data[::100]], bins=50);" + "plt.hist([score for _, _, score in data[::100]], bins=50);" ] }, { @@ -930,7 +930,7 @@ } ], "source": [ - "plt.hist([_[2] for _ in data[::1] if _[2] > 0.94], bins=50);" + "plt.hist([score for _, _, score in data[::1] if score > 0.94], bins=50);" ] }, { @@ -962,8 +962,10 @@ " return samples\n", "\n", "def lookup_originals(candidate_pair):\n", - " a = dfA.iloc[candidate_pair[0]]\n", - " b = dfB.iloc[candidate_pair[1]]\n", + " a, b, score = candidate_pair\n", + " a_index, b_index = [x[1] for x in sorted([a, b])]\n", + " a = dfA.iloc[a_index]\n", + " b = dfB.iloc[b_index]\n", " return a, b" ] }, diff --git a/docs/tutorial/index.rst b/docs/tutorial/index.rst index a57629bc..2998dbe3 100644 --- a/docs/tutorial/index.rst +++ b/docs/tutorial/index.rst @@ -12,6 +12,18 @@ Tutorials multiparty-linkage-in-entity-service.ipynb +Usage +----- + +The code is often evolving and may include some breaking changes not yet deployed in our testing deployment (at the +URL https://testing.es.data61.xyz ). 
So to run the tutorials, you can either: + + - use the tutorials from the `master` branch of this repository which will work with the currently deployed testing service, + - or build and deploy the service from the same branch as the tutorials you would like to run, providing its URL to + the tutorials via the environment variable `SERVER` (e.g. `SERVER=http://0.0.0.0:8851` if deployed locally). + +Other use-cases are not supported and may fail for non-obvious reasons. + External Tutorials ------------------