Merged
31 changes: 8 additions & 23 deletions backend/entityservice/api_def/swagger.yaml
@@ -40,7 +40,7 @@ info:
The personally identifiable information used for linking is first locally transformed
to anonymous linking codes called [Cryptographic Longterm Keys](concepts.html#cryptographic-longterm-key)
(CLKs). Each party then uploads its CLKs to the service for matching.
The service supports four different [types of outputs](concepts.html#result-types)
The service supports three different [types of outputs](concepts.html#result-types)
of matching results, varying in privacy properties.
For the CLKs to be comparable, the parties need to agree on and follow a particular
[linkage schema](concepts.html#schema) and agree on a shared secret before generating
@@ -68,20 +68,19 @@ info:

## Matching Protocols

The Entity Service supports four different **result types** with varying privacy properties which define the produced result,
The Entity Service supports three different **result types** with varying privacy properties which define the produced result,
and who may see which part of the output.

See the documentation section on [output types](./concepts.html#result-types).

The options are:

* `"mapping"` - Creates a lookup table of the form `indexA = indexB`.
* `"permutations"` - Creates random permutations and a mask.
* `"similarity_scores"` - Outputs a list of similarity scores of `[indexA, indexB, score]`, where `score`
represents the likelihood that `indexA = indexB`.
* `"groups"` - Outputs a list of groups of records, where each group represents one entity.

Only `"groups"` supports multi-party linkage. `"mapping"`, `"permutations"`, and `"similarity_scores"` only support linkage
Collaborator

line 43 and 71 mentioning four different output types. -> change to three.

Contributor Author

👍

Only `"groups"` supports multi-party linkage. `"permutations"` and `"similarity_scores"` only support linkage
with two parties.
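For illustration, the three result types have roughly the following shapes. This is a sketch only: the values, and the field names in the permutations result, are invented here rather than taken from the spec.

```python
# Illustrative shapes of the three result types; all values are made up.

# "similarity_scores": a list of [indexA, indexB, score] triples, where
# score is the likelihood that indexA and indexB refer to the same entity.
similarity_scores = [
    [0, 5, 0.97],
    [2, 0, 0.85],
]

# "permutations": each data provider receives a random permutation of its
# own records; the result holder receives a mask marking matched positions.
# (Field names here are illustrative.)
permutations_result = {
    "permutation_a": [3, 0, 2, 1],
    "permutation_b": [1, 3, 0, 2],
    "mask": [1, 1, 0, 0],
}

# "groups": a list of groups of (dataset_index, record_index) pairs, where
# each group represents one entity; the only type supporting 2+ parties.
groups = [
    [[0, 0], [1, 5]],
    [[0, 2], [1, 0], [2, 7]],
]
```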


@@ -193,16 +192,16 @@ paths:
The parts of the computed linkage results that are accessible by the different tokens depend on the
`result_type`:

- `"mapping"`, `"similarity_scores"`, or `"groups"`\
If the `result_type` is `"mapping"`, `"similarity_scores"`, or `"groups"` then the results can be accessed with the
- `"similarity_scores"`, or `"groups"`\
If the `result_type` is `"similarity_scores"` or `"groups"` then the results can be accessed with the
`result_token`, which is provided when initially creating the mapping.

- `"permutations"`\
If the `result_type` is `permutations`, then the data providers can access their respective permutation with
their individual `receipt_token`, which they obtain when adding data to the mapping.
The mask can be accessed with the `result_token`.

Only `"groups"` supports multi-party linkage. If the result type is `"mapping"`, `"similarity_scores"`, or
Only `"groups"` supports multi-party linkage. If the result type is `"similarity_scores"` or
`"permutations"`, then the number of parties must be 2.
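A minimal client sketch of the token rules above. The base URL is hypothetical, and the path is an assumption modelled on the run-result endpoint described later in this spec:

```python
import json
import urllib.error
import urllib.request

BASE_URL = "https://anonlink.example.com/api/v1"  # hypothetical deployment


def result_url(base, project_id, run_id):
    # Assumed path, modelled on the run-result endpoint in this spec.
    return f"{base}/projects/{project_id}/runs/{run_id}/result"


def fetch_result(project_id, run_id, token):
    """Fetch a run result.

    Which token grants access depends on result_type: the result_token
    for "similarity_scores" and "groups" (and for the "permutations"
    mask), or a data provider's receipt_token for its own permutation.
    """
    req = urllib.request.Request(
        result_url(BASE_URL, project_id, run_id),
        headers={"Authorization": token})
    try:
        with urllib.request.urlopen(req) as resp:
            return json.load(resp)
    except urllib.error.HTTPError as err:
        if err.code == 404:  # result not ready yet
            return None
        raise
```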

parameters:
@@ -508,19 +507,6 @@ paths:
Note that if the result isn't ready, a `404` will be returned.


### result_type = "mapping"

The mapping of indices between parties. Data is returned as `json` object e.g,

{
"mapping":
{
"0": "5",
"2": "0"
}
}


### result_type = "similarity_scores"

The list of the indices of potential matches and their similarity score
@@ -718,10 +704,9 @@ definitions:
description: |
Defines the output type of the mapping. Multi-party linkage requires `"groups"` to be used.
enum:
- similarity_scores
- mapping
- permutations
- groups
- permutations
- similarity_scores

RunState:
type: string
6 changes: 3 additions & 3 deletions backend/entityservice/init-db-schema.sql
@@ -7,10 +7,10 @@ CASCADE;
DROP TYPE IF EXISTS MAPPINGRESULT;

CREATE TYPE MAPPINGRESULT AS ENUM (
'mapping',
'groups',
'permutations',
'similarity_scores',
'groups'
'similarity_scores'

);

-- The table of entity matching jobs
2 changes: 1 addition & 1 deletion backend/entityservice/messages.py
@@ -1,3 +1,3 @@
INVALID_ACCESS_MSG = "Invalid access token or project doesn't exist"

INVALID_RESULT_TYPE_MESSAGE = 'result_type must be either "permutations", "mapping", "similarity_scores", or "groups"'
INVALID_RESULT_TYPE_MESSAGE = 'result_type must be either "groups", "permutations", or "similarity_scores"'
5 changes: 3 additions & 2 deletions backend/entityservice/models/project.py
@@ -41,8 +41,9 @@ def __init__(self, result_type, schema, name, notes, parties):
self.data = {}
self.result = {}

VALID_RESULT_TYPES = {'permutations', 'mapping',
'similarity_scores', 'groups'}
VALID_RESULT_TYPES = {'groups',
'permutations',
'similarity_scores'}

@staticmethod
def from_json(data):
37 changes: 10 additions & 27 deletions backend/entityservice/tasks/permutation.py
@@ -1,9 +1,6 @@
import random

import anonlink

from entityservice.cache import encodings as encoding_cache

from entityservice.async_worker import celery, logger
from entityservice.database import DBConn, get_project_column, insert_mapping_result, get_dataprovider_ids, \
get_run_result, insert_permutation, insert_permutation_mask
@@ -13,11 +10,6 @@
from entityservice.utils import convert_mapping_to_list


def groups_to_mapping(groups):
return {str(i): str(j)
for i, j in anonlink.solving.pairs_from_groups(groups)}


@celery.task(base=TracedTask, ignore_result=True, args_as_tags=('project_id', 'run_id'))
def save_and_permute(similarity_result, project_id, run_id, parent_span):
log = logger.bind(pid=project_id, run_id=run_id)
@@ -30,17 +22,9 @@ def save_and_permute(similarity_result, project_id, run_id, parent_span):
with DBConn() as db:
result_type = get_project_column(db, project_id, 'result_type')

if result_type == "groups":
# Save the raw groups
log.debug("Saving the groups in the DB")
result_id = insert_mapping_result(db, run_id, groups)
else:
# Turn groups into mapping and save that
log.debug("Turning groups into mapping")
mapping = groups_to_mapping(groups)
log.debug("Saving mapping in the DB")
result_id = insert_mapping_result(db, run_id, mapping)

# Save the raw groups
log.debug("Saving the groups in the DB")
result_id = insert_mapping_result(db, run_id, groups)
dp_ids = get_dataprovider_ids(db, project_id)

log.info("Result saved to db with result id {}".format(result_id))
@@ -84,10 +68,7 @@ def permute_mapping_data(project_id, run_id, len_filters1, len_filters2, parent_

with DBConn() as conn:

mapping_str = get_run_result(conn, run_id)

# Convert to int: int
mapping = {int(k): int(mapping_str[k]) for k in mapping_str}
groups = get_run_result(conn, run_id)

log.info("Creating random permutations")
log.debug("Entities in dataset A: {}, Entities in dataset B: {}".format(len_filters1, len_filters2))
@@ -100,7 +81,7 @@ def permute_mapping_data(project_id, run_id, len_filters1, len_filters2, parent_
"""
smaller_dataset_size = min(len_filters1, len_filters2)
log.debug("Smaller dataset size is {}".format(smaller_dataset_size))
number_in_common = len(mapping)
number_in_common = len(groups)
a_permutation = {} # Should be length of filters1
b_permutation = {} # length of filters2

@@ -113,11 +94,13 @@ def permute_mapping_data(project_id, run_id, len_filters1, len_filters2, parent_
random.shuffle(remaining_new_indexes)
log.info("Assigning random indexes for {} matched entities".format(number_in_common))

for mapping_number, a_index in enumerate(mapping):
b_index = mapping[a_index]
for group_number, group in enumerate(groups):
# It should not fail because the permutation result is only available for 2 parties, but let's be safe.
assert 2 == len(group)
(_, a_index), (_, b_index) = sorted(group)

# Choose the index in the new mapping (randomly)
mapping_index = remaining_new_indexes[mapping_number]
mapping_index = remaining_new_indexes[group_number]

a_permutation[a_index] = mapping_index
b_permutation[b_index] = mapping_index
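A self-contained sketch of the permutation logic in this file: no database or anonlink, groups are lists of `(dataset_id, record_index)` pairs as in the diff, and the helper name is invented.

```python
import random


def permutations_from_groups(groups, len_a, len_b, seed=None):
    """Sketch of the two-party case: matched records from both datasets
    are sent to the same random position; a mask marks those positions."""
    rng = random.Random(seed)
    smaller = min(len_a, len_b)
    new_indexes = list(range(smaller))
    rng.shuffle(new_indexes)

    a_perm, b_perm = {}, {}
    for group_number, group in enumerate(groups):
        # The permutations result type only supports 2 parties.
        assert len(group) == 2
        (_, a_index), (_, b_index) = sorted(group)
        a_perm[a_index] = b_perm[b_index] = new_indexes[group_number]

    # Unmatched records fill each dataset's remaining positions.
    matched_positions = set(new_indexes[:len(groups)])
    free_a = iter(i for i in range(len_a) if i not in matched_positions)
    free_b = iter(i for i in range(len_b) if i not in matched_positions)
    for i in range(len_a):
        if i not in a_perm:
            a_perm[i] = next(free_a)
    for i in range(len_b):
        if i not in b_perm:
            b_perm[i] = next(free_b)

    # The mask holder learns only which positions hold a match.
    mask = [1 if pos in matched_positions else 0 for pos in range(smaller)]
    return a_perm, b_perm, mask
```

Because matched records land at the same position in both permutations, whoever holds the mask can count matches without learning which original records matched.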
2 changes: 1 addition & 1 deletion backend/entityservice/tasks/run.py
@@ -3,7 +3,7 @@
from entityservice.cache import progress as progress_cache
from entityservice.cache.active_runs import set_run_state_active, is_run_missing
from entityservice.database import DBConn, check_project_exists, get_run, get_run_state_for_update
from entityservice.database import update_run_set_started, get_dataprovider_ids
from entityservice.database import update_run_set_started
from entityservice.errors import RunDeleted, ProjectDeleted
from entityservice.tasks.base_task import TracedTask
from entityservice.tasks.comparing import create_comparison_jobs
1 change: 0 additions & 1 deletion backend/entityservice/tasks/solver.py
@@ -5,7 +5,6 @@
from entityservice.settings import Config as config
from entityservice.tasks.base_task import TracedTask
from entityservice.tasks.permutation import save_and_permute
from entityservice.utils import similarity_matrix_from_csv_bytes


@celery.task(base=TracedTask, ignore_result=True, args_as_tags=('project_id', 'run_id'))
15 changes: 3 additions & 12 deletions backend/entityservice/tests/conftest.py
@@ -4,8 +4,7 @@
import requests as requests_library
import itertools

from entityservice.tests.util import create_project_upload_fake_data, delete_project, temporary_blank_project, \
create_project_no_data
from entityservice.tests.util import create_project_upload_fake_data, delete_project, create_project_no_data

THROTTLE_SLEEP = 0.2

@@ -29,7 +28,7 @@ def delay_next(r, *args, **kwargs):
#
# - pairs of dataset sizes
# - overlap of the sizes
# - result_type in ['mapping', 'similarity_scores', 'permutations']
# - result_type for 2 parties in ['similarity_scores', 'permutations'] and for more parties in ['groups']
# - threshold

ENVVAR_NAME = 'ENTITY_SERVICE_RUN_SLOW_TESTS'
@@ -70,7 +69,7 @@ def delay_next(r, *args, **kwargs):
itertools.product(SIZES_2P, OVERLAPS, ENCODING_SIZES))
PROJECT_PARAMS_NP = tuple(
itertools.product(SIZES_NP, OVERLAPS, ENCODING_SIZES))
PROJECT_RESULT_TYPES_2P = ['mapping', 'similarity_scores', 'permutations']
PROJECT_RESULT_TYPES_2P = ['similarity_scores', 'permutations']
PROJECT_RESULT_TYPES_NP = ['groups']
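The parameter grids above are full Cartesian products of the test dimensions; a small sketch with invented values shows how each tuple becomes one fixture parametrization:

```python
import itertools

# Invented example values; the real grids live in conftest.py.
SIZES_2P = [(100, 100), (1000, 1000)]
OVERLAPS = [0.5, 0.9]
ENCODING_SIZES = [128]

# One (size_pair, overlap, encoding_size) tuple per fixture param.
PROJECT_PARAMS_2P = tuple(
    itertools.product(SIZES_2P, OVERLAPS, ENCODING_SIZES))
```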


Expand Down Expand Up @@ -104,14 +103,6 @@ def create_project_response(requests, size, overlap, result_type, encoding_size=
return project


@pytest.fixture(scope='function', params=PROJECT_PARAMS_2P)
def mapping_project(request, requests):
size, overlap, encoding_size = request.param
prj = create_project_response(requests, size, overlap, 'mapping', encoding_size)
yield prj
delete_project(requests, prj)


@pytest.fixture(scope='function', params=PROJECT_PARAMS_2P)
def similarity_scores_project(request, requests):
size, overlap, encoding_size = request.param
2 changes: 1 addition & 1 deletion backend/entityservice/tests/test_project_run_listing.py
@@ -21,7 +21,7 @@ def test_list_run_invalid_auth(requests):


def test_list_run_after_posting_runs(requests):
with temporary_blank_project(requests, result_type='mapping') as project:
with temporary_blank_project(requests, result_type='groups') as project:

for i in range(1, 11):
run_id = post_run(requests, project, 0.95)
11 changes: 1 addition & 10 deletions backend/entityservice/tests/test_project_run_results.py
@@ -1,13 +1,4 @@
from entityservice.tests.util import create_project_no_data, post_run, get_run_result, wait_approx_run_time


def test_run_mapping_results(requests, mapping_project):
run_id = post_run(requests, mapping_project, 0.95)
wait_approx_run_time(mapping_project['size'])

result = get_run_result(requests, mapping_project, run_id, timeout=120)
assert 'mapping' in result
assert isinstance(result['mapping'], dict)
from entityservice.tests.util import create_project_no_data, post_run, get_run_result


def test_run_similarity_score_results(requests, similarity_scores_project, threshold):
13 changes: 9 additions & 4 deletions backend/entityservice/tests/test_project_run_status.py
@@ -3,12 +3,17 @@
create_project_no_data, ensure_run_progressing


def test_run_status_with_clks_2p(requests, mapping_project):
size = mapping_project['size']
ensure_run_progressing(requests, mapping_project, size)
def test_permutations_run_status_with_clks_2p(requests, permutations_project):
Contributor Author (@gusmith, Nov 1, 2019)
I may not have understood the reason for these tests: they were done only on mapping but not on permutations and similarity scores... so I removed the mapping one but added the other two.

size = permutations_project['size']
ensure_run_progressing(requests, permutations_project, size)


def test_run_status_with_clks_np(requests, groups_project):
def test_similarity_scores_run_status_with_clks_2p(requests, similarity_scores_project):
size = similarity_scores_project['size']
ensure_run_progressing(requests, similarity_scores_project, size)


def test_groups_run_status_with_clks_np(requests, groups_project):
size = groups_project['size']
ensure_run_progressing(requests, groups_project, size)

20 changes: 3 additions & 17 deletions backend/entityservice/tests/test_project_uploads.py
@@ -49,15 +49,7 @@ def test_project_binary_data_uploaded(requests, valid_project_params):
run_id = post_run(requests, new_project_data, 0.99)
result = get_run_result(requests, new_project_data, run_id, wait=True)

if valid_project_params['result_type'] == 'mapping':
assert 'mapping' in result

# Since we uploaded the same file it should have identified the
# same rows as matches
for i in range(1, 1000):
assert str(i) in result['mapping']
assert result['mapping'][str(i)] == str(i)
elif valid_project_params['result_type'] == 'groups':
if valid_project_params['result_type'] == 'groups':
assert 'groups' in result
groups = result['groups']
assert len(groups) == 1000
@@ -101,10 +93,7 @@ def test_project_binary_data_upload_with_different_encoded_size(

run_id = post_run(requests, new_project_data, 0.99)
result = get_run_result(requests, new_project_data, run_id, wait=True)
if valid_project_params['result_type'] == 'mapping':
assert 'mapping' in result
assert result['mapping']['499'] == '0'
elif valid_project_params['result_type'] == 'groups':
if valid_project_params['result_type'] == 'groups':
assert 'groups' in result
groups = result['groups']
groups_set = {frozenset(map(tuple, group)) for group in groups}
@@ -127,10 +116,7 @@ def test_project_json_data_upload_with_various_encoded_sizes(

run_id = post_run(requests, new_project_data, 0.9)
result = get_run_result(requests, new_project_data, run_id, wait=True)
if result_type == 'mapping':
assert 'mapping' in result
assert len(result['mapping']) >= 400
elif result_type == 'groups':
if result_type == 'groups':
assert 'groups' in result
# This is a pretty bad bound, but we're not testing the
# accuracy.
25 changes: 3 additions & 22 deletions backend/entityservice/tests/test_results_correctness.py
@@ -26,7 +26,6 @@ def the_truth(scope='module'):
sims, _, (rec_is_a, rec_is_b) = candidate_pairs

groups = anonlink.solving.greedy_solve(candidate_pairs)
mapping = dict(anonlink.solving.pairs_from_groups(groups))

similarity_scores = {(a, b): sim
for sim, a, b in zip(sims, rec_is_a, rec_is_b)}
@@ -35,7 +34,6 @@
'entity_ids_b': entity_ids_b,
'similarity_scores': similarity_scores,
'groups': groups,
'mapping': mapping,
'threshold': threshold,
'clks_a': clks_a,
'clks_b': clks_b}
@@ -62,25 +60,6 @@
delete_project(requests, project_data)


def test_mapping(requests, the_truth):
project_data, _ = create_project_upload_data(
requests,
(the_truth['clks_a'], the_truth['clks_b']),
result_type='mapping')
run = post_run(requests, project_data, threshold=the_truth['threshold'])
result = get_run_result(requests, project_data, run)
# compare mapping with the truth
mapping = {int(k): int(result['mapping'][k]) for k in result['mapping']}

# NB: Anonlink is more strict on enforcing the k parameter, so there
# is a small chance the below won't hold. This should only be the
# case for more noisy problems.
assert mapping.keys() == the_truth['mapping'].keys()
for key, value in mapping.items():
assert value == the_truth['mapping'][key]
assert the_truth['entity_ids_a'][key] == the_truth['entity_ids_b'][value]


def test_permutation(requests, the_truth):
project_data, (r_a, r_b) = create_project_upload_data(
requests,
@@ -93,7 +72,9 @@ def test_permutation(requests, the_truth):
# compare permutations and mask against mapping of the truth
permutation_a = inverse_of_permutation(perm_a_result['permutation'])
permutation_b = inverse_of_permutation(perm_b_result['permutation'])
mapping = the_truth['mapping']
groups = the_truth['groups']
# Use a mapping output to simplify the checking.
mapping = dict(anonlink.solving.pairs_from_groups(groups))

# NB: Anonlink is more strict on enforcing the k parameter, so there
# is a small chance the below won't hold. This should only be the
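The permutation test above inverts each returned permutation before comparing positions. A sketch of that helper (the real `inverse_of_permutation` lives in the project's test utilities; this version assumes a permutation given as a list mapping old index to new index):

```python
def inverse_of_permutation(perm):
    """Invert perm, where perm[old_index] == new_index, returning the
    list that maps new_index back to old_index."""
    inverse = [None] * len(perm)
    for old_index, new_index in enumerate(perm):
        inverse[new_index] = old_index
    return inverse
```

Applying the inverse to both parties' permutations lets the test line up positions: wherever the mask is 1, the records that the inverses place at that position should form a matched pair.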