From 887f5961f059a7ebc435cf027049cf8784e2eb5e Mon Sep 17 00:00:00 2001 From: Guillaume Smith Date: Mon, 4 Nov 2019 16:03:41 +1100 Subject: [PATCH 01/10] First step, serialize the similarity output to include the groups format. --- backend/entityservice/serialization.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/backend/entityservice/serialization.py b/backend/entityservice/serialization.py index eb328aa3..ff2cab0e 100644 --- a/backend/entityservice/serialization.py +++ b/backend/entityservice/serialization.py @@ -128,7 +128,8 @@ def generate_scores(candidate_pair_stream: typing.BinaryIO): """ sims, (dset_is0, dset_is1), (rec_is0, rec_is1) = anonlink.serialization.load_candidate_pairs(candidate_pair_stream) - cs_sims_iter = (f'{rec_i0}, {rec_i1}, {sim}' for sim, rec_i0, rec_i1 in zip(sims, rec_is0, rec_is1)) + cs_sims_iter = (f'"group": [[{dset_i0}, {rec_i0}], [{dset_i1}, {rec_i1}]], "sim": {sim}' + for sim, dset_i0, dset_i1, rec_i0, rec_i1 in zip(sims, dset_is0, dset_is1, rec_is0, rec_is1)) yield '{"similarity_scores": [' line_iter = iter(cs_sims_iter) @@ -140,11 +141,11 @@ def generate_scores(candidate_pair_stream: typing.BinaryIO): return for line in line_iter: - yield '[{}],'.format(prev_line.strip()) + yield '{{{}}},'.format(prev_line.strip()) prev_line = line # Yield the last line without a trailing comma, instead close the json object - yield '[{}]'.format(prev_line.strip()) + yield '{{{}}}'.format(prev_line.strip()) yield ']}' From 4ffb171b6e8a59f6997d1b33c882b6b7ee34e525 Mon Sep 17 00:00:00 2001 From: Guillaume Smith Date: Wed, 13 Nov 2019 15:54:19 +1100 Subject: [PATCH 02/10] Update tutorial for new similarity score format. Note that the tutorial does not work as long as the deployed entity-service is not updated with the current changes. 
--- docs/tutorial/Similarity Scores.ipynb | 158 +++++++++++--------------- 1 file changed, 67 insertions(+), 91 deletions(-) diff --git a/docs/tutorial/Similarity Scores.ipynb b/docs/tutorial/Similarity Scores.ipynb index 63174a6d..4b9723fe 100644 --- a/docs/tutorial/Similarity Scores.ipynb +++ b/docs/tutorial/Similarity Scores.ipynb @@ -34,7 +34,7 @@ }, { "cell_type": "code", - "execution_count": 1, + "execution_count": 12, "metadata": { "pycharm": { "is_executing": false @@ -67,7 +67,7 @@ }, { "cell_type": "code", - "execution_count": 2, + "execution_count": 13, "metadata": { "pycharm": { "is_executing": false @@ -78,18 +78,18 @@ "name": "stdout", "output_type": "stream", "text": [ - "Testing anonlink-entity-service hosted at https://testing.es.data61.xyz\n" + "Testing anonlink-entity-service hosted at http://0.0.0.0:8851\n" ] } ], "source": [ - "url = os.getenv(\"SERVER\", \"https://testing.es.data61.xyz\")\n", + "url = os.getenv(\"SERVER\", \"https://es.testing.data61.xyz\")\n", "print(f'Testing anonlink-entity-service hosted at {url}')" ] }, { "cell_type": "code", - "execution_count": 3, + "execution_count": 14, "metadata": { "pycharm": { "is_executing": false @@ -100,7 +100,7 @@ "name": "stdout", "output_type": "stream", "text": [ - "{\"project_count\": 2115, \"rate\": 7737583, \"status\": \"ok\"}\r\n" + "{\"project_count\": 0, \"rate\": 1, \"status\": \"ok\"}\r\n" ] } ], @@ -123,7 +123,7 @@ }, { "cell_type": "code", - "execution_count": 4, + "execution_count": 15, "metadata": { "pycharm": { "is_executing": false @@ -137,7 +137,7 @@ }, { "cell_type": "code", - "execution_count": 5, + "execution_count": 16, "metadata": { "pycharm": { "is_executing": false @@ -254,7 +254,7 @@ "rec-4405-org 4365168 " ] }, - "execution_count": 5, + "execution_count": 16, "metadata": {}, "output_type": "execute_result" } @@ -288,7 +288,7 @@ }, { "cell_type": "code", - "execution_count": 6, + "execution_count": 17, "metadata": { "pycharm": { "is_executing": false @@ -301,7 +301,7 
@@ }, { "cell_type": "code", - "execution_count": 7, + "execution_count": 18, "metadata": { "pycharm": { "is_executing": false @@ -312,7 +312,7 @@ "name": "stdout", "output_type": "stream", "text": [ - "Overwriting /tmp/tmpvlivqdcf\n" + "Overwriting /tmp/tmpt3tdin9z\n" ] } ], @@ -405,7 +405,7 @@ }, { "cell_type": "code", - "execution_count": 8, + "execution_count": 19, "metadata": { "pycharm": { "is_executing": false @@ -416,20 +416,20 @@ "name": "stdout", "output_type": "stream", "text": [ - "Credentials will be saved in /tmp/tmpcwpvq6kj\n", + "Credentials will be saved in /tmp/tmpoonx5w3z\n", "\u001b[31mProject created\u001b[0m\n" ] }, { "data": { "text/plain": [ - "{'project_id': '1eb3da44f73440c496ab42217381181de55e9dcd6743580c',\n", - " 'result_token': '846c6c25097c7794131de0d3e2c39c04b7de9688acedc383',\n", - " 'update_tokens': ['52aae3f1dfa8a4ec1486d8f7d63a8fe708876b39a8ec585b',\n", - " '92e2c9c1ce52a2c2493b5e22953600735a07553f7d00a704']}" + "{'project_id': '3783f956baf9fee8f5df25d16377bd1d789a1ab3fbc103e6',\n", + " 'result_token': '3d87a15b6f56fb89b58db77a0c89da47b9896a7201c7a5ee',\n", + " 'update_tokens': ['e3476071f4395ae149f2d03e08e1edd7aa983aa1e56e52c6',\n", + " '85aa49ae3b90614b7f34ee65efc4246df17aa51d0caa577a']}" ] }, - "execution_count": 8, + "execution_count": 19, "metadata": {}, "output_type": "execute_result" } @@ -463,7 +463,7 @@ }, { "cell_type": "code", - "execution_count": 9, + "execution_count": 20, "metadata": { "pycharm": { "is_executing": false @@ -474,10 +474,10 @@ "name": "stdout", "output_type": "stream", "text": [ - "generating CLKs: 100%|█| 5.00k/5.00k [00:01<00:00, 1.06kclk/s, mean=883, std=33.6]\n", - "\u001b[31mCLK data written to /tmp/tmpj8m1dvxj.json\u001b[0m\n", - "generating CLKs: 100%|█| 5.00k/5.00k [00:01<00:00, 1.30kclk/s, mean=875, std=39.7]\n", - "\u001b[31mCLK data written to /tmp/tmpi2y_ogl9.json\u001b[0m\n" + "generating CLKs: 100%|█| 5.00k/5.00k [00:00<00:00, 884clk/s, mean=882, std=35.1]\n", + "\u001b[31mCLK data 
written to /tmp/tmp8ppb7r2f.json\u001b[0m\n", + "generating CLKs: 100%|█| 5.00k/5.00k [00:00<00:00, 932clk/s, mean=873, std=42.5]\n", + "\u001b[31mCLK data written to /tmp/tmpu9wy917t.json\u001b[0m\n" ] } ], @@ -506,7 +506,7 @@ }, { "cell_type": "code", - "execution_count": 10, + "execution_count": 21, "metadata": { "pycharm": { "is_executing": false @@ -545,7 +545,7 @@ }, { "cell_type": "code", - "execution_count": 11, + "execution_count": 22, "metadata": { "pycharm": { "is_executing": false @@ -577,7 +577,7 @@ }, { "cell_type": "code", - "execution_count": 12, + "execution_count": 23, "metadata": { "pycharm": { "is_executing": false @@ -616,7 +616,7 @@ }, { "cell_type": "code", - "execution_count": 13, + "execution_count": 24, "metadata": { "pycharm": { "is_executing": false @@ -629,7 +629,7 @@ "text": [ "State: completed\n", "Stage (2/2): compute similarity scores\n", - "Progress: 1.000%\n" + "Progress: 100.00%\n" ] } ], @@ -642,7 +642,7 @@ }, { "cell_type": "code", - "execution_count": 17, + "execution_count": 25, "metadata": {}, "outputs": [], "source": [ @@ -664,7 +664,7 @@ }, { "cell_type": "code", - "execution_count": 18, + "execution_count": 26, "metadata": { "pycharm": { "is_executing": false @@ -675,16 +675,16 @@ "name": "stdout", "output_type": "stream", "text": [ - "[76, 2345, 1.0]\n", - "[83, 3439, 1.0]\n", - "[103, 863, 1.0]\n", - "[154, 2391, 1.0]\n", - "[177, 4247, 1.0]\n", - "[192, 1176, 1.0]\n", - "[270, 4516, 1.0]\n", - "[312, 1253, 1.0]\n", - "[407, 3743, 1.0]\n", - "[670, 3550, 1.0]\n" + "{'group': [[0, 76], [1, 2345]], 'sim': 1.0}\n", + "{'group': [[0, 83], [1, 3439]], 'sim': 1.0}\n", + "{'group': [[0, 103], [1, 863]], 'sim': 1.0}\n", + "{'group': [[0, 154], [1, 2391]], 'sim': 1.0}\n", + "{'group': [[0, 177], [1, 4247]], 'sim': 1.0}\n", + "{'group': [[0, 192], [1, 1176]], 'sim': 1.0}\n", + "{'group': [[0, 270], [1, 4516]], 'sim': 1.0}\n", + "{'group': [[0, 312], [1, 1253]], 'sim': 1.0}\n", + "{'group': [[0, 407], [1, 3743]], 'sim': 1.0}\n", + 
"{'group': [[0, 670], [1, 3550]], 'sim': 1.0}\n" ] } ], @@ -704,7 +704,7 @@ }, { "cell_type": "code", - "execution_count": 19, + "execution_count": 27, "metadata": { "pycharm": { "is_executing": false @@ -714,10 +714,10 @@ { "data": { "text/plain": [ - "1572906" + "1551460" ] }, - "execution_count": 19, + "execution_count": 27, "metadata": {}, "output_type": "execute_result" } @@ -737,7 +737,7 @@ }, { "cell_type": "code", - "execution_count": 20, + "execution_count": 29, "metadata": { "pycharm": { "is_executing": false @@ -746,7 +746,7 @@ "outputs": [ { "data": { - "image/png": "iVBORw0KGgoAAAANSUhEUgAAAYAAAAD8CAYAAAB+UHOxAAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADl0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uIDMuMC4zLCBodHRwOi8vbWF0cGxvdGxpYi5vcmcvnQurowAAElFJREFUeJzt3W+QnWd53/HvD9kmbSG1HG89RhJZN4hpxYsIujWmKY0Lgy3saQVtSkynQbieKpnYM2EmeSGSF05JPeO0BQYmxFMnVjFMwHESUjSxUqM4MDQdjC2DMZZVx4sRYynCViJD8DClkXP1xbkFJ2JXe3b37Dla39/PzJl9zvX8OfelI52fnj/n2VQVkqT+vGjaA5AkTYcBIEmdMgAkqVMGgCR1ygCQpE4ZAJLUKQNAkjplAEhSpwwASerUedMewNlcfPHFNTs7O+1hSNK68tBDD/15Vc0stdw5HQCzs7McPHhw2sOQpHUlyddGWc5DQJLUKQNAkjplAEhSpwwASeqUASBJnTIAJKlTSwZAkh9I8kCSLyU5lOQ/tvplST6fZD7Jbye5oNVf3J7Pt/mzQ9t6d6s/nuTqtWpKkrS0UfYAvgO8oap+FNgO7EhyBfCrwPur6hXAs8ANbfkbgGdb/f1tOZJsA64DXgXsAH49yYZxNiNJGt2SAVADz7Wn57dHAW8AfrfV7wTe0qZ3tue0+W9Mkla/q6q+U1VfBeaBy8fShSRp2Ub6JnD7n/pDwCuADwFfAb5RVafaIkeBTW16E/AUQFWdSvJN4Ida/f6hzQ6vsyZm99yzYP3Irdeu5ctK0row0kngqnq+qrYDmxn8r/0frNWAkuxOcjDJwRMnTqzVy0hS95Z1FVBVfQP4NPA64MIkp/cgNgPH2vQxYAtAm/93gb8Yri+wzvBr3F5Vc1U1NzOz5L2MJEkrNMpVQDNJLmzTfwt4E3CYQRD8RFtsF/DJNr2vPafN/+Oqqla/rl0ldBmwFXhgXI1IkpZnlHMAlwJ3tvMALwLurqo/SPIYcFeS/wR8EbijLX8H8NEk88BJBlf+UFWHktwNPAacAm6squfH244kaVRLBkBVPQK8eoH6kyxwFU9V/V/g3yyyrVuAW5Y/TEnSuPlNYEnqlAEgSZ0yACSpUwaAJHXKAJCkThkAktQpA0CSOmUASFKnDABJ6pQBIEmdMgAkqVMGgCR1ygCQpE4ZAJLUKQNAkjplAEhSpwwASeqUASBJnTIAJKlTBoAkdcoAkKROGQCS1CkDQJI6ZQBIUqcMAEnqlAEgSZ1aMgCSbEny6SSPJTmU5Oda/ZeTHEvycHtcM7TOu5PMJ3k8ydVD9R2tNp9kz9q0JEkaxXkjLHMK+Pmq+kKSlwIPJTnQ5r2/qv7r8MJJtgHXAa8CXg
b8UZJXttkfAt4EHAUeTLKvqh4bRyPLMbvnngXrR269dsIjkaTpWTIAquo4cLxNfyvJYWDTWVbZCdxVVd8BvppkHri8zZuvqicBktzVlp14AEiSlnkOIMks8Grg8610U5JHkuxNsrHVNgFPDa12tNUWq5/5GruTHExy8MSJE8sZniRpGUYOgCQvAX4PeFdV/SVwG/AjwHYGewjvHceAqur2qpqrqrmZmZlxbFKStIBRzgGQ5HwGH/6/VVWfAKiqp4fm/wbwB+3pMWDL0OqbW42z1CVJEzbKVUAB7gAOV9X7huqXDi32VuDRNr0PuC7Ji5NcBmwFHgAeBLYmuSzJBQxOFO8bTxuSpOUaZQ/gx4CfAr6c5OFW+0Xg7Um2AwUcAX4aoKoOJbmbwcndU8CNVfU8QJKbgHuBDcDeqjo0xl4kScswylVAfwJkgVn7z7LOLcAtC9T3n209SdLk+E1gSeqUASBJnTIAJKlTBoAkdcoAkKROGQCS1CkDQJI6ZQBIUqcMAEnqlAEgSZ0yACSpUwaAJHXKAJCkThkAktQpA0CSOmUASFKnDABJ6pQBIEmdMgAkqVMGgCR1ygCQpE4ZAJLUqfOmPYBzyeyeexasH7n12gmPRJLWnnsAktQpA0CSOrVkACTZkuTTSR5LcijJz7X6RUkOJHmi/dzY6knywSTzSR5J8pqhbe1qyz+RZNfatSVJWsooewCngJ+vqm3AFcCNSbYBe4D7qmorcF97DvBmYGt77AZug0FgADcDrwUuB24+HRqSpMlbMgCq6nhVfaFNfws4DGwCdgJ3tsXuBN7SpncCH6mB+4ELk1wKXA0cqKqTVfUscADYMdZuJEkjW9Y5gCSzwKuBzwOXVNXxNuvrwCVtehPw1NBqR1ttsbokaQpGDoAkLwF+D3hXVf3l8LyqKqDGMaAku5McTHLwxIkT49ikJGkBIwVAkvMZfPj/VlV9opWfbod2aD+fafVjwJah1Te32mL1v6Gqbq+quaqam5mZWU4vkqRlGOUqoAB3AIer6n1Ds/YBp6/k2QV8cqj+jnY10BXAN9uhonuBq5JsbCd/r2o1SdIUjPJN4B8Dfgr4cpKHW+0XgVuBu5PcAHwNeFubtx+4BpgHvg1cD1BVJ5P8CvBgW+49VXVyLF1IkpZtyQCoqj8BssjsNy6wfAE3LrKtvcDe5QxQkrQ2/CawJHXKAJCkThkAktQpA0CSOmUASFKnDABJ6pQBIEmdMgAkqVMGgCR1ygCQpE4ZAJLUKQNAkjplAEhSp0a5HXT3Zvfcs2D9yK3XTngkkjQ+7gFIUqcMAEnqlAEgSZ0yACSpUwaAJHXKAJCkThkAktQpA0CSOmUASFKnDABJ6pQBIEmdWjIAkuxN8kySR4dqv5zkWJKH2+OaoXnvTjKf5PEkVw/Vd7TafJI9429FkrQco+wBfBjYsUD9/VW1vT32AyTZBlwHvKqt8+tJNiTZAHwIeDOwDXh7W1aSNCVL3g20qj6bZHbE7e0E7qqq7wBfTTIPXN7mzVfVkwBJ7mrLPrbsEUuSxmI15wBuSvJIO0S0sdU2AU8NLXO01RarS5KmZKUBcBvwI8B24Djw3nENKMnuJAeTHDxx4sS4NitJOsOKAqCqnq6q56vqr4Hf4HuHeY4BW4YW3dxqi9UX2vbtVTVXVXMzMzMrGZ4kaQQr+o1gSS6tquPt6VuB01cI7QM+luR9wMuArcADQICtSS5j8MF/HfBvVzPwc4G/KUzSerZkACT5OHAlcHGSo8DNwJVJtgMFHAF+GqCqDiW5m8HJ3VPAjVX1fNvOTcC9wAZgb1UdGns3kqSRjXIV0NsXKN9xluVvAW5ZoL4f2L+s0UmS1ozfBJakThkAktQpA0CSOmUASFKnDABJ6pQBIEmdMgAkqVMGgCR1ygCQpE4ZAJLUKQNAkjplAEhSpwwASeqUASBJnTIAJKlTBoAkdcoAkKROGQCS1CkDQJI6ZQBIUqcMAEnq1HnTHsAL0eyeexasH7
n12gmPRJIW5x6AJHXKAJCkThkAktQpA0CSOrVkACTZm+SZJI8O1S5KciDJE+3nxlZPkg8mmU/ySJLXDK2zqy3/RJJda9OOJGlUo+wBfBjYcUZtD3BfVW0F7mvPAd4MbG2P3cBtMAgM4GbgtcDlwM2nQ0OSNB1LBkBVfRY4eUZ5J3Bnm74TeMtQ/SM1cD9wYZJLgauBA1V1sqqeBQ7w/aEiSZqglZ4DuKSqjrfprwOXtOlNwFNDyx1ttcXq3yfJ7iQHkxw8ceLECocnSVrKqk8CV1UBNYaxnN7e7VU1V1VzMzMz49qsJOkMKw2Ap9uhHdrPZ1r9GLBlaLnNrbZYXZI0JSsNgH3A6St5dgGfHKq/o10NdAXwzXao6F7gqiQb28nfq1pNkjQlS94LKMnHgSuBi5McZXA1z63A3UluAL4GvK0tvh+4BpgHvg1cD1BVJ5P8CvBgW+49VXXmiWVJ0gRlcAj/3DQ3N1cHDx5c8fqL3ZTtXONN4iSNU5KHqmpuqeX8JrAkdcoAkKROGQCS1CkDQJI6ZQBIUqcMAEnqlAEgSZ0yACSpUwaAJHXKAJCkThkAktQpA0CSOmUASFKnlrwdtNbeYnct9S6hktaSewCS1CkDQJI6ZQBIUqcMAEnqlAEgSZ0yACSpUwaAJHXKAJCkThkAktQpA0CSOmUASFKnVnUvoCRHgG8BzwOnqmouyUXAbwOzwBHgbVX1bJIAHwCuAb4NvLOqvrCa13+h8x5BktbSOPYA/nlVba+qufZ8D3BfVW0F7mvPAd4MbG2P3cBtY3htSdIKrcUhoJ3AnW36TuAtQ/WP1MD9wIVJLl2D15ckjWC1AVDAp5I8lGR3q11SVcfb9NeBS9r0JuCpoXWPtpokaQpW+/sA/mlVHUvy94ADSf7P8MyqqiS1nA22INkN8PKXv3yVw5MkLWZVewBVdaz9fAb4feBy4OnTh3baz2fa4seALUOrb261M7d5e1XNVdXczMzMaoYnSTqLFQdAkr+T5KWnp4GrgEeBfcCuttgu4JNteh/wjgxcAXxz6FCRJGnCVnMI6BLg9wdXd3Ie8LGq+p9JHgTuTnID8DXgbW35/QwuAZ1ncBno9at4bUnSKq04AKrqSeBHF6j/BfDGBeoF3LjS15MkjZe/FH4dWuwLYuCXxCSNzltBSFKnDABJ6pQBIEmdMgAkqVMGgCR1ygCQpE55GegLjL9DQNKo3AOQpE4ZAJLUKQNAkjplAEhSpzwJ3AlPDks6k3sAktQpA0CSOmUASFKnDABJ6pQBIEmd8iqgznl1kNQv9wAkqVPuAWhB7hlIL3zuAUhSpwwASeqUh4C0LB4akl443AOQpE5NfA8gyQ7gA8AG4Der6tZJj0Hj556BtP5MNACSbAA+BLwJOAo8mGRfVT02yXFocgwG6dw16T2Ay4H5qnoSIMldwE7AAOiMwSBN36QDYBPw1NDzo8BrJzwGncMWC4aVMEykszvnrgJKshvY3Z4+l+TxVWzuYuDPVz+qdaW3nhftN7864ZFMTm/vMdjzcv3wKAtNOgCOAVuGnm9ute+qqtuB28fxYkkOVtXcOLa1XvTWc2/9gj33YhI9T/oy0AeBrUkuS3IBcB2wb8JjkCQx4T2AqjqV5CbgXgaXge6tqkOTHIMkaWDi5wCqaj+wf0IvN5ZDSetMbz331i/Ycy/WvOdU1Vq/hiTpHOStICSpU+syAJLsSPJ4kvkkexaY/8NJ7kvySJLPJNk8NG9XkifaY9dkR75yK+05yfYkn0tyqM37ycmPfmVW8z63+T+Y5GiSX5vcqFdnlX+3X57kU0kOJ3ksyewkx75Sq+z5P7e/24eTfDBJJjv65UuyN8kzSR5dZH5aL/Ot59cMzRvv51dVrasHg5PHXwH+PnAB8CVg2xnL/A6wq02/Afhom74IeLL93NimN067pzXu+ZXA1jb9MuA4cOG0e1rLnofmfwD4GPBr0+5nEj0DnwHe1KZfAvztafe0lj0D/wT4320bG4
DPAVdOu6cRev5nwGuARxeZfw3wh0CAK4DPt/rYP7/W4x7Ad28nUVX/Dzh9O4lh24A/btOfHpp/NXCgqk5W1bPAAWDHBMa8Wivuuar+tKqeaNN/BjwDzExk1KuzmveZJP8IuAT41ATGOi4r7jnJNuC8qjoAUFXPVdW3JzPsVVnN+1zADzAIjhcD5wNPr/mIV6mqPgucPMsiO4GP1MD9wIVJLmUNPr/WYwAsdDuJTWcs8yXgX7XptwIvTfJDI657LlpNz9+V5HIG/1i+skbjHKcV95zkRcB7gV9Y81GO12re51cC30jyiSRfTPJf2s0Xz3Ur7rmqPscgEI63x71VdXiNxzsJi/2ZjP3zaz0GwCh+AfjxJF8EfpzBt42fn+6Q1txZe27/g/gocH1V/fV0hjh2i/X8s8D+qjo6zcGtkcV6Pg94fZv/jxkcUnnnlMY4bgv2nOQVwD9kcEeBTcAbkrx+esNcf865ewGNYJTbSfwZ7X8MSV4C/Ouq+kaSY8CVZ6z7mbUc7JisuOf2/AeBe4BfaruU68Fq3ufXAa9P8rMMjoVfkOS5qvq+E4znmNX0fBR4uL53p93/weD48R2TGPgqrKbn/wDcX1XPtXl/CLwO+F+TGPgaWuzPZPyfX9M+IbKCEyjnMTj5cRnfO2n0qjOWuRh4UZu+BXjP0EmUrzI4gbKxTV807Z7WuOcLgPuAd027j0n1fMYy72T9nARezfu8oS0/057/d+DGafe0xj3/JPBHbRvnt7/n/2LaPY3Y9yyLnwS+lr95EviBVh/759fU/yBW+Id3DfCnDI5l/1KrvQf4l236J4An2jK/Cbx4aN1/D8y3x/XT7mWtewb+HfBXwMNDj+3T7met3+ehbaybAFhtzwx+0dIjwJeBDwMXTLufteyZQej9N+Awg98p8r5p9zJivx9ncM7irxgcx78B+BngZ9r8MPjFWV9p7+Xc0Lpj/fzym8CS1KkX6klgSdISDABJ6pQBIEmdMgAkqVMGgCR1ygCQpE4ZAJLUKQNAkjr1/wHNa9U2GtFvqQAAAABJRU5ErkJggg==\n", + "image/png": 
"iVBORw0KGgoAAAANSUhEUgAAAX0AAAD4CAYAAAAAczaOAAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADh0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uMy4xLjEsIGh0dHA6Ly9tYXRwbG90bGliLm9yZy8QZhcZAAASLUlEQVR4nO3df6zdd33f8ecLO6HboI3T3EXBNr3parSaP2qYF8I61gxEYhKthq6jYVoxaTS3aiIVqf3DtH+kg0VKtwEClUZLG4+ACmm60mE17oKbBtFOhMSBEOJ4aS7BKPZM7NaBEqGxJn3vj/NxOZh7fX+cH9c3n+dDOrrf8/7+OJ+3j/06X3+/3/O9qSokSX140WoPQJI0PYa+JHXE0Jekjhj6ktQRQ1+SOrJ+tQdwNhdddFHNzs6u9jAkaU156KGH/rKqZuabd06H/uzsLAcPHlztYUjSmpLkqwvN8/COJHXE0Jekjhj6ktQRQ1+SOrJo6Cf5viQPJPlikkNJ/kOrX5rkc0nmkvxekvNb/cXt+VybPzu0rXe1+uNJrppUU5Kk+S1lT//bwOur6seAbcCOJJcDvwG8v6p+BHgGuL4tfz3wTKu/vy1Hkq3AtcArgR3AbyVZN85mJElnt2jo18Cz7el57VHA64H/3up3AG9u0zvbc9r8NyRJq99ZVd+uqq8Ac8BlY+lCkrQkSzqmn2RdkoeBE8AB4MvA16vqubbIUWBjm94IPAXQ5n8D+MHh+jzrDL/W7iQHkxw8efLk8juSJC1oSaFfVc9X1TZgE4O98388qQFV1W1Vtb2qts/MzPuFMknSCi3rG7lV9fUk9wGvBS5Isr7tzW8CjrXFjgGbgaNJ1gM/APzVUP204XUmYnbP3fPWj9xyzSRfVpLOWUu5emcmyQVt+u8BbwQOA/cBP90W2wV8sk3va89p8/+0Br+eax9wbbu651JgC/DAuBqRJC1uKXv6lwB3tCttXgTcVVV/lOQx4M4k/xH4AnB7W/524KNJ5oBTDK7YoaoOJbkLeAx4Drihqp4fbzuSpLNZNPSr6hHgVfPUn2Seq2+q6v8C/2aBbd0M3Lz8YUqSxsFv5EpSRwx9SeqIoS9JHTH0Jakjhr4kdcTQl6SOGPqS1BFDX5I6YuhLUkcMfUnqiKEvSR0x9CWpI4a+JHXE0Jekjhj6ktQRQ1+SOmLoS1JHDH1J6oihL0kdMfQlqSOGviR1xNCXpI4Y+pLUEUNfkjpi6EtSRxYN/SSbk9yX5LEkh5L8Uqv/epJjSR5uj6uH1nlXkrkkjye5aqi+o9XmkuyZTEuSpIWsX8IyzwG/XFWfT/JS4KEkB9q891fVfxleOMlW4FrglcDLgD9J8oo2+0PAG4GjwINJ9lXVY+NoRJK0uEVDv6qOA8fb9DeTHAY2nmWVncCdVfVt4CtJ5oDL2ry5qnoSIMmdbVlDX5KmZFnH9JPMAq8CPtdKNyZ5JMneJBtabSPw1NBqR1ttobokaUqWcngHgCQvAf4AeGdV/XWSW4H3ANV+vhf4uVEHlGQ3sBvg5S9/+aibm9fsnrvnrR+55ZqJvJ4knSuWtKef5DwGgf+7VfUJgKp6uqqer6q/BX6b7xzCOQZsHlp9U6stVP8uVXVbVW2vqu0zMzPL7UeSdBZLuXonwO3A4ap631D9kqHF3gI82qb3AdcmeXGSS4EtwAPAg8CWJJcmOZ/Byd5942lDkrQUSzm88+PAzwJfSvJwq/0q8LYk2xgc3jkC/DxAVR1KcheDE7TPATdU1fMASW4E7gHWAXur6tAYe5EkLWIpV+/8OZB5Zu0/yzo3AzfPU99/tvUkSZPlN3IlqSOGviR1xNCXpI4Y+pLUEUNfkjpi6EtSRwx9SeqIoS9JHTH0Jakjhr4kdcTQl6SOGPqS1BFDX5I6YuhLUkcMfUnqiKEvSR0x9CWpI4a+JHXE0Jekjhj6ktQRQ1+SOmLoS1JHDH1J6oihL0kdWb/aAzi
XzO65e976kVuumfJIJGky3NOXpI4sGvpJNie5L8ljSQ4l+aVWvzDJgSRPtJ8bWj1JPphkLskjSV49tK1dbfknkuyaXFuSpPksZU//OeCXq2orcDlwQ5KtwB7g3qraAtzbngO8CdjSHruBW2HwIQHcBLwGuAy46fQHhSRpOhYN/ao6XlWfb9PfBA4DG4GdwB1tsTuAN7fpncBHauB+4IIklwBXAQeq6lRVPQMcAHaMtRtJ0lkt65h+klngVcDngIur6nib9TXg4ja9EXhqaLWjrbZQ/czX2J3kYJKDJ0+eXM7wJEmLWHLoJ3kJ8AfAO6vqr4fnVVUBNY4BVdVtVbW9qrbPzMyMY5OSpGZJoZ/kPAaB/7tV9YlWfrodtqH9PNHqx4DNQ6tvarWF6pKkKVnK1TsBbgcOV9X7hmbtA05fgbML+ORQ/e3tKp7LgW+0w0D3AFcm2dBO4F7ZapKkKVnKl7N+HPhZ4EtJHm61XwVuAe5Kcj3wVeCtbd5+4GpgDvgWcB1AVZ1K8h7gwbbcu6vq1Fi6kCQtyaKhX1V/DmSB2W+YZ/kCblhgW3uBvcsZoCRpfPxGriR1xNCXpI4Y+pLUEUNfkjpi6EtSRwx9SeqIoS9JHTH0Jakjhr4kdcTQl6SOGPqS1BFDX5I6YuhLUkcMfUnqiKEvSR1Zyi9R6d7snrvnrR+55Zopj0SSRuOeviR1xNCXpI4Y+pLUEUNfkjpi6EtSRwx9SeqIoS9JHTH0Jakjhr4kdcTQl6SOLBr6SfYmOZHk0aHaryc5luTh9rh6aN67kswleTzJVUP1Ha02l2TP+FuRJC1mKXv6HwZ2zFN/f1Vta4/9AEm2AtcCr2zr/FaSdUnWAR8C3gRsBd7WlpUkTdGiN1yrqs8kmV3i9nYCd1bVt4GvJJkDLmvz5qrqSYAkd7ZlH1v2iCVJKzbKMf0bkzzSDv9saLWNwFNDyxxttYXq3yPJ7iQHkxw8efLkCMOTJJ1ppaF/K/CPgG3AceC94xpQVd1WVduravvMzMy4NitJYoX306+qp09PJ/lt4I/a02PA5qFFN7UaZ6lLkqZkRaGf5JKqOt6evgU4fWXPPuBjSd4HvAzYAjwABNiS5FIGYX8t8G9HGfi5wF+uImmtWTT0k3wcuAK4KMlR4CbgiiTbgAKOAD8PUFWHktzF4ATtc8ANVfV8286NwD3AOmBvVR0aezeSpLNaytU7b5unfPtZlr8ZuHme+n5g/7JGJ0kaK7+RK0kdMfQlqSOGviR1xNCXpI4Y+pLUEUNfkjpi6EtSRwx9SeqIoS9JHTH0Jakjhr4kdcTQl6SOGPqS1BFDX5I6YuhLUkcMfUnqiKEvSR0x9CWpI4a+JHXE0Jekjhj6ktQRQ1+SOrJ+tQfwQjS75+5560duuWbKI5Gk7+aeviR1xNCXpI4Y+pLUkUVDP8neJCeSPDpUuzDJgSRPtJ8bWj1JPphkLskjSV49tM6utvwTSXZNph1J0tksZU//w8COM2p7gHuragtwb3sO8CZgS3vsBm6FwYcEcBPwGuAy4KbTHxSSpOlZNPSr6jPAqTPKO4E72vQdwJuH6h+pgfuBC5JcAlwFHKiqU1X1DHCA7/0gkSRN2EqP6V9cVcfb9NeAi9v0RuCpoeWOttpC9e+RZHeSg0kOnjx5coXDkyTNZ+QTuVVVQI1hLKe3d1tVba+q7TMzM+ParCSJlYf+0+2wDe3niVY/BmweWm5Tqy1UlyRN0UpDfx9w+gqcXcAnh+pvb1fxXA58ox0Guge4MsmGdgL3ylaTJE3RordhSPJx4ArgoiRHGVyFcwtwV5Lrga8Cb22L7weuBuaAbwHXAVTVqSTvAR5sy727qs48OSxJmrBFQ7+q3rbArDfMs2wBNyywnb3A3mWNTpI0Vt5wbYq8EZuk1eZtGCSpI4a+JHXE0Jekjhj6ktQRQ1+SOmLoS1JHDH1J6oihL0kdMfQlqSOGviR1xNCXpI4Y+pLUEUNfkjriXTbPAd59U9K
0uKcvSR0x9CWpI4a+JHXE0Jekjhj6ktQRQ1+SOmLoS1JHDH1J6oihL0kdMfQlqSOGviR1ZKR77yQ5AnwTeB54rqq2J7kQ+D1gFjgCvLWqnkkS4APA1cC3gHdU1edHef0XOu/JI2ncxrGn/y+raltVbW/P9wD3VtUW4N72HOBNwJb22A3cOobXliQtwyQO7+wE7mjTdwBvHqp/pAbuBy5IcskEXl+StIBRQ7+ATyV5KMnuVru4qo636a8BF7fpjcBTQ+sebbXvkmR3koNJDp48eXLE4UmSho16P/1/XlXHkvxD4ECS/z08s6oqSS1ng1V1G3AbwPbt25e1riTp7Eba06+qY+3nCeAPgcuAp08ftmk/T7TFjwGbh1bf1GqSpClZcegn+QdJXnp6GrgSeBTYB+xqi+0CPtmm9wFvz8DlwDeGDgNJkqZglMM7FwN/OLgSk/XAx6rqfyZ5ELgryfXAV4G3tuX3M7hcc47BJZvXjfDakqQVWHHoV9WTwI/NU/8r4A3z1Au4YaWvJ0kanb8YfQ1a6Etb4Be3JJ2dt2GQpI4Y+pLUEUNfkjpi6EtSRwx9SeqIoS9JHfGSzRcY78Ev6Wzc05ekjhj6ktQRQ1+SOmLoS1JHPJHbCU/wSgL39CWpK4a+JHXE0Jekjhj6ktQRT+R2zhO8Ul/c05ekjhj6ktQRD+9oXh72kV6Y3NOXpI4Y+pLUEQ/vaFk87COtbe7pS1JH3NPXWPg/AGltmHroJ9kBfABYB/xOVd0y7TFoevwwkM4tUw39JOuADwFvBI4CDybZV1WPTXMcWn1+GEirY9p7+pcBc1X1JECSO4GdgKEvYOEPg3Hyg0U9m3bobwSeGnp+FHjN8AJJdgO729Nnkzw+wutdBPzlCOuvRb31vOx+8xsTGsn09PYegz0v1w8tNOOcO5FbVbcBt41jW0kOVtX2cWxrreit5976BXvuxaR6nvYlm8eAzUPPN7WaJGkKph36DwJbklya5HzgWmDflMcgSd2a6uGdqnouyY3APQwu2dxbVYcm+JJjOUy0xvTWc2/9gj33YiI9p6omsV1J0jnI2zBIUkcMfUnqyJoM/SQ7kjyeZC7Jnnnm/1CSe5M8kuTTSTYNzduV5In22DXdka/cSntOsi3JZ5McavN+ZvqjX5lR3uc2//uTHE3ym9Mb9WhG/Lv98iSfSnI4yWNJZqc59pUasef/1P5uH07ywSSZ7uiXL8neJCeSPLrA/LRe5lrPrx6aN3p+VdWaejA4Afxl4IeB84EvAlvPWOb3gV1t+vXAR9v0hcCT7eeGNr1htXuacM+vALa06ZcBx4ELVrunSfY8NP8DwMeA31ztfqbRM/Bp4I1t+iXA31/tnibZM/DPgP/VtrEO+CxwxWr3tISe/wXwauDRBeZfDfwxEOBy4HOtPpb8Wot7+n93K4eq+n/A6Vs5DNsK/Gmbvm9o/lXAgao6VVXPAAeAHVMY86hW3HNV/UVVPdGm/w9wApiZyqhHM8r7TJJ/AlwMfGoKYx2XFfecZCuwvqoOAFTVs1X1rekMeySjvM8FfB+DD4sXA+cBT098xCOqqs8Ap86yyE7gIzVwP3BBkksYU36txdCf71YOG89Y5ovAT7XptwAvTfKDS1z3XDRKz38nyWUM/oF8eULjHKcV95zkRcB7gV+Z+CjHa5T3+RXA15N8IskXkvzndoPDc92Ke66qzzL4EDjeHvdU1eEJj3caFvozGUt+rcXQX4pfAX4iyReAn2Dwrd/nV3dIE3fWntuewkeB66rqb1dniGO3UM+/COyvqqOrObgJWajn9cDr2vx/yuBwyTtWaYzjNm/PSX4E+FEG3+zfCLw+yetWb5hrwzl3750lWPRWDu0wxk8BJHkJ8K+r6utJjgFXnLHupyc52DFZcc/t+fcDdwO/1v67uBaM8j6/Fnhdkl9kcGz7/CTPVtX3nCQ8x4zS81Hg4frOHWz/B4PjwbdPY+AjGKXnfw/cX1XPtnl/DLwW+LNpDHy
CFvozGU9+rfZJjRWcBFnP4ATGpXznxM8rz1jmIuBFbfpm4N1DJ0K+wuAkyIY2feFq9zThns8H7gXeudp9TKvnM5Z5B2vnRO4o7/O6tvxMe/7fgBtWu6cJ9/wzwJ+0bZzX/p7/q9XuaYl9z7Lwidxr+O4TuQ+0+ljya9WbX+Ef2NXAXzA4Nv1rrfZu4Cfb9E8DT7Rlfgd48dC6PwfMtcd1q93LpHsG/h3wN8DDQ49tq93PpN/noW2smdAftWcGv5zoEeBLwIeB81e7n0n2zOCD7r8Chxn8To73rXYvS+z34wzOQfwNg+Py1wO/APxCmx8Gv2zqy+293D607sj55W0YJKkjL9QTuZKkeRj6ktQRQ1+SOmLoS1JHDH1J6oihL0kdMfQlqSP/H4oJuc8G6ixUAAAAAElFTkSuQmCC\n", "text/plain": [ "
" ] @@ -758,7 +758,7 @@ } ], "source": [ - "plt.hist([_[2] for _ in data[::100]], bins=50);" + "plt.hist([_['sim'] for _ in data[::100]], bins=50);" ] }, { @@ -772,7 +772,7 @@ }, { "cell_type": "code", - "execution_count": 21, + "execution_count": 31, "metadata": { "pycharm": { "is_executing": false @@ -781,7 +781,7 @@ "outputs": [ { "data": { - "image/png": "iVBORw0KGgoAAAANSUhEUgAAAXoAAAD8CAYAAAB5Pm/hAAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADl0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uIDMuMC4zLCBodHRwOi8vbWF0cGxvdGxpYi5vcmcvnQurowAAEIpJREFUeJzt3XuMpXV9x/H3h10u9cptS8guOLTStPQi0i3FWqtAbLlYl7aI2KYudNONERMba+q2/aOpqQm0qaixMd2IdTH1Qq0WolihC8ReBF3kDlUWCmG3CKsCLSW2Yr/94/yos+sMc2bOnDkzv32/kpPzPL/nOed8f/PsfM5vfs9zzqaqkCT164BJFyBJGi+DXpI6Z9BLUucMeknqnEEvSZ0z6CWpcwa9JHXOoJekzhn0ktS51ZMuAODII4+sqampSZchSSvKzTff/I2qWjPXfssi6KemptixY8eky5CkFSXJg8Ps59SNJHXOoJekzhn0ktQ5g16SOmfQS1LnDHpJ6pxBL0mdM+glqXMGvSR1bll8MnYUU1s+O2P7AxefvcSVSNLy5Ihekjpn0EtS5wx6SercUEGf5IEkdyS5NcmO1nZ4kmuT3NvuD2vtSfK+JDuT3J7kpHF2QJL07OYzoj+1qk6sqvVtfQuwvaqOB7a3dYAzgePbbTPwgcUqVpI0f6NM3WwAtrXlbcA509ovr4EbgUOTHD3C60iSRjBs0BdwTZKbk2xubUdV1cNt+evAUW15LfDQtMfuam17SbI5yY4kO/bs2bOA0iVJwxj2Ovqfr6rdSX4QuDbJv07fWFWVpObzwlW1FdgKsH79+nk9VpI0vKFG9FW1u90/CnwaOBl45JkpmXb/aNt9N3DMtIeva22SpAmYM+iTPDfJ859ZBn4RuBO4CtjYdtsIXNmWrwLe2K6+OQV4YtoUjyRpiQ0zdXMU8Okkz+z/0ar6+yRfBq5Isgl4EDiv7X81cBawE3gKuHDRq5YkDW3OoK+q+4GXzND+TeD0GdoLuGhRqpMkjcxPxkpS5wx6SeqcQS9JnTPoJalzBr0kdc6gl6TOGfSS1DmDXpI6Z9BLUucMeknqnEEvSZ0z6CWpcwa9JHXOoJekzhn0ktQ5g16SOmfQS1LnDHpJ6pxBL0mdM+glqXMGvSR1zqCXpM4Z9JLUOYNekjpn0EtS5wx6SeqcQS9JnTPoJalzBr0kdc6gl6TOGfSS1Lmhgz7JqiS3JPlMWz8uyU1Jdib5RJKDWvvBbX1n2z41ntIlScOYz4j+rcA909YvAS6tqhcDjwGbWvsm4LHWfmnbT5I0IUMFfZJ1wNnAB9t6gNOAT7ZdtgHntOUNbZ22/fS2vyRpAoYd0b8H+D3gf9v6EcDjVfV0W98FrG3La4GHANr2J9r+e0myOcmOJDv27NmzwPIlSXOZM+iTvAZ4tKpuXswXrqqtVbW+qtavWbNmMZ9akjTN6iH2eTnw2iRnAYcALwDeCxyaZHUbta8Ddrf9dwPHALuSrAZeCHxz0SuXJA1lzhF9Vf1+Va2rqingfOC6qvoN4Hrg3LbbRuDKtnxVW6dtv66qalGrliQNbZTr6N8BvC3JTgZz8Je19suAI1r724Ato5UoSRrFMFM3/6
+qbgBuaMv3AyfPsM+3gdctQm2SpEXgJ2MlqXMGvSR1zqCXpM4Z9JLUOYNekjpn0EtS5wx6SeqcQS9JnTPoJalzBr0kdc6gl6TOGfSS1DmDXpI6Z9BLUucMeknqnEEvSZ0z6CWpcwa9JHXOoJekzhn0ktQ5g16SOmfQS1LnDHpJ6pxBL0mdM+glqXOrJ13AuExt+eyM7Q9cfPYSVyJJk+WIXpI6Z9BLUucMeknqnEEvSZ2bM+iTHJLkS0luS3JXkj9u7ccluSnJziSfSHJQaz+4re9s26fG2wVJ0rMZZkT/38BpVfUS4ETgjCSnAJcAl1bVi4HHgE1t/03AY6390rafJGlC5gz6GniyrR7YbgWcBnyytW8DzmnLG9o6bfvpSbJoFUuS5mWoOfokq5LcCjwKXAvcBzxeVU+3XXYBa9vyWuAhgLb9CeCIxSxakjS8oYK+qr5bVScC64CTgR8d9YWTbE6yI8mOPXv2jPp0kqRZzOuqm6p6HLgeeBlwaJJnPlm7DtjdlncDxwC07S8EvjnDc22tqvVVtX7NmjULLF+SNJdhrrpZk+TQtvwDwKuBexgE/rltt43AlW35qrZO235dVdViFi1JGt4w33VzNLAtySoGbwxXVNVnktwNfDzJnwC3AJe1/S8DPpJkJ/At4Pwx1C1JGtKcQV9VtwMvnaH9fgbz9fu2fxt43aJUJ0kamZ+MlaTOGfSS1DmDXpI6Z9BLUucMeknqnEEvSZ0z6CWpcwa9JHXOoJekzhn0ktQ5g16SOmfQS1LnDHpJ6pxBL0mdM+glqXPD/McjXZna8tlZtz1w8dlLWIkkLQ1H9JLUOYNekjpn0EtS5wx6SeqcQS9Jndvvrrp5NrNdkePVOJJWMkf0ktQ5g16SOmfQS1LnDHpJ6pxBL0mdM+glqXMGvSR1zqCXpM4Z9JLUuTmDPskxSa5PcneSu5K8tbUfnuTaJPe2+8Nae5K8L8nOJLcnOWncnZAkzW6YEf3TwO9W1QnAKcBFSU4AtgDbq+p4YHtbBzgTOL7dNgMfWPSqJUlDmzPoq+rhqvpKW/5P4B5gLbAB2NZ22wac05Y3AJfXwI3AoUmOXvTKJUlDmdccfZIp4KXATcBRVfVw2/R14Ki2vBZ4aNrDdrU2SdIEDB30SZ4H/C3wO1X1H9O3VVUBNZ8XTrI5yY4kO/bs2TOfh0qS5mGorylOciCDkP/rqvpUa34kydFV9XCbmnm0te8Gjpn28HWtbS9VtRXYCrB+/fp5vUlIEvjV4sMa5qqbAJcB91TVu6dtugrY2JY3AldOa39ju/rmFOCJaVM8kqQlNsyI/uXAbwJ3JLm1tf0BcDFwRZJNwIPAeW3b1cBZwE7gKeDCRa1YkjQvcwZ9Vf0TkFk2nz7D/gVcNGJdkqRF4idjJalzBr0kdc6gl6TOGfSS1DmDXpI6N9QHpvZ3fihDmqzZfgc1HEf0ktQ5g16SOufUjaTuON26N0f0ktQ5g16SOufUjaT9xv46peOIXpI6Z9BLUucMeknqnHP0kpYNPwE7Ho7oJalzBr0kdc6gl6TOOUcvaWz21+vWlxuDXtJ+r/c3JINe0pLz6pqlZdBLGsqzhXMvI99eeTJWkjpn0EtS55y6GUHvJ3CkYTnnvrw5opekzjmil7QXR+ff08tf7Y7oJalzBr0kdc6gl6TOzTlHn+RDwGuAR6vqJ1rb4cAngCngAeC8qnosSYD3AmcBTwEXVNVXxlP68tXLvJ6kPgwzov8wcMY+bVuA7VV1PLC9rQOcCRzfbpuBDyxOmZKkhZpzRF9VX0gytU/zBuBVbXkbcAPwjtZ+eVUVcGOSQ5McXVUPL1bB0v5usf5i9Oqa/cdC5+iPmhbeXweOastrgYem7bertUmSJmTk6+irqpLUfB+XZDOD6R2OPfbYUcuQNAtH7lroiP6RJEcDtPtHW/tu4Jhp+61rbd+nqrZW1fqqWr9mzZoFliFJmstCR/RXARuBi9v9ldPa35Lk48DPAk84Pz
83r9KRNE7DXF75MQYnXo9Msgv4IwYBf0WSTcCDwHlt96sZXFq5k8HllReOoWapK77Ra9yGuermDbNsOn2GfQu4aNSiJDm3rsXjl5pJC+RIXCuFX4EgSZ1zRL8fcOS5tPx5a7kx6LVfMYS1PzLol5An1xZuUgG9ko7ZSqpVS8ugl1iakDSINSkG/TI27lHsswWPUxlSP7zqRpI654heYzXf6Yql+GtF2t8Y9JqRV6dI/TDoJWmeFjIQmuTgyaDXsjLfKRenaKS5GfSaF6d0pJXHoF+BlmPYOrKWli+DviOGraSZeB29JHXOoJekzhn0ktQ5g16SOmfQS1LnvOpGkhbJcr3yzRG9JHXOoJekzhn0ktQ5g16SOmfQS1LnDHpJ6pxBL0mdM+glqXMGvSR1zqCXpM6NJeiTnJHkq0l2JtkyjteQJA1n0YM+ySrgL4AzgROANyQ5YbFfR5I0nHGM6E8GdlbV/VX1P8DHgQ1jeB1J0hDGEfRrgYemre9qbZKkCZjY1xQn2QxsbqtPJvnqAp/qSOAbi1PVxNmX5aeXfoB9WZZyyUh9edEwO40j6HcDx0xbX9fa9lJVW4Gto75Ykh1VtX7U51kO7Mvy00s/wL4sV0vRl3FM3XwZOD7JcUkOAs4HrhrD60iShrDoI/qqejrJW4DPA6uAD1XVXYv9OpKk4Yxljr6qrgauHsdzz2Dk6Z9lxL4sP730A+zLcjX2vqSqxv0akqQJ8isQJKlzyzro5/oqhSQvSrI9ye1Jbkiybp/tL0iyK8n7l67q7zdKP5J8N8mt7Tbxk9oj9uXYJNckuSfJ3UmmlrL2fS20L0lOnXZMbk3y7STnLH0P9qp1lOPyp0nuasflfUmytNXvVeco/bgkyZ3t9vqlrfz7JflQkkeT3DnL9rSf987Wn5OmbduY5N522zhyMVW1LG8MTuTeB/wQcBBwG3DCPvv8DbCxLZ8GfGSf7e8FPgq8f6X2A3hy0sdiEftyA/Dqtvw84DkrtS/T9jkc+NZK7Qvwc8A/t+dYBXwReNUK7MfZwLUMzjs+l8HVfy+Y1DFpNf0CcBJw5yzbzwI+BwQ4Bbhp2r+p+9v9YW35sFFqWc4j+mG+SuEE4Lq2fP307Ul+GjgKuGYJan02I/VjmVlwX9r3Ha2uqmsBqurJqnpqacqe0WIdl3OBz63gvhRwCINgPRg4EHhk7BXPbJR+nAB8oaqerqr/Am4HzliCmmdVVV9gMAiYzQbg8hq4ETg0ydHALwHXVtW3quoxBm9gI/VlOQf9MF+lcBvwq235V4DnJzkiyQHAnwNvH3uVc1twP9r6IUl2JLlx0tMDjNaXHwEeT/KpJLck+bP2BXiTMupxecb5wMfGUuHwFtyXqvoig8B8uN0+X1X3jLne2YxyTG4DzkjynCRHAqey9wc3l6PZ+rvoXyOznIN+GG8HXpnkFuCVDD6B+13gzcDVVbVrksXNw2z9AHhRDT419+vAe5L88IRqHNZsfVkNvKJt/xkGf55fMKEah/Vsx4U2+vpJBp8ZWe5m7EuSFwM/xuAT7GuB05K8YnJlzmnGflTVNQwu6f4XBm+8X2TasdrfTey7boYw51cpVNW/097dkzwP+LWqejzJy4BXJHkzg7ngg5I8WVWT+G78Bfejbdvd7u9PcgPwUgbzmJMwyjHZBdxaVfe3bX/HYF7ysqUofAYjHZfmPODTVfWdMdc6l1GOy28DN1bVk23b54CXAf+4FIXvY9TflXcB72rbPgp8bQlqHsVs/d0NvGqf9htGeqVJnqyY40TGagYnIY7jeydmfnyffY4EDmjL7wLeOcPzXMBkT8YuuB8MTsQcPG2fe9nn5NQK6suqtv+atv5XwEUrsS/Ttt8InDqpPizScXk98A/tOQ4EtgO/vAL7sQo4oi3/FHAng3NCkz42U8x+MvZs9j4Z+6XWfjjwb+33/7C2fPhIdUz6BzHHD+ksBu/K9wF/2NreCby2LZ/bwu9rwAefCcV9nuMCJhj0o/
SDwRURd7R/8HcAm1byMQFezeAk2R3Ah4GDVnBfphiMvA6Y9DEZ8d/YKuAvgXuAu4F3r9B+HNLqv5vBG/CJy+CYfIzBeY/vMJhn3wS8CXhT2x4G/0nTfe13Yv20x/4WsLPdLhy1Fj8ZK0mdW+knYyVJczDoJalzBr0kdc6gl6TOGfSS1DmDXpI6Z9BLUucMeknq3P8BpdqoH5C0KWEAAAAASUVORK5CYII=\n", + "image/png": "iVBORw0KGgoAAAANSUhEUgAAAXcAAAD4CAYAAAAXUaZHAAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADh0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uMy4xLjEsIGh0dHA6Ly9tYXRwbG90bGliLm9yZy8QZhcZAAAQdklEQVR4nO3de6xlZ13G8e/TmV7k2ts4aWYqp0qN1guljqWISGmD9oJM1VKKRqZ14oRQEwwSGfUPI5Gk1UiBYIgTikyJXCqCbaBIay/BCy2c0nsrdFrbdMbSGehFa4NS/PnHfqtnpufM2eecvc/lne8n2dlrve/ae73vrDnPfs+71l4nVYUkqS8HLXUDJEmjZ7hLUocMd0nqkOEuSR0y3CWpQ6uXugEARx99dE1MTCx1MyRpRbnlllu+VVVrpqtbFuE+MTHB5OTkUjdDklaUJA/NVOe0jCR1yHCXpA4Z7pLUIcNdkjpkuEtShwx3SeqQ4S5JHRoq3JM8mOTOJLclmWxlRya5Nsl97fmIVp4kH0iyI8kdSU4aZwckSc81l5H7a6vqxKra0Na3AtdV1fHAdW0d4Ezg+PbYAnxoVI2VJA1nId9Q3Qic2pa3AzcC72rll9fgr4DclOTwJMdU1SMLaehMJrZ+ftryBy8+exy7k6QVYdiRewHXJLklyZZWtnZKYH8TWNuW1wEPT3ntzla2lyRbkkwmmdyzZ888mi5JmsmwI/efrapdSb4fuDbJv0ytrKpKMqe/11dV24BtABs2bPBv/UnSCA01cq+qXe15N/BZ4GTg0STHALTn3W3zXcCxU16+vpVJkhbJrOGe5PlJXvjsMvDzwF3AVcCmttkm4Mq2fBXwlnbVzCnAk+Oab5ckTW+YaZm1wGeTPLv9x6vq75J8FbgiyWbgIeC8tv3VwFnADuBp4MKRt1qStF+zhntVPQC8bJrybwOnT1NewEUjaZ0kaV78hqokdchwl6QOGe6S1CHDXZI6ZLhLUocMd0nqkOEuSR0y3CWpQ4a7JHXIcJekDhnuktQhw12SOmS4S1KHDHdJ6pDhLkkdMtwlqUOGuyR1yHCXpA4Z7pLUIcNdkjpkuEtShwx3SeqQ4S5JHTLcJalDhrskdchwl6QOGe6S1CHDXZI6ZLhLUocMd0nqkOEuSR0y3CWpQ0OHe5JVSW5N8rm2flySm5PsSPKpJIe08kPb+o5WPzGepkuSZjKXkfvbgXunrF8CXFpVLwUeBza38s3A46380radJGkRDRXuSdYDZwMfbusBTgM+3TbZDpzTlje2dVr96W17SdIiGXbk/j7gd4H/aetHAU9U1TNtfSewri2vAx4GaPVPtu33kmRLkskkk3v27Jln8yVJ05k13JO8HthdVbeMcsdVta2qNlTVhjVr1ozyrSXpgLd6iG1eBbwhyVnAYcCLgPcDhydZ3Ubn64FdbftdwLHAziSrgRcD3x55yyVJM5p15F5Vv1dV66tqAjgfuL6qfg24ATi3bbYJuLItX9XWafXXV1WNtNWSpP1ayHXu7wLekWQHgzn1y1r5ZcBRrfwdwNaFNVGSNFfDTMv8n6q6EbixLT8AnDzNNt8B3jiCtkmS5slvqEpShwx3SeqQ4S5JHTLcJalDhrskdchwl6QOGe6S1CHDXZI6ZLhLUocMd0nqkOEuSR0y3CWpQ4a7JHXIcJekDhnuktQhw12SOmS4S1KHDHdJ6pDhLkkdMtwlqUOGuyR1yHCXpA4Z7
pLUIcNdkjpkuEtShwx3SeqQ4S5JHTLcJalDhrskdchwl6QOrV7qBozLxNbPT1v+4MVnL3JLJGnxOXKXpA7NGu5JDkvylSS3J7k7yR+18uOS3JxkR5JPJTmklR/a1ne0+onxdkGStK9hRu7/BZxWVS8DTgTOSHIKcAlwaVW9FHgc2Ny23ww83sovbdtJkhbRrOFeA0+11YPbo4DTgE+38u3AOW15Y1un1Z+eJCNrsSRpVkPNuSdZleQ2YDdwLXA/8ERVPdM22Qmsa8vrgIcBWv2TwFGjbLQkaf+GCveq+l5VnQisB04GfmShO06yJclkksk9e/Ys9O0kSVPM6WqZqnoCuAF4JXB4kmcvpVwP7GrLu4BjAVr9i4FvT/Ne26pqQ1VtWLNmzTybL0mazjBXy6xJcnhb/j7gdcC9DEL+3LbZJuDKtnxVW6fVX19VNcpGS5L2b5gvMR0DbE+yisGHwRVV9bkk9wCfTPLHwK3AZW37y4CPJdkBPAacP4Z2S5L2Y9Zwr6o7gJdPU/4Ag/n3fcu/A7xxJK2TJM2L31CVpA4Z7pLUIcNdkjpkuEtShwx3SeqQ4S5JHTLcJalDhrskdchwl6QOGe6S1CHDXZI6ZLhLUocMd0nq0DC3/O3KxNbPz1j34MVnL2JLJGl8HLlLUocMd0nqkOEuSR0y3CWpQ4a7JHXIcJekDhnuktQhw12SOmS4S1KHDHdJ6pDhLkkdMtwlqUOGuyR16IC7K+T+zHTHSO8WKWmlceQuSR0y3CWpQ4a7JHXIcJekDs0a7kmOTXJDknuS3J3k7a38yCTXJrmvPR/RypPkA0l2JLkjyUnj7oQkaW/DjNyfAX6nqk4ATgEuSnICsBW4rqqOB65r6wBnAse3xxbgQyNvtSRpv2YN96p6pKq+1pb/A7gXWAdsBLa3zbYD57TljcDlNXATcHiSY0becknSjOY0555kAng5cDOwtqoeaVXfBNa25XXAw1NetrOVSZIWydDhnuQFwN8Av11V/z61rqoKqLnsOMmWJJNJJvfs2TOXl0qSZjFUuCc5mEGw/1VVfaYVP/rsdEt73t3KdwHHTnn5+la2l6raVlUbqmrDmjVr5tt+SdI0hrlaJsBlwL1V9d4pVVcBm9ryJuDKKeVvaVfNnAI8OWX6RpK0CIa5t8yrgF8H7kxyWyv7feBi4Iokm4GHgPNa3dXAWcAO4GngwpG2WJIa7wc1s1nDvar+EcgM1adPs30BFy2wXZKkBfAbqpLUIcNdkjrk/dyH4LyepJXGkbskdciRu6Rlb6bfnjUzR+6S1CHDXZI65LSMpO54EYQjd0nqkuEuSR0y3CWpQ4a7JHXIE6qSDhgH0olWR+6S1CHDXZI65LSMpGXD2wyMjiN3SeqQ4S5JHTLcJalDzrlLGpuVcunhSmnnXBjukhadJ07Hz3CXNJT9BfJKHuH2yjl3SeqQ4S5JHTLcJalDhrskdcgTqgvQ4+VT0nx49cvyY7hL0gxW8gDOaRlJ6pAjd0l7cYqlD47cJalDjtylA5Qj9L45cpekDs06ck/yEeD1wO6q+vFWdiTwKWACeBA4r6oeTxLg/cBZwNPABVX1tfE0XdIwHKEfmIaZlvko8EHg8illW4HrquriJFvb+ruAM4Hj2+MVwIfa8wFlJV8+JakPs07LVNWXgMf2Kd4IbG/L24FzppRfXgM3AYcnOWZUjZUkDWe+c+5rq+qRtvxNYG1bXgc8PGW7na3sOZJsSTKZZHLPnj3zbIYkaToLvlqmqipJzeN124BtABs2bJjz66VezHUaz2k/DWO+4f5okmOq6pE27bK7le8Cjp2y3fpWpv3wh1XSqM13WuYqYFNb3gRcOaX8LRk4BXhyyvSNJGmRDHMp5CeAU4Gjk+wE/hC4GLgiyWbgIeC8tvnVDC6D3MHgUsgLx9BmSdIsZg33qnrzDFWnT7NtARcttFHSSjaqaTavT9dC+A1VSeqQ4S5JHfLGYdJ+7G9qxKuZtJwZ7tI8OSeu5
cxwX8bGfWJuJY08x92HxQjqce/DDxtNZbjrgGIA6kBhuGtZMXyl0TDcF9GogquHaRZJ42W4a0Xzg06anuF+AJvPZX5++1Kan8UeiBjumtZKD9+V3n4tbyvhN0bDXWNlyEpLw3DvyFIGqSEuLS/eW0aSOmS4S1KHDHdJ6pDhLkkdMtwlqUOGuyR1yEshJWlEltMlwY7cJalDhrskdchwl6QOGe6S1CHDXZI6ZLhLUocMd0nqkOEuSR0y3CWpQ4a7JHXIcJekDhnuktShsYR7kjOSfD3JjiRbx7EPSdLMRh7uSVYBfw6cCZwAvDnJCaPejyRpZuMYuZ8M7KiqB6rqv4FPAhvHsB9J0gzGcT/3dcDDU9Z3Aq/Yd6MkW4AtbfWpJF+f5/6OBr41z9cuN/Zl+emlH2BflqVcsqC+vGSmiiX7Yx1VtQ3YttD3STJZVRtG0KQlZ1+Wn176AfZluRpXX8YxLbMLOHbK+vpWJklaJOMI968Cxyc5LskhwPnAVWPYjyRpBiOflqmqZ5L8FvBFYBXwkaq6e9T7mWLBUzvLiH1ZfnrpB9iX5WosfUlVjeN9JUlLyG+oSlKHDHdJ6tCyDvfZbmOQ5CVJrktyR5Ibk6zfp/5FSXYm+eDitfq5FtKPJN9Lclt7LPmJ6QX25QeSXJPk3iT3JJlYzLbva759SfLaKcfktiTfSXLO4vdgr7Yu5Lj8SZK723H5QJIsbuv3audC+nFJkrva402L2/LnSvKRJLuT3DVDfdq/947Wn5Om1G1Kcl97bJpXA6pqWT4YnIy9H/hB4BDgduCEfbb5a2BTWz4N+Ng+9e8HPg58cKX2A3hqqY/FCPtyI/C6tvwC4HkrtS9TtjkSeGyl9gX4GeCf2nusAr4MnLoC+3E2cC2Di0Sez+CqvRct1TFpbfo54CTgrhnqzwK+AAQ4Bbh5yv+pB9rzEW35iLnufzmP3Ie5jcEJwPVt+Yap9Ul+ClgLXLMIbd2fBfVjmZl3X9r9hVZX1bUAVfVUVT29OM2e1qiOy7nAF1ZwXwo4jEGYHgocDDw69hZPbyH9OAH4UlU9U1X/CdwBnLEIbZ5RVX2JwQf/TDYCl9fATcDhSY4BfgG4tqoeq6rHGXxozbkvyzncp7uNwbp9trkd+OW2/EvAC5McleQg4M+Ad469lbObdz/a+mFJJpPctNS/+rOwvvww8ESSzyS5NcmftpvMLZWFHpdnnQ98YiwtHN68+1JVX2YQko+0xxer6t4xt3cmCzkmtwNnJHlekqOB17L3lymXo5n6O8y/w6yWc7gP453Aa5LcCryGwTdhvwe8Dbi6qnYuZePmYKZ+ALykBl9N/lXgfUl+aInaOKyZ+rIaeHWr/2kGv3pfsERtHNb+jgttlPUTDL7TsdxN25ckLwV+lME3ydcBpyV59dI1c1bT9qOqrgGuBv6ZwYftl5lyrA5ES3ZvmSHMehuDqvo32qd4khcAv1JVTyR5JfDqJG9jMLd7SJKnqmop7i0/7360ul3t+YEkNwIvZzAvuRQWckx2ArdV1QOt7m8ZzDNethgNn8aCjktzHvDZqvrumNs6m4Ucl98Ebqqqp1rdF4BXAv+wGA3fx0J/Vt4DvKfVfRz4xiK0eSFm6u8u4NR9ym+c87sv5QmHWU5GrGZwIuE4/v/kyo/ts83RwEFt+T3Au6d5nwtY2hOq8+4Hg5Mph07Z5j72OcG0gvqyqm2/pq3/JXDRSuzLlPqbgNcuVR9GdFzeBPx9e4+DgeuAX1yB/VgFHNWWfxK4i8E5nqU+NhPMfEL1bPY+ofqVVn4k8K/t5/+ItnzknPe91J2f5R/mLAafvvcDf9DK3g28oS2f2wLvG8CHnw3Cfd7jApYw3BfSDwZXMtzZ/pPfCWxeyccEeB2DE113Ah8FDlnBfZlgMMI6aKmPyQL/j60C/gK4F7gHeO8K7cdhrf33MPjQPXEZHJNPMDiP8V0G8+abgbcCb231YfCHje5vPxMbprz2N4Ad7XHhfPbv7
QckqUMr/YSqJGkahrskdchwl6QOGe6S1CHDXZI6ZLhLUocMd0nq0P8C7vqLh/c2MZYAAAAASUVORK5CYII=\n", "text/plain": [ "
" ] @@ -793,7 +793,7 @@ } ], "source": [ - "plt.hist([_[2] for _ in data[::1] if _[2] > 0.94], bins=50);" + "plt.hist([_['sim'] for _ in data[::1] if _['sim'] > 0.94], bins=50);" ] }, { @@ -807,7 +807,7 @@ }, { "cell_type": "code", - "execution_count": 22, + "execution_count": 34, "metadata": { "pycharm": { "is_executing": false @@ -818,21 +818,22 @@ "def sample(data, threshold, num_samples, epsilon=0.01):\n", " samples = []\n", " for row in data:\n", - " if abs(row[2] - threshold) <= epsilon:\n", + " if abs(row['sim'] - threshold) <= epsilon:\n", " samples.append(row)\n", " if len(samples) >= num_samples:\n", " break\n", " return samples\n", "\n", "def lookup_originals(candidate_pair):\n", - " a = dfA.iloc[candidate_pair[0]]\n", - " b = dfB.iloc[candidate_pair[1]]\n", + " a_index, b_index = [x[1] for x in sorted(candidate_pair['group'])]\n", + " a = dfA.iloc[a_index]\n", + " b = dfB.iloc[b_index]\n", " return a, b" ] }, { "cell_type": "code", - "execution_count": 23, + "execution_count": 35, "metadata": { "pycharm": { "is_executing": false @@ -861,7 +862,7 @@ }, { "cell_type": "code", - "execution_count": 24, + "execution_count": 36, "metadata": { "pycharm": { "is_executing": false @@ -902,7 +903,7 @@ }, { "cell_type": "code", - "execution_count": 25, + "execution_count": 37, "metadata": { "pycharm": { "is_executing": false @@ -914,14 +915,14 @@ "output_type": "stream", "text": [ "Proportion of exact matches for each field using threshold: 0.95\n", - "given_name 0.49\n", - "surname 0.57\n", - "street_number 0.81\n", - "address_1 0.55\n", - "address_2 0.44\n", - "suburb 0.70\n", - "postcode 0.84\n", - "state 0.93\n", + "given_name 0.47\n", + "surname 0.55\n", + "street_number 0.77\n", + "address_1 0.56\n", + "address_2 0.45\n", + "suburb 0.69\n", + "postcode 0.85\n", + "state 0.91\n", "date_of_birth 0.84\n", "soc_sec_id 0.92\n", "dtype: float64\n" @@ -931,31 +932,6 @@ "source": [ "look_at_per_field_accuracy(threshold = 0.95, num_samples = 100)" ] - }, - { - 
"cell_type": "code", - "execution_count": 26, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "'0.12.0'" - ] - }, - "execution_count": 26, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] } ], "metadata": { @@ -974,7 +950,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.7.0" + "version": "3.7.3" } }, "nbformat": 4, From 5a9771f12791bb5a5646052c25478a1bf967ab0d Mon Sep 17 00:00:00 2001 From: Guillaume Smith Date: Wed, 13 Nov 2019 16:46:41 +1100 Subject: [PATCH 03/10] Update the tests --- backend/entityservice/tests/test_project_run_results.py | 8 ++++---- backend/entityservice/tests/test_results_correctness.py | 3 ++- backend/entityservice/tests/test_serialization.py | 9 +++++++-- 3 files changed, 13 insertions(+), 7 deletions(-) diff --git a/backend/entityservice/tests/test_project_run_results.py b/backend/entityservice/tests/test_project_run_results.py index d0606500..afcfe9a1 100644 --- a/backend/entityservice/tests/test_project_run_results.py +++ b/backend/entityservice/tests/test_project_run_results.py @@ -5,10 +5,10 @@ def test_run_similarity_score_results(requests, similarity_scores_project, thres run_id = post_run(requests, similarity_scores_project, threshold) result = get_run_result(requests, similarity_scores_project, run_id, timeout=120) assert 'similarity_scores' in result - for index1, index2, score in result['similarity_scores']: - assert 0.0 <= score >= 1.0 - assert 0 <= index1 - assert 0 <= index2 + for candidate_pair in result['similarity_scores']: + assert 0.0 <= candidate_pair['sim'] >= 1.0 + for _, index in candidate_pair['group']: + assert 0 <= index def test_run_permutations_results(requests, permutations_project, threshold): diff --git a/backend/entityservice/tests/test_results_correctness.py 
b/backend/entityservice/tests/test_results_correctness.py index f435f3ed..c174c796 100644 --- a/backend/entityservice/tests/test_results_correctness.py +++ b/backend/entityservice/tests/test_results_correctness.py @@ -48,7 +48,8 @@ def test_similarity_scores(requests, the_truth): result = get_run_result(requests, project_data, run, timeout=60) true_scores = the_truth['similarity_scores'] - result_scores = {(a, b): sim for a, b, sim in result['similarity_scores']} + result_scores = {tuple(index for _, index in sorted(candidate_pair['group'])): candidate_pair['sim'] + for candidate_pair in result['similarity_scores']} # Anonlink is more strict on enforcing the k parameter. Hence the # subset. diff --git a/backend/entityservice/tests/test_serialization.py b/backend/entityservice/tests/test_serialization.py index e1a0c314..a3bb9342 100644 --- a/backend/entityservice/tests/test_serialization.py +++ b/backend/entityservice/tests/test_serialization.py @@ -49,8 +49,13 @@ def test_generate_scores_produces_json(self): json_obj = json.loads(json_str) self.assertIn('similarity_scores', json_obj) assert len(json_obj["similarity_scores"]) == 3 - for score in json_obj["similarity_scores"]: - self.assertEqual(len(score), 3) + for candidate_pair in json_obj["similarity_scores"]: + self.assertIn('group', candidate_pair) + self.assertIn('sim', candidate_pair) + self.assertEqual(len(candidate_pair), 2) + self.assertEqual(len(candidate_pair['group']), 2) + for group in candidate_pair['group']: + self.assertEqual(len(group), 2) def test_sims_to_json_empty(self): sims_iter = ( From ec615d20604f0abea4437cb074f539ab1d9ad4fa Mon Sep 17 00:00:00 2001 From: Guillaume Smith Date: Wed, 13 Nov 2019 17:04:06 +1100 Subject: [PATCH 04/10] And finally the documentation. 
--- backend/entityservice/api_def/swagger.yaml | 17 +++++++++-------- docs/concepts.rst | 6 +++--- 2 files changed, 12 insertions(+), 11 deletions(-) diff --git a/backend/entityservice/api_def/swagger.yaml b/backend/entityservice/api_def/swagger.yaml index 6293dff3..2f658233 100644 --- a/backend/entityservice/api_def/swagger.yaml +++ b/backend/entityservice/api_def/swagger.yaml @@ -509,25 +509,26 @@ paths: ### result_type = "similarity_scores" - The list of the indices of potential matches and their similarity score + The list of the candidate of potential matches and their similarity score where the similarity score is greater than the mapping threshold. Data is returned as `json` object e.g., { "similarity_scores": [ - [5, 27, 1.0], - [14, 10, 1.0] + {'group': [[0, 5], [1, 27]], 'sim': 1.0}, + {'group': [[1, 10], [0, 14]], 'sim': 1.0} ] } - The element in the list is of the following format `[indexA, indexB, score]`, - where `indexA` refers to the index of entity from data provider 1, `indexB` is the index of entity - from data provider 2 that is a potential match to entity in `indexA`, and `score` is the similarity score - representing the likelihood that entity in `indexA` and entity in `indexB` is a match. + The element in the list is of the following format `{'group': [[party_id_0, row_index_0], [party_id_1, row_index_1]], 'sim': score}`, + where the value of `group` is a candidate pair represented in a `group` format, i.e. `[party_id_0, row_index_0]` + refers to the record at the index `row_index_0` from the dataset `party_id_0`, similarly for `[party_id_1, row_index_1]`, + and `score` is the similarity score representing the likelihood that the group is a match. - `indexA` and `indexB` starts from 0. + `ds_index_0`, `rec_index_0, `ds_index_1` and `rec_index_1` start from 0, and `party_id_0 != party_id_1` but + not necessarilly ordered. The value of `score` is between 0.0 and 1.0, where 0.0 corresponds to no match and 1.0 corresponds to total match. 
diff --git a/docs/concepts.rst b/docs/concepts.rst index 336d2b98..e1645b96 100644 --- a/docs/concepts.rst +++ b/docs/concepts.rst @@ -106,14 +106,14 @@ relationships. The ``result_token`` (generated when creating the mapping) is required. The ``result_type`` should be set to ``"similarity_scores"``. -Results are a simple JSON array of arrays:: +Results are a JSON array of JSON objects:: [ - [index_a, index_b, score], + {'group': [[party_id_0, row_index_0], [party_id_1, row_index_1]], 'sim': score}, ... ] -Where the index values will be the 0 based row index from the uploaded CLKs, and +Where the index values will be the 0 based dataset index and row index from the uploaded CLKs, and the score will be a Number between the provided threshold and ``1.0``. A score of ``1.0`` means the CLKs were identical. Threshold values are usually between From 20787132f975e6e37ee08cab5aecffa2677935c6 Mon Sep 17 00:00:00 2001 From: Guillaume Smith Date: Wed, 13 Nov 2019 17:14:03 +1100 Subject: [PATCH 05/10] Update changelog And move some changelogs line to the next version instead of the last alpha release as they have not been integrated in it. 
--- docs/changelog.rst | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/docs/changelog.rst b/docs/changelog.rst index 605a456c..279070c2 100644 --- a/docs/changelog.rst +++ b/docs/changelog.rst @@ -7,12 +7,21 @@ Changelog Next Version ------------ +- fixed a bug where a dataprovider could upload her clks multiple time in a project using the same upload token (#463) +- modify `similiraty_score` output to follow the group format, which will simplify extending this output type to more parties (#464) + +Breaking Change +~~~~~~~~~~~~~~~ + +- the ``dataproviders`` table `uploaded` field has been modified from a BOOL to an ENUM type (#463) +- the `similiraty_score` output type has been modified, it now returns a JSON array of JSON objects, where such an object + looks like `{'group': [[party_id_0, row_index_0], [party_id_1, row_index_1]], 'sim': score}`. + Version 1.13.0-alpha -------------------- - fixed bug where invalid state changes could occur when starting a run (#459) - ``matching`` output type has been removed as redundant with the ``groups`` output with 2 parties. (#458) -- fixed a bug where a dataprovider could upload her clks multiple time in a project using the same upload token (#463) - Update dependencies: @@ -22,7 +31,6 @@ Breaking Change ~~~~~~~~~~~~~~~ - ``matching`` output type is not available anymore. (#458) -- the ``dataproviders`` table `uploaded` field has been modified from a BOOL to an ENUM type (#463) Version 1.12.0 From abc329b851afa6bcb99eb83a7e49ae6c48625db8 Mon Sep 17 00:00:00 2001 From: Guillaume Smith Date: Fri, 15 Nov 2019 16:47:56 +1100 Subject: [PATCH 06/10] Reduce size of output compared to first attempt. Change everything based on Wilko's comment. 
--- backend/entityservice/api_def/swagger.yaml | 19 +-- backend/entityservice/serialization.py | 6 +- .../tests/test_project_run_results.py | 12 +- .../tests/test_results_correctness.py | 4 +- .../entityservice/tests/test_serialization.py | 12 +- docs/changelog.rst | 6 +- docs/concepts.rst | 4 +- docs/tutorial/Similarity Scores.ipynb | 110 ++++++++++-------- 8 files changed, 91 insertions(+), 82 deletions(-) diff --git a/backend/entityservice/api_def/swagger.yaml b/backend/entityservice/api_def/swagger.yaml index 2f658233..989b19bc 100644 --- a/backend/entityservice/api_def/swagger.yaml +++ b/backend/entityservice/api_def/swagger.yaml @@ -509,26 +509,27 @@ paths: ### result_type = "similarity_scores" - The list of the candidate of potential matches and their similarity score - where the similarity score is greater than the mapping threshold. + The list of the pairwise similarity scores where the score + is greater than the similarity threshold. Data is returned as `json` object e.g., { "similarity_scores": [ - {'group': [[0, 5], [1, 27]], 'sim': 1.0}, - {'group': [[1, 10], [0, 14]], 'sim': 1.0} + [[0, 5], [1, 27], 1.0], + [[1, 10], [0, 14], 1.0] ] } - The element in the list is of the following format `{'group': [[party_id_0, row_index_0], [party_id_1, row_index_1]], 'sim': score}`, - where the value of `group` is a candidate pair represented in a `group` format, i.e. `[party_id_0, row_index_0]` - refers to the record at the index `row_index_0` from the dataset `party_id_0`, similarly for `[party_id_1, row_index_1]`, - and `score` is the similarity score representing the likelihood that the group is a match. 
+ The element in the list is a list of three elements of the following format + `[[party_id_0, row_index_0], [party_id_1, row_index_1], score]`, where `[party_id_0, row_index_0]` + refers to the record at the index `row_index_0` from the dataset `party_id_0`, similarly for + `[party_id_1, row_index_1]`, and `score` is the similarity score representing the likelihood + that this pair of records is a match. `ds_index_0`, `rec_index_0, `ds_index_1` and `rec_index_1` start from 0, and `party_id_0 != party_id_1` but - not necessarilly ordered. + are not necessarilly ordered. The value of `score` is between 0.0 and 1.0, where 0.0 corresponds to no match and 1.0 corresponds to total match. diff --git a/backend/entityservice/serialization.py b/backend/entityservice/serialization.py index ff2cab0e..571490e7 100644 --- a/backend/entityservice/serialization.py +++ b/backend/entityservice/serialization.py @@ -128,7 +128,7 @@ def generate_scores(candidate_pair_stream: typing.BinaryIO): """ sims, (dset_is0, dset_is1), (rec_is0, rec_is1) = anonlink.serialization.load_candidate_pairs(candidate_pair_stream) - cs_sims_iter = (f'"group": [[{dset_i0}, {rec_i0}], [{dset_i1}, {rec_i1}]], "sim": {sim}' + cs_sims_iter = (f'[{dset_i0}, {rec_i0}], [{dset_i1}, {rec_i1}], {sim}' for sim, dset_i0, dset_i1, rec_i0, rec_i1 in zip(sims, dset_is0, dset_is1, rec_is0, rec_is1)) yield '{"similarity_scores": [' line_iter = iter(cs_sims_iter) @@ -141,11 +141,11 @@ def generate_scores(candidate_pair_stream: typing.BinaryIO): return for line in line_iter: - yield '{{{}}},'.format(prev_line.strip()) + yield '[{}],'.format(prev_line.strip()) prev_line = line # Yield the last line without a trailing comma, instead close the json object - yield '{{{}}}'.format(prev_line.strip()) + yield '[{}]'.format(prev_line.strip()) yield ']}' diff --git a/backend/entityservice/tests/test_project_run_results.py b/backend/entityservice/tests/test_project_run_results.py index afcfe9a1..9d1b831c 100644 ---
a/backend/entityservice/tests/test_project_run_results.py +++ b/backend/entityservice/tests/test_project_run_results.py @@ -4,11 +4,13 @@ def test_run_similarity_score_results(requests, similarity_scores_project, threshold): run_id = post_run(requests, similarity_scores_project, threshold) result = get_run_result(requests, similarity_scores_project, run_id, timeout=120) - assert 'similarity_scores' in result - for candidate_pair in result['similarity_scores']: - assert 0.0 <= candidate_pair['sim'] >= 1.0 - for _, index in candidate_pair['group']: - assert 0 <= index + for (party_id_1, rec_id_1), (party_id_2, rec_id_2), score in result['similarity_scores']: + assert 0.0 <= score >= 1.0 + assert 0 <= party_id_1 + assert 0 <= party_id_2 + assert party_id_1 != party_id_2 + assert 0 <= rec_id_1 + assert 0 <= rec_id_2 def test_run_permutations_results(requests, permutations_project, threshold): diff --git a/backend/entityservice/tests/test_results_correctness.py b/backend/entityservice/tests/test_results_correctness.py index c174c796..37da082e 100644 --- a/backend/entityservice/tests/test_results_correctness.py +++ b/backend/entityservice/tests/test_results_correctness.py @@ -48,8 +48,8 @@ def test_similarity_scores(requests, the_truth): result = get_run_result(requests, project_data, run, timeout=60) true_scores = the_truth['similarity_scores'] - result_scores = {tuple(index for _, index in sorted(candidate_pair['group'])): candidate_pair['sim'] - for candidate_pair in result['similarity_scores']} + result_scores = {tuple(index for _, index in sorted([a, b])): score + for a, b, score in result['similarity_scores']} # Anonlink is more strict on enforcing the k parameter. Hence the # subset. 
diff --git a/backend/entityservice/tests/test_serialization.py b/backend/entityservice/tests/test_serialization.py index a3bb9342..8dc0e0ef 100644 --- a/backend/entityservice/tests/test_serialization.py +++ b/backend/entityservice/tests/test_serialization.py @@ -49,13 +49,11 @@ def test_generate_scores_produces_json(self): json_obj = json.loads(json_str) self.assertIn('similarity_scores', json_obj) assert len(json_obj["similarity_scores"]) == 3 - for candidate_pair in json_obj["similarity_scores"]: - self.assertIn('group', candidate_pair) - self.assertIn('sim', candidate_pair) - self.assertEqual(len(candidate_pair), 2) - self.assertEqual(len(candidate_pair['group']), 2) - for group in candidate_pair['group']: - self.assertEqual(len(group), 2) + for pair_and_score in json_obj["similarity_scores"]: + self.assertEqual(len(pair_and_score), 3) + a, b, score = pair_and_score + self.assertEqual(len(a), 2) + self.assertEqual(len(b), 2) def test_sims_to_json_empty(self): sims_iter = ( diff --git a/docs/changelog.rst b/docs/changelog.rst index 279070c2..5fc77ca5 100644 --- a/docs/changelog.rst +++ b/docs/changelog.rst @@ -8,14 +8,14 @@ Next Version ------------ - fixed a bug where a dataprovider could upload her clks multiple time in a project using the same upload token (#463) -- modify `similiraty_score` output to follow the group format, which will simplify extending this output type to more parties (#464) +- modify ``similiraty_score`` output to follow the group format, which will simplify extending this output type to more parties (#464) Breaking Change ~~~~~~~~~~~~~~~ - the ``dataproviders`` table `uploaded` field has been modified from a BOOL to an ENUM type (#463) -- the `similiraty_score` output type has been modified, it now returns a JSON array of JSON objects, where such an object - looks like `{'group': [[party_id_0, row_index_0], [party_id_1, row_index_1]], 'sim': score}`. 
+- the ``similiraty_score`` output type has been modified, it now returns a JSON array of JSON objects, where such an object + looks like `[[party_id_0, row_index_0], [party_id_1, row_index_1], score]`. (#464) Version 1.13.0-alpha -------------------- diff --git a/docs/concepts.rst b/docs/concepts.rst index e1645b96..b4bb232f 100644 --- a/docs/concepts.rst +++ b/docs/concepts.rst @@ -106,10 +106,10 @@ relationships. The ``result_token`` (generated when creating the mapping) is required. The ``result_type`` should be set to ``"similarity_scores"``. -Results are a JSON array of JSON objects:: +Results are a JSON array of JSON arrays of three elements:: [ - {'group': [[party_id_0, row_index_0], [party_id_1, row_index_1]], 'sim': score}, + [[party_id_0, row_index_0], [party_id_1, row_index_1], score], ... ] diff --git a/docs/tutorial/Similarity Scores.ipynb b/docs/tutorial/Similarity Scores.ipynb index 4b9723fe..483754a9 100644 --- a/docs/tutorial/Similarity Scores.ipynb +++ b/docs/tutorial/Similarity Scores.ipynb @@ -34,7 +34,7 @@ }, { "cell_type": "code", - "execution_count": 12, + "execution_count": 1, "metadata": { "pycharm": { "is_executing": false @@ -67,7 +67,7 @@ }, { "cell_type": "code", - "execution_count": 13, + "execution_count": 25, "metadata": { "pycharm": { "is_executing": false @@ -78,7 +78,7 @@ "name": "stdout", "output_type": "stream", "text": [ - "Testing anonlink-entity-service hosted at http://0.0.0.0:8851\n" + "Testing anonlink-entity-service hosted at https://es.testing.data61.xyz\n" ] } ], @@ -89,7 +89,7 @@ }, { "cell_type": "code", - "execution_count": 14, + "execution_count": 3, "metadata": { "pycharm": { "is_executing": false @@ -100,7 +100,7 @@ "name": "stdout", "output_type": "stream", "text": [ - "{\"project_count\": 0, \"rate\": 1, \"status\": \"ok\"}\r\n" + "{\"project_count\": 608, \"rate\": 1736579, \"status\": \"ok\"}\r\n" ] } ], @@ -123,7 +123,7 @@ }, { "cell_type": "code", - "execution_count": 15, + "execution_count": 4, "metadata": 
{ "pycharm": { "is_executing": false @@ -137,7 +137,7 @@ }, { "cell_type": "code", - "execution_count": 16, + "execution_count": 5, "metadata": { "pycharm": { "is_executing": false @@ -254,7 +254,7 @@ "rec-4405-org 4365168 " ] }, - "execution_count": 16, + "execution_count": 5, "metadata": {}, "output_type": "execute_result" } @@ -288,7 +288,7 @@ }, { "cell_type": "code", - "execution_count": 17, + "execution_count": 6, "metadata": { "pycharm": { "is_executing": false @@ -301,7 +301,7 @@ }, { "cell_type": "code", - "execution_count": 18, + "execution_count": 7, "metadata": { "pycharm": { "is_executing": false @@ -312,7 +312,7 @@ "name": "stdout", "output_type": "stream", "text": [ - "Overwriting /tmp/tmpt3tdin9z\n" + "Overwriting /tmp/tmpv1v83v52\n" ] } ], @@ -405,7 +405,7 @@ }, { "cell_type": "code", - "execution_count": 19, + "execution_count": 9, "metadata": { "pycharm": { "is_executing": false @@ -416,20 +416,20 @@ "name": "stdout", "output_type": "stream", "text": [ - "Credentials will be saved in /tmp/tmpoonx5w3z\n", + "Credentials will be saved in /tmp/tmpungswy3f\n", "\u001b[31mProject created\u001b[0m\n" ] }, { "data": { "text/plain": [ - "{'project_id': '3783f956baf9fee8f5df25d16377bd1d789a1ab3fbc103e6',\n", - " 'result_token': '3d87a15b6f56fb89b58db77a0c89da47b9896a7201c7a5ee',\n", - " 'update_tokens': ['e3476071f4395ae149f2d03e08e1edd7aa983aa1e56e52c6',\n", - " '85aa49ae3b90614b7f34ee65efc4246df17aa51d0caa577a']}" + "{'project_id': '1ef6a2279d2cea3b37c7257c9c84d6235ae197c8a1acc970',\n", + " 'result_token': '6eb05268c2693b2ba97a0d9908dfafa13e458356f9deaa27',\n", + " 'update_tokens': ['c64624a63b1fb18ee36c5f2f0cddbf1ec1ad2383b1d5ffa0',\n", + " '4cf104ae92d2fa6c3f276de1bfb0d43d380b6cf8fd152d6f']}" ] }, - "execution_count": 19, + "execution_count": 9, "metadata": {}, "output_type": "execute_result" } @@ -463,7 +463,7 @@ }, { "cell_type": "code", - "execution_count": 20, + "execution_count": 10, "metadata": { "pycharm": { "is_executing": false @@ -474,10 
+474,10 @@ "name": "stdout", "output_type": "stream", "text": [ - "generating CLKs: 100%|█| 5.00k/5.00k [00:00<00:00, 884clk/s, mean=882, std=35.1]\n", - "\u001b[31mCLK data written to /tmp/tmp8ppb7r2f.json\u001b[0m\n", - "generating CLKs: 100%|█| 5.00k/5.00k [00:00<00:00, 932clk/s, mean=873, std=42.5]\n", - "\u001b[31mCLK data written to /tmp/tmpu9wy917t.json\u001b[0m\n" + "generating CLKs: 100%|█| 5.00k/5.00k [00:01<00:00, 834clk/s, mean=882, std=35.1]\n", + "\u001b[31mCLK data written to /tmp/tmp0d26hlis.json\u001b[0m\n", + "generating CLKs: 100%|█| 5.00k/5.00k [00:00<00:00, 894clk/s, mean=873, std=42.5]\n", + "\u001b[31mCLK data written to /tmp/tmpi4v3ua27.json\u001b[0m\n" ] } ], @@ -506,7 +506,7 @@ }, { "cell_type": "code", - "execution_count": 21, + "execution_count": 11, "metadata": { "pycharm": { "is_executing": false @@ -545,7 +545,7 @@ }, { "cell_type": "code", - "execution_count": 22, + "execution_count": 12, "metadata": { "pycharm": { "is_executing": false @@ -577,7 +577,7 @@ }, { "cell_type": "code", - "execution_count": 23, + "execution_count": 13, "metadata": { "pycharm": { "is_executing": false @@ -616,7 +616,7 @@ }, { "cell_type": "code", - "execution_count": 24, + "execution_count": 14, "metadata": { "pycharm": { "is_executing": false @@ -642,7 +642,7 @@ }, { "cell_type": "code", - "execution_count": 25, + "execution_count": 15, "metadata": {}, "outputs": [], "source": [ @@ -664,7 +664,7 @@ }, { "cell_type": "code", - "execution_count": 26, + "execution_count": 16, "metadata": { "pycharm": { "is_executing": false @@ -675,16 +675,16 @@ "name": "stdout", "output_type": "stream", "text": [ - "{'group': [[0, 76], [1, 2345]], 'sim': 1.0}\n", - "{'group': [[0, 83], [1, 3439]], 'sim': 1.0}\n", - "{'group': [[0, 103], [1, 863]], 'sim': 1.0}\n", - "{'group': [[0, 154], [1, 2391]], 'sim': 1.0}\n", - "{'group': [[0, 177], [1, 4247]], 'sim': 1.0}\n", - "{'group': [[0, 192], [1, 1176]], 'sim': 1.0}\n", - "{'group': [[0, 270], [1, 4516]], 'sim': 1.0}\n", - 
"{'group': [[0, 312], [1, 1253]], 'sim': 1.0}\n", - "{'group': [[0, 407], [1, 3743]], 'sim': 1.0}\n", - "{'group': [[0, 670], [1, 3550]], 'sim': 1.0}\n" + "[[0, 76], [1, 2345], 1.0]\n", + "[[0, 83], [1, 3439], 1.0]\n", + "[[0, 103], [1, 863], 1.0]\n", + "[[0, 154], [1, 2391], 1.0]\n", + "[[0, 177], [1, 4247], 1.0]\n", + "[[0, 192], [1, 1176], 1.0]\n", + "[[0, 270], [1, 4516], 1.0]\n", + "[[0, 312], [1, 1253], 1.0]\n", + "[[0, 407], [1, 3743], 1.0]\n", + "[[0, 670], [1, 3550], 1.0]\n" ] } ], @@ -704,7 +704,7 @@ }, { "cell_type": "code", - "execution_count": 27, + "execution_count": 17, "metadata": { "pycharm": { "is_executing": false @@ -717,7 +717,7 @@ "1551460" ] }, - "execution_count": 27, + "execution_count": 17, "metadata": {}, "output_type": "execute_result" } @@ -737,7 +737,7 @@ }, { "cell_type": "code", - "execution_count": 29, + "execution_count": 18, "metadata": { "pycharm": { "is_executing": false @@ -758,7 +758,7 @@ } ], "source": [ - "plt.hist([_['sim'] for _ in data[::100]], bins=50);" + "plt.hist([score for _, _, score in data[::100]], bins=50);" ] }, { @@ -772,7 +772,7 @@ }, { "cell_type": "code", - "execution_count": 31, + "execution_count": 20, "metadata": { "pycharm": { "is_executing": false @@ -793,7 +793,7 @@ } ], "source": [ - "plt.hist([_['sim'] for _ in data[::1] if _['sim'] > 0.94], bins=50);" + "plt.hist([score for _, _, score in data[::1] if score > 0.94], bins=50);" ] }, { @@ -807,7 +807,7 @@ }, { "cell_type": "code", - "execution_count": 34, + "execution_count": 21, "metadata": { "pycharm": { "is_executing": false @@ -818,14 +818,15 @@ "def sample(data, threshold, num_samples, epsilon=0.01):\n", " samples = []\n", " for row in data:\n", - " if abs(row['sim'] - threshold) <= epsilon:\n", + " if abs(row[2] - threshold) <= epsilon:\n", " samples.append(row)\n", " if len(samples) >= num_samples:\n", " break\n", " return samples\n", "\n", "def lookup_originals(candidate_pair):\n", - " a_index, b_index = [x[1] for x in 
sorted(candidate_pair['group'])]\n", + " a, b, score = candidate_pair\n", + " a_index, b_index = [x[1] for x in sorted([a, b])]\n", " a = dfA.iloc[a_index]\n", " b = dfB.iloc[b_index]\n", " return a, b" @@ -833,7 +834,7 @@ }, { "cell_type": "code", - "execution_count": 35, + "execution_count": 22, "metadata": { "pycharm": { "is_executing": false @@ -862,7 +863,7 @@ }, { "cell_type": "code", - "execution_count": 36, + "execution_count": 23, "metadata": { "pycharm": { "is_executing": false @@ -903,7 +904,7 @@ }, { "cell_type": "code", - "execution_count": 37, + "execution_count": 24, "metadata": { "pycharm": { "is_executing": false @@ -932,6 +933,13 @@ "source": [ "look_at_per_field_accuracy(threshold = 0.95, num_samples = 100)" ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] } ], "metadata": { From 2c712e3e6c635b1c919f0dfabb163085ac9efd0d Mon Sep 17 00:00:00 2001 From: Guillaume Smith Date: Fri, 15 Nov 2019 16:51:17 +1100 Subject: [PATCH 07/10] Re-introduced an assertion in test deleted by mistake. 
--- backend/entityservice/tests/test_project_run_results.py | 1 + 1 file changed, 1 insertion(+) diff --git a/backend/entityservice/tests/test_project_run_results.py b/backend/entityservice/tests/test_project_run_results.py index 9d1b831c..1b08b10f 100644 --- a/backend/entityservice/tests/test_project_run_results.py +++ b/backend/entityservice/tests/test_project_run_results.py @@ -4,6 +4,7 @@ def test_run_similarity_score_results(requests, similarity_scores_project, threshold): run_id = post_run(requests, similarity_scores_project, threshold) result = get_run_result(requests, similarity_scores_project, run_id, timeout=120) + assert 'similarity_scores' in result for (party_id_1, rec_id_1), (party_id_2, rec_id_2), score in result['similarity_scores']: assert 0.0 <= score >= 1.0 assert 0 <= party_id_1 From 73938e6a060ad8746547ca79d8b66dc09545638e Mon Sep 17 00:00:00 2001 From: Guillaume Smith Date: Mon, 18 Nov 2019 11:09:24 +1100 Subject: [PATCH 08/10] Rephrase description of `score` in similarity output type. --- backend/entityservice/api_def/swagger.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/backend/entityservice/api_def/swagger.yaml b/backend/entityservice/api_def/swagger.yaml index 989b19bc..4955a8f2 100644 --- a/backend/entityservice/api_def/swagger.yaml +++ b/backend/entityservice/api_def/swagger.yaml @@ -531,8 +531,8 @@ paths: `ds_index_0`, `rec_index_0, `ds_index_1` and `rec_index_1` start from 0, and `party_id_0 != party_id_1` but are not necessarilly ordered. - The value of `score` is between 0.0 and 1.0, where 0.0 corresponds to no match - and 1.0 corresponds to total match. + The value of `score` is between 0.0 and 1.0. The higher the score, the higher the similarity between + the compared CLKs. ### result_type = "permutations" From 67b9f737b290bba7aee7bd21a0f4d133bdd3f9e6 Mon Sep 17 00:00:00 2001 From: Guillaume Smith Date: Mon, 18 Nov 2019 11:09:47 +1100 Subject: [PATCH 09/10] Correct typos.
--- docs/changelog.rst | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/changelog.rst b/docs/changelog.rst index 5fc77ca5..3e6be439 100644 --- a/docs/changelog.rst +++ b/docs/changelog.rst @@ -8,13 +8,13 @@ Next Version ------------ - fixed a bug where a dataprovider could upload her clks multiple time in a project using the same upload token (#463) -- modify ``similiraty_score`` output to follow the group format, which will simplify extending this output type to more parties (#464) +- modify ``similarity_score`` output to follow the group format, which will simplify extending this output type to more parties (#464) Breaking Change ~~~~~~~~~~~~~~~ - the ``dataproviders`` table `uploaded` field has been modified from a BOOL to an ENUM type (#463) -- the ``similiraty_score`` output type has been modified, it now returns a JSON array of JSON objects, where such an object +- the ``similarity_score`` output type has been modified, it now returns a JSON array of JSON arrays, where each such array looks like `[[party_id_0, row_index_0], [party_id_1, row_index_1], score]`. (#464) Version 1.13.0-alpha From cce3cac84e13f5977adb32bcd2b57bbbe926e29c Mon Sep 17 00:00:00 2001 From: Guillaume Smith Date: Tue, 19 Nov 2019 16:38:17 +1100 Subject: [PATCH 10/10] Add a note about the tutorials usage. They may not work with the currently deployed service because of breaking changes. --- docs/tutorial/index.rst | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/docs/tutorial/index.rst b/docs/tutorial/index.rst index a57629bc..2998dbe3 100644 --- a/docs/tutorial/index.rst +++ b/docs/tutorial/index.rst @@ -12,6 +12,18 @@ Tutorials multiparty-linkage-in-entity-service.ipynb +Usage +----- + +The code is often evolving and may include some breaking changes not yet deployed in our testing deployment (at the +URL https://testing.es.data61.xyz ).
So to run the tutorials, you can either: + + - use the tutorials from the `master` branch of this repository which will work with the currently deployed testing service, + - or build and deploy the service from the same branch as the tutorials you would like to run, providing its URL to + the tutorials via the environment variable `SERVER` (e.g. `SERVER=http://0.0.0.0:8851` if deployed locally). + +Other use-cases are not supported and may fail for non-obvious reasons. + External Tutorials ------------------