From 8f33dd1bb3f606b39e1213a84c1d7d08cb8bcd8d Mon Sep 17 00:00:00 2001 From: wilko Date: Wed, 20 Nov 2019 17:03:26 +1100 Subject: [PATCH 01/12] fixed links such they might work with nbsphinx --- docs/tutorial/Permutations.ipynb | 311 +++++++++---------------------- 1 file changed, 86 insertions(+), 225 deletions(-) diff --git a/docs/tutorial/Permutations.ipynb b/docs/tutorial/Permutations.ipynb index e30adabc..e17285b9 100644 --- a/docs/tutorial/Permutations.ipynb +++ b/docs/tutorial/Permutations.ipynb @@ -23,15 +23,15 @@ "### Steps\n", "These steps are usually run by different companies - but for illustration all is carried out in this one file. The participants providing data are _Alice_ and *Bob*, and the *Analyst* acting the integration authority.\n", "\n", - "* [Check connection to Entity Service](#check_con)\n", - "* [Data preparation](#data_prep)\n", + "* [Check connection to Entity Service](#Check-Connection)\n", + "* [Data preparation](#Data-preparation)\n", " * Write CSV files with PII\n", - " * [Create a Linkage Schema](#schema_prep)\n", - "* [Create Linkage Project](#create_pro)\n", - "* [Generate CLKs from PII](#hash_n_up)\n", - "* [Upload the PII](#hash_n_up)\n", - "* [Create a run](#create_run)\n", - "* [Retrieve and analyse results](#results)" + " * [Create a Linkage Schema](#Schema-Preparation)\n", + "* [Create Linkage Project](#Create-Linkage-Project)\n", + "* [Generate CLKs from PII](#Hash-and-Upload)\n", + "* [Upload the PII](#Hash-and-Upload)\n", + "* [Create a run](#Create-a-run)\n", + "* [Retrieve and analyse results](#Results)" ] }, { @@ -40,7 +40,6 @@ "pycharm": {} }, "source": [ - "\n", "## Check Connection\n", "\n", "> If you're connecting to a custom entity service, change the address here." @@ -82,7 +81,7 @@ "name": "stdout", "output_type": "stream", "text": [ - "{\"project_count\": 6534, \"rate\": 2504556, \"status\": \"ok\"}\r\n" + "{\"project_count\": 6539, \"rate\": 2530484, \"status\": \"ok\"}\r\n" ] } ], @@ -96,7 +95,6 @@ "pycharm": {} }, "source": [ - "\n", "## Data preparation\n", "\n", "Following the [clkhash tutorial](http://clkhash.readthedocs.io/en/latest/tutorial_cli.html) we will use a dataset from the `recordlinkage` library. We will just write both datasets out to temporary CSV files.\n" @@ -173,7 +171,7 @@ " \n", " \n", " \n", - " rec-1070-org\n", + " rec-1070-org\n", " michaela\n", " neumann\n", " 8\n", @@ -186,7 +184,7 @@ " 5304218\n", " \n", " \n", - " rec-1016-org\n", + " rec-1016-org\n", " courtney\n", " painter\n", " 12\n", @@ -199,7 +197,7 @@ " 4066625\n", " \n", " \n", - " rec-4405-org\n", + " rec-4405-org\n", " charles\n", " green\n", " 38\n", @@ -262,9 +260,7 @@ "pycharm": {} }, "source": [ - "\n", "## Schema Preparation\n", - "\n", "The linkage schema must be agreed on by the two parties. A hashing schema instructs clkhash how to treat each column for generating CLKs. A detailed description of the hashing schema can be found in the [api docs](http://clkhash.readthedocs.io/en/latest/schema.html). We will ignore the columns ‘rec_id’ and ‘soc_sec_id’ for CLK generation." ] }, @@ -294,23 +290,26 @@ "name": "stdout", "output_type": "stream", "text": [ - "Overwriting /tmp/tmptm0w938k\n" + "Overwriting /var/folders/mw/21b9jb5d1c9_3_z0dq7hpx1m00j_0b/T/tmp5qsl5x48\n" ] } ], "source": [ "%%writefile {schema.name}\n", "{\n", - " \"version\": 3,\n", + " \"version\": 1,\n", " \"clkConfig\": {\n", " \"l\": 1024,\n", - " \"xor_folds\": 0,\n", + " \"k\": 30,\n", + " \"hash\": {\n", + " \"type\": \"doubleHash\"\n", + " },\n", " \"kdf\": {\n", " \"type\": \"HKDF\",\n", " \"hash\": \"SHA256\",\n", - " \"info\": \"c2NoZW1hX2V4YW1wbGU=\",\n", - " \"salt\": \"SCbL2zHNnmsckfzchsNkZY9XoHk96P/G5nUBrM7ybymlEFsMV6PAeDZCNp3rfNUPCtLDMOGQHG4pCQpfhiHCyA==\",\n", - " \"keySize\": 64\n", + " \"info\": \"c2NoZW1hX2V4YW1wbGU=\",\n", + " \"salt\": \"SCbL2zHNnmsckfzchsNkZY9XoHk96P/G5nUBrM7ybymlEFsMV6PAeDZCNp3rfNUPCtLDMOGQHG4pCQpfhiHCyA==\",\n", + " \"keySize\": 64\n", " }\n", " },\n", " \"features\": [\n", @@ -320,189 +319,48 @@ " },\n", " {\n", " \"identifier\": \"given_name\",\n", - " \"format\": {\n", - " \"type\": \"string\",\n", - " \"encoding\": \"utf-8\"\n", - " },\n", - " \"hashing\": {\n", - " \"strategy\": {\n", - " \"bitsPerToken\": 30\n", - " },\n", - " \"hash\": {\n", - " \"type\": \"doubleHash\"\n", - " },\n", - " \"comparison\": {\n", - " \"type\": \"ngram\",\n", - " \"n\": 2,\n", - " \"positional\": false\n", - " }\n", - " }\n", + " \"format\": { \"type\": \"string\", \"encoding\": \"utf-8\" },\n", + " \"hashing\": { \"ngram\": 2, \"weight\": 1 }\n", " },\n", " {\n", " \"identifier\": \"surname\",\n", - " \"format\": {\n", - " \"type\": \"string\",\n", - " \"encoding\": \"utf-8\"\n", - " },\n", - " \"hashing\": {\n", - " \"strategy\": {\n", - " \"bitsPerToken\": 30\n", - " },\n", - " \"hash\": {\n", - " \"type\": \"doubleHash\"\n", - " },\n", - " \"comparison\": {\n", - " \"type\": \"ngram\",\n", - " \"n\": 2,\n", - " \"positional\": false\n", - " }\n", - " }\n", + " \"format\": { \"type\": \"string\", \"encoding\": \"utf-8\" },\n", + " \"hashing\": { \"ngram\": 2, \"weight\": 1 }\n", " },\n", " {\n", " \"identifier\": \"street_number\",\n", - " \"format\": {\n", - " \"type\": \"integer\"\n", - " },\n", - " \"hashing\": {\n", - " \"missingValue\": {\n", - " \"sentinel\": \"\"\n", - " },\n", - " \"strategy\": {\n", - " \"bitsPerToken\": 15\n", - " },\n", - " \"hash\": {\n", - " \"type\": \"doubleHash\"\n", - " },\n", - " \"comparison\": {\n", - " \"type\": \"ngram\",\n", - " \"n\": 1,\n", - " \"positional\": true\n", - " }\n", - " }\n", + " \"format\": { \"type\": \"integer\" },\n", + " \"hashing\": { \"ngram\": 1, \"positional\": true, \"weight\": 0.5, \"missingValue\": {\"sentinel\": \"\"} }\n", " },\n", " {\n", " \"identifier\": \"address_1\",\n", - " \"format\": {\n", - " \"type\": \"string\",\n", - " \"encoding\": \"utf-8\"\n", - " },\n", - " \"hashing\": {\n", - " \"strategy\": {\n", - " \"bitsPerToken\": 15\n", - " },\n", - " \"hash\": {\n", - " \"type\": \"doubleHash\"\n", - " },\n", - " \"comparison\": {\n", - " \"type\": \"ngram\",\n", - " \"n\": 2,\n", - " \"positional\": false\n", - " }\n", - " }\n", + " \"format\": { \"type\": \"string\", \"encoding\": \"utf-8\" },\n", + " \"hashing\": { \"ngram\": 2, \"weight\": 0.5 }\n", " },\n", " {\n", " \"identifier\": \"address_2\",\n", - " \"format\": {\n", - " \"type\": \"string\",\n", - " \"encoding\": \"utf-8\"\n", - " },\n", - " \"hashing\": {\n", - " \"strategy\": {\n", - " \"bitsPerToken\": 15\n", - " },\n", - " \"hash\": {\n", - " \"type\": \"doubleHash\"\n", - " },\n", - " \"comparison\": {\n", - " \"type\": \"ngram\",\n", - " \"n\": 2,\n", - " \"positional\": false\n", - " }\n", - " }\n", + " \"format\": { \"type\": \"string\", \"encoding\": \"utf-8\" },\n", + " \"hashing\": { \"ngram\": 2, \"weight\": 0.5 }\n", " },\n", " {\n", " \"identifier\": \"suburb\",\n", - " \"format\": {\n", - " \"type\": \"string\",\n", - " \"encoding\": \"utf-8\"\n", - " },\n", - " \"hashing\": {\n", - " \"strategy\": {\n", - " \"bitsPerToken\": 15\n", - " },\n", - " \"hash\": {\n", - " \"type\": \"doubleHash\"\n", - " },\n", - " \"comparison\": {\n", - " \"type\": \"ngram\",\n", - " \"n\": 2,\n", - " \"positional\": false\n", - " }\n", - " }\n", + " \"format\": { \"type\": \"string\", \"encoding\": \"utf-8\" },\n", + " \"hashing\": { \"ngram\": 2, \"weight\": 0.5 }\n", " },\n", " {\n", " \"identifier\": \"postcode\",\n", - " \"format\": {\n", - " \"type\": \"integer\",\n", - " \"minimum\": 100,\n", - " \"maximum\": 9999\n", - " },\n", - " \"hashing\": {\n", - " \"strategy\": {\n", - " \"bitsPerToken\": 15\n", - " },\n", - " \"hash\": {\n", - " \"type\": \"doubleHash\"\n", - " },\n", - " \"comparison\": {\n", - " \"type\": \"ngram\",\n", - " \"n\": 1,\n", - " \"positional\": true\n", - " }\n", - " }\n", + " \"format\": { \"type\": \"integer\", \"minimum\": 100, \"maximum\": 9999 },\n", + " \"hashing\": { \"ngram\": 1, \"positional\": true, \"weight\": 0.5 }\n", " },\n", " {\n", " \"identifier\": \"state\",\n", - " \"format\": {\n", - " \"type\": \"string\",\n", - " \"encoding\": \"utf-8\",\n", - " \"maxLength\": 3\n", - " },\n", - " \"hashing\": {\n", - " \"strategy\": {\n", - " \"bitsPerToken\": 30\n", - " },\n", - " \"hash\": {\n", - " \"type\": \"doubleHash\"\n", - " },\n", - " \"comparison\": {\n", - " \"type\": \"ngram\",\n", - " \"n\": 2,\n", - " \"positional\": false\n", - " }\n", - " }\n", + " \"format\": { \"type\": \"string\", \"encoding\": \"utf-8\", \"maxLength\": 3 },\n", + " \"hashing\": { \"ngram\": 2, \"weight\": 1 }\n", " },\n", " {\n", " \"identifier\": \"date_of_birth\",\n", - " \"format\": {\n", - " \"type\": \"integer\"\n", - " },\n", - " \"hashing\": {\n", - " \"missingValue\": {\n", - " \"sentinel\": \"\"\n", - " },\n", - " \"strategy\": {\n", - " \"bitsPerToken\": 30\n", - " },\n", - " \"hash\": {\n", - " \"type\": \"doubleHash\"\n", - " },\n", - " \"comparison\": {\n", - " \"type\": \"ngram\",\n", - " \"n\": 1,\n", - " \"positional\": true\n", - " }\n", - " }\n", + " \"format\": { \"type\": \"integer\" },\n", + " \"hashing\": { \"ngram\": 1, \"positional\": true, \"weight\": 1, \"missingValue\": {\"sentinel\": \"\"} }\n", " },\n", " {\n", " \"identifier\": \"soc_sec_id\",\n", @@ -518,7 +376,6 @@ "pycharm": {} }, "source": [ - "\n", "## Create Linkage Project\n", "\n", "The analyst carrying out the linkage starts by creating a linkage project of the desired output type with the Entity Service.\n" @@ -537,17 +394,17 @@ "name": "stdout", "output_type": "stream", "text": [ - "Credentials will be saved in /tmp/tmptneh9xy1\n", + "Credentials will be saved in /var/folders/mw/21b9jb5d1c9_3_z0dq7hpx1m00j_0b/T/tmpdo3x0629\n", "\u001b[31mProject created\u001b[0m\n" ] }, { "data": { "text/plain": [ - "{'project_id': '12256e29a8ad92c9016ba3e7650888f13d3bfb3bd23cc98a',\n", - " 'result_token': '1a588d384f651e9430ac1bb42196f9fe393ff10e8ec65f48',\n", - " 'update_tokens': ['6111c582a0d6a649480c719adcd258b811da17887849ee00',\n", - " '4239370ce8868a9eb3dc85a85eca243bf593a0cc637a5be8']}" + "{'project_id': 'fbb0845d2063e5cefe9153ebeacf42921418038a11c104ef',\n", + " 'result_token': '635bd95ab7c4d834bdf811aed0c026a81cd4944ba66ffc15',\n", + " 'update_tokens': ['ab2f33eef06d045db454d4fbc7821ea5971970beafede1be',\n", + " '13dc3ac340a2b51c78400a301fbaebc819022e5d231bb4a7']}" ] }, "execution_count": 7, @@ -578,7 +435,6 @@ "source": [ "**Note:** the analyst will need to pass on the `project_id` (the id of the linkage project) and one of the two `update_tokens` to each data provider.\n", "\n", - "\n", "## Hash and Upload\n", "\n", "At the moment both data providers have *raw* personally identiy information. We first have to generate CLKs from the raw entity information. We need:\n", @@ -602,8 +458,8 @@ "name": "stdout", "output_type": "stream", "text": [ - "\u001b[31mCLK data written to /tmp/tmp9vdauwh4.json\u001b[0m\n", - "\u001b[31mCLK data written to /tmp/tmpgspffags.json\u001b[0m\n" + "\u001b[31mCLK data written to /var/folders/mw/21b9jb5d1c9_3_z0dq7hpx1m00j_0b/T/tmpbd2u9qhd.json\u001b[0m\n", + "\u001b[31mCLK data written to /var/folders/mw/21b9jb5d1c9_3_z0dq7hpx1m00j_0b/T/tmp5al8agy7.json\u001b[0m\n" ] } ], @@ -743,7 +599,6 @@ "pycharm": {} }, "source": [ - "\n", "## Create a run\n", "\n", "Now the project has been created and the CLK data has been uploaded we can carry out some privacy preserving record linkage. Try with a few different threshold values:" @@ -776,7 +631,6 @@ "pycharm": {} }, "source": [ - "\n", "## Results\n", "\n", "Now after some delay (depending on the size) we can fetch the mask.\n", @@ -964,7 +818,7 @@ { "data": { "text/plain": [ - "[2418, 3590, 2340, 1226, 1323, 251, 4696, 2598, 4019, 301]" + "[1886, 1657, 2212, 4844, 3960, 125, 1791, 2770, 4888, 1367]" ] }, "execution_count": 20, @@ -998,7 +852,7 @@ { "data": { "text/plain": [ - "[3183, 4293, 3406, 2808, 4528, 2446, 4606, 1601, 1641, 2062]" + "[4498, 3092, 520, 3834, 1074, 4519, 997, 991, 4505, 2389]" ] }, "execution_count": 21, @@ -1072,16 +926,16 @@ { "data": { "text/plain": [ - "['rec-3933-org,joshua,rigley,19,east place,kergunyah,kingaroy,3665,vic,19670613,4096438\\n',\n", - " 'rec-1057-org,samara,pringle,7,allan street,bonnie doon,campbelltown,5073,nsw,19560429,3493586\\n',\n", - " 'rec-4035-org,chloe,worm,6,brentnall place,donna valley,karloo,3128,nsw,19000814,9383057\\n',\n", - " 'rec-3793-org,lucy,mccarthy,29,charlton street,warrah lea,bundaberg,4061,qld,19940917,6596660\\n',\n", - " 'rec-27-org,angelina,campbell,161,jackie howe crescent,bugoren,woorim,6052,nsw,19531108,8948230\\n',\n", - " 'rec-2303-org,tahlia,hage,3,maclaurin crescent,,ormond,4740,tas,19190517,6174860\\n',\n", - " 'rec-658-org,david,hobson,14,vagabond crescent,dugout 65,patterson lakes,4880,wa,19010305,7666240\\n',\n", - " 'rec-4484-org,alexandra,clarke,15,parnell road,rsdb 284,nedlands,4014,sa,19890608,7235143\\n',\n", - " 'rec-702-org,barnaby,fleet,4,martley circuit,peak view,ascot vale,3930,sa,19360907,9383837\\n',\n", - " 'rec-3252-org,,campbell,4,dunbar street,delicate nobby street,cloverdale,2528,vic,19480406,8607518\\n']" + "['rec-1225-org,hayden,ballantyne,13,,nunnook,young,2077,nsw,19330812,3414771\\n',\n", + " 'rec-4635-org,isabella,white,8,cooling place,,rosebud,6151,sa,19990911,2206317\\n',\n", + " 'rec-1790-org,bailey,heuer,65,fossey street,brindabella specialist centre,vaucluse,2010,qld,19511013,9539538\\n',\n", + " 'rec-2882-org,sarah,eglinton,19,beasley street,bandaroo,naracoorte,4021,nsw,19451107,4310446\\n',\n", + " 'rec-3521-org,spencer,bates-brownsword,151,pinkerton circuit,tora,smithfield,4860,nsw,19810308,5402648\\n',\n", + " 'rec-2055-org,tai,garven,21,finniss crescent,donette downs,pymble,2035,nsw,19930723,6253715\\n',\n", + " 'rec-1529-org,zachariah,campbell,32,gellibrand street,carowood,keswick,3148,vic,19271210,2544494\\n',\n", + " 'rec-1817-org,noah,boyle,11,dooland court,,flowerdale,3163,vic,19260331,2019310\\n',\n", + " 'rec-4200-org,lara,sekuless,82,loch street,,yarraville,3196,qld,19861129,1392776\\n',\n", + " 'rec-1541-org,jessica,paine,58,eddison place,pine hut,new farm,2022,vic,19661210,8315488\\n']" ] }, "execution_count": 24, @@ -1105,16 +959,16 @@ { "data": { "text/plain": [ - "['rec-3933-dup-0,joshua,rigly,19,east place,kergunyah,kingaroy,3665,vic,19670613,4096438\\n',\n", - " 'rec-1057-dup-0,pringle,samara,7,allan street,bonnie doon,campbelltown,5073,nsw,19560429,3493586\\n',\n", - " 'rec-4035-dup-0,chooe,worm,6,brentnal place,donna valley,karloo,3128,nsw,19000814,9383057\\n',\n", - " 'rec-3793-dup-0,mccarthy,lucy,29,charltonstreet,warrahlea,bundaverg,4061,qld,19940917,6596660\\n',\n", - " 'rec-27-dup-0,angelina,campbell,190,jackie howe crescent,bugoren,woorim,6352,nsw,19531108,8948230\\n',\n", - " 'rec-2303-dup-0,peter,ha ge,3,maclaurin crescent,,ormond,4704,tas,19190517,6174860\\n',\n", - " 'rec-658-dup-0,david,hobsson,14,vagabond cfescent,dugout 65,patterson lakes,4880,wa,19010305,7666240\\n',\n", - " 'rec-4484-dup-0,alexandra,clarke,15,rsd b 284,parnell roa,,4014,sa,19890608,7235143\\n',\n", - " 'rec-702-dup-0,barnay,fleet,4,martley circuit,peak view,ascot vale,3930,sa,19360907,9383837\\n',\n", - " 'rec-3252-dup-0,,campbell,4,dunbar svtreet,delicate nobby street,cloverdale,2528,vic,19480406,8607518\\n']" + "['rec-1225-dup-0,hayden,ballantyne,13,,,young,2077,nsw,19330812,3414771\\n',\n", + " 'rec-4635-dup-0,isaeblla,white,8,cooling place,massey green,rosebud,6151,sa,19990911,2206317\\n',\n", + " 'rec-1790-dup-0,shannon,heurr,65,fossey street,brindabella specialist centre,vaucluse,2010,qld,19511013,9539538\\n',\n", + " 'rec-2882-dup-0,sarah,eglinton,19,beasleyz street,,naraocorte,4012,nsw,19451107,4310446\\n',\n", + " 'rec-3521-dup-0,spencer,bates-brownsword,151,tora,pinkerton circuit,smithfield,4860,nsw,19810308,5402648\\n',\n", + " 'rec-2055-dup-0,taiz,garven,,finniss crescent,donetted owns,pymble,2035,nsw,19930723,6253715\\n',\n", + " 'rec-1529-dup-0,ebonie,campbell,32,gellibrand street,carowood,kessick,3148,vic,19271210,2544494\\n',\n", + " 'rec-1817-dup-0,noah,boyle,11,doolandcouhrt,,flowerdale,3163,vic,19260331,7756654\\n',\n", + " 'rec-4200-dup-0,lara,sekuless,9,loch sutreet,,yarraville,3196,qld,19861129,1392776\\n',\n", + " 'rec-1541-dup-0,jessica,paine,58,eddisonv place,pine hut,new farm,2022,vic,19661210,8315488\\n']" ] }, "execution_count": 25, @@ -1152,16 +1006,16 @@ "name": "stdout", "output_type": "stream", "text": [ - "Joshua Rigley (rec-3933-org) =? Joshua Rigly (rec-3933-dup-0)\n", - "Samara Pringle (rec-1057-org) =? Pringle Samara (rec-1057-dup-0)\n", - "Chloe Worm (rec-4035-org) =? Chooe Worm (rec-4035-dup-0)\n", - "Lucy Mccarthy (rec-3793-org) =? Mccarthy Lucy (rec-3793-dup-0)\n", - "Angelina Campbell (rec-27-org) =? Angelina Campbell (rec-27-dup-0)\n", - "Tahlia Hage (rec-2303-org) =? Peter Ha Ge (rec-2303-dup-0)\n", - "David Hobson (rec-658-org) =? David Hobsson (rec-658-dup-0)\n", - "Alexandra Clarke (rec-4484-org) =? Alexandra Clarke (rec-4484-dup-0)\n", - "Barnaby Fleet (rec-702-org) =? Barnay Fleet (rec-702-dup-0)\n", - " Campbell (rec-3252-org) =? Campbell (rec-3252-dup-0)\n" + "Hayden Ballantyne (rec-1225-org) =? Hayden Ballantyne (rec-1225-dup-0)\n", + "Isabella White (rec-4635-org) =? Isaeblla White (rec-4635-dup-0)\n", + "Bailey Heuer (rec-1790-org) =? Shannon Heurr (rec-1790-dup-0)\n", + "Sarah Eglinton (rec-2882-org) =? Sarah Eglinton (rec-2882-dup-0)\n", + "Spencer Bates-Brownsword (rec-3521-org) =? Spencer Bates-Brownsword (rec-3521-dup-0)\n", + "Tai Garven (rec-2055-org) =? Taiz Garven (rec-2055-dup-0)\n", + "Zachariah Campbell (rec-1529-org) =? Ebonie Campbell (rec-1529-dup-0)\n", + "Noah Boyle (rec-1817-org) =? Noah Boyle (rec-1817-dup-0)\n", + "Lara Sekuless (rec-4200-org) =? Lara Sekuless (rec-4200-dup-0)\n", + "Jessica Paine (rec-1541-org) =? Jessica Paine (rec-1541-dup-0)\n" ] } ], @@ -1230,6 +1084,13 @@ "print(\"Precision: {:.1f}%\".format(100*precision))\n", "print(\"Recall: {:.1f}%\".format(100*recall))" ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] } ], "metadata": { @@ -1248,7 +1109,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.7.3" + "version": "3.7.4" }, "pycharm": { "stem_cell": { From 5690fbc2422d264e140786bf22ce6e12037f1cc6 Mon Sep 17 00:00:00 2001 From: wilko Date: Thu, 21 Nov 2019 11:32:35 +1100 Subject: [PATCH 02/12] incorporated Joyce's feedback --- docs/tutorial/Similarity Scores.ipynb | 238 +++++++++++--------------- 1 file changed, 100 insertions(+), 138 deletions(-) diff --git a/docs/tutorial/Similarity Scores.ipynb b/docs/tutorial/Similarity Scores.ipynb index 583718df..25740861 100644 --- a/docs/tutorial/Similarity Scores.ipynb +++ b/docs/tutorial/Similarity Scores.ipynb @@ -47,6 +47,7 @@ "import json\n", "import os\n", "import time\n", + "import pandas as pd\n", "\n", "import matplotlib.pyplot as plt\n", "import requests\n", @@ -78,7 +79,7 @@ "name": "stdout", "output_type": "stream", "text": [ - "Testing anonlink-entity-service hosted at http://0.0.0.0:8851\n" + "Testing anonlink-entity-service hosted at https://testing.es.data61.xyz\n" ] } ], @@ -100,7 +101,7 @@ "name": "stdout", "output_type": "stream", "text": [ - "{\"project_count\": 1689, \"rate\": 2267284, \"status\": \"ok\"}\r\n" + "{\"project_count\": 6542, \"rate\": 2549867, \"status\": \"ok\"}\r\n" ] } ], @@ -192,7 +193,7 @@ " \n", " \n", " \n", - " rec-1070-org\n", + " rec-1070-org\n", " michaela\n", " neumann\n", " 8\n", @@ -205,7 +206,7 @@ " 5304218\n", " \n", " \n", - " rec-1016-org\n", + " rec-1016-org\n", " courtney\n", " painter\n", " 12\n", @@ -218,7 +219,7 @@ " 4066625\n", " \n", " \n", - " rec-4405-org\n", + " rec-4405-org\n", " charles\n", " green\n", " 38\n", @@ -312,7 +313,7 @@ "name": "stdout", "output_type": "stream", "text": [ - "Overwriting /tmp/tmpw_n8wu8g\n" + "Overwriting /var/folders/mw/21b9jb5d1c9_3_z0dq7hpx1m00j_0b/T/tmpf_28_avn\n" ] } ], @@ -344,7 +345,7 @@ " },\n", " \"hashing\": {\n", " \"strategy\": {\n", - " \"bitsPerToken\": 30\n", + " \"bitsPerFeature\": 200\n", " },\n", " \"hash\": {\n", " \"type\": \"doubleHash\"\n", @@ -364,7 +365,7 @@ " },\n", " \"hashing\": {\n", " \"strategy\": {\n", - " \"bitsPerToken\": 30\n", + " \"bitsPerFeature\": 200\n", " },\n", " \"hash\": {\n", " \"type\": \"doubleHash\"\n", @@ -386,7 +387,7 @@ " \"sentinel\": \"\"\n", " },\n", " \"strategy\": {\n", - " \"bitsPerToken\": 30\n", + " \"bitsPerFeature\": 100\n", " },\n", " \"hash\": {\n", " \"type\": \"doubleHash\"\n", @@ -406,7 +407,7 @@ " },\n", " \"hashing\": {\n", " \"strategy\": {\n", - " \"bitsPerToken\": 30\n", + " \"bitsPerFeature\": 100\n", " },\n", " \"hash\": {\n", " \"type\": \"doubleHash\"\n", @@ -426,7 +427,7 @@ " },\n", " \"hashing\": {\n", " \"strategy\": {\n", - " \"bitsPerToken\": 30\n", + " \"bitsPerFeature\": 100\n", " },\n", " \"hash\": {\n", " \"type\": \"doubleHash\"\n", @@ -446,7 +447,7 @@ " },\n", " \"hashing\": {\n", " \"strategy\": {\n", - " \"bitsPerToken\": 30\n", + " \"bitsPerFeature\": 100\n", " },\n", " \"hash\": {\n", " \"type\": \"doubleHash\"\n", @@ -467,7 +468,7 @@ " },\n", " \"hashing\": {\n", " \"strategy\": {\n", - " \"bitsPerToken\": 30\n", + " \"bitsPerFeature\": 100\n", " },\n", " \"hash\": {\n", " \"type\": \"doubleHash\"\n", @@ -488,7 +489,7 @@ " },\n", " \"hashing\": {\n", " \"strategy\": {\n", - " \"bitsPerToken\": 30\n", + " \"bitsPerFeature\": 100\n", " },\n", " \"hash\": {\n", " \"type\": \"doubleHash\"\n", @@ -510,7 +511,7 @@ " \"sentinel\": \"\"\n", " },\n", " \"strategy\": {\n", - " \"bitsPerToken\": 30\n", + " \"bitsPerFeature\": 200\n", " },\n", " \"hash\": {\n", " \"type\": \"doubleHash\"\n", @@ -554,17 +555,17 @@ "name": "stdout", "output_type": "stream", "text": [ - "Credentials will be saved in /tmp/tmp2eppf_dc\n", + "Credentials will be saved in /var/folders/mw/21b9jb5d1c9_3_z0dq7hpx1m00j_0b/T/tmpd6lzqk30\n", "\u001b[31mProject created\u001b[0m\n" ] }, { "data": { "text/plain": [ - "{'project_id': 'fc8f8216e33a7b8ffd4b967c27f8ce8e5d7371cf8f52bcdb',\n", - " 'result_token': '6423ccee1e634a390a12e3de1a57e7bd322621111c119351',\n", - " 'update_tokens': ['ef0404a7c23ea25c9f922f4c254f80dd6fa644d7d906efa9',\n", - " '46a71922c19a75eae2dd75ec59db0eac453842123514c22a']}" + "{'project_id': 'a28958f3c6df6afa3cdbe0337a2621f1a76ce4c6929fb772',\n", + " 'result_token': 'dde8c58598ea98de862ae5c4e48ec3acfe342162a4133afd',\n", + " 'update_tokens': ['1535fe32ca6becf8fe91b0de32d2e47d9e3edddb72017205',\n", + " 'c50cb50080d0345fb8407ad9a974323567a054884ab2f4d1']}" ] }, "execution_count": 8, @@ -612,8 +613,8 @@ "name": "stdout", "output_type": "stream", "text": [ - "\u001b[31mCLK data written to /tmp/tmpjlx4bxil.json\u001b[0m\n", - "\u001b[31mCLK data written to /tmp/tmpz2ykuhep.json\u001b[0m\n" + "\u001b[31mCLK data written to /var/folders/mw/21b9jb5d1c9_3_z0dq7hpx1m00j_0b/T/tmp3hr0gbdc.json\u001b[0m\n", + "\u001b[31mCLK data written to /var/folders/mw/21b9jb5d1c9_3_z0dq7hpx1m00j_0b/T/tmp8ba6c8bt.json\u001b[0m\n" ] } ], @@ -726,7 +727,7 @@ " --project=\"{project_id}\" \\\n", " --apikey=\"{credentials['result_token']}\" \\\n", " --server \"{url}\" \\\n", - " --threshold 0.9 \\\n", + " --threshold 0.75 \\\n", " --output \"{f.name}\"\n", " \n", " run_id = json.load(open(f.name))['run_id']" @@ -812,16 +813,16 @@ "name": "stdout", "output_type": "stream", "text": [ - "[[0, 76], [1, 2345], 1.0]\n", - "[[0, 83], [1, 3439], 1.0]\n", - "[[0, 103], [1, 863], 1.0]\n", - "[[0, 154], [1, 2391], 1.0]\n", - "[[0, 177], [1, 4247], 1.0]\n", - "[[0, 192], [1, 1176], 1.0]\n", - "[[0, 270], [1, 4516], 1.0]\n", - "[[0, 312], [1, 1253], 1.0]\n", - "[[0, 407], [1, 3743], 1.0]\n", - "[[0, 670], [1, 3550], 1.0]\n" + "[76, 2345, 1.0]\n", + "[83, 3439, 1.0]\n", + "[103, 863, 1.0]\n", + "[154, 2391, 1.0]\n", + "[177, 4247, 1.0]\n", + "[192, 1176, 1.0]\n", + "[270, 4516, 1.0]\n", + "[312, 1253, 1.0]\n", + "[407, 3743, 1.0]\n", + "[670, 3550, 1.0]\n" ] } ], @@ -851,7 +852,7 @@ { "data": { "text/plain": [ - "1150393" + "280116" ] }, "execution_count": 16, @@ -883,7 +884,7 @@ "outputs": [ { "data": { - "image/png": "iVBORw0KGgoAAAANSUhEUgAAAX0AAAD4CAYAAAAAczaOAAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADh0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uMy4xLjEsIGh0dHA6Ly9tYXRwbG90bGliLm9yZy8QZhcZAAAPGElEQVR4nO3df6xfd13H8eeLjmEUyDpbm9lt3KklsfzhnHUMFZkQtm6LFtDwI1HKXKyEkUgif1T5Y2aEpGrAsIALVSobEciMII0rjlohqGGwImPsh9DL6LLWshYL6LJEAd/+8f0UvnT3trf3+6t3n+cj+eZ7vp/zOef7efd7+zrnnnO+56aqkCT14WmzHoAkaXoMfUnqiKEvSR0x9CWpI4a+JHXknFkP4FTWrFlTc3Nzsx6GJK0on/vc575eVWsXmndWh/7c3Bz79++f9TAkaUVJ8shi8zy8I0kdMfQlqSOGviR1xNCXpI4Y+pLUEUNfkjpi6EtSRwx9SeqIoS9JHTmrv5E7qrntdy7YfnDHdVMeiSSdHdzTl6SOGPqS1BFDX5I6YuhLUkcMfUnqiKEvSR0x9CWpI4a+JHXE0Jekjhj6ktQRQ1+SOmLoS1JHDH1J6oihL0kdMfQlqSOGviR1xNCXpI4Y+pLUEUNfkjpi6EtSRwx9SerIaUM/yUVJPpHkwSQPJPm91n5+kr1JDrTn1a09SW5JMp/kviSXDa1ra+t/IMnWyZUlSVrIUvb0vwP8flVtBK4AbkyyEdgO7KuqDcC+9hrgGmBDe2wDboXBRgK4CXg+cDlw04kNhSRpOk4b+lV1pKr+rU3/N/AQsB7YAtzWut0GvKxNbwFur4G7gfOSXABcDeytquNV9Q1gL7B5rNVIkk7pjI7pJ5kDfhb4DLCuqo60WV8D1rXp9cCjQ4sdam2LtZ/8HtuS7E+y/9ixY2cyPEnSaSw59JM8E/hb4E1V9V/D86qqgBrHgKpqZ1VtqqpNa9euHccqJUnNkkI/ydMZBP5fV9WHW/Nj7bAN7floaz8MXDS0+IWtbbF2SdKULOXqnQDvBR6qqncMzdoNnLgCZyvw0aH217areK4AvtUOA90FXJVkdTuBe1VrkyRNyTlL6POLwG8BX0xyb2v7Q2AHcEeSG4BHgFe2eXuAa4F54AngeoCqOp7krcA9rd/NVXV8LFVIkpbktKFfVf8CZJHZL1mgfwE3LrKuXcCuMxmgJGl8/EauJHXE0Jekjhj6ktQRQ1+SOrKUq3eecua237lg+8Ed1015JJI0Xe7pS1JHDH1J6oihL0kdMfQlqSOGviR1xNCXpI4Y+pLUEUNfkjpi6EtSRwx9SeqIoS9JHTH0Jakjhr4kdcTQl6SOGPqS1BFDX5I6YuhLUkcMfUnqiKEvSR0x9CWpI4a+JHXE0Jekjhj6ktQRQ1+SOmLoS1JHDH1J6oihL0kdMfQlqSOGviR1xNCXpI6cM+sBnE3mtt+5YPvBHddNeSSSNBmn3dNPsivJ0ST3D7X9UZLDSe5tj2uH5v1BkvkkX0py9VD75tY2n2T7+EuRJJ3OUg7vvA/YvED7n1XVpe2xByDJRuDVwPPaMn+eZFWSVcC7gWuAjcBrWl9J0hSd9vBOVX0qydwS17cF+FBV/Q/w1STzwOVt3nxVPQyQ5EOt74NnPGJJ0rKNciL3jUnua4d/Vre29cCjQ30OtbbF2p8kybYk+5PsP3bs2AjDkySdbLmhfyvwk8ClwBHg7eMaUFXtrKpNVbVp7dq141qtJIllXr1TVY+dmE7yF8Dft5eHgYuGul7Y2jhFuyRpSpa1p5/kgqGXLwdOXNmzG3h1kmckuQTYAHwWuAfYkOSSJOcyONm7e/nDliQtx2n39JN8ELgSWJPkEHATcGWSS4ECDgK/C1BVDyS5g8EJ2u8AN1bVd9t63gjcBawCdlXVA2OvRpJ0Sku5euc1CzS/9xT93wa8bYH2PcCeMxqdJGmsvA2DJHXE0Jekjhj6ktQRQ1+SOmLoS1JHDH1J6oihL0kdMfQlqSOGviR1xNCXpI4Y+pLUEUNfkjpi6EtSRwx9SeqIoS9JHVnWn0vszdz2OxdsP7jjuimPRJJG456+JHXE0Jekjhj6ktQRQ1+SOmLoS1JHDH1J6oihL0kdMfQlqSOGviR1xNCXpI4Y+pLUEUNfkjpi6EtSRwx9SeqIoS9JHTH0Jakjhr4kdcTQl6SO+OcSR+CfUZS00rinL0kdMfQlqSOnDf0ku5IcTXL/UNv5SfYmOdCeV7f2JLklyXyS+5JcNrTM1tb/QJKtkylHknQqS9nTfx+w+aS27cC+qtoA7GuvAa4BNrTHNuBWGGwkgJuA5wOXAzed2FBIkqbntKFfVZ8Cjp/UvAW4rU3fBrxsqP32GrgbOC/JBcDVwN6qOl5V3wD28uQNiSRpwpZ7TH9dVR1p018D1rXp9cCjQ/0OtbbF2p8kybYk+5PsP3bs2DKHJ0layMgncquqgBrDWE6sb2dVbaqqTWvXrh3XaiVJLD/0H2uHbWjPR1v7YeCioX4XtrbF2iVJU7Tc0N8NnLgCZyvw0aH217areK4AvtUOA90FXJVkdTuBe1VrkyRN0Wm/kZvkg8CVwJokhxhchbMDuCPJDcAjwCtb9z3AtcA88ARwPUBVHU/yVuCe1u/mqjr55LAkacJOG/pV9ZpFZr1kgb4F3LjIenYBu85odJKksfIbuZLUEUNfkjpi6EtSRwx9SeqIoS9JHTH0Jakjhr4kdcTQl6SOGPqS1BFDX5I6YuhLUkdOe+8dnbm57Xcu2H5wx3VTHokk/SD39CWpI4a+JHXE0Jekjhj6ktQRQ1+SOmLoS1JHDH1J6oihL0kdMfQlqSOGviR1xNCXpI4Y+pLUEUNfkjriXTanyLtvSpo19/QlqSOGviR1xNCXpI4Y+pLUEUNfkjpi6EtSRwx9SeqIoS9JHTH0Jakjhr4kdWSk0E9yMMkXk9ybZH9rOz/J3iQH2vPq1p4ktySZT3JfksvGUYAkaenGsaf/K1V1aVVtaq+3A/uqagOwr70GuAbY0B7bgFvH8N6SpDMwicM7W4Db2vRtwMuG2m+vgbuB85JcMIH3lyQtYtS7bBbw8SQFvKeqdgLrqupIm/81YF2bXg88OrTsodZ2ZKiNJNsY/CbAxRdfPOLwVgbvvilpWkYN/V+qqsNJfgzYm+Tfh2dWVbUNwpK1DcdOgE2bNp3RspKkUxvp8E5VHW7PR4GPAJcDj504bNOej7buh4GLhha/sLVJkqZk2aGf5EeSPOvENHAVcD+wG9jaum0FPtqmdwOvbVfxXAF8a+gwkCRpCkY5vLMO+EiSE+v5QFX9Q5J7gDuS3AA8Aryy9d8DXAvMA08A14/w3pKkZVh26FfVw8DPLND+n8BLFmgv4Mblvp8kaXR+I1eSOmLoS1JHDH1J6oihL0kdMfQlqSOjfiNXE+TtGSSNm3v6ktQRQ1+SOmLoS1JHDH1J6oihL0kdMfQlqSOGviR1xOv0V6DFrt8Hr+GXdGru6UtSRwx9SeqIoS9JHTH0Jakjhr4kdcTQl6SOeMnmU4y3Y5Z0Ku7pS1JHDH1J6oihL0kdMfQlqSOeyO2EJ3glgXv6ktQVQ1+SOmLoS1JHDH1J6ogncjvnCV6pL+7pS1JH3NPXgvwNQHpqck9fkjpi6EtSRzy8ozPiYR9pZTP0NRZuDKSVYeqhn2Qz8E5gFfCXVbVj2mPQ9LgxkM4uUw39JKuAdwMvBQ4B9yTZXVUPTnMcmj03BtJsTHtP/3JgvqoeBkjyIWALYOgLWHxjME5uWNSzaYf+euDRodeHgOcPd0iyDdjWXj6e5EsjvN8a4OsjLL8S9VbzGdebP57QSKant88YrPlMPWexGWfdidyq2gnsHMe6kuyvqk3jWNdK0VvNvdUL1tyLSdU87ev0DwMXDb2+sLVJkqZg2qF/D7AhySVJzgVeDeye8hgkqVtTPbxTVd9J8kbgLgaXbO6qqgcm+JZjOUy0wvRWc2/1gjX3YiI1p6omsV5J0lnIe+9IUkcMfUnqyIoM/SSbk3wpyXyS7QvMf06SfUnuS/LJJBcOzdua5EB7bJ3uyJdvuTUnuTTJp5M80Oa9avqjX55RPuc2/9lJDiV51/RGPZoRf7YvTvLxJA8leTDJ3DTHvlwj1vwn7Wf7oSS3JMl0R3/mkuxKcjTJ/YvMT6tlvtV82dC80fOrqlbUg8EJ4K8APwGcC3wB2HhSn78BtrbpFwPvb9PnAw+359VtevWsa5pwzc8FNrTpHweOAOfNuqZJ1jw0/53AB4B3zbqeadQMfBJ4aZt+JvDDs65pkjUDvwD8a1vHKuDTwJWzrmkJNf8ycBlw/yLzrwU+BgS4AvhMax9Lfq3EPf3v3cqhqv4XOHErh2EbgX9q058Ymn81sLeqjlfVN4C9wOYpjHlUy665qr5cVQfa9H8AR4G1Uxn1aEb5nEnyc8A64ONTGOu4LLvmJBuBc6pqL0BVPV5VT0xn2CMZ5XMu4IcYbCyeATwdeGziIx5RVX0KOH6KLluA22vgbuC8JBcwpvxaiaG/0K0c1p/U5wvAK9r0y4FnJfnRJS57Nhql5u9JcjmD/yBfmdA4x2nZNSd5GvB24M0TH+V4jfI5Pxf4ZpIPJ/l8kj9tNzg82y275qr6NIONwJH2uKuqHprweKdhsX+TseTXSgz9pXgz8KIknwdexOBbv9+d7ZAm7pQ1tz2F9wPXV9X/zWaIY7dYzW8A9lTVoVkObkIWq/kc4IVt/s8zOFzyuhmNcdwWrDnJTwE/zeCb/euBFyd54eyGuTKcdffeWYLT3sqhHcZ4BUCSZwK/XlXfTHIYuPKkZT85ycGOybJrbq+fDdwJvKX9urgSjPI5vwB4YZI3MDi2fW6Sx6vqSScJzzKj1HwIuLe+fwfbv2NwPPi90xj4CEap+XeAu6vq8TbvY8ALgH+exsAnaLF/k/Hk16xPaizjJMg5DE5gXML3T/w876Q+a4Cntem3ATcPnQj5KoOTIKvb9PmzrmnCNZ8L7APeNOs6plXzSX1ex8o5kTvK57yq9V/bXv8VcOOsa5pwza8C/rGt4+nt5/xXZ13TEuueY/ETudfxgydyP9vax5JfMy9+mf9g1wJfZnBs+i2t7Wbg19r0bwAHWp+/BJ4xtOxvA/Ptcf2sa5l0zcBvAt8G7h16XDrreib9OQ+tY8WE/qg1M/jjRPcBXwTeB5w763omWTODDd17gIcY/E2Od8y6liXW+0EG5yC+zeC4/A3A64HXt/lh8MemvtI+y01Dy46cX96GQZI68lQ9kStJWoChL0kdMfQlqSOGviR1xNCXpI4Y+pLUEUNfkjry/11JeywGTfuPAAAAAElFTkSuQmCC\n", + "image/png": "iVBORw0KGgoAAAANSUhEUgAAAYkAAAEGCAYAAACQO2mwAAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADh0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uMy4xLjEsIGh0dHA6Ly9tYXRwbG90bGliLm9yZy8QZhcZAAAXVklEQVR4nO3dfbRddX3n8ffHpKC15UmyWDahJqOZ2minFbOQqWPLkgoBi6EdcKAdjTQ1taLT6TC1MM4Sl0qXLrVYrNqyJCUw1oiMHaJFmZQHnToDEkQeggVug0oiSiSAtT5g9Dt/nF/qMdxfcnPPzb1J7vu11ll37+/+7b1/v9yHT/bD2SdVhSRJ43nSTHdAkrTvMiQkSV2GhCSpy5CQJHUZEpKkrrkz3YGpduSRR9bChQtnuhuStF+59dZbv1FV83auH3AhsXDhQjZs2DDT3ZCk/UqSL49X93STJKnLkJAkdRkSkqQuQ0KS1GVISJK6DAlJUpchIUnqMiQkSV2GhCSp64B7x/UoTj336nHrH3/38mnuiSTtGzySkCR1GRKSpC5DQpLUZUhIkroMCUlSlyEhSeoyJCRJXYaEJKnLkJAkdRkSkqQuQ0KS1GVISJK6DAlJUpchIUnqMiQkSV27DYkkq5M8lOSuodo7k/xDkjuS/E2Sw4aWnZ9kLMk9SU4aqi9rtbEk5w3VFyW5udU/kuSgVj+4zY+15QunatCSpImZyJHEZcCynWrrgedW1b8B7gXOB0iyBDgTeE5b5/1J5iSZA7wPOBlYApzV2gK8A7ioqp4FPAKsbPWVwCOtflFrJ0maRrsNiar6DLBtp9r/rqrtbfYmYEGbXg6srarvVdX9wBhwbHuNVdWmqnocWAssTxLgxcBVbf01wGlD21rTpq8CTmjtJUnTZCquSfwO8Mk2PR94YGjZ5lbr1Z8GPDoUODvqP7attvyx1v4JkqxKsiHJhq1bt448IEnSwEghkeSNwHbgQ1PTncmpqkuqamlVLZ03b95MdkWSDihzJ7tiklcBvw6cUFXVyluAo4eaLWg1OvWHgcOSzG1HC8Ptd2xrc5K5wKGtvSRpmkzqSCLJMuANwMuq6ttDi9YBZ7Y7kxYBi4HPAbcAi9udTAcxuLi9roXLDcDpbf0VwNVD21rRpk8Hrh8KI0nSNNjtkUSSDwPHA0cm2QxcwOBupoOB9e1a8k1V9Zqq2pjkSuBuBqehzqmqH7TtvA64FpgDrK6qjW0XfwysTfI24Dbg0la/FLgiyRiDC+dnTsF4JUl7YLchUVVnjVO+dJzajvYXAheOU78GuGac+iYGdz/tXP8ucMbu+idJ2nt8x7UkqcuQkCR1GRKSpC5DQpLUZUhIkroMCUlSlyEhSeoyJCRJXYaEJKnLkJAkdRkSkqQuQ0KS1GVISJK6DAlJUpchIUnqMiQkSV2GhCSpy5CQJHUZEpKkLkNCktRlSEiSugwJSVLXbkMiyeokDyW5a6h2RJL1Se5rXw9v9SS5OMlYkjuSHDO0zorW/r4kK4bqz09yZ1vn4iTZ1T4kSdNnIkcSlwHLdqqdB1xXVYuB69o8wMnA4vZaBXwABn/wgQuAFwDHAhcM/dH/APDqofWW7WYfkqRpstuQqKrPANt2Ki8H1rTpNcBpQ/XLa+Am4LAkTwdOAtZX1baqegRYDyxryw6pqpuqqoDLd9rWePuQJE2TyV6TOKqqHmzTXwOOatPzgQeG2m1utV3VN49T39U+JEnTZOQL1+0IoKagL5PeR5JVSTYk2bB169a92RVJmlUmGxJfb6eKaF8favUtwNFD7Ra02q7qC8ap72ofT1BVl1TV0qpaOm/evEkOSZK0s8mGxDpgxx1KK4Crh+qvbHc5HQc81k4ZXQucmOTwdsH6RODatuybSY5rdzW9cqdtjbcPSdI0mbu7Bkk+DBwPHJlkM4O7lN4OXJlkJfBl4OWt+TXAKcAY8G3gbICq2pbkrcAtrd1bqmrHxfDXMriD6inAJ9uLXexDkjRNdhsSVXVWZ9EJ47Qt4JzOdlYDq8epbwCeO0794fH2IUmaPr7jWpLUZUhIkroMCUlSlyEhSeoyJCRJXYaEJKnLkJAkdRkSkqQuQ0KS1GVISJK6DAlJUpchIUnqMiQkSV2GhCSpy5CQJHUZEpKkLkNCktRlSEiSugwJSVKXISFJ6jIkJEldhoQkqcuQkCR1jRQSSf4wycYkdyX5cJInJ1mU5OYkY0k+kuSg1vbgNj/Wli8c2s75rX5PkpOG6stabSzJeaP0VZK05yYdEknmA/8JWFpVzwXmAGcC7wAuqqpnAY8AK9sqK4FHWv2i1o4kS9p6zwGWAe9PMifJHOB9wMnAEuCs1laSNE1GPd00F3hKkrnATwIPAi8GrmrL1wCntenlbZ62/IQkafW1VfW9qrofGAOOba+xqtpUVY8Da1tbSdI0mXRIVNUW4F3AVxiEw2PArcCjVbW9NdsMzG/T84EH2rrbW/unDdd3WqdXf4Ikq5JsSLJh69atkx2SJGkno5xuOpzB/+wXAT8DPJXB6aJpV1WXVNXSqlo6b968meiCJB2QRjnd9GvA/VW1taq+D3wMeCFwWDv9BLAA2NKmtwBHA7TlhwIPD9d3WqdXlyRNk1FC4ivAcUl+sl1bOAG4G7gBOL21WQFc3abXtXna8uurqlr9zHb30yJgMfA54BZgcbtb6iAGF7fXjdBfSdIemrv7JuOrqpuTXAV8HtgO3AZcAvwtsDbJ21rt0rbKpcAVScaAbQz+6FNVG5NcySBgtgPnVNUPAJK8DriWwZ1Tq6tq42T7K0nac5MOCYCqugC4YKfyJgZ3Ju3c9rvAGZ3tXAhcOE79GuCaUfooSZo833EtSeoyJCRJXYaEJKnLkJAkdRkSkqQuQ0KS1GVISJK6DAlJUpchIUnqMiQkSV2GhCSpa6RnN80Wp5579bj1j7/bD8qTdGDzSEKS1GVISJK6DAlJUpchIUnqMiQkSV2GhCSpy5CQJHUZEpKkLkNCktRlSEiSukYKiSSHJbkqyT8k+WKSf5vkiCTrk9zXvh7e2ibJxUnGktyR5Jih7axo7e9LsmKo/vwkd7Z1Lk6SUforSdozox5J/Bnwqap6NvCLwBeB84DrqmoxcF2bBzgZWNxeq4APACQ5ArgAeAFwLHDBjmBpbV49tN6yEfsrSdoDkw6JJIcCvwJcClBVj1fVo8ByYE1rtgY4rU0vBy6vgZuAw5I8HTgJWF9V26rqEWA9sKwtO6SqbqqqAi4f2pYkaRqMciSxCNgK/FWS25J8MMlTgaOq6sHW5mvAUW16PvDA0PqbW21X9c3j1J8gyaokG5Js2Lp16whDkiQNGyUk5gLHAB+oqucB/8yPTi0B0I4AaoR9TEhVXVJVS6tq6bx58/b27iRp1hglJDYDm6vq5jZ/FYPQ+Ho7VUT7+lBbvgU4emj9Ba22q/qCceqSpGky6ZCoqq8BDyT5uVY6AbgbWAfsuENpBbDjE3vWAa9sdzkdBzzWTktdC5yY5PB2wfpE4Nq27JtJjmt3Nb1yaFuSpGkw6ifTvR74UJKDgE3A2QyC58okK4EvAy9vba8BTgHGgG+3tlTVtiRvBW5p7d5SVdva9GuBy4CnAJ9sL0nSNBkpJKrqC8DScRadME7bAs7pbGc1sHqc+gbguaP0UZI0eb7jWpLUZUhIkroMCUlSlyEhSeoyJCRJXYaEJKnLkJAkdRkSkqQuQ0KS1GVISJK6DAlJUpchIUnqMiQkSV2GhCSpy5CQJHUZEpKkLkNCktRlSEiSugwJSVKXISFJ6jIkJEldhoQkqWvkkEgyJ8ltST7R5hcluTnJWJKPJDmo1Q9u82Nt+cKhbZzf6vckOWmovqzVxpKcN2pfJUl7ZiqOJP4A+OLQ/DuAi6rqWcAjwMpWXwk80uoXtXYkWQKcCTwHWAa8vwXPHOB9wMnAEuCs1laSNE1GCokkC4CXAh9s8wFeDFzVmqwBTmvTy9s8bfkJrf1yYG1Vfa+q7gfGgGPba6yqNlXV48Da1laSNE1GPZJ4D/AG4Idt/mnAo1W1vc1vBua36fnAAwBt+WOt/b/Ud1qnV3+CJKuSbEiyYevWrSMOSZK0w6RDIsmvAw9V1a1T2J9JqapLqmppVS2dN2/eTHdHkg4Yc0dY94XAy5KcAjwZOAT4M+CwJHPb0cICYEtrvwU4GticZC5wKPDwUH2H4XV6dUnSNJj0kURVnV9VC6pqIYMLz9dX1W8DNwCnt2YrgKvb9Lo2T1t+fVVVq5/Z7n5aBCwGPgfcAixud0sd1PaxbrL9lSTtuVGOJHr+GFib5G3AbcClrX4pcEWSMWAbgz/6VNXGJFcCdwPbgXOq6gcASV4HXAvMAVZX1ca90F9JUseUhERV3Qjc2KY3Mbgzaec23wXO6Kx/IXDhOPVrgGumoo+SpD3nO64lSV1743TTrHHquVePW//4u307h6QDg0cSkqQuQ0KS1GVISJK6DAlJUpchIUnqMiQkSV2GhCSpy5CQJHUZEpKkLkNCktRlSEiSugwJSVKXISFJ6jIkJEldhoQkqcuQkCR1GRKSpC5DQpLUZUhIkroMCUlS16RDIsnRSW5IcneSjUn+oNWPSLI+yX3t6+GtniQXJxlLckeSY4a2taK1vy/JiqH685Pc2da5OElGGawkac+MciSxHTi3qpYAxwHnJFkCnAdcV1WLgevaPMDJwOL2WgV8AAahAlwAvAA4FrhgR7C0Nq8eWm/ZCP2VJO2hSYdEVT1YVZ9v0/8EfBGYDywH1rRma4DT2vRy4PIauAk4LMnTgZOA9VW1raoeAdYDy9qyQ6rqpqoq4PKhbUmSpsGUXJNIshB4HnAzcFRVPdgWfQ04qk3PBx4YWm1zq+2qvnmcuiRpmowcEkl+CvifwH+uqm8OL2tHADXqPibQh1VJNiTZsHXr1r29O0maNUYKiSQ/wSAgPlRVH2vlr7dTRbSvD7X6FuDoodUXtNqu6gvGqT9BVV1SVUuraum8efNGGZIkacjcya7Y7jS6FPhiVf3p0KJ1wArg7e3r1UP11yVZy+Ai9WNV9WCSa4E/GbpYfSJwflVtS/LNJMcxOI31SuC9k+3vdDr13Ku7yz7+7uXT2BNJGs2kQwJ4IfAK4M4kX2i1/8YgHK5MshL4MvDytuwa4BRgDPg2cDZAC4O3Are0dm+pqm1t+rXAZcBTgE+2lyRpmkw6JKrq74He+xZOGKd9Aed0trUaWD1OfQPw3Mn2UZI0Gt9xLUnqMiQkSV2GhCSpy5CQJHUZEpKkLkNCktRlSEiSugwJSVKXISFJ6jIkJEldhoQkqWuUB/xpEnpPiPXpsJL2RR5JSJK6DAlJUpchIUnqMiQkSV2GhCSpy5CQJHV5C+w+wltjJe2LPJKQJHUZEpKkLkNCktTlNYl9nNcqJM2kff5IIsmyJPckGUty3kz3R5Jmk336SCLJHOB9wEuAzcAtSdZV1d0z27OZ5xGGpOmwT4cEcCwwVlWbAJKsBZYDsz4kenrh0WOo6EC1p78LB4K98fu8r4fEfOCBofnNwAt2bpRkFbCqzX4ryT2T3N+RwDcmue5+KX86+8bMLPw+45hnhRF/n58xXnFfD4kJqapLgEtG3U6SDVW1dAq6tN9wzLODY54d9saY9/UL11uAo4fmF7SaJGka7OshcQuwOMmiJAcBZwLrZrhPkjRr7NOnm6pqe5LXAdcCc4DVVbVxL+5y5FNW+yHHPDs45tlhysecqprqbUqSDhD7+ukmSdIMMiQkSV2zJiR293iPJBcl+UJ73Zvk0aFlPxhatt9cOJ/AmH82yQ1JbktyR5JThpad39a7J8lJ09vzyZnseJMsTPKdoe/xX0x/7ydnAmN+RpLr2nhvTLJgaNmKJPe114rp7fnkjTjm/fV3eXWSh5Lc1VmeJBe3f5M7khwztGy073NVHfAvBhe9/xH4V8BBwO3Akl20fz2Di+Q75r8102PYG2NmcJHr99v0EuBLQ9O3AwcDi9p25sz0mPbieBcCd830GPbSmD8KrGjTLwauaNNHAJva18Pb9OEzPaa9OeY2v9/9Lrd+/wpwTO/nFDgF+CQQ4Djg5qn6Ps+WI4l/ebxHVT0O7Hi8R89ZwIenpWd7z0TGXMAhbfpQ4Kttejmwtqq+V1X3A2Nte/uyUca7v5rImJcA17fpG4aWnwSsr6ptVfUIsB5YNg19HtUoY95vVdVngG27aLIcuLwGbgIOS/J0puD7PFtCYrzHe8wfr2GSZzD43/P1Q+UnJ9mQ5KYkp+29bk6piYz5zcB/TLIZuIbBEdRE193XjDJegEXtNNSnk7xor/Z06kxkzLcDv9mmfwP46SRPm+C6+6JRxgz75+/yRPT+XUb+Ps+WkNgTZwJXVdUPhmrPqMFb3X8LeE+SZ85M16bcWcBlVbWAweHqFUkO5J+J3ngfBH62qp4H/Bfgr5Mcsovt7E/+K/CrSW4DfpXBEwt+sOtV9nu7GvOB+ru81xzIfxCG7cnjPc5kp1NNVbWlfd0E3Ag8b+q7OOUmMuaVwJUAVfX/gCczeCja/vg4lEmPt51We7jVb2Vwzvtf7/Uej263Y66qr1bVb7YAfGOrPTqRdfdRo4x5f/1dnojev8vo3+eZviAzTRd95jK4YLOIH13ses447Z4NfIn2JsNWOxw4uE0fCdzHLi567yuviYyZwYWuV7Xpn2dwjj7Ac/jxC9eb2PcvXI8y3nk7xsfggugW4IiZHtMUjflI4Elt+kLgLW36COD+9vN9eJs+0Me8X/4uD41rIf0L1y/lxy9cf26qvs8zPvBp/Ac+BbiXwf8S39hqbwFeNtTmzcDbd1rvl4E72w/jncDKmR7LVI2ZwQW+z7axfQE4cWjdN7b17gFOnumx7M3xAv8e2NhqnwdOnemxTOGYT29/DO8FPrjjj2Rb9jsMbkoYA86e6bHs7THv57/LH2ZwWvT7DK4rrAReA7ymLQ+DD2j7xza2pVP1ffaxHJKkrtlyTUKSNAmGhCSpy5CQJHUZEpKkLkNCktRlSGjWSPLBJEv2oP3SJBe36Vcl+fM93N/w+scn+eU967E08/bpjy+VplJV/e4ett8AbJjMvpLM3Wn944FvAf93MtubCknm1I8/bkbaLY8kdMBJ8tQkf5vk9iR3JfkPrX5jkqVt+ltJ3plkY5K/S3JsW74pyctam+OTfGKc7Z+a5Ob2QMC/S3JUq785yRVJPsvguVDHJ/lEkoUM3vj0h+1zDF6U5P4kP9HWO2R4fmg/Z7T+357kM602J8m7Wv2OJK9v9RNaf+5snz1wcKt/Kck7knweOCPJM5N8KsmtSf5Pkmfvje+BDhweSehAtAz4alW9FCDJoeO0eSpwfVX9UZK/Ad4GvITBu7LXALv6QJq/B46rqkryu8AbgHPbsiXAv6uq7yQ5HqCqvpTBBxl9q6re1fp0I4NHKfwvBs8L+1hVfX+n/bwJOKmqtiQ5rNVWMXg8wy9V1fYkRyR5MnAZcEJV3ZvkcuD3gfe0dR6uqmPafq9j8C7d+5K8AHg/g89ckMblkYQORHcCL2n/g35RVT02TpvHgU8Ntf90+yN9J4M/wruyALg2yZ3AHzF41tUO66rqOxPo4weBs9v02cBfjdPms8BlSV7N4MN2AH4N+Muq2g5QVduAnwPur6p7W5s1DD6kZoePACT5KQaPpvhoki8Afwk8fQJ91SxmSOiA0/5YHsPgD/7bkrxpnGbfrx89k+aHwPfauj9k90fY7wX+vKp+Afg9Bk+T3eGfJ9jHzwIL29HGnKp6wsdSVtVrgP/O4Cmetw59JsKe2tGnJwGPVtUvDb1+fpLb1CxhSOiAk+RngG9X1f8A3skgMKbSofzoccsT/czgfwJ+eqfa5cBfM/5RBEmeWVU3V9WbgK0MwmI98HtJ5rY2RzB4COPCJM9qq74C+PTO26uqbwL3JzmjrZskvzjB/muWMiR0IPoF4HPtlMoFDK43TKU3MzhlcyvwjQmu83HgN3ZcuG61DzF4fHPvo3Lf2S5E38XgrqjbGZym+gpwR5Lbgd+qqu8yOGX10XYK7IfAX3S2+dvAyrbuRg6Aj/bU3uVTYKUZkuR0YHlVvWKm+yL1eHeTNAOSvBc4mcFnI0j7LI8kJEldXpOQJHUZEpKkLkNCktRlSEiSugwJSVLX/wfHVDfj9t4R7gAAAABJRU5ErkJggg==\n", "text/plain": [ "
" ] @@ -895,7 +896,10 @@ } ], "source": [ - "plt.hist([score for _, _, score in data[::100]], bins=50);" + "plt.style.use('seaborn-deep')\n", + "plt.hist([score for _, _, score in data], bins=50)\n", + "plt.xlabel('similarity score')\n", + "plt.show()" ] }, { @@ -904,7 +908,7 @@ "pycharm": {} }, "source": [ - "The vast majority of these similarity scores are for non matches. Let's zoom into the right side of the distribution." + "The vast majority of these similarity scores are for non matches. We expect the matches to have a high similarity score. So let's zoom into the right side of the distribution." ] }, { @@ -918,7 +922,7 @@ "outputs": [ { "data": { - "image/png": "iVBORw0KGgoAAAANSUhEUgAAAXcAAAD4CAYAAAAXUaZHAAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADh0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uMy4xLjEsIGh0dHA6Ly9tYXRwbG90bGliLm9yZy8QZhcZAAARWklEQVR4nO3df4zkdX3H8eeL41cVW0C25DzApYppz7ai3VKstfIjVoS2h61FbKKHkp5GTGqiSdH+obUlwbZiaWxJz0IFIyitWkjFCqLE2gp4KPKz6oFHuOsJp4BKjVbw3T/me2FYdm9md3Zmdz88H8lkv/P5fGfm/dnZe+1nP9/vfC9VhSSpLXstdwGSpKVnuEtSgwx3SWqQ4S5JDTLcJalBey93AQCHHHJITU9PL3cZkrSq3HTTTd+uqqm5+lZEuE9PT7Nly5blLkOSVpUk98zX57KMJDXIcJekBhnuktQgw12SGmS4S1KDDHdJapDhLkkNMtwlqUGGuyQ1aEV8QnUU02d/ct6+beeeMsFKJGnlcOYuSQ0y3CWpQQPDPcn+SW5M8tUktyf5s679yCQ3JNma5KNJ9u3a9+vub+36p8c7BEnSbMPM3H8EnFBVzwOOBk5KcizwHuB9VfVs4EHgzG7/M4EHu/b3dftJkiZoYLhXz8Pd3X26WwEnAP/StV8MnNptb+ju0/WfmCRLVrEkaaCh1tyTrElyM3A/cA1wF/BQVT3S7bIdWNdtrwPuBej6vws8fY7n3JRkS5Itu3btGm0UkqTHGSrcq+rRqjoaOAw4Bvj5UV+4qjZX1UxVzUxNzfkfiUiSFmlBZ8tU1UPA54AXAgcm2X2e/GHAjm57B3A4QNf/M8B3lqRaSdJQhjlbZirJgd32TwEvBe6kF/Kv7HbbCFzRbV/Z3afr/2xV1VIWLUnas2E+oboWuDjJGnq/DC6vqn9LcgfwkSR/AXwFuLDb/0LgQ0m2Ag8Ap4+hbknSHgwM96q6BXj+HO1301t/n93+Q+APlqQ6SdKi+AlVSWqQ4S5JDTLcJalBhrskNchwl6QGGe6S1CDDXZIaZLhLUoMMd0lqkOEuSQ0y3CWpQcNcOGzVmj77k3O2bzv3lAlXIkmT5cxdkhpkuEtSgwx3SWqQ4S5JDTLcJalBhrskNchwl6QGGe6S1CDDXZIaZLhLUoMMd0lqkOEuSQ0y3CWpQQPDPcnhST6X5I4ktyf54679XUl2JLm5u53c95i3J9ma5GtJXjbOAUiSnmiYS/4+Ary1qr6c5GnATUmu6freV1V/3b9zkvXA6cBzgWcAn0nynKp6dCkLlyTNb+DMvap2VtWXu+3vA3cC6/bwkA3AR6rqR1X1TWArcMxSFCtJGs6C1tyTTAPPB27omt6c5JYkFyU5qGtbB9zb97DtzPHLIMmmJFuSbNm1a9eCC5ckzW/ocE9yAPAx4C1V9T3gAuBZwNHATuC9C3nhqtpcVTNVNTM1NbWQh0qSBhgq3JPsQy/YP1xVHweoqvuq6tGq+gnwAR5betkBHN738MO6NknShAxztkyAC4E7q+q8vva1fbu9Arit274SOD3JfkmOBI4Cbly6kiVJgwxztsyLgNcAtya5uWt7B/DqJEcDBWwD3gBQVbcnuRy4g96ZNmd5powkTdbAcK+qLwCZo+uqPTzmHOCcEeqSJI3AT6hKUoMMd0lqkOEuSQ0y3CWpQYa7JDXIcJekBhnuktQgw12SGmS4S1KDDHdJapDhLkkNMtwlqUGGuyQ1yHCXpAYZ7pLUIMNdkhpkuEtSgwx3SWqQ4S5JDTLcJalBhrskNchwl6QGGe6S1CDDXZIatPegHZIcDlwCHAoUsLmqzk9yMPBRYBrYBpxWVQ8mCXA+cDLwA+CMqvryeMpfnOmzPzln+7ZzT5lwJZI0HsPM3B8B3lpV64FjgbOSrAfOBq6tqqOAa7v7AC8Hjupum4ALlrxqSdIeDQz3qtq5e+ZdVd8H7gTWARuAi7vdLgZO7bY3AJdUz/XAgUnWLnnlkqR5LWjNPck08HzgBuDQqtrZdX2L3rIN9IL/3r6Hbe/aZj/XpiRbkmzZtWvXAsuWJO3J0OGe5ADgY8Bbqup7/X1VVfTW44dWVZuraqaqZqamphbyUEnSAEOFe5J96AX7h6vq413zfbuXW7qv93ftO4DD+x5+WNcmSZqQgeHenf1yIXBnVZ3X13UlsLHb3ghc0df+2vQcC3y3b/lGkjQBA0+FBF4EvAa4NcnNXds7gHOBy5OcCdwDnNb1XUXvNMit9E6FfN2SVixJGmhguFfVF4DM033iHPsXcNaIdUmSRuAnVCWpQYa7JDXIcJekBhnuktQgw12SGmS4S1KDDHdJapDhLkkNMtwlqUGGuyQ1yHCXpAYZ7pLUoGGuCilJK5L/2f38nLlLUoMMd0lqkOEuSQ0y3CWpQYa7JDXIs2X6eORdUiucuUtSg5y5S2rOfH+Fz6fFv86duUtSg5y5S1rxFjoTl+EuaQUxxJfOwGWZJBcluT/JbX1t70qyI8nN3e3kvr63J9ma5GtJXjauwiVJ8xtmzf2DwElztL+vqo7ublcBJFkPnA48t3vM3ydZs1TFSpKGMzDcq+rzwANDPt8G4CNV9aOq+iawFThmhPokSYswytkyb05yS7dsc1DXtg64t2+f7V2bJGmCFntA9QLgz4Hqvr4XeP1CniDJJmATwBFHHLHIMiStRh44Hb9Fzdyr6r6qerSqfgJ8gMeWXnYAh/fteljXNtdzbK6qmaqamZqaWkwZkqR5LGrmnmRtVe3s7r4C2H0mzZXApUnOA54BHAXcOHKVkjRGLV5XamC4J7kMOA44JMl24J3AcUmOprcssw14A0BV3Z7kcuAO4BHgrKp6dDylS5LmMzDcq+rVczRfuIf9zwHOGaUoSdJovLaMJDXIcJekBhnuktQgw12SGmS4S1KDvOSvpLHxk6jLx5m7JDXIcJekBhnuktQgw12SGmS4S1KDPFtmCC1eMU5S25y5S1KDnLlL0jz2dJ7+Sv/L3XCX9DirOdD0GMNd0sj8JOrK45q7JDXIcJekBhnuktQg19wlDc219dXDcJeepAzqtrksI0kNMtwlqUEuy4zAa85IWqmcuUtSgwbO3JNcBPw2cH9V/WLXdjDwUWAa2AacVlUPJglwPnAy8APgjKr68nhKlzQMD5w+OQ0zc/8gcNKstrOBa6vqKODa7j7Ay4Gjutsm4IKlKVOStBADZ+5V9fkk07OaNwDHddsXA9cBf9K1X1JVBVyf5MAka6tq51IVLD3ZeaxHw1jsmvuhfYH9LeDQbnsdcG/fftu7tidIsinJliRbdu3atcgyJElzGfmAajdLr0U8bnNVzVTVzNTU1KhlSJL6LDbc70uyFqD7en/XvgM4vG+/w7o2SdIELTbcrwQ2dtsbgSv62l+bnmOB77reLkmTN8ypkJfRO3h6SJLtwDuBc4HLk5wJ3AOc1u1+Fb3TILfSOxXydWOoWZI0wDBny7x6nq4T59i3gLNGLUqSNBo/oSpJDTLcJalBXjhMWiQ/TKSVzHCXBlgt12ZZLXVqMlyWkaQGOXMfA/9cl7TcDHdpifnLXSuB4S4tM9fKNQ6uuUtSg5y5a6zGvUThEog0N8N9BTCgJC01w12aENfWNUmuuUtSg5y5a0VxiUpaGoa7nlT2tDTiLxC1xHBfwZzFTpZr4mqJ4S5JEzDpyZrhPkHODB8z7u+F32s92Xm2jCQ1yJm75rTS1vudiUsL48xdkhrkzP1JwNP/pKW30v66nc2ZuyQ1yJn7k9xSrWW7Ji6tLCOFe5JtwPeBR4FHqmomycHAR4FpYBtwWlU9OFqZWikMcWl1WIplmeOr6uiqmununw1cW1VHAdd29yVJEzSONfcNwMXd9sXAqWN4DUnSHowa7gVcneSmJJu6tkOrame3/S3g0LkemGRTki1JtuzatWvEMiRJ/UY9oPobVbUjyc8C1yT57/7OqqokNdcDq2ozsBlgZmZmzn20MK6HS9ptpHCvqh3d1/uTfAI4Brgvydqq2plkLXD/EtSpPoa4pEEWvSyT5KlJnrZ7G/gt4DbgSmBjt9tG4IpRi5QkLcwoM/dDgU8k2f08l1bVvyf5EnB5kjOBe4DTRi9TkrQQiw73qrobeN4c7d8BThylKEnSaLz8gCQ1yHCXpAYZ7pLUIC8cJklLaKWcquzMXZIaZLhLUoMMd0lqkOEuSQ0y3CWpQYa7JDXIcJekBhnuktQgw12SGmS4S1KDDHdJapDhLkkNMtwlqUGGuyQ1yHCXpAYZ7pLUIMNdkhpkuEtSgwx3SWqQ4S5JDTLcJalBYwv3JCcl+VqSrUnOHtfrSJKeaCzhnmQN8HfAy4H1wKuTrB/Ha0mSnmhcM/djgK1VdXdV/R/wEWDDmF5LkjTL3mN63nXAvX33twO/1r9Dkk3Apu7uw0m+tsjXOgT49iIfu9I4lpWplbG0Mg5oaCx5z0hjeeZ8HeMK94GqajOwedTnSbKlqmaWoKRl51hWplbG0so4wLEMY1zLMjuAw/vuH9a1SZImYFzh/iXgqCRHJtkXOB24ckyvJUmaZSzLMlX1SJI3A58G1gAXVdXt43gtlmBpZwVxLCtTK2NpZRzgWAZKVY3jeSVJy8hPqEpSgwx3SWrQig73QZcwSPLMJNcmuSXJdUkOm9X/00m2J3n/5Kqe2yhjSfJokpu727IemB5xHEckuTrJnUnuSDI9ydpnW+xYkhzf937cnOSHSU6d/AgeV+so78tfJrm9e1/+NkkmW/0Tah1lLO9Jclt3e9VkK39CnRcluT/JbfP0p/t+b+3G8oK+vo1JvtHdNi6qgKpakTd6B2LvAn4O2Bf4KrB+1j7/DGzstk8APjSr/3zgUuD9q3kswMPL/X4s0TiuA17abR8APGW1jqVvn4OBB1brWIBfB/6ze441wBeB41bpWE4BrqF3oshT6Z2199PLOJbfBF4A3DZP/8nAp4AAxwI39P1M3d19PajbPmihr7+SZ+7DXMJgPfDZbvtz/f1JfgU4FLh6ArUOMtJYVpBFj6O7ttDeVXUNQFU9XFU/mEzZc1qq9+SVwKdW8VgK2J9ekO4H7APcN/aK5zfKWNYDn6+qR6rqf4FbgJMmUPOcqurz9H7xz2cDcEn1XA8cmGQt8DLgmqp6oKoepPcLa8HjWMnhPtclDNbN2uerwO91268Anpbk6Un2At4LvG3sVQ5n0WPp7u+fZEuS65f5z/9RxvEc4KEkH0/ylSR/1V1gbrmM+p7sdjpw2VgqHN6ix1JVX6QXkDu726er6s4x17sno7wvXwVOSvKUJIcAx/P4D1OuNPONdZjvwUArOdyH8TbgJUm+AryE3qdgHwXeBFxVVduXs7gFmm8sAM+s3seT/xD4myTPWqYahzHfOPYGXtz1/yq9P7vPWKYah7Wn94RulvVL9D7PsdLNOZYkzwZ+gd6nyNcBJyR58fKVOZQ5x1JVVwNXAf9F7xfuF+l7v55slu3aMkMYeAmDqvofut/gSQ4Afr+qHkryQuDFSd5Eb2133yQPV9VyXVd+0WPp+nZ0X+9Och3wfHrrkpM2ynuyHbi5qu7u+v6V3jrjhZMofA4jvSed04BPVNWPx1zrIKO8L38EXF9VD3d9nwJeCPzHJAqfw6j/Vs4Bzun6LgW+PoGaF2u+se4AjpvVft2Cn325DjYMcTBib3oHEo7ksQMrz521zyHAXt32OcC753ieM1j+A6qLHgu9Ayr79e3zDWYdYFol41jT7T/V3f8n4KzV+J709V8PHL+cP1tL8L68CvhM9xz7ANcCv7NKx7IGeHq3/cvAbfSO8yznezPN/AdUT+HxB1Rv7NoPBr7Z/ds/qNs+eMGvvdw/mAO+MSfT+817F/CnXdu7gd/ttl/Zhd3XgX/cHYKznuMMljncRxkLvbMZbu1+yG8FzlyN4+j6XkrvINetwAeBfVfxWKbpzbD2Wu6frRF/vtYA/wDcCdwBnLeKx7J/N4Y76P3iPXqZx3EZveMYP6a3bn4m8EbgjV1/6P2nRnd1/yZm+h77emBrd3vdYl7fyw9IUoNW+wFVSdIcDHdJapDhLkkNMtwlqUGGuyQ1yHCXpAYZ7pLUoP8H1xfysAfPXP0AAAAASUVORK5CYII=\n", + "image/png": "iVBORw0KGgoAAAANSUhEUgAAAXcAAAEGCAYAAACevtWaAAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADh0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uMy4xLjEsIGh0dHA6Ly9tYXRwbG90bGliLm9yZy8QZhcZAAAT/ElEQVR4nO3df7SlVX3f8fcnINBllAGZzqIzjEN0GjNtlsiahSTGhEpUwOCQVKgm1ZFOOjGLuNpKk2LtMrbLP+IyRjFpTWaBcTD+AG0so6VaMog2tmAG+S1RRoQwI8jIr4TiL/TbP86echzunXvuveeee2ff92utu+7z7Gc/5+yzuXzOnn2eZ59UFZKkvvzYYjdAkjR+hrskdchwl6QOGe6S1CHDXZI6dPhiNwDguOOOq3Xr1i12MyTpkHLDDTd8q6pWTnVsSYT7unXr2LVr12I3Q5IOKUnume6Y0zKS1CHDXZI6ZLhLUocMd0nqkOEuSR0aKdyTrEjy8SR/neSOJD+T5NgkVye5s/0+ptVNkvcm2Z3kliQnL+xLkCQdaNSR+8XAp6vqecDzgTuAi4CdVbUe2Nn2Ac4E1refrcD7xtpiSdKMZgz3JEcDPw9cClBV36uqR4BNwPZWbTtwTtveBFxWA9cBK5IcP/aWS5KmNcrI/URgH/CnSW5MckmSpwOrquq+Vud+YFXbXg3cO3T+nlb2I5JsTbIrya59+/bN/RVIkp5ilDtUDwdOBt5YVdcnuZgnp2AAqKpKMqtv/aiqbcA2gI0bN875G0POvvDKKcs/+a5Nc31ISTrkjTJy3wPsqarr2/7HGYT9N/dPt7TfD7Tje4EThs5f08okSRMyY7hX1f3AvUl+shWdDnwZ2AFsbmWbgf1D6B3A69pVM6cCjw5N30iSJmDUhcPeCHwoyRHAXcD5DN4YrkiyBbgHOK/VvQo4C9gNPN7qSpImaKRwr6qbgI1THDp9iroFXDDPdkmS5sE7VCWpQ4a7JHXIcJekDhnuktQhw12SOmS4S1KHDHdJ6pDhLkkdMtwlqUOGuyR1yHCXpA4Z7pLUIcNdkjpkuEtShwx3SeqQ4S5JHTLcJalDhrskdchwl6QOGe6S1CHDXZI6ZLhLUocMd0nqkOEuSR0y3CWpQyOFe5K7k9ya5KYku1rZsUmuTnJn+31MK0+S9ybZneSWJCcv5AuQJD3VbEbu/6SqTqqqjW3/ImBnVa0HdrZ9gDOB9e1nK/C+cTVWkjSa+UzLbAK2t+3twDlD5ZfVwHXAiiTHz+N5JEmzNGq4F/A/k9yQZGsrW1VV97Xt+4FVbXs1cO/QuXtamSRpQg4fsd7PVdXeJH8fuDrJXw8frKpKUrN54vYmsRVg7dq1szlVkjSDkUbuVbW3/X4A+ARwCvDN/dMt7fcDrfpe4ISh09e0sgMfc1tVbayqjStXrpz7K5AkPcWM4Z7k6UmesX8beBlwG7AD2NyqbQaubNs7gNe1q2ZOBR4dmr6RJE3AKNMyq4BPJNlf/8NV9ekkfwVckWQLcA9wXqt/FXAWsBt4HDh/7K2WJB3UjOFeVXcBz5+i/EHg9CnKC7hgLK2TJM2Jd6hKUocMd0nqkOEuSR0y3CWpQ4a7JHXIcJekDhnuktQhw12SOmS4S1KHDHdJ6pDhLkkdMtwlqUOGuyR1yHCXpA4Z7pLUIcNdkjpkuEtShwx3SeqQ4S5JHTLcJalDhrskdchwl6QOGe6S1CHDXZI6ZLhLUocMd0nq0MjhnuSwJDcm+VTbPzHJ9Ul2J7k8yRGt/Mi2v7sdX7cwTZckTWc2I/d/BdwxtP8O4N1V9VzgYWBLK98CPNzK393qSZImaKRwT7IGeAVwSdsP8BLg463KduCctr2p7dOOn97qS5ImZNSR+3uA3wF+2PafBTxSVU+0/T3A6ra9GrgXoB1/tNX/EUm2JtmVZNe+ffvm2HxJ0lRmDPckvwQ8UFU3jPOJq2pbVW2sqo0rV64c50NL0rJ3+Ah1XgS8MslZwFHAM4GLgRVJDm+j8zXA3lZ/L3ACsCfJ4cDRwINjb7kkaVozjtyr6s1Vtaaq1gGvBq6pql8DPgu8qlXbDFzZtne0fdrxa6qqxtpqSdJBzec6938HvCnJbgZz6pe28kuBZ7XyNwEXza+JkqTZGmVa5v+rqmuBa9v2XcApU9T5DnDuGNomSZoj71CVpA4Z7pLUIcNdkjo0qzn3Q8nZF145Zfkn37Vpwi2RpMlz5C5JHTLcJalDhrskdchwl6QOGe6S1CHDXZI6ZLhLUocMd0nqkOEuSR0y3CWpQ90uPyBJkzbdsicw+aVPHLlLUocMd0nqkOEuSR0y3CWpQ4a7JHXIcJekDhnuktQhw12SOmS4S1KHDHdJ6pDhLkkdmjHckxyV5ItJbk5ye5L/2MpPTHJ9kt1JLk9yRCs/su3vbsfXLexLkCQdaJSR+3eBl1TV84GTgDOSnAq8A3h3VT0XeBjY0upvAR5u5e9u9SRJEzRjuNfAY233ae2ngJcAH2/l24Fz2vamtk87fnqSjK3FkqQZjTTnnuSwJDcBDwBXA18DHqmqJ1qVPcDqtr0auBegHX8UeNYUj7k1ya4ku/bt2ze/VyFJ+hEjhXtV/aCqTgLWAKcAz5vvE1fVtqraWFUbV65cOd+HkyQNmdXVMlX1CPBZ4GeAFUn2f9nHGmBv294LnADQjh8NPDiW1kqSRjLK1TIrk6xo238PeClwB4OQf1WrthnY/xUkO9o+7fg1VVXjbLQk6eBG+Zq944HtSQ5j8GZwRVV9KsmXgY8meTtwI3Bpq38p8MEku4GHgFcvQLslSQcxY7hX1S3AC6Yov4vB/PuB5d8Bzh1L6yRJc+IdqpLUIcNdkjpkuEtShwx3SeqQ4S5JHTLcJalDo1znLkmHlLMvvHLK8k++a9OEW7J4HLlLUocMd0nqkOEuSR0y3CWpQ4a7JHXIcJekDnkppCTN0nSXWi4ljtwlqUOGuyR1yHCXpA4Z7pLUIcNdkjpkuEtShwx3SeqQ17lLWjaW01LAhrukZa/H0HdaRpI6ZLhLUodmnJZJcgJwGbAKKGBbVV2c5FjgcmAdcDdwXlU9nCTAxcBZwOPA66vqSwvTfEnL2aGwxstiGWXk/gRwYVVtAE4FLkiyAbgI2FlV64GdbR/gTGB9+9kKvG/srZYkHdSM4V5V9+0feVfV3wF3AKuBTcD2Vm07cE7b3gRcVgPXASuSHD/2lkuSpjWrOfck64AXANcDq6rqvnbofgbTNjAI/nuHTtvTyiRJEzLypZBJfhz4r8C/rqq/HUytD1RVJanZPHGSrQymbVi7du1sTpXUKefQx2ekcE/yNAbB/qGq+vNW/M0kx1fVfW3a5YFWvhc4Yej0Na3sR1TVNmAbwMaNG2f1xiBJk3Aov9nMOC3Trn65FLijqv5g6NAOYHPb3gxcOVT+ugycCjw6NH0jSZqAUUbuLwJeC9ya5KZW9u+B3wOuSLIFuAc4rx27isFlkLsZXAp5/lhbLEma0YzhXlV/CWSaw6dPUb+AC+bZLknSPLi2jKSJO5Tnsg8VLj8gSR1y5C5pwThCXzyO3CWpQ4a7JHXIcJekDhnuktShZfeB6sE+4DmUv1JLkoY5cpekDhnuktShZTctI2n8vJ596XHkLkkdcuQuSRMw3b9uFupCDsNdWqYmHTaaLKdlJKlDhrskdchwl6QOGe6S1CE/UJU0Eq9lP7Q4cpekDhnuktQhw12SOuScu7REjWuOe7Y3JTm33gfDXVpkhqkWguEudc43j+XJOXdJ6tCMI/ck7wd+CXigqv5xKzsWuBxYB9wNnFdVDycJcDFwFvA48Pqq+tLCNF1amlyQS0vBKCP3DwBnHFB2EbCzqtYDO9s+wJnA+vazFXjfeJopSZqNGcO9qj4PPHRA8SZge9veDpwzVH5ZDVwHrEhy/LgaK0kazVw/UF1VVfe17fuBVW17NXDvUL09rew+DpBkK4PRPWvXrp1jM8bLf05L6sW8P1CtqgJqDudtq6qNVbVx5cqV822GJGnIXMP9m/unW9rvB1r5XuCEoXprWpkkaYLmOi2zA9gM/F77feVQ+W8l+SjwQuDRoekbaVnzenNN0iiXQn4EOA04Lske4HcZhPoVSbYA9wDntepXMbgMcjeDSyHPX4A2S0uCYa2lbMZwr6rXTHPo9CnqFnDBfBslSZoflx+Q8Eop9cflBySpQ47cpYNwXl2HKkfuktQhw12SOmS4S1KHDHdJ6pDhLkkdMtwlqUNeCqlZ8WYf6dBguI/AQDv0+N9My53TMpLUIUfu8+DocPHN9g5S7zjVcmG4d2ScbzaGoHRoM9y1oMb1huObjTQ7hvsysBSD0SktaWEZ7gtgoYPrUAprSYvDq2UkqUOO3JcApyie5L8ApPEw3Jcwg07SXDktI0kdMtwlqUOGuyR1yDn3CXIOXdKkOHKXpA4Z7pLUoQUJ9yRnJPlKkt1JLlqI55AkTW/s4Z7kMOA/A2cCG4DXJNkw7ueRJE1vIUbupwC7q+quqvoe8FFg+d1qKUmLaCGullkN3Du0vwd44YGVkmwFtrbdx5J8ZQHaMpPjgG8twvMeKuyfmdlHB2f/zCB/MK8+evZ0BxbtUsiq2gZsW6znB0iyq6o2LmYbljL7Z2b20cHZPzNbqD5aiGmZvcAJQ/trWpkkaUIWItz/Clif5MQkRwCvBnYswPNIkqYx9mmZqnoiyW8BnwEOA95fVbeP+3nGZFGnhQ4B9s/M7KODs39mtiB9lKpaiMeVJC0i71CVpA4Z7pLUoS7DfablD5KsTfLZJDcmuSXJWUPH3tzO+0qSl0+25ZMz1z5Ksi7Jt5Pc1H7+ePKtX3gj9M+zk+xsfXNtkjVDxzYnubP9bJ5syydnnn30g6G/oS4vuEjy/iQPJLltmuNJ8t7Wf7ckOXno2Pz/hqqqqx8GH+J+DfgJ4AjgZmDDAXW2Ab/ZtjcAdw9t3wwcCZzYHuewxX5NS6yP1gG3LfZrWAL98zFgc9t+CfDBtn0scFf7fUzbPmaxX9NS6qO2/9hiv4YJ9NHPAydP9/8LcBbwP4AApwLXj/NvqMeR+yjLHxTwzLZ9NPCNtr0J+GhVfbeqvg7sbo/Xm/n00XIwSv9sAK5p258dOv5y4OqqeqiqHgauBs6YQJsnbT59tCxU1eeBhw5SZRNwWQ1cB6xIcjxj+hvqMdynWv5g9QF13gb88yR7gKuAN87i3B7Mp48ATmzTNZ9L8uIFbeniGKV/bgZ+pW3/MvCMJM8a8dwezKePAI5KsivJdUnOWdimLlnT9eFY/oZ6DPdRvAb4QFWtYfBPow8mWa59MZ3p+ug+YG1VvQB4E/DhJM88yOP06t8Cv5DkRuAXGNyF/YPFbdKSc7A+enYNbrn/VeA9SZ6zSG3sVo+BNsryB1uAKwCq6v8ARzFY4Gi5LJ0w5z5qU1YPtvIbGMy7/sMFb/Fkzdg/VfWNqvqV9ib3llb2yCjndmI+fURV7W2/7wKuBV4wgTYvNdP14Vj+hnoM91GWP/gb4HSAJD/FILj2tXqvTnJkkhOB9cAXJ9byyZlzHyVZ2dbsJ8lPMOijuybW8smYsX+SHDf0r703A+9v258BXpbkmCTHAC9rZb2Zcx+1vjlyfx3gRcCXJ9bypWMH8Lp21cypwKNVdR/j+hta7E+UF+hT6rOArzIYVb6llf0n4JVtewPwBQZzgjcBLxs69y3tvK8AZy72a1lqfQT8U+D2VvYl4OzFfi2L1D+vAu5sdS4Bjhw6918w+DB+N3D+Yr+WpdZHwM8Ct7a/rVuBLYv9Whaofz7CYBrz+wzmzbcAbwDe0I6HwRcbfa31w8Zx/g25/IAkdajHaRlJWvYMd0nqkOEuSR0y3CWpQ4a7JHXIcNeSluSSJBtmUX9jkve27dcn+aNZPt/w+acl+dnZtVhaGsb+NXvSOFXVr8+y/i5g11yeK8nhB5x/GvAY8L/n8njjkOSwqnJZA82aI3ctCUmenuS/J7k5yW1J/lkrvzbJxrb9WJJ3Jrk9yV8kOaUdvyvJK1ud05J8aorHPzvJ9W3Bs79IsqqVvy3JB5N8gcH6Oacl+VSSdQxuOPk3bc3xFyf5epKntfOeObw/9DzntvbfnOTzreywJL/fym9J8sZWfnprz60ZrP29/67Nu5O8I8mXgHOTPCfJp5PckOR/JXneQvw3UF8cuWupOAP4RlW9AiDJ0VPUeTpwTVX9dpJPAG8HXsrgbtrtPHUJhWF/CZxaVZXk14HfAS5sxzYAP1dV305yGkBV3Z3BF5E8VlW/39p0LfAK4L8xuN3+z6vq+wc8z1uBl1fV3iQrWtlWBuvgn1SDL5A/NslRwAeA06vqq0kuA34TeE8758GqOrk9704GdzXemeSFwH9hsD66NC1H7loqbgVe2kasL66qR6eo8z3g00P1P9fC9VYG4Xkwa4DPJLkV+G3gHw0d21FV3x6hjZcA57ft84E/naLOF4APJPmXDL7QAuAXgT+pqicAquoh4CeBr1fVV1ud7Qy+3GG/ywGS/DiD2/U/luQm4E+A40doq5Y5w11LQgu5kxkE9duTvHWKat+vJ9fL+CHw3XbuD5n5X6F/CPxRVf008BsMFkLb7/+O2MYvAOva6P6wqnrK16dV1RuA/8BgVb8b8uT65bO1v00/BjxSVScN/fzUHB9Ty4jhriUhyT8AHq+qPwPeySDox+lonlw2ddTvpPw74BkHlF0GfJipR+0keU5VXV9Vb2Ww0ugJDL5J5zeSHN7qHMtgYbp1SZ7bTn0t8LkDH6+q/hb4epJz27lJ8vwR269lzHDXUvHTwBfb1MPvMphPH6e3MZjauAH41ojnfBL45f0fqLayDzH4XsuPTHPOO9sHpLcxuMrmZgbTOX8D3JLkZuBXq+o7DKZ2Ptamin4ITPdl478GbGnn3s4y+7o6zY2rQkqzkORVwKaqeu1it0U6GK+WkUaU5A+BMxmsYy4taY7cJalDzrlLUocMd0nqkOEuSR0y3CWpQ4a7JHXo/wGLDuOcFHH2SgAAAABJRU5ErkJggg==\n", "text/plain": [ "
" ] @@ -930,104 +934,76 @@ } ], "source": [ - "plt.hist([score for _, _, score in data[::1] if score > 0.94], bins=50);" + "plt.hist([score for _, _, score in data if score >= 0.79], bins=50);\n", + "plt.xlabel('similarity score')\n", + "plt.show()" ] }, { "cell_type": "markdown", - "metadata": { - "pycharm": {} - }, + "metadata": {}, "source": [ - "Now it looks like a good threshold should be above `0.95`. Let's have a look at some of the candidate matches around there." + "Indeed, there is a cluster of scores between 0.9 and 1.0. To better visualize that these are indeed the scores for the matches, we will now extract the true_matches from the datasets and group the similarity scores into those for the matches and the non-matches (We can do this because we know the ground truth of the dataset)." ] }, { "cell_type": "code", "execution_count": 19, - "metadata": { - "pycharm": { - "is_executing": false - } - }, + "metadata": {}, "outputs": [], "source": [ - "def sample(data, threshold, num_samples, epsilon=0.01):\n", - " samples = []\n", - " for row in data:\n", - " if abs(row[2] - threshold) <= epsilon:\n", - " samples.append(row)\n", - " if len(samples) >= num_samples:\n", - " break\n", - " return samples\n", - "\n", - "def lookup_originals(candidate_pair):\n", - " a, b, score = candidate_pair\n", - " a_index, b_index = [x[1] for x in sorted([a, b])]\n", - " a = dfA.iloc[a_index]\n", - " b = dfB.iloc[b_index]\n", - " return a, b" + "# rec_id in dfA has the form 'rec-1070-org'. We only want the number. Additionally, as we are\n", + "# interested in the position of the records, we create a new index which contains the row numbers.\n", + "dfA_ = dfA.rename(lambda x: x[4:-4], axis='index').reset_index()\n", + "dfB_ = dfB.rename(lambda x: x[4:-6], axis='index').reset_index()\n", + "# now we can merge dfA_ and dfB_ on the record_id.\n", + "a = pd.DataFrame({'ida': dfA_.index, 'rec_id': dfA_['rec_id']})\n", + "b = pd.DataFrame({'idb': dfB_.index, 'rec_id': dfB_['rec_id']})\n", + "dfj = a.merge(b, on='rec_id', how='inner').drop(columns=['rec_id'])\n", + "# and build a set of the corresponding row numbers.\n", + "true_matches = set((row[0], row[1]) for row in dfj.itertuples(index=False))" ] }, { "cell_type": "code", "execution_count": 20, - "metadata": { - "pycharm": { - "is_executing": false - } - }, + "metadata": {}, "outputs": [], "source": [ - "def look_at_per_field_accuracy(threshold = 0.999, num_samples = 100):\n", - " results = []\n", - " for i, candidate in enumerate(sample(data, threshold, num_samples, 0.01), start=1):\n", - " record_a, record_b = lookup_originals(candidate)\n", - " results.append(record_a == record_b)\n", - "\n", - " print(\"Proportion of exact matches for each field using threshold: {}\".format(threshold))\n", - " print(sum(results)/num_samples)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "pycharm": {} - }, - "source": [ - "So we should expect a very high proportion of matches across all fields for high thresholds:" + "scores_matches = []\n", + "scores_non_matches = []\n", + "for a, b, score in data:\n", + " if score < 0.79:\n", + " continue\n", + " if (a, b) in true_matches:\n", + " scores_matches.append(score)\n", + " else:\n", + " scores_non_matches.append(score)" ] }, { "cell_type": "code", "execution_count": 21, - "metadata": { - "pycharm": { - "is_executing": false - } - }, + "metadata": {}, "outputs": [ { - "name": "stdout", - "output_type": "stream", - "text": [ - "Proportion of exact matches for each field using threshold: 0.999\n", - "given_name 0.95\n", - "surname 0.94\n", - "street_number 0.85\n", - "address_1 0.93\n", - "address_2 0.75\n", - "suburb 0.95\n", - "postcode 0.97\n", - "state 1.00\n", - "date_of_birth 0.98\n", - "soc_sec_id 0.38\n", - "dtype: float64\n" - ] + "data": { + "image/png": "iVBORw0KGgoAAAANSUhEUgAAAXcAAAEGCAYAAACevtWaAAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADh0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uMy4xLjEsIGh0dHA6Ly9tYXRwbG90bGliLm9yZy8QZhcZAAAb8klEQVR4nO3dfXRV1b3u8e/Di1CrBVSuA4mnYIsWMRAQLEpbUY6I+AKlanuOVeTYUr20o1e9CirVtsM6cNhzqrZePVRRsFrx5fSKXq8VfEO9RQwaEERLtLQEqaQotFSsUn/3jz1Jt5iQnWQnO1l5PmNkZK255lpr7pnwZDH32nMpIjAzs2zpUuoGmJlZ8TnczcwyyOFuZpZBDnczswxyuJuZZVC3UjcA4IADDogBAwaUuhlmZh3KihUr/hQRfevb1i7CfcCAAVRWVpa6GWZmHYqk3ze0zcMyZmYZ5HA3M8sgh7uZWQYVNOYuqTdwK3AEEMC/Aa8BC4EBwHrgzIh4R5KAG4CJwLvAuRHxYtFbbmat5oMPPqCmpob33nuv1E0xoGfPnpSVldG9e/eC9yn0DdUbgEcj4nRJewF7A5cDj0fEHEmzgFnATOAkYFD6+jxwc/puZh1ETU0N++67LwMGDCB3vWalEhFs2bKFmpoaBg4cWPB+jQ7LSOoFfAm4LZ3o/YjYCkwC5qdq84HJaXkSsCBylgG9JfUr/KWYWam999577L///g72dkAS+++/f5P/F1XImPtAoBa4XdJLkm6V9EngwIjYlOr8ETgwLfcHNuTtX5PKdm/wdEmVkipra2ub1Ggza30O9vajOT+LQsK9GzACuDkihgN/JTcEUydy8wY3ae7giJgbESMjYmTfvvXeg29mZs1UyJh7DVATEc+n9fvJhftbkvpFxKY07LI5bd8IHJy3f1kqM7MO6tSLHyzq8R7690lFPV6+qqoq3nzzTSZOnLjHevvssw/bt29vtXaUWqNX7hHxR2CDpMNS0TjgFWARMDWVTQV2/fQXAecoZzSwLW/4plWdufACzlx4QVucyszaqaqqKh555JFSN6PkCr3P/TvAXZJWARXANcAc4ARJ64B/TusAjwBvANXAz4H/XtQWm1nmrV+/ns997nOce+65HHrooZx11lksWbKEMWPGMGjQIJYvX87y5cs5+uijGT58OMcccwyvvfYa77//PldeeSULFy6koqKChQsXsn37dqZNm0Z5eTlDhw7lgQceqDvPFVdcwbBhwxg9ejRvvfUWALW1tXzlK19h1KhRjBo1iueeew6Ap59+moqKCioqKhg+fDh/+ctfStI3hSroVsiIqAJG1rNpXD11A5jRwnaZWSdXXV3Nfffdx7x58xg1ahR33303zz77LIsWLeKaa65hwYIFPPPMM3Tr1o0lS5Zw+eWX88ADD/DDH/6QyspKfvaznwEwc+ZMevXqxcsvvwzAO++8A8Bf//pXRo8ezY9+9CMuvfRSfv7znzN79my++93vcuGFF/KFL3yBP/zhD5x44omsXbuWH//4x9x0002MGTOG7du307Nnz5L1TSHaxcRhZma7GzhwIOXl5QAMGTKEcePGIYny8nLWr1/Ptm3bmDp1KuvWrUMSH3zwQb3HWbJkCffcc0/dep8+fQDYa6+9OOWUUwA48sgjWbx4cV39V155pa7+n//8Z7Zv386YMWO46KKLOOuss5gyZQplZWWt8rqLxdMPmFm71KNHj7rlLl261K136dKFnTt38r3vfY/jjjuO1atX89BDDzX5PvDu3bvX3WLYtWtXdu7cCcCHH37IsmXLqKqqoqqqio0bN7LPPvswa9Ysbr31Vnbs2MGYMWN49dVXi/RKW4fD3cw6pG3bttG/f+4jNHfccUdd+b777vuR8fATTjiBm266qW5917BMQ8aPH89Pf/rTuvWqqioAXn/9dcrLy5k5cyajRo1q9+HuYRkza1Rr3rrYXJdeeilTp07l6quv5uSTT64rP+6445gzZw4VFRVcdtllzJ49mxkzZnDEEUfQtWtXrrrqKqZMmdLgcW+88UZmzJjB0KFD2blzJ1/60pe45ZZbuP7663nyySfp0qULQ4YM4aSTTmqLl9lsyr3/WVojR46MYjysY9dtkPd+9eYWH8usM1u7di2DBw8udTMsT30/E0krIqK+m108LGNmlkUOdzOzDHK4m5llkMPdzCyDHO5mZhnkcDczyyDf525mjSr2bKsd9Xbla665hssvv3yPdc4991xOOeUUTj/99DZqVf185W5mVqBrrrmm1E0omMPdzNqd9evXM3jwYL75zW8yZMgQxo8fz44dO6iqqmL06NEMHTqUL3/5y3VTCYwdO5aZM2dy1FFHceihh/LMM8/Ue9yxY8dy4YUXMnLkSAYPHswLL7zAlClTGDRoELNnz66rN3nyZI488kiGDBnC3LlzAZg1axY7duygoqKCs846C4AFCxYwdOhQhg0bxtlnn123/9KlSznmmGM45JBDuP/+++vKr7vuOkaNGsXQoUO56qqrgNzslCeffDLDhg3jiCOOYOHChUXpQ4e7mbVL69atY8aMGaxZs4bevXvzwAMPcM4553DttdeyatUqysvL+cEPflBXf+fOnSxfvpzrr7/+I+W722uvvaisrOT8889n0qRJ3HTTTaxevZo77riDLVu2ADBv3jxWrFhBZWUlN954I1u2bGHOnDl84hOfoKqqirvuuos1a9Zw9dVX88QTT7By5UpuuOGGunNs2rSJZ599locffphZs3JPJX3sscdYt24dy5cvp6qqihUrVrB06VIeffRRDjroIFauXMnq1auZMGFCUfrP4W5m7dLAgQOpqKgAclPyvv7662zdupVjjz0WgKlTp7J06dK6+rvmiznyyCNZv359g8c97bTTACgvL2fIkCH069ePHj16cMghh7BhwwYgN7/Mrod4bNiwgXXr1n3sOE888QRnnHEGBxxwAAD77bdf3bbJkyfTpUsXDj/88LqHgDz22GM89thjDB8+nBEjRvDqq6+ybt06ysvLWbx4MTNnzuSZZ56hV69eze2yj/AbqmbWLuVP+du1a1e2bt1aUP386XunTZvGSy+9xEEHHVT36L38qYN3n1Z4586dPPXUUyxZsoTf/OY37L333owdO7bJ0wnnH3fX/F0RwWWXXca3vvWtj9V/8cUXeeSRR5g9ezbjxo3jyiuvbNL56uMrdzPrEHr16kWfPn3qxtPvvPPOuqv4htx+++1Nfqbqtm3b6NOnD3vvvTevvvoqy5Ytq9vWvXv3uoeCHH/88dx33311Qzlvv/32Ho974oknMm/evLqHcm/cuJHNmzfz5ptvsvfee/P1r3+dSy65hBdffLHgtu6Jr9zNrFHt5dbF+fPnc/755/Puu+9yyCGHcPvttxf9HBMmTOCWW25h8ODBHHbYYYwePbpu2/Tp0xk6dCgjRozgrrvu4oorruDYY4+la9euDB8+/CPzyu9u/PjxrF27lqOPPhqAffbZh1/84hdUV1dzySWX0KVLF7p3787NNxenrz3lr5l9jKf8bX885a+ZmTnczcyyyOFuZvVqD0O2ltOcn4XD3cw+pmfPnmzZssUB3w5EBFu2bKFnz55N2s93y5jZx5SVlVFTU0NtbW2pm2Lk/tiWlZU1aR+Hu5l9TPfu3Rk4cGCpm2EtUNCwjKT1kl6WVCWpMpXtJ2mxpHXpe59ULkk3SqqWtErSiNZ8AWZm9nFNGXM/LiIq8u6pnAU8HhGDgMfTOsBJwKD0NR3wTedmZm2sJW+oTgLmp+X5wOS88gWRswzoLalfC85jZmZNVGi4B/CYpBWSpqeyAyNiU1r+I3BgWu4PbMjbtyaVfYSk6ZIqJVX6TRszs+Iq9A3VL0TERkn/DVgs6dX8jRERkpp0z1REzAXmQm76gabsa2Zme1bQlXtEbEzfNwO/Ao4C3to13JK+b07VNwIH5+1elsrMzKyNNBrukj4pad9dy8B4YDWwCJiaqk0FHkzLi4Bz0l0zo4FtecM3ZmbWBgoZljkQ+JWkXfXvjohHJb0A3CvpPOD3wJmp/iPARKAaeBeYVvRWm5nZHjUa7hHxBjCsnvItwLh6ygOYUZTWmZlZs3huGTOzDHK4m5llkMPdzCyDHO5mZhnkcDczyyCHu5lZBjnczcwyyOFuZpZBDnczswxyuJuZZZDD3cwsgxzuZmYZ5HA3M8sgh7uZWQY53M3MMsjhbmaWQQ53M7MMcribmWWQw93MLIMc7mZmGeRwNzPLIIe7mVkGOdzNzDLI4W5mlkEOdzOzDHK4m5llUMHhLqmrpJckPZzWB0p6XlK1pIWS9krlPdJ6ddo+oHWabmZmDWnKlft3gbV569cCP4mIzwLvAOel8vOAd1L5T1I9MzNrQwWFu6Qy4GTg1rQu4Hjg/lRlPjA5LU9K66Tt41J9MzNrI4VeuV8PXAp8mNb3B7ZGxM60XgP0T8v9gQ0Aafu2VP8jJE2XVCmpsra2tpnNNzOz+jQa7pJOATZHxIpinjgi5kbEyIgY2bdv32Ie2sys0+tWQJ0xwGmSJgI9gU8BNwC9JXVLV+dlwMZUfyNwMFAjqRvQC9hS9JabmVmDGr1yj4jLIqIsIgYAXwOeiIizgCeB01O1qcCDaXlRWidtfyIioqitNjOzPWrJfe4zgYskVZMbU78tld8G7J/KLwJmtayJZmbWVIUMy9SJiKeAp9LyG8BR9dR5DzijCG0zM7Nm8idUzcwyyOFuZpZBDnczswxyuJuZZZDD3cwsgxzuZmYZ5HA3M8ugThHuZy68gDMXXlDqZpiZtZlOEe5mZp2Nw93MLIMc7mZmGeRwNzPLIIe7mVkGOdzNzDLI4W5mVmSnXvwgp178YOMVW5HD3cwsgxzuZmYZ5HA3M8sgh7uZWQY53M3MMsjhbmaWQQ53M7MMcribmWWQw93MLIMc7mZmGeRwNzPLoEbDXVJPScslrZS0RtIPUvlASc9Lqpa0UNJeqbxHWq9O2we07kswM7PdFXLl/jfg+IgYBlQAEySNBq4FfhIRnwXeAc5L9c8D3knlP0n1zMysDTUa7pGzPa12T18BHA/cn8rnA5PT8qS0Tto+TpKK1mIzM2tUQWPukrpKqgI2A4uB14GtEbEzVakB+qfl/sAGgLR9G7B/PcecLqlSUmVtbW3LXoWZmX1EQeEeEX+PiAqgDDgK+FxLTxwRcyNiZESM7Nu3b0sPZ2ZmeZp0t0xEbAWeBI4GekvqljaVARvT8kbgYIC0vRewpSitNTOzghRyt0xfSb3T8ieAE4C15EL+9FRtKrDrsSOL0jpp+xMREcVstJmZ7Vm3xqvQD5gvqSu5Pwb3RsTDkl4B7pF0NfAScFuqfxtwp6Rq4G3ga63QbjMz24NGwz0iVgHD6yl/g9z4++7l7wFnFKV1ZmbWLP6EqplZBjnczcwyyOFuZpZBDnczswxyuJuZZZDD3cwsgxzuZmYZVMiHmMzMOpxTL36wbvmhf59UwpaUhq/czcwyyOFuZpZBDnczswxyuJuZZZDD3cwsgxzuZmbNdOrFD37krpz2xOFuZpZBDnczswxyuJuZZZDD3cwsgxzuZmYZ5HA3M8sgh7uZWQY53M3MMshT/ppZp9JZpgJ2uJuZkb3Q97CMmVkGOdzNrENrz/O7lFKj4S7pYElPSnpF0hpJ303l+0laLGld+t4nlUvSjZKqJa2SNKK1X4SZmX1UIVfuO4GLI+JwYDQwQ9LhwCzg8YgYBDye1gFOAgalr+nAzUVvtZmZ7VGj4R4RmyLixbT8F2At0B+YBMxP1eYDk9PyJGBB5CwDekvqV/SWm5lZg5o05i5pADAceB44MCI2pU1/BA5My/2BDXm71aSy3Y81XVKlpMra2tomNtvMsqqhMXSPrTdNweEuaR/gAeB/RMSf87dFRADRlBNHxNyIGBkRI/v27duUXc3MrBEF3ecuqTu5YL8rIv4rFb8lqV9EbErDLptT+Ubg4Lzdy1KZmVmH01Hvfy/kbhkBtwFrI+I/8jYtAqam5anAg3nl56S7ZkYD2/KGb8zMrA0UcuU+BjgbeFlSVSq7HJgD3CvpPOD3wJlp2yPARKAaeBeYVtQWm5lZoxoN94h4FlADm8fVUz+AGS1sl5mZtYA/oWpmJeG7X1qXw93MLIMc7mbWqnyFXhoOdzOzDHK4m5llkMPdzCyDHO5mZhnkcDczyyCHu5lZBnXqcD9z4QWcufCCUjfDzKzoCpoV0sxsTzrqzIlZ1qmv3M3M2lJbfqDL4W5mlkEeljHrxDyckl2+cjczyyCHu5lZBjnczcwyyOFuZpZBfkPVzArmN2A7Dl+5m5llkMPdzCyDHO5m7VhDn2hsarl1Ph5zN7OP8dh6x+crd7N2wFfcVmwOd7NOwH88Oh8Py5gVmYc0rD1o9Mpd0jxJmyWtzivbT9JiSevS9z6pXJJulFQtaZWkEa3ZeDMzq18hwzJ3ABN2K5sFPB4Rg4DH0zrAScCg9DUduLk4zTQzs6ZoNNwjYinw9m7Fk4D5aXk+MDmvfEHkLAN6S+pXrMaamVlhmvuG6oERsSkt/xE4MC33Bzbk1atJZR8jabqkSkmVtbW1zWyGmZnVp8V3y0REANGM/eZGxMiIGNm3b9+WNsPMzPI0N9zf2jXckr5vTuUbgYPz6pWlMjMza0PNvRVyETAVmJO+P5hX/m1J9wCfB7blDd90GGcuvKBu+d6v+j1hKw7fImltqdFwl/RLYCxwgKQa4CpyoX6vpPOA3wNnpuqPABOBauBdYFortNmsXXBYW3vWaLhHxL80sGlcPXUDmNHSRpmZWct4+gGzxB/RtyxxuJuZZZDnljFrhMfWrSPylbuZWQY53M3MMsjhbmaWQQ53M7MMcribmWWQw93MLIMc7tYk/qCPWcfgcLfM8h8i68wc7mZmGeRwtw6vqVfovqK3zsDhnjHFCi4HoFnH5rllmqAjP8RjV1C39dwoxZyXpVSvwawjcrhbSXgyLrPW5XDv5Ip1NeywNmtfHO4dkIPUzBrjcLd2xX+4zIrD4d4OONDMrNgc7kWw+100Hsc2s1JzuLfArvD9xFElboiZ2W78ISYzswxyuJuZZZCHZQrg4Rcz62gyG+75b0aWKpQ78nQFZtaxtUq4S5oA3AB0BW6NiDmtcZ7mauhKvLWv0D/6B+dRwKFvZq2j6OEuqStwE3ACUAO8IGlRRLxS7HNB+7hCL6ZdV/sOfTNrida4cj8KqI6INwAk3QNMAlol3DuLhkK/qeWNHX/3fTy0ZNYxKSKKe0DpdGBCRHwjrZ8NfD4ivr1bvenA9LR6GPBaURvSuAOAP7XxOTsa91Hj3EeNcx/tWUv659MR0be+DSV7QzUi5gJzS3V+SZURMbJU5+8I3EeNcx81zn20Z63VP61xn/tG4OC89bJUZmZmbaQ1wv0FYJCkgZL2Ar4GLGqF85iZWQOKPiwTETslfRv4NblbIedFxJpin6cISjYk1IG4jxrnPmqc+2jPWqV/iv6GqpmZlZ7nljEzyyCHu5lZBmUy3CVNkPSapGpJs+rZ/k+SnpT0kqRVkibmbbss7feapBPbtuVtp7l9JGmApB2SqtLXLW3f+tZXQP98WtLjqW+eklSWt22qpHXpa2rbtrzttLCP/p73O5TJGy4kzZO0WdLqBrZL0o2p/1ZJGpG3reW/QxGRqS9yb+K+DhwC7AWsBA7frc5c4IK0fDiwPm95JdADGJiO07XUr6md9dEAYHWpX0M76J/7gKlp+XjgzrS8H/BG+t4nLfcp9WtqT32U1reX+jW0QR99CRjR0L8XYCLwfwEBo4Hni/k7lMUr97rpDyLifWDX9Af5AvhUWu4FvJmWJwH3RMTfIuJ3QHU6Xta0pI86g0L653DgibT8ZN72E4HFEfF2RLwDLAYmtEGb21pL+qhTiIilwNt7qDIJWBA5y4DekvpRpN+hLIZ7f2BD3npNKsv3feDrkmqAR4DvNGHfLGhJHwEMTMM1T0v6Yqu2tDQK6Z+VwJS0/GVgX0n7F7hvFrSkjwB6SqqUtEzS5NZtarvVUB8W5Xcoi+FeiH8B7oiIMnL/NbpTUmfti4Y01EebgH+KiOHARcDdkj61h+Nk1f8EjpX0EnAsuU9h/720TWp39tRHn47cR+7/Fbhe0mdK1MbMymKgFTL9wXnAvQAR8RugJ7nJezrL1AnN7qM0ZLUlla8gN+56aKu3uG012j8R8WZETEl/5K5IZVsL2TcjWtJHRMTG9P0N4ClgeBu0ub1pqA+L8juUxXAvZPqDPwDjACQNJhdctane1yT1kDQQGAQsb7OWt51m95GkvmnOfiQdQq6P3mizlreNRvtH0gF5/9u7DJiXln8NjJfUR1IfYHwqy5pm91Hqmx676gBj6JxTgi8Czkl3zYwGtkXEJor1O1Tqd5Rb6V3qicBvyV1VXpHKfgiclpYPB54jNyZYBYzP2/eKtN9rwEmlfi3trY+ArwBrUtmLwKmlfi0l6p/TgXWpzq1Aj7x9/43cm/HVwLRSv5b21kfAMcDL6XfrZeC8Ur+WVuqfX5IbxvyA3Lj5ecD5wPlpu8g92Oj11A8ji/k75OkHzMwyKIvDMmZmnZ7D3cwsgxzuZmYZ5HA3M8sgh7uZWQY53K1dk3SrpMObUH+kpBvT8rmSftbE8+XvP1bSMU1rsVn7UPTH7JkVU0R8o4n1K4HK5pxLUrfd9h8LbAf+X3OOVwySukaEpzWwJvOVu7ULkj4p6f9IWilptaSvpvKnJI1My9slXSdpjaQlko5K29+QdFqqM1bSw/Uc/1RJz6cJz5ZIOjCVf1/SnZKeIzd/zlhJD0saQO4DJxemOce/KOl3krqn/T6Vv553njNS+1dKWprKukr6cSpfJek7qXxcas/Lys39vetTm+slXSvpReAMSZ+R9KikFZKekfS51vgZWLb4yt3aiwnAmxFxMoCkXvXU+STwRERcIulXwNXACeQ+TTufj0+hkO9ZYHREhKRvAJcCF6dthwNfiIgdksYCRMR65R5Esj0ifpza9BRwMvC/yX3c/r8i4oPdznMlcGJEbJTUO5VNJzcPfkXkHiC/n6SewB3AuIj4raQFwAXA9WmfLRExIp33cXKfalwn6fPA/yI3P7pZg3zlbu3Fy8AJ6Yr1ixGxrZ467wOP5tV/OoXry+TCc0/KgF9Lehm4BBiSt21RROwooI23AtPS8jTg9nrqPAfcIemb5B5oAfDPwH9GxE6AiHgbOAz4XUT8NtWZT+7hDrssBJC0D7mP698nqQr4T6BfAW21Ts7hbu1CCrkR5IL6aklX1lPtg/jHfBkfAn9L+35I4/8L/Snws4goB75FbiK0Xf5aYBufAwakq/uuEfGxx6dFxPnAbHKz+q3QP+Yvb6pdbeoCbI2Iiryvwc08pnUiDndrFyQdBLwbEb8AriMX9MXUi39Mm1roMyn/Auy7W9kC4G7qv2pH0mci4vmIuJLcTKMHk3uSzrckdUt19iM3Md0ASZ9Nu54NPL378SLiz8DvJJ2R9pWkYQW23zoxh7u1F+XA8jT0cBW58fRi+j65oY0VwJ8K3Och4Mu73lBNZXeRe67lLxvY57r0BulqcnfZrCQ3nPMHYJWklcC/RsR75IZ27ktDRR8CDT1s/CzgvLTvGjrZ4+qseTwrpFkTSDodmBQRZ5e6LWZ74rtlzAok6afASeTmMTdr13zlbmaWQR5zNzPLIIe7mVkGOdzNzDLI4W5mlkEOdzOzDPr/wUIK/pP1YT4AAAAASUVORK5CYII=\n", + "text/plain": [ + "
" + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" } ], "source": [ - "look_at_per_field_accuracy(threshold = 0.999, num_samples = 100)" + "plt.hist([scores_matches, scores_non_matches], bins=50, label=['matches', 'non-matches'])\n", + "plt.legend(loc='upper right')\n", + "plt.xlabel('similarity score')\n", + "plt.show()" ] }, { @@ -1036,40 +1012,17 @@ "pycharm": {} }, "source": [ - "But if we look at a threshold which is closer to the boundary between real matches we should see a lot more errors:" + "We can see that the similarity scores for the matches and the ones for the non-matches form two different distributions. With a suitable linkage schema, these two distributions hardly overlap. \n", + "\n", + "When choosing a similarity threshold for solving, the valley between these two distributions is a good starting point. In this example, it is around 0.82. We can see that almost all similarity scores above 0.82 are from matches, thus the solver will produce a linkage result with high precision. However, recall will not be optimal, as there are still some scores from matches below 0.82. By moving the threshold to either side, you can favour either precision or recall." ] }, { "cell_type": "code", - "execution_count": 22, - "metadata": { - "pycharm": { - "is_executing": false - } - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Proportion of exact matches for each field using threshold: 0.95\n", - "given_name 0.58\n", - "surname 0.59\n", - "street_number 0.73\n", - "address_1 0.67\n", - "address_2 0.53\n", - "suburb 0.71\n", - "postcode 0.89\n", - "state 0.95\n", - "date_of_birth 0.75\n", - "soc_sec_id 0.92\n", - "dtype: float64\n" - ] - } - ], - "source": [ - "look_at_per_field_accuracy(threshold = 0.95, num_samples = 100)" - ] + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] } ], "metadata": { @@ -1088,7 +1041,16 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.7.3" + "version": "3.7.4" + }, + "pycharm": { + "stem_cell": { + "cell_type": "raw", + "metadata": { + "collapsed": false + }, + "source": [] + } } }, "nbformat": 4, From 02307dc7f4aa20f3cca971c11a6ca3d32b779ee3 Mon Sep 17 00:00:00 2001 From: wilko Date: Thu, 21 Nov 2019 11:33:14 +1100 Subject: [PATCH 03/12] added pandas --- docs/tutorial/tutorial-requirements.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/docs/tutorial/tutorial-requirements.txt b/docs/tutorial/tutorial-requirements.txt index 5d2f8bf8..85cde1d6 100644 --- a/docs/tutorial/tutorial-requirements.txt +++ b/docs/tutorial/tutorial-requirements.txt @@ -3,3 +3,4 @@ ipython matplotlib recordlinkage requests +pandas From 44398ca5ccd9e5bb2c8febea51f7f176fe068494 Mon Sep 17 00:00:00 2001 From: wilko Date: Thu, 21 Nov 2019 13:28:16 +1100 Subject: [PATCH 04/12] clarified output format --- .../multiparty-linkage-with-clkhash.ipynb | 801 ++++++++++++++++-- 1 file changed, 722 insertions(+), 79 deletions(-) diff --git a/docs/tutorial/multiparty-linkage-with-clkhash.ipynb b/docs/tutorial/multiparty-linkage-with-clkhash.ipynb index 20ddd0ed..1ab88726 100644 --- a/docs/tutorial/multiparty-linkage-with-clkhash.ipynb +++ b/docs/tutorial/multiparty-linkage-with-clkhash.ipynb @@ -70,10 +70,10 @@ "outputs": [ { "name": "stdout", + "output_type": "stream", "text": [ "keys: my_secret\n" - ], - "output_type": "stream" + ] } ], "source": [ @@ -91,10 +91,100 @@ "outputs": [ { "name": "stdout", + "output_type": "stream", "text": [ - "{\n \"version\": 3,\n \"clkConfig\": {\n \"l\": 1024,\n \"kdf\": {\n \"type\": \"HKDF\",\n \"hash\": \"SHA256\",\n \"salt\": \"SCbL2zHNnmsckfzchsNkZY9XoHk96P/G5nUBrM7ybymlEFsMV6PAeDZCNp3rfNUPCtLDMOGQHG4pCQpfhiHCyA==\",\n \"info\": \"c2NoZW1hX2V4YW1wbGU=\",\n \"keySize\": 64\n }\n },\n \"features\": [\n {\n \"identifier\": \"id\",\n \"ignored\": true\n },\n {\n \"identifier\": \"givenname\",\n \"format\": {\n \"type\": \"string\",\n \"encoding\": \"utf-8\"\n },\n \"hashing\": {\n \"strategy\": {\n \"bitsPerToken\": 15\n },\n \"comparison\": {\n \"type\": \"ngram\",\n \"n\": 2,\n \"positional\": false\n }\n }\n },\n {\n \"identifier\": \"surname\",\n \"format\": {\n \"type\": \"string\",\n \"encoding\": \"utf-8\"\n },\n \"hashing\": {\n \"strategy\": {\n \"bitsPerToken\": 15\n },\n \"comparison\": {\n \"type\": \"ngram\",\n \"n\": 2,\n \"positional\": false\n }\n }\n },\n {\n \"identifier\": \"dob\",\n \"format\": {\n \"type\": \"string\",\n \"encoding\": \"utf-8\"\n },\n \"hashing\": {\n \"strategy\": {\n \"bitsPerToken\": 15\n },\n \"comparison\": {\n \"type\": \"ngram\",\n \"n\": 2,\n \"positional\": true\n }\n }\n },\n {\n \"identifier\": \"phone number\",\n \"format\": {\n \"type\": \"string\",\n \"encoding\": \"utf-8\"\n },\n \"hashing\": {\n \"strategy\": {\n \"bitsPerToken\": 8\n },\n \"comparison\": {\n \"type\": \"ngram\",\n \"n\": 1,\n \"positional\": true\n }\n }\n },\n {\n \"identifier\": \"ignoredForLinkage\",\n \"ignored\": true\n }\n ]\n}\n" - ], - "output_type": "stream" + "{\n", + " \"version\": 3,\n", + " \"clkConfig\": {\n", + " \"l\": 1024,\n", + " \"kdf\": {\n", + " \"type\": \"HKDF\",\n", + " \"hash\": \"SHA256\",\n", + " \"salt\": \"SCbL2zHNnmsckfzchsNkZY9XoHk96P/G5nUBrM7ybymlEFsMV6PAeDZCNp3rfNUPCtLDMOGQHG4pCQpfhiHCyA==\",\n", + " \"info\": \"c2NoZW1hX2V4YW1wbGU=\",\n", + " \"keySize\": 64\n", + " }\n", + " },\n", + " \"features\": [\n", + " {\n", + " \"identifier\": \"id\",\n", + " \"ignored\": true\n", + " },\n", + " {\n", + " \"identifier\": \"givenname\",\n", + " \"format\": {\n", + " \"type\": \"string\",\n", + " \"encoding\": \"utf-8\"\n", + " },\n", + " \"hashing\": {\n", + " \"strategy\": {\n", + " \"bitsPerToken\": 15\n", + " },\n", + " \"comparison\": {\n", + " \"type\": \"ngram\",\n", + " \"n\": 2,\n", + " \"positional\": false\n", + " }\n", + " }\n", + " },\n", + " {\n", + " \"identifier\": \"surname\",\n", + " \"format\": {\n", + " \"type\": \"string\",\n", + " \"encoding\": \"utf-8\"\n", + " },\n", + " \"hashing\": {\n", + " \"strategy\": {\n", + " \"bitsPerToken\": 15\n", + " },\n", + " \"comparison\": {\n", + " \"type\": \"ngram\",\n", + " \"n\": 2,\n", + " \"positional\": false\n", + " }\n", + " }\n", + " },\n", + " {\n", + " \"identifier\": \"dob\",\n", + " \"format\": {\n", + " \"type\": \"string\",\n", + " \"encoding\": \"utf-8\"\n", + " },\n", + " \"hashing\": {\n", + " \"strategy\": {\n", + " \"bitsPerToken\": 15\n", + " },\n", + " \"comparison\": {\n", + " \"type\": \"ngram\",\n", + " \"n\": 2,\n", + " \"positional\": true\n", + " }\n", + " }\n", + " },\n", + " {\n", + " \"identifier\": \"phone number\",\n", + " \"format\": {\n", + " \"type\": \"string\",\n", + " \"encoding\": \"utf-8\"\n", + " },\n", + " \"hashing\": {\n", + " \"strategy\": {\n", + " \"bitsPerToken\": 8\n", + " },\n", + " \"comparison\": {\n", + " \"type\": \"ngram\",\n", + " \"n\": 1,\n", + " \"positional\": true\n", + " }\n", + " }\n", + " },\n", + " {\n", + " \"identifier\": \"ignoredForLinkage\",\n", + " \"ignored\": true\n", + " }\n", + " ]\n", + "}\n" + ] } ], "source": [ @@ -124,12 +214,95 @@ "outputs": [ { "data": { - "text/plain": " id givenname surname dob phone number gender\n0 0 tara hilton 27-08-1941 08 2210 0298 male\n1 3 saJi vernre 22-12-2972 02 1090 1906 mals\n2 7 sliver paciorek NaN NaN mals\n3 9 ruby george 09-05-1939 07 4698 6255 male\n4 10 eyrinm campbell 29-1q-1983 08 299y 1535 male", - "text/html": "
\n\n\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n
idgivennamesurnamedobphone numbergender
00tarahilton27-08-194108 2210 0298male
13saJivernre22-12-297202 1090 1906mals
27sliverpaciorekNaNNaNmals
39rubygeorge09-05-193907 4698 6255male
410eyrinmcampbell29-1q-198308 299y 1535male
\n
" + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
idgivennamesurnamedobphone numbergender
00tarahilton27-08-194108 2210 0298male
13saJivernre22-12-297202 1090 1906mals
27sliverpaciorekNaNNaNmals
39rubygeorge09-05-193907 4698 6255male
410eyrinmcampbell29-1q-198308 299y 1535male
\n", + "
" + ], + "text/plain": [ + " id givenname surname dob phone number gender\n", + "0 0 tara hilton 27-08-1941 08 2210 0298 male\n", + "1 3 saJi vernre 22-12-2972 02 1090 1906 mals\n", + "2 7 sliver paciorek NaN NaN mals\n", + "3 9 ruby george 09-05-1939 07 4698 6255 male\n", + "4 10 eyrinm campbell 29-1q-1983 08 299y 1535 male" + ] }, + "execution_count": 5, "metadata": {}, - "output_type": "execute_result", - "execution_count": 5 + "output_type": "execute_result" } ], "source": [ @@ -156,12 +329,95 @@ "outputs": [ { "data": { - "text/plain": " id givenname surname dob phone number city\n0 3 zali verner 22-12-1972 02 1090 1906 perth\n1 4 samuel tremellen 21-12-1923 03 3605 9336 melbourne\n2 5 amy lodge 16-01-1958 07 8286 9372 canberra\n3 7 oIji pacioerk 10-02-1959 04 4220 5949 sydney\n4 10 erin kampgell 29-12-1983 08 2996 1445 perth", - "text/html": "
\n\n\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n
idgivennamesurnamedobphone numbercity
03zaliverner22-12-197202 1090 1906perth
14samueltremellen21-12-192303 3605 9336melbourne
25amylodge16-01-195807 8286 9372canberra
37oIjipacioerk10-02-195904 4220 5949sydney
410erinkampgell29-12-198308 2996 1445perth
\n
" + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
idgivennamesurnamedobphone numbercity
03zaliverner22-12-197202 1090 1906perth
14samueltremellen21-12-192303 3605 9336melbourne
25amylodge16-01-195807 8286 9372canberra
37oIjipacioerk10-02-195904 4220 5949sydney
410erinkampgell29-12-198308 2996 1445perth
\n", + "
" + ], + "text/plain": [ + " id givenname surname dob phone number city\n", + "0 3 zali verner 22-12-1972 02 1090 1906 perth\n", + "1 4 samuel tremellen 21-12-1923 03 3605 9336 melbourne\n", + "2 5 amy lodge 16-01-1958 07 8286 9372 canberra\n", + "3 7 oIji pacioerk 10-02-1959 04 4220 5949 sydney\n", + "4 10 erin kampgell 29-12-1983 08 2996 1445 perth" + ] }, + "execution_count": 6, "metadata": {}, - "output_type": "execute_result", - "execution_count": 6 + "output_type": "execute_result" } ], "source": [ @@ -188,12 +444,95 @@ "outputs": [ { "data": { - "text/plain": " id givenname surname dob phone number income\n0 1 joshua arkwright 16-02-1903 04 8511 9580 70189.446\n1 3 zal: verner 22-12-1972 02 1090 1906 50194.118\n2 7 oliyer paciorwk 10-02-1959 04 4210 5949 31750.993\n3 8 nacoya ranson 17-08-1925 07 6033 4580 102446.131\n4 10 erih campbell 29-12-1i83 08 299t 1435 331476.599", - "text/html": "
\n\n\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n
idgivennamesurnamedobphone numberincome
01joshuaarkwright16-02-190304 8511 958070189.446
13zal:verner22-12-197202 1090 190650194.118
27oliyerpaciorwk10-02-195904 4210 594931750.993
38nacoyaranson17-08-192507 6033 4580102446.131
410erihcampbell29-12-1i8308 299t 1435331476.599
\n
" + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
idgivennamesurnamedobphone numberincome
01joshuaarkwright16-02-190304 8511 958070189.446
13zal:verner22-12-197202 1090 190650194.118
27oliyerpaciorwk10-02-195904 4210 594931750.993
38nacoyaranson17-08-192507 6033 4580102446.131
410erihcampbell29-12-1i8308 299t 1435331476.599
\n", + "
" + ], + "text/plain": [ + " id givenname surname dob phone number income\n", + "0 1 joshua arkwright 16-02-1903 04 8511 9580 70189.446\n", + "1 3 zal: verner 22-12-1972 02 1090 1906 50194.118\n", + "2 7 oliyer paciorwk 10-02-1959 04 4210 5949 31750.993\n", + "3 8 nacoya ranson 17-08-1925 07 6033 4580 102446.131\n", + "4 10 erih campbell 29-12-1i83 08 299t 1435 331476.599" + ] }, + "execution_count": 7, "metadata": {}, - "output_type": "execute_result", - "execution_count": 7 + "output_type": "execute_result" } ], "source": [ @@ -222,10 +561,10 @@ "outputs": [ { "name": "stdout", + "output_type": "stream", "text": [ "\u001b[31mProject created\u001b[0m\r\n" - ], - "output_type": "stream" + ] } ], "source": [ @@ -261,10 +600,10 @@ "outputs": [ { "name": "stdout", + "output_type": "stream", "text": [ "\u001b[31mCLK data written to dataset-alice-hashed.json\u001b[0m\r\n" - ], - "output_type": "stream" + ] } ], "source": [ @@ -282,10 +621,10 @@ "outputs": [ { "name": "stdout", + "output_type": "stream", "text": [ - "{\"message\": \"Updated\", \"receipt_token\": \"372a1a7f5cdc639ec3dfb98475573bb796212149e50a5116\"}" - ], - "output_type": "stream" + "{\"message\": \"Updated\", \"receipt_token\": \"0351e7dcca593e9704f75bf6891a95804b22c0af51474f92\"}" + ] } ], "source": [ @@ -312,10 +651,10 @@ "outputs": [ { "name": "stdout", + "output_type": "stream", "text": [ "\u001b[31mCLK data written to dataset-bob-hashed.json\u001b[0m\r\n" - ], - "output_type": "stream" + ] } ], "source": [ @@ -333,10 +672,10 @@ "outputs": [ { "name": "stdout", + "output_type": "stream", "text": [ - "{\"message\": \"Updated\", \"receipt_token\": \"85126409e52f61cdaa5f761a28644707bd17fbcf17bb1e4d\"}" - ], - "output_type": "stream" + "{\"message\": \"Updated\", \"receipt_token\": \"ad857ec2489061afc324c866adbf74296f5d55768868f91c\"}" + ] } ], "source": [ @@ -363,10 +702,10 @@ "outputs": [ { "name": "stdout", + "output_type": "stream", "text": [ "\u001b[31mCLK data written to dataset-charlie-hashed.json\u001b[0m\r\n" - ], - "output_type": "stream" + ] } ], "source": [ @@ -384,10 +723,10 @@ "outputs": [ { "name": "stdout", + "output_type": "stream", "text": [ - "{\"message\": \"Updated\", \"receipt_token\": \"bc348c187f2f3fe0e179bd1ffcfa96ced642dabced79723a\"}" - ], - "output_type": "stream" + "{\"message\": \"Updated\", \"receipt_token\": \"ad6cfde05179416f4fbf5c7ef896bbd2fcb37554fe11c87a\"}" + ] } ], "source": [ @@ -428,7 +767,7 @@ "pycharm": {} }, "source": [ - "## Analyst: retreve the results" + "## Analyst: retrieve the results" ] }, { @@ -442,12 +781,17 @@ "outputs": [ { "name": "stdout", + "output_type": "stream", "text": [ - "\u001b[31mState: completed\r\nStage (3/3): compute output\u001b[0m\r\n", - "\u001b[31mState: completed\r\nStage (3/3): compute output\u001b[0m\r\n\u001b[31mState: completed\r\nStage (3/3): compute output\u001b[0m\r\n\u001b[31mDownloading result\u001b[0m\r\n", - "\u001b[31mReceived result\u001b[0m\r\n" - ], - "output_type": "stream" + "\u001b[31mState: completed\n", + "Stage (3/3): compute output\u001b[0m\n", + "\u001b[31mState: completed\n", + "Stage (3/3): compute output\u001b[0m\n", + "\u001b[31mState: completed\n", + "Stage (3/3): compute output\u001b[0m\n", + "\u001b[31mDownloading result\u001b[0m\n", + "\u001b[31mReceived result\u001b[0m\n" + ] } ], "source": [ @@ -462,11 +806,50 @@ "is_executing": false } }, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "[[[0, 2188], [1, 2186], [2, 2194]],\n", + " [[1, 9], [2, 9], [0, 10]],\n", + " [[1, 332], [2, 329], [0, 330]],\n", + " [[0, 287], [2, 293], [1, 295]],\n", + " [[1, 399], [2, 401], [0, 414]],\n", + " [[0, 1074], [1, 1032], [2, 1088]],\n", + " [[0, 2482], [1, 2494], [2, 2509]],\n", + " [[0, 1723], [1, 1678], [2, 1714]],\n", + " [[0, 2234], [1, 2228], [2, 2242]],\n", + " [[0, 918], [2, 2840]],\n", + " [[0, 2461], [2, 2479], [1, 2468]],\n", + " [[1, 1393], [2, 1421], [0, 1451]],\n", + " [[0, 2343], [1, 2338], [2, 2351]],\n", + " [[0, 1077], [2, 1091], [1, 1036]],\n", + " [[1, 351], [2, 356]]]" + ] + }, + "execution_count": 17, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "with open('linkage-output.json') as f:\n", " linkage_output = json.load(f)\n", - " linkage_groups = linkage_output['groups']" + " linkage_groups = linkage_output['groups']\n", + "linkage_groups[-15:]" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The result is a list of groups of records. Every record in such a group belongs to the same entity and consists of two values, the party index and the row index:\n", + "```\n", + "[\n", + " [[party_id, row_index], ... ],\n", + " ...\n", + "]\n", + "```" ] }, { @@ -517,12 +900,112 @@ "outputs": [ { "data": { - "text/plain": " gender city income\n0 male sydney \n1 male canbrrra \n2 femake sydn4v \n3 pertb 21407e.192\n4 femake sydriey \n5 mlebourne 56899.522\n6 male canberra \n7 female 44652.704\n8 male sydnely \n9 male 65381.450", - "text/html": "
\n\n\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n
gendercityincome
0malesydney
1malecanbrrra
2femakesydn4v
3pertb21407e.192
4femakesydriey
5mlebourne56899.522
6malecanberra
7female44652.704
8malesydnely
9male65381.450
\n
" + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
gendercityincome
0msl5sydnev
1malemelbourne
2malecanbrrra
3mlebourne56899.522
4femaoesydn3y
5male154195.553
6female44652.704
7malesydnely
8maleacbbeera81191.584
9maoemesllootrne
\n", + "
" + ], + "text/plain": [ + " gender city income\n", + "0 msl5 sydnev \n", + "1 male melbourne \n", + "2 male canbrrra \n", + "3 mlebourne 56899.522\n", + "4 femaoe sydn3y \n", + "5 male 154195.553\n", + "6 female 44652.704\n", + "7 male sydnely \n", + "8 male acbbeera 81191.584\n", + "9 maoe mesllootrne " + ] }, + "execution_count": 19, "metadata": {}, - "output_type": "execute_result", - "execution_count": 19 + "output_type": "execute_result" } ], "source": [ @@ -543,29 +1026,6 @@ "The last 20 groups look like this." ] }, - { - "cell_type": "code", - "execution_count": 20, - "metadata": { - "pycharm": { - "is_executing": false - }, - "scrolled": true - }, - "outputs": [ - { - "data": { - "text/plain": "[[[1, 2065], [0, 2428]],\n [[0, 1740], [1, 1693], [2, 1736]],\n [[1, 2224], [2, 2236]],\n [[0, 565], [1, 557], [2, 564]],\n [[0, 1980], [1, 1953]],\n [[0, 536], [2, 525], [1, 512]],\n [[1, 171], [2, 175], [0, 169]],\n [[0, 2234], [1, 2228], [2, 2242]],\n [[0, 918], [2, 2840]],\n [[0, 2461], [2, 2479], [1, 2468]],\n [[0, 2451], [2, 2471], [1, 2458]],\n [[0, 230], [1, 232]],\n [[0, 2765], [2, 2794], [1, 2789]],\n [[0, 1758], [2, 1754], [1, 1712]],\n [[1, 351], [2, 356]]]" - }, - "metadata": {}, - "output_type": "execute_result", - "execution_count": 20 - } - ], - "source": [ - "linkage_groups[-15:]" - ] - }, { "cell_type": "markdown", "metadata": { @@ -579,7 +1039,7 @@ }, { "cell_type": "code", - "execution_count": 21, + "execution_count": 20, "metadata": { "pycharm": { "is_executing": false @@ -605,7 +1065,7 @@ }, { "cell_type": "code", - "execution_count": 22, + "execution_count": 21, "metadata": { "pycharm": { "is_executing": false, @@ -615,12 +1075,195 @@ "outputs": [ { "data": { - "text/plain": " id given name surname dob phone number non-linking\n6450 \n6451 1522 poahtia torpe 22-09-1999 07 6482 4546 femalr\n6452 1522 portia thorpe 22-09-1999 07 6482 4546 canberra\n6453 \n6454 8662 luct pulfort 05-03-1903 02 0726 9479 male\n6455 8662 lucy pulford 05-03-1903 melbourrie\n6456 8662 lusy pulford 05-03-1993 02 0726 0489 192230.309\n6457 \n6458 5797 chelsie pajc0ek 27-03-1961 07 3258 9992 male\n6459 5797 chel5i padci4 27-04-1961 07 3258 0991 sydney\n6460 5797 chelsie pasl\n\n\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n
idgiven namesurnamedobphone numbernon-linking
6450
64511522poahtiatorpe22-09-199907 6482 4546femalr
64521522portiathorpe22-09-199907 6482 4546canberra
6453
64548662luctpulfort05-03-190302 0726 9479male
64558662lucypulford05-03-1903melbourrie
64568662lusypulford05-03-199302 0726 0489192230.309
6457
64585797chelsiepajc0ek27-03-196107 3258 9992male
64595797chel5ipadci427-04-196107 3258 0991sydney
64605797chelsiepasl<oe27-94-196107 3258 089262334.690
6461
64621885nicholasrobson06-01-191402 7799 6803canberra
64631885nicho|asrobson06-91-191402 7799 680361333.218
6464
\n" + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
idgiven namesurnamedobphone numbernon-linking
64504914runybriten01-01-197103 7339 6523malw
64514914rubybritten01-01-197103 7338 6523melbourne
64524914rubybr'ltten01-01-197103 7337 652359544.220
6453
64547461henrvclisxjold13-07-191803 9072 8476male
64557461henryclissold13-07-191803 9071 8376canberra
64567461henryclissmold13-08-201803 9071 837667649.443
6457
64583944jean-claudemckinnell14-06-192508 8157 2282male
64593944jean-claudemckinenll08 8157 2282
64603944jean-claudemkniell14-06-192508 8157 2282
6461
64621885nicholasrobson06-01-191402 7799 6803canberra
64631885nicho|asrobson06-91-191402 7799 680361333.218
6464
\n", + "
" + ], + "text/plain": [ + " id given name surname dob phone number non-linking\n", + "6450 4914 runy briten 01-01-1971 03 7339 6523 malw\n", + "6451 4914 ruby britten 01-01-1971 03 7338 6523 melbourne\n", + "6452 4914 ruby br'ltten 01-01-1971 03 7337 6523 59544.220\n", + "6453 \n", + "6454 7461 henrv clisxjold 13-07-1918 03 9072 8476 male\n", + "6455 7461 henry clissold 13-07-1918 03 9071 8376 canberra\n", + "6456 7461 henry clissmold 13-08-2018 03 9071 8376 67649.443\n", + "6457 \n", + "6458 3944 jean-claude mckinnell 14-06-1925 08 8157 2282 male\n", + "6459 3944 jean-claude mckinenll 08 8157 2282 \n", + "6460 3944 jean-claude mkniell 14-06-1925 08 8157 2282 \n", + "6461 \n", + "6462 1885 nicholas robson 06-01-1914 02 7799 6803 canberra\n", + "6463 1885 nicho|as robson 06-91-1914 02 7799 6803 61333.218\n", + "6464 " + ] }, + "execution_count": 21, "metadata": {}, - "output_type": "execute_result", - "execution_count": 22 + "output_type": "execute_result" } ], "source": [ @@ -651,18 +1294,18 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.7.3" + "version": "3.7.4" }, "pycharm": { "stem_cell": { "cell_type": "raw", - "source": [], "metadata": { "collapsed": false - } + }, + "source": [] } } }, "nbformat": 4, "nbformat_minor": 4 -} \ No newline at end of file +} From 3b22f63507e4e3971e77bdb113004a04db716766 Mon Sep 17 00:00:00 2001 From: wilko Date: Thu, 21 Nov 2019 13:41:06 +1100 Subject: [PATCH 05/12] clarified where to find tutorials --- docs/tutorial/index.rst | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/docs/tutorial/index.rst b/docs/tutorial/index.rst index 2998dbe3..d5dd159b 100644 --- a/docs/tutorial/index.rst +++ b/docs/tutorial/index.rst @@ -15,12 +15,14 @@ Tutorials Usage ----- +You can download the tutorials from `github `_. +The dependencies are listed in ``tutorial-requirements.txt``. The code is often evolving and may include some breaking changes not yet deployed in our testing deployment (at the -URL https://testing.es.data61.xyz ). So to run the tutorials, you can either: +URL ``_). So to run the tutorials, you can either: - - use the tutorials from the `master` branch of this repository which will work with the currently deployed testing service, + - use the tutorials from the ``master`` branch of this repository which will work with the currently deployed testing service, - or build and deploy the service from the same branch as the tutorials you would like to run, providing its URL to - the tutorials via the environment variable `SERVER` (e.g. `SERVER=http://0.0.0.0:8851` if deployed locally). + the tutorials via the environment variable ``SERVER`` (e.g. ``SERVER=http://0.0.0.0:8851`` if deployed locally). Other use-cases are not supported and may fail for non-obvious reasons. @@ -28,5 +30,5 @@ External Tutorials ------------------ The ``clkhash`` library includes a tutorial of carrying out record linkage on perturbed data. - +``_ From 2af440e4a772c071ad4519210ad2078fc095a6fa Mon Sep 17 00:00:00 2001 From: wilko Date: Thu, 21 Nov 2019 14:06:18 +1100 Subject: [PATCH 06/12] breaking up too long commands, delete project at the end --- docs/tutorial/Permutations.ipynb | 109 ++++--- docs/tutorial/Record Linkage API.ipynb | 2 +- docs/tutorial/Similarity Scores.ipynb | 45 ++- ...multiparty-linkage-in-entity-service.ipynb | 2 +- .../multiparty-linkage-with-clkhash.ipynb | 307 +++++++++++------- 5 files changed, 286 insertions(+), 179 deletions(-) diff --git a/docs/tutorial/Permutations.ipynb b/docs/tutorial/Permutations.ipynb index e17285b9..15925f08 100644 --- a/docs/tutorial/Permutations.ipynb +++ b/docs/tutorial/Permutations.ipynb @@ -81,7 +81,7 @@ "name": "stdout", "output_type": "stream", "text": [ - "{\"project_count\": 6539, \"rate\": 2530484, \"status\": \"ok\"}\r\n" + "{\"project_count\": 7050, \"rate\": 2824020, \"status\": \"ok\"}\r\n" ] } ], @@ -290,7 +290,7 @@ "name": "stdout", "output_type": "stream", "text": [ - "Overwriting /var/folders/mw/21b9jb5d1c9_3_z0dq7hpx1m00j_0b/T/tmp5qsl5x48\n" + "Overwriting /var/folders/mw/21b9jb5d1c9_3_z0dq7hpx1m00j_0b/T/tmp3jpcxxrs\n" ] } ], @@ -394,17 +394,17 @@ "name": "stdout", "output_type": "stream", "text": [ - "Credentials will be saved in /var/folders/mw/21b9jb5d1c9_3_z0dq7hpx1m00j_0b/T/tmpdo3x0629\n", + "Credentials will be saved in /var/folders/mw/21b9jb5d1c9_3_z0dq7hpx1m00j_0b/T/tmp_tz_feve\n", "\u001b[31mProject created\u001b[0m\n" ] }, { "data": { "text/plain": [ - "{'project_id': 'fbb0845d2063e5cefe9153ebeacf42921418038a11c104ef',\n", - " 'result_token': '635bd95ab7c4d834bdf811aed0c026a81cd4944ba66ffc15',\n", - " 'update_tokens': ['ab2f33eef06d045db454d4fbc7821ea5971970beafede1be',\n", - " '13dc3ac340a2b51c78400a301fbaebc819022e5d231bb4a7']}" + "{'project_id': '7c942add9259b0c61fc06ce24afc6ee9c99355cc5a5eae7a',\n", + " 'result_token': '4552074bebabf66a19e707ef64aa35638fc1eb2cd3b9a768',\n", + " 'update_tokens': ['1045c9dda873d3cccf37181bcff7c61a5e82c6051d0da2c0',\n", + " 'fc27160c4e4736c1dbbecbedd6bc5e4117a3626c1f2eda9c']}" ] }, "execution_count": 7, @@ -416,7 +416,12 @@ "creds = NamedTemporaryFile('wt')\n", "print(\"Credentials will be saved in\", creds.name)\n", "\n", - "!clkutil create-project --schema \"{schema.name}\" --output \"{creds.name}\" --type \"permutations\" --server \"{url}\"\n", + "!clkutil create-project \\\n", + " --schema \"{schema.name}\" \\\n", + " --output \"{creds.name}\" \\\n", + " --type \"permutations\" \\\n", + " --server \"{url}\"\n", + "\n", "creds.seek(0)\n", "\n", "import json\n", @@ -458,8 +463,8 @@ "name": "stdout", "output_type": "stream", "text": [ - "\u001b[31mCLK data written to /var/folders/mw/21b9jb5d1c9_3_z0dq7hpx1m00j_0b/T/tmpbd2u9qhd.json\u001b[0m\n", - "\u001b[31mCLK data written to /var/folders/mw/21b9jb5d1c9_3_z0dq7hpx1m00j_0b/T/tmp5al8agy7.json\u001b[0m\n" + "\u001b[31mCLK data written to /var/folders/mw/21b9jb5d1c9_3_z0dq7hpx1m00j_0b/T/tmppybfm62c.json\u001b[0m\n", + "\u001b[31mCLK data written to /var/folders/mw/21b9jb5d1c9_3_z0dq7hpx1m00j_0b/T/tmpu4jx4mjv.json\u001b[0m\n" ] } ], @@ -818,7 +823,7 @@ { "data": { "text/plain": [ - "[1886, 1657, 2212, 4844, 3960, 125, 1791, 2770, 4888, 1367]" + "[3645, 1068, 4371, 465, 1533, 987, 343, 53, 3298, 2515]" ] }, "execution_count": 20, @@ -852,7 +857,7 @@ { "data": { "text/plain": [ - "[4498, 3092, 520, 3834, 1074, 4519, 997, 991, 4505, 2389]" + "[3857, 4827, 3267, 4934, 1958, 3682, 4576, 4895, 4867, 1188]" ] }, "execution_count": 21, @@ -926,16 +931,16 @@ { "data": { "text/plain": [ - "['rec-1225-org,hayden,ballantyne,13,,nunnook,young,2077,nsw,19330812,3414771\\n',\n", - " 'rec-4635-org,isabella,white,8,cooling place,,rosebud,6151,sa,19990911,2206317\\n',\n", - " 'rec-1790-org,bailey,heuer,65,fossey street,brindabella specialist centre,vaucluse,2010,qld,19511013,9539538\\n',\n", - " 'rec-2882-org,sarah,eglinton,19,beasley street,bandaroo,naracoorte,4021,nsw,19451107,4310446\\n',\n", - " 'rec-3521-org,spencer,bates-brownsword,151,pinkerton circuit,tora,smithfield,4860,nsw,19810308,5402648\\n',\n", - " 'rec-2055-org,tai,garven,21,finniss crescent,donette downs,pymble,2035,nsw,19930723,6253715\\n',\n", - " 'rec-1529-org,zachariah,campbell,32,gellibrand street,carowood,keswick,3148,vic,19271210,2544494\\n',\n", - " 'rec-1817-org,noah,boyle,11,dooland court,,flowerdale,3163,vic,19260331,2019310\\n',\n", - " 'rec-4200-org,lara,sekuless,82,loch street,,yarraville,3196,qld,19861129,1392776\\n',\n", - " 'rec-1541-org,jessica,paine,58,eddison place,pine hut,new farm,2022,vic,19661210,8315488\\n']" + "['rec-3302-org,blaize,koopman,17,allison place,aldersyde estate,balwyn north,4650,nsw,19110608,7823755\\n',\n", + " 'rec-1385-org,joel,bishop,10,french street,cedarview,orange,3223,nt,,1324854\\n',\n", + " 'rec-190-org,,alias,24,elkington street,pangani,isle of capri,2145,sa,19650429,8261472\\n',\n", + " 'rec-4781-org,jacob,waller,89,dalley crescent,the willows,mosman,2480,qld,19580408,6317326\\n',\n", + " 'rec-4881-org,alexandra,nguyen,44,colebatch place,langley flats,freshwater,3242,nsw,19511004,6416159\\n',\n", + " 'rec-4770-org,tegan,rosendale,1,sherbrooke street,nazareth village,innaloo,2250,wa,19801011,9351309\\n',\n", + " 'rec-3385-org,shanaye,carbone,41,haystack crescent,st vincents hospital,matong,3690,nsw,19300519,1632237\\n',\n", + " 'rec-3738-org,imogen,carlington,45,mcinnes street,parish talowahl,girilambone,2154,nsw,19781117,7912921\\n',\n", + " 'rec-831-org,laura,flannery,54,sid barnes crescent,weemilah,winston hills,5073,qld,19581023,9712180\\n',\n", + " 'rec-815-org,holly,campbell,21,casey crescent,nestor,westmead,4573,qld,19911007,4424335\\n']" ] }, "execution_count": 24, @@ -959,16 +964,16 @@ { "data": { "text/plain": [ - "['rec-1225-dup-0,hayden,ballantyne,13,,,young,2077,nsw,19330812,3414771\\n',\n", - " 'rec-4635-dup-0,isaeblla,white,8,cooling place,massey green,rosebud,6151,sa,19990911,2206317\\n',\n", - " 'rec-1790-dup-0,shannon,heurr,65,fossey street,brindabella specialist centre,vaucluse,2010,qld,19511013,9539538\\n',\n", - " 'rec-2882-dup-0,sarah,eglinton,19,beasleyz street,,naraocorte,4012,nsw,19451107,4310446\\n',\n", - " 'rec-3521-dup-0,spencer,bates-brownsword,151,tora,pinkerton circuit,smithfield,4860,nsw,19810308,5402648\\n',\n", - " 'rec-2055-dup-0,taiz,garven,,finniss crescent,donetted owns,pymble,2035,nsw,19930723,6253715\\n',\n", - " 'rec-1529-dup-0,ebonie,campbell,32,gellibrand street,carowood,kessick,3148,vic,19271210,2544494\\n',\n", - " 'rec-1817-dup-0,noah,boyle,11,doolandcouhrt,,flowerdale,3163,vic,19260331,7756654\\n',\n", - " 'rec-4200-dup-0,lara,sekuless,9,loch sutreet,,yarraville,3196,qld,19861129,1392776\\n',\n", - " 'rec-1541-dup-0,jessica,paine,58,eddisonv place,pine hut,new farm,2022,vic,19661210,8315488\\n']" + "['rec-3302-dup-0,blaize,koopman,17,allison place,aldersydeestate,balwyn north,4650,nsw,19110608,7823755\\n',\n", + " 'rec-1385-dup-0,elton,bishop,10,french street,,orange,3223,nt,,1324854\\n',\n", + " 'rec-190-dup-0,,alias,24,elkington street,panganu,isle of capri,2145,sa,19650429,8261472\\n',\n", + " 'rec-4781-dup-0,jacob,waliler,89,dalley crescent,the ui llows,mosman,2487,qld,19580408,6317326\\n',\n", + " 'rec-4881-dup-0,nguyen,alexandra,44,colebatch place,langley flats,freshwater,3242,nsw,19511004,6416159\\n',\n", + " 'rec-4770-dup-0,tegan,rosendale,1,sherbrooke street,nazareth village,innaloo,2550,nsw,19801011,9351309\\n',\n", + " 'rec-3385-dup-0,shanaye,lonto,41,haystack crescent,,leetob,3680,nsw,19300519,1632237\\n',\n", + " 'rec-3738-dup-0,imogen,carlington,45,mcinnes treet,parish talowahl,girilabmone,2154,nsw,19781117,7912921\\n',\n", + " 'rec-831-dup-0,laura,flannery,54,sid barnes crescent,,winstonhills,5073,qld,19581023,9712180\\n',\n", + " 'rec-815-dup-0,holyl,campbell,21,casey crescent,,westmead,4573,qld,19911007,4424335\\n']" ] }, "execution_count": 25, @@ -1006,16 +1011,16 @@ "name": "stdout", "output_type": "stream", "text": [ - "Hayden Ballantyne (rec-1225-org) =? Hayden Ballantyne (rec-1225-dup-0)\n", - "Isabella White (rec-4635-org) =? Isaeblla White (rec-4635-dup-0)\n", - "Bailey Heuer (rec-1790-org) =? Shannon Heurr (rec-1790-dup-0)\n", - "Sarah Eglinton (rec-2882-org) =? Sarah Eglinton (rec-2882-dup-0)\n", - "Spencer Bates-Brownsword (rec-3521-org) =? Spencer Bates-Brownsword (rec-3521-dup-0)\n", - "Tai Garven (rec-2055-org) =? Taiz Garven (rec-2055-dup-0)\n", - "Zachariah Campbell (rec-1529-org) =? Ebonie Campbell (rec-1529-dup-0)\n", - "Noah Boyle (rec-1817-org) =? Noah Boyle (rec-1817-dup-0)\n", - "Lara Sekuless (rec-4200-org) =? Lara Sekuless (rec-4200-dup-0)\n", - "Jessica Paine (rec-1541-org) =? Jessica Paine (rec-1541-dup-0)\n" + "Blaize Koopman (rec-3302-org) =? Blaize Koopman (rec-3302-dup-0)\n", + "Joel Bishop (rec-1385-org) =? Elton Bishop (rec-1385-dup-0)\n", + " Alias (rec-190-org) =? Alias (rec-190-dup-0)\n", + "Jacob Waller (rec-4781-org) =? Jacob Waliler (rec-4781-dup-0)\n", + "Alexandra Nguyen (rec-4881-org) =? Nguyen Alexandra (rec-4881-dup-0)\n", + "Tegan Rosendale (rec-4770-org) =? Tegan Rosendale (rec-4770-dup-0)\n", + "Shanaye Carbone (rec-3385-org) =? Shanaye Lonto (rec-3385-dup-0)\n", + "Imogen Carlington (rec-3738-org) =? Imogen Carlington (rec-3738-dup-0)\n", + "Laura Flannery (rec-831-org) =? Laura Flannery (rec-831-dup-0)\n", + "Holly Campbell (rec-815-org) =? Holyl Campbell (rec-815-dup-0)\n" ] } ], @@ -1087,10 +1092,24 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 28, "metadata": {}, - "outputs": [], - "source": [] + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[31mProject deleted\u001b[0m\r\n" + ] + } + ], + "source": [ + "# Deleting the project\n", + "!clkutil delete-project \\\n", + " --project=\"{credentials['project_id']}\" \\\n", + " --apikey=\"{credentials['result_token']}\" \\\n", + " --server=\"{url}\"" + ] } ], "metadata": { diff --git a/docs/tutorial/Record Linkage API.ipynb b/docs/tutorial/Record Linkage API.ipynb index b5e074f1..71e87353 100644 --- a/docs/tutorial/Record Linkage API.ipynb +++ b/docs/tutorial/Record Linkage API.ipynb @@ -694,7 +694,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.7.3" + "version": "3.7.4" }, "pycharm": { "stem_cell": { diff --git a/docs/tutorial/Similarity Scores.ipynb b/docs/tutorial/Similarity Scores.ipynb index 25740861..f6124619 100644 --- a/docs/tutorial/Similarity Scores.ipynb +++ b/docs/tutorial/Similarity Scores.ipynb @@ -101,7 +101,7 @@ "name": "stdout", "output_type": "stream", "text": [ - "{\"project_count\": 6542, \"rate\": 2549867, \"status\": \"ok\"}\r\n" + "{\"project_count\": 7082, \"rate\": 2845548, \"status\": \"ok\"}\r\n" ] } ], @@ -313,7 +313,7 @@ "name": "stdout", "output_type": "stream", "text": [ - "Overwriting /var/folders/mw/21b9jb5d1c9_3_z0dq7hpx1m00j_0b/T/tmpf_28_avn\n" + "Overwriting /var/folders/mw/21b9jb5d1c9_3_z0dq7hpx1m00j_0b/T/tmpgayhu51z\n" ] } ], @@ -555,17 +555,17 @@ "name": "stdout", "output_type": "stream", "text": [ - "Credentials will be saved in /var/folders/mw/21b9jb5d1c9_3_z0dq7hpx1m00j_0b/T/tmpd6lzqk30\n", + "Credentials will be saved in /var/folders/mw/21b9jb5d1c9_3_z0dq7hpx1m00j_0b/T/tmp6fab0949\n", "\u001b[31mProject created\u001b[0m\n" ] }, { "data": { "text/plain": [ - "{'project_id': 'a28958f3c6df6afa3cdbe0337a2621f1a76ce4c6929fb772',\n", - " 'result_token': 'dde8c58598ea98de862ae5c4e48ec3acfe342162a4133afd',\n", - " 'update_tokens': ['1535fe32ca6becf8fe91b0de32d2e47d9e3edddb72017205',\n", - " 'c50cb50080d0345fb8407ad9a974323567a054884ab2f4d1']}" + "{'project_id': '224ac23b0c6ba6c661ade4082d0741fc94b9af3ebf09f9fd',\n", + " 'result_token': '331d33ba45f4b636aac944ba1ba52f2602a3f18bcff9ff25',\n", + " 'update_tokens': ['8826451e71fb0ea6f1b07d7e54264ab7477f4d97898f8ce5',\n", + " '122076e7ef37ccbedda55d028210d4e5a14d9441329ea492']}" ] }, "execution_count": 8, @@ -577,7 +577,12 @@ "creds = NamedTemporaryFile('wt')\n", "print(\"Credentials will be saved in\", creds.name)\n", "\n", - "!clkutil create-project --schema \"{schema.name}\" --output \"{creds.name}\" --type \"similarity_scores\" --server \"{url}\"\n", + "!clkutil create-project \\\n", + " --schema \"{schema.name}\" \\\n", + " --output \"{creds.name}\" \\\n", + " --type \"similarity_scores\" \\\n", + " --server \"{url}\"\n", + "\n", "creds.seek(0)\n", "\n", "with open(creds.name, 'r') as f:\n", @@ -613,8 +618,8 @@ "name": "stdout", "output_type": "stream", "text": [ - "\u001b[31mCLK data written to /var/folders/mw/21b9jb5d1c9_3_z0dq7hpx1m00j_0b/T/tmp3hr0gbdc.json\u001b[0m\n", - "\u001b[31mCLK data written to /var/folders/mw/21b9jb5d1c9_3_z0dq7hpx1m00j_0b/T/tmp8ba6c8bt.json\u001b[0m\n" + "\u001b[31mCLK data written to /var/folders/mw/21b9jb5d1c9_3_z0dq7hpx1m00j_0b/T/tmp2s_mj16v.json\u001b[0m\n", + "\u001b[31mCLK data written to /var/folders/mw/21b9jb5d1c9_3_z0dq7hpx1m00j_0b/T/tmpvlxznb1y.json\u001b[0m\n" ] } ], @@ -1017,6 +1022,26 @@ "When choosing a similarity threshold for solving, the valley between these two distributions is a good starting point. In this example, it is around 0.82. We can see that almost all similarity scores above 0.82 are from matches, thus the solver will produce a linkage result with high precision. However, recall will not be optimal, as there are still some scores from matches below 0.82. By moving the threshold to either side, you can favour either precision or recall." ] }, + { + "cell_type": "code", + "execution_count": 22, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[31mProject deleted\u001b[0m\r\n" + ] + } + ], + "source": [ + "# Deleting the project\n", + "!clkutil delete-project --project=\"{credentials['project_id']}\" \\\n", + " --apikey=\"{credentials['result_token']}\" \\\n", + " --server=\"{url}\"" + ] + }, { "cell_type": "code", "execution_count": null, diff --git a/docs/tutorial/multiparty-linkage-in-entity-service.ipynb b/docs/tutorial/multiparty-linkage-in-entity-service.ipynb index a5a5e5f6..d74a4c19 100644 --- a/docs/tutorial/multiparty-linkage-in-entity-service.ipynb +++ b/docs/tutorial/multiparty-linkage-in-entity-service.ipynb @@ -521,7 +521,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.7.3" + "version": "3.7.4" }, "pycharm": { "stem_cell": { diff --git a/docs/tutorial/multiparty-linkage-with-clkhash.ipynb b/docs/tutorial/multiparty-linkage-with-clkhash.ipynb index 1ab88726..d5ab1789 100644 --- a/docs/tutorial/multiparty-linkage-with-clkhash.ipynb +++ b/docs/tutorial/multiparty-linkage-with-clkhash.ipynb @@ -568,7 +568,12 @@ } ], "source": [ - "!clkutil create-project --server $SERVER --type groups --schema data/schema_ABC.json --parties 3 --output credentials.json\n", + "!clkutil create-project \\\n", + " --server $SERVER \\\n", + " --type groups \\\n", + " --schema data/schema_ABC.json \\\n", + " --parties 3 \\\n", + " --output credentials.json\n", "\n", "with open('credentials.json') as f:\n", " credentials = json.load(f)\n", @@ -607,7 +612,12 @@ } ], "source": [ - "!clkutil hash data/dataset-alice.csv $SECRET data/schema_ABC.json dataset-alice-hashed.json --check-header false" + "!clkutil hash \\\n", + " data/dataset-alice.csv \\\n", + " $SECRET \\\n", + " data/schema_ABC.json \\\n", + " dataset-alice-hashed.json \\\n", + " --check-header false" ] }, { @@ -623,12 +633,16 @@ "name": "stdout", "output_type": "stream", "text": [ - "{\"message\": \"Updated\", \"receipt_token\": \"0351e7dcca593e9704f75bf6891a95804b22c0af51474f92\"}" + "{\"message\": \"Updated\", \"receipt_token\": \"c202d98eb83c7e55e6177ba9bcf55cb35f40ac1d21714897\"}" ] } ], "source": [ - "!clkutil upload --server $SERVER --apikey $update_token_alice --project $project_id dataset-alice-hashed.json" + "!clkutil upload \\\n", + " --server $SERVER \\\n", + " --apikey $update_token_alice \\\n", + " --project $project_id \\\n", + " dataset-alice-hashed.json" ] }, { @@ -658,7 +672,12 @@ } ], "source": [ - "!clkutil hash data/dataset-bob.csv $SECRET data/schema_ABC.json dataset-bob-hashed.json --check-header false" + "!clkutil hash \\\n", + " data/dataset-bob.csv \\\n", + " $SECRET \\\n", + " data/schema_ABC.json \\\n", + " dataset-bob-hashed.json \\\n", + " --check-header false" ] }, { @@ -674,12 +693,16 @@ "name": "stdout", "output_type": "stream", "text": [ - "{\"message\": \"Updated\", \"receipt_token\": \"ad857ec2489061afc324c866adbf74296f5d55768868f91c\"}" + "{\"message\": \"Updated\", \"receipt_token\": \"75083f544df8e944cc590089bb3e31c134e810992f08ea80\"}" ] } ], "source": [ - "!clkutil upload --server $SERVER --apikey $update_token_bob --project $project_id dataset-bob-hashed.json" + "!clkutil upload \\\n", + " --server $SERVER \\\n", + " --apikey $update_token_bob \\\n", + " --project $project_id \\\n", + " dataset-bob-hashed.json" ] }, { @@ -709,7 +732,12 @@ } ], "source": [ - "!clkutil hash data/dataset-charlie.csv $SECRET data/schema_ABC.json dataset-charlie-hashed.json --check-header false" + "!clkutil hash \\\n", + " data/dataset-charlie.csv \\\n", + " $SECRET \\\n", + " data/schema_ABC.json \\\n", + " dataset-charlie-hashed.json \\\n", + " --check-header false" ] }, { @@ -725,12 +753,16 @@ "name": "stdout", "output_type": "stream", "text": [ - "{\"message\": \"Updated\", \"receipt_token\": \"ad6cfde05179416f4fbf5c7ef896bbd2fcb37554fe11c87a\"}" + "{\"message\": \"Updated\", \"receipt_token\": \"814b4a226453d7261348a403e134b0764501432bf679658f\"}" ] } ], "source": [ - "!clkutil upload --server $SERVER --apikey $update_token_charlie --project $project_id dataset-charlie-hashed.json" + "!clkutil upload \\\n", + " --server $SERVER \\\n", + " --apikey $update_token_charlie \\\n", + " --project $project_id \\\n", + " dataset-charlie-hashed.json" ] }, { @@ -754,7 +786,12 @@ }, "outputs": [], "source": [ - "!clkutil create --server $SERVER --project $project_id --apikey $result_token --threshold 0.7 --output=run-credentials.json\n", + "!clkutil create \\\n", + " --server $SERVER \\\n", + " --project $project_id \\\n", + " --apikey $result_token \\\n", + " --threshold 0.7 \\\n", + " --output=run-credentials.json\n", "\n", "with open('run-credentials.json') as f:\n", " run_credentials = json.load(f)\n", @@ -795,7 +832,13 @@ } ], "source": [ - "!clkutil results --server $SERVER --project $project_id --apikey $result_token --run $run_id --watch --output linkage-output.json" + "!clkutil results \\\n", + " --server $SERVER \\\n", + " --project $project_id \\\n", + " --apikey $result_token \\\n", + " --run $run_id \\\n", + " --watch \\\n", + " --output linkage-output.json" ] }, { @@ -810,20 +853,20 @@ { "data": { "text/plain": [ - "[[[0, 2188], [1, 2186], [2, 2194]],\n", - " [[1, 9], [2, 9], [0, 10]],\n", - " [[1, 332], [2, 329], [0, 330]],\n", - " [[0, 287], [2, 293], [1, 295]],\n", - " [[1, 399], [2, 401], [0, 414]],\n", - " [[0, 1074], [1, 1032], [2, 1088]],\n", - " [[0, 2482], [1, 2494], [2, 2509]],\n", - " [[0, 1723], [1, 1678], [2, 1714]],\n", + "[[[0, 1787], [1, 1751], [2, 1784]],\n", + " [[0, 565], [1, 557], [2, 564]],\n", + " [[0, 836], [1, 815], [2, 850]],\n", + " [[0, 505], [2, 495]],\n", + " [[0, 536], [2, 525], [1, 512]],\n", + " [[0, 1641], [2, 1608], [1, 1584]],\n", " [[0, 2234], [1, 2228], [2, 2242]],\n", + " [[0, 781], [1, 762], [2, 799]],\n", " [[0, 918], [2, 2840]],\n", - " [[0, 2461], [2, 2479], [1, 2468]],\n", " [[1, 1393], [2, 1421], [0, 1451]],\n", - " [[0, 2343], [1, 2338], [2, 2351]],\n", - " [[0, 1077], [2, 1091], [1, 1036]],\n", + " [[1, 1587], [2, 1609], [0, 1642]],\n", + " [[1, 1730], [2, 1767]],\n", + " [[1, 2808], [2, 2813]],\n", + " [[0, 2765], [2, 2794], [1, 2789]],\n", " [[1, 351], [2, 356]]]" ] }, @@ -927,21 +970,21 @@ " \n", " \n", " 0\n", - " msl5\n", - " sydnev\n", + " male\n", + " melbourne\n", " \n", " \n", " \n", " 1\n", - " male\n", - " melbourne\n", + " femalr\n", " \n", + " 277039.294\n", " \n", " \n", " 2\n", - " male\n", - " canbrrra\n", " \n", + " pertb\n", + " 21407e.192\n", " \n", " \n", " 3\n", @@ -951,38 +994,38 @@ " \n", " \n", " 4\n", + " male\n", + " canberra\n", + " \n", + " \n", + " \n", + " 5\n", " femaoe\n", " sydn3y\n", " \n", " \n", " \n", - " 5\n", + " 6\n", " male\n", " \n", " 154195.553\n", " \n", " \n", - " 6\n", + " 7\n", " female\n", " \n", " 44652.704\n", " \n", " \n", - " 7\n", + " 8\n", " male\n", " sydnely\n", " \n", " \n", " \n", - " 8\n", - " male\n", - " acbbeera\n", - " 81191.584\n", - " \n", - " \n", " 9\n", - " maoe\n", - " mesllootrne\n", + " mal3\n", + " sydney\n", " \n", " \n", " \n", @@ -990,17 +1033,17 @@ "" ], "text/plain": [ - " gender city income\n", - "0 msl5 sydnev \n", - "1 male melbourne \n", - "2 male canbrrra \n", - "3 mlebourne 56899.522\n", - "4 femaoe sydn3y \n", - "5 male 154195.553\n", - "6 female 44652.704\n", - "7 male sydnely \n", - "8 male acbbeera 81191.584\n", - "9 maoe mesllootrne " + " gender city income\n", + "0 male melbourne \n", + "1 femalr 277039.294\n", + "2 pertb 21407e.192\n", + "3 mlebourne 56899.522\n", + "4 male canberra \n", + "5 femaoe sydn3y \n", + "6 male 154195.553\n", + "7 female 44652.704\n", + "8 male sydnely \n", + "9 mal3 sydney " ] }, "execution_count": 19, @@ -1105,33 +1148,42 @@ " \n", " \n", " 6450\n", - " 4914\n", - " runy\n", - " briten\n", - " 01-01-1971\n", - " 03 7339 6523\n", - " malw\n", + " 5436\n", + " nikki\n", + " spears\n", + " 10-02-2097\n", + " 06 9447 1767\n", + " 156639.106\n", " \n", " \n", " 6451\n", - " 4914\n", - " ruby\n", - " britten\n", - " 01-01-1971\n", - " 03 7338 6523\n", - " melbourne\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", " 6452\n", - " 4914\n", - " ruby\n", - " br'ltten\n", - " 01-01-1971\n", - " 03 7337 6523\n", - " 59544.220\n", + " 5833\n", + " nell\n", + " rud\n", + " 06-1p-1956\n", + " 08 5510 5369\n", + " sydnev\n", " \n", " \n", " 6453\n", + " 5833\n", + " ned\n", + " reif\n", + " 06-20-1956\n", + " 08 5510 5369\n", + " 117275.089\n", + " \n", + " \n", + " 6454\n", " \n", " \n", " \n", @@ -1140,31 +1192,22 @@ " \n", " \n", " \n", - " 6454\n", - " 7461\n", - " henrv\n", - " clisxjold\n", - " 13-07-1918\n", - " 03 9072 8476\n", - " male\n", - " \n", - " \n", " 6455\n", - " 7461\n", - " henry\n", - " clissold\n", - " 13-07-1918\n", - " 03 9071 8376\n", - " canberra\n", + " 872\n", + " jackson\n", + " green\n", + " 06-09-1920\n", + " \n", + " \n", " \n", " \n", " 6456\n", - " 7461\n", - " henry\n", - " clissmold\n", - " 13-08-2018\n", - " 03 9071 8376\n", - " 67649.443\n", + " 872\n", + " jackson\n", + " gnn\n", + " 06-00-1920\n", + " 08 3409 2246\n", + " 147663.277\n", " \n", " \n", " 6457\n", @@ -1177,30 +1220,30 @@ " \n", " \n", " 6458\n", - " 3944\n", - " jean-claude\n", - " mckinnell\n", - " 14-06-1925\n", - " 08 8157 2282\n", + " 8662\n", + " luct\n", + " pulfort\n", + " 05-03-1903\n", + " 02 0726 9479\n", " male\n", " \n", " \n", " 6459\n", - " 3944\n", - " jean-claude\n", - " mckinenll\n", - " \n", - " 08 8157 2282\n", + " 8662\n", + " lucy\n", + " pulford\n", + " 05-03-1903\n", " \n", + " melbourrie\n", " \n", " \n", " 6460\n", - " 3944\n", - " jean-claude\n", - " mkniell\n", - " 14-06-1925\n", - " 08 8157 2282\n", - " \n", + " 8662\n", + " lusy\n", + " pulford\n", + " 05-03-1993\n", + " 02 0726 0489\n", + " 192230.309\n", " \n", " \n", " 6461\n", @@ -1243,22 +1286,22 @@ "" ], "text/plain": [ - " id given name surname dob phone number non-linking\n", - "6450 4914 runy briten 01-01-1971 03 7339 6523 malw\n", - "6451 4914 ruby britten 01-01-1971 03 7338 6523 melbourne\n", - "6452 4914 ruby br'ltten 01-01-1971 03 7337 6523 59544.220\n", - "6453 \n", - "6454 7461 henrv clisxjold 13-07-1918 03 9072 8476 male\n", - "6455 7461 henry clissold 13-07-1918 03 9071 8376 canberra\n", - "6456 7461 henry clissmold 13-08-2018 03 9071 8376 67649.443\n", - "6457 \n", - "6458 3944 jean-claude mckinnell 14-06-1925 08 8157 2282 male\n", - "6459 3944 jean-claude mckinenll 08 8157 2282 \n", - "6460 3944 jean-claude mkniell 14-06-1925 08 8157 2282 \n", - "6461 \n", - "6462 1885 nicholas robson 06-01-1914 02 7799 6803 canberra\n", - "6463 1885 nicho|as robson 06-91-1914 02 7799 6803 61333.218\n", - "6464 " + " id given name surname dob phone number non-linking\n", + "6450 5436 nikki spears 10-02-2097 06 9447 1767 156639.106\n", + "6451 \n", + "6452 5833 nell rud 06-1p-1956 08 5510 5369 sydnev\n", + "6453 5833 ned reif 06-20-1956 08 5510 5369 117275.089\n", + "6454 \n", + "6455 872 jackson green 06-09-1920 \n", + "6456 872 jackson gnn 06-00-1920 08 3409 2246 147663.277\n", + "6457 \n", + "6458 8662 luct pulfort 05-03-1903 02 0726 9479 male\n", + "6459 8662 lucy pulford 05-03-1903 melbourrie\n", + "6460 8662 lusy pulford 05-03-1993 02 0726 0489 192230.309\n", + "6461 \n", + "6462 1885 nicholas robson 06-01-1914 02 7799 6803 canberra\n", + "6463 1885 nicho|as robson 06-91-1914 02 7799 6803 61333.218\n", + "6464 " ] }, "execution_count": 21, @@ -1276,6 +1319,26 @@ "pd.DataFrame(table, columns=['id', 'given name', 'surname', 'dob', 'phone number', 'non-linking']).tail(15)\n", "\n" ] + }, + { + "cell_type": "code", + "execution_count": 22, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[31mProject deleted\u001b[0m\r\n" + ] + } + ], + "source": [ + "# Deleting the project\n", + "!clkutil delete-project --project=\"{credentials['project_id']}\" \\\n", + " --apikey=\"{credentials['result_token']}\" \\\n", + " --server=\"{SERVER}\"" + ] } ], "metadata": { From 3f23804e6fa3e3475ace4d0555a750a296604737 Mon Sep 17 00:00:00 2001 From: wilko Date: Thu, 21 Nov 2019 17:41:30 +1100 Subject: [PATCH 07/12] back to the future --- docs/tutorial/Permutations.ipynb | 196 ++++++++++++++++++++++++++----- 1 file changed, 167 insertions(+), 29 deletions(-) diff --git a/docs/tutorial/Permutations.ipynb b/docs/tutorial/Permutations.ipynb index 15925f08..c815debd 100644 --- a/docs/tutorial/Permutations.ipynb +++ b/docs/tutorial/Permutations.ipynb @@ -297,19 +297,16 @@ "source": [ "%%writefile {schema.name}\n", "{\n", - " \"version\": 1,\n", + " \"version\": 3,\n", " \"clkConfig\": {\n", " \"l\": 1024,\n", - " \"k\": 30,\n", - " \"hash\": {\n", - " \"type\": \"doubleHash\"\n", - " },\n", + " \"xor_folds\": 0,\n", " \"kdf\": {\n", " \"type\": \"HKDF\",\n", " \"hash\": \"SHA256\",\n", - " \"info\": \"c2NoZW1hX2V4YW1wbGU=\",\n", - " \"salt\": \"SCbL2zHNnmsckfzchsNkZY9XoHk96P/G5nUBrM7ybymlEFsMV6PAeDZCNp3rfNUPCtLDMOGQHG4pCQpfhiHCyA==\",\n", - " \"keySize\": 64\n", + " \"info\": \"c2NoZW1hX2V4YW1wbGU=\",\n", + " \"salt\": \"SCbL2zHNnmsckfzchsNkZY9XoHk96P/G5nUBrM7ybymlEFsMV6PAeDZCNp3rfNUPCtLDMOGQHG4pCQpfhiHCyA==\",\n", + " \"keySize\": 64\n", " }\n", " },\n", " \"features\": [\n", @@ -319,48 +316,189 @@ " },\n", " {\n", " \"identifier\": \"given_name\",\n", - " \"format\": { \"type\": \"string\", \"encoding\": \"utf-8\" },\n", - " \"hashing\": { \"ngram\": 2, \"weight\": 1 }\n", + " \"format\": {\n", + " \"type\": \"string\",\n", + " \"encoding\": \"utf-8\"\n", + " },\n", + " \"hashing\": {\n", + " \"strategy\": {\n", + " \"bitsPerToken\": 30\n", + " },\n", + " \"hash\": {\n", + " \"type\": \"doubleHash\"\n", + " },\n", + " \"comparison\": {\n", + " \"type\": \"ngram\",\n", + " \"n\": 2,\n", + " \"positional\": false\n", + " }\n", + " }\n", " },\n", " {\n", " \"identifier\": \"surname\",\n", - " \"format\": { \"type\": \"string\", \"encoding\": \"utf-8\" },\n", - " \"hashing\": { \"ngram\": 2, \"weight\": 1 }\n", + " \"format\": {\n", + " \"type\": \"string\",\n", + " \"encoding\": \"utf-8\"\n", + " },\n", + " \"hashing\": {\n", + " \"strategy\": {\n", + " \"bitsPerToken\": 30\n", + " },\n", + " \"hash\": {\n", + " \"type\": \"doubleHash\"\n", + " },\n", + " \"comparison\": {\n", + " \"type\": \"ngram\",\n", + " \"n\": 2,\n", + " \"positional\": false\n", + " }\n", + " }\n", " },\n", " {\n", " \"identifier\": \"street_number\",\n", - " \"format\": { \"type\": \"integer\" },\n", - " \"hashing\": { \"ngram\": 1, \"positional\": true, \"weight\": 0.5, \"missingValue\": {\"sentinel\": \"\"} }\n", + " \"format\": {\n", + " \"type\": \"integer\"\n", + " },\n", + " \"hashing\": {\n", + " \"missingValue\": {\n", + " \"sentinel\": \"\"\n", + " },\n", + " \"strategy\": {\n", + " \"bitsPerToken\": 15\n", + " },\n", + " \"hash\": {\n", + " \"type\": \"doubleHash\"\n", + " },\n", + " \"comparison\": {\n", + " \"type\": \"ngram\",\n", + " \"n\": 1,\n", + " \"positional\": true\n", + " }\n", + " }\n", " },\n", " {\n", " \"identifier\": \"address_1\",\n", - " \"format\": { \"type\": \"string\", \"encoding\": \"utf-8\" },\n", - " \"hashing\": { \"ngram\": 2, \"weight\": 0.5 }\n", + " \"format\": {\n", + " \"type\": \"string\",\n", + " \"encoding\": \"utf-8\"\n", + " },\n", + " \"hashing\": {\n", + " \"strategy\": {\n", + " \"bitsPerToken\": 15\n", + " },\n", + " \"hash\": {\n", + " \"type\": \"doubleHash\"\n", + " },\n", + " \"comparison\": {\n", + " \"type\": \"ngram\",\n", + " \"n\": 2,\n", + " \"positional\": false\n", + " }\n", + " }\n", " },\n", " {\n", " \"identifier\": \"address_2\",\n", - " \"format\": { \"type\": \"string\", \"encoding\": \"utf-8\" },\n", - " \"hashing\": { \"ngram\": 2, \"weight\": 0.5 }\n", + " \"format\": {\n", + " \"type\": \"string\",\n", + " \"encoding\": \"utf-8\"\n", + " },\n", + " \"hashing\": {\n", + " \"strategy\": {\n", + " \"bitsPerToken\": 15\n", + " },\n", + " \"hash\": {\n", + " \"type\": \"doubleHash\"\n", + " },\n", + " \"comparison\": {\n", + " \"type\": \"ngram\",\n", + " \"n\": 2,\n", + " \"positional\": false\n", + " }\n", + " }\n", " },\n", " {\n", " \"identifier\": \"suburb\",\n", - " \"format\": { \"type\": \"string\", \"encoding\": \"utf-8\" },\n", - " \"hashing\": { \"ngram\": 2, \"weight\": 0.5 }\n", + " \"format\": {\n", + " \"type\": \"string\",\n", + " \"encoding\": \"utf-8\"\n", + " },\n", + " \"hashing\": {\n", + " \"strategy\": {\n", + " \"bitsPerToken\": 15\n", + " },\n", + " \"hash\": {\n", + " \"type\": \"doubleHash\"\n", + " },\n", + " \"comparison\": {\n", + " \"type\": \"ngram\",\n", + " \"n\": 2,\n", + " \"positional\": false\n", + " }\n", + " }\n", " },\n", " {\n", " \"identifier\": \"postcode\",\n", - " \"format\": { \"type\": \"integer\", \"minimum\": 100, \"maximum\": 9999 },\n", - " \"hashing\": { \"ngram\": 1, \"positional\": true, \"weight\": 0.5 }\n", + " \"format\": {\n", + " \"type\": \"integer\",\n", + " \"minimum\": 100,\n", + " \"maximum\": 9999\n", + " },\n", + " \"hashing\": {\n", + " \"strategy\": {\n", + " \"bitsPerToken\": 15\n", + " },\n", + " \"hash\": {\n", + " \"type\": \"doubleHash\"\n", + " },\n", + " \"comparison\": {\n", + " \"type\": \"ngram\",\n", + " \"n\": 1,\n", + " \"positional\": true\n", + " }\n", + " }\n", " },\n", " {\n", " \"identifier\": \"state\",\n", - " \"format\": { \"type\": \"string\", \"encoding\": \"utf-8\", \"maxLength\": 3 },\n", - " \"hashing\": { \"ngram\": 2, \"weight\": 1 }\n", + " \"format\": {\n", + " \"type\": \"string\",\n", + " \"encoding\": \"utf-8\",\n", + " \"maxLength\": 3\n", + " },\n", + " \"hashing\": {\n", + " \"strategy\": {\n", + " \"bitsPerToken\": 30\n", + " },\n", + " \"hash\": {\n", + " \"type\": \"doubleHash\"\n", + " },\n", + " \"comparison\": {\n", + " \"type\": \"ngram\",\n", + " \"n\": 2,\n", + " \"positional\": false\n", + " }\n", + " }\n", " },\n", " {\n", " \"identifier\": \"date_of_birth\",\n", - " \"format\": { \"type\": \"integer\" },\n", - " \"hashing\": { \"ngram\": 1, \"positional\": true, \"weight\": 1, \"missingValue\": {\"sentinel\": \"\"} }\n", + " \"format\": {\n", + " \"type\": \"integer\"\n", + " },\n", + " \"hashing\": {\n", + " \"missingValue\": {\n", + " \"sentinel\": \"\"\n", + " },\n", + " \"strategy\": {\n", + " \"bitsPerToken\": 30\n", + " },\n", + " \"hash\": {\n", + " \"type\": \"doubleHash\"\n", + " },\n", + " \"comparison\": {\n", + " \"type\": \"ngram\",\n", + " \"n\": 1,\n", + " \"positional\": true\n", + " }\n", + " }\n", " },\n", " {\n", " \"identifier\": \"soc_sec_id\",\n", @@ -1133,13 +1271,13 @@ "pycharm": { "stem_cell": { "cell_type": "raw", + "source": [], "metadata": { "collapsed": false - }, - "source": [] + } } } }, "nbformat": 4, "nbformat_minor": 2 -} +} \ No newline at end of file From 05392d61142261dd298e9bc079f8fdf6dc9e1249 Mon Sep 17 00:00:00 2001 From: wilko Date: Thu, 21 Nov 2019 17:49:50 +1100 Subject: [PATCH 08/12] sorted alphabetically --- docs/tutorial/tutorial-requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/tutorial/tutorial-requirements.txt b/docs/tutorial/tutorial-requirements.txt index 85cde1d6..63998975 100644 --- a/docs/tutorial/tutorial-requirements.txt +++ b/docs/tutorial/tutorial-requirements.txt @@ -1,6 +1,6 @@ clkhash==0.15.0 ipython matplotlib +pandas recordlinkage requests -pandas From dc3205f761895ec1a00b691103a37b8b54c77d73 Mon Sep 17 00:00:00 2001 From: wilko Date: Thu, 21 Nov 2019 18:01:09 +1100 Subject: [PATCH 09/12] update to new sim scores format --- docs/tutorial/Similarity Scores.ipynb | 42 +++++++++++++-------------- 1 file changed, 21 insertions(+), 21 deletions(-) diff --git a/docs/tutorial/Similarity Scores.ipynb b/docs/tutorial/Similarity Scores.ipynb index f6124619..4a2de1e7 100644 --- a/docs/tutorial/Similarity Scores.ipynb +++ b/docs/tutorial/Similarity Scores.ipynb @@ -79,7 +79,7 @@ "name": "stdout", "output_type": "stream", "text": [ - "Testing anonlink-entity-service hosted at https://testing.es.data61.xyz\n" + "Testing anonlink-entity-service hosted at http://0.0.0.0:8851\n" ] } ], @@ -101,7 +101,7 @@ "name": "stdout", "output_type": "stream", "text": [ - "{\"project_count\": 7082, \"rate\": 2845548, \"status\": \"ok\"}\r\n" + "{\"project_count\": 1, \"rate\": 1, \"status\": \"ok\"}\r\n" ] } ], @@ -313,7 +313,7 @@ "name": "stdout", "output_type": "stream", "text": [ - "Overwriting /var/folders/mw/21b9jb5d1c9_3_z0dq7hpx1m00j_0b/T/tmpgayhu51z\n" + "Overwriting /var/folders/mw/21b9jb5d1c9_3_z0dq7hpx1m00j_0b/T/tmpnwnrgmrz\n" ] } ], @@ -555,17 +555,17 @@ "name": "stdout", "output_type": "stream", "text": [ - "Credentials will be saved in /var/folders/mw/21b9jb5d1c9_3_z0dq7hpx1m00j_0b/T/tmp6fab0949\n", + "Credentials will be saved in /var/folders/mw/21b9jb5d1c9_3_z0dq7hpx1m00j_0b/T/tmpb4_q6h6x\n", "\u001b[31mProject created\u001b[0m\n" ] }, { "data": { "text/plain": [ - "{'project_id': '224ac23b0c6ba6c661ade4082d0741fc94b9af3ebf09f9fd',\n", - " 'result_token': '331d33ba45f4b636aac944ba1ba52f2602a3f18bcff9ff25',\n", - " 'update_tokens': ['8826451e71fb0ea6f1b07d7e54264ab7477f4d97898f8ce5',\n", - " '122076e7ef37ccbedda55d028210d4e5a14d9441329ea492']}" + "{'project_id': 'a17e061f51d850298ff1e170f53ef157fc672cf086c5be1f',\n", + " 'result_token': 'b5379610146169560ef083763e6b097e38b4f7dca52032a4',\n", + " 'update_tokens': ['ed82d9aace5399f58c8bf749210ebef8aecb3ed60c7118ac',\n", + " 'b096e769b1c194000bf306be38be7cdbbcf9f69dac559e99']}" ] }, "execution_count": 8, @@ -618,8 +618,8 @@ "name": "stdout", "output_type": "stream", "text": [ - "\u001b[31mCLK data written to /var/folders/mw/21b9jb5d1c9_3_z0dq7hpx1m00j_0b/T/tmp2s_mj16v.json\u001b[0m\n", - "\u001b[31mCLK data written to /var/folders/mw/21b9jb5d1c9_3_z0dq7hpx1m00j_0b/T/tmpvlxznb1y.json\u001b[0m\n" + "\u001b[31mCLK data written to /var/folders/mw/21b9jb5d1c9_3_z0dq7hpx1m00j_0b/T/tmpz8n7ijbj.json\u001b[0m\n", + "\u001b[31mCLK data written to /var/folders/mw/21b9jb5d1c9_3_z0dq7hpx1m00j_0b/T/tmpuwp9igb6.json\u001b[0m\n" ] } ], @@ -818,16 +818,16 @@ "name": "stdout", "output_type": "stream", "text": [ - "[76, 2345, 1.0]\n", - "[83, 3439, 1.0]\n", - "[103, 863, 1.0]\n", - "[154, 2391, 1.0]\n", - "[177, 4247, 1.0]\n", - "[192, 1176, 1.0]\n", - "[270, 4516, 1.0]\n", - "[312, 1253, 1.0]\n", - "[407, 3743, 1.0]\n", - "[670, 3550, 1.0]\n" + "[[0, 76], [1, 2345], 1.0]\n", + "[[0, 83], [1, 3439], 1.0]\n", + "[[0, 103], [1, 863], 1.0]\n", + "[[0, 154], [1, 2391], 1.0]\n", + "[[0, 177], [1, 4247], 1.0]\n", + "[[0, 192], [1, 1176], 1.0]\n", + "[[0, 270], [1, 4516], 1.0]\n", + "[[0, 312], [1, 1253], 1.0]\n", + "[[0, 407], [1, 3743], 1.0]\n", + "[[0, 670], [1, 3550], 1.0]\n" ] } ], @@ -977,7 +977,7 @@ "source": [ "scores_matches = []\n", "scores_non_matches = []\n", - "for a, b, score in data:\n", + "for (_, a), (_, b), score in data:\n", " if score < 0.79:\n", " continue\n", " if (a, b) in true_matches:\n", From d5f29805ae9903ef0fb481aff370a39683caf7f3 Mon Sep 17 00:00:00 2001 From: wilko Date: Mon, 25 Nov 2019 09:39:42 +1100 Subject: [PATCH 10/12] remove mentioning the mask --- docs/tutorial/Similarity Scores.ipynb | 50 +++++++++++++-------------- 1 file changed, 25 insertions(+), 25 deletions(-) diff --git a/docs/tutorial/Similarity Scores.ipynb b/docs/tutorial/Similarity Scores.ipynb index 4a2de1e7..328253bc 100644 --- a/docs/tutorial/Similarity Scores.ipynb +++ b/docs/tutorial/Similarity Scores.ipynb @@ -79,7 +79,7 @@ "name": "stdout", "output_type": "stream", "text": [ - "Testing anonlink-entity-service hosted at http://0.0.0.0:8851\n" + "Testing anonlink-entity-service hosted at https://testing.es.data61.xyz\n" ] } ], @@ -101,7 +101,7 @@ "name": "stdout", "output_type": "stream", "text": [ - "{\"project_count\": 1, \"rate\": 1, \"status\": \"ok\"}\r\n" + "{\"project_count\": 7082, \"rate\": 2845548, \"status\": \"ok\"}\r\n" ] } ], @@ -313,7 +313,7 @@ "name": "stdout", "output_type": "stream", "text": [ - "Overwriting /var/folders/mw/21b9jb5d1c9_3_z0dq7hpx1m00j_0b/T/tmpnwnrgmrz\n" + "Overwriting /var/folders/mw/21b9jb5d1c9_3_z0dq7hpx1m00j_0b/T/tmpgayhu51z\n" ] } ], @@ -555,17 +555,17 @@ "name": "stdout", "output_type": "stream", "text": [ - "Credentials will be saved in /var/folders/mw/21b9jb5d1c9_3_z0dq7hpx1m00j_0b/T/tmpb4_q6h6x\n", + "Credentials will be saved in /var/folders/mw/21b9jb5d1c9_3_z0dq7hpx1m00j_0b/T/tmp6fab0949\n", "\u001b[31mProject created\u001b[0m\n" ] }, { "data": { "text/plain": [ - "{'project_id': 'a17e061f51d850298ff1e170f53ef157fc672cf086c5be1f',\n", - " 'result_token': 'b5379610146169560ef083763e6b097e38b4f7dca52032a4',\n", - " 'update_tokens': ['ed82d9aace5399f58c8bf749210ebef8aecb3ed60c7118ac',\n", - " 'b096e769b1c194000bf306be38be7cdbbcf9f69dac559e99']}" + "{'project_id': '224ac23b0c6ba6c661ade4082d0741fc94b9af3ebf09f9fd',\n", + " 'result_token': '331d33ba45f4b636aac944ba1ba52f2602a3f18bcff9ff25',\n", + " 'update_tokens': ['8826451e71fb0ea6f1b07d7e54264ab7477f4d97898f8ce5',\n", + " '122076e7ef37ccbedda55d028210d4e5a14d9441329ea492']}" ] }, "execution_count": 8, @@ -618,8 +618,8 @@ "name": "stdout", "output_type": "stream", "text": [ - "\u001b[31mCLK data written to /var/folders/mw/21b9jb5d1c9_3_z0dq7hpx1m00j_0b/T/tmpz8n7ijbj.json\u001b[0m\n", - "\u001b[31mCLK data written to /var/folders/mw/21b9jb5d1c9_3_z0dq7hpx1m00j_0b/T/tmpuwp9igb6.json\u001b[0m\n" + "\u001b[31mCLK data written to /var/folders/mw/21b9jb5d1c9_3_z0dq7hpx1m00j_0b/T/tmp2s_mj16v.json\u001b[0m\n", + "\u001b[31mCLK data written to /var/folders/mw/21b9jb5d1c9_3_z0dq7hpx1m00j_0b/T/tmpvlxznb1y.json\u001b[0m\n" ] } ], @@ -746,7 +746,7 @@ "source": [ "## Results\n", "\n", - "Now after some delay (depending on the size) we can fetch the mask.\n", + "Now after some delay (depending on the size) we can fetch the result.\n", "This can be done with clkutil:\n", "\n", " !clkutil results --server \"{url}\" \\\n", @@ -818,16 +818,16 @@ "name": "stdout", "output_type": "stream", "text": [ - "[[0, 76], [1, 2345], 1.0]\n", - "[[0, 83], [1, 3439], 1.0]\n", - "[[0, 103], [1, 863], 1.0]\n", - "[[0, 154], [1, 2391], 1.0]\n", - "[[0, 177], [1, 4247], 1.0]\n", - "[[0, 192], [1, 1176], 1.0]\n", - "[[0, 270], [1, 4516], 1.0]\n", - "[[0, 312], [1, 1253], 1.0]\n", - "[[0, 407], [1, 3743], 1.0]\n", - "[[0, 670], [1, 3550], 1.0]\n" + "[76, 2345, 1.0]\n", + "[83, 3439, 1.0]\n", + "[103, 863, 1.0]\n", + "[154, 2391, 1.0]\n", + "[177, 4247, 1.0]\n", + "[192, 1176, 1.0]\n", + "[270, 4516, 1.0]\n", + "[312, 1253, 1.0]\n", + "[407, 3743, 1.0]\n", + "[670, 3550, 1.0]\n" ] } ], @@ -977,7 +977,7 @@ "source": [ "scores_matches = []\n", "scores_non_matches = []\n", - "for (_, a), (_, b), score in data:\n", + "for a, b, score in data:\n", " if score < 0.79:\n", " continue\n", " if (a, b) in true_matches:\n", @@ -1071,13 +1071,13 @@ "pycharm": { "stem_cell": { "cell_type": "raw", + "source": [], "metadata": { "collapsed": false - }, - "source": [] + } } } }, "nbformat": 4, "nbformat_minor": 2 -} +} \ No newline at end of file From d73aca8d76866b49eb11af50569029c32d51d677 Mon Sep 17 00:00:00 2001 From: wilko Date: Mon, 25 Nov 2019 10:13:10 +1100 Subject: [PATCH 11/12] show data and link to schema --- ...multiparty-linkage-in-entity-service.ipynb | 419 ++++++++++++++---- 1 file changed, 338 insertions(+), 81 deletions(-) diff --git a/docs/tutorial/multiparty-linkage-in-entity-service.ipynb b/docs/tutorial/multiparty-linkage-in-entity-service.ipynb index d74a4c19..b4ea0f79 100644 --- a/docs/tutorial/multiparty-linkage-in-entity-service.ipynb +++ b/docs/tutorial/multiparty-linkage-in-entity-service.ipynb @@ -13,6 +13,7 @@ "import csv\n", "import itertools\n", "import os\n", + "import pandas as pd\n", "\n", "import requests" ] @@ -26,7 +27,262 @@ "# Entity Service: Multiparty linkage demo\n", "This notebook is a demonstration of the multiparty linkage capability that has been implemented in the Entity Service.\n", "\n", - "We show how five parties may upload their hashed data to the Entity Service to obtain a multiparty linkage result. This result identifies each entity across all datasets in which they are included." + "We show how five parties may upload their hashed data to the Entity Service to obtain a multiparty linkage result. This result identifies each entity across all datasets in which they are included.\n", + "\n", + "Each party has a dataset of the following form:" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
givennamesurnamedobgendercityincomephone number
id
0tarahilton27-08-1941malecanberra84052.97308 2210 0298
3saJivernre22-12-2972malsperth50104.11802 1090 1906
7sliverpaciorekNaNmalssydney31750.893NaN
9rubygeorge09-05-1939malesydney135099.87507 4698 6255
10eyrinmcampbell29-1q-1983maleperthNaN08 299y 1535
\n", + "
" + ], + "text/plain": [ + " givenname surname dob gender city income phone number\n", + "id \n", + "0 tara hilton 27-08-1941 male canberra 84052.973 08 2210 0298\n", + "3 saJi vernre 22-12-2972 mals perth 50104.118 02 1090 1906\n", + "7 sliver paciorek NaN mals sydney 31750.893 NaN\n", + "9 ruby george 09-05-1939 male sydney 135099.875 07 4698 6255\n", + "10 eyrinm campbell 29-1q-1983 male perth NaN 08 299y 1535" + ] + }, + "execution_count": 2, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "pd.read_csv('data/dataset-1.csv', index_col='id').head()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Comparing the beginning of the first dataset to the second, we can see that the quality of the data is not very good. There are a lot of spelling mistakes and missing information. Let's see how well the entity service does with linking those entities." + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
givennamesurnamedobgendercityincomephone number
id
3zaliverner22-12-1972maleperth50104.11802 1090 1906
4samueltremellen21-12-1923malemelbourne159316.09103 3605 9336
5amylodge16-01-1958malecanberra70170.45607 8286 9372
7oIjipacioerk10-02-1959mal3sydney31750.89304 4220 5949
10erinkampgell29-12-1983makeperth331476.59808 2996 1445
\n", + "
" + ], + "text/plain": [ + " givenname surname dob gender city income \\\n", + "id \n", + "3 zali verner 22-12-1972 male perth 50104.118 \n", + "4 samuel tremellen 21-12-1923 male melbourne 159316.091 \n", + "5 amy lodge 16-01-1958 male canberra 70170.456 \n", + "7 oIji pacioerk 10-02-1959 mal3 sydney 31750.893 \n", + "10 erin kampgell 29-12-1983 make perth 331476.598 \n", + "\n", + " phone number \n", + "id \n", + "3 02 1090 1906 \n", + "4 03 3605 9336 \n", + "5 07 8286 9372 \n", + "7 04 4220 5949 \n", + "10 08 2996 1445 " + ] + }, + "execution_count": 3, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "pd.read_csv('data/dataset-2.csv', index_col='id').head()" ] }, { @@ -41,7 +297,7 @@ }, { "cell_type": "code", - "execution_count": 2, + "execution_count": 4, "metadata": { "pycharm": { "is_executing": false @@ -52,7 +308,7 @@ "name": "stdout", "output_type": "stream", "text": [ - "{'project_count': 5944, 'rate': 2260983, 'status': 'ok'}\n", + "{'project_count': 7107, 'rate': 2884208, 'status': 'ok'}\n", "{'anonlink': '0.12.5', 'entityservice': 'v1.13.0-alpha', 'python': '3.7.5'}\n" ] } @@ -76,7 +332,7 @@ }, { "cell_type": "code", - "execution_count": 3, + "execution_count": 5, "metadata": { "pycharm": { "is_executing": false @@ -87,11 +343,11 @@ "name": "stdout", "output_type": "stream", "text": [ - "project_id: 21d8916332764c00c0861f1dda132c633c731c377fd89696\n", + "project_id: e3602cac3940582e87c636f3a3827176ca7abe8d5b4e0096\n", "\n", - "result_token: 4b8c53796161aad56414631fd553d5905256ea5cba0476e8\n", + "result_token: ca19df31d445fade86390f38c5d8f578d545c5f92376ffb3\n", "\n", - "update_tokens: ['f3dafb72996cbc0f453f2acde9dd0e037066039d492c96ee', '28c6cb8b3f85bb528574d51c1f67953af7bb9b835b119451', '028b0b1c05b1e669c7b5bf13caf3a53022481d867c3c0fb9', '105c8d242b51f30388f6f8b0bd4d32189127ea760d22377e', '36955c914e3e0d1aed86a5af32027dfb8a8169532ba4125e']\n" + "update_tokens: ['c24cab922055e8dd2c7ea639c342b9fce706fbbe7a531f8e', '7712f77f2ab2c2d7210ffa09465de5209ac9f50657fac0a8', 'ae41434b182d2ac82fc0646bf4e49e0e6c5e8f52f6350ba1', 'd8419a8c0f4b274ed1aca56d6adc8b8743c681b7eb02af9a', 'baefc60676a830b648fd176cc1c6d18248b048825036f8d6']\n" ] } ], @@ -125,12 +381,12 @@ "## Upload the hashed data\n", "This is where each party uploads their CLKs into the service. Here, we do the work of all five data providers inside this for loop. In a deployment scenario, each data provider would be uploading their own CLKs using their own update token.\n", "\n", - "These CLKs are already hashed using [clkhash](https://github.com/data61/clkhash), so for each data provider, we just need to upload their corresponding hash file." + "These CLKs are already hashed using [clkhash](https://github.com/data61/clkhash) (with [this](data/schema.json) linkage schema), so for each data provider, we just need to upload their corresponding hash file." ] }, { "cell_type": "code", - "execution_count": 4, + "execution_count": 6, "metadata": { "pycharm": { "is_executing": false @@ -143,27 +399,27 @@ "text": [ "Data provider 1: {\n", " \"message\": \"Updated\",\n", - " \"receipt_token\": \"3e102ce587ae97feb18aebf7596aee5ba3ba5b6a41d5bedf\"\n", + " \"receipt_token\": \"b060225db2fb1edda39bcc2153a9310392f87abcacd9db2b\"\n", "}\n", "\n", "Data provider 2: {\n", " \"message\": \"Updated\",\n", - " \"receipt_token\": \"ab758b30126ddc083bf65749773fc5856719b4273adc0703\"\n", + " \"receipt_token\": \"db94c740c469a9bda9931829d1ba58210426134a46ba1edb\"\n", "}\n", "\n", "Data provider 3: {\n", " \"message\": \"Updated\",\n", - " \"receipt_token\": \"e013c252746cbc5ceb00b4009500769ceb63389de886137c\"\n", + " \"receipt_token\": \"ad60b956a4f90c8dd16fb7d278c0a8670d0bb3348a19f70a\"\n", "}\n", "\n", "Data provider 4: {\n", " \"message\": \"Updated\",\n", - " \"receipt_token\": \"f2f38a3206197dd46b53c4c6da079527552d7c6e24b9b63e\"\n", + " \"receipt_token\": \"2ce533e0a87020654d150084389529ba05bb1ad1628a0bd4\"\n", "}\n", "\n", "Data provider 5: {\n", " \"message\": \"Updated\",\n", - " \"receipt_token\": \"e489cf14d65b211dd6c8b98b1a902f04e3b09c0e3da21a44\"\n", + " \"receipt_token\": \"ce6b281666226d181a9b8bb191daf57128400096d59bfd4c\"\n", "}\n", "\n" ] @@ -197,7 +453,7 @@ }, { "cell_type": "code", - "execution_count": 5, + "execution_count": 7, "metadata": { "pycharm": { "is_executing": false @@ -229,7 +485,7 @@ }, { "cell_type": "code", - "execution_count": 6, + "execution_count": 8, "metadata": { "pycharm": {} }, @@ -239,16 +495,16 @@ "text/plain": [ "{'current_stage': {'description': 'compute similarity scores',\n", " 'number': 2,\n", - " 'progress': {'absolute': 31440720,\n", + " 'progress': {'absolute': 0,\n", " 'description': 'number of already computed similarity scores',\n", - " 'relative': 0.2984721650891483}},\n", + " 'relative': 0.0}},\n", " 'stages': 3,\n", " 'state': 'running',\n", - " 'time_added': '2019-11-18T02:52:30.352381+00:00',\n", - " 'time_started': '2019-11-18T02:52:30.373760+00:00'}" + " 'time_added': '2019-11-24T23:12:37.412183+00:00',\n", + " 'time_started': '2019-11-24T23:12:37.436726+00:00'}" ] }, - "execution_count": 6, + "execution_count": 8, "metadata": {}, "output_type": "execute_result" } @@ -272,7 +528,7 @@ }, { "cell_type": "code", - "execution_count": 7, + "execution_count": 9, "metadata": {}, "outputs": [ { @@ -308,7 +564,7 @@ }, { "cell_type": "code", - "execution_count": 8, + "execution_count": 10, "metadata": { "pycharm": {} }, @@ -316,29 +572,29 @@ { "data": { "text/plain": [ - "[[[0, 287], [2, 293], [4, 277]],\n", - " [[0, 2387], [1, 2386]],\n", - " [[0, 264], [3, 252], [1, 272]],\n", - " [[0, 2496], [4, 2498]],\n", - " [[3, 147], [4, 147]],\n", - " [[3, 815], [4, 812]],\n", - " [[3, 1302], [4, 1343]],\n", - " [[0, 1691], [3, 1674]],\n", + "[[[3, 1831], [4, 1854]],\n", + " [[0, 2362], [2, 2369]],\n", + " [[2, 2910], [4, 2915]],\n", + " [[3, 1885], [4, 1902]],\n", + " [[2, 11], [3, 10]],\n", " [[0, 3085], [3, 3117]],\n", - " [[1, 2559], [4, 2545]],\n", - " [[0, 574], [3, 576], [4, 554]],\n", - " [[0, 424], [4, 387]],\n", - " [[1, 1087], [2, 1140]],\n", + " [[1, 815], [3, 838]],\n", + " [[1, 450], [2, 474]],\n", + " [[0, 1253], [2, 1252], [1, 1191], [4, 1261]],\n", + " [[1, 1967], [2, 1985]],\n", + " [[1, 4], [4, 2]],\n", " [[1, 468], [2, 489], [3, 482], [4, 469]],\n", + " [[2, 2384], [3, 2378], [0, 2378]],\n", " [[3, 2102], [4, 2115]],\n", - " [[1, 981], [3, 1007]],\n", - " [[0, 696], [3, 704]],\n", - " [[0, 2475], [2, 2501], [1, 2485]],\n", + " [[1, 2215], [2, 2221]],\n", + " [[0, 1993], [4, 1994]],\n", + " [[0, 474], [4, 437], [1, 443], [2, 466]],\n", " [[1, 1034], [2, 1090]],\n", - " [[0, 2785], [4, 2797]]]" + " [[0, 1835], [4, 1847]],\n", + " [[0, 2496], [4, 2498]]]" ] }, - "execution_count": 8, + "execution_count": 10, "metadata": {}, "output_type": "execute_result" } @@ -363,7 +619,7 @@ }, { "cell_type": "code", - "execution_count": 9, + "execution_count": 11, "metadata": { "pycharm": {} }, @@ -372,71 +628,72 @@ "name": "stdout", "output_type": "stream", "text": [ - "0 ['mackenzie', 'tremellen', '11-01-2947', 'maoe', 'melbourne', '79469.112', '']\n", - "2 ['mackenzie', 'dremellen', '11-01-2937', 'mals', 'mceloburne', '70469.122', '07 5988 5208']\n", - "4 ['macckenzie', 'tremellen', '', 'malr', 'melbovrne', '70469.122', '07 5988 5208']\n", - "\n", - "0 ['sophi', 'couljon', '12-03-1841', 'female', 'sydney', '80972.256', '04 3854 3784']\n", - "1 ['sophie', 'coulson', '12-03-1941', 'female', 'sydney', '80972.356', '04 3854 3784']\n", + "3 ['joshua', 'tremellen', '05-01-1988', 'male', 'sydney', '156320.936', '03 7154 7258']\n", + "4 ['joua', 'dreemleln', '05-01-1988', 'male', 'sydnru', '156320.936', '03 8154 7258']\n", "\n", - "0 ['jasmine', 'clarke', '04-00-2009', 'maje', 'melb0urme', '99853.100', '02 1507 1520']\n", - "3 ['jasmine', 'clarke', '04-09-2009', 'male', 'melbourne', '99853.200', '02 1507 1520']\n", - "1 ['jasminr', 'klarle', '04-99-2009', 'male', 'melbourne', '99863.200', '02 1507 1520']\n", + "0 ['katharine', 'procter', '03-02-2003', 'female', 'sydney', '116172.524', '08 4057 0794']\n", + "2 ['katharine', 'procter', '03-02-3003', 'femald', 'sydnev', '116172.524', '08 4057 0694']\n", "\n", - "0 ['zoel', 'ev', '06-09-1990', 'gemale', 'ysdnvvy', '183366.696', '02 5578 4520']\n", - "4 ['joel', 'everett', '06-09-1990', 'female', 'sydney', '183366.696', '02 5578 4520']\n", + "2 ['georgi3', \"wytk'ln\", '01-06-1927', 'male', 'sydriry', '35625.897', '08 2668 2433']\n", + "4 ['georgja', 'ytkkn', '01-06-1927', 'male', 'sydrirv', '35626.797', '08 2668 2433']\n", "\n", - "3 ['katelyn', 'matthets', '23-07-1977', '', 'melbourne', '118010.996', '07 9265 9238']\n", - "4 ['kateyln', 'matth4ws', '23-07-1978', 'male', 'melbounre', '118010.996', '07 9265 9238']\n", + "3 ['heath', 'ryan', '20-02-1949', 'male', 'canberra', '70507.784', '04 9913 1283']\n", + "4 ['heath', 'rya17', '20-02-2949', '', 'canbcera4', '70507.784', '04 9913 1283']\n", "\n", - "3 ['max', 'pontifex', '17-07-1930', 'male', 'melbourne', '42337.169', '04 8102 3785']\n", - "4 ['max', 'pontjef', '17-07-1930', 'male', 'melbovrne', '', '04 9102 3785']\n", - "\n", - "3 ['talrna', 'seilo', '06-09-1953', 'maoe', '', '55815.962', '03 8568 8024']\n", - "4 ['talezba', 'seib', '06-09-1953', 'male', '', '', '03 8567 8024']\n", - "\n", - "0 ['maddiaon', \"mel'ln\", '21-12-1945', 'male', 'melbouren', '', '02 1963 9316']\n", - "3 ['madklidon', 'meJi7|', '21-12-1945', 'maie', 'melbourne', '98312.180', '02 1964 9316']\n", + "2 ['siaitlyn', 'robezon', '31-12-1937', 'male', 'sdvnev', '105108.052', '07 2226 8544']\n", + "3 ['kaitlyn', 'robeson', '31-12-1937', 'maoe', 'sydney', '105107.051', '07 2226 8545']\n", "\n", "0 ['holly', 'reih', '22-06-2009', 'msle', 'syconey', '131184.582', '']\n", "3 ['holly', 'reicl', '21-06-2009', 'male', 'sydey', '131184.582', '']\n", "\n", - "1 ['jessica', 'peteahsen', '30-07-1940', 'malr', 'mel1>oume', '173806.400', '04 7005 4927']\n", - "4 ['jes5ica', 'peter5en', '30-08-1040', 'male', 'melbourne', '173806.400', '04 7005 49q7']\n", + "1 ['sasmine', 'bridqland', '20-06-1942', 'msle', 'syclney', '155539.109', '04 5020 4447']\n", + "3 ['ajsmine', 'bridgland', '20-06-2942', 'male', 's6dney', '155539.100', '04 5020 4447']\n", "\n", - "0 ['thomas', 'kositcin', '26-08-1939', 'male', 'melbourne', '43048.734', '07 4737 4471']\n", - "3 ['tomas', 'kosutcin', '26-08-1939', 'msle', 'melbourne', '43048.735', '07 4737 4471']\n", - "4 ['thornas', 'kos9tcin', '26-08-1939', 'male', 'melborune', '43948.734', '07 4737 4471']\n", + "1 ['ella', 'mo1davt5ev', '01-93-1985', 'male', 'pertj', '', '03 1427 7602']\n", + "2 ['ella', 'moldavtsev', '01-03-1985', 'male', 'perth', '171412.470', '03 1427 7602']\n", "\n", - "0 ['sofie', 'ny', '20-10-1933', 'fenale', '', '135685.300', '07 7905 6885']\n", - "4 ['stofia', 'ny', '20-10-q933', 'female', 'sydnev', '135685.300', '07 7905 6885']\n", + "0 ['courtney', 'mashberg', '30-05-1908', 'male', 'perth', '277942.921', '03 1022 1796']\n", + "2 ['courtne', 'mazhberg', '30-05-1908', 'mzle', 'perth', '277942.021', '03 1022 1796']\n", + "1 ['courtnev', 'mashbcrg', '30-05-1808', 'male', 'perth', '277941.921', '03 1022 1796']\n", + "4 ['kourtney', 'msshperg', '30-05-1907', 'male', 'per6b', '277942.921', '03 1022 1796']\n", "\n", - "1 ['sophie', 'mazx9ne', '25-03-2814', 'make', 'melbourne', '36878.525', '08 3679 2653']\n", - "2 ['sofie', 'mazzone', '25-03-2924', 'mals', 'melbourne', '36878.526', '08 3678 2653']\n", + "1 ['ary', 'relkos', '26-10-2003', 'male', 'melbonrrie', '136614.506', '02 2102 6467']\n", + "2 ['arru', 'rellos', '26-10-2093', 'male', 'melbouthd', '136614.506', '02 1192 6367']\n", + "\n", + "1 ['erin', 'kampgell', '29-12-1983', 'make', 'perth', '331476.598', '08 2996 1445']\n", + "4 ['wrin', 'kampbwll', '29-22-1983', 'male', 'pertl0', '331476.599', '08 2996 1435']\n", "\n", "1 ['stephnaie', 'goldsworthy', '03-06-1958', '', 'canbrrra', '83372.67q', '02 4093 4044']\n", "2 ['sttepbanie', 'goldsworthy', '03-06-1958', 'mald', 'canbedra', '83372.772', '02 4093 4044']\n", "3 ['stefanie', 'goldsworthy', '03-06-1958', 'male', 'camberra', '83372.572', '']\n", "4 ['stefanie', 'go|dsworthy', '03-06-1958', '', 'cabr:erra', '83372.672', '02 4093 4044']\n", "\n", + "2 ['ro5y', 'whitr', '30-12-1933', 'mal4', 'sydney', '91104.885', '02 2375 0175']\n", + "3 ['rory', 'white', '30-12-1933', 'male', 'sydney', '91104.785', '02 2375 0175']\n", + "0 ['mory', 'wh:te', '30-12-1033', 'male', 'sydhey', '91104.785', '02 2375 0175']\n", + "\n", "3 ['antony', 'riean', '18-01-1908', 'male', 'canberra', '59633.334', '07 2734 8270']\n", "4 ['anthnoy', 'ryari', '18-01-1908', 'male', 'cajberra', '58633.434', '07 2734 8370']\n", "\n", - "1 ['eiahn', 'greeti', '11-0e-1977', 'male', 'melbourne', '68538.966', '03 8798 1825']\n", - "3 ['eirn', 'kreen', '11-04-1977', 'male', 'meluourne', '68548.95y', '03 8798 1825']\n", + "1 ['ryan', 'allxhin', '20-10-2011', 'male', 'melbounre', '267843.384', '']\n", + "2 ['ryan', 'allchin', '20-10-2011', 'male', 'melbourne', '167843.484', '08 7962 6255']\n", "\n", - "0 ['aleesga', 'nkuyen', '14-06-1068', 'male', 'melbourrie', '122053.275', '02 6678 5223']\n", - "3 ['aleeSa', 'nguyen', '14-o6-1968', 'male', 'mtelbournr', '122053.265', '02 6678 5223']\n", + "0 ['haery', 'reklos', '26-10-2003', 'malw', 'mlebourne', '136614.506', '02 1102 6467']\n", + "4 ['harey', 'eelloz', '26-10-2003', 'mame', 'melbourne', '136614.506', '02 110w 6467']\n", "\n", - "0 ['benjamin', 'bishop', '25-11-1980', 'male', 'sydney', '95170.703', '04 3415 3977']\n", - "2 [\"benzam'ln\", 'bish9p', '25-11-1980', 'msle', 'sydn3v', '95170.703', '04 3415 3977']\n", - "1 ['bennie', 'bishop', '25-11-1980', 'mald', '', '95180.703', '04 3415 3977']\n", + "0 ['larizsa', 'morrison', '16-04-2960', 'maje', 'melbouene', '196846.869', '04 3434 7115']\n", + "4 ['larissa', 'morrison', '16-04-1960', 'male', 'melbourne', '196846.869', '04 3434 7115']\n", + "1 ['lairssa', 'mornson', '16-04-1960', 'male', '', '196836.869', '04 3434 7115']\n", + "2 ['larissa', 'morrijon', '16-04-1960', 'make', '', '196846.859', '04 3434 7115']\n", "\n", "1 [\"ke'Irx\", 'chappel', '19-05-1966', 'male', '', '138869.396', '']\n", "2 ['keira', 'chapepl', '19-05-1966', 'male', '', '148869.296', '']\n", "\n", - "0 ['deagxan', 'zaffino', '22-01-1979', 'femame', 'sydne7', '99746.221', '04 1534 02e5']\n", - "4 ['teagan', 'zaffino', '22-01-1979', 'female', 'sydney', '99746.221', '04 1534 0225']\n", + "0 ['meagan', 'vrahn', '26-05-2950', '', 'melbourne', '154858.094', '04 1222 9254']\n", + "4 ['meagan', 'frahn', '26-05-1950', 'male', 'melbourne', '154856.094', '04 1222 9254']\n", + "\n", + "0 ['zoel', 'ev', '06-09-1990', 'gemale', 'ysdnvvy', '183366.696', '02 5578 4520']\n", + "4 ['joel', 'everett', '06-09-1990', 'female', 'sydney', '183366.696', '02 5578 4520']\n", "\n" ] } @@ -481,7 +738,7 @@ }, { "cell_type": "code", - "execution_count": 10, + "execution_count": 12, "metadata": { "pycharm": {} }, From 4b657315b503893839b128a8735bf45e1471dde0 Mon Sep 17 00:00:00 2001 From: Guillaume Smith Date: Tue, 26 Nov 2019 13:02:37 +1100 Subject: [PATCH 12/12] It had been reverted back in a previous commit. --- docs/tutorial/Similarity Scores.ipynb | 69 ++++++++++++--------------- 1 file changed, 31 insertions(+), 38 deletions(-) diff --git a/docs/tutorial/Similarity Scores.ipynb b/docs/tutorial/Similarity Scores.ipynb index 328253bc..5a2c4c9f 100644 --- a/docs/tutorial/Similarity Scores.ipynb +++ b/docs/tutorial/Similarity Scores.ipynb @@ -79,7 +79,7 @@ "name": "stdout", "output_type": "stream", "text": [ - "Testing anonlink-entity-service hosted at https://testing.es.data61.xyz\n" + "Testing anonlink-entity-service hosted at http://0.0.0.0:8851\n" ] } ], @@ -101,7 +101,7 @@ "name": "stdout", "output_type": "stream", "text": [ - "{\"project_count\": 7082, \"rate\": 2845548, \"status\": \"ok\"}\r\n" + "{\"project_count\": 4, \"rate\": 32036360, \"status\": \"ok\"}\r\n" ] } ], @@ -193,7 +193,7 @@ " \n", " \n", " \n", - " rec-1070-org\n", + " rec-1070-org\n", " michaela\n", " neumann\n", " 8\n", @@ -206,7 +206,7 @@ " 5304218\n", " \n", " \n", - " rec-1016-org\n", + " rec-1016-org\n", " courtney\n", " painter\n", " 12\n", @@ -219,7 +219,7 @@ " 4066625\n", " \n", " \n", - " rec-4405-org\n", + " rec-4405-org\n", " charles\n", " green\n", " 38\n", @@ -313,7 +313,7 @@ "name": "stdout", "output_type": "stream", "text": [ - "Overwriting /var/folders/mw/21b9jb5d1c9_3_z0dq7hpx1m00j_0b/T/tmpgayhu51z\n" + "Overwriting /tmp/tmp23q54lqu\n" ] } ], @@ -555,17 +555,17 @@ "name": "stdout", "output_type": "stream", "text": [ - "Credentials will be saved in /var/folders/mw/21b9jb5d1c9_3_z0dq7hpx1m00j_0b/T/tmp6fab0949\n", + "Credentials will be saved in /tmp/tmp6c2zwr2b\n", "\u001b[31mProject created\u001b[0m\n" ] }, { "data": { "text/plain": [ - "{'project_id': '224ac23b0c6ba6c661ade4082d0741fc94b9af3ebf09f9fd',\n", - " 'result_token': '331d33ba45f4b636aac944ba1ba52f2602a3f18bcff9ff25',\n", - " 'update_tokens': ['8826451e71fb0ea6f1b07d7e54264ab7477f4d97898f8ce5',\n", - " '122076e7ef37ccbedda55d028210d4e5a14d9441329ea492']}" + "{'project_id': '4d499f0fd3fb41c7dca684ee923ee056daff1d1d0dea0e69',\n", + " 'result_token': '644530073d94cec15ee0b6955192e6ec66e4d5b6a7c59ec4',\n", + " 'update_tokens': ['aaec135b6729e8234b2d974e99b47df48a5d2b83b1e0e5fb',\n", + " 'f154012b6f6d48700490f964525633ff8efaa18f200ec7c5']}" ] }, "execution_count": 8, @@ -618,8 +618,8 @@ "name": "stdout", "output_type": "stream", "text": [ - "\u001b[31mCLK data written to /var/folders/mw/21b9jb5d1c9_3_z0dq7hpx1m00j_0b/T/tmp2s_mj16v.json\u001b[0m\n", - "\u001b[31mCLK data written to /var/folders/mw/21b9jb5d1c9_3_z0dq7hpx1m00j_0b/T/tmpvlxznb1y.json\u001b[0m\n" + "\u001b[31mCLK data written to /tmp/tmp75ho6ywb.json\u001b[0m\n", + "\u001b[31mCLK data written to /tmp/tmp1rw5bksd.json\u001b[0m\n" ] } ], @@ -818,16 +818,16 @@ "name": "stdout", "output_type": "stream", "text": [ - "[76, 2345, 1.0]\n", - "[83, 3439, 1.0]\n", - "[103, 863, 1.0]\n", - "[154, 2391, 1.0]\n", - "[177, 4247, 1.0]\n", - "[192, 1176, 1.0]\n", - "[270, 4516, 1.0]\n", - "[312, 1253, 1.0]\n", - "[407, 3743, 1.0]\n", - "[670, 3550, 1.0]\n" + "[[0, 76], [1, 2345], 1.0]\n", + "[[0, 83], [1, 3439], 1.0]\n", + "[[0, 103], [1, 863], 1.0]\n", + "[[0, 154], [1, 2391], 1.0]\n", + "[[0, 177], [1, 4247], 1.0]\n", + "[[0, 192], [1, 1176], 1.0]\n", + "[[0, 270], [1, 4516], 1.0]\n", + "[[0, 312], [1, 1253], 1.0]\n", + "[[0, 407], [1, 3743], 1.0]\n", + "[[0, 670], [1, 3550], 1.0]\n" ] } ], @@ -971,13 +971,13 @@ }, { "cell_type": "code", - "execution_count": 20, + "execution_count": 21, "metadata": {}, "outputs": [], "source": [ "scores_matches = []\n", "scores_non_matches = []\n", - "for a, b, score in data:\n", + "for (_, a), (_, b), score in data:\n", " if score < 0.79:\n", " continue\n", " if (a, b) in true_matches:\n", @@ -988,7 +988,7 @@ }, { "cell_type": "code", - "execution_count": 21, + "execution_count": 22, "metadata": {}, "outputs": [ { @@ -1024,7 +1024,7 @@ }, { "cell_type": "code", - "execution_count": 22, + "execution_count": 23, "metadata": {}, "outputs": [ { @@ -1041,13 +1041,6 @@ " --apikey=\"{credentials['result_token']}\" \\\n", " --server=\"{url}\"" ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] } ], "metadata": { @@ -1066,18 +1059,18 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.7.4" + "version": "3.7.3" }, "pycharm": { "stem_cell": { "cell_type": "raw", - "source": [], "metadata": { "collapsed": false - } + }, + "source": [] } } }, "nbformat": 4, "nbformat_minor": 2 -} \ No newline at end of file +}