Skip to content
Merged
Show file tree
Hide file tree
Changes from 4 commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion backend/requirements.txt
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
anonlink==0.12.5
bitmath==1.3.1.2
celery==4.3.0
clkhash==0.14.0
clkhash==0.15.0
colorama==0.4.1 # required for structlog
connexion==1.4
Flask-Opentracing==0.2.0
Expand Down
147 changes: 79 additions & 68 deletions docs/tutorial/Permutations.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -82,7 +82,7 @@
"name": "stdout",
"output_type": "stream",
"text": [
"{\"project_count\": 1021, \"rate\": 2453247, \"status\": \"ok\"}\n"
"{\"project_count\": 5938, \"rate\": 2118828, \"status\": \"ok\"}\r\n"
]
}
],
Expand Down Expand Up @@ -294,7 +294,7 @@
"name": "stdout",
"output_type": "stream",
"text": [
"Overwriting /tmp/tmptfalxkiq\n"
"Overwriting /tmp/tmp2d7l4ief\n"
]
}
],
Expand Down Expand Up @@ -399,17 +399,17 @@
"name": "stdout",
"output_type": "stream",
"text": [
"Credentials will be saved in /tmp/tmpyr8dc2pf\n",
"Credentials will be saved in /tmp/tmp5kxg8nky\n",
"\u001b[31mProject created\u001b[0m\n"
]
},
{
"data": {
"text/plain": [
"{'project_id': 'b8211d1450c8d0d631dbdc1fb482af106b8cbdebed5b7fd3',\n",
" 'result_token': '8fe1fc01f7ac3a3406d1e031b7d120800aa6460d0da62abb',\n",
" 'update_tokens': ['1c39c6972626bd34729812f0b9cf6e467461824dbbd0682c',\n",
" '901c12061cf621b67df5b9de2719b8806636364d3fdc1765']}"
"{'project_id': '61ad07b11de00b335e4efdb440d0da061b9e89f0b7d25006',\n",
" 'result_token': 'f75ec8d9142b2ac43c1f43899b663113a260e97a3fc23bca',\n",
" 'update_tokens': ['25df4927eccfb37d11b1ab72dad0abc9b5d9abae9e73f0fc',\n",
" '3aafac5ff5c5024a9f770fb2dc8d87a8ab73a504c0a2a03c']}"
]
},
"execution_count": 7,
Expand Down Expand Up @@ -446,7 +446,7 @@
"At the moment both data providers have *raw* personally identifying information. We first have to generate CLKs from the raw entity information. We need:\n",
"- the *clkhash* library\n",
"- the linkage schema from above\n",
"- and two secret passwords which are only known to Alice and Bob. (here: `horse` and `staple`)\n",
"- and a secret which is only known to Alice and Bob. (here: `my_secret`)\n",
"\n",
"Please see the [clkhash](https://clkhash.readthedocs.io/) documentation for further details on this."
]
Expand All @@ -464,16 +464,14 @@
"name": "stdout",
"output_type": "stream",
"text": [
"generating CLKs: 100%|█| 5.00k/5.00k [00:01<00:00, 1.32kclk/s, mean=765, std=37.1]\n",
"\u001b[31mCLK data written to /tmp/tmpc_4k553j.json\u001b[0m\n",
"generating CLKs: 100%|█| 5.00k/5.00k [00:01<00:00, 4.28kclk/s, mean=756, std=43.3]\n",
"\u001b[31mCLK data written to /tmp/tmpv7eo2tfp.json\u001b[0m\n"
"\u001b[31mCLK data written to /tmp/tmpdumz7c8u.json\u001b[0m\n",
"\u001b[31mCLK data written to /tmp/tmpq70k1o_9.json\u001b[0m\n"
]
}
],
"source": [
"!clkutil hash \"{a_csv.name}\" horse staple \"{schema.name}\" \"{a_clks.name}\"\n",
"!clkutil hash \"{b_csv.name}\" horse staple \"{schema.name}\" \"{b_clks.name}\""
"!clkutil hash \"{a_csv.name}\" my_secret \"{schema.name}\" \"{a_clks.name}\"\n",
"!clkutil hash \"{b_csv.name}\" my_secret \"{schema.name}\" \"{b_clks.name}\""
]
},
{
Expand All @@ -498,22 +496,33 @@
"name": "stdout",
"output_type": "stream",
"text": [
"Usage: clkutil upload [OPTIONS] CLK_JSON\n",
"\n",
" Upload CLK data to entity matching server.\n",
"\n",
" Given a json file containing hashed clk data as CLK_JSON, upload to the\n",
" entity resolution service.\n",
"\n",
" Use \"-\" to read from stdin.\n",
"\n",
"Options:\n",
" --project TEXT Project identifier\n",
" --apikey TEXT Authentication API key for the server.\n",
" --server TEXT Server address including protocol\n",
" -o, --output FILENAME\n",
" -v, --verbose Script is more talkative\n",
" --help Show this message and exit.\n"
"Usage: clkutil upload [OPTIONS] CLK_JSON\r\n",
"\r\n",
" Upload CLK data to entity matching server.\r\n",
"\r\n",
" Given a json file containing hashed clk data as CLK_JSON, upload to the\r\n",
" entity resolution service.\r\n",
"\r\n",
" Use \"-\" to read from stdin.\r\n",
"\r\n",
"Options:\r\n",
" --project TEXT Project identifier\r\n",
" --apikey TEXT Authentication API key for the server.\r\n",
" -o, --output FILENAME\r\n",
" --server TEXT Server address including protocol. Default\r\n",
" https://testing.es.data61.xyz.\r\n",
" --retry-multiplier INTEGER <milliseconds> If receives a 503 from\r\n",
" server, minimum waiting time before\r\n",
" retrying. Default 100.\r\n",
" --retry-exponential-max INTEGER\r\n",
" <milliseconds> If receives a 503 from\r\n",
" server, maximum time interval between\r\n",
" retries. Default 10000.\r\n",
" --retry-max-time INTEGER <milliseconds> If receives a 503 from\r\n",
" server, retry only within this period.\r\n",
" Default 20000.\r\n",
" -v, --verbose Script is more talkative\r\n",
" --help Show this message and exit.\r\n"
]
}
],
Expand Down Expand Up @@ -653,7 +662,8 @@
"outputs": [],
"source": [
"import requests\n",
"import clkhash.rest_client\n",
"from clkhash.rest_client import RestClient\n",
"from clkhash.rest_client import format_run_status\n",
"\n",
"from IPython.display import clear_output"
]
Expand All @@ -677,9 +687,10 @@
}
],
"source": [
"for update in clkhash.rest_client.watch_run_status(url, project_id, run_id, credentials['result_token'], timeout=300):\n",
"rest_client = RestClient(url)\n",
"for update in rest_client.watch_run_status(project_id, run_id, credentials['result_token'], timeout=300):\n",
" clear_output(wait=True)\n",
" print(clkhash.rest_client.format_run_status(update))"
" print(format_run_status(update))"
]
},
{
Expand Down Expand Up @@ -759,7 +770,7 @@
{
"data": {
"text/plain": [
"4858"
"4851"
]
},
"execution_count": 18,
Expand Down Expand Up @@ -815,7 +826,7 @@
{
"data": {
"text/plain": [
"[2333, 1468, 559, 274, 653, 3385, 278, 3568, 3617, 4356]"
"[1704, 3246, 227, 2913, 1848, 2942, 1358, 3469, 2025, 1349]"
]
},
"execution_count": 20,
Expand Down Expand Up @@ -849,7 +860,7 @@
{
"data": {
"text/plain": [
"[2083, 1106, 3154, 1180, 2582, 375, 3533, 1046, 316, 2427]"
"[3095, 4173, 2439, 29, 3016, 493, 764, 746, 1981, 2840]"
]
},
"execution_count": 21,
Expand Down Expand Up @@ -923,16 +934,16 @@
{
"data": {
"text/plain": [
"['rec-2689-org,ainsley,robison,23,atherton street,villa 1/4,deer park,3418,nsw,19310531,4102867\\n',\n",
" 'rec-1056-org,chloe,imgraben,47,curlewis crescent,dragon rising,burleigh waters,2680,qld,19520516,6111417\\n',\n",
" 'rec-1820-org,liam,cullens,121,chandler street,the burrows,safety bay,3073,qld,19910811,7828812\\n',\n",
" 'rec-2192-org,ellie,fearnall,31,fishburn street,colbara,cherrybrook,5171,wa,,7745948\\n',\n",
" 'rec-2696-org,campbell,nguyen,6,diselma place,villa 2,collinswood,4343,nsw,19630325,2861961\\n',\n",
" 'rec-968-org,aidan,blake,15,namatjira drive,cooramin,dromana,4074,vic,19270928,4317464\\n',\n",
" 'rec-3833-org,nicholas,clarke,13,gaylard place,tryphinia view,wetherill park,2810,nsw,19041223,3927795\\n',\n",
" 'rec-4635-org,isabella,white,8,cooling place,,rosebud,6151,sa,19990911,2206317\\n',\n",
" 'rec-3549-org,harry,thorpe,11,kambalda crescent,louisa tor 4,angaston,2777,qld,19421128,2701790\\n',\n",
" 'rec-1220-org,lauren,weltman,6,tewksbury circuit,heritage estate,evans head,6330,nsw,19840930,9462453\\n']"
"['rec-126-org,nikki,jaric,13,renmark street,parish lorne,chester hill,2106,nsw,19520401,9272010\\n',\n",
" 'rec-3413-org,amber,durnin,41,hindmarsh drive,walmount,knoxfield,3634,vic,19251010,8760281\\n',\n",
" 'rec-2774-org,dylan,herbert,150,andrews street,brentwood vlge,sheidow park,5067,vic,19430105,4521571\\n',\n",
" 'rec-249-org,lachlan,shepherd,1,grainger circuit,willandra,adaminaby,7250,nsw,19080122,3139543\\n',\n",
" 'rec-4870-org,carlin,garcia,46,von guerard crescent,,clarendon,2140,nsw,19241024,1512461\\n',\n",
" 'rec-1327-org,emma,copperstone,5,decima circuit,crower,avoca north,2166,nsw,19451208,1538637\\n',\n",
" 'rec-959-org,zac,hand,18,kambalda crescent,mckinnon glen,east maitland,4500,sa,19970725,2709860\\n',\n",
" 'rec-2971-org,jack,highet,26,mathieson crescent,,whittington,3064,vic,19650507,6260000\\n',\n",
" \"rec-4637-org,kydan,gigney,64,o'rourke street,timdoolin,shellharbour,4059,nsw,19760409,3279351\\n\",\n",
" 'rec-2003-org,mitchell,webb,15,denovan circuit,villa 3,connewarre,5091,nsw,19801006,8786441\\n']"
]
},
"execution_count": 24,
Expand All @@ -956,16 +967,16 @@
{
"data": {
"text/plain": [
"['rec-2689-dup-0,ainsley,labalck,23,atherto n street,villa 1/4,deer park,3418,nsw,19310531,4102867\\n',\n",
" 'rec-1056-dup-0,james,imgrapen,47,curlewiscrescent,dragon rising,burleigh waters,2680,qld,19520516,6111417\\n',\n",
" 'rec-1820-dup-0,liam,cullens,121,chandlerw street,the burrows,safety bay,3073,qld,19910811,7828812\\n',\n",
" 'rec-2192-dup-0,elpie,fearnull,31,fishbunestreet,,cherrybrook,5171,wa,,7745948\\n',\n",
" 'rec-2696-dup-0,jenna,nguyen,85,diselmaplace,villz2,collinswood,4343,nsw,19630325,2861961\\n',\n",
" 'rec-968-dup-0,aidan,blake,15,namatjifra drive,cooramin,dromana,4074,vic,19270928,4317464\\n',\n",
" 'rec-3833-dup-0,nicholas,clarke,,gaylard place,tryphinia view,wetherill park,2810,nsw,19041223,3972795\\n',\n",
" 'rec-4635-dup-0,isaeblla,white,8,cooling place,massey green,rosebud,6151,sa,19990911,2206317\\n',\n",
" 'rec-3549-dup-0,taylor,thorpe,11,kambalda c rescent,louisa tor 4,angasgon,2777,qld,19421128,2701790\\n',\n",
" 'rec-1220-dup-0,lauren,welman,6,tewksburl circuit,heritage estate,evans head,6330,nsw,19840930,9462453\\n']"
"['rec-126-dup-0,nikki,jaruic,13,renmark street,parishlorne,chester hill,2106,nsw,19520401,9272910\\n',\n",
" 'rec-3413-dup-0,amber,durnin,41,hindmarsh drive,walmoutn,knoxfield,3643,vic,19521010,8760281\\n',\n",
" 'rec-2774-dup-0,dylan,herbert,150,andrews street,brentwoo dvlge,sheido wpark,5067,vic,19430105,4521571\\n',\n",
" 'rec-249-dup-0,lachlan,shephaerd,1,grainger circuit,willandra,adaminaby,7250,nsw,19080122,3139543\\n',\n",
" 'rec-4870-dup-0,carlin,munforti,46,von guerard crescent,,clarendon,2140,nsw,19241024,1512461\\n',\n",
" 'rec-1327-dup-0,emma,copperstone,5,decima circuit,crwer,avoca north,2166,nsw,19451208,1536837\\n',\n",
" 'rec-959-dup-0,zac,,18,kambalda crescent,mckinnon glen,east mailand,4500,sa,19970725,2709860\\n',\n",
" 'rec-2971-dup-0,jack,highet,26,mathieson crescent,,kaleen,3064,vic,19650507,6260000\\n',\n",
" \"rec-4637-dup-0,aurora,gigney,64,o'rourke street,timdoolin,shellharbour,4059,nsw,19760409,7169291\\n\",\n",
" 'rec-2003-dup-0,mitchell,,15,denovan circuit,villa 3,,5091,nsw,19801006,8786441\\n']"
]
},
"execution_count": 25,
Expand Down Expand Up @@ -1003,16 +1014,16 @@
"name": "stdout",
"output_type": "stream",
"text": [
"Ainsley Robison (rec-2689-org) =? Ainsley Labalck (rec-2689-dup-0)\n",
"Chloe Imgraben (rec-1056-org) =? James Imgrapen (rec-1056-dup-0)\n",
"Liam Cullens (rec-1820-org) =? Liam Cullens (rec-1820-dup-0)\n",
"Ellie Fearnall (rec-2192-org) =? Elpie Fearnull (rec-2192-dup-0)\n",
"Campbell Nguyen (rec-2696-org) =? Jenna Nguyen (rec-2696-dup-0)\n",
"Aidan Blake (rec-968-org) =? Aidan Blake (rec-968-dup-0)\n",
"Nicholas Clarke (rec-3833-org) =? Nicholas Clarke (rec-3833-dup-0)\n",
"Isabella White (rec-4635-org) =? Isaeblla White (rec-4635-dup-0)\n",
"Harry Thorpe (rec-3549-org) =? Taylor Thorpe (rec-3549-dup-0)\n",
"Lauren Weltman (rec-1220-org) =? Lauren Welman (rec-1220-dup-0)\n"
"Nikki Jaric (rec-126-org) =? Nikki Jaruic (rec-126-dup-0)\n",
"Amber Durnin (rec-3413-org) =? Amber Durnin (rec-3413-dup-0)\n",
"Dylan Herbert (rec-2774-org) =? Dylan Herbert (rec-2774-dup-0)\n",
"Lachlan Shepherd (rec-249-org) =? Lachlan Shephaerd (rec-249-dup-0)\n",
"Carlin Garcia (rec-4870-org) =? Carlin Munforti (rec-4870-dup-0)\n",
"Emma Copperstone (rec-1327-org) =? Emma Copperstone (rec-1327-dup-0)\n",
"Zac Hand (rec-959-org) =? Zac (rec-959-dup-0)\n",
"Jack Highet (rec-2971-org) =? Jack Highet (rec-2971-dup-0)\n",
"Kydan Gigney (rec-4637-org) =? Aurora Gigney (rec-4637-dup-0)\n",
"Mitchell Webb (rec-2003-org) =? Mitchell (rec-2003-dup-0)\n"
]
}
],
Expand Down Expand Up @@ -1054,9 +1065,9 @@
"name": "stdout",
"output_type": "stream",
"text": [
"Found 4858 correct matches out of 5000. Incorrectly linked 0 matches.\n",
"Found 4851 correct matches out of 5000. Incorrectly linked 0 matches.\n",
"Precision: 100.0%\n",
"Recall: 97.2%\n"
"Recall: 97.0%\n"
]
}
],
Expand Down
Loading