Fix 462 #463
Changes from 10 commits
Database insertion helpers:

    @@ -1,7 +1,7 @@
     import psycopg2
     import psycopg2.extras

    -from entityservice.database.util import execute_returning_id, logger
    +from entityservice.database.util import execute_returning_id, logger, query_db
     from entityservice.errors import RunDeleted

    @@ -34,9 +34,9 @@ def insert_new_run(db, run_id, project_id, threshold, name, type, notes=''):
     def insert_dataprovider(cur, auth_token, project_id):
         sql_query = """
             INSERT INTO dataproviders
    -        (project, token)
    +        (project, token, uploaded)
             VALUES
    -        (%s, %s)
    +        (%s, %s, 'not_started')
             RETURNING id
             """
         return execute_returning_id(cur, sql_query, [project_id, auth_token])

    @@ -54,10 +54,8 @@ def insert_encoding_metadata(db, clks_filename, dp_id, receipt_token, count):
         with db.cursor() as cur:
             cur.execute(sql_insertion_query, [dp_id, receipt_token, clks_filename, count, 'pending'])

    -    set_dataprovider_upload_state(db, dp_id, True)
    -
    -def set_dataprovider_upload_state(db, dp_id, state=True):
    +def set_dataprovider_upload_state(db, dp_id, state='error'):
         logger.debug("Setting dataprovider {} upload state to {}".format(dp_id, state))
         sql_update = """
             UPDATE dataproviders

    @@ -261,3 +259,25 @@ def get_created_runs_and_queue(db, project_id):
         if res is None:
             res = []
         return res
    +
    +
    +def get_and_set_dataprovider_upload_state_in_progress(db, dp_id):
    +    """
    +    This method returns true if it was able to update the uploaded status of this dataprovider from false to true.
    +    It return false otherwise (i.e. the state was already set to true).
    +    """
    +    logger.debug("Setting dataprovider {} upload state to True".format(dp_id))
    +    sql_update = """
    +        UPDATE dataproviders
    +        SET uploaded = 'in_progress'
    +        WHERE id = %s and uploaded != 'done' and uploaded != 'in_progress'
    +        RETURNING id, uploaded
    +        """
    +    query_response = query_db(db, sql_update, [dp_id])
    +    print(query_response)
    +    length = len(query_response)
    +    if length < 1:
    +        return False
    +    elif length > 1:
    +        raise ValueError("Houston, we have a problem!!! This dataprovider can upload multiple times its clks.")
    +    return True
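The new helper relies on PostgreSQL executing the conditional `UPDATE ... RETURNING` atomically, so only one request can move a dataprovider out of `not_started` (or `error`) into `in_progress`. Below is a minimal, self-contained sketch of that claim pattern using plain psycopg2; the table and column names mirror the diff above, while the function name and connection handling are illustrative rather than the project's actual API.

```python
def claim_upload_slot(conn, dp_id):
    """Atomically claim the upload slot for one dataprovider.

    Returns True only for the single caller that moves the row into
    'in_progress'; any concurrent or repeated caller gets False.
    Assumes `conn` is an open psycopg2 connection to a database with the
    dataproviders table and UPLOADEDSTATE enum from this PR.
    """
    sql = """
        UPDATE dataproviders
        SET uploaded = 'in_progress'
        WHERE id = %s AND uploaded NOT IN ('done', 'in_progress')
        RETURNING id
    """
    with conn.cursor() as cur:
        cur.execute(sql, [dp_id])
        claimed = cur.fetchone() is not None
    conn.commit()
    return claimed
```

Because the `WHERE` clause excludes rows already in `'done'` or `'in_progress'`, the statement returns at most one row, which is the invariant the length checks in the new helper above are guarding.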
Database schema (SQL):

    @@ -85,6 +85,12 @@ RETURNS bool AS $$
     SELECT $1.state = 'completed'
     $$ STABLE LANGUAGE SQL;

    +CREATE TYPE UPLOADEDSTATE AS ENUM (
Collaborator: wow, now we have …

Author (Contributor): That was indeed a pickle. I will add comments on the …

Author (Contributor): I'll finally keep …
    +    'not_started',
    +    'in_progress',
    +    'done',
    +    'error'
    +);
Collaborator: nice
    +
     CREATE TABLE dataproviders (
         id SERIAL PRIMARY KEY,

    @@ -93,7 +99,7 @@ CREATE TABLE dataproviders (
         token CHAR(48) NOT NULL UNIQUE,

         -- Set after the bloom filter data has been added
    -    uploaded BOOL NOT NULL DEFAULT FALSE,
    +    uploaded UPLOADEDSTATE NOT NULL,

         project CHAR(48) REFERENCES projects (project_id) on DELETE CASCADE
     );
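The new `UPLOADEDSTATE` enum replaces the old boolean flag with a small upload lifecycle. Here is a hedged sketch of the transitions this PR appears to allow; only the four SQL values come from the diff, and the Python names and transition table are illustrative.

```python
from enum import Enum


class UploadedState(Enum):
    NOT_STARTED = 'not_started'
    IN_PROGRESS = 'in_progress'
    DONE = 'done'
    ERROR = 'error'


# Transitions implied by the handler changes in this PR: a provider is claimed
# into 'in_progress' from 'not_started' or 'error', then ends in 'done' on
# success or 'error' on failure (which re-enables the upload token).
ALLOWED_TRANSITIONS = {
    UploadedState.NOT_STARTED: {UploadedState.IN_PROGRESS},
    UploadedState.ERROR: {UploadedState.IN_PROGRESS},
    UploadedState.IN_PROGRESS: {UploadedState.DONE, UploadedState.ERROR},
    UploadedState.DONE: set(),
}


def can_transition(current: UploadedState, target: UploadedState) -> bool:
    return target in ALLOWED_TRANSITIONS[current]
```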
End-to-end upload tests:

    @@ -206,15 +206,15 @@ def test_project_binary_data_invalid_buffer_size(
         pid = new_project_data['project_id']
         file_path = os.path.join(os.path.dirname(os.path.realpath(__file__)), 'testdata/clks_128B_1k.bin')

    -    upload_binary_data_from_file(requests, file_path, pid, new_project_data['update_tokens'][0], -1, status=400)
    +    upload_binary_data_from_file(requests, file_path, pid, new_project_data['update_tokens'][0], -1, expected_status_code=400)

         # Now try upload with valid hash-count but doesn't match actual size:
    -    upload_binary_data_from_file(requests, file_path, pid, new_project_data['update_tokens'][0], 1000000, status=400)
    -    upload_binary_data_from_file(requests, file_path, pid, new_project_data['update_tokens'][0], 3, status=400)
    +    upload_binary_data_from_file(requests, file_path, pid, new_project_data['update_tokens'][0], 1000000, expected_status_code=400)
    +    upload_binary_data_from_file(requests, file_path, pid, new_project_data['update_tokens'][0], 3, expected_status_code=400)

         # Now try the minimum upload size (1 clk)
         file_path = os.path.join(os.path.dirname(os.path.realpath(__file__)), 'testdata/single_clk.bin')
    -    upload_binary_data_from_file(requests, file_path, pid, new_project_data['update_tokens'][0], 1, status=201)
    +    upload_binary_data_from_file(requests, file_path, pid, new_project_data['update_tokens'][0], 1, expected_status_code=201)


     def test_project_single_party_empty_data_upload(

    @@ -235,3 +235,62 @@ def test_project_single_party_empty_data_upload(
         )
         assert r.status_code == 400
    +
    +
    +def test_project_upload_wrong_authentication(requests, valid_project_params):
    +    """
    +    Test that a token cannot be re-used to upload clks.
    +    So first, create a project, upload clks with a token (which should work), and then re-upload clks using the same
    +    token which should return a 403 error.
    +    """
    +    expected_number_parties = get_expected_number_parties(valid_project_params)
    +    if expected_number_parties < 2:
    +        # The test is not made for less than two parties
    +        return
    +
    +    new_project_data = requests.post(url + '/projects',
    +                                     json={
    +                                         'schema': {},
    +                                         **valid_project_params
    +                                     }).json()
    +    update_tokens = new_project_data['update_tokens']
    +
    +    assert len(update_tokens) == expected_number_parties
    +
    +    small_file_path = os.path.join(os.path.dirname(os.path.realpath(__file__)), 'testdata/clks_128B_1k.bin')
    +    token_to_reuse = update_tokens[0]
    +    upload_binary_data_from_file(
    +        requests,
    +        small_file_path, new_project_data['project_id'], token_to_reuse, 1000)
    +
    +    upload_binary_data_from_file(
    +        requests,
    +        small_file_path, new_project_data['project_id'], token_to_reuse, 1000, expected_status_code=403)
    +
    +
    +def test_project_upload_fail_then_works(requests, valid_project_params):
    +    """
    +    Test that a token can be re-used to upload clks after the upload failed.
    +    So first, create a project, upload clks with a token (which should NOT work with a 400 error),
    +    and then re-upload clks using the same token which should work.
    +    """
    +    expected_number_parties = get_expected_number_parties(valid_project_params)
    +
    +    new_project_data = requests.post(url + '/projects',
    +                                     json={
    +                                         'schema': {},
    +                                         **valid_project_params
    +                                     }).json()
    +    update_tokens = new_project_data['update_tokens']
    +
    +    assert len(update_tokens) == expected_number_parties
    +
    +    small_file_path = os.path.join(os.path.dirname(os.path.realpath(__file__)), 'testdata/clks_128B_1k.bin')
    +    token_to_reuse = update_tokens[0]
    +    # This should fail as we are not providing the good count.
    +    upload_binary_data_from_file(
    +        requests,
    +        small_file_path, new_project_data['project_id'], token_to_reuse, 2000, expected_status_code=400)
    +
    +    upload_binary_data_from_file(
    +        requests,
    +        small_file_path, new_project_data['project_id'], token_to_reuse, 1000)
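These tests lean on an `upload_binary_data_from_file` helper from the e2e test utilities whose keyword argument was renamed from `status` to `expected_status_code` in this PR. A hedged sketch of what such a helper could look like follows; `requests` and `url` are the fixture and base URL the tests already use, while the endpoint path, the `Authorization` and `Hash-Count` headers, and the fixed `Hash-Size` value are assumptions for illustration only (`Hash-Size` itself is named in the handler's error message).

```python
def upload_binary_data_from_file(requests, file_path, project_id, token, count,
                                 expected_status_code=201):
    """Post a binary CLK file and assert the HTTP status (illustrative sketch)."""
    with open(file_path, 'rb') as f:
        data = f.read()
    response = requests.post(
        url + '/projects/{}/clks'.format(project_id),  # assumed endpoint path
        headers={
            'Authorization': token,                     # assumed auth header
            'Content-Type': 'application/octet-stream',
            'Hash-Count': str(count),                   # assumed header name
            'Hash-Size': '128',                         # assumed to match clks_128B_1k.bin
        },
        data=data)
    assert response.status_code == expected_status_code
    return response
```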
Upload view handler:

    @@ -121,57 +121,69 @@ def project_clks_post(project_id):
         with DBConn() as conn:
             dp_id = db.get_dataprovider_id(conn, token)
             project_encoding_size = db.get_project_schema_encoding_size(conn, project_id)
    +        upload_state_updated = db.get_and_set_dataprovider_upload_state_in_progress(conn, dp_id)
    +
    +    if not upload_state_updated:
    +        return safe_fail_request(403, "This token has already been used to upload clks.")

         log = log.bind(dp_id=dp_id)
         log.info("Receiving CLK data.")
         receipt_token = None

         with opentracing.tracer.start_span('upload-clk-data', child_of=parent_span) as span:
             span.set_tag("project_id", project_id)
    -        if headers['Content-Type'] == "application/json":
    -            span.set_tag("content-type", 'json')
    -            # TODO: Previously, we were accessing the CLKs in a streaming fashion to avoid parsing the json in one hit. This
    -            # enables running the web frontend with less memory.
    -            # However, as connexion is very, very strict about input validation when it comes to json, it will always
    -            # consume the stream first to validate it against the spec. Thus the backflip to fully reading the CLks as
    -            # json into memory. -> issue #184
    -
    -            receipt_token, raw_file = upload_json_clk_data(dp_id, get_json(), span)
    -            # Schedule a task to deserialize the hashes, and carry
    -            # out a pop count.
    -            handle_raw_upload.delay(project_id, dp_id, receipt_token, parent_span=serialize_span(span))
    -            log.info("Job scheduled to handle user uploaded hashes")
    -        elif headers['Content-Type'] == "application/octet-stream":
    -            span.set_tag("content-type", 'binary')
    -            log.info("Handling binary CLK upload")
    -            try:
    -                count, size = check_binary_upload_headers(headers)
    -                log.info(f"Headers tell us to expect {count} encodings of {size} bytes")
    -                span.log_kv({'count': count, 'size': size})
    -            except Exception:
    -                log.warning("Upload failed due to problem with headers in binary upload")
    -                raise
    -            # Check against project level encoding size (if it has been set)
    -            if project_encoding_size is not None and size != project_encoding_size:
    -                # fail fast - we haven't stored the encoded data yet
    -                return safe_fail_request(400, "Upload 'Hash-Size' doesn't match project settings")
    -
    -            # TODO actually stream the upload data straight to Minio. Currently we can't because
    -            # connexion has already read the data before our handler is called!
    -            # https://github.com/zalando/connexion/issues/592
    -            # stream = get_stream()
    -            stream = BytesIO(request.data)
    -            expected_bytes = binary_format(size).size * count
    -            log.debug(f"Stream size is {len(request.data)} B, and we expect {expected_bytes} B")
    -            if len(request.data) != expected_bytes:
    -                safe_fail_request(400, "Uploaded data did not match the expected size. Check request headers are correct")
    -            try:
    -                receipt_token = upload_clk_data_binary(project_id, dp_id, stream, count, size)
    -            except ValueError:
    -                safe_fail_request(400, "Uploaded data did not match the expected size. Check request headers are correct.")
    -        else:
    -            safe_fail_request(400, "Content Type not supported")
    +        try:
    +            if headers['Content-Type'] == "application/json":
    +                span.set_tag("content-type", 'json')
    +                # TODO: Previously, we were accessing the CLKs in a streaming fashion to avoid parsing the json in one hit. This
    +                # enables running the web frontend with less memory.
    +                # However, as connexion is very, very strict about input validation when it comes to json, it will always
    +                # consume the stream first to validate it against the spec. Thus the backflip to fully reading the CLks as
    +                # json into memory. -> issue #184
    +
    +                receipt_token, raw_file = upload_json_clk_data(dp_id, get_json(), span)
    +                # Schedule a task to deserialize the hashes, and carry
    +                # out a pop count.
    +                handle_raw_upload.delay(project_id, dp_id, receipt_token, parent_span=serialize_span(span))
    +                log.info("Job scheduled to handle user uploaded hashes")
    +            elif headers['Content-Type'] == "application/octet-stream":
    +                span.set_tag("content-type", 'binary')
    +                log.info("Handling binary CLK upload")
    +                try:
    +                    count, size = check_binary_upload_headers(headers)
    +                    log.info(f"Headers tell us to expect {count} encodings of {size} bytes")
    +                    span.log_kv({'count': count, 'size': size})
    +                except Exception:
    +                    log.warning("Upload failed due to problem with headers in binary upload")
    +                    raise
    +                # Check against project level encoding size (if it has been set)
    +                if project_encoding_size is not None and size != project_encoding_size:
    +                    # fail fast - we haven't stored the encoded data yet
    +                    return safe_fail_request(400, "Upload 'Hash-Size' doesn't match project settings")
    +
    +                # TODO actually stream the upload data straight to Minio. Currently we can't because
    +                # connexion has already read the data before our handler is called!
    +                # https://github.com/zalando/connexion/issues/592
    +                # stream = get_stream()
    +                stream = BytesIO(request.data)
    +                expected_bytes = binary_format(size).size * count
    +                log.debug(f"Stream size is {len(request.data)} B, and we expect {expected_bytes} B")
    +                if len(request.data) != expected_bytes:
    +                    safe_fail_request(400, "Uploaded data did not match the expected size. Check request headers are correct")
    +                try:
    +                    receipt_token = upload_clk_data_binary(project_id, dp_id, stream, count, size)
    +                except ValueError:
    +                    safe_fail_request(400, "Uploaded data did not match the expected size. Check request headers are correct.")
    +            else:
    +                safe_fail_request(400, "Content Type not supported")
    +        except Exception:
    +            log.info("The dataprovider was not able to upload her clks,"
    +                     " re-enable the corresponding upload token to be used.")
    +            with DBConn() as conn:
    +                db.set_dataprovider_upload_state(conn, dp_id, state='error')
    +            raise
    +        with DBConn() as conn:
    +            db.set_dataprovider_upload_state(conn, dp_id, state='done')
         return {'message': 'Updated', 'receipt_token': receipt_token}, 201
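The net effect of the handler change is a claim/try/except pattern wrapped around the existing upload logic. A condensed, hedged sketch of that control flow is below; the wrapper function and its arguments are illustrative, and only the `db` helper calls and state values come from the diff.

```python
def finish_or_reset_upload_state(connect, db, dp_id, do_upload):
    """Illustrative wrapper showing the upload-state bookkeeping added in this PR.

    `connect` yields a database connection context manager, `db` provides the
    helpers added or changed above, and `do_upload` performs the actual CLK
    upload and returns a receipt token.
    """
    with connect() as conn:
        # Atomically claim the token; a repeated upload attempt is rejected.
        if not db.get_and_set_dataprovider_upload_state_in_progress(conn, dp_id):
            return None  # the caller maps this to a 403 response
    try:
        receipt_token = do_upload()
    except Exception:
        # Any failure releases the claim so the same token can be retried.
        with connect() as conn:
            db.set_dataprovider_upload_state(conn, dp_id, state='error')
        raise
    with connect() as conn:
        db.set_dataprovider_upload_state(conn, dp_id, state='done')
    return receipt_token
```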
@@ -250,7 +262,6 @@ def upload_clk_data_binary(project_id, dp_id, raw_stream, count, size=128): | |
| with opentracing.tracer.start_span('update-database-with-metadata', child_of=parent_span): | ||
| with DBConn() as conn: | ||
| db.update_encoding_metadata(conn, filename, dp_id, 'ready') | ||
| db.set_dataprovider_upload_state(conn, dp_id, True) | ||
|
|
||
| # Now work out if all parties have added their data | ||
| if clks_uploaded_to_project(project_id): | ||
|
|
||
Collaborator: the status of the dataproviders is not true/false any more.

Author (Contributor): Well seen. I started by using the boolean value, but added more states to have an `in_progress` one. But I forgot the comments...