-
Notifications
You must be signed in to change notification settings - Fork 8
Store uploaded encodings in database #516
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
febc433
a1a22eb
3187fd4
09a3486
5ae5c57
b03bf1f
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -11,11 +11,11 @@ def select_dataprovider_id(db, project_id, receipt_token): | |
| Returns None if token is incorrect. | ||
| """ | ||
| sql_query = """ | ||
| SELECT dp from dataproviders, bloomingdata | ||
| SELECT dp from dataproviders, uploads | ||
| WHERE | ||
| bloomingdata.dp = dataproviders.id AND | ||
| uploads.dp = dataproviders.id AND | ||
| dataproviders.project = %s AND | ||
| bloomingdata.token = %s | ||
| uploads.token = %s | ||
| """ | ||
| query_result = query_db(db, sql_query, [project_id, receipt_token], one=True) | ||
| logger.debug("Looking up data provider with auth. {}".format(query_result)) | ||
|
|
@@ -61,10 +61,10 @@ def check_run_exists(db, project_id, run_id): | |
| def get_number_parties_uploaded(db, project_id): | ||
| sql_query = """ | ||
| SELECT COUNT(*) | ||
| FROM dataproviders, bloomingdata | ||
| FROM dataproviders, uploads | ||
| WHERE | ||
| dataproviders.project = %s AND | ||
| bloomingdata.dp = dataproviders.id AND | ||
| uploads.dp = dataproviders.id AND | ||
| dataproviders.uploaded = 'done' | ||
| """ | ||
| query_result = query_db(db, sql_query, [project_id], one=True) | ||
|
|
@@ -77,24 +77,24 @@ def get_encoding_error_count(db, project_id): | |
| """ | ||
| sql_query = """ | ||
| SELECT count(*) | ||
| FROM dataproviders, bloomingdata | ||
| FROM dataproviders, uploads | ||
| WHERE | ||
| dataproviders.project = %s AND | ||
| bloomingdata.dp = dataproviders.id AND | ||
| bloomingdata.state = 'error' | ||
| uploads.dp = dataproviders.id AND | ||
| uploads.state = 'error' | ||
| """ | ||
| return query_db(db, sql_query, [project_id], one=True)['count'] | ||
|
|
||
|
|
||
| def get_number_parties_ready(db, resource_id): | ||
| sql_query = """ | ||
| SELECT COUNT(*) | ||
| FROM dataproviders, bloomingdata | ||
| FROM dataproviders, uploads | ||
| WHERE | ||
| dataproviders.project = %s AND | ||
| bloomingdata.dp = dataproviders.id AND | ||
| uploads.dp = dataproviders.id AND | ||
| dataproviders.uploaded = 'done' AND | ||
| bloomingdata.state = 'ready' | ||
| uploads.state = 'ready' | ||
| """ | ||
| query_result = query_db(db, sql_query, [resource_id], one=True) | ||
| return query_result['count'] | ||
|
|
@@ -187,11 +187,12 @@ def get_run_result(db, resource_id): | |
|
|
||
|
|
||
| def get_project_dataset_sizes(db, project_id): | ||
| """Returns the number of encodings in a dataset.""" | ||
| sql_query = """ | ||
| SELECT bloomingdata.count | ||
| FROM dataproviders, bloomingdata | ||
| SELECT uploads.count | ||
| FROM dataproviders, uploads | ||
| WHERE | ||
| bloomingdata.dp=dataproviders.id AND | ||
| uploads.dp=dataproviders.id AND | ||
| dataproviders.project=%s | ||
| ORDER BY dataproviders.id | ||
| """ | ||
|
|
@@ -203,9 +204,9 @@ def get_project_dataset_sizes(db, project_id): | |
| def get_uploaded_encoding_sizes(db, project_id): | ||
| sql_query = """ | ||
| SELECT dp, encoding_size | ||
| FROM dataproviders, bloomingdata | ||
| FROM dataproviders, uploads | ||
| WHERE | ||
| bloomingdata.dp=dataproviders.id AND | ||
| uploads.dp=dataproviders.id AND | ||
| dataproviders.project=%s | ||
| ORDER BY dataproviders.id | ||
| """ | ||
|
|
@@ -215,10 +216,10 @@ def get_uploaded_encoding_sizes(db, project_id): | |
|
|
||
| def get_smaller_dataset_size_for_project(db, project_id): | ||
| sql_query = """ | ||
| SELECT MIN(bloomingdata.count) as smaller | ||
| FROM dataproviders, bloomingdata | ||
| SELECT MIN(uploads.count) as smaller | ||
| FROM dataproviders, uploads | ||
| WHERE | ||
| bloomingdata.dp=dataproviders.id AND | ||
| uploads.dp=dataproviders.id AND | ||
| dataproviders.project=%s | ||
| """ | ||
| query_result = query_db(db, sql_query, [project_id], one=True) | ||
|
|
@@ -231,10 +232,10 @@ def get_total_comparisons_for_project(db, project_id): | |
| """ | ||
| expected_datasets = get_project_column(db, project_id, 'parties') | ||
| sql_query = """ | ||
| SELECT bloomingdata.count as rows | ||
| from dataproviders, bloomingdata | ||
| SELECT uploads.count as rows | ||
| from dataproviders, uploads | ||
| where | ||
| bloomingdata.dp=dataproviders.id AND | ||
| uploads.dp=dataproviders.id AND | ||
| dataproviders.project=%s | ||
| """ | ||
| query_results = query_db(db, sql_query, [project_id]) | ||
|
|
@@ -260,12 +261,12 @@ def get_dataprovider_id(db, update_token): | |
| return query_db(db, sql_query, [update_token], one=True)['id'] | ||
|
|
||
|
|
||
| def get_bloomingdata_columns(db, dp_id, columns): | ||
| def get_uploads_columns(db, dp_id, columns): | ||
| for column in columns: | ||
| assert column in {'ts', 'token', 'file', 'state', 'count', 'encoding_size'} | ||
| assert column in {'ts', 'token', 'file', 'state', 'block_count', 'count', 'encoding_size'} | ||
| sql_query = """ | ||
| SELECT {} | ||
| FROM bloomingdata | ||
| FROM uploads | ||
| WHERE dp = %s | ||
| """.format(', '.join(columns)) | ||
| result = query_db(db, sql_query, [dp_id], one=True) | ||
|
|
@@ -274,19 +275,42 @@ def get_bloomingdata_columns(db, dp_id, columns): | |
| return [result[column] for column in columns] | ||
|
|
||
|
|
||
| def get_encodingblock_ids(db, dp_id, block_name=None): | ||
| """Yield all encoding ids in either a single block, or all blocks for a given data provider.""" | ||
| sql_query = """ | ||
| SELECT encoding_id | ||
| FROM encodingblocks | ||
| WHERE dp = %s | ||
| {} | ||
| """.format("AND block_id = %s" if block_name else "") | ||
| # Specifying a name for the cursor creates a server-side cursor, which prevents all of the | ||
| # records from being downloaded at once. | ||
| cur = db.cursor(f'encodingfetcher-{dp_id}') | ||
|
|
||
| args = (dp_id, block_name) if block_name else (dp_id,) | ||
|
|
||
| cur.execute(sql_query, args) | ||
| while True: | ||
| rows = cur.fetchmany(10_000) | ||
| if not rows: | ||
| break | ||
| for row in rows: | ||
| yield row[0] | ||
|
Collaborator
Wait a minute, does that mean we will hold on to the db connection until all the yielding is done? That doesn't seem like a good idea.
Collaborator
Author
Why don't you think it would be good to keep the db connection while streaming through the blocks? Establishing a db connection is not free. The point of this sneaky Python cache is that we might not have the memory to store all (e.g. millions) of blocks if we used a regular client-side cursor, which downloads every record at once.
||
|
|
||
|
|
||
| def get_filter_metadata(db, dp_id): | ||
| """ | ||
| :return: The filename and the encoding size of the raw clks. | ||
| """ | ||
| filename, encoding_size = get_bloomingdata_columns(db, dp_id, ['file', 'encoding_size']) | ||
| filename, encoding_size = get_uploads_columns(db, dp_id, ['file', 'encoding_size']) | ||
| return filename.strip(), encoding_size | ||
|
|
||
|
|
||
| def get_number_of_hashes(db, dp_id): | ||
| def get_encoding_metadata(db, dp_id): | ||
| """ | ||
| :return: The count of the uploaded encodings. | ||
| :return: The number of encodings and number of blocks of the uploaded data. | ||
| """ | ||
| return get_bloomingdata_columns(db, dp_id, ['count'])[0] | ||
| return get_uploads_columns(db, dp_id, ['count', 'block_count']) | ||
|
|
||
|
|
||
| def get_project_schema_encoding_size(db, project_id): | ||
|
|
@@ -370,7 +394,7 @@ def get_all_objects_for_project(db, project_id): | |
|
|
||
| for dp in dps: | ||
| clk_file_ref = query_db(db, """ | ||
| SELECT file FROM bloomingdata | ||
| SELECT file FROM uploads | ||
| WHERE dp = %s | ||
| """, [dp['id']], one=True) | ||
|
|
||
|
|
||
Uh oh!
There was an error while loading. Please reload this page.