Commit 5f23688

Transaction size for encoding upload will depend on encoding_size.

1 parent 8a1187f
3 files changed: +21 -11 lines changed

backend/entityservice/database/insertions.py

Lines changed: 6 additions & 2 deletions

@@ -78,11 +78,15 @@ def insert_encoding_metadata(db, clks_filename, dp_id, receipt_token, encoding_c
 
 
 def insert_encodings_into_blocks(db, dp_id: int, block_ids: List[List[str]], encoding_ids: List[int],
-                                 encodings: List[bytes], page_size=4096):
+                                 encodings: List[bytes], page_size: int = 4096):
     """
     Bulk load blocking and encoding data into the database.
-
     See https://hakibenita.com/fast-load-data-python-postgresql#copy-data-from-a-string-iterator-with-buffer-size
+
+    :param page_size:
+        Maximum number of rows to fetch in a given sql statement/network transfer. A larger page size
+        will require more local memory, but could be faster due to fewer network transfers.
+
     """
     encodings_insertion_query = "INSERT INTO encodings (dp, encoding_id, encoding) VALUES %s"
     blocks_insertion_query = "INSERT INTO encodingblocks (dp, encoding_id, block_id) VALUES %s"
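Both queries use the `VALUES %s` placeholder style consumed by psycopg2's `execute_values`, which is the likely home of the new `page_size` parameter: it batches the row list into pages of `page_size` and sends one `INSERT` statement per page. A minimal sketch under that assumption (the real body of `insert_encodings_into_blocks` is not shown in this hunk; `_bulk_insert_sketch` is a hypothetical stand-in):

    # Sketch only: assumes psycopg2's extras.execute_values is what consumes
    # the VALUES %s queries above, one INSERT statement per page_size rows.
    from psycopg2 import extras

    def _bulk_insert_sketch(db, dp_id, block_ids, encoding_ids, encodings, page_size=4096):
        encodings_insertion_query = "INSERT INTO encodings (dp, encoding_id, encoding) VALUES %s"
        blocks_insertion_query = "INSERT INTO encodingblocks (dp, encoding_id, block_id) VALUES %s"
        with db.cursor() as cur:
            extras.execute_values(
                cur, encodings_insertion_query,
                [(dp_id, eid, enc) for eid, enc in zip(encoding_ids, encodings)],
                page_size=page_size)
            # An encoding can belong to several blocks: one row per (encoding, block) pair.
            extras.execute_values(
                cur, blocks_insertion_query,
                [(dp_id, eid, bid)
                 for eid, blocks in zip(encoding_ids, block_ids)
                 for bid in blocks],
                page_size=page_size)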

backend/entityservice/encoding_storage.py

Lines changed: 12 additions & 8 deletions

@@ -47,7 +47,6 @@ def convert_encodings_from_base64_to_binary(encodings: Iterator[Tuple[str, str,
         yield i, binary_packed_encoding, blocks
 
 
-
 def _grouper(iterable, n, fillvalue=None):
     "Collect data into fixed-length chunks or blocks"
     # grouper('ABCDEFG', 3, 'x') --> ABC DEF Gxx
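The body of `_grouper` is truncated by this hunk; its docstring and example comment match the standard itertools grouper recipe, so it presumably reads roughly as follows (a sketch, not the verbatim source):

    from itertools import zip_longest

    def _grouper(iterable, n, fillvalue=None):
        "Collect data into fixed-length chunks or blocks"
        # grouper('ABCDEFG', 3, 'x') --> ABC DEF Gxx
        args = [iter(iterable)] * n
        return zip_longest(*args, fillvalue=fillvalue)

Note the recipe pads the final group with `fillvalue`, so the caller (or `_transpose`) has to discard the trailing padding.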
@@ -73,23 +72,28 @@ def _transpose(group):
     return a, b, c
 
 
-def store_encodings_in_db(conn, dp_id, encodings: Iterator[Tuple[str, bytes, List[str]]]):
+def store_encodings_in_db(conn, dp_id, encodings: Iterator[Tuple[str, bytes, List[str]]], encoding_size: int = 128):
     """
     Group encodings + blocks into database transactions and execute.
-
-    Assuming default encoding size of 128 B, n encodings each with their own
-    4 B encoding id, and assuming `k` multiple unique blocks of 64 B will be a transaction
-    of approximately k*64 + 132 * n. For k = 10 and n = 100_000 this gives a transaction
-    size under 100MiB.
     """
 
-    for group in _grouper(encodings, n=100_000):
+    for group in _grouper(encodings, n=_estimate_group_size(encoding_size)):
         encoding_ids, encodings, blocks = _transpose(group)
         assert len(blocks) == len(encodings)
         assert len(encoding_ids) == len(encodings)
         insert_encodings_into_blocks(conn, dp_id, block_ids=blocks, encoding_ids=encoding_ids, encodings=encodings)
 
 
+def _estimate_group_size(encoding_size):
+    """
+    Given an encoding size (e.g. 128 B), estimate the number of encodings that will likely
+    be under 100MiB in data including blocks. Note this is hopefully very conservative
+    in estimating the average number of blocks each record is in.
+    """
+    network_transaction_size = 104857600  # 100MiB
+    blocks_per_record_estimate = 50
+    return network_transaction_size // ((blocks_per_record_estimate * 64) + (encoding_size + 4))
+
 
 def convert_encodings_from_json_to_binary(f):
     """

backend/entityservice/tasks/encoding_uploading.py

Lines changed: 3 additions & 1 deletion

@@ -32,6 +32,8 @@ def handle_raw_upload(project_id, dp_id, receipt_token, parent_span=None):
     # Get number of blocks + total number of encodings from database
     expected_count, block_count = get_encoding_metadata(db, dp_id)
 
+    encoding_size = get_uploads_columns(db, dp_id, ['encoding_size'])[0]
+
     log.info(f"Expecting to handle {expected_count} encodings in {block_count} blocks")
     mc = connect_to_object_store()
     raw_file = Config.RAW_FILENAME_FMT.format(receipt_token)
@@ -43,7 +45,7 @@
     # output into database for each block (temp or direct to minio?)
     pipeline = convert_encodings_from_base64_to_binary(stream_json_clksnblocks(raw_data))
     with DBConn() as db:
-        store_encodings_in_db(db, dp_id, pipeline)
+        store_encodings_in_db(db, dp_id, pipeline, encoding_size)
 
 
 #### GLUE CODE - TODO remove me once moved away from storing encodings in files
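`get_uploads_columns` is defined elsewhere in the service; from the call site it returns the requested columns of a data provider's upload row, with `[0]` picking out `encoding_size`. A hedged sketch of that contract (the table name and schema here are assumptions, not part of this commit):

    # Assumed shape of get_uploads_columns, inferred only from the call site above.
    def get_uploads_columns(db, dp_id, columns):
        assert all(col.isidentifier() for col in columns)  # identifiers come from trusted code
        sql = f"SELECT {', '.join(columns)} FROM uploads WHERE dp = %s"
        with db.cursor() as cur:
            cur.execute(sql, (dp_id,))
            return cur.fetchone()  # one row, in the requested column order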
