@@ -47,7 +47,6 @@ def convert_encodings_from_base64_to_binary(encodings: Iterator[Tuple[str, str,
         yield i, binary_packed_encoding, blocks
 
 
-
 def _grouper(iterable, n, fillvalue=None):
     "Collect data into fixed-length chunks or blocks"
     # grouper('ABCDEFG', 3, 'x') --> ABC DEF Gxx
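
Aside (not part of the diff): the hunk above shows only _grouper's signature and docstring. Its body is presumably the standard itertools grouper recipe, sketched below for orientation; note that it requires an integer n, which is why _estimate_group_size further down must return an int.

    from itertools import zip_longest

    def _grouper(iterable, n, fillvalue=None):
        "Collect data into fixed-length chunks or blocks"
        # grouper('ABCDEFG', 3, 'x') --> ABC DEF Gxx
        args = [iter(iterable)] * n  # n references to one shared iterator
        return zip_longest(*args, fillvalue=fillvalue)
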
@@ -73,23 +72,28 @@ def _transpose(group):
     return a, b, c
 
 
-def store_encodings_in_db(conn, dp_id, encodings: Iterator[Tuple[str, bytes, List[str]]]):
+def store_encodings_in_db(conn, dp_id, encodings: Iterator[Tuple[str, bytes, List[str]]], encoding_size: int = 128):
     """
     Group encodings + blocks into database transactions and execute.
-
-    Assuming default encoding size of 128 B, n encodings each with their own
-    4 B encoding id, and assuming `k` multiple unique blocks of 64 B will be a transaction
-    of approximately k*64 + 132 * n. For k = 10 and n = 100_000 this gives a transaction
-    size under 100MiB.
     """
 
-    for group in _grouper(encodings, n=100_000):
+    for group in _grouper(encodings, n=_estimate_group_size(encoding_size)):
         encoding_ids, encodings, blocks = _transpose(group)
         assert len(blocks) == len(encodings)
         assert len(encoding_ids) == len(encodings)
         insert_encodings_into_blocks(conn, dp_id, block_ids=blocks, encoding_ids=encoding_ids, encodings=encodings)
 
 
+def _estimate_group_size(encoding_size):
+    """
+    Given an encoding size (e.g. 128 B), estimate how many encodings, including
+    their block ids, will likely fit under 100 MiB per transaction. Note this is
+    hopefully very conservative in estimating the average blocks per record.
+    """
+    network_transaction_size = 104857600  # 100 MiB
+    blocks_per_record_estimate = 50
+    return network_transaction_size // ((blocks_per_record_estimate * 64) + (encoding_size + 4))
+
 
 def convert_encodings_from_json_to_binary(f):
     """
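
Sanity check on the new estimate (illustrative only, not part of the commit): with the default 128 B encoding size, each record is budgeted 50 * 64 = 3200 B of block ids plus 128 + 4 = 132 B of encoding and id, i.e. 3332 B per record, so:

    >>> _estimate_group_size(128)
    31469
    >>> 104857600 // 3332
    31469

That is roughly 31k encodings per transaction, a deliberately smaller group than the previous fixed n=100_000.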