chunk correctly using to_batches
kevinjqliu committed Mar 9, 2024
commit fd1efe018e084b75c69f131fb6880e44d114de8a
5 changes: 4 additions & 1 deletion pyiceberg/io/pyarrow.py
@@ -1771,8 +1771,11 @@ def write_parquet(task: WriteTask) -> DataFile:
 def bin_pack_arrow_table(tbl: pa.Table, target_file_size: int) -> Iterator[List[pa.RecordBatch]]:
     from pyiceberg.utils.bin_packing import PackingIterator
 
+    avg_row_size_bytes = tbl.nbytes / tbl.num_rows
+    max_chunksize = target_file_size // avg_row_size_bytes
+    batches = tbl.to_batches(max_chunksize)
     bin_packed_record_batches = PackingIterator(
-        items=tbl.to_batches(),
+        items=batches,
         target_weight=target_file_size,
         lookback=2,
         weight_func=lambda x: x.nbytes,
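The point of the change is that a bin-packing iterator groups items but never splits them, so if `to_batches()` yields one oversized batch (it follows the table's existing chunk boundaries by default), no packing strategy can land near `target_weight`. Estimating rows per batch from the average row size keeps each item small enough to pack. Below is a minimal, self-contained sketch of that chunk-then-pack idea using only pyarrow; the greedy grouping loop is a stand-in for pyiceberg's `PackingIterator` (its lookback behavior is omitted), and the example table and 1 MiB target are illustrative assumptions, not values from this PR.

```python
from typing import Iterator, List

import pyarrow as pa


def bin_pack_sketch(tbl: pa.Table, target_file_size: int) -> Iterator[List[pa.RecordBatch]]:
    # Estimate rows per batch so each batch weighs roughly target_file_size bytes.
    avg_row_size_bytes = tbl.nbytes / tbl.num_rows
    # Cast to int (to_batches expects an integer) and keep at least one row
    # per batch in case the target is smaller than a single row.
    max_chunksize = max(1, int(target_file_size // avg_row_size_bytes))
    batches = tbl.to_batches(max_chunksize)

    # Greedy stand-in for PackingIterator: accumulate batches until adding
    # one more would exceed the byte target, then emit the group.
    group: List[pa.RecordBatch] = []
    group_bytes = 0
    for batch in batches:
        if group and group_bytes + batch.nbytes > target_file_size:
            yield group
            group, group_bytes = [], 0
        group.append(batch)
        group_bytes += batch.nbytes
    if group:
        yield group


# Example: pack a 1M-row table into groups of record batches of ~1 MiB each.
tbl = pa.table({"x": pa.array(range(1_000_000))})
for i, group in enumerate(bin_pack_sketch(tbl, target_file_size=1 << 20)):
    print(i, sum(b.nbytes for b in group))
```

In the real code, each emitted group would back one Parquet write task, so the byte weight of a group approximates the size of the resulting data file.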