chunk correctly using to_batches
kevinjqliu committed Mar 9, 2024
commit fd1efe018e084b75c69f131fb6880e44d114de8a
5 changes: 4 additions & 1 deletion pyiceberg/io/pyarrow.py
@@ -1771,8 +1771,11 @@ def write_parquet(task: WriteTask) -> DataFile:
 def bin_pack_arrow_table(tbl: pa.Table, target_file_size: int) -> Iterator[List[pa.RecordBatch]]:
     from pyiceberg.utils.bin_packing import PackingIterator
 
+    avg_row_size_bytes = tbl.nbytes / tbl.num_rows
+    max_chunksize = target_file_size // avg_row_size_bytes
+    batches = tbl.to_batches(max_chunksize)
     bin_packed_record_batches = PackingIterator(
-        items=tbl.to_batches(),
+        items=batches,
         target_weight=target_file_size,
         lookback=2,
         weight_func=lambda x: x.nbytes,
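The point of the change is that a bin-packing iterator groups items but never splits them, so if `to_batches()` yields one oversized batch (it follows the table's existing chunk boundaries by default), no packing strategy can land near `target_weight`. Estimating rows per batch from the average row size keeps each item small enough to pack. Below is a minimal, self-contained sketch of that chunk-then-pack idea using only pyarrow; the greedy grouping loop is a stand-in for pyiceberg's `PackingIterator` (its lookback behavior is omitted), and the example table and 1 MiB target are illustrative assumptions, not values from this PR.

```python
from typing import Iterator, List

import pyarrow as pa


def bin_pack_sketch(tbl: pa.Table, target_file_size: int) -> Iterator[List[pa.RecordBatch]]:
    # Estimate rows per batch so each batch weighs roughly target_file_size bytes.
    avg_row_size_bytes = tbl.nbytes / tbl.num_rows
    # Cast to int (to_batches expects an integer) and keep at least one row
    # per batch in case the target is smaller than a single row.
    max_chunksize = max(1, int(target_file_size // avg_row_size_bytes))
    batches = tbl.to_batches(max_chunksize)

    # Greedy stand-in for PackingIterator: accumulate batches until adding
    # one more would exceed the byte target, then emit the group.
    group: List[pa.RecordBatch] = []
    group_bytes = 0
    for batch in batches:
        if group and group_bytes + batch.nbytes > target_file_size:
            yield group
            group, group_bytes = [], 0
        group.append(batch)
        group_bytes += batch.nbytes
    if group:
        yield group


# Example: pack a 1M-row table into groups of record batches of ~1 MiB each.
tbl = pa.table({"x": pa.array(range(1_000_000))})
for i, group in enumerate(bin_pack_sketch(tbl, target_file_size=1 << 20)):
    print(i, sum(b.nbytes for b in group))
```

In the real code, each emitted group would back one Parquet write task, so the byte weight of a group approximates the size of the resulting data file.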