Go with large ones for now
Fokko committed Jul 10, 2024
commit a760a52ec9af96ddcce41e296e975ef690e4e853
5 changes: 5 additions & 0 deletions pyiceberg/io/pyarrow.py
@@ -1047,6 +1047,11 @@ def _task_to_record_batches(

     fragment_scanner = ds.Scanner.from_fragment(
         fragment=fragment,
+        # With PyArrow 16.0.0 there is an issue with casting record-batches:
+        # https://github.com/apache/arrow/issues/41884
+        # https://github.com/apache/arrow/issues/43183
+        # Would be good to remove this later on
+        schema=_pyarrow_schema_ensure_large_types(physical_schema),
         # This will push down the query to Arrow.
         # But in case there are positional deletes, we have to apply them first
         filter=pyarrow_filter if not positional_deletes else None,
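
Note: _pyarrow_schema_ensure_large_types rewrites the physical schema so that every variable-width type uses its 64-bit-offset ("large") variant before the scanner is built. A minimal sketch of the idea, assuming a plain recursive walk (helper names here are illustrative, not the actual pyiceberg implementation):

import pyarrow as pa

def _to_large(t: pa.DataType) -> pa.DataType:
    # Promote variable-width types to their 64-bit-offset variants.
    if pa.types.is_string(t):
        return pa.large_string()
    if pa.types.is_binary(t):
        return pa.large_binary()
    if pa.types.is_list(t):
        return pa.large_list(_to_large(t.value_type))
    if pa.types.is_map(t):
        return pa.map_(_to_large(t.key_type), _to_large(t.item_type))
    if pa.types.is_struct(t):
        fields = [t.field(i) for i in range(t.num_fields)]
        return pa.struct([pa.field(f.name, _to_large(f.type), nullable=f.nullable) for f in fields])
    return t

def ensure_large_types(schema: pa.Schema) -> pa.Schema:
    # Rebuild the schema with all variable-width types widened.
    return pa.schema([pa.field(f.name, _to_large(f.type), nullable=f.nullable) for f in schema])
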
2 changes: 1 addition & 1 deletion pyiceberg/table/__init__.py
@@ -2066,7 +2066,7 @@ def to_arrow_batch_reader(self) -> pa.RecordBatchReader:
                 case_sensitive=self.case_sensitive,
                 limit=self.limit,
             ),
-        ).cast(target_schema=target_schema)
+        )

     def to_pandas(self, **kwargs: Any) -> pd.DataFrame:
         return self.to_arrow().to_pandas(**kwargs)
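
With the cast removed, the RecordBatchReader returned by to_arrow_batch_reader reports the large-type schema as read, instead of being cast back to the projected schema. A hypothetical usage sketch (catalog and table names are assumptions):

from pyiceberg.catalog import load_catalog

catalog = load_catalog("default")           # assumed catalog name
tbl = catalog.load_table("default.events")  # assumed table

reader = tbl.scan().to_arrow_batch_reader()
print(reader.schema)  # string columns now surface as large_string
for batch in reader:
    ...  # process each pyarrow.RecordBatch
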
2 changes: 1 addition & 1 deletion tests/integration/test_add_files.py
@@ -495,7 +495,7 @@ def test_add_files_with_large_and_regular_schema(spark: SparkSession, session_ca
     tbl.add_files([file_path])

     table_schema = tbl.scan().to_arrow().schema
-    assert table_schema == arrow_schema
+    assert table_schema == arrow_schema_large

     file_path_large = f"s3://warehouse/default/unpartitioned_with_large_types/v{format_version}/test-1.parquet"
     _write_parquet(
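
For reference, the two fixtures differ only in offset width; a minimal sketch of such a pair (the field name is an assumption, not the test's actual fixture):

import pyarrow as pa

# Regular 32-bit-offset schema, as written to the Parquet file.
arrow_schema = pa.schema([pa.field("name", pa.string())])

# Large 64-bit-offset counterpart, which tbl.scan().to_arrow() now reports.
arrow_schema_large = pa.schema([pa.field("name", pa.large_string())])
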
14 changes: 7 additions & 7 deletions tests/io/test_pyarrow.py
@@ -1002,10 +1002,10 @@ def test_read_map(schema_map: Schema, file_map: str) -> None:

     assert (
         repr(result_table.schema)
-        == """properties: map<string, string>
-  child 0, entries: struct<key: string not null, value: string not null> not null
-      child 0, key: string not null
-      child 1, value: string not null"""
+        == """properties: map<large_string, large_string>
+  child 0, entries: struct<key: large_string not null, value: large_string not null> not null
+      child 0, key: large_string not null
+      child 1, value: large_string not null"""
     )


@@ -1279,9 +1279,9 @@ def test_projection_maps_of_structs(schema_map_of_structs: Schema, file_map_of_s
         assert actual.as_py() == expected
     assert (
         repr(result_table.schema)
-        == """locations: map<string, struct<latitude: double not null, longitude: double not null, altitude: double>>
-  child 0, entries: struct<key: string not null, value: struct<latitude: double not null, longitude: double not null, altitude: double> not null> not null
-      child 0, key: string not null
+        == """locations: map<large_string, struct<latitude: double not null, longitude: double not null, altitude: double>>
+  child 0, entries: struct<key: large_string not null, value: struct<latitude: double not null, longitude: double not null, altitude: double> not null> not null
+      child 0, key: large_string not null
       child 1, value: struct<latitude: double not null, longitude: double not null, altitude: double> not null
           child 0, latitude: double not null
           child 1, longitude: double not null
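
The updated expectations can be reproduced standalone; a quick sketch (note the map's value child is nullable here, unlike the Parquet-backed test data with required values):

import pyarrow as pa

# A map with 64-bit-offset string keys and values, matching the new repr.
schema = pa.schema([pa.field("properties", pa.map_(pa.large_string(), pa.large_string()))])
print(repr(schema))  # properties: map<large_string, large_string> with large_string children
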