Merged
Changes from 1 commit
fix
sungwy committed Jul 11, 2024
commit 6c4f5d751f2be96ca1ab765b3d91564275503500
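This commit relocates the `DOWNCAST_NS_TIMESTAMP_TO_US_ON_WRITE` configuration key from `pyiceberg.io.pyarrow` to `pyiceberg.table` and passes the resolved flag into `_check_schema_compatible` as an explicit parameter instead of reading the configuration inside that helper. A minimal sketch of how a caller resolves the flag after this change, assuming a pyiceberg build that includes this commit:

```python
# The constant now lives in pyiceberg.table (previously pyiceberg.io.pyarrow).
from pyiceberg.table import DOWNCAST_NS_TIMESTAMP_TO_US_ON_WRITE
from pyiceberg.utils.config import Config

# Callers resolve the boolean once from configuration and pass it down explicitly,
# mirroring the call sites changed below.
downcast_ns_timestamp_to_us = Config().get_bool(DOWNCAST_NS_TIMESTAMP_TO_US_ON_WRITE) or False
print(downcast_ns_timestamp_to_us)
```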
3 changes: 2 additions & 1 deletion pyiceberg/catalog/__init__.py
@@ -49,6 +49,7 @@
from pyiceberg.schema import Schema
from pyiceberg.serializers import ToOutputFile
from pyiceberg.table import (
DOWNCAST_NS_TIMESTAMP_TO_US_ON_WRITE,
CommitTableRequest,
CommitTableResponse,
CreateTableTransaction,
@@ -673,7 +674,7 @@ def _convert_schema_if_needed(schema: Union[Schema, "pa.Schema"]) -> Schema:
try:
import pyarrow as pa

from pyiceberg.io.pyarrow import DOWNCAST_NS_TIMESTAMP_TO_US_ON_WRITE, _ConvertToIcebergWithoutIDs, visit_pyarrow
from pyiceberg.io.pyarrow import _ConvertToIcebergWithoutIDs, visit_pyarrow

downcast_ns_timestamp_to_us = Config().get_bool(DOWNCAST_NS_TIMESTAMP_TO_US_ON_WRITE) or False
if isinstance(schema, pa.Schema):
6 changes: 2 additions & 4 deletions pyiceberg/io/pyarrow.py
@@ -166,7 +166,6 @@

ONE_MEGABYTE = 1024 * 1024
BUFFER_SIZE = "buffer-size"
DOWNCAST_NS_TIMESTAMP_TO_US_ON_WRITE = "downcast-ns-timestamp-to-us-on-write"

ICEBERG_SCHEMA = b"iceberg.schema"
# The PARQUET: in front means that it is Parquet specific, in this case the field_id
@@ -1952,7 +1951,7 @@ def data_file_statistics_from_parquet_metadata(


def write_file(io: FileIO, table_metadata: TableMetadata, tasks: Iterator[WriteTask]) -> Iterator[DataFile]:
from pyiceberg.table import PropertyUtil, TableProperties
from pyiceberg.table import DOWNCAST_NS_TIMESTAMP_TO_US_ON_WRITE, PropertyUtil, TableProperties

parquet_writer_kwargs = _get_parquet_writer_kwargs(table_metadata.properties)
row_group_size = PropertyUtil.property_as_int(
@@ -2034,7 +2033,7 @@ def bin_pack_arrow_table(tbl: pa.Table, target_file_size: int) -> Iterator[List[
return bin_packed_record_batches


def _check_schema_compatible(table_schema: Schema, other_schema: pa.Schema) -> None:
def _check_schema_compatible(table_schema: Schema, other_schema: pa.Schema, downcast_ns_timestamp_to_us: bool = False) -> None:
"""
Check if the `table_schema` is compatible with `other_schema`.

@@ -2044,7 +2043,6 @@ def _check_schema_compatible(table_schema: Schema, other_schema: pa.Schema) -> None:
ValueError: If the schemas are not compatible.
"""
name_mapping = table_schema.name_mapping
downcast_ns_timestamp_to_us = Config().get_bool(DOWNCAST_NS_TIMESTAMP_TO_US_ON_WRITE) or False
try:
task_schema = pyarrow_to_schema(
other_schema, name_mapping=name_mapping, downcast_ns_timestamp_to_us=downcast_ns_timestamp_to_us
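With the signature change above, `_check_schema_compatible` no longer consults the configuration itself; the caller decides whether nanosecond timestamps may be downcast. A short usage sketch with illustrative schemas (the schemas and the expected `TypeError` follow the test updated in this commit; they are assumptions for illustration, not taken verbatim from the diff):

```python
import pyarrow as pa
import pytest

from pyiceberg.io.pyarrow import _check_schema_compatible
from pyiceberg.schema import Schema
from pyiceberg.types import NestedField, TimestamptzType

# Illustrative pair: an optional Iceberg timestamptz column and an Arrow schema
# that uses 'ns' precision for the same column.
table_schema = Schema(NestedField(1, "quux", TimestamptzType(), required=False))
arrow_schema = pa.schema([("quux", pa.timestamp("ns", tz="UTC"))])

# Without the flag, the 'ns' precision is rejected during conversion.
with pytest.raises(TypeError):
    _check_schema_compatible(table_schema, other_schema=arrow_schema)

# With the flag, the 'ns' field is treated as a microsecond timestamptz for the
# compatibility check, so this call is expected to pass.
_check_schema_compatible(table_schema, other_schema=arrow_schema, downcast_ns_timestamp_to_us=True)
```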
14 changes: 10 additions & 4 deletions pyiceberg/table/__init__.py
@@ -150,6 +150,7 @@
)
from pyiceberg.utils.bin_packing import ListPacker
from pyiceberg.utils.concurrent import ExecutorFactory
from pyiceberg.utils.config import Config
from pyiceberg.utils.datetime import datetime_to_millis
from pyiceberg.utils.deprecated import deprecated
from pyiceberg.utils.singleton import _convert_to_hashable_type
@@ -166,6 +167,7 @@
ALWAYS_TRUE = AlwaysTrue()
TABLE_ROOT_ID = -1
_JAVA_LONG_MAX = 9223372036854775807
DOWNCAST_NS_TIMESTAMP_TO_US_ON_WRITE = "downcast-ns-timestamp-to-us-on-write"


class TableProperties:
@@ -478,8 +480,10 @@ def append(self, df: pa.Table, snapshot_properties: Dict[str, str] = EMPTY_DICT)
raise ValueError(
f"Not all partition types are supported for writes. Following partitions cannot be written using pyarrow: {unsupported_partitions}."
)

_check_schema_compatible(self._table.schema(), other_schema=df.schema)
downcast_ns_timestamp_to_us = Config().get_bool(DOWNCAST_NS_TIMESTAMP_TO_US_ON_WRITE) or False
_check_schema_compatible(
self._table.schema(), other_schema=df.schema, downcast_ns_timestamp_to_us=downcast_ns_timestamp_to_us
)
# cast if the two schemas are compatible but not equal
table_arrow_schema = self._table.schema().as_arrow()
if table_arrow_schema != df.schema:
@@ -537,8 +541,10 @@ def overwrite(
raise ValueError(
f"Not all partition types are supported for writes. Following partitions cannot be written using pyarrow: {unsupported_partitions}."
)

_check_schema_compatible(self._table.schema(), other_schema=df.schema)
downcast_ns_timestamp_to_us = Config().get_bool(DOWNCAST_NS_TIMESTAMP_TO_US_ON_WRITE) or False
_check_schema_compatible(
self._table.schema(), other_schema=df.schema, downcast_ns_timestamp_to_us=downcast_ns_timestamp_to_us
)
# cast if the two schemas are compatible but not equal
table_arrow_schema = self._table.schema().as_arrow()
if table_arrow_schema != df.schema:
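The `append` and `overwrite` paths now resolve the flag from the global `Config` at the call site and forward it to `_check_schema_compatible`. Assuming pyiceberg's usual configuration conventions apply to this key (a `~/.pyiceberg.yaml` file or `PYICEBERG_`-prefixed environment variables; only `Config().get_bool` itself appears in this diff), the behavior could be toggled like this:

```python
import os

from pyiceberg.table import DOWNCAST_NS_TIMESTAMP_TO_US_ON_WRITE
from pyiceberg.utils.config import Config

# Assumed env-var mapping (PYICEBERG_ prefix, dashes become underscores); this is an
# assumption about Config conventions, not something shown in the diff itself.
os.environ["PYICEBERG_DOWNCAST_NS_TIMESTAMP_TO_US_ON_WRITE"] = "true"

# This mirrors how the changed append/overwrite code resolves the flag before
# calling _check_schema_compatible.
downcast_ns_timestamp_to_us = Config().get_bool(DOWNCAST_NS_TIMESTAMP_TO_US_ON_WRITE) or False
print(downcast_ns_timestamp_to_us)  # True if the env-var assumption holds
```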
34 changes: 14 additions & 20 deletions tests/integration/test_add_files.py
@@ -17,6 +17,7 @@
# pylint:disable=redefined-outer-name

import os
import re
from datetime import date
from typing import Iterator

@@ -569,7 +570,7 @@ def test_add_files_with_large_and_regular_schema(spark: SparkSession, session_ca
assert table_schema == arrow_schema_large


def test_timestamp_tz_ns_downcast_on_read(session_catalog: Catalog, format_version: int, mocker: MockerFixture) -> None:
def test_add_files_with_timestamp_tz_ns_fails(session_catalog: Catalog, format_version: int, mocker: MockerFixture) -> None:
nanoseconds_schema_iceberg = Schema(NestedField(1, "quux", TimestamptzType()))

nanoseconds_schema = pa.schema([
@@ -600,25 +601,18 @@ def test_timestamp_tz_ns_downcast_on_read(session_catalog: Catalog, format_versi
partition_spec=PartitionSpec(),
)

file_paths = [f"s3://warehouse/default/test_timestamp_tz/v{format_version}/test-{i}.parquet" for i in range(5)]
file_path = f"s3://warehouse/default/test_timestamp_tz/v{format_version}/test.parquet"
# write parquet files
for file_path in file_paths:
fo = tbl.io.new_output(file_path)
with fo.create(overwrite=True) as fos:
with pq.ParquetWriter(fos, schema=nanoseconds_schema) as writer:
writer.write_table(arrow_table)
fo = tbl.io.new_output(file_path)
with fo.create(overwrite=True) as fos:
with pq.ParquetWriter(fos, schema=nanoseconds_schema) as writer:
writer.write_table(arrow_table)

# add the parquet files as data files
tbl.add_files(file_paths=file_paths)

assert tbl.scan().to_arrow() == pa.concat_tables(
[
arrow_table.cast(
pa.schema([
("quux", pa.timestamp("us", tz="UTC")),
]),
safe=False,
)
]
* 5
)
with pytest.raises(
TypeError,
match=re.escape(
"Iceberg does not yet support 'ns' timestamp precision. Use 'downcast-ns-timestamp-to-us-on-write' configuration property to automatically downcast 'ns' to 'us' on write."
),
):
tbl.add_files(file_paths=[file_path])