Changes from 1 commit

save
jqin61 committed Apr 5, 2024
commit e320b73f9548743d9d9c24b6a6e3155011e9d3a9
16 changes: 16 additions & 0 deletions pyiceberg/expressions/__init__.py
@@ -383,6 +383,10 @@ def __repr__(self) -> str:
    @abstractmethod
    def as_bound(self) -> Type[BoundUnaryPredicate[Any]]: ...

    def __hash__(self) -> int:
        """Return hash value of the UnaryPredicate class."""
        return hash(str(self))


class BoundUnaryPredicate(BoundPredicate[L], ABC):
    def __repr__(self) -> str:
@@ -412,6 +416,10 @@ def __invert__(self) -> BoundNotNull[L]:
    def as_unbound(self) -> Type[IsNull]:
        return IsNull

    def __hash__(self) -> int:
        """Return hash value of the BoundIsNull class."""
        return hash(str(self))


class BoundNotNull(BoundUnaryPredicate[L]):
    def __new__(cls, term: BoundTerm[L]):  # type: ignore # pylint: disable=W0221
@@ -698,6 +706,10 @@ def __repr__(self) -> str:
    @abstractmethod
    def as_bound(self) -> Type[BoundLiteralPredicate[L]]: ...

    def __hash__(self) -> int:
        """Return hash value of the LiteralPredicate class."""
        return hash(str(self))


class BoundLiteralPredicate(BoundPredicate[L], ABC):
    literal: Literal[L]
@@ -731,6 +743,10 @@ def __invert__(self) -> BoundNotEqualTo[L]:
    def as_unbound(self) -> Type[EqualTo[L]]:
        return EqualTo

    def __hash__(self) -> int:
        """Return hash value of the BoundEqualTo class."""
        return hash(str(self))


class BoundNotEqualTo(BoundLiteralPredicate[L]):
    def __invert__(self) -> BoundEqualTo[L]:
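
For context on these additions: in Python, a class that defines __eq__ without also defining __hash__ becomes unhashable, and the str(self)-based __hash__ methods above restore hashability so predicates can be deduplicated in sets or used as dict keys. A minimal sketch of the effect, assuming pyiceberg's public predicate constructors:

from pyiceberg.expressions import EqualTo, IsNull

# Structurally identical predicates now collapse to a single set entry,
# e.g. when deduplicating filters before planning a scan or write.
filters = {EqualTo("region", "us"), EqualTo("region", "us"), IsNull("region")}
assert len(filters) == 2
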
4 changes: 4 additions & 0 deletions pyiceberg/io/pyarrow.py
@@ -1777,6 +1777,10 @@ def write_parquet(task: WriteTask) -> DataFile:
    with fo.create(overwrite=True) as fos:
        with pq.ParquetWriter(fos, schema=arrow_file_schema, **parquet_writer_kwargs) as writer:
            writer.write(pa.Table.from_batches(task.record_batches), row_group_size=row_group_size)
            arrow_table = pa.Table.from_batches(task.record_batches)
            # align columns with the iceberg table schema in case the input arrow table has them in a different order

[Review comment from a Contributor]
Can you provide an example of when this would happen? This only handles top-level columns.

            df_to_write = arrow_table.select(arrow_file_schema.names)
            writer.write_table(df_to_write, row_group_size=row_group_size)

    statistics = data_file_statistics_from_parquet_metadata(
        parquet_metadata=writer.writer.metadata,
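
On the reviewer's question above: a minimal sketch of the top-level reordering that arrow_table.select(arrow_file_schema.names) performs. The schema and column names here are hypothetical, not taken from the PR:

import pyarrow as pa

# Hypothetical case: the iceberg file schema expects (id, name), but the
# caller built its record batches with the columns in the opposite order.
arrow_file_schema = pa.schema([("id", pa.int64()), ("name", pa.string())])
arrow_table = pa.table({"name": ["a", "b"], "id": [1, 2]})

# Table.select() reorders (and projects) top-level columns to match the
# target schema; fields nested inside a struct column keep their original
# order, which is the limitation the reviewer points out.
df_to_write = arrow_table.select(arrow_file_schema.names)
assert df_to_write.column_names == ["id", "name"]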