make test work
kevinjqliu committed Apr 28, 2024
commit ecec57e1b823a29a76f1c15477cc6958e70105ee
3 changes: 2 additions & 1 deletion pyiceberg/table/snapshots.py
@@ -226,7 +226,8 @@ def __eq__(self, other: Any) -> bool:
 class Snapshot(IcebergBaseModel):
     snapshot_id: int = Field(alias="snapshot-id")
     parent_snapshot_id: Optional[int] = Field(alias="parent-snapshot-id", default=None)
-    sequence_number: Optional[int] = Field(alias="sequence-number", default=None)
+    # cannot import `INITIAL_SEQUENCE_NUMBER` due to circular import
+    sequence_number: Optional[int] = Field(alias="sequence-number", default=0)
Collaborator:
Is there a reason the default value for the sequence number has to be changed to 0 as opposed to None?

Contributor Author:
According to the spec, https://iceberg.apache.org/spec/#version-2

Snapshot JSON:
sequence-number was added and is required; default to 0 when reading v1 metadata

Also added this in the PR description
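
(Illustration, not part of the PR: a quick sketch of what this read-side default means in practice. It assumes pydantic v2 and that `snapshot-id` is the only required field on `Snapshot`.)

```python
# Hedged sketch: parsing v1-style snapshot JSON that omits "sequence-number"
# yields 0 under the new default, instead of None.
from pyiceberg.table.snapshots import Snapshot

v1_snapshot = Snapshot.model_validate_json('{"snapshot-id": 123}')
print(v1_snapshot.sequence_number)  # 0 with default=0; was None before this change
```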

Contributor:
@kevinjqliu Thanks for spotting this! We definitely need to read snapshot.sequence_number as 0 for v1. However, as we have observed in the test outcome, making sequence_number default to 0 here leads to sequence_number=0 being written to version 1 table metadata's snapshots, which is not allowed by the spec:

Writing v1 metadata:
Snapshot field sequence-number should not be written

I think we may need a new field_serializer in the TableMetadataCommonFields class, or some other way to correct the behavior on write. WDYT?

Contributor Author:
Thank you! I missed the part about the V1 spec. Following your suggestion, I added a field_serializer for TableMetadataCommonFields. This ensures that the Snapshot pydantic object will not include the sequence-number field for the V1 format.
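
(Illustration, not the actual change in this PR: a minimal sketch of the pydantic `field_serializer` mechanism on a simplified v1-style model, dropping `sequence-number` from snapshots on write. The class `TableMetadataV1Sketch` and its fields are stand-ins; per the comment above, the real serializer lives on `TableMetadataCommonFields`.)

```python
# Minimal sketch, assuming pydantic v2; not the real TableMetadataCommonFields code.
from typing import Any, Dict, List, Optional

from pydantic import BaseModel, Field, field_serializer


class Snapshot(BaseModel):
    snapshot_id: int = Field(alias="snapshot-id")
    # Per the spec: default to 0 when reading v1 metadata.
    sequence_number: Optional[int] = Field(alias="sequence-number", default=0)


class TableMetadataV1Sketch(BaseModel):
    """Simplified stand-in for a v1 table-metadata model."""

    format_version: int = Field(alias="format-version", default=1)
    snapshots: List[Snapshot] = Field(default_factory=list)

    @field_serializer("snapshots")
    def serialize_snapshots(self, snapshots: List[Snapshot]) -> List[Dict[str, Any]]:
        # v1 spec: the snapshot field `sequence-number` should not be written.
        return [s.model_dump(by_alias=True, exclude={"sequence_number"}) for s in snapshots]


metadata = TableMetadataV1Sketch(snapshots=[Snapshot(**{"snapshot-id": 123})])
print(metadata.model_dump(by_alias=True))  # snapshot entries contain no "sequence-number"
```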

     timestamp_ms: int = Field(alias="timestamp-ms", default_factory=lambda: int(time.time() * 1000))
     manifest_list: Optional[str] = Field(
         alias="manifest-list", description="Location of the snapshot's manifest list file", default=None
16 changes: 14 additions & 2 deletions tests/integration/test_inspect_table.py
@@ -451,6 +451,8 @@ def check_pyiceberg_df_equals_spark_df(df: pa.Table, spark_df: DataFrame) -> Non
 def test_inspect_metadata_log_entries(
     spark: SparkSession, session_catalog: Catalog, arrow_table_with_null: pa.Table, format_version: int
 ) -> None:
+    from pandas.testing import assert_frame_equal
+
     identifier = "default.table_metadata_log_entries"
     tbl = _create_table(session_catalog, identifier, properties={"format-version": format_version})

@@ -463,5 +465,15 @@ def test_inspect_metadata_log_entries(
     spark_df = spark.sql(f"SELECT * FROM {identifier}.metadata_log_entries")
     lhs = df.to_pandas()
     rhs = spark_df.toPandas()
-    breakpoint()
-    assert lhs.equals(rhs), lhs.compare(rhs)
+
+    # The timestamp in the last row of the `metadata_log_entries` table is based on when the table was read.
+    # Therefore, the timestamps in the pyiceberg dataframe and the spark dataframe will differ.
+    left_before_last, left_last = lhs[:-1], lhs[-1:]
+    right_before_last, right_last = rhs[:-1], rhs[-1:]
+
+    assert_frame_equal(left_before_last, right_before_last, check_dtype=False)
+    for column in df.column_names:
+        for left, right in zip(left_last[column], right_last[column]):
+            if column == 'timestamp':
+                continue
+            assert left == right, f"Difference in column {column}: {left} != {right}"