Skip to content
Merged
Changes from 1 commit
Commits
Show all changes
59 commits
Select commit Hold shift + click to select a range
fccb74b
test
mattmartin14 Jan 14, 2025
7298589
unit testing
mattmartin14 Jan 14, 2025
25bc9cf
adding unit tests
mattmartin14 Jan 14, 2025
af6c868
adding unit tests
mattmartin14 Jan 14, 2025
94be807
adding unit tests
mattmartin14 Jan 14, 2025
269d9f5
adding unit tests
mattmartin14 Jan 15, 2025
f44c61a
adding unit tests
mattmartin14 Jan 15, 2025
a96fdf9
finished unit tests
mattmartin14 Jan 16, 2025
fa5ab35
removed unnecessary return
mattmartin14 Jan 16, 2025
cfa2277
updated poetry manifest list for datafusion package dependency
mattmartin14 Jan 17, 2025
35f29be
added license headers, cleaned up dead code
mattmartin14 Jan 22, 2025
6c68d0d
updated the merge function to use bools for matched and not matched rows
mattmartin14 Jan 29, 2025
2d1e8ae
incorporated changes for boolExpression. It simplified the filters a lot
mattmartin14 Jan 31, 2025
f988f25
moved the filter build function to a separate function to accommodate …
mattmartin14 Jan 31, 2025
43393b4
removed unnecessary comment
mattmartin14 Jan 31, 2025
9a561b4
removed test files
mattmartin14 Jan 31, 2025
9ef39a6
bug fixes and removed some more dependency on datafusion
mattmartin14 Feb 3, 2025
2ba1ed6
updated various items including adding a dataclass return result
mattmartin14 Feb 4, 2025
a42eecd
updated merge_rows to remove dependency from datafusion! wahoo
mattmartin14 Feb 4, 2025
1305f58
renamed merge_rows to upsert, removed unnecessary code. will put in f…
mattmartin14 Feb 5, 2025
b2be3db
adding params to unit testing for pytest; having some errors
mattmartin14 Feb 5, 2025
f5688ad
fixed bugs on unit testing; added context wrapper for txn; fixed vari…
mattmartin14 Feb 5, 2025
7d55a4e
bug fixes
mattmartin14 Feb 6, 2025
2e14767
updated some error throwing items
mattmartin14 Feb 6, 2025
85c5848
moved datafusion to just a dev dependency in poetry toml
mattmartin14 Feb 6, 2025
6472071
updated UpsertRow class to be recognized in the return statement
mattmartin14 Feb 6, 2025
51c34da
removed some spaces and streamlined assert statements in unit testing
mattmartin14 Feb 6, 2025
862a69a
updated test cases to use an InMemory catalog
mattmartin14 Feb 7, 2025
3731b86
updated some formatting; added more commentary on the rows_to_update …
mattmartin14 Feb 7, 2025
bbb35d6
rebased poetry lock file and pyproject.toml file; removed sf repo info
mattmartin14 Feb 10, 2025
c8189c9
Merge branch 'main' into main
mattmartin14 Feb 10, 2025
02af4d4
updated equality checks with not instead of == false
mattmartin14 Feb 10, 2025
cc75192
ran ruff check --fix
mattmartin14 Feb 10, 2025
998d98b
manually added lint fixes and updated poetry toml and lock files. tha…
mattmartin14 Feb 11, 2025
513c839
added formatting fixes
mattmartin14 Feb 11, 2025
0fd6446
remove the node_modules
mattmartin14 Feb 11, 2025
5fc3478
updated code for another round of fixes
mattmartin14 Feb 11, 2025
6cef789
removed npm unneeded files
mattmartin14 Feb 11, 2025
40b69b8
fixed formatting on upsert function for docs build
mattmartin14 Feb 12, 2025
804c526
Merge branch 'main' into main
mattmartin14 Feb 12, 2025
09e0347
rebased for poetry lock files
mattmartin14 Feb 12, 2025
ca2d904
updated lock files. thanks kevin
mattmartin14 Feb 12, 2025
77375fb
fixed other changes
mattmartin14 Feb 12, 2025
ba4db49
fixed gitignore file
mattmartin14 Feb 12, 2025
622e66c
no whitespace
mattmartin14 Feb 12, 2025
9e79dad
fixed vendor fb file from kevins changes
mattmartin14 Feb 12, 2025
4cbf3e3
reverting vendor changes
mattmartin14 Feb 12, 2025
5333a1e
removing node modules
mattmartin14 Feb 12, 2025
11a25be
updating vendor files
mattmartin14 Feb 12, 2025
03a8d10
Update vendor/fb303/FacebookService.py
mattmartin14 Feb 12, 2025
8a2143c
updated vendor files
mattmartin14 Feb 12, 2025
e719cf8
updated vendor files
mattmartin14 Feb 12, 2025
245b4a9
attempting to update poetry files
mattmartin14 Feb 12, 2025
e3e9611
Merge branch 'main' into main
mattmartin14 Feb 12, 2025
e575b3c
restore vendor/
kevinjqliu Feb 13, 2025
e4e530f
restore pyproject.toml
kevinjqliu Feb 13, 2025
2ff2083
poetry lock
kevinjqliu Feb 13, 2025
8585d2d
add datafusion to tool.mypy.overrides
kevinjqliu Feb 13, 2025
f673b70
Merge remote-tracking branch 'apache/main' into StateFarmIns/main
kevinjqliu Feb 13, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Prev Previous commit
Next Next commit
finished unit tests
  • Loading branch information
mattmartin14 committed Jan 16, 2025
commit a96fdf90bb6cbab9c392093a2621d4f583c6e765
132 changes: 88 additions & 44 deletions tests/table/test_merge_rows.py
Original file line number Diff line number Diff line change
@@ -1,25 +1,5 @@
## unit test for merging rows
## unit tests for merging rows

## todo
"""
simplify this unit testing to reusable functions
check with how the other unit tests are flagged and add accordlingly
fix the warehouse path; its not creating it in this test module
wonder if the pyiceberg team already has a warehouse folder to stash all these tests in?
add a show_table function to visually see the new table
add a function to nuke the warehouse folder to cleanup

update these functions to all start with "test_"

test_1: single update/insert
test_2: scale to 1k updates/inserts on single key
test_3: test update only
test_4: test insert only
test_5: test no update or insert
test_6: composite key update/insert 100 rows
test_7: composite key update/insert 1000 rows

"""

from datafusion import SessionContext
from pyiceberg.catalog.sql import SqlCatalog
Expand Down Expand Up @@ -114,7 +94,7 @@ def gen_target_iceberg_table(start_row: int, end_row: int, composite_key: bool):

return table

def test_merge_scenario_1():
def test_merge_scenario_1_simple():

"""
tests a single insert and update
Expand All @@ -124,31 +104,34 @@ def test_merge_scenario_1():
source_df = gen_source_dataset(2, 3, False, False)


res = table.merge_rows(source_df, ["order_id"])
res = table.merge_rows(df=source_df, join_cols=["order_id"])

rows_updated_should_be = 1
rows_inserted_should_be = 1

assert res['rows_updated'] == 1, f"rows updated should be 1, but got {res['rows_updated']}"
assert res['rows_inserted'] == 1, f"rows inserted should be 1, but got {res['rows_inserted']}"
assert res['rows_updated'] == rows_updated_should_be, f"rows updated should be {rows_updated_should_be}, but got {res['rows_updated']}"
assert res['rows_inserted'] == rows_inserted_should_be, f"rows inserted should be {rows_inserted_should_be}, but got {res['rows_inserted']}"

#print(res)

#show_iceberg_table(table)

purge_warehouse()

def test_merge_scenario_2_10k_rows():
    """
    tests merging 10000 rows on a single key to simulate larger workload

    Target holds order_ids 1-10000; the source covers 5001-15000, so the
    overlap (5001-10000) should update and the remainder (10001-15000)
    should insert.
    """

    table = gen_target_iceberg_table(1, 10000, False)
    source_df = gen_source_dataset(5001, 15000, False, False)

    res = table.merge_rows(df=source_df, join_cols=["order_id"])

    rows_updated_should_be = 5000
    rows_inserted_should_be = 5000

    assert res['rows_updated'] == rows_updated_should_be, f"rows updated should be {rows_updated_should_be}, but got {res['rows_updated']}"
    assert res['rows_inserted'] == rows_inserted_should_be, f"rows inserted should be {rows_inserted_should_be}, but got {res['rows_inserted']}"

    purge_warehouse()

def test_merge_scenario_3():
def test_merge_scenario_3_composite_key():

"""
tests merging 200 rows with a composite key
Expand All @@ -167,7 +150,7 @@ def test_merge_scenario_3():
source_df = gen_source_dataset(101, 300, True, False)


res = table.merge_rows(source_df, ["order_id", "order_line_id"])
res = table.merge_rows(df=source_df, join_cols=["order_id", "order_line_id"])

#print(res)

Expand All @@ -181,6 +164,49 @@ def test_merge_scenario_3():

purge_warehouse()

def test_merge_update_only():
    """
    tests explicit merge options to do update only

    With inserts disabled, only the overlapping keys (501-1000) are
    touched; the unmatched source rows (1001-1500) are ignored.
    """

    target = gen_target_iceberg_table(1, 1000, False)
    src = gen_source_dataset(501, 1500, False, False)

    # update-only merge: matched rows update, unmatched source rows are skipped
    res = target.merge_rows(
        df=src,
        join_cols=["order_id"],
        merge_options={'when_matched_update_all': True, 'when_not_matched_insert_all': False},
    )

    expected_updates, expected_inserts = 500, 0

    assert res['rows_updated'] == expected_updates, f"rows updated should be {expected_updates}, but got {res['rows_updated']}"
    assert res['rows_inserted'] == expected_inserts, f"rows inserted should be {expected_inserts}, but got {res['rows_inserted']}"

    purge_warehouse()

def test_merge_insert_only():
    """
    tests explicit merge options to do insert only

    With updates disabled, the overlapping keys (501-1000) are left
    untouched; only the new keys (1001-1500) are appended.
    """

    target = gen_target_iceberg_table(1, 1000, False)
    src = gen_source_dataset(501, 1500, False, False)

    # insert-only merge: matched rows are skipped, unmatched source rows insert
    res = target.merge_rows(
        df=src,
        join_cols=["order_id"],
        merge_options={'when_matched_update_all': False, 'when_not_matched_insert_all': True},
    )

    expected_updates, expected_inserts = 0, 500

    assert res['rows_updated'] == expected_updates, f"rows updated should be {expected_updates}, but got {res['rows_updated']}"
    assert res['rows_inserted'] == expected_inserts, f"rows inserted should be {expected_inserts}, but got {res['rows_inserted']}"

    purge_warehouse()

def test_merge_source_dups():

"""
Expand All @@ -190,7 +216,7 @@ def test_merge_source_dups():
table = gen_target_iceberg_table(1, 10, False)
source_df = gen_source_dataset(5, 15, False, True)

res = table.merge_rows(source_df, ["order_id"])
res = table.merge_rows(df=source_df, join_cols=["order_id"])

error_msgs = res['error_msgs']

Expand All @@ -200,18 +226,36 @@ def test_merge_source_dups():

purge_warehouse()

def test_key_cols_misaligned():
    """
    tests join columns missing from one of the tables

    The target table is keyed on 'order_id' but the source frame only has
    'item_id', so merge_rows should surface an error message instead of
    performing the merge.
    """

    # generate dummy target iceberg table
    df = ctx.sql("select 1 as order_id, date '2021-01-01' as order_date, 'A' as order_type").to_arrow_table()

    catalog = get_sql_catalog(test_namespace)
    table = catalog.create_table(f"{test_namespace}.target", df.schema)

    table.append(df)

    # source deliberately lacks the 'order_id' join column
    df_src = ctx.sql("select 1 as item_id, date '2021-05-01' as order_date, 'B' as order_type").to_arrow_table()

    res = table.merge_rows(df=df_src, join_cols=['order_id'])
    error_msgs = res['error_msgs']

    assert 'Join columns missing in tables' in error_msgs, f"error message should contain 'Join columns missing in tables', but got {error_msgs}"

    purge_warehouse()

if __name__ == "__main__":
    # run the full suite directly (outside pytest); only the renamed test
    # functions exist, so the old test_merge_scenario_N names must not be called
    test_merge_scenario_1_simple()
    test_merge_scenario_2_10k_rows()
    test_merge_scenario_3_composite_key()
    test_merge_update_only()
    test_merge_insert_only()
    test_merge_source_dups()
    test_key_cols_misaligned()