Skip to content
Merged
Changes from 1 commit
Commits
Show all changes
59 commits
Select commit Hold shift + click to select a range
fccb74b
test
mattmartin14 Jan 14, 2025
7298589
unit testing
mattmartin14 Jan 14, 2025
25bc9cf
adding unit tests
mattmartin14 Jan 14, 2025
af6c868
adding unit tests
mattmartin14 Jan 14, 2025
94be807
adding unit tests
mattmartin14 Jan 14, 2025
269d9f5
adding unit tests
mattmartin14 Jan 15, 2025
f44c61a
adding unit tests
mattmartin14 Jan 15, 2025
a96fdf9
finished unit tests
mattmartin14 Jan 16, 2025
fa5ab35
removed unnecessary return
mattmartin14 Jan 16, 2025
cfa2277
updated poetry manifest list for datafusion package dependency
mattmartin14 Jan 17, 2025
35f29be
added license headers, cleaned up dead code
mattmartin14 Jan 22, 2025
6c68d0d
updated the merge function to use bools for matched and not matched rows
mattmartin14 Jan 29, 2025
2d1e8ae
incorporated changes for boolExpression. It simplified the filters a lot
mattmartin14 Jan 31, 2025
f988f25
moved the filter build function to a separate function to accommodate …
mattmartin14 Jan 31, 2025
43393b4
removed unnecessary comment
mattmartin14 Jan 31, 2025
9a561b4
removed test files
mattmartin14 Jan 31, 2025
9ef39a6
bug fixes and removed some more dependency on datafusion
mattmartin14 Feb 3, 2025
2ba1ed6
updated various items including adding a dataclass return result
mattmartin14 Feb 4, 2025
a42eecd
updated merge_rows to remove dependency from datafusion! wahoo
mattmartin14 Feb 4, 2025
1305f58
renamed merge_rows to upsert, removed unnecessary code. will put in f…
mattmartin14 Feb 5, 2025
b2be3db
adding params to unit testing for pytest; having some errors
mattmartin14 Feb 5, 2025
f5688ad
fixed bugs on unit testing; added context wrapper for txn; fixed vari…
mattmartin14 Feb 5, 2025
7d55a4e
bug fixes
mattmartin14 Feb 6, 2025
2e14767
updated some error throwing items
mattmartin14 Feb 6, 2025
85c5848
moved datafusion to just a dev dependency in poetry toml
mattmartin14 Feb 6, 2025
6472071
updated UpsertRow class to be recognized in the return statement
mattmartin14 Feb 6, 2025
51c34da
removed some spaces and streamlined assert statements in unit testing
mattmartin14 Feb 6, 2025
862a69a
updated test cases to use an InMemory catalog
mattmartin14 Feb 7, 2025
3731b86
updated some formatting; added more commentary on the rows_to_update …
mattmartin14 Feb 7, 2025
bbb35d6
rebased poetry lock file and pyproject.toml file; removed sf repo info
mattmartin14 Feb 10, 2025
c8189c9
Merge branch 'main' into main
mattmartin14 Feb 10, 2025
02af4d4
updated equality checks with not instead of == false
mattmartin14 Feb 10, 2025
cc75192
ran ruff check --fix
mattmartin14 Feb 10, 2025
998d98b
manually added lint fixes and updated poetry toml and lock files. tha…
mattmartin14 Feb 11, 2025
513c839
added formatting fixes
mattmartin14 Feb 11, 2025
0fd6446
remove the node_modules
mattmartin14 Feb 11, 2025
5fc3478
updated code for another round of fixes
mattmartin14 Feb 11, 2025
6cef789
removed npm unneeded files
mattmartin14 Feb 11, 2025
40b69b8
fixed formatting on upsert function for docs build
mattmartin14 Feb 12, 2025
804c526
Merge branch 'main' into main
mattmartin14 Feb 12, 2025
09e0347
rebased for poetry lock files
mattmartin14 Feb 12, 2025
ca2d904
updated lock files. thanks kevin
mattmartin14 Feb 12, 2025
77375fb
fixed other changes
mattmartin14 Feb 12, 2025
ba4db49
fixed gitignore file
mattmartin14 Feb 12, 2025
622e66c
no whitespace
mattmartin14 Feb 12, 2025
9e79dad
fixed vendor fb file from kevins changes
mattmartin14 Feb 12, 2025
4cbf3e3
reverting vendor changes
mattmartin14 Feb 12, 2025
5333a1e
removing node modules
mattmartin14 Feb 12, 2025
11a25be
updating vendor files
mattmartin14 Feb 12, 2025
03a8d10
Update vendor/fb303/FacebookService.py
mattmartin14 Feb 12, 2025
8a2143c
updated vendor files
mattmartin14 Feb 12, 2025
e719cf8
updated vendor files
mattmartin14 Feb 12, 2025
245b4a9
attempting to update poetry files
mattmartin14 Feb 12, 2025
e3e9611
Merge branch 'main' into main
mattmartin14 Feb 12, 2025
e575b3c
restore vendor/
kevinjqliu Feb 13, 2025
e4e530f
restore pyproject.toml
kevinjqliu Feb 13, 2025
2ff2083
poetry lock
kevinjqliu Feb 13, 2025
8585d2d
add datafusion to tool.mypy.overrides
kevinjqliu Feb 13, 2025
f673b70
Merge remote-tracking branch 'apache/main' into StateFarmIns/main
kevinjqliu Feb 13, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Prev Previous commit
Next Next commit
finished unit tests
  • Loading branch information
mattmartin14 committed Jan 16, 2025
commit a96fdf90bb6cbab9c392093a2621d4f583c6e765
132 changes: 88 additions & 44 deletions tests/table/test_merge_rows.py
Original file line number Diff line number Diff line change
@@ -1,25 +1,5 @@
## unit test for merging rows
## unit tests for merging rows

## todo
"""
simplify this unit testing to reusable functions
check with how the other unit tests are flagged and add accordlingly
fix the warehouse path; its not creating it in this test module
wonder if the pyiceberg team already has a warehouse folder to stash all these tests in?
add a show_table function to visually see the new table
add a function to nuke the warehouse folder to cleanup

update these functions to all start with "test_"

test_1: single update/insert
test_2: scale to 1k updates/inserts on single key
test_3: test update only
test_4: test insert only
test_5: test no update or insert
test_6: composite key update/insert 100 rows
test_7: composite key update/insert 1000 rows

"""

from datafusion import SessionContext
from pyiceberg.catalog.sql import SqlCatalog
Expand Down Expand Up @@ -114,7 +94,7 @@ def gen_target_iceberg_table(start_row: int, end_row: int, composite_key: bool):

return table

def test_merge_scenario_1():
def test_merge_scenario_1_simple():

"""
tests a single insert and update
Expand All @@ -124,31 +104,34 @@ def test_merge_scenario_1():
source_df = gen_source_dataset(2, 3, False, False)


res = table.merge_rows(source_df, ["order_id"])
res = table.merge_rows(df=source_df, join_cols=["order_id"])

rows_updated_should_be = 1
rows_inserted_should_be = 1

assert res['rows_updated'] == 1, f"rows updated should be 1, but got {res['rows_updated']}"
assert res['rows_inserted'] == 1, f"rows inserted should be 1, but got {res['rows_inserted']}"
assert res['rows_updated'] == rows_updated_should_be, f"rows updated should be {rows_updated_should_be}, but got {res['rows_updated']}"
assert res['rows_inserted'] == rows_inserted_should_be, f"rows inserted should be {rows_inserted_should_be}, but got {res['rows_inserted']}"

#print(res)

#show_iceberg_table(table)

purge_warehouse()

def test_merge_scenario_2_10k_rows():
    """
    tests merging 10000 rows on a single key to simulate larger workload

    Target holds order_ids 1-10000; the source covers 5001-15000, so the
    overlap (5001-10000) should update and the remainder (10001-15000)
    should insert.
    """

    table = gen_target_iceberg_table(1, 10000, False)
    source_df = gen_source_dataset(5001, 15000, False, False)

    res = table.merge_rows(df=source_df, join_cols=["order_id"])

    rows_updated_should_be = 5000
    rows_inserted_should_be = 5000

    assert res['rows_updated'] == rows_updated_should_be, f"rows updated should be {rows_updated_should_be}, but got {res['rows_updated']}"
    assert res['rows_inserted'] == rows_inserted_should_be, f"rows inserted should be {rows_inserted_should_be}, but got {res['rows_inserted']}"

    purge_warehouse()

def test_merge_scenario_3():
def test_merge_scenario_3_composite_key():

"""
tests merging 200 rows with a composite key
Expand All @@ -167,7 +150,7 @@ def test_merge_scenario_3():
source_df = gen_source_dataset(101, 300, True, False)


res = table.merge_rows(source_df, ["order_id", "order_line_id"])
res = table.merge_rows(df=source_df, join_cols=["order_id", "order_line_id"])

#print(res)

Expand All @@ -181,6 +164,49 @@ def test_merge_scenario_3():

purge_warehouse()

def test_merge_update_only():
    """
    tests explicit merge options to do update only

    With inserts disabled, only the overlapping keys (501-1000) are
    touched; the unmatched source rows (1001-1500) are ignored.
    """

    target = gen_target_iceberg_table(1, 1000, False)
    src = gen_source_dataset(501, 1500, False, False)

    # update-only merge: matched rows update, unmatched source rows are skipped
    res = target.merge_rows(
        df=src,
        join_cols=["order_id"],
        merge_options={'when_matched_update_all': True, 'when_not_matched_insert_all': False},
    )

    expected_updates, expected_inserts = 500, 0

    assert res['rows_updated'] == expected_updates, f"rows updated should be {expected_updates}, but got {res['rows_updated']}"
    assert res['rows_inserted'] == expected_inserts, f"rows inserted should be {expected_inserts}, but got {res['rows_inserted']}"

    purge_warehouse()

def test_merge_insert_only():
    """
    tests explicit merge options to do insert only

    With updates disabled, the overlapping keys (501-1000) are left
    untouched; only the new keys (1001-1500) are appended.
    """

    target = gen_target_iceberg_table(1, 1000, False)
    src = gen_source_dataset(501, 1500, False, False)

    # insert-only merge: matched rows are skipped, unmatched source rows insert
    res = target.merge_rows(
        df=src,
        join_cols=["order_id"],
        merge_options={'when_matched_update_all': False, 'when_not_matched_insert_all': True},
    )

    expected_updates, expected_inserts = 0, 500

    assert res['rows_updated'] == expected_updates, f"rows updated should be {expected_updates}, but got {res['rows_updated']}"
    assert res['rows_inserted'] == expected_inserts, f"rows inserted should be {expected_inserts}, but got {res['rows_inserted']}"

    purge_warehouse()

def test_merge_source_dups():

"""
Expand All @@ -190,7 +216,7 @@ def test_merge_source_dups():
table = gen_target_iceberg_table(1, 10, False)
source_df = gen_source_dataset(5, 15, False, True)

res = table.merge_rows(source_df, ["order_id"])
res = table.merge_rows(df=source_df, join_cols=["order_id"])

error_msgs = res['error_msgs']

Expand All @@ -200,18 +226,36 @@ def test_merge_source_dups():

purge_warehouse()

def test_key_cols_misaligned():
    """
    tests join columns missing from one of the tables

    The target table is keyed on 'order_id' but the source frame only has
    'item_id', so merge_rows should surface an error message instead of
    performing the merge.
    """

    # generate dummy target iceberg table
    df = ctx.sql("select 1 as order_id, date '2021-01-01' as order_date, 'A' as order_type").to_arrow_table()

    catalog = get_sql_catalog(test_namespace)
    table = catalog.create_table(f"{test_namespace}.target", df.schema)

    table.append(df)

    # source deliberately lacks the 'order_id' join column
    df_src = ctx.sql("select 1 as item_id, date '2021-05-01' as order_date, 'B' as order_type").to_arrow_table()

    res = table.merge_rows(df=df_src, join_cols=['order_id'])
    error_msgs = res['error_msgs']

    assert 'Join columns missing in tables' in error_msgs, f"error message should contain 'Join columns missing in tables', but got {error_msgs}"

    purge_warehouse()

if __name__ == "__main__":
    # run the full suite directly (outside pytest); only the renamed test
    # functions exist, so the old test_merge_scenario_N names must not be called
    test_merge_scenario_1_simple()
    test_merge_scenario_2_10k_rows()
    test_merge_scenario_3_composite_key()
    test_merge_update_only()
    test_merge_insert_only()
    test_merge_source_dups()
    test_key_cols_misaligned()