Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion python/pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -60,7 +60,8 @@ tests = [
# tensorflow is broken on 2.20.0 with macOS ARM.
"tensorflow<=2.19.0",
"tqdm",
"datafusion",
# Need to align with the datafusion version we use at lance rust
"datafusion==48.0.0",
]
dev = ["ruff==0.4.1", "pyright"]
benchmarks = ["pytest-benchmark"]
Expand Down
207 changes: 189 additions & 18 deletions python/python/tests/test_json.py
Original file line number Diff line number Diff line change
Expand Up @@ -40,35 +40,33 @@ def test_json_basic_write_read():
# Read back the dataset
dataset = lance.dataset(dataset_path)

# Verify schema
# Verify storage schema
assert len(dataset.schema) == 2
assert dataset.schema.field("id").type == pa.int32()

# Check that JSON field is properly recognized
json_field_schema = dataset.schema.field("data")
# Should be PyArrow JSON type
assert json_field_schema.type == pa.json_()
# Check that JSON field is stored as JSONB internally
storage_field = dataset.schema.field("data")
assert storage_field.type == pa.large_binary()
assert storage_field.metadata is not None
assert b"ARROW:extension:name" in storage_field.metadata
assert storage_field.metadata[b"ARROW:extension:name"] == b"lance.json"

# Read data back
result_table = dataset.to_table()

# Check that data is returned as Arrow JSON for Python
result_field = result_table.schema.field("data")
# PyArrow extension types print as extension<arrow.json> but
# the storage type is utf8
assert (
str(result_field.type) == "extension<arrow.json>"
or result_field.type == pa.utf8()
)

# Verify data
assert result_table.num_rows == 5
assert result_table.column("id").to_pylist() == [1, 2, 3, 4, 5]

# Verify the data column is JSON type
data_column = result_table.column("data")
assert data_column.type == pa.json_()

# Verify the JSON data is correctly preserved
for i, expected in enumerate(json_strings):
actual = data_column[i].as_py()
if expected is None:
assert actual is None
else:
# Arrow JSON returns strings, verify they match
assert actual == expected


def test_json_with_other_types():
"""Test JSON type alongside other data types."""
Expand Down Expand Up @@ -184,3 +182,176 @@ def test_json_batch_operations():

for batch in batches:
assert batch.num_rows == batch_size


def test_json_path_queries():
"""Test JSON path queries using json_extract and json_exists."""
# Create test data with JSON columns
json_data = [
{"user": {"name": "Alice", "age": 30}, "tags": ["python", "ml"]},
{"user": {"name": "Bob", "age": 25}, "tags": ["rust", "db"]},
{"user": {"name": "Charlie"}, "tags": []},
None,
]

json_strings = [json.dumps(d) if d is not None else None for d in json_data]
json_arr = pa.array(json_strings, type=pa.json_())

# Create a Lance dataset with JSON data
table = pa.table(
{
"id": [1, 2, 3, 4],
"data": json_arr,
}
)

with tempfile.TemporaryDirectory() as tmpdir:
ds_path = Path(tmpdir) / "json_test.lance"
lance.write_dataset(table, ds_path, data_storage_version="2.2")
dataset = lance.dataset(ds_path)

# Test json_extract
result = dataset.to_table(
filter="json_extract(data, '$.user.name') = '\"Alice\"'"
)
assert result.num_rows == 1
assert result["id"][0].as_py() == 1

# Test json_exists
result = dataset.to_table(filter="json_exists(data, '$.user.age')")
assert result.num_rows == 2 # Alice and Bob have age field

# Test json_array_contains
result = dataset.to_table(
filter="json_array_contains(data, '$.tags', 'python')"
)
assert result.num_rows == 1
assert result["id"][0].as_py() == 1


def test_json_get_functions():
"""Test json_get_* functions for type-safe value extraction."""
# Create test data with various JSON types
json_data = [
{"name": "Alice", "age": 30, "active": True, "score": 95.5},
{"name": "Bob", "age": 25, "active": False, "score": 87.3},
{"name": "Charlie", "age": "35", "active": "true", "score": "92"},
{"name": "David"}, # Missing fields
]

json_strings = [json.dumps(d) for d in json_data]
json_arr = pa.array(json_strings, type=pa.json_())

table = pa.table(
{
"id": [1, 2, 3, 4],
"data": json_arr,
}
)

with tempfile.TemporaryDirectory() as tmpdir:
ds_path = Path(tmpdir) / "json_get_test.lance"
lance.write_dataset(table, ds_path, data_storage_version="2.2")
dataset = lance.dataset(ds_path)

# Test json_get_string
result = dataset.to_table(filter="json_get_string(data, 'name') = 'Alice'")
assert result.num_rows == 1
assert result["id"][0].as_py() == 1

# Test json_get_int with type coercion
result = dataset.to_table(filter="json_get_int(data, 'age') > 28")
assert result.num_rows == 2 # Alice (30) and Charlie ("35" -> 35)

# Test json_get_bool with type coercion
result = dataset.to_table(filter="json_get_bool(data, 'active') = true")
assert result.num_rows == 2 # Alice (true) and Charlie ("true" -> true)

# Test json_get_float
result = dataset.to_table(filter="json_get_float(data, 'score') > 90")
assert result.num_rows == 2 # Alice (95.5) and Charlie ("92" -> 92.0)


def test_nested_json_access():
"""Test accessing nested JSON structures."""
json_data = [
{"user": {"profile": {"name": "Alice", "settings": {"theme": "dark"}}}},
{"user": {"profile": {"name": "Bob", "settings": {"theme": "light"}}}},
]

json_strings = [json.dumps(d) for d in json_data]
json_arr = pa.array(json_strings, type=pa.json_())

table = pa.table(
{
"id": [1, 2],
"data": json_arr,
}
)

with tempfile.TemporaryDirectory() as tmpdir:
ds_path = Path(tmpdir) / "nested_json_test.lance"
lance.write_dataset(table, ds_path, data_storage_version="2.2")
dataset = lance.dataset(ds_path)

# Access nested fields using json_get recursively
# First get user, then profile, then name
result = dataset.to_table(
filter="""
json_get_string(
json_get(
json_get(data, 'user'),
'profile'),
'name')
= 'Alice'"""
)
assert result.num_rows == 1
assert result["id"][0].as_py() == 1

# Or use JSONPath for deep access
result = dataset.to_table(
filter="json_extract(data, '$.user.profile.settings.theme') = '\"dark\"'"
)
assert result.num_rows == 1
assert result["id"][0].as_py() == 1


def test_json_array_operations():
"""Test JSON array operations."""
json_data = [
{"items": ["apple", "banana", "orange"], "counts": [1, 2, 3, 4, 5]},
{"items": ["grape", "melon"], "counts": [10, 20]},
{"items": [], "counts": []},
]

json_strings = [json.dumps(d) for d in json_data]
json_arr = pa.array(json_strings, type=pa.json_())

table = pa.table(
{
"id": [1, 2, 3],
"data": json_arr,
}
)

with tempfile.TemporaryDirectory() as tmpdir:
ds_path = Path(tmpdir) / "array_json_test.lance"
lance.write_dataset(table, ds_path, data_storage_version="2.2")
dataset = lance.dataset(ds_path)

# Test array contains
result = dataset.to_table(
filter="json_array_contains(data, '$.items', 'apple')"
)
assert result.num_rows == 1
assert result["id"][0].as_py() == 1

# Test array length
result = dataset.to_table(filter="json_array_length(data, '$.counts') > 3")
assert result.num_rows == 1
assert result["id"][0].as_py() == 1

# Test empty array
result = dataset.to_table(filter="json_array_length(data, '$.items') = 0")
assert result.num_rows == 1
assert result["id"][0].as_py() == 3
Loading
Loading