Skip to content
Merged
Show file tree
Hide file tree
Changes from 3 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion src/lfx/src/lfx/_assets/component_index.json

Large diffs are not rendered by default.

50 changes: 42 additions & 8 deletions src/lfx/src/lfx/base/data/docling_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,14 +32,48 @@ def extract_docling_documents(data_inputs: Data | list[Data] | DataFrame, doc_ke
msg = "DataFrame is empty"
raise TypeError(msg)

if doc_key not in data_inputs.columns:
msg = f"Column '{doc_key}' not found in DataFrame"
raise TypeError(msg)
try:
documents = data_inputs[doc_key].tolist()
except Exception as e:
msg = f"Error extracting DoclingDocument from DataFrame: {e}"
raise TypeError(msg) from e
# Primary: Check for exact column name match
if doc_key in data_inputs.columns:
try:
documents = data_inputs[doc_key].tolist()
except Exception as e:
msg = f"Error extracting DoclingDocument from DataFrame column '{doc_key}': {e}"
raise TypeError(msg) from e
else:
# Fallback: Search all columns for DoclingDocument objects
found_column = None
for col in data_inputs.columns:
try:
# Check if this column contains DoclingDocument objects
sample = data_inputs[col].dropna().iloc[0] if len(data_inputs[col].dropna()) > 0 else None
if sample is not None and isinstance(sample, DoclingDocument):
found_column = col
break
except (IndexError, AttributeError):
continue

if found_column:
logger.warning(
f"Column '{doc_key}' not found, but found DoclingDocument objects in column '{found_column}'. "
f"Using '{found_column}' instead. Consider updating the 'Doc Key' parameter."
)
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We need to surface this to the UI

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@ogabrielluiz how does it look now?

try:
documents = data_inputs[found_column].tolist()
except Exception as e:
msg = f"Error extracting DoclingDocument from DataFrame column '{found_column}': {e}"
raise TypeError(msg) from e
else:
# Provide helpful error message
available_columns = list(data_inputs.columns)
msg = (
f"Column '{doc_key}' not found in DataFrame. "
f"Available columns: {available_columns}. "
f"\n\nPossible solutions:\n"
f"1. Use the 'Data' output from Docling component instead of 'DataFrame' output\n"
f"2. Update the 'Doc Key' parameter to match one of the available columns\n"
f"3. If using VLM pipeline, try using the standard pipeline"
)
raise TypeError(msg)
else:
if not data_inputs:
msg = "No data inputs provided"
Expand Down
116 changes: 116 additions & 0 deletions src/lfx/tests/unit/base/data/test_docling_utils.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,116 @@
"""Tests for docling_utils module."""

import pytest
from docling_core.types.doc import DoclingDocument
from lfx.base.data.docling_utils import extract_docling_documents
from lfx.schema.data import Data
from lfx.schema.dataframe import DataFrame


class TestExtractDoclingDocuments:
    """Unit tests for ``extract_docling_documents``.

    The tests cover the three accepted input shapes (a single ``Data``, a list
    of ``Data`` and a ``DataFrame``), the DataFrame fallback that searches for a
    column holding ``DoclingDocument`` objects when ``doc_key`` is absent, and
    the ``TypeError`` raised for missing keys or empty inputs.
    """

    def test_extract_from_data_with_correct_key(self):
        """A single Data object yields its document when the key exists."""
        doc = DoclingDocument(name="test_doc")
        data = Data(data={"doc": doc, "file_path": "test.pdf"})

        result = extract_docling_documents(data, "doc")

        assert len(result) == 1
        assert isinstance(result[0], DoclingDocument)
        assert result[0].name == "test_doc"

    def test_extract_from_data_with_wrong_key(self):
        """A key that is absent from the Data object raises TypeError."""
        doc = DoclingDocument(name="test_doc")
        data = Data(data={"doc": doc, "file_path": "test.pdf"})

        with pytest.raises(TypeError, match="'wrong_key' field not available"):
            extract_docling_documents(data, "wrong_key")

    def test_extract_from_list_of_data(self):
        """A list of Data objects yields one document per entry, in order."""
        doc1 = DoclingDocument(name="test_doc1")
        doc2 = DoclingDocument(name="test_doc2")
        data_list = [
            Data(data={"doc": doc1, "file_path": "test1.pdf"}),
            Data(data={"doc": doc2, "file_path": "test2.pdf"}),
        ]

        result = extract_docling_documents(data_list, "doc")

        assert len(result) == 2
        assert all(isinstance(d, DoclingDocument) for d in result)
        assert result[0].name == "test_doc1"
        assert result[1].name == "test_doc2"

    def test_extract_from_dataframe_with_correct_column(self):
        """A DataFrame yields the documents stored in the column named by doc_key."""
        doc1 = DoclingDocument(name="test_doc1")
        doc2 = DoclingDocument(name="test_doc2")
        df = DataFrame(
            [
                {"doc": doc1, "file_path": "test1.pdf"},
                {"doc": doc2, "file_path": "test2.pdf"},
            ]
        )

        result = extract_docling_documents(df, "doc")

        assert len(result) == 2
        assert all(isinstance(d, DoclingDocument) for d in result)
        # Row order must be preserved so callers can match documents to inputs.
        assert [d.name for d in result] == ["test_doc1", "test_doc2"]

    def test_extract_from_dataframe_with_fallback_column(self):
        """A DataFrame falls back to another column that holds DoclingDocument objects."""
        doc1 = DoclingDocument(name="test_doc1")
        doc2 = DoclingDocument(name="test_doc2")

        # The documents live under 'document' rather than the requested 'doc',
        # simulating the case where the 'doc' column name was not preserved.
        df = DataFrame(
            [
                {"document": doc1, "file_path": "test1.pdf"},
                {"document": doc2, "file_path": "test2.pdf"},
            ]
        )

        # 'doc' is missing, so extraction should pick up the 'document' column.
        result = extract_docling_documents(df, "doc")

        assert len(result) == 2
        assert all(isinstance(d, DoclingDocument) for d in result)
        # Confirm the fallback returned the actual documents, in row order.
        assert [d.name for d in result] == ["test_doc1", "test_doc2"]

    def test_extract_from_dataframe_no_docling_column(self):
        """A DataFrame without any DoclingDocument column raises a helpful TypeError."""
        df = DataFrame(
            [
                {"text": "hello", "file_path": "test1.pdf"},
                {"text": "world", "file_path": "test2.pdf"},
            ]
        )

        with pytest.raises(TypeError) as exc_info:
            extract_docling_documents(df, "doc")

        # The message should name the missing column, list what is available
        # and suggest how to fix the flow.
        error_msg = str(exc_info.value)
        assert "Column 'doc' not found in DataFrame" in error_msg
        assert "Available columns:" in error_msg
        assert "Possible solutions:" in error_msg
        assert "Use the 'Data' output" in error_msg

    def test_extract_from_empty_dataframe(self):
        """An empty DataFrame raises TypeError."""
        df = DataFrame([])

        with pytest.raises(TypeError, match="DataFrame is empty"):
            extract_docling_documents(df, "doc")

    def test_extract_from_empty_data_list(self):
        """An empty list raises TypeError."""
        with pytest.raises(TypeError, match="No data inputs provided"):
            extract_docling_documents([], "doc")

    def test_extract_from_none(self):
        """None as input raises TypeError."""
        with pytest.raises(TypeError, match="No data inputs provided"):
            extract_docling_documents(None, "doc")
8 changes: 4 additions & 4 deletions uv.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

Loading