Skip to content
Merged
Show file tree
Hide file tree
Changes from 10 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion src/lfx/src/lfx/_assets/component_index.json

Large diffs are not rendered by default.

71 changes: 61 additions & 10 deletions src/lfx/src/lfx/base/data/docling_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,21 +25,72 @@ def __init__(self, dependency_name: str, install_command: str):
super().__init__(f"{dependency_name} is not correctly installed. {install_command}")


def extract_docling_documents(data_inputs: Data | list[Data] | DataFrame, doc_key: str) -> list[DoclingDocument]:
def extract_docling_documents(
data_inputs: Data | list[Data] | DataFrame, doc_key: str
) -> tuple[list[DoclingDocument], str | None]:
"""Extract DoclingDocument objects from data inputs.

Args:
data_inputs: The data inputs containing DoclingDocument objects
doc_key: The key/column name to look for DoclingDocument objects

Returns:
A tuple of (documents, warning_message) where warning_message is None if no warning

Raises:
TypeError: If the data cannot be extracted or is invalid
"""
documents: list[DoclingDocument] = []
warning_message: str | None = None

if isinstance(data_inputs, DataFrame):
if not len(data_inputs):
msg = "DataFrame is empty"
raise TypeError(msg)

if doc_key not in data_inputs.columns:
msg = f"Column '{doc_key}' not found in DataFrame"
raise TypeError(msg)
try:
documents = data_inputs[doc_key].tolist()
except Exception as e:
msg = f"Error extracting DoclingDocument from DataFrame: {e}"
raise TypeError(msg) from e
# Primary: Check for exact column name match
if doc_key in data_inputs.columns:
try:
documents = data_inputs[doc_key].tolist()
except Exception as e:
msg = f"Error extracting DoclingDocument from DataFrame column '{doc_key}': {e}"
raise TypeError(msg) from e
else:
# Fallback: Search all columns for DoclingDocument objects
found_column = None
for col in data_inputs.columns:
try:
# Check if this column contains DoclingDocument objects
sample = data_inputs[col].dropna().iloc[0] if len(data_inputs[col].dropna()) > 0 else None
if sample is not None and isinstance(sample, DoclingDocument):
found_column = col
break
except (IndexError, AttributeError):
continue

if found_column:
warning_message = (
f"Column '{doc_key}' not found, but found DoclingDocument objects in column '{found_column}'. "
f"Using '{found_column}' instead. Consider updating the 'Doc Key' parameter."
)
logger.warning(warning_message)
try:
documents = data_inputs[found_column].tolist()
except Exception as e:
msg = f"Error extracting DoclingDocument from DataFrame column '{found_column}': {e}"
raise TypeError(msg) from e
else:
# Provide helpful error message
available_columns = list(data_inputs.columns)
msg = (
f"Column '{doc_key}' not found in DataFrame. "
f"Available columns: {available_columns}. "
f"\n\nPossible solutions:\n"
f"1. Use the 'Data' output from Docling component instead of 'DataFrame' output\n"
f"2. Update the 'Doc Key' parameter to match one of the available columns\n"
f"3. If using VLM pipeline, try using the standard pipeline"
)
raise TypeError(msg)
else:
if not data_inputs:
msg = "No data inputs provided"
Expand Down Expand Up @@ -69,7 +120,7 @@ def extract_docling_documents(data_inputs: Data | list[Data] | DataFrame, doc_ke
except AttributeError as e:
msg = f"Invalid input type in collection: {e}"
raise TypeError(msg) from e
return documents
return documents, warning_message


def _unwrap_secrets(obj):
Expand Down
4 changes: 3 additions & 1 deletion src/lfx/src/lfx/components/docling/chunk_docling_document.py
Original file line number Diff line number Diff line change
Expand Up @@ -115,7 +115,9 @@ def _docs_to_data(self, docs) -> list[Data]:
return [Data(text=doc.page_content, data=doc.metadata) for doc in docs]

def chunk_documents(self) -> DataFrame:
documents = extract_docling_documents(self.data_inputs, self.doc_key)
documents, warning = extract_docling_documents(self.data_inputs, self.doc_key)
if warning:
self.status = warning

chunker: BaseChunker
if self.chunker == "HybridChunker":
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -86,7 +86,9 @@ def update_build_config(self, build_config: dict, field_value: Any, field_name:
return build_config

def export_document(self) -> list[Data]:
documents = extract_docling_documents(self.data_inputs, self.doc_key)
documents, warning = extract_docling_documents(self.data_inputs, self.doc_key)
if warning:
self.status = warning

results: list[Data] = []
try:
Expand Down
136 changes: 136 additions & 0 deletions src/lfx/tests/unit/base/data/test_docling_utils.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,136 @@
"""Tests for docling_utils module."""

import pytest

try:
from docling_core.types.doc import DoclingDocument

DOCLING_AVAILABLE = True
except ImportError:
DOCLING_AVAILABLE = False
# Skip entire module if docling not available
pytest.skip("docling_core not installed", allow_module_level=True)

from lfx.base.data.docling_utils import extract_docling_documents
from lfx.schema.data import Data
from lfx.schema.dataframe import DataFrame


class TestExtractDoclingDocuments:
    """Exercise ``extract_docling_documents`` across its supported input shapes.

    Successful calls are unpacked as a ``(documents, warning)`` pair; failure
    cases are expected to surface as ``TypeError``.
    """

    def test_extract_from_data_with_correct_key(self):
        """A single Data object holding the document under the requested key."""
        payload = Data(data={"doc": DoclingDocument(name="test_doc"), "file_path": "test.pdf"})

        extracted, note = extract_docling_documents(payload, "doc")

        assert len(extracted) == 1
        only_doc = extracted[0]
        assert isinstance(only_doc, DoclingDocument)
        assert only_doc.name == "test_doc"
        assert note is None

    def test_extract_from_data_with_wrong_key(self):
        """A key that is absent from the Data object must raise TypeError."""
        payload = Data(data={"doc": DoclingDocument(name="test_doc"), "file_path": "test.pdf"})

        with pytest.raises(TypeError, match="'wrong_key' field not available"):
            extract_docling_documents(payload, "wrong_key")

    def test_extract_from_list_of_data(self):
        """A list of Data objects yields one document per entry, in order."""
        first = DoclingDocument(name="test_doc1")
        second = DoclingDocument(name="test_doc2")
        payloads = [
            Data(data={"doc": first, "file_path": "test1.pdf"}),
            Data(data={"doc": second, "file_path": "test2.pdf"}),
        ]

        extracted, note = extract_docling_documents(payloads, "doc")

        assert len(extracted) == 2
        for item in extracted:
            assert isinstance(item, DoclingDocument)
        assert extracted[0].name == "test_doc1"
        assert extracted[1].name == "test_doc2"
        assert note is None

    def test_extract_from_dataframe_with_correct_column(self):
        """A DataFrame whose column name matches the key produces no warning."""
        rows = [
            {"doc": DoclingDocument(name="test_doc1"), "file_path": "test1.pdf"},
            {"doc": DoclingDocument(name="test_doc2"), "file_path": "test2.pdf"},
        ]
        frame = DataFrame(rows)

        extracted, note = extract_docling_documents(frame, "doc")

        assert len(extracted) == 2
        for item in extracted:
            assert isinstance(item, DoclingDocument)
        assert note is None

    def test_extract_from_dataframe_with_fallback_column(self):
        """The requested column is missing, but another column holds DoclingDocument objects.

        The documents are still returned, accompanied by a warning that names
        the column actually used.
        """
        # The documents live under 'document' while the lookup asks for 'doc'.
        rows = [
            {"document": DoclingDocument(name="test_doc1"), "file_path": "test1.pdf"},
            {"document": DoclingDocument(name="test_doc2"), "file_path": "test2.pdf"},
        ]
        frame = DataFrame(rows)

        extracted, note = extract_docling_documents(frame, "doc")

        assert len(extracted) == 2
        for item in extracted:
            assert isinstance(item, DoclingDocument)

        # The fallback path has to report itself through the warning text.
        assert note is not None
        expected_fragments = (
            "Column 'doc' not found",
            "found DoclingDocument objects in column 'document'",
            "Consider updating the 'Doc Key' parameter",
        )
        for fragment in expected_fragments:
            assert fragment in note

    def test_extract_from_dataframe_no_docling_column(self):
        """No column holds a DoclingDocument: expect a TypeError with guidance text."""
        rows = [
            {"text": "hello", "file_path": "test1.pdf"},
            {"text": "world", "file_path": "test2.pdf"},
        ]
        frame = DataFrame(rows)

        with pytest.raises(TypeError) as caught:
            extract_docling_documents(frame, "doc")

        message = str(caught.value)
        expected_fragments = (
            "Column 'doc' not found in DataFrame",
            "Available columns:",
            "Possible solutions:",
            "Use the 'Data' output",
        )
        for fragment in expected_fragments:
            assert fragment in message

    def test_extract_from_empty_dataframe(self):
        """An empty DataFrame is rejected with TypeError."""
        empty_frame = DataFrame([])

        with pytest.raises(TypeError, match="DataFrame is empty"):
            extract_docling_documents(empty_frame, "doc")

    def test_extract_from_empty_data_list(self):
        """An empty list is rejected with TypeError."""
        no_inputs = []

        with pytest.raises(TypeError, match="No data inputs provided"):
            extract_docling_documents(no_inputs, "doc")

    def test_extract_from_none(self):
        """``None`` is rejected with the same TypeError as an empty list."""
        missing_input = None

        with pytest.raises(TypeError, match="No data inputs provided"):
            extract_docling_documents(missing_input, "doc")
Loading