langflow-ai · erichare · Dec 3, 2025 · Nov 26, 2025 · Nov 26, 2025 · Nov 26, 2025
diff --git a/src/lfx/src/lfx/_assets/component_index.json b/src/lfx/src/lfx/_assets/component_index.json
diff --git a/src/lfx/src/lfx/base/data/docling_utils.py b/src/lfx/src/lfx/base/data/docling_utils.py
@@ -25,21 +25,72 @@ def __init__(self, dependency_name: str, install_command: str):
         super().__init__(f"{dependency_name} is not correctly installed. {install_command}")
 
 
-def extract_docling_documents(data_inputs: Data | list[Data] | DataFrame, doc_key: str) -> list[DoclingDocument]:
+def extract_docling_documents(
+    data_inputs: Data | list[Data] | DataFrame, doc_key: str
+) -> tuple[list[DoclingDocument], str | None]:
+    """Extract DoclingDocument objects from data inputs.
+
+    Args:
+        data_inputs: The data inputs containing DoclingDocument objects
+        doc_key: The key/column name to look for DoclingDocument objects
+
+    Returns:
+        A tuple of (documents, warning_message) where warning_message is None if no warning
+
+    Raises:
+        TypeError: If the data cannot be extracted or is invalid
+    """
     documents: list[DoclingDocument] = []
+    warning_message: str | None = None
+
     if isinstance(data_inputs, DataFrame):
         if not len(data_inputs):
             msg = "DataFrame is empty"
             raise TypeError(msg)
 
-        if doc_key not in data_inputs.columns:
-            msg = f"Column '{doc_key}' not found in DataFrame"
-            raise TypeError(msg)
-        try:
-            documents = data_inputs[doc_key].tolist()
-        except Exception as e:
-            msg = f"Error extracting DoclingDocument from DataFrame: {e}"
-            raise TypeError(msg) from e
+        # Primary: Check for exact column name match
+        if doc_key in data_inputs.columns:
+            try:
+                documents = data_inputs[doc_key].tolist()
+            except Exception as e:
+                msg = f"Error extracting DoclingDocument from DataFrame column '{doc_key}': {e}"
+                raise TypeError(msg) from e
+        else:
+            # Fallback: Search all columns for DoclingDocument objects
+            found_column = None
+            for col in data_inputs.columns:
+                try:
+                    # Check whether this column's first non-null value is a DoclingDocument
+                    sample = data_inputs[col].dropna().iloc[0] if len(data_inputs[col].dropna()) > 0 else None
+                    if sample is not None and isinstance(sample, DoclingDocument):
+                        found_column = col
+                        break
+                except (IndexError, AttributeError):
+                    continue
+
+            if found_column:
+                warning_message = (
+                    f"Column '{doc_key}' not found, but found DoclingDocument objects in column '{found_column}'. "
+                    f"Using '{found_column}' instead. Consider updating the 'Doc Key' parameter."
+                )
+                logger.warning(warning_message)
+                try:
+                    documents = data_inputs[found_column].tolist()
+                except Exception as e:
+                    msg = f"Error extracting DoclingDocument from DataFrame column '{found_column}': {e}"
+                    raise TypeError(msg) from e
+            else:
+                # Provide helpful error message
+                available_columns = list(data_inputs.columns)
+                msg = (
+                    f"Column '{doc_key}' not found in DataFrame. "
+                    f"Available columns: {available_columns}. "
+                    f"\n\nPossible solutions:\n"
+                    f"1. Use the 'Data' output from Docling component instead of 'DataFrame' output\n"
+                    f"2. Update the 'Doc Key' parameter to match one of the available columns\n"
+                    f"3. If using VLM pipeline, try using the standard pipeline"
+                )
+                raise TypeError(msg)
     else:
         if not data_inputs:
             msg = "No data inputs provided"
@@ -69,7 +120,7 @@ def extract_docling_documents(data_inputs: Data | list[Data] | DataFrame, doc_ke
             except AttributeError as e:
                 msg = f"Invalid input type in collection: {e}"
                 raise TypeError(msg) from e
-    return documents
+    return documents, warning_message
 
 
 def _unwrap_secrets(obj):

diff --git a/src/lfx/src/lfx/components/docling/chunk_docling_document.py b/src/lfx/src/lfx/components/docling/chunk_docling_document.py
@@ -115,7 +115,9 @@ def _docs_to_data(self, docs) -> list[Data]:
         return [Data(text=doc.page_content, data=doc.metadata) for doc in docs]
 
     def chunk_documents(self) -> DataFrame:
-        documents = extract_docling_documents(self.data_inputs, self.doc_key)
+        documents, warning = extract_docling_documents(self.data_inputs, self.doc_key)
+        if warning:
+            self.status = warning
 
         chunker: BaseChunker
         if self.chunker == "HybridChunker":

diff --git a/src/lfx/src/lfx/components/docling/export_docling_document.py b/src/lfx/src/lfx/components/docling/export_docling_document.py
@@ -86,7 +86,9 @@ def update_build_config(self, build_config: dict, field_value: Any, field_name:
         return build_config
 
     def export_document(self) -> list[Data]:
-        documents = extract_docling_documents(self.data_inputs, self.doc_key)
+        documents, warning = extract_docling_documents(self.data_inputs, self.doc_key)
+        if warning:
+            self.status = warning
 
         results: list[Data] = []
         try:

diff --git a/src/lfx/tests/unit/base/data/test_docling_utils.py b/src/lfx/tests/unit/base/data/test_docling_utils.py
@@ -0,0 +1,136 @@
+"""Tests for docling_utils module."""
+
+import pytest
+
+try:
+    from docling_core.types.doc import DoclingDocument
+
+    DOCLING_AVAILABLE = True
+except ImportError:
+    DOCLING_AVAILABLE = False
+    # Skip entire module if docling not available
+    pytest.skip("docling_core not installed", allow_module_level=True)
+
+from lfx.base.data.docling_utils import extract_docling_documents
+from lfx.schema.data import Data
+from lfx.schema.dataframe import DataFrame
+
+
+class TestExtractDoclingDocuments:
+    """Test extract_docling_documents function."""
+
+    def test_extract_from_data_with_correct_key(self):
+        """Test extracting DoclingDocument from Data with correct key."""
+        # Create a minimal real DoclingDocument (not a mock)
+        doc = DoclingDocument(name="test_doc")
+        data = Data(data={"doc": doc, "file_path": "test.pdf"})
+
+        # Extract documents
+        result, warning = extract_docling_documents(data, "doc")
+
+        # Verify
+        assert len(result) == 1
+        assert isinstance(result[0], DoclingDocument)
+        assert result[0].name == "test_doc"
+        assert warning is None
+
+    def test_extract_from_data_with_wrong_key(self):
+        """Test extracting DoclingDocument from Data with wrong key raises error."""
+        doc = DoclingDocument(name="test_doc")
+        data = Data(data={"doc": doc, "file_path": "test.pdf"})
+
+        # Should raise TypeError when key is not found
+        with pytest.raises(TypeError, match="'wrong_key' field not available"):
+            extract_docling_documents(data, "wrong_key")
+
+    def test_extract_from_list_of_data(self):
+        """Test extracting DoclingDocument from list of Data objects."""
+        doc1 = DoclingDocument(name="test_doc1")
+        doc2 = DoclingDocument(name="test_doc2")
+        data_list = [
+            Data(data={"doc": doc1, "file_path": "test1.pdf"}),
+            Data(data={"doc": doc2, "file_path": "test2.pdf"}),
+        ]
+
+        # Extract documents
+        result, warning = extract_docling_documents(data_list, "doc")
+
+        # Verify
+        assert len(result) == 2
+        assert all(isinstance(d, DoclingDocument) for d in result)
+        assert result[0].name == "test_doc1"
+        assert result[1].name == "test_doc2"
+        assert warning is None
+
+    def test_extract_from_dataframe_with_correct_column(self):
+        """Test extracting DoclingDocument from DataFrame with correct column name."""
+        doc1 = DoclingDocument(name="test_doc1")
+        doc2 = DoclingDocument(name="test_doc2")
+
+        # Create DataFrame with 'doc' column
+        df = DataFrame([{"doc": doc1, "file_path": "test1.pdf"}, {"doc": doc2, "file_path": "test2.pdf"}])
+
+        # Extract documents
+        result, warning = extract_docling_documents(df, "doc")
+
+        # Verify
+        assert len(result) == 2
+        assert all(isinstance(d, DoclingDocument) for d in result)
+        assert warning is None
+
+    def test_extract_from_dataframe_with_fallback_column(self):
+        """Test extracting DoclingDocument from DataFrame when exact column name not found.
+
+        A different column holds DoclingDocument objects and is used as a fallback, with a warning.
+        """
+        doc1 = DoclingDocument(name="test_doc1")
+        doc2 = DoclingDocument(name="test_doc2")
+
+        # Create DataFrame where DoclingDocument is in a different column
+        # Simulate the case where pandas doesn't preserve the 'doc' column name
+        df = DataFrame([{"document": doc1, "file_path": "test1.pdf"}, {"document": doc2, "file_path": "test2.pdf"}])
+
+        # Extract documents - should find 'document' column as fallback
+        result, warning = extract_docling_documents(df, "doc")
+
+        # Verify
+        assert len(result) == 2
+        assert all(isinstance(d, DoclingDocument) for d in result)
+        # Verify warning is present since we used fallback column
+        assert warning is not None
+        assert "Column 'doc' not found" in warning
+        assert "found DoclingDocument objects in column 'document'" in warning
+        assert "Consider updating the 'Doc Key' parameter" in warning
+
+    def test_extract_from_dataframe_no_docling_column(self):
+        """Test extracting DoclingDocument from DataFrame with no DoclingDocument column raises helpful error."""
+        # Create DataFrame without any DoclingDocument objects
+        df = DataFrame([{"text": "hello", "file_path": "test1.pdf"}, {"text": "world", "file_path": "test2.pdf"}])
+
+        # Should raise TypeError with helpful message
+        with pytest.raises(TypeError) as exc_info:
+            extract_docling_documents(df, "doc")
+
+        # Verify error message contains helpful information
+        error_msg = str(exc_info.value)
+        assert "Column 'doc' not found in DataFrame" in error_msg
+        assert "Available columns:" in error_msg
+        assert "Possible solutions:" in error_msg
+        assert "Use the 'Data' output" in error_msg
+
+    def test_extract_from_empty_dataframe(self):
+        """Test extracting from empty DataFrame raises error."""
+        df = DataFrame([])
+
+        with pytest.raises(TypeError, match="DataFrame is empty"):
+            extract_docling_documents(df, "doc")
+
+    def test_extract_from_empty_data_list(self):
+        """Test extracting from empty list raises error."""
+        with pytest.raises(TypeError, match="No data inputs provided"):
+            extract_docling_documents([], "doc")
+
+    def test_extract_from_none(self):
+        """Test extracting from None raises error."""
+        with pytest.raises(TypeError, match="No data inputs provided"):
+            extract_docling_documents(None, "doc")