Skip to content
64 changes: 55 additions & 9 deletions src/backend/base/langflow/base/data/docling_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,15 @@
from langflow.schema.dataframe import DataFrame


class DoclingDependencyError(Exception):
    """Raised when an optional dependency required by Docling is missing.

    The exception message is built as
    ``"<dependency_name> is not correctly installed. <install_command>"``.

    Attributes:
        dependency_name: Name of the missing dependency (e.g. an OCR engine).
        install_command: Human-readable hint describing how to install it.
    """

    def __init__(self, dependency_name: str, install_command: str) -> None:
        """Store the dependency details and build the exception message.

        Args:
            dependency_name: Name of the missing dependency.
            install_command: Installation hint appended to the message.
        """
        self.dependency_name = dependency_name
        self.install_command = install_command
        super().__init__(f"{dependency_name} is not correctly installed. {install_command}")

    def __reduce__(self):
        """Rebuild the exception from its two constructor arguments.

        ``Exception.__reduce__`` reconstructs via ``cls(*self.args)``; because
        ``args`` holds only the single formatted message, the default would
        call ``__init__`` with one argument and fail with ``TypeError`` when
        the exception is pickled (e.g. sent between processes) or copied.
        """
        return (type(self), (self.dependency_name, self.install_command), self.__dict__)


def extract_docling_documents(data_inputs: Data | list[Data] | DataFrame, doc_key: str) -> list[DoclingDocument]:
documents: list[DoclingDocument] = []
if isinstance(data_inputs, DataFrame):
Expand Down Expand Up @@ -191,22 +200,46 @@
logger.debug(f"Processing file {i + 1}/{len(file_paths)}: {file_path}")

try:
# Process single file (we can't easily interrupt convert_all)
single_result = converter.convert_all([file_path])
results.extend(single_result)

# Check for shutdown after each file
check_shutdown()

except (OSError, ValueError, RuntimeError, ImportError) as file_error:
# Handle specific file processing errors
except (OSError, ValueError, RuntimeError) as file_error:
error_msg = str(file_error)

# Check for specific dependency errors and raise custom exception
if "ocrmac is not correctly installed" in error_msg:
raise DoclingDependencyError(
"ocrmac", "Please install it via `pip install ocrmac` to use this OCR engine."

Check failure on line 213 in src/backend/base/langflow/base/data/docling_utils.py

View workflow job for this annotation

GitHub Actions / Ruff Style Check (3.13)

Ruff (EM101)

src/backend/base/langflow/base/data/docling_utils.py:213:25: EM101 Exception must not use a string literal, assign to variable first
)

Check failure on line 214 in src/backend/base/langflow/base/data/docling_utils.py

View workflow job for this annotation

GitHub Actions / Ruff Style Check (3.13)

Ruff (B904)

src/backend/base/langflow/base/data/docling_utils.py:212:21: B904 Within an `except` clause, raise exceptions with `raise ... from err` or `raise ... from None` to distinguish them from errors in exception handling
if "easyocr" in error_msg and "not installed" in error_msg:
raise DoclingDependencyError(
"easyocr", "Please install it via `pip install easyocr` to use this OCR engine."

Check failure on line 217 in src/backend/base/langflow/base/data/docling_utils.py

View workflow job for this annotation

GitHub Actions / Ruff Style Check (3.13)

Ruff (EM101)

src/backend/base/langflow/base/data/docling_utils.py:217:25: EM101 Exception must not use a string literal, assign to variable first
)

Check failure on line 218 in src/backend/base/langflow/base/data/docling_utils.py

View workflow job for this annotation

GitHub Actions / Ruff Style Check (3.13)

Ruff (B904)

src/backend/base/langflow/base/data/docling_utils.py:216:21: B904 Within an `except` clause, raise exceptions with `raise ... from err` or `raise ... from None` to distinguish them from errors in exception handling
if "tesserocr" in error_msg and "not installed" in error_msg:
raise DoclingDependencyError(
"tesserocr", "Please install it via `pip install tesserocr` to use this OCR engine."

Check failure on line 221 in src/backend/base/langflow/base/data/docling_utils.py

View workflow job for this annotation

GitHub Actions / Ruff Style Check (3.13)

Ruff (EM101)

src/backend/base/langflow/base/data/docling_utils.py:221:25: EM101 Exception must not use a string literal, assign to variable first
)

Check failure on line 222 in src/backend/base/langflow/base/data/docling_utils.py

View workflow job for this annotation

GitHub Actions / Ruff Style Check (3.13)

Ruff (B904)

src/backend/base/langflow/base/data/docling_utils.py:220:21: B904 Within an `except` clause, raise exceptions with `raise ... from err` or `raise ... from None` to distinguish them from errors in exception handling
if "rapidocr" in error_msg and "not installed" in error_msg:
raise DoclingDependencyError(
"rapidocr", "Please install it via `pip install rapidocr-onnxruntime` to use this OCR engine."

Check failure on line 225 in src/backend/base/langflow/base/data/docling_utils.py

View workflow job for this annotation

GitHub Actions / Ruff Style Check (3.13)

Ruff (EM101)

src/backend/base/langflow/base/data/docling_utils.py:225:25: EM101 Exception must not use a string literal, assign to variable first
)

Check failure on line 226 in src/backend/base/langflow/base/data/docling_utils.py

View workflow job for this annotation

GitHub Actions / Ruff Style Check (3.13)

Ruff (B904)

src/backend/base/langflow/base/data/docling_utils.py:224:21: B904 Within an `except` clause, raise exceptions with `raise ... from err` or `raise ... from None` to distinguish them from errors in exception handling

# If not a dependency error, log and continue with other files
logger.error(f"Error processing file {file_path}: {file_error}")
# Continue with other files, but check for shutdown
check_shutdown()
except Exception as file_error: # noqa: BLE001
# Catch any other unexpected errors to prevent worker crash

except ImportError as file_error:
# Handle import errors that might indicate missing dependencies
error_msg = str(file_error)
if any(ocr in error_msg.lower() for ocr in ["ocrmac", "easyocr", "tesserocr", "rapidocr"]):
raise DoclingDependencyError(
"OCR dependency", f"Please install the required OCR engine: {error_msg}"
)

Check failure on line 238 in src/backend/base/langflow/base/data/docling_utils.py

View workflow job for this annotation

GitHub Actions / Ruff Style Check (3.13)

Ruff (TRY003)

src/backend/base/langflow/base/data/docling_utils.py:236:27: TRY003 Avoid specifying long messages outside the exception class

Check failure on line 238 in src/backend/base/langflow/base/data/docling_utils.py

View workflow job for this annotation

GitHub Actions / Ruff Style Check (3.13)

Ruff (B904)

src/backend/base/langflow/base/data/docling_utils.py:236:21: B904 Within an `except` clause, raise exceptions with `raise ... from err` or `raise ... from None` to distinguish them from errors in exception handling
raise # Re-raise if not OCR related

except Exception as file_error:
logger.error(f"Unexpected error processing file {file_path}: {file_error}")
# Continue with other files, but check for shutdown
check_shutdown()

# Final shutdown check before sending results
Expand All @@ -223,6 +256,19 @@
logger.info(f"Successfully processed {len([d for d in processed_data if d])} files")
queue.put(processed_data)

except DoclingDependencyError as e:
# Send dependency error with special formatting
logger.error(f"Dependency error: {e}")
queue.put(
{
"error": str(e),
"error_type": "dependency_error",
"dependency_name": e.dependency_name,
"install_command": e.install_command,
}
)
return

except KeyboardInterrupt:
logger.warning("KeyboardInterrupt during processing, exiting gracefully...")
queue.put({"error": "Worker interrupted during processing", "shutdown": True})
Expand Down
30 changes: 23 additions & 7 deletions src/backend/base/langflow/components/docling/docling_inline.py
Original file line number Diff line number Diff line change
Expand Up @@ -217,17 +217,33 @@ def _get_converter() -> DocumentConverter:
# Ignore cleanup errors, but log them
self.log(f"Warning: Error during queue cleanup - {e}")

# Check if there was an error in the worker
# Enhanced error checking with dependency-specific handling
if isinstance(result, dict) and "error" in result:
msg = result["error"]
if msg.startswith("Docling is not installed"):
raise ImportError(msg)
# Handle interrupt gracefully - return empty result instead of raising error
if "Worker interrupted by SIGINT" in msg or "shutdown" in result:
error_msg = result["error"]

# Handle dependency errors specifically
if result.get("error_type") == "dependency_error":
dependency_name = result.get("dependency_name", "Unknown dependency")
install_command = result.get("install_command", "Please check documentation")

# Create a user-friendly error message
user_message = (
f"Missing OCR dependency: {dependency_name}. "
f"{install_command} "
f"Alternatively, you can set OCR Engine to 'None' to disable OCR processing."
)
raise ImportError(user_message)

# Handle other specific errors
if error_msg.startswith("Docling is not installed"):
raise ImportError(error_msg)

# Handle graceful shutdown
if "Worker interrupted by SIGINT" in error_msg or "shutdown" in result:
self.log("Docling process cancelled by user")
result = []
else:
raise RuntimeError(msg)
raise RuntimeError(error_msg)

processed_data = [Data(data={"doc": r["document"], "file_path": r["file_path"]}) if r else None for r in result]
return self.rollup_data(file_list, processed_data)
Original file line number Diff line number Diff line change
Expand Up @@ -162,6 +162,7 @@ def convert_to_string(self) -> str | Generator[Any, None, None]:
"""Convert input data to string with proper error handling."""
self._validate_input()
if isinstance(self.input_value, list):
self.clean_data = self.clean_data if hasattr(self, "clean_data") else False
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Is this intentional?

Copy link
Contributor Author

@italojohnny italojohnny Sep 9, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes.
For some reason I don’t fully understand (I haven’t dug into it deeply), the Chat Output component loses the clean_data attribute, which causes an error when trying to use it.
This line prevents that case.
It’s certainly a different issue from the one being prioritized in the PR, but I haven’t seen anyone else report it, and it only happened to me in this case.

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

You can also just write `self.clean_data = getattr(self, "clean_data", False)`.

return "\n".join([safe_convert(item, clean_data=self.clean_data) for item in self.input_value])
if isinstance(self.input_value, Generator):
return self.input_value
Expand Down
Loading
Loading