enh(preprocessing): Add split_markdown_by_headings.

mozilla-ai · daavoo · Feb 5, 2025 · Jan 22, 2025 · Jan 20, 2025 · Jan 20, 2025
commit d57c367eec0d8edee70d40f2687e91cfe9b2e24e
diff --git a/docs/step-by-step-guide.md b/docs/step-by-step-guide.md
@@ -34,9 +34,7 @@ The document is first converted to markdown and then split into sections based o
 
  **Section Splitting**
 
-   - Uses [langchain-text-splitters](https://pypi.org/project/langchain-text-splitters/)
-
-   - Splits on `("#", "Header 1"), ("##", "Header 2"), ("###", "Header 3")`
+   - Uses [split_markdown_by_headings](api.md/#structured_qa.preprocessing.split_markdown_by_headings)
 
    - Each section is saved to a separate file.
 

diff --git a/pyproject.toml b/pyproject.toml
@@ -13,7 +13,6 @@ dependencies = [
   "huggingface-hub",
   "llama-cpp-python",
   "loguru",
-  "langchain-text-splitters",
   "pydantic",
   "pymupdf4llm",
   "pyyaml",

diff --git a/src/structured_qa/preprocessing.py b/src/structured_qa/preprocessing.py
@@ -1,18 +1,70 @@
+import re
+from collections import defaultdict
 from pathlib import Path
 
 import pymupdf4llm
-from langchain_text_splitters import MarkdownHeaderTextSplitter
 
 from loguru import logger
 
 
+def split_markdown_by_headings(
+    markdown_text, heading_patterns: list[str] | None = None
+) -> dict[str, str]:
+    """Splits a markdown document into sections based on specified heading patterns.
+
+    Args:
+        markdown_text (str): The markdown document as a single string.
+        heading_patterns (list[str], optional): A list of regex patterns representing heading markers
+            in the markdown document; group 1 of the matching pattern is used as the section name.
+            Defaults to None.
+            If None, the default patterns are used:
+
+            ```python
+            [
+                r"^#\s+(.+)$",
+                r"^##\s+(.+)$",
+                r"^###\s+(.+)$",
+                r"^####\s+(.+)$",
+                r"^\*\*[\d\.]+\.\*\*\s*\*\*(.+)\*\*$"
+            ]
+            ```
+
+    Returns:
+        dict[str, str]: A dictionary where the keys are the section names and the values are the section contents (sections that share a heading text are merged under one key).
+    """
+    if heading_patterns is None:
+        heading_patterns = [
+            r"^#\s+(.+)$",
+            r"^##\s+(.+)$",
+            r"^###\s+(.+)$",
+            r"^####\s+(.+)$",
+            r"^\*\*[\d\.]+\.\*\*\s*\*\*(.+)\*\*$",
+        ]
+
+    sections = defaultdict(str)
+
+    heading_text = "INTRO"  # lines before the first heading are stored under "INTRO"
+    for line in markdown_text.splitlines():
+        line = line.strip()  # leading/trailing whitespace is dropped from every stored line
+        if not line:
+            continue  # blank lines are not kept in the section content
+        for pattern in heading_patterns:
+            match = re.match(pattern, line)
+            if match:
+                heading_text = match.group(1)[:100]  # section names are capped at 100 characters
+                break  # the first pattern that matches wins
+        sections[heading_text] += f"{line}\n"  # the heading line itself is included in its section's content
+
+    return sections
+
+
 @logger.catch(reraise=True)
 def document_to_sections_dir(input_file: str, output_dir: str) -> list[str]:
     """
     Convert a document to a directory of sections.
 
     Uses [pymupdf4llm](https://pypi.org/project/pymupdf4llm/) to convert input_file to markdown.
-    Then uses [langchain_text_splitters](https://pypi.org/project/langchain-text-splitters/) to split the markdown into sections based on the headers.
+    Then uses [`split_markdown_by_headings`][structured_qa.preprocessing.split_markdown_by_headings] to split the markdown into sections based on the headers.
 
     Args:
         input_file: Path to the input document.
@@ -32,27 +84,23 @@ def document_to_sections_dir(input_file: str, output_dir: str) -> list[str]:
 
     logger.info(f"Converting {input_file}")
     md_text = pymupdf4llm.to_markdown(input_file)
+    Path("debug.md").write_text(md_text)
     logger.success("Converted")
 
     logger.info("Extracting sections")
-    splitter = MarkdownHeaderTextSplitter(
-        headers_to_split_on=[("#", "Header 1"), ("##", "Header 2"), ("###", "Header 3")]
+    sections = split_markdown_by_headings(
+        md_text,
     )
-    sections = splitter.split_text(md_text)
     logger.success(f"Found {len(sections)} sections")
 
     logger.info(f"Writing sections to {output_dir}")
     output_dir = Path(output_dir)
     output_dir.mkdir(exist_ok=True, parents=True)
-    section_names = []
-    for section in sections:
-        if not section.metadata:
-            continue
-        section_name = list(section.metadata.values())[-1].lower()
-        section_names.append(section_name)
+
+    for section_name, section_content in sections.items():
         (output_dir / f"{section_name.replace('/', '_')}.txt").write_text(
-            section.page_content
+            section_content
         )
     logger.success("Done")
 
-    return section_names
+    return sections.keys()
diff --git a/tests/unit/test_preprocessing.py b/tests/unit/test_preprocessing.py
@@ -1,3 +1,6 @@
+import pytest
+
+from structured_qa.preprocessing import split_markdown_by_headings
 from structured_qa.preprocessing import document_to_sections_dir
 
 
@@ -6,4 +9,45 @@ def test_document_to_sections_dir(tmp_path, example_data):
     document_to_sections_dir(example_data / "1706.03762v7.pdf", output_dir)
     sections = list(output_dir.iterdir())
     assert all(section.is_file() and section.suffix == ".txt" for section in sections)
-    assert len(sections) == 10
+    assert len(sections) == 12
+
+
+DEFAULT_HEADINGS = """
+# Introduction
+
+This is the introduction.
+
+## Related Work
+
+This is the related work.
+
+### Method
+
+This is the method.
+"""
+
+NUMERIC_HEADINGS = """
+**1.** **Introduction**
+
+This is the introduction.
+
+**2.** **Related Work**
+
+This is the related work.
+
+**2.1** **Method**
+
+This is the method.
+"""
+
+
+@pytest.mark.parametrize(
+    ("markdown_text", "n_sections"),
+    (
+        (DEFAULT_HEADINGS, 3),
+        (NUMERIC_HEADINGS, 2),  # "**2.1**" lacks the trailing dot the default numeric pattern requires, so it opens no section
+    ),
+)
+def test_split_markdown_by_headings(markdown_text, n_sections):
+    sections = split_markdown_by_headings(markdown_text)
+    assert len(sections) == n_sections