Skip to content
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
Show all changes
121 commits
Select commit Hold shift + click to select a range
d57c367
enh(preprocessing): Add split_markdown_by_headings.
daavoo Jan 22, 2025
fe93f74
Add benchmark
daavoo Jan 20, 2025
92c70a7
Move to structured_qa. Add entrypoint
daavoo Jan 20, 2025
70ef785
Move back outside
daavoo Jan 20, 2025
16ff8bd
Fix main
daavoo Jan 20, 2025
539898e
Update questions
daavoo Jan 20, 2025
ed71947
Update model and prompt
daavoo Jan 20, 2025
fd4fb95
Update
daavoo Jan 20, 2025
5add514
Update
daavoo Jan 20, 2025
9f8c755
fix
daavoo Jan 20, 2025
bec2ef1
Add system_instruction
daavoo Jan 20, 2025
08cad02
Update ratio
daavoo Jan 20, 2025
b7ce84e
Add more wait
daavoo Jan 20, 2025
6fc48fe
Fix return
daavoo Jan 20, 2025
8929e9e
Fix URLs
daavoo Jan 20, 2025
4a9e75e
Update download name
daavoo Jan 20, 2025
41ffc23
Update
daavoo Jan 20, 2025
4390852
Update
daavoo Jan 20, 2025
68621eb
Update with upper
daavoo Jan 20, 2025
422e5d5
Cast to str
daavoo Jan 20, 2025
3040978
Extend
daavoo Jan 20, 2025
bc0d8ce
Add benchmark
daavoo Jan 20, 2025
03e0e60
Fix
daavoo Jan 20, 2025
c19738e
fix
daavoo Jan 20, 2025
3cd7b24
Drop export
daavoo Jan 21, 2025
22df32b
Updates
daavoo Jan 21, 2025
b35dc23
Update default model
daavoo Jan 21, 2025
6cf13d7
Update
daavoo Jan 21, 2025
ad1ef9b
Use info
daavoo Jan 21, 2025
f237b89
Update with None
daavoo Jan 21, 2025
a34f4e2
Add answer type
daavoo Jan 21, 2025
291e376
Refactor
daavoo Jan 21, 2025
d7e99e7
Add fallback for out of context
daavoo Jan 21, 2025
0f381bb
Update with debugging info
daavoo Jan 21, 2025
a0391a4
Update
daavoo Jan 21, 2025
c3182cb
Update with mit-1
daavoo Jan 22, 2025
20b1651
test unsloth
daavoo Jan 22, 2025
0dd98da
Add , skip_special_tokens = True
daavoo Jan 22, 2025
6ac29aa
Update
daavoo Jan 22, 2025
95b3d57
Updates
daavoo Jan 22, 2025
d946f81
Add full_context
daavoo Jan 22, 2025
4ea1f7d
Update full context
daavoo Jan 22, 2025
a4888f2
update
daavoo Jan 22, 2025
e0f3a82
Add load and clean
daavoo Jan 22, 2025
906c8d9
Update
daavoo Jan 22, 2025
bb2afe5
Update
daavoo Jan 22, 2025
51c31f7
print
daavoo Jan 22, 2025
c5e0ac4
Update
daavoo Jan 22, 2025
cc10a9d
Add load_gemini_model
daavoo Jan 22, 2025
1560c71
Add sleep
daavoo Jan 22, 2025
94e7580
Update get_response
daavoo Jan 22, 2025
e7b5d5b
Update
daavoo Jan 22, 2025
5f6443b
Log error
daavoo Jan 22, 2025
819c6b2
fix
daavoo Jan 22, 2025
5625c39
Make the more info check more flexible
daavoo Jan 23, 2025
d125b79
Add gemini_full_context notebook
daavoo Jan 23, 2025
88a9357
typo
daavoo Jan 23, 2025
d929a80
Check por API KEY
daavoo Jan 23, 2025
9e718b3
Update with outputs
daavoo Jan 23, 2025
9027567
Add ragatouille
daavoo Jan 23, 2025
d2a3d98
Fix
daavoo Jan 23, 2025
17942ca
Update notebooks
daavoo Jan 24, 2025
fcdd953
Update gemini notebooks
daavoo Jan 24, 2025
bfdacea
Extend structured_qa. Add perfect_context.
daavoo Jan 27, 2025
a7d8dc5
Add gemini_perfect_context
daavoo Jan 27, 2025
308ab91
Update
daavoo Jan 27, 2025
704050b
fix line
daavoo Jan 27, 2025
67b8f80
fix line
daavoo Jan 27, 2025
a6bfe34
Update perfect_context
daavoo Jan 28, 2025
39a17ae
Add missing perfect context
daavoo Jan 28, 2025
ae325d3
Updates
daavoo Jan 28, 2025
56d8620
Update gemini_ragatouille
daavoo Jan 28, 2025
eb00902
Update gemini_fra
daavoo Jan 28, 2025
1d06d2c
Update
daavoo Jan 28, 2025
8ac9201
Update
daavoo Jan 28, 2025
0352173
Drop some log
daavoo Jan 28, 2025
0b8e5cf
Update
daavoo Jan 28, 2025
e2c5457
Update gemini_perfect_context with results
daavoo Jan 29, 2025
36350ee
Use rapizfuzz
daavoo Jan 29, 2025
215226e
Use question_part
daavoo Jan 29, 2025
5d4d961
Fix
daavoo Jan 29, 2025
1223b03
break when no section_names
daavoo Jan 29, 2025
08c0b85
Update prompt
daavoo Jan 29, 2025
7b9c96c
Add qwen perfect context
daavoo Jan 29, 2025
c056bdc
Update gemini_find_retrieve_answer
daavoo Jan 30, 2025
b726447
Update qwen perfect context
daavoo Jan 30, 2025
036f8a3
Add qwen RAGatouille
daavoo Jan 30, 2025
6b0a0c1
Update qwen notebooks
daavoo Jan 30, 2025
c60fe3e
Update
daavoo Jan 30, 2025
d12fa72
Update prompt
daavoo Jan 30, 2025
38d2530
Update qwen notebooks
daavoo Jan 30, 2025
1360437
Cleanup
daavoo Jan 30, 2025
6906991
Cleanup
daavoo Jan 30, 2025
8abcfb1
Add DeepSeek-R1-Distill-Qwen-7B
daavoo Jan 31, 2025
034fe29
Debug current calls. Set to 9 before reset
daavoo Feb 1, 2025
a2d301f
Add qwen find retrieve answer
daavoo Feb 1, 2025
8300573
Extend benchmark
daavoo Feb 3, 2025
4f8f82a
Update
daavoo Feb 3, 2025
2de0bfb
Add max_sections_to_check
daavoo Feb 3, 2025
8f7d173
Default to None
daavoo Feb 3, 2025
7ff95ff
Default to half of sections
daavoo Feb 3, 2025
d05d992
Update
daavoo Feb 3, 2025
db63dc9
fix
daavoo Feb 3, 2025
20f9e3f
Fix
daavoo Feb 3, 2025
c5ee8e6
Add qwen full context
daavoo Feb 3, 2025
a4da649
Update qwen_full_context
daavoo Feb 3, 2025
4ea56e2
Update gemini_full_context
daavoo Feb 3, 2025
82f37f3
Add statistics
daavoo Feb 3, 2025
a02ffd7
Update prompt
daavoo Feb 4, 2025
8af98df
Update with type
daavoo Feb 4, 2025
97049d6
Update gemini prompt and count
daavoo Feb 4, 2025
6555304
Update results with same prompts
daavoo Feb 4, 2025
0ab4688
Update with same prompt
daavoo Feb 4, 2025
5276d16
Update results
daavoo Feb 4, 2025
476bbe1
Bring back llama-cpp-python
daavoo Feb 5, 2025
fdafdc3
Update prompts
daavoo Feb 5, 2025
2ac1f61
Reduce notebook size
daavoo Feb 5, 2025
c99adb0
Update pre-commit
daavoo Feb 5, 2025
a114fe5
Update docstrings
daavoo Feb 5, 2025
df394cc
Merge branch 'main' into 5-add-benchmark
daavoo Feb 5, 2025
eec44b0
Update test
daavoo Feb 5, 2025
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Next Next commit
enh(preprocessing): Add split_markdown_by_headings.
  • Loading branch information
daavoo committed Jan 22, 2025
commit d57c367eec0d8edee70d40f2687e91cfe9b2e24e
4 changes: 1 addition & 3 deletions docs/step-by-step-guide.md
Original file line number Diff line number Diff line change
Expand Up @@ -34,9 +34,7 @@ The document is first converted to markdown and then split into sections based o

**Section Splitting**

- Uses [langchain-text-splitters](https://pypi.org/project/langchain-text-splitters/)

- Splits on `("#", "Header 1"), ("##", "Header 2"), ("###", "Header 3")`
- Uses [split_markdown_by_headings](api.md/#structured_qa.preprocessing.split_markdown_by_headings)

- Each section is saved to a separate file.

Expand Down
1 change: 0 additions & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,6 @@ dependencies = [
"huggingface-hub",
"llama-cpp-python",
"loguru",
"langchain-text-splitters",
"pydantic",
"pymupdf4llm",
"pyyaml",
Expand Down
74 changes: 61 additions & 13 deletions src/structured_qa/preprocessing.py
Original file line number Diff line number Diff line change
@@ -1,18 +1,70 @@
import re
from collections import defaultdict
from pathlib import Path

import pymupdf4llm
from langchain_text_splitters import MarkdownHeaderTextSplitter

from loguru import logger


def split_markdown_by_headings(
markdown_text, heading_patterns: list[str] | None = None
) -> dict[str, str]:
"""Splits a markdown document into sections based on specified heading patterns.

Args:
markdown_text (str): The markdown document as a single string.
heading_patterns (str, optional): A list of regex patterns representing heading markers
in the markdown document.
Defaults to None.
If None, the default patterns are used:

```python
[
r"^#\s+(.+)$",
r"^##\s+(.+)$",
r"^###\s+(.+)$",
r"^\*\*[\d\.]+\.\*\*\s*\*\*(.+)\*\*$",
r"^\*\*[\d\.]+\.\*\*\s+(.+)$"
]
```

Returns:
dict[str, str]: A dictionary where the keys are the section names and the values are the section contents.
"""
if heading_patterns is None:
heading_patterns = [
r"^#\s+(.+)$",
r"^##\s+(.+)$",
r"^###\s+(.+)$",
r"^####\s+(.+)$",
r"^\*\*[\d\.]+\.\*\*\s*\*\*(.+)\*\*$",
]

sections = defaultdict(str)

heading_text = "INTRO"
for line in markdown_text.splitlines():
line = line.strip()
if not line:
continue
for pattern in heading_patterns:
match = re.match(pattern, line)
if match:
heading_text = match.group(1)[:100]
break
sections[heading_text] += f"{line}\n"

return sections


@logger.catch(reraise=True)
def document_to_sections_dir(input_file: str, output_dir: str) -> list[str]:
"""
Convert a document to a directory of sections.

Uses [pymupdf4llm](https://pypi.org/project/pymupdf4llm/) to convert input_file to markdown.
Then uses [langchain_text_splitters](https://pypi.org/project/langchain-text-splitters/) to split the markdown into sections based on the headers.
Then uses [`split_markdown_by_headings`][structured_qa.preprocessing.split_markdown_by_headings] to split the markdown into sections based on the headers.

Args:
input_file: Path to the input document.
Expand All @@ -32,27 +84,23 @@ def document_to_sections_dir(input_file: str, output_dir: str) -> list[str]:

logger.info(f"Converting {input_file}")
md_text = pymupdf4llm.to_markdown(input_file)
Path("debug.md").write_text(md_text)
logger.success("Converted")

logger.info("Extracting sections")
splitter = MarkdownHeaderTextSplitter(
headers_to_split_on=[("#", "Header 1"), ("##", "Header 2"), ("###", "Header 3")]
sections = split_markdown_by_headings(
md_text,
)
sections = splitter.split_text(md_text)
logger.success(f"Found {len(sections)} sections")

logger.info(f"Writing sections to {output_dir}")
output_dir = Path(output_dir)
output_dir.mkdir(exist_ok=True, parents=True)
section_names = []
for section in sections:
if not section.metadata:
continue
section_name = list(section.metadata.values())[-1].lower()
section_names.append(section_name)

for section_name, section_content in sections.items():
(output_dir / f"{section_name.replace('/', '_')}.txt").write_text(
section.page_content
section_content
)
logger.success("Done")

return section_names
return sections.keys()
46 changes: 45 additions & 1 deletion tests/unit/test_preprocessing.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,6 @@
import pytest

from structured_qa.preprocessing import split_markdown_by_headings
from structured_qa.preprocessing import document_to_sections_dir


Expand All @@ -6,4 +9,45 @@ def test_document_to_sections_dir(tmp_path, example_data):
document_to_sections_dir(example_data / "1706.03762v7.pdf", output_dir)
sections = list(output_dir.iterdir())
assert all(section.is_file() and section.suffix == ".txt" for section in sections)
assert len(sections) == 10
assert len(sections) == 12


# Markdown sample whose three sections are introduced by "#", "##" and "###"
# headings respectively.
DEFAULT_HEADINGS = """
# Introduction

This is the introduction.

## Related Work

This is the related work.

### Method

This is the method.
"""

# Markdown sample using bold numbered headings. "**1.**" and "**2.**" end with
# a dot inside the bold marker, while "**2.1**" does not.
NUMERIC_HEADINGS = """
**1.** **Introduction**

This is the introduction.

**2.** **Related Work**

This is the related work.

**2.1** **Method**

This is the method.
"""


@pytest.mark.parametrize(
    ("markdown_text", "n_sections"),
    [
        (DEFAULT_HEADINGS, 3),
        (NUMERIC_HEADINGS, 2),
    ],
)
def test_split_markdown_by_headings(markdown_text, n_sections):
    """Check how many sections the default heading patterns produce."""
    found_sections = split_markdown_by_headings(markdown_text)
    section_count = len(found_sections)
    assert section_count == n_sections
Loading