forked from MemTensor/MemOS
-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathmarkdown_chunker.py
More file actions
62 lines (54 loc) · 2.39 KB
/
markdown_chunker.py
File metadata and controls
62 lines (54 loc) · 2.39 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
from memos.configs.chunker import MarkdownChunkerConfig
from memos.dependency import require_python_package
from memos.log import get_logger
from .base import BaseChunker, Chunk
# Module-level logger, named after this module via ``__name__``.
logger = get_logger(__name__)
class MarkdownChunker(BaseChunker):
    """Chunker that splits Markdown text along its headers.

    Text is first split with ``MarkdownHeaderTextSplitter``. When recursive
    splitting is enabled, every resulting section is split again with
    ``RecursiveCharacterTextSplitter``.
    """

    @require_python_package(
        import_name="langchain_text_splitters",
        install_command="pip install langchain_text_splitters==1.0.0",
        install_link="https://github.com/langchain-ai/langchain-text-splitters",
    )
    def __init__(
        self,
        config: MarkdownChunkerConfig | None = None,
        chunk_size: int = 1000,
        chunk_overlap: int = 200,
        recursive: bool = False,
    ):
        """Build the header splitter and, if requested, the recursive splitter.

        Args:
            config: Optional chunker configuration. When given, its
                ``headers_to_split_on``, ``strip_headers``, ``chunk_size``
                and ``chunk_overlap`` are used instead of the defaults and
                instead of the ``chunk_size`` / ``chunk_overlap`` arguments.
            chunk_size: ``chunk_size`` for the recursive splitter; only used
                when no ``config`` is given.
            chunk_overlap: ``chunk_overlap`` for the recursive splitter; only
                used when no ``config`` is given.
            recursive: Enable the recursive splitter. It is also enabled when
                ``config.recursive`` is truthy.
        """
        # Local import: the dependency is only needed once a chunker is built.
        from langchain_text_splitters import (
            MarkdownHeaderTextSplitter,
            RecursiveCharacterTextSplitter,
        )

        self.config = config

        if config:
            headers_to_split_on = config.headers_to_split_on
            strip_headers = config.strip_headers
        else:
            # Defaults: split on the first three header levels, keep headers.
            headers_to_split_on = [
                ("#", "Header 1"),
                ("##", "Header 2"),
                ("###", "Header 3"),
            ]
            strip_headers = False

        self.chunker = MarkdownHeaderTextSplitter(
            headers_to_split_on=headers_to_split_on,
            strip_headers=strip_headers,
        )
        # Stays None unless recursive splitting is requested below.
        self.chunker_recursive = None
        logger.info(f"Initialized MarkdownHeaderTextSplitter with config: {config}")

        if (config and config.recursive) or recursive:
            self.chunker_recursive = RecursiveCharacterTextSplitter(
                chunk_size=config.chunk_size if config else chunk_size,
                chunk_overlap=config.chunk_overlap if config else chunk_overlap,
                length_function=len,
            )

    def chunk(self, text: str, **kwargs) -> list[str] | list[Chunk]:
        """Split Markdown ``text`` into chunks along its headers.

        Each chunk consists of the section's header metadata values joined by
        spaces on a first line, followed by the section content. Sections
        without header metadata are returned as their content only.

        Args:
            text: Markdown text to split.
            **kwargs: Accepted but not used by this implementation.

        Returns:
            The chunk strings, in the order produced by the splitter(s).
        """
        documents = self.chunker.split_text(text)
        if self.chunker_recursive:
            documents = self.chunker_recursive.split_documents(documents)

        chunks = []
        for doc in documents:
            # Best effort: if the metadata cannot be joined (missing or
            # non-string values), fall back to the bare section content.
            try:
                header = " ".join(doc.metadata.values())
            except (AttributeError, TypeError) as e:
                logger.warning(f"warning chunking document: {e}")
                header = ""
            # Only prepend the header line when there is one, so header-less
            # sections do not start with a stray newline.
            chunks.append(header + "\n" + doc.page_content if header else doc.page_content)

        logger.info(f"Generated chunks: {chunks[:5]}")
        logger.debug(f"Generated {len(chunks)} chunks from input text")
        return chunks