Skip to content

Commit d644d4d

Browse files
committed
add default source for File types
The chunk_file() function adds the sources when chunking. Without a default source, a File used without chunking will raise an error.
1 parent 2394245 commit d644d4d

File tree

1 file changed

+3
-0
lines changed

1 file changed

+3
-0
lines changed

knowledge_gpt/core/parsing.py

Lines changed: 3 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -63,6 +63,7 @@ def from_bytes(cls, file: BytesIO) -> "DocxFile":
6363
text = docx2txt.process(file)
6464
text = strip_consecutive_newlines(text)
6565
doc = Document(page_content=text.strip())
66+
doc.metadata["source"] = "p-1"
6667
return cls(name=file.name, id=md5(file.read()).hexdigest(), docs=[doc])
6768

6869

@@ -76,6 +77,7 @@ def from_bytes(cls, file: BytesIO) -> "PdfFile":
7677
text = strip_consecutive_newlines(text)
7778
doc = Document(page_content=text.strip())
7879
doc.metadata["page"] = i + 1
80+
doc.metadata["source"] = f"p-{i+1}"
7981
docs.append(doc)
8082
# file.read() mutates the file object, which can affect caching
8183
# so we need to reset the file pointer to the beginning
@@ -90,6 +92,7 @@ def from_bytes(cls, file: BytesIO) -> "TxtFile":
9092
text = strip_consecutive_newlines(text)
9193
file.seek(0)
9294
doc = Document(page_content=text.strip())
95+
doc.metadata["source"] = "p-1"
9396
return cls(name=file.name, id=md5(file.read()).hexdigest(), docs=[doc])
9497

9598

0 commit comments

Comments
 (0)