-
Notifications
You must be signed in to change notification settings - Fork 786
Feature/new doc types #1169
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Feature/new doc types #1169
Changes from 19 commits
18c58db
95e6e25
2f32c8a
f7cc49d
e42e636
c801103
5e49ca1
65c5097
5ead779
a7c5e3a
5fe86c1
d4619bd
0be33fe
0775523
84c6bf2
e218d91
4933f16
77b95cc
280c381
d95ed9f
057c7f8
f4975fc
7bfad14
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -12,3 +12,4 @@ Michael Skarlinski <[email protected]> mskarlin <12701035+mskarlin@use | |
| Odhran O'Donoghue <[email protected]> odhran-o-d <[email protected]> | ||
| Odhran O'Donoghue <[email protected]> <[email protected]> | ||
| Samantha Cox <[email protected]> <[email protected]> | ||
| takeru fukushima <[email protected]><[email protected]> | ||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -3,6 +3,7 @@ | |
| import asyncio | ||
| import os | ||
| from collections.abc import Awaitable, Callable | ||
| from importlib.metadata import version | ||
| from math import ceil | ||
| from pathlib import Path | ||
| from typing import Literal, Protocol, cast, overload, runtime_checkable | ||
|
|
@@ -171,6 +172,61 @@ def parse_text( | |
| ) | ||
|
|
||
|
|
||
| def parse_office_doc( | ||
|
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Can you make a unit test for this in
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. [success] 90.64% tests/test_paperqa.py::test_parse_office_doc[dummy.docx]: 1.5548s Results (5.19s): I'm not confident, but the test passed. I'll commit.
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. And sorry for dummy .docx and .xlsx written in Japanese. |
||
| path: str | os.PathLike, | ||
| **kwargs, | ||
| ) -> ParsedText: | ||
| """Parse office documents (.docx, .xlsx, .pptx) using unstructured, extracting text and images.""" | ||
| try: | ||
| import unstructured | ||
| from unstructured.documents.elements import Image, Table | ||
| from unstructured.partition.auto import partition | ||
| except ImportError as exc: | ||
| raise ImportError( | ||
| "Could not import `unstructured` dependencies. " | ||
| "Please install with `pip install paper-qa[office]`." | ||
| ) from exc | ||
| UNSTRUCTURED_VERSION = version(unstructured.__name__) | ||
| elements = partition(str(path), **kwargs) | ||
|
|
||
| content_dict = {} | ||
| media_list: list[ParsedMedia] = [] | ||
| current_text = "" | ||
| media_index = 0 | ||
|
|
||
| for el in elements: | ||
| if isinstance(el, Image): | ||
| image_data = el.metadata.image_data | ||
| # Create a ParsedMedia object | ||
| parsed_media = ParsedMedia( | ||
| index=media_index, | ||
| data=image_data, | ||
| info={"suffix": el.metadata.image_mime_type}, | ||
| ) | ||
| media_list.append(parsed_media) | ||
| media_index += 1 | ||
| elif isinstance(el, Table): | ||
| # For tables, we could get the HTML representation for better structure | ||
| if el.metadata.text_as_html: | ||
| current_text += el.metadata.text_as_html + "\n\n" | ||
| else: | ||
| current_text += str(el) + "\n\n" | ||
|
|
||
| # For office docs, we can treat the whole document as a single "page" | ||
| content_dict["1"] = (current_text, media_list) | ||
|
|
||
| return ParsedText( | ||
| content=content_dict, | ||
| metadata=ParsedMetadata( | ||
| parsing_libraries=[f"{unstructured.__name__} ({UNSTRUCTURED_VERSION})"], | ||
| paperqa_version=pqa_version, | ||
| total_parsed_text_length=len(current_text), | ||
| count_parsed_media=len(media_list), | ||
| name=f"office_doc|path={path}", | ||
| ), | ||
| ) | ||
|
|
||
|
|
||
| def chunk_text( | ||
| parsed_text: ParsedText, | ||
| doc: Doc, | ||
|
|
@@ -276,7 +332,7 @@ def chunk_code_text( | |
|
|
||
| IMAGE_EXTENSIONS = tuple({".png", ".jpg", ".jpeg"}) | ||
| # When HTML reader supports images, add here | ||
| ENRICHMENT_EXTENSIONS = tuple({".pdf", *IMAGE_EXTENSIONS}) | ||
| ENRICHMENT_EXTENSIONS = tuple({".pdf", ".docx", ".xlsx", ".pptx", *IMAGE_EXTENSIONS}) | ||
|
|
||
|
|
||
| @overload | ||
|
|
@@ -383,6 +439,9 @@ async def read_doc( # noqa: PLR0912 | |
| ) | ||
| elif str_path.endswith(IMAGE_EXTENSIONS): | ||
| parsed_text = await parse_image(path, **parser_kwargs) | ||
| elif str_path.endswith((".docx", ".xlsx", ".pptx")): | ||
| # TODO: Make parse_office_doc async | ||
| parsed_text = await asyncio.to_thread(parse_office_doc, path, **parser_kwargs) | ||
| else: | ||
| parsed_text = await asyncio.to_thread( | ||
| parse_text, path, split_lines=True, **parser_kwargs | ||
|
|
@@ -412,15 +471,15 @@ async def read_doc( # noqa: PLR0912 | |
| f"|reduction=cl100k_base{enrichment_summary}" | ||
| ), | ||
| ) | ||
| elif str_path.endswith(".pdf"): | ||
| elif str_path.endswith((".pdf", ".docx", ".xlsx", ".pptx")): | ||
| chunked_text = chunk_pdf( | ||
| parsed_text, doc, chunk_chars=chunk_chars, overlap=overlap | ||
| ) | ||
| chunk_metadata = ChunkMetadata( | ||
| size=chunk_chars, | ||
| overlap=overlap, | ||
| name=( | ||
| f"paper-qa={pqa_version}|algorithm=overlap-pdf" | ||
| f"paper-qa={pqa_version}|algorithm=overlap-document" | ||
| f"|size={chunk_chars}|overlap={overlap}{enrichment_summary}" | ||
| ), | ||
| ) | ||
|
|
@@ -445,6 +504,7 @@ async def read_doc( # noqa: PLR0912 | |
| f"|size={chunk_chars}|overlap={overlap}{enrichment_summary}" | ||
| ), | ||
| ) | ||
|
|
||
| else: | ||
| chunked_text = chunk_code_text( | ||
| parsed_text, doc, chunk_chars=chunk_chars, overlap=overlap | ||
|
|
||
| Original file line number | Diff line number | Diff line change | ||||
|---|---|---|---|---|---|---|
|
|
@@ -3108,3 +3108,31 @@ async def test_reader_config_propagation(stub_data_dir: Path) -> None: | |||||
| assert mock_read_doc.call_args.kwargs["chunk_chars"] == 2000 | ||||||
| assert mock_read_doc.call_args.kwargs["overlap"] == 50 | ||||||
| assert mock_read_doc.call_args.kwargs["dpi"] == 144 | ||||||
|
|
||||||
|
|
||||||
| @pytest.mark.asyncio | ||||||
| @pytest.mark.parametrize("filename", ["dummy.docx", "dummy.pptx", "dummy.xlsx"]) | ||||||
| async def test_parse_office_doc(stub_data_dir: Path, filename: str) -> None: | ||||||
| file_path = stub_data_dir / filename | ||||||
| if not file_path.exists(): | ||||||
| pytest.skip(f"{filename} not found in stub_data") | ||||||
|
|
||||||
| docs = Docs() | ||||||
|
|
||||||
| settings = Settings( | ||||||
| llm="gemini/gemini-2.5-flash", | ||||||
| embedding="gemini/text-embedding-004", | ||||||
| summary_llm="gemini/gemini-2.5-flash", | ||||||
| agent={"agent_llm": "gemini/gemini-2.5-flash"}, | ||||||
| parsing=ParsingSettings(use_doc_details=False, disable_doc_valid_check=True), | ||||||
|
||||||
| parsing=ParsingSettings(use_doc_details=False, disable_doc_valid_check=True), | |
| parsing=ParsingSettings(use_doc_details=False), |
These docs should be valid (we don't need disable_doc_valid_check=True)
jamesbraza marked this conversation as resolved.
Show resolved
Hide resolved
Outdated
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Okay I actually ran these tests just now a bit, and I noticed the question "What is the RAG system?" only applies to the dummy.docx.
Can you either:
- Adjust the question to match each document
- Change the
.pptxandxlsxto also have content for"What is the RAG system?"
Let's make the assertions:
session = await docs.aquery("What is the RAG system?", settings=settings)
assert session.used_contexts
assert len(session.answer) > 10, "Expected an answer"
assert CANNOT_ANSWER_PHRASE not in session.answer, (
"Expected the system to be sure"
)
Uh oh!
There was an error while loading. Please reload this page.