# Feature/new doc types #1169
Changes from 14 commits
```diff
@@ -3,6 +3,7 @@
 import asyncio
 import os
 from collections.abc import Awaitable, Callable
+from importlib.metadata import version
 from math import ceil
 from pathlib import Path
 from typing import Literal, Protocol, cast, overload, runtime_checkable
```
|
```diff
@@ -171,6 +172,61 @@ def parse_text(
     )
 
 
+def parse_office_doc(
+    path: str | os.PathLike,
+    **kwargs,
+) -> ParsedText:
+    """Parse office documents (.docx, .xlsx, .pptx) using unstructured, extracting text and images."""
+    try:
+        import unstructured
+        from unstructured.documents.elements import Image, Table
+        from unstructured.partition.auto import partition
+    except ImportError as exc:
+        raise ImportError(
+            "Could not import `unstructured` dependencies. "
+            "Please install with `pip install paper-qa[office]`."
+        ) from exc
+    UNSTRUCTURED_VERSION = version(unstructured.__name__)
+    elements = partition(str(path), **kwargs)
+
+    content_dict = {}
+    media_list: list[ParsedMedia] = []
+    current_text = ""
+    media_index = 0
+
+    for el in elements:
+        if isinstance(el, Image):
+            image_data = el.metadata.image_data
+            # Create a ParsedMedia object
+            parsed_media = ParsedMedia(
+                index=media_index,
+                data=image_data,
+                info={"suffix": el.metadata.image_mime_type},
+            )
+            media_list.append(parsed_media)
+            media_index += 1
+        elif isinstance(el, Table):
+            # For tables, we could get the HTML representation for better structure
+            if el.metadata.text_as_html:
+                current_text += el.metadata.text_as_html + "\n\n"
+            else:
+                current_text += str(el) + "\n\n"
+
+    # For office docs, we can treat the whole document as a single "page"
+    content_dict["1"] = (current_text, media_list)
+
+    return ParsedText(
+        content=content_dict,
+        metadata=ParsedMetadata(
+            parsing_libraries=[f"{unstructured.__name__} ({UNSTRUCTURED_VERSION})"],
+            paperqa_version=pqa_version,
+            total_parsed_text_length=len(current_text),
+            count_parsed_media=len(media_list),
+            name=f"office_doc|path={path}",
+        ),
+    )
+
+
 def chunk_text(
     parsed_text: ParsedText,
     doc: Doc,
```

**Collaborator** (on `def parse_office_doc(`): Can you make a unit test for this in `tests/test_paperqa.py`?

**Author (Contributor):** It passed locally:

```
[success] 90.64% tests/test_paperqa.py::test_parse_office_doc[dummy.docx]: 1.5548s
Results (5.19s)
```

I'm not confident, but the test passed. I'll commit.

**Author (Contributor):** And sorry that the dummy .docx and .xlsx files are written in Japanese.
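For context on how `unstructured` drives this function: `partition()` auto-detects the file type and returns a flat list of typed elements (e.g. `Title`, `NarrativeText`, `Table`, `Image`), which is what the `isinstance` checks above dispatch on. A minimal sketch of inspecting that output, assuming `unstructured` is installed with the office-format extras:

```python
# Hedged sketch (not part of this PR): inspect what partition() yields
# for an office document.
from unstructured.partition.auto import partition

elements = partition("dummy.docx")
for el in elements:
    # Each element is typed; str(el) is its plain-text content.
    print(type(el).__name__, str(el)[:60])
```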
|
```diff
@@ -276,7 +332,7 @@ def chunk_code_text(
 
 IMAGE_EXTENSIONS = tuple({".png", ".jpg", ".jpeg"})
 # When HTML reader supports images, add here
-ENRICHMENT_EXTENSIONS = tuple({".pdf", *IMAGE_EXTENSIONS})
+ENRICHMENT_EXTENSIONS = tuple({".pdf", ".docx", ".xlsx", ".pptx", *IMAGE_EXTENSIONS})
 
 
 @overload
```
|
```diff
@@ -383,6 +439,9 @@ async def read_doc(  # noqa: PLR0912
         )
     elif str_path.endswith(IMAGE_EXTENSIONS):
         parsed_text = await parse_image(path, **parser_kwargs)
+    elif str_path.endswith((".docx", ".xlsx", ".pptx")):
+        # TODO: Make parse_office_doc async
+        parsed_text = await asyncio.to_thread(parse_office_doc, path, **parser_kwargs)
     else:
         parsed_text = await asyncio.to_thread(
             parse_text, path, split_lines=True, **parser_kwargs
```
|
```diff
@@ -412,15 +471,15 @@ async def read_doc(  # noqa: PLR0912
                 f"|reduction=cl100k_base{enrichment_summary}"
             ),
         )
-    elif str_path.endswith(".pdf"):
+    elif str_path.endswith((".pdf", ".docx", ".xlsx", ".pptx")):
         chunked_text = chunk_pdf(
             parsed_text, doc, chunk_chars=chunk_chars, overlap=overlap
         )
         chunk_metadata = ChunkMetadata(
             size=chunk_chars,
             overlap=overlap,
             name=(
-                f"paper-qa={pqa_version}|algorithm=overlap-pdf"
+                f"paper-qa={pqa_version}|algorithm=overlap-document"
                 f"|size={chunk_chars}|overlap={overlap}{enrichment_summary}"
             ),
         )
```
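Reusing `chunk_pdf` here works because `parse_office_doc` stores the entire document under a single page key (`"1"`), matching the page-keyed layout the PDF chunker already consumes. A hedged illustration, assuming `ParsedText.content` exposes the dict built in `parse_office_doc`:

```python
# Hedged sketch: office docs are parsed as one "page", so the
# page-oriented PDF chunker applies unchanged.
parsed = parse_office_doc("dummy.docx")
assert list(parsed.content) == ["1"]  # whole document under a single page key
```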
|
```diff
@@ -445,6 +504,7 @@ async def read_doc(  # noqa: PLR0912
             f"|size={chunk_chars}|overlap={overlap}{enrichment_summary}"
         ),
     )
+
     else:
         chunked_text = chunk_code_text(
             parsed_text, doc, chunk_chars=chunk_chars, overlap=overlap
```
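End to end, the new branch means office files flow through the same `read_doc` entry point as PDFs. A hedged usage sketch, assuming `read_doc` and `Doc` keep the signatures used in this file (the exact import paths are an assumption):

```python
# Hedged sketch (not part of this PR): reading a .docx via the new path.
import asyncio

from paperqa import Doc
from paperqa.readers import read_doc


async def main() -> None:
    doc = Doc(docname="dummy", citation="Dummy, 2024", dockey="dummy")
    # .docx dispatches to asyncio.to_thread(parse_office_doc, ...) per the
    # hunk above, then chunks via chunk_pdf with the overlap-document name.
    texts = await read_doc("dummy.docx", doc, chunk_chars=3000, overlap=100)
    print(len(texts))


asyncio.run(main())
```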
|
`tests/test_paperqa.py`:

```diff
@@ -3108,3 +3108,34 @@ async def test_reader_config_propagation(stub_data_dir: Path) -> None:
     assert mock_read_doc.call_args.kwargs["chunk_chars"] == 2000
     assert mock_read_doc.call_args.kwargs["overlap"] == 50
     assert mock_read_doc.call_args.kwargs["dpi"] == 144
+
+
+@pytest.mark.asyncio
+@pytest.mark.parametrize("filename", ["dummy.docx", "dummy.pptx", "dummy.xlsx"])
+async def test_parse_office_doc(stub_data_dir: Path, filename: str) -> None:
+    # This test requires the user to create dummy office files in tests/stub_data
+    # For example:
+    # touch tests/stub_data/dummy.docx
+    # touch tests/stub_data/dummy.pptx
+    # touch tests/stub_data/dummy.xlsx
+    file_path = stub_data_dir / filename
+    if not file_path.exists():
+        pytest.skip(f"{filename} not found in stub_data")
+
+    docs = Docs()
+    settings = Settings(
+        llm="openrouter/google/gemma-7b-it",
+        llm_config={"api_key": os.environ.get("OPEN_ROUTER_API_KEY")},
```
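One caveat on the `touch` commands in the test's comments: they create zero-byte files, which `unstructured` will likely fail to partition. A hedged sketch of creating a minimal but valid `dummy.docx` instead, assuming `python-docx` is available:

```python
# Hedged sketch (assumption: python-docx installed): write a small,
# structurally valid .docx for the test fixture.
from docx import Document

doc = Document()
doc.add_paragraph("dummy content for parse_office_doc tests")
doc.save("tests/stub_data/dummy.docx")
```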
Comment on `llm_config={"api_key": os.environ.get("OPEN_ROUTER_API_KEY")},`:

**jamesbraza:** I think you shouldn't need this; litellm should just auto-check `OPENROUTER_API_KEY`: https://docs.litellm.ai/docs/providers/openrouter. So maybe update your local env to have `OPENROUTER_API_KEY`.
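In other words, the suggestion is to drop the explicit key and rely on the environment variable litellm already looks up. A hedged sketch of the resulting test setup (the `Settings` usage mirrors the diff above; the env var lookup is litellm's documented behavior):

```python
# Hedged sketch: export OPENROUTER_API_KEY in the shell/CI environment,
# then litellm resolves the key itself -- no llm_config needed.
settings = Settings(llm="openrouter/google/gemma-7b-it")
```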
**jamesbraza** (outdated suggestion):

```diff
-        use_doc_details=False, disable_doc_valid_check=True, defer_embedding=True
+        use_doc_details=False, disable_doc_valid_check=True
```

I think you don't need to defer embeddings; we can embed right away.
jamesbraza marked this conversation as resolved.
**jamesbraza** (outdated, on `dockey="dummy_doc",`): No need to specify `dockey` if you're not going to use it.
**jamesbraza** (outdated): Can you look at the lint CI to fix this part?