Future-House · jamesbraza · Nov 4, 2025 · Nov 1, 2025 · Nov 1, 2025 · Nov 1, 2025
diff --git a/.mailmap b/.mailmap
@@ -12,3 +12,4 @@ Michael Skarlinski <[email protected]> mskarlin <12701035+mskarlin@use
 Odhran O'Donoghue <[email protected]> odhran-o-d <[email protected]>
 Odhran O'Donoghue <[email protected]> <[email protected]>
 Samantha Cox <[email protected]> <[email protected]>
+takeru fukushima <[email protected]> <[email protected]>
diff --git a/README.md b/README.md
@@ -8,7 +8,7 @@
 ![License](https://img.shields.io/badge/License-Apache_2.0-blue.svg)
 ![PyPI Python Versions](https://img.shields.io/pypi/pyversions/paper-qa)
 
-PaperQA2 is a package for doing high-accuracy retrieval augmented generation (RAG) on PDFs or text files,
+PaperQA2 is a package for doing high-accuracy retrieval augmented generation (RAG) on PDFs, text files, Microsoft Office documents, and source code files,
 with a focus on the scientific literature.
 See our [recent 2024 paper](https://paper.wikicrow.ai)
 to see examples of PaperQA2's superhuman performance in scientific tasks like
@@ -395,7 +395,7 @@ It just removes the automation associated with an agent picking the documents to
 ```python
 from paperqa import Docs, Settings
 
-# valid extensions include .pdf, .txt, .md, and .html
+# valid extensions include .pdf, .txt, .md, .html, .docx, .xlsx, .pptx, and code files (e.g., .py, .ts, .yaml)
 doc_paths = ("myfile.pdf", "myotherfile.pdf")
 
 # Prepare the Docs object by adding a bunch of documents
@@ -438,7 +438,7 @@ from paperqa import Docs
 
 async def main() -> None:
     docs = Docs()
-    # valid extensions include .pdf, .txt, .md, and .html
+    # valid extensions include .pdf, .txt, .md, .html, .docx, .xlsx, .pptx, and code files (e.g., .py, .ts, .yaml)
     for doc in ("myfile.pdf", "myotherfile.pdf"):
         await docs.aadd(doc)
 

diff --git a/pyproject.toml b/pyproject.toml
@@ -63,7 +63,7 @@ dev = [
     "ipython>=8",  # Pin to keep recent
     "litellm>=1.71",  # Lower pin for aiohttp transport adoption
     "mypy>=1.8",  # Pin for mutable-override
-    "paper-qa[docling,image,ldp,memory,pypdf-media,pymupdf,typing,zotero,local,qdrant]",
+    "paper-qa[docling,image,ldp,memory,pypdf-media,pymupdf,typing,zotero,local,qdrant,office]",
     "prek",
     "pydantic~=2.11",  # Pin for start of model_fields deprecation
     "pylint-pydantic",
@@ -92,6 +92,9 @@ memory = [
     "paper-qa[ldp]",
     "usearch>=2.16.4",  # Pin for Python 3.13 support
 ]
+office = [
+    "unstructured[docx,xlsx,pptx]",
+]
 openreview = [
     "openreview-py",
 ]

diff --git a/src/paperqa/readers.py b/src/paperqa/readers.py
@@ -3,6 +3,7 @@
 import asyncio
 import os
 from collections.abc import Awaitable, Callable
+from importlib.metadata import version
 from math import ceil
 from pathlib import Path
 from typing import Literal, Protocol, cast, overload, runtime_checkable
@@ -171,6 +172,61 @@ def parse_text(
     )
 
 
+def parse_office_doc(
+    path: str | os.PathLike,
+    **kwargs,
+) -> ParsedText:
+    """Parse an office document (.docx, .xlsx, .pptx) using unstructured.
+
+    Keyword arguments are forwarded to unstructured's `partition`. All text is
+    returned as one page keyed "1", alongside any extracted images.
+
+    Raises:
+        ImportError: If `unstructured` (the `office` extra) is not installed.
+    """
+    try:
+        import unstructured
+        from unstructured.documents.elements import Image, Table
+        from unstructured.partition.auto import partition
+    except ImportError as exc:
+        raise ImportError(
+            "Could not import `unstructured` dependencies. "
+            "Please install with `pip install paper-qa[office]`."
+        ) from exc
+    unstructured_version = version(unstructured.__name__)
+    text_parts: list[str] = []
+    media_list: list[ParsedMedia] = []
+    for el in partition(str(path), **kwargs):
+        if isinstance(el, Image):
+            # NOTE(review): confirm unstructured populates `image_data` here;
+            # its image payload field appears to be `image_base64` -- verify
+            media_list.append(
+                ParsedMedia(
+                    index=len(media_list),
+                    data=el.metadata.image_data,
+                    info={"suffix": el.metadata.image_mime_type},
+                )
+            )
+        elif isinstance(el, Table):
+            # HTML keeps table structure; fall back to text so no table is lost
+            text_parts.append(el.metadata.text_as_html or str(el))
+        else:
+            text_parts.append(str(el))
+    current_text = "".join(f"{part}\n\n" for part in text_parts)
+
+    return ParsedText(
+        # For office docs, the whole document is treated as a single "page"
+        content={"1": (current_text, media_list)},
+        metadata=ParsedMetadata(
+            parsing_libraries=[f"{unstructured.__name__} ({unstructured_version})"],
+            paperqa_version=pqa_version,
+            total_parsed_text_length=len(current_text),
+            count_parsed_media=len(media_list),
+            name=f"office_doc|path={path}",
+        ),
+    )
+
+
 def chunk_text(
     parsed_text: ParsedText,
     doc: Doc,
@@ -276,7 +332,7 @@ def chunk_code_text(
 
 IMAGE_EXTENSIONS = tuple({".png", ".jpg", ".jpeg"})
 # When HTML reader supports images, add here
-ENRICHMENT_EXTENSIONS = tuple({".pdf", *IMAGE_EXTENSIONS})
+ENRICHMENT_EXTENSIONS = tuple({".pdf", ".docx", ".xlsx", ".pptx", *IMAGE_EXTENSIONS})
 
 
 @overload
@@ -383,6 +439,9 @@ async def read_doc(  # noqa: PLR0912
         )
     elif str_path.endswith(IMAGE_EXTENSIONS):
         parsed_text = await parse_image(path, **parser_kwargs)
+    elif str_path.endswith((".docx", ".xlsx", ".pptx")):
+        # TODO: Make parse_office_doc async
+        parsed_text = await asyncio.to_thread(parse_office_doc, path, **parser_kwargs)
     else:
         parsed_text = await asyncio.to_thread(
             parse_text, path, split_lines=True, **parser_kwargs
@@ -412,15 +471,15 @@ async def read_doc(  # noqa: PLR0912
                 f"|reduction=cl100k_base{enrichment_summary}"
             ),
         )
-    elif str_path.endswith(".pdf"):
+    elif str_path.endswith((".pdf", ".docx", ".xlsx", ".pptx")):
         chunked_text = chunk_pdf(
             parsed_text, doc, chunk_chars=chunk_chars, overlap=overlap
         )
         chunk_metadata = ChunkMetadata(
             size=chunk_chars,
             overlap=overlap,
             name=(
-                f"paper-qa={pqa_version}|algorithm=overlap-pdf"
+                f"paper-qa={pqa_version}|algorithm=overlap-document"
                 f"|size={chunk_chars}|overlap={overlap}{enrichment_summary}"
             ),
         )
@@ -445,6 +504,7 @@ async def read_doc(  # noqa: PLR0912
                 f"|size={chunk_chars}|overlap={overlap}{enrichment_summary}"
             ),
         )
+
     else:
         chunked_text = chunk_code_text(
             parsed_text, doc, chunk_chars=chunk_chars, overlap=overlap

diff --git a/src/paperqa/settings.py b/src/paperqa/settings.py
@@ -598,7 +598,7 @@ class IndexSettings(BaseModel):
         default=lambda f: (
             f.suffix
             # TODO: add images after embeddings are supported
-            in {".txt", ".pdf", ".html", ".md"}
+            in {".txt", ".pdf", ".html", ".md", ".xlsx", ".docx", ".pptx"}
         ),
         exclude=True,
         description=(

diff --git a/tests/stub_data/dummy.docx b/tests/stub_data/dummy.docx
diff --git a/tests/stub_data/dummy.pptx b/tests/stub_data/dummy.pptx
diff --git a/tests/stub_data/dummy.xlsx b/tests/stub_data/dummy.xlsx
diff --git a/tests/test_paperqa.py b/tests/test_paperqa.py
@@ -3108,3 +3108,31 @@ async def test_reader_config_propagation(stub_data_dir: Path) -> None:
     assert mock_read_doc.call_args.kwargs["chunk_chars"] == 2000
     assert mock_read_doc.call_args.kwargs["overlap"] == 50
     assert mock_read_doc.call_args.kwargs["dpi"] == 144
+
+
+@pytest.mark.asyncio
+@pytest.mark.parametrize("filename", ["dummy.docx", "dummy.pptx", "dummy.xlsx"])
+async def test_parse_office_doc(stub_data_dir: Path, filename: str) -> None:
+    """Check an office document can be added to Docs and then queried."""
+    file_path = stub_data_dir / filename
+    if not file_path.exists():
+        pytest.skip(f"{filename} not found in stub_data")
+
+    docs = Docs()
+    settings = Settings(
+        llm="gemini/gemini-2.5-flash",
+        embedding="gemini/text-embedding-004",
+        summary_llm="gemini/gemini-2.5-flash",
+        agent={"agent_llm": "gemini/gemini-2.5-flash"},
+        parsing=ParsingSettings(use_doc_details=False),
+    )
+    docname = await docs.aadd(
+        file_path,
+        "dummy citation",
+        docname=filename,
+        settings=settings,
+    )
+    assert docname is not None
+    assert docs.texts
+    session = await docs.aquery("What is the RAG system?", settings=settings)
+    assert session.answer