diff --git a/.gitattributes b/.gitattributes new file mode 100644 index 000000000..070aded99 --- /dev/null +++ b/.gitattributes @@ -0,0 +1,25 @@ +# Disable EOL conversion for binary fixtures so Windows checkouts +# do not mangle test inputs. See upstream PR #699. + +# Office document formats +*.docx binary +*.dotx binary +*.docm binary +*.dotm binary +*.doc binary +*.dot binary +*.xlsx binary +*.pptx binary + +# Image formats used as fixtures +*.png binary +*.jpg binary +*.jpeg binary +*.gif binary +*.bmp binary +*.tif binary +*.tiff binary +*.emf binary +*.wmf binary +*.ico binary +*.pdf binary diff --git a/.gitignore b/.gitignore index 5aabfd8cc..926fd1c12 100644 --- a/.gitignore +++ b/.gitignore @@ -8,3 +8,4 @@ _scratch/ Session.vim /.tox/ +.claude/ diff --git a/.travis.yml b/.travis.yml deleted file mode 100644 index 6ce09e8e6..000000000 --- a/.travis.yml +++ /dev/null @@ -1,9 +0,0 @@ -language: python -python: - - "3.8" - - "3.6" - - "2.7" -# command to install dependencies, e.g. pip install -r requirements.txt --use-mirrors -install: pip install -r requirements.txt -# command to run tests, e.g. python setup.py test -script: py.test diff --git a/CLAUDE.md b/CLAUDE.md new file mode 100644 index 000000000..39bc16df3 --- /dev/null +++ b/CLAUDE.md @@ -0,0 +1,236 @@ +# CLAUDE.md + +python-docx fork (loadfix/python-docx) — extending python-docx with footnotes, endnotes, track changes, fields, bookmarks, and other missing OOXML capabilities. + +This project is one of a sibling series of OOXML libraries under the loadfix org: + +- **loadfix/python-docx** — Word `.docx` (this repo) +- **loadfix/python-pptx** — PowerPoint `.pptx` +- **loadfix/python-xlsx** — Excel `.xlsx` +- **loadfix/ooxml-validate** — cross-format validator (Microsoft Open XML SDK + LibreOffice) + +The three reading/writing libraries share an architectural lineage (three-layer proxy/part/oxml pattern over lxml) and OOXML spec conventions. 
When implementing a feature that exists across the trio, consult the sibling repos for naming and API-shape precedent. When verifying that library output is correct, use `ooxml-validate`. + +## Architecture + +Three-layer pattern: + +``` +Document API (src/docx/document.py, src/docx/footnotes.py, etc.) + | Proxy objects wrapping oxml elements +Parts Layer (src/docx/parts/*.py) + | XmlPart subclasses owning XML trees, managing relationships +oxml Layer (src/docx/oxml/*.py) + | CT_* element classes extending lxml.etree.ElementBase +lxml (XML parsing/serialization) +``` + +## Source Layout + +``` +src/docx/ Main package (src-layout, NOT flat) +src/docx/oxml/ CT_* element classes (low-level XML wrappers) +src/docx/parts/ Part classes (document, numbering, comments, styles, etc.) +src/docx/text/ Text-related proxy classes (paragraph, run, font, parfmt) +src/docx/styles/ Style proxy classes +src/docx/enum/ Enumerations (WD_ALIGN, WD_STYLE_TYPE, etc.) +src/docx/templates/ Default XML templates for new parts +tests/ pytest test suite +features/ behave acceptance tests +``` + +## Key Patterns + +### CT_ Element Classes (oxml layer) + +Define in `src/docx/oxml/`, register in `src/docx/oxml/__init__.py`. 
+ +```python +from docx.oxml.xmlchemy import BaseOxmlElement, ZeroOrOne, ZeroOrMore, RequiredAttribute +from docx.oxml.simpletypes import ST_DecimalNumber, ST_String + +class CT_Footnote(BaseOxmlElement): + """``<w:footnote>`` element.""" + pPr = ZeroOrOne("w:pPr", successors=("w:r",)) + r = ZeroOrMore("w:r", successors=()) + id = RequiredAttribute("w:id", ST_DecimalNumber) +``` + +- `ZeroOrOne(tag, successors=(...))` — generates getter, `_add_*()`, `get_or_add_*()`, `_remove_*()`, `_insert_*()` +- `ZeroOrMore(tag, successors=(...))` — generates `*_lst` property, `add_*()`, `_insert_*()` +- `successors` tuple must match XSD schema ordering exactly — consult `../ooxml-reference-corpus/spec/ecma-376-5/part-1/xsd/wml.xsd` (WordprocessingML), `dml-wordprocessingDrawing.xsd` (anchor/inline drawing), or `shared-math.xsd` (OMML) for authoritative ordering. The sibling `rnc/` directory has RELAX NG Compact variants that are easier to read than the XSDs. +- Register: `register_element_cls("w:footnote", CT_Footnote)` in `oxml/__init__.py` + +### Part Classes + +Extend `XmlPart` or `StoryPart`. Follow `CommentsPart` as a model: + +```python +class FootnotesPart(StoryPart): + @classmethod + def default(cls, package): + partname = PackURI("/word/footnotes.xml") + content_type = CT.WML_FOOTNOTES + element = cast("CT_Footnotes", parse_xml(cls._default_xml())) + return cls(partname, content_type, element, package) +``` + +Wire into `DocumentPart` with lazy creation: +```python +@property +def _footnotes_part(self): + try: + return self.part_related_by(RT.FOOTNOTES) + except KeyError: + part = FootnotesPart.default(self.package) + self.relate_to(part, RT.FOOTNOTES) + return part +``` + +Register in `src/docx/__init__.py`: +```python +PartFactory.part_type_for[CT.WML_FOOTNOTES] = FootnotesPart +``` + +### Proxy Objects + +Wrap CT_ elements. 
Inherit from `ElementProxy`, `StoryChild`, or `BlockItemContainer`: + +```python +class Footnote(BlockItemContainer): + @property + def footnote_id(self): + return self._element.id +``` + +### Constants + +- Content types: `src/docx/opc/constants.py` — `CT.WML_FOOTNOTES` and `CT.WML_ENDNOTES` already defined +- Relationship types: same file — `RT.FOOTNOTES` and `RT.ENDNOTES` already defined +- Namespaces: `src/docx/oxml/ns.py` — `qn("w:footnote")` for Clark notation + +## OOXML feature workflow (required before implementing any new feature) + +Every OOXML feature is defined by a manifest in the shared corpus +repository `loadfix/ooxml-reference-corpus` (sibling checkout at +`../ooxml-reference-corpus/`). Before implementing any new feature: + +1. **Read the manifest.** Look under + `../ooxml-reference-corpus/features/docx/` for a JSON manifest whose + `assertions` block defines what "passing" means. Example: + `features/docx/bold-text.json` — the XPath there is the canonical + definition of "bold". + +2. **Consult the ECMA-376 5th edition spec** (corpus-only — the spec + archive is intentionally NOT duplicated into this repo): + - PDF: `../ooxml-reference-corpus/spec/ecma-376-5/part-1/Ecma Office Open XML Part 1 - Fundamentals And Markup Language Reference.pdf` + - RNC schemas (easier to read): `../ooxml-reference-corpus/spec/ecma-376-5/part-1/rnc/` + - XSD schemas (authoritative for validators): `../ooxml-reference-corpus/spec/ecma-376-5/part-1/xsd/` + +3. **If no manifest exists yet**, author one *first*. The manifest is + the definition of done; the library code is an attempt at it. Follow + the schema at + `../ooxml-reference-corpus/features/manifest.schema.json`. Commit: + the JSON manifest, a `scripts/gen_<feature>.py` generator, the + machine-generated fixture under `fixtures/docx/`, and (optional but + valuable) an Office-authored `.office.docx` companion. + +4. 
**Verify conformance.** Implementation passes when + `ooxml_validate.conformance.run_feature(manifest, library="python-docx", + fixture_path=output)` returns `status="pass"`. + +## OOXML spec vs Microsoft Word reality + +Microsoft Word does NOT strictly implement ISO/IEC 29500 / ECMA-376. Treat the spec as a starting point, not ground truth. + +- Word writes the **Transitional** flavor, not **Strict**. The 4th/5th/6th editions of ISO 29500-1 tightened the spec toward Strict; Word still emits Transitional namespaces that trace back to the original 1st edition / ECMA-376 2006. +- Word emits Microsoft extensions in the `w14:`, `w15:`, `w16:`, `w16cid:`, `w16se:` namespaces (Word 2010/2013/2016+), gated by `mc:Ignorable`. These are documented in the `[MS-DOCX]` / `[MS-OE376]` extension series, not in the corpus-level spec. +- Word's reader tolerates out-of-order, extra, and missing elements that the spec forbids. Word's writer emits shapes the spec doesn't mandate. A spec-valid file is not automatically a file Word will open cleanly. +- **When the spec and Word disagree, match Word.** The canonical way to resolve ambiguity is: save a minimal `.docx` from Word, unzip it, and inspect the XML. The corpus-level `.xsd` files tell you what is *allowed*; the `.office.docx` companions in the corpus tell you what is *interoperable*. + +## Test Conventions + +- Framework: pytest with BDD-style naming +- Test classes: `Describe*` pattern +- Test methods: `it_*`, `its_*`, `they_*` prefixes +- Test XML: `cxml.element("w:footnotes/(w:footnote{w:id=1})")` — compact XML expression language +- Mocks: `class_mock(request, "dotted.path")`, `instance_mock(request, Class)`, `method_mock(request, Class, "name")` +- Test utilities in `tests/unitutil/` +- Acceptance tests live under `features/` (behave, Gherkin `.feature` files plus step modules). 
+ +Example: +```python +class DescribeCT_Footnotes: + def it_can_add_a_footnote(self): + footnotes = cast(CT_Footnotes, element("w:footnotes")) + footnote = footnotes.add_footnote() + assert footnote.id == 2 +``` + +## Commands + +```bash +# Run tests +pytest tests/ -v + +# Run a specific test +pytest tests/unit/test_footnotes.py -v + +# Run acceptance tests +behave features/ + +# Type check +pyright src/ + +# Install in dev mode +pip install -e ".[dev]" +``` + +## What NOT to do + +- Don't amend or force-push to `master`, and never force-push to an upstream remote under any circumstance. +- Don't commit secrets, API tokens, local scratch output, or generated docs. +- Don't add runtime dependencies lightly — every new dep affects a large user base. If you must, raise it first. +- Don't introduce backwards-incompatible API changes without a HISTORY/FEATURES note and a transition plan (deprecation warning where possible). +- Don't silence warnings with broad `filterwarnings` ignores — they exist to catch real problems. +- Don't delete `py.typed`; removing it silently breaks downstream type-checking. +- Don't bypass the xmlchemy descriptor layer with raw `lxml.etree` access in production code — the descriptors carry namespace, type, and default semantics. +- Don't move unit tests out of their current location or rename test methods away from the `Describe*` / `it_*` BDD convention — test discovery relies on it. + +## Common workflows + +### Adding a new public method on an existing class +1. Implement in the appropriate `src/docx/…` module. +2. Add unit tests in the mirrored test file under `tests/`. +3. Add a behave scenario under `features/` if the capability is user-visible. +4. Update `FEATURES.md` — refresh the entry and snippet, verify the snippet runs against a fresh `Document()`. + +### Adding a new enum value +- Enums live in `src/docx/enum/`. Read a neighboring enum first to see the XML-mapping pattern. 
+- Update any doc reference that enumerates the valid values. + +### Adding a new XML element class +- Custom element classes live in `src/docx/oxml/…`. They use the `xmlchemy` descriptor layer (`ZeroOrOne`, `OneAndOnlyOne`, `ZeroOrMore`, `RequiredAttribute`, `OptionalAttribute`, …). +- Consult `../ooxml-reference-corpus/spec/ecma-376-5/part-1/xsd/*.xsd` (or the easier-to-read sibling `rnc/*.rnc`) for authoritative element ordering before declaring `successors`. +- Register the class with `register_element_cls("w:tag", CT_Tag)` at the bottom of `src/docx/oxml/__init__.py`. +- Save a minimal `.docx` from Word that exercises the element, unzip it, and compare — **when the spec and Word disagree, match Word**. + +### Keep README.md and TODO.md current + +Whenever a feature is added, removed, or a public option changes, update both of these files *in the same PR* as the code change — stale docs have bitten us before. + +- **`README.md`** — the API block reflects the real public surface. If you add/remove a function or option, add/remove the matching entry. If you add or remove an export, reflect it in the API section. Any prose sections (Status, Contributing, project-specific sections) should also match reality. +- **`TODO.md`** — if the change resolves a tracked issue, move that entry into a "Resolved in fork" / "Done" section with a one-line description and the PR/commit reference. Update any counts table at the top and bump the "last updated" date. + +Minimum check before every PR that touches source: `grep -n "<changed-symbol>" README.md TODO.md` (substituting the name of the function, class, or option you touched) to catch stale references. 
+ +## Important + +- Before implementing a new feature or element class, consult `../ooxml-reference-corpus/spec/ecma-376-5/` for authoritative schema information: `part-1/xsd/*.xsd` (W3C XSD grammars), `part-1/rnc/*.rnc` (RELAX NG Compact equivalents, easier to read), and the four `Ecma Office Open XML Part N.pdf` files (markup-language reference, OPC packaging, markup compatibility, transitional migration features). These live in the corpus repo rather than here to avoid duplicating ~50 MB of PDFs across six sibling projects — they are not runtime dependencies, just the canonical sources for element ordering, attribute types, and cardinality. +- Keep `FEATURES.md` current when adding, modifying, or deleting public API. It is a single-page catalogue of every public feature (43 sections, ~1800 lines) with fork additions marked `[Added in <version>]` (for example `[Added in 2026.05.0]`). For each change: add the new entry (or update/remove the existing one) under the relevant section, refresh its snippet if the API surface shifted, and verify the snippet still runs against a fresh `Document()`. 
+- Always run tests after changes: `pytest tests/ -v` +- The successors tuple in element declarations MUST match XSD ordering +- Footnote IDs 0 and 1 are reserved (separator, continuation separator) +- Use `src/` layout — all code is under `src/docx/`, not `docx/` +- Follow existing code style: no docstring on test methods, BDD-style names +- XML templates go in `src/docx/templates/` diff --git a/FEATURES.md b/FEATURES.md new file mode 100644 index 000000000..02e6b1356 --- /dev/null +++ b/FEATURES.md @@ -0,0 +1,1930 @@ +# Features + +`loadfix/python-docx` is a fork of +[python-docx](https://github.com/python-openxml/python-docx) that extends the +library with footnotes and endnotes, tracked changes, bookmarks, fields, +content controls, charts, equations, SmartArt, watermarks, digital signatures, +accessibility tooling, cross-document operations, and many more OOXML +capabilities that were previously out of reach. + +This document is the full, single-page catalogue of what the library can do +today. Each section covers one feature area, opens with a short overview, +shows a copy-pasteable snippet against a fresh `Document()`, and then lists +the public methods, properties, and classes that make up that surface. +Items marked `[Added in <version>]` (for example `[Added in 2026.05.0]`) are +additions from this fork — every other item is inherited from the upstream base. 
+ +**Table of contents** + +- [Opening and saving documents](#opening-and-saving-documents) +- [Paragraphs](#paragraphs) +- [Runs and text](#runs-and-text) +- [Fonts and character formatting](#fonts-and-character-formatting) +- [Paragraph formatting](#paragraph-formatting) +- [Hyperlinks](#hyperlinks) +- [Tables](#tables) +- [Lists and numbering](#lists-and-numbering) +- [Styles](#styles) +- [Inline images](#inline-images) +- [Floating images and shapes](#floating-images-and-shapes) +- [Charts](#charts) +- [SmartArt](#smartart) +- [Equations](#equations) +- [Sections and page layout](#sections-and-page-layout) +- [Headers and footers](#headers-and-footers) +- [Comments](#comments) +- [Footnotes and endnotes](#footnotes-and-endnotes) +- [Bookmarks](#bookmarks) +- [Fields and cross-references](#fields-and-cross-references) +- [Table of contents](#table-of-contents) +- [Tracked changes](#tracked-changes) +- [Content controls (SDT)](#content-controls-sdt) +- [Form fields](#form-fields) +- [Watermarks](#watermarks) +- [Captions](#captions) +- [Mail merge](#mail-merge) +- [Document properties](#document-properties) +- [Settings](#settings) +- [Themes](#themes) +- [Permissions and protection](#permissions-and-protection) +- [Ink annotations](#ink-annotations) +- [Embedded objects and attachments](#embedded-objects-and-attachments) +- [Font table](#font-table) +- [Web settings](#web-settings) +- [Glossary (building blocks)](#glossary-building-blocks) +- [Digital signatures](#digital-signatures) +- [Accessibility](#accessibility) +- [Document statistics](#document-statistics) +- [Search and replace](#search-and-replace) +- [Cross-document operations](#cross-document-operations) +- [Packaging and I/O options](#packaging-and-io-options) +- [API concepts](#api-concepts) + +--- + +## Opening and saving documents + +The top-level `docx.Document()` factory opens a `.docx`, `.docm`, `.dotx`, or +`.dotm` package, or — when called with no argument — creates a fresh document +from 
the bundled default template. Strict-OOXML packages are transparently +translated to Transitional on open and Flat-OPC (`<pkg:package>`) XML input is +auto-detected. `Document.save()` serialises back to a path or stream, with +optional Flat-OPC or reproducible (byte-identical) output. Documents support +the context-manager protocol and expose a `huge_tree` escape hatch for very +large files plus a `recover=True` mode that tolerates malformed XML. + +```python +from docx import Document + +# open with default template +with Document() as document: + document.add_heading("Hello", level=1) + document.add_paragraph("A paragraph.") + document.save("out.docx") + +# open an existing file (path may be str, pathlib.Path, or file-like) +document = Document("report.docx", huge_tree=False, include_metadata=True) + +# derive a new document from a template +document = Document.from_template("corporate.dotx") + +# reproducible save (byte-identical for the same content) +document.save("out.docx", reproducible=True) + +# Flat-OPC single-XML output +document.save("out.xml", flat_opc=True) +``` + +- `docx.Document(docx=None, recover=False, huge_tree=False, include_metadata=True, password=None)` — Factory returning a `docx.document.Document`. `recover=True`, `huge_tree=True`, `include_metadata=False`, `os.PathLike` paths, `.dotx`/`.dotm` templates, Strict-OOXML, and Flat-OPC inputs are all `[Added in 2026.05.0]`. `password=` decrypts an ECMA-376 Agile-Encryption (password-protected) `.docx` via the optional `python-ooxml-crypto` dependency. `[Added in 2026.05.10]` +- `docx.Document.from_template(template)` — Open a `.dotx`/`.dotm` and return a document whose main-part content-type is switched to the matching non-template variant. `[Added in 2026.05.0]` +- `Document.save(path_or_stream, flat_opc=False, reproducible=False, password=None)` — Write the document. `flat_opc` and `reproducible` are `[Added in 2026.05.0]`. 
`password=` encrypts the output using ECMA-376 Agile Encryption via the optional `python-ooxml-crypto` dependency. `[Added in 2026.05.10]` +- `Document.close()` — Drop transient state (tracked-changes contexts). Safe to call more than once. `[Added in 2026.05.0]` +- `Document.__enter__` / `Document.__exit__` — Context-manager support. `[Added in 2026.05.0]` +- `Document.recovery_warnings` — List of parser warnings collected when `recover=True` was used. `[Added in 2026.05.0]` +- `docx.exceptions.EncryptedDocumentError` — Raised when opening a password-protected `.docx` without a correct password, or when `python-ooxml-crypto` is required but not installed. `[Added in 2026.05.0]` +- `docx.exceptions.RmsProtectedDocumentError` — Subclass of `EncryptedDocumentError`, raised when opening a file wrapped in Azure RMS / AIP / IRM protection (not decryptable with a password). `[Added in 2026.05.10]` +- `docx.exceptions.PythonDocxError` / `InvalidSpanError` / `InvalidXmlError` — Library-specific exceptions. + +--- + +## Paragraphs + +Paragraphs are the most common block in the document body, each cell, and +every header/footer/footnote/endnote/comment story. The fork extends the +classic `Document.add_paragraph()` / `insert_paragraph_before()` API with a +symmetric `insert_paragraph_after()`, `insert_table_before()` / +`insert_table_after()`, `delete()`, page-break helpers, caption insertion, +TOC insertion, field insertion, content-control insertion, bookmarks, +permission ranges, `w:next`-style auto-application, and a stable-ID +fingerprint for tools that track paragraphs across save/load. 
+ +```python +from docx import Document +from docx.enum.text import WD_PARAGRAPH_ALIGNMENT + +document = Document() +p = document.add_paragraph("First paragraph.", style="Normal") +p.alignment = WD_PARAGRAPH_ALIGNMENT.JUSTIFY + +# insert before and after +before = p.insert_paragraph_before("inserted before") +after = p.insert_paragraph_after("inserted after") +after.style = "Intense Quote" + +# delete & replace +before.delete() + +# stable id survives save/reload so long as position + text don't change +print(p.stable_id) + +# bookmark the paragraph +p.add_bookmark("chapter-1") +document.save("out.docx") +``` + +- `Document.add_paragraph(text="", style=None, track_author=None)` — Append a new paragraph. `track_author` wraps the inserted run in `w:ins`. `[Added in 2026.05.0]` for `track_author` and `w:next` auto-style handling. +- `Document.add_heading(text, level=1)` — Shortcut for `add_paragraph` with `"Heading N"` / `"Title"` style. +- `Document.add_page_break()` — Append a paragraph containing only a page break. +- `Document.add_caption(text, label="Figure", style="Caption")` — Append a numbered `SEQ`-field caption paragraph. `[Added in 2026.05.0]` +- `Paragraph.insert_paragraph_before(text=None, style=None)` — Insert immediately before this paragraph. +- `Paragraph.insert_paragraph_after(text=None, style=None)` — Insert immediately after. `[Added in 2026.05.0]` +- `Paragraph.insert_table_before(rows, cols, style=None, width=None)` / `insert_table_after(...)` — Insert a sibling table. `[Added in 2026.05.0]` +- `Paragraph.add_caption_before(text, label="Figure", style="Caption")` / `add_caption_after(...)` — Insert a caption adjacent to this paragraph. `[Added in 2026.05.0]` +- `Paragraph.insert_table_of_contents_before(levels=(1,3))` / `insert_table_of_contents_after(...)` — Insert a TOC paragraph adjacent to this one. 
`[Added in 2026.05.0]` +- `Paragraph.insert_section_break(start_type=WD_SECTION.NEW_PAGE)` / `remove_section_break()` — Add/remove a `w:sectPr` inside this paragraph. `[Added in 2026.05.0]` +- `Paragraph.delete()` — Remove this paragraph from its parent. `[Added in 2026.05.0]` +- `Paragraph.clear()` — Remove all content while keeping paragraph-level formatting. +- `Paragraph.alignment` — `WD_PARAGRAPH_ALIGNMENT` (see [enum](docs/api/enum/WdAlignParagraph.rst)). +- `Paragraph.style` — Read/write paragraph style as a `ParagraphStyle` or name. +- `Paragraph.text` — Read/write plain text (replaces all content on set). +- `Paragraph.paragraph_format` — `ParagraphFormat` proxy — indent, spacing, borders, frame, etc. +- `Paragraph.font` — Paragraph-mark `rPr` proxy. `[Added in 2026.05.0]` +- `Paragraph.runs` / `Paragraph.all_runs` — Direct-child runs vs every visible run (descends into hyperlinks, fields, SDTs, ins/del). `all_runs` is `[Added in 2026.05.0]`. +- `Paragraph.hyperlinks` — List of `Hyperlink` proxies. +- `Paragraph.drawings` — List of `Drawing` children. `[Added in 2026.05.0]` +- `Paragraph.floating_images` — List of `FloatingImage` for each `wp:anchor`. `[Added in 2026.05.0]` +- `Paragraph.fields` / `Paragraph.form_fields` — Fields and legacy form fields in this paragraph. `[Added in 2026.05.0]` +- `Paragraph.content_controls` — Inline SDTs. `[Added in 2026.05.0]` +- `Paragraph.equations` — OMML expressions. `[Added in 2026.05.0]` +- `Paragraph.ink_annotations` / `Paragraph.embedded_objects` — Read-only proxies. `[Added in 2026.05.0]` +- `Paragraph.rendered_page_breaks` / `Paragraph.page_breaks_inside` / `Paragraph.contains_page_break` / `Paragraph.has_page_break` / `Paragraph.clear_page_breaks()` — Page-break introspection and mutation. +- `Paragraph.next_block` / `Paragraph.previous_block` — Walk sibling blocks. `[Added in 2026.05.0]` +- `Paragraph.iter_inner_content()` — Yield runs and hyperlinks in document order. 
+- `Paragraph.rsid` — Word's editing-session revision-save ID. `[Added in 2026.05.0]` +- `Paragraph.stable_id` — 16-char hex fingerprint stable across save/reload. `[Added in 2026.05.0]` +- `Paragraph.has_section_break` — True if paragraph carries a `w:sectPr`. `[Added in 2026.05.0]` +- `Paragraph.element` — Public alias for the underlying `w:p` element. `[Added in 2026.05.0]` + +--- + +## Runs and text + +A run (`<w:r>`) is the smallest styled unit of text. The fork adds +`Run.split()` for mid-run edits, `Run.delete()`, `Run.make_hyperlink()`, +`Run.add_symbol()` and `.symbols`, ruby-annotation access, stable IDs, and a +`copy_formatting_from()` helper. + +```python +from docx import Document + +document = Document() +p = document.add_paragraph() +r = p.add_run("Hello world") +r.bold = True +r.italic = True +r.font.size = 177800 # EMU, i.e. 14 pt (or use Pt(14)) + +# split at char offset 5 → two runs: "Hello" and " world" +left, right = r.split(5) +right.italic = False + +# insert a symbol (Unicode char code) in a specific font +left.add_symbol(0x2603, font="Segoe UI Symbol") # snowman + +# delete a run entirely +right.delete() + +document.save("out.docx") +``` + +- `Paragraph.add_run(text=None, style=None, track_author=None)` — Append a run. `track_author` is `[Added in 2026.05.0]`. +- `Paragraph.add_text(text)` — Append `text` onto the last run instead of creating a new one. `[Added in 2026.05.0]` +- `Run.text` — Read/write. `\t` / `\n` / `\r` map to `w:tab` / `w:br`. +- `Run.bold` / `Run.italic` / `Run.underline` — Tri-state (True/False/None for "inherit"). +- `Run.style` — Character style. +- `Run.clear()` — Remove all child text/runs. +- `Run.add_tab()` — Insert a tab. +- `Run.add_break(break_type=WD_BREAK.LINE)` — Line / page / column / wrap break. +- `Run.add_picture(path_or_stream, width=None, height=None, link=False, save_with_document=True, url=None)` — Inline picture in this run. `link`, `save_with_document`, `url` and `os.PathLike` support are `[Added in 2026.05.0]`. 
+- `Run.add_text_box(width=None, height=None, text=None)` — Append a DrawingML text box. `[Added in 2026.05.0]` +- `Run.add_ole_object(ole_path, prog_id, icon_path=None)` — Embed an OLE payload. `[Added in 2026.05.0]` +- `Run.add_symbol(char_code, font)` — Insert a `w:sym`. `[Added in 2026.05.0]` +- `Run.symbols` — Iterator of `Symbol` proxies. `[Added in 2026.05.0]` +- `Run.text_with_symbols` — Text including symbol glyphs. `[Added in 2026.05.0]` +- `Run.equations` / `Run.ruby_annotations` — Inline OMML / ruby. `[Added in 2026.05.0]` +- `Run.split(offset)` — Split into two runs at `offset`, preserving formatting. `[Added in 2026.05.0]` +- `Run.delete()` — Remove this run. `[Added in 2026.05.0]` +- `Run.make_hyperlink(url=None, anchor=None)` — Wrap this run in a hyperlink. `[Added in 2026.05.0]` +- `Run.mark_comment_range(last_run, comment_id)` — Place `commentRangeStart`/`commentRangeEnd` markers. +- `Run.copy_formatting_from(source)` — Copy `rPr` from another run. `[Added in 2026.05.0]` +- `Run.contains_page_break` / `Run.iter_inner_content()` — Inline content iteration. +- `Run.formatting_change` — `FormattingChange` for `w:rPrChange`. `[Added in 2026.05.0]` +- `Run.rsid` / `Run.stable_id` — Editing-session ID and stable fingerprint. `[Added in 2026.05.0]` +- `docx.ruby.RubyAnnotation` — Base text, ruby text, alignment, language. `[Added in 2026.05.0]` +- `docx.text.symbol.Symbol` — Font and character-code reader. `[Added in 2026.05.0]` + +--- + +## Fonts and character formatting + +`Font` (via `Run.font` or `Paragraph.font`) exposes every `w:rPr` child +covered by OOXML, including the fork additions for run borders, run-level +background shading, East-Asian layout, explicit language tags, character +scale, ligatures, and kerning. 
+ +```python +from docx import Document +from docx.shared import Pt, RGBColor +from docx.enum.text import WD_COLOR_INDEX, WD_UNDERLINE, WD_BORDER_STYLE + +document = Document() +run = document.add_paragraph().add_run("Styled text") + +font = run.font +font.name = "Calibri" +font.size = Pt(12) +font.color.rgb = RGBColor(0x2E, 0x74, 0xB5) +font.underline = WD_UNDERLINE.SINGLE +font.highlight_color = WD_COLOR_INDEX.YELLOW +font.small_caps = True + +# fork-era extras +font.shading_color = RGBColor(0xFF, 0xFF, 0xCC) +font.border_style = WD_BORDER_STYLE.SINGLE +font.border_color = RGBColor(0x00, 0x00, 0x00) +font.border_width = Pt(0.5) +font.language = "en-US" +font.character_scale = 90 # 90 % +font.kerning = Pt(10) + +document.save("out.docx") +``` + +- `Font.name` / `Font.size` / `Font.color` / `Font.highlight_color` — Core identity and size. +- `Font.bold` / `italic` / `underline` / `strike` / `double_strike` / `superscript` / `subscript` / `all_caps` / `small_caps` / `shadow` / `outline` / `emboss` / `imprint` / `hidden` / `web_hidden` / `math` / `snap_to_grid` / `no_proof` / `spec_vanish` — Tri-state boolean toggles. +- `Font.character_spacing` / `Font.kerning` / `Font.character_scale` — Letter-spacing controls (`character_scale` and `ligatures` are `[Added in 2026.05.0]`). +- `Font.ligatures` — `"all"`, `"standardContextual"`, etc. `[Added in 2026.05.0]` +- `Font.cs_size` / `Font.complex_script` / `Font.cs_bold` / `Font.cs_italic` — Complex-script properties. +- `Font.shading_color` — Run-level background color. `[Added in 2026.05.0]` +- `Font.border_style` / `Font.border_color` / `Font.border_width` / `Font.border_space` / `Font.remove_border()` — Run borders. `[Added in 2026.05.0]` +- `Font.name_cs` / `Font.name_east_asia` / `Font.name_far_east` — Script-specific typeface overrides. +- `Font.language` / `Font.east_asian_language` / `Font.bidi_language` / `Font.remove_language()` — Per-run language tags. 
`[Added in 2026.05.0]` +- `Font.rtl` / `Font.right_to_left` — Right-to-left flag. `[Added in 2026.05.0]` +- `Font.east_asian_layout` / `Font.set_east_asian_layout(...)` / `Font.remove_east_asian_layout()` — Two-lines-in-one, kinsoku, etc. `[Added in 2026.05.0]` +- `Font.copy_to(target)` — Copy every `rPr` property onto another `Font`. `[Added in 2026.05.0]` +- `docx.text.font.EastAsianLayout` — Proxy for `w:eastAsianLayout`. `[Added in 2026.05.0]` +- `docx.dml.color.ColorFormat` — RGB / theme-color / tint / shade. +- Enums: `WD_COLOR_INDEX`, `WD_UNDERLINE`, `WD_BORDER_STYLE`. + +--- + +## Paragraph formatting + +`Paragraph.paragraph_format` exposes `ParagraphFormat`, the Word +"Paragraph…" dialog mapped to OOXML. The fork adds paragraph borders, +text-frame controls, contextual spacing, outline level, RTL, kinsoku / +word-wrap, first-line-chars, and auto-space-DE/DN. + +```python +from docx import Document +from docx.shared import Pt, Inches +from docx.enum.text import WD_BORDER_STYLE, WD_LINE_SPACING + +document = Document() +p = document.add_paragraph("A well-formatted paragraph.") +pf = p.paragraph_format + +pf.line_spacing_rule = WD_LINE_SPACING.MULTIPLE +pf.line_spacing = 1.15 +pf.space_after = Pt(8) +pf.first_line_indent = Inches(0.5) +pf.keep_with_next = True + +# fork-era additions +pf.borders.top.style = WD_BORDER_STYLE.SINGLE +pf.borders.top.width = Pt(0.5) +pf.contextual_spacing = True +pf.right_to_left = False + +document.save("out.docx") +``` + +- `ParagraphFormat.alignment` — `WD_PARAGRAPH_ALIGNMENT`. +- `ParagraphFormat.first_line_indent` / `left_indent` / `right_indent` — Lengths. +- `ParagraphFormat.line_spacing` / `line_spacing_rule` — Spacing controls. +- `ParagraphFormat.space_before` / `space_after` — Paragraph spacing. +- `ParagraphFormat.keep_together` / `keep_with_next` / `widow_control` / `page_break_before` — Pagination toggles. 
+- `ParagraphFormat.contextual_spacing` — `[Added in 2026.05.0]` +- `ParagraphFormat.outline_level` — `WD_OUTLINELVL` or int 0–9. `[Added in 2026.05.0]` +- `ParagraphFormat.right_to_left` / `kinsoku` / `word_wrap` / `auto_space_de` / `auto_space_dn` / `first_line_chars` — Bidi and East-Asian typography. `[Added in 2026.05.0]` +- `ParagraphFormat.tab_stops` — `TabStops` collection. +- `ParagraphFormat.borders` — `ParagraphBorders` (top/bottom/left/right/between/bar). `[Added in 2026.05.0]` +- `ParagraphFormat.frame` / `ParagraphFormat.set_frame(...)` / `ParagraphFormat.remove_frame()` — Text frames. `[Added in 2026.05.0]` + +--- + +## Hyperlinks + +Hyperlinks can be created from scratch, read off existing paragraphs, or +wrapped around an existing run slice. Both external URLs and internal +anchors (bookmark names) are supported, and URL fragments (`#section`) are +exposed as a first-class attribute. + +```python +from docx import Document + +document = Document() +p = document.add_paragraph("Visit ") +link = p.add_hyperlink(url="https://example.com/#intro", text="our site") +p.add_run(".") + +# internal anchor +document.add_paragraph().add_hyperlink(anchor="chapter-1", text="Chapter 1") + +# wrap part of an existing run as a hyperlink +p2 = document.add_paragraph() +r = p2.add_run("click here to read more") +p2.insert_hyperlink_at(r, url="https://docs.example", start=0, end=10) + +document.save("out.docx") +``` + +- `Paragraph.add_hyperlink(url=None, text=None, style="Hyperlink", anchor=None)` — Append a new hyperlink. `[Added in 2026.05.0]` +- `Paragraph.insert_hyperlink_at(run, url=None, anchor=None, start=None, end=None)` — Wrap (part of) an existing run in a hyperlink, splitting as needed. `[Added in 2026.05.0]` +- `Run.make_hyperlink(url=None, anchor=None)` — Wrap a run as a hyperlink. `[Added in 2026.05.0]` +- `Paragraph.hyperlinks` — List of `Hyperlink` in document order. 
+- `Hyperlink.url` / `Hyperlink.address` / `Hyperlink.fragment` — URL parts; `address`/`fragment` are editable. +- `Hyperlink.runs` / `Hyperlink.text` / `Hyperlink.contains_page_break` / `Hyperlink.add_run(...)` — Content access and extension. + +--- + +## Tables + +Tables are first-class blocks. The fork extends them with per-cell and +whole-table borders, cell margins, text direction, merged-cell reads, row +height setters, header-row repeat, table-style flags (banded rows/columns), +autofit behavior, alt text, copy (including cross-document), split at a row, +CRUD operations, and stable IDs. + +```python +from docx import Document +from docx.shared import Inches, Pt, RGBColor +from docx.enum.table import WD_TABLE_ALIGNMENT, WD_ROW_HEIGHT_RULE +from docx.enum.text import WD_BORDER_STYLE + +document = Document() +tbl = document.add_table(rows=2, cols=3, style="Table Grid") +tbl.alignment = WD_TABLE_ALIGNMENT.CENTER +tbl.alt_text = "Quarterly results" + +tbl.cell(0, 0).text = "Region" +tbl.cell(0, 1).text = "Q1" +tbl.cell(0, 2).text = "Q2" + +# merge two cells +tbl.cell(1, 0).merge(tbl.cell(1, 1)) + +# borders + cell shading +tbl.borders.top.style = WD_BORDER_STYLE.SINGLE +tbl.borders.top.color = RGBColor(0x44, 0x44, 0x44) +tbl.cell(0, 0).shading.fill_color = RGBColor(0xEE, 0xEE, 0xEE) + +# row height +tbl.rows[0].height = Inches(0.4) +tbl.rows[0].height_rule = WD_ROW_HEIGHT_RULE.AT_LEAST +tbl.rows[0].is_header = True + +document.save("out.docx") +``` + +- `Document.add_table(rows, cols, style=None)` — Append a new table. +- `Document.add_table_copy(other_table)` / `Document.add_table_from(...)` — Deep-copy a table (possibly from another document), rewiring images and importing styles. `[Added in 2026.05.0]` +- `Table.alignment` / `Table.direction` / `Table.table_direction` — Placement controls. +- `Table.autofit` / `Table.autofit_behavior` / `Table.allow_autofit` / `Table.preferred_width` — Layout. 
`autofit_behavior`, `allow_autofit`, `preferred_width` are `[Added in 2026.05.0]`. +- `Table.indent` / `Table.left_indent` — Table indentation. `indent` is `[Added in 2026.05.0]`. +- `Table.style` / `Table.style_flags` — Style application and banding flags (`first_row`, `last_row`, `first_column`, `last_column`, `no_horizontal_banding`, `no_vertical_banding`). `style_flags` is `[Added in 2026.05.0]`. +- `Table.borders` / `Table.set_borders(...)` — `TableBorders` proxy. `[Added in 2026.05.0]` +- `Table.cell_margins` — Per-table cell-margin defaults. `[Added in 2026.05.0]` +- `Table.alt_text` / `Table.alt_description` — Accessibility fields. `[Added in 2026.05.0]` +- `Table.cell(row, col)` / `Table.row_cells(i)` / `Table.column_cells(i)` / `Table.rows` / `Table.columns` / `Table.cells` — Access. +- `Table.add_row(source_row=None)` / `Table.insert_row(index)` / `Table.add_column(width)` / `Table.delete_column(index)` — CRUD. `insert_row`, `delete_column` are `[Added in 2026.05.0]`. +- `Table.split(before_row)` — Split into two tables at a boundary. `[Added in 2026.05.0]` +- `Table.delete()` — Remove from document. `[Added in 2026.05.0]` +- `Table.merged_cell_ranges` — Tuples of `(top_row, top_col, bottom_row, bottom_col)`. `[Added in 2026.05.0]` +- `Table.spans_page_break` — `True` if the rendered table crosses a page break. +- `Table.stable_id` / `Table.formatting_change` — Stable fingerprint and tracked-formatting proxy. `[Added in 2026.05.0]` +- `Table.next_block` / `Table.previous_block` — Block-level navigation. `[Added in 2026.05.0]` +- `_Cell.add_paragraph(...)` / `_Cell.add_table(...)` / `_Cell.add_picture(...)` — Nested content. +- `_Cell.merge(other)` / `_Cell.split()` / `_Cell.is_merge_origin` / `_Cell.merge_origin` / `_Cell.grid_span` — Merge handling. Merge-origin APIs are `[Added in 2026.05.0]`. +- `_Cell.borders` / `_Cell.margins` / `_Cell.set_margins(...)` / `_Cell.remove_margins()` — Cell-level borders and margins. 
`[Added in 2026.05.0]` +- `_Cell.shading.fill_color` / `_Cell.shading.pattern` — Background. `[Added in 2026.05.0]` +- `_Cell.text_direction` / `_Cell.vertical_alignment` / `_Cell.width` / `_Cell.text` — Cell properties (`text_direction` is `[Added in 2026.05.0]`). +- `_Cell.is_tracked_insertion` / `_Cell.is_tracked_deletion` / `_Cell.formatting_change` / `_Cell.stable_id` — Track-changes and stable-id hooks. `[Added in 2026.05.0]` +- `_Row.height` / `_Row.height_rule` / `_Row.is_header` / `_Row.allow_break_across_pages` — Row-level properties. `height` setter, `is_header`, `allow_break_across_pages` are `[Added in 2026.05.0]`. +- `TableBorders` — `top` / `bottom` / `left` / `right` / `inside_h` / `inside_v` → `BorderElement.style` / `.width` / `.color` / `.space`. `[Added in 2026.05.0]` +- `CellBorders`, `CellShading`, `CellMargins`, `TableCellMargins`, `TableStyleFlags` — Helper proxies. `[Added in 2026.05.0]` + +--- + +## Lists and numbering + +`Document.numbering` exposes a read/write wrapper around `numbering.xml` so +that you can create new abstract numbering definitions, allocate instances, +apply them to paragraphs, restart numbering on demand, and ask Word-style +"what label would this paragraph show?" for each paragraph. + +```python +from docx import Document +from docx.enum.text import WD_NUMBER_FORMAT + +document = Document() +numbering = document.numbering +definition = numbering.add_numbering_definition( + levels=[ + {"format": WD_NUMBER_FORMAT.DECIMAL, "text": "%1.", "start": 1}, + {"format": WD_NUMBER_FORMAT.LOWER_LETTER, "text": "%2)", "start": 1}, + ] +) + +p1 = document.add_paragraph("First") +p2 = document.add_paragraph("Second") +p3 = document.add_paragraph("A sub-item") +definition.apply_to(p1, level=0) +definition.apply_to(p2, level=0) +definition.apply_to(p3, level=1) +p2.restart_numbering(start=5) + +# rendered label text (e.g. 
"1.", "5.", "a)") +print(p2.list_label) +print(document.list_labels()) + +document.save("out.docx") +``` + +- `Document.numbering` — `Numbering` proxy. Creates `numbering.xml` on demand. `[Added in 2026.05.0]` +- `Numbering.add_numbering_definition(levels=...)` — Add an abstract definition. `[Added in 2026.05.0]` +- `Numbering.definitions` / iteration — Existing definitions. +- `NumberingDefinition.apply_to(paragraph, level=0)` — Apply a numbering to a paragraph. `[Added in 2026.05.0]` +- `NumberingDefinition.new_instance()` / `NumberingDefinition.levels` / `NumberingDefinition.level(ilvl)` — Instance and level access. +- `Level.number_format` / `Level.text` / `Level.start` / `Level.indent` / `Level.ilvl` — Per-level properties. +- `Paragraph.list_level` / `Paragraph.list_format` / `Paragraph.numbering_format` / `Paragraph.list_label` — Read paragraph's current list settings. `[Added in 2026.05.0]` +- `Paragraph.restart_numbering(level=None, start=1)` — Restart the counter. `[Added in 2026.05.0]` +- `Document.list_labels()` — `{id(p): label}` for every numbered paragraph in the body (one pass). `[Added in 2026.05.0]` +- `Document.add_list_of_figures(caption_label="Figure")` / `Document.add_list_of_tables(caption_label="Table")` — Append `TOC \c` fields. `[Added in 2026.05.0]` +- `docx.numbering.ListLabelRenderer` — Low-level label renderer used by the properties above. `[Added in 2026.05.0]` +- Enum: `WD_NUMBER_FORMAT` (decimal, roman, letter, bullet, etc.). + +--- + +## Styles + +`Document.styles` is a `Styles` collection covering paragraph, character, +table, and numbering styles. The fork adds style import across documents, +`link_style` / `next_style` / `is_redefined`, a document-default font +accessor, a `Style.delete()`, and direct access to the XML style element. 
+ +```python +from docx import Document +from docx.enum.style import WD_STYLE_TYPE + +document = Document() +styles = document.styles + +# create a new paragraph style +my_style = styles.add_style("Summary", WD_STYLE_TYPE.PARAGRAPH) +my_style.base_style = styles["Normal"] +my_style.font.bold = True + +# apply it +document.add_paragraph("Summary text", style="Summary") + +# query +for style in styles: + print(style.name, style.type) + +# document-wide default font +styles.document_default_font.name = "Calibri" + +document.save("out.docx") +``` + +- `Document.styles` — `Styles` collection. +- `Styles.add_style(name, style_type, builtin=False)` — Add a new style. +- `Styles.default(style_type)` — The document's default for a given type. +- `Styles.get_by_id(style_id, style_type)` / `Styles.get_style_id(style_or_name, style_type)` — Id lookups. +- `Styles.document_default_font` — `Font` proxy for `docDefaults/rPrDefault`. `[Added in 2026.05.0]` +- `Styles.import_from(other_doc, names)` / `Styles.import_style(style)` / `Styles.import_builtin(name)` — Cross-document import. `[Added in 2026.05.0]` +- `Styles.latent_styles` — `LatentStyles` collection. +- `BaseStyle.name` / `.style_id` / `.type` / `.builtin` / `.priority` / `.hidden` / `.locked` / `.quick_style` / `.unhide_when_used` — Common metadata. +- `BaseStyle.delete()` — Remove the style. +- `BaseStyle.link_style` / `BaseStyle.next_style` / `BaseStyle.is_redefined` — Style-mapping properties. `[Added in 2026.05.0]` +- `CharacterStyle.base_style` / `CharacterStyle.font` — Character-style specifics. +- `ParagraphStyle.paragraph_format` / `ParagraphStyle.next_paragraph_style` — Paragraph-style specifics. +- `LatentStyles` / `_LatentStyle` — Latent style collection. +- `docx.styles.BabelFish` — UI ↔ internal style-name translation. +- Enum: `WD_STYLE_TYPE`, `WD_BUILTIN_STYLE`. + +--- + +## Inline images + +`Document.add_picture()` / `Run.add_picture()` append an inline image. 
All +common formats are supported, including the fork additions of **SVG**, +**WebP**, **EMF**, **WMF**, and **EPS**. Linked (external) pictures, image +replacement, outline/border, crop, opacity, shadow, aspect-ratio lock, alt +text, and delete are all first-class. + +```python +from docx import Document +from docx.shared import Inches, Pt, RGBColor + +document = Document() +shape = document.add_picture("logo.png", width=Inches(2)) +shape.alt_text = "Company logo" +shape.title = "Logo" + +shape.outline.color = RGBColor(0, 0, 0) +shape.outline.width = Pt(1) +shape.outline.transparency = 0.25 + +shape.crop.set(left=0.05, right=0.05) # fractions (0..1) +shape.opacity = 0.9 +shape.effects.shadow.apply(blur_radius=Pt(4), distance=Pt(2)) +shape.lock_aspect_ratio = True + +# replace the image bytes, keep the drawing +shape.replace_image("new-logo.png") + +document.save("out.docx") +``` + +- `Document.add_picture(path_or_stream, width=None, height=None, link=False, save_with_document=True, url=None)` — Append a new paragraph with an inline picture. `[Added in 2026.05.0]` for `link`, `save_with_document`, `url`, and `os.PathLike`. +- `Run.add_picture(...)` — Inline picture in an existing run. +- `Document.inline_shapes` — `InlineShapes` collection for iteration/indexing. +- `InlineShape.width` / `InlineShape.height` / `InlineShape.type` / `InlineShape.image` — Core picture data. +- `InlineShape.alt_text` / `InlineShape.title` — Accessibility metadata. `[Added in 2026.05.0]` +- `InlineShape.opacity` / `InlineShape.lock_aspect_ratio` — Visual controls. `[Added in 2026.05.0]` +- `InlineShape.outline` — `PictureOutline` (style, color, width, transparency). `[Added in 2026.05.0]` +- `InlineShape.crop` — `PictureCrop` (left/top/right/bottom, `set(...)`). `[Added in 2026.05.0]` +- `InlineShape.effects.shadow` — `ShadowFormat` (blur, distance, angle, color, `apply(...)`, `clear()`). 
`[Added in 2026.05.0]` +- `InlineShape.delete(part=None)` — Remove the drawing and prune orphan image parts. `[Added in 2026.05.0]` +- `InlineShape.replace_image(path_or_stream)` — Swap the blob, keeping the drawing. `[Added in 2026.05.0]` +- `docx.drawing.Picture` — Generic picture proxy for canvas/group contexts. +- Supported formats: PNG, JPEG, GIF, BMP, TIFF, **SVG** (via `docx.image.svg`), **WebP**, **EMF**, **WMF**, and **EPS**. SVG/WebP/EMF/WMF/EPS are `[Added in 2026.05.0]`. + +--- + +## Floating images and shapes + +Floating images are anchored (`<wp:anchor>`) rather than inline, with +horizontal/vertical anchor frames, offsets, and wrap style. The fork also +adds DrawingML preset shapes, text boxes, canvases, and read-only access to +group shapes. + +```python +from docx import Document +from docx.shared import Inches +from docx.enum.shape import WD_SHAPE, WD_ANCHOR_H, WD_ANCHOR_V, WD_WRAP_TYPE + +document = Document() +p = document.add_paragraph() + +# floating image at a specific page offset +img = p.add_floating_shape( + "banner.png", + x=Inches(1), y=Inches(2), + width=Inches(4), height=Inches(2), + h_anchor=WD_ANCHOR_H.PAGE, v_anchor=WD_ANCHOR_V.PAGE, + wrap=WD_WRAP_TYPE.SQUARE, +) + +# inline preset shape +shape = document.add_shape( + WD_SHAPE.ROUNDED_RECTANGLE, + width=Inches(3), height=Inches(1), + text="A rounded rectangle", +) + +# text box +tb = document.add_text_box(width=Inches(3), height=Inches(1), text="Note") + +# empty canvas; add sub-shapes with canvas.add_shape(...) +canvas = document.add_canvas(width=Inches(5), height=Inches(3)) + +document.save("out.docx") +``` + +- `Paragraph.add_floating_image(path, width=None, height=None, position=None)` — Add `wp:anchor` image. `[Added in 2026.05.0]` +- `Paragraph.add_floating_shape(path, x=0, y=0, width=None, height=None, h_anchor=..., v_anchor=..., wrap=...)` — Coordinate-first helper.
`[Added in 2026.05.0]` +- `Paragraph.add_shape(shape_type, width=None, height=None, text=None)` / `Document.add_shape(...)` — Append a DrawingML preset shape. `[Added in 2026.05.0]` +- `Document.add_text_box(...)` / `Run.add_text_box(...)` — Append a text-box shape. `[Added in 2026.05.0]` +- `Document.add_canvas(width=None, height=None)` — Append a canvas (`wpc:wpc`). `[Added in 2026.05.0]` +- `FloatingImage.width` / `.height` / `.horizontal_anchor` / `.vertical_anchor` / `.horizontal_offset` / `.vertical_offset` / `.offset` / `.position` / `.wrap_type` / `.type` / `.opacity` / `.lock_aspect_ratio` / `.alt_text` / `.title` / `.outline` / `.crop` / `.effects` / `.delete(part=None)` — Floating picture surface. `[Added in 2026.05.0]` +- `docx.drawing.Drawing` — Base proxy for `<w:drawing>`; exposes `.picture`, `.shape`, `.smart_art`, `.chart`, etc. +- `docx.drawing.WordprocessingShape` — DrawingML shape with `add_paragraph()`, `text`, `paragraphs`. `[Added in 2026.05.0]` +- `docx.drawing.GroupShape` — Read-only group (`wpg:grpSp`) iteration. `[Added in 2026.05.0]` +- `docx.drawing.Canvas` — Canvas proxy with `add_shape(...)`. `[Added in 2026.05.0]` +- Enums: `WD_SHAPE`, `WD_ANCHOR_H`, `WD_ANCHOR_V`, `WD_WRAP_TYPE`, `WD_DRAWING_TYPE`. + +--- + +## Charts + +`Document.add_chart()` creates a `.chartPart` with numeric data for the +supported chart types (bar, stacked bar, column, stacked column, line, pie); +`Document.charts` reads existing charts of any type; `Chart.replace_data()` +rewrites the category and series values in place. `[Added in 2026.05.0]`.
+ +```python +from docx import Document +from docx.shared import Inches +from docx.chart import WD_CHART_TYPE + +document = Document() +chart = document.add_chart( + WD_CHART_TYPE.COLUMN, + categories=["Q1", "Q2", "Q3", "Q4"], + series_data={ + "North": [10, 12, 14, 11], + "South": [8, 13, 9, 15], + }, + width=Inches(5), height=Inches(3), +) +print(chart.chart_type) +for s in chart.series: + print(s.name, s.values) + +# replace values later +chart.replace_data(categories=["Q1", "Q2", "Q3", "Q4"], + series_data={"Total": [18, 25, 23, 26]}) + +document.save("out.docx") +``` + +- `Document.add_chart(chart_type, categories, series_data, width=None, height=None)` — Append a new chart. `[Added in 2026.05.0]` +- `Document.charts` — List of `Chart` proxies in document order. `[Added in 2026.05.0]` +- `Chart.chart_type` / `Chart.title` / `Chart.has_legend` / `Chart.series` / `Chart.categories` — Reads. `[Added in 2026.05.0]` +- `Chart.replace_data(categories, series_data)` — Rewrite all data in place. `[Added in 2026.05.0]` +- `ChartSeries.name` / `.values` / `.categories` — Per-series reads. `[Added in 2026.05.0]` +- Enum: `docx.chart.WD_CHART_TYPE` (`BAR`, `BAR_STACKED`, `COLUMN`, `COLUMN_STACKED`, `LINE`, `PIE`). + +--- + +## SmartArt + +SmartArt is readable *and* authorable. `Document.smart_art` walks every +`<w:drawing>` that references a `dgm:relIds` diagram and returns a +`SmartArt` proxy carrying `.nodes`, `.text`, and the underlying +diagram-data partname. `Document.add_smart_art(layout_name)` appends a +new SmartArt at the end of the document body, backed by four freshly +minted companion parts (`data`, `layout`, `colors`, `quickStyle`) under +`word/diagrams/`. The returned `SmartArt` is populated one content node +at a time via `SmartArt.add_node(text)`.
Supported layout families are +`"list"`, `"cycle"` and `"process"` — each selects a Word built-in +layout algorithm keyed by its canonical `loTypeId` URN; Word's own +layout engine handles rendering, so the embedded `layout1.xml` copy +exists to satisfy package requirements rather than to drive geometry. +`[Added in 2026.05.0]` (read-side); `add_smart_art` / `add_node` +`[Added in 2026.05.8]`. + +```python +from docx import Document + +# -- authoring -- +document = Document() +diagram = document.add_smart_art("process") +diagram.add_node("Plan") +diagram.add_node("Build") +diagram.add_node("Ship") +document.save("with-smartart.docx") + +# -- reading -- +document = Document("with-smartart.docx") +for diagram in document.smart_art: + print(diagram.data_partname) + print(diagram.text) + for node in diagram.nodes: + print(" " * node.level, node.text) +``` + +- `Document.add_smart_art(layout_name, width=None, height=None)` — Return a new + empty `SmartArt`. `layout_name` is one of `"list"`, `"cycle"`, `"process"` + (case-insensitive). `[Added in 2026.05.8]` +- `SmartArt.add_node(text)` — Append a top-level content node and return + its `SmartArtNode`. `[Added in 2026.05.8]` +- `Document.smart_art` — List of `SmartArt`. `[Added in 2026.05.0]` +- `SmartArt.data_partname` / `SmartArt.dm_rId` / `SmartArt.nodes` / `SmartArt.text`. `[Added in 2026.05.0]` +- `SmartArtNode.text` / `.level` / `.model_id` / `.children`. `[Added in 2026.05.0]` + +--- + +## Equations + +OMML (Office Math) equations are both readable and writable via a minimal +builder API. You can drop a literal OMML string onto a paragraph, or +assemble common structures (fractions, superscripts, radicals) from small +factory functions. `[Added in 2026.05.0]`.
+ +```python +from docx import Document +from docx.equations import build_fraction, build_radical + +document = Document() +p = document.add_paragraph("Pythagoras: ") + +omml = ( + '<m:oMath xmlns:m="http://schemas.openxmlformats.org/officeDocument/2006/math">' + + build_radical( + build_fraction(numerator_text="a^2 + b^2", denominator_text="1"), + ) + + "</m:oMath>" +) +p.add_equation(omml) + +for eq in document.equations: + print(eq.text, eq.is_display_mode) + +document.save("out.docx") +``` + +- `Paragraph.add_equation(omml_xml, display_mode=False)` — Append an OMML expression. `[Added in 2026.05.0]` +- `Document.equations` / `Paragraph.equations` / `Run.equations` — Read iterators. `[Added in 2026.05.0]` +- `Equation.text` / `.raw_xml` / `.xml_element` / `.is_display_mode` / `.set_text(...)` / `.replace_identifier(old, new)` / `.swap_children(a, b)` / `Equation.from_omml_xml(...)`. `[Added in 2026.05.0]` +- Builders: `build_identifier`, `build_fraction`, `build_superscript`, `build_subscript`, `build_radical`. `[Added in 2026.05.0]` + +--- + +## Sections and page layout + +Every document has at least one section. `Document.sections` is a sequence +with indexing, iteration, and `pop()`; each `Section` carries page size and +orientation, margins, gutter, header/footer distances, columns, page +borders, line numbering, paper source, document grid, text direction, +even/odd headers, first-page headers/footers, and (fork-era) footnote / +endnote overrides, watermarks, and section-copy helpers.
+ +```python +from docx import Document +from docx.enum.section import WD_ORIENTATION, WD_SECTION_START +from docx.shared import Inches, Pt + +document = Document() +section = document.sections[0] +section.page_height = Inches(11) +section.page_width = Inches(8.5) +section.orientation = WD_ORIENTATION.PORTRAIT +section.left_margin = Inches(1) +section.right_margin = Inches(1) + +# two-column layout with a divider +section.set_columns(count=2, equal_width=True, space=Inches(0.5)) + +# page border +section.set_page_border("top", style="single", width=Pt(1)) + +# line numbering from 1, every 5 lines +section.set_line_numbering(count_by=5, start=1, distance=Inches(0.2)) + +# append a new section that breaks to a new page +document.add_section(WD_SECTION_START.NEW_PAGE) + +document.save("out.docx") +``` + +- `Document.sections` — `Sections` sequence. `pop(index=-1)` is `[Added in 2026.05.0]`. +- `Document.add_section(start_type=WD_SECTION.NEW_PAGE)` — Append a new section. +- `Section.start_type` / `Section.orientation` / `Section.page_height` / `Section.page_width` / `Section.left_margin` / `Section.right_margin` / `Section.top_margin` / `Section.bottom_margin` / `Section.header_distance` / `Section.footer_distance` / `Section.gutter` — Page metrics. +- `Section.vertical_alignment` — Vertical alignment of text on the page (`WD_VERTICAL_ALIGNMENT.TOP` / `.CENTER` / `.BOTH` / `.BOTTOM`); maps to `w:sectPr/w:vAlign` (ECMA-376 17.6.22). `[Added in 2026.05.6]` +- `Section.columns` / `Section.set_columns(count, equal_width=True, space=None, widths=None)` — Multi-column layout. `set_columns` is `[Added in 2026.05.0]`. +- `Section.page_borders` / `Section.set_page_border(side, ...)` / `Section.remove_page_borders()` — Page-level borders. 
`[Added in 2026.05.0]` +- `Section.line_numbering` / `Section.set_line_numbering(...)` / `Section.remove_line_numbering()` — `[Added in 2026.05.0]` +- `Section.first_page_paper_source` / `Section.other_pages_paper_source` — Paper-source bin ids. `[Added in 2026.05.0]` +- `Section.document_grid` / `Section.set_document_grid(...)` / `Section.remove_document_grid()` — East-Asian grid controls. `[Added in 2026.05.0]` +- `Section.text_direction` / `Section.right_to_left` — `[Added in 2026.05.0]` +- `Section.different_first_page_header_footer` / `Section.different_odd_and_even_pages_header_footer` — Toggle variant headers/footers. +- `Section.first_page_header` / `Section.first_page_footer` / `Section.even_page_header` / `Section.even_page_footer` / `Section.header` / `Section.footer` — Header/footer access. +- `Section.footnote_properties` / `Section.add_footnote_properties()` / `Section.remove_footnote_properties()` / `Section.endnote_properties` / `Section.add_endnote_properties()` / `Section.remove_endnote_properties()` — Section-level overrides. `[Added in 2026.05.0]` +- `Section.add_text_watermark(text, ...)` / `Section.add_image_watermark(image, ...)` / `Section.remove_watermark()` / `Section.watermark` — Watermark per section. `[Added in 2026.05.0]` +- `Section.copy_header_from(other)` / `Section.copy_footer_from(other)` — Cross-section header/footer copy. `[Added in 2026.05.0]` +- `Section.delete()` — Remove this section break. `[Added in 2026.05.0]` +- `Section.iter_inner_content()` / `Section.paragraphs` / `Section.tables` — Content iteration. +- `Section.formatting_change` — `FormattingChange` for `w:sectPrChange`. `[Added in 2026.05.0]` +- `SectionColumns` / `Column` — Column collection; `count`, `equal_width`, `space`, per-column `width` / `space`. `[Added in 2026.05.0]` +- `PageBorders`, `PageBorder`, `LineNumbering`, `DocumentGrid` — Helper proxies. 
`[Added in 2026.05.0]` +- Enums: `WD_SECTION`, `WD_SECTION_START`, `WD_ORIENTATION`, `WD_VERTICAL_ALIGNMENT`, `WD_BORDER_DISPLAY`, `WD_BORDER_OFFSET_FROM`, `WD_LINE_NUMBERING_RESTART`, `WD_DOC_GRID_TYPE`, `WD_HEADER_FOOTER_INDEX`. + +--- + +## Headers and footers + +Headers and footers live on sections and inherit from the previous section +by default (`is_linked_to_previous`). The primary flavour is always +available; even-page and first-page variants require toggling the +corresponding section property first. + +```python +from docx import Document + +document = Document() +section = document.sections[0] +section.different_first_page_header_footer = True + +header = section.header +header.paragraphs[0].text = "Regular header" + +first = section.first_page_header +first.paragraphs[0].text = "First page only" + +footer = section.footer +footer.paragraphs[0].text = "Page footer" + +document.save("out.docx") +``` + +- `Section.header` / `Section.footer` — Primary header/footer. +- `Section.first_page_header` / `Section.first_page_footer` — First-page variant. +- `Section.even_page_header` / `Section.even_page_footer` — Even-page variant (requires `different_odd_and_even_pages_header_footer=True`). `[Added in 2026.05.0]` +- `_Header.is_linked_to_previous` / `_Footer.is_linked_to_previous` — Read/write inheritance flag. +- `_Header.paragraphs` / `_Header.tables` / `_Header.add_paragraph(...)` / `_Header.add_table(...)` — BlockItemContainer API. + +--- + +## Comments + +Full comments support: create a comment anchored to one or more runs, add +replies, edit `author` / `initials`, and read the (timezone-aware) +timestamp. 
+ +```python +from docx import Document + +document = Document() +p = document.add_paragraph("Hello ") +r1 = p.add_run("world") +p.add_run("!") + +comment = document.add_comment( + r1, text="Consider 'globe' instead.", + author="Ben", initials="BH", +) +comment.add_reply(text="Agreed.", author="Alex", initials="AX") + +for c in document.comments: + print(c.author, c.timestamp, "—", c.text) + for reply in c.replies: + print(" ↳", reply.author, reply.text) + +document.save("out.docx") +``` + +- `Document.add_comment(runs, text="", author="", initials="", date=None)` — Add a comment with a reference range. `date` kwarg is `[Added in 2026.05.5]`. +- `Document.comments` — `Comments` collection. +- `Comments.add_comment(...)` / `Comments.get(comment_id)` / iteration / `len()`. +- `Comment.text` / `Comment.author` / `Comment.initials` / `Comment.comment_id` / `Comment.timestamp` — Core properties. `author` and `initials` are writable. `timestamp` is timezone-aware. +- `Comment.add_reply(text=None, author="", initials="")` / `Comment.replies` — Threaded replies. `[Added in 2026.05.0]` +- `Comment.add_paragraph(...)` — Multi-paragraph comment bodies. +- `Run.mark_comment_range(last_run, comment_id)` — Low-level anchor helper. + +--- + +## Footnotes and endnotes + +Footnote and endnote parts are lazily created; the collections can be +iterated and mutated. Each note is a `BlockItemContainer` you can fill with +paragraphs, runs, and pictures just like the body. Per-document and +per-section numbering properties are exposed through `FootnoteProperties` +and `EndnoteProperties`. `[Added in 2026.05.0]`. 
+ +```python +from docx import Document +from docx.enum.text import ( + WD_NUMBER_FORMAT, WD_FOOTNOTE_RESTART, WD_FOOTNOTE_POSITION, +) + +document = Document() +p = document.add_paragraph("See the note") +r = p.add_run(".") + +fn = document.footnotes.add(r, text="This is the footnote text.") +print(fn.footnote_id, fn.text) + +# document-wide restart at each section, Roman numerals +props = document.add_footnote_properties() +props.number_format = WD_NUMBER_FORMAT.LOWER_ROMAN +props.restart_rule = WD_FOOTNOTE_RESTART.EACH_SECTION +props.position = WD_FOOTNOTE_POSITION.BOTTOM_OF_PAGE + +# endnotes mirror the same API +document.endnotes.add(r, text="An endnote.") + +document.save("out.docx") +``` + +- `Document.footnotes` / `Document.endnotes` — `Footnotes` / `Endnotes` collections. `[Added in 2026.05.0]` +- `Footnotes.add(run, text="")` / `Endnotes.add(run, text="")` / iteration / `len()`. `[Added in 2026.05.0]` +- `Footnote.text` / `.footnote_id` / `.add_paragraph(...)` / `.clear()` / `.delete()` — and analogous `Endnote` members. `[Added in 2026.05.0]` +- `Document.footnote_properties` / `Document.add_footnote_properties()` / `Document.endnote_properties` / `Document.add_endnote_properties()` — Document-level. `[Added in 2026.05.0]` +- `Section.footnote_properties` / `Section.endnote_properties` / `Section.add_*` / `Section.remove_*` — Section-level overrides. `[Added in 2026.05.0]` +- `FootnoteProperties.number_format` / `.start_number` / `.restart_rule` / `.position` — Writable properties. `[Added in 2026.05.0]` +- `EndnoteProperties` — Same shape as `FootnoteProperties` with `WD_ENDNOTE_POSITION`. `[Added in 2026.05.0]` +- Enums: `WD_NUMBER_FORMAT`, `WD_FOOTNOTE_RESTART`, `WD_FOOTNOTE_POSITION`, `WD_ENDNOTE_POSITION`. + +--- + +## Bookmarks + +Bookmarks span one or more runs (or an entire paragraph) and carry a unique +name used by cross-references (`REF`, `PAGEREF`). Adding, reading, renaming, +and deleting are all supported. `[Added in 2026.05.0]`. 
+ +```python +from docx import Document + +document = Document() +p = document.add_paragraph() +r1 = p.add_run("Chapter 1") +r2 = p.add_run(" begins here.") + +# whole-paragraph bookmark +p.add_bookmark("chapter-1") + +# cross-paragraph via Document.add_bookmark (first + last run) +p2 = document.add_paragraph() +first = p2.add_run("Start") +p2.add_run(" middle ") +last = p2.add_run("end") +document.add_bookmark([first, last], name="span") + +bm = document.bookmarks.get("chapter-1") +bm.name = "ch-1" # rename +print(len(document.bookmarks), list(document.bookmarks)) + +document.save("out.docx") +``` + +- `Paragraph.add_bookmark(name, start_run=None, end_run=None)` — Add a bookmark. `[Added in 2026.05.0]` +- `Document.add_bookmark(runs, name)` — Multi-run / cross-paragraph bookmark. `[Added in 2026.05.0]` +- `Document.bookmarks` — `Bookmarks` collection (`get(name)`, iteration, `name in`, `len()`). `[Added in 2026.05.0]` +- `Bookmark.name` (read/write) / `Bookmark.bookmark_id` / `Bookmark.delete()`. `[Added in 2026.05.0]` + +--- + +## Fields and cross-references + +Simple (`w:fldSimple`) and complex (`w:fldChar`) fields can be added, +enumerated, and resolved. `REF` and `PAGEREF` are resolved against real +bookmarks (`PAGEREF` returns `?` because python-docx has no layout engine); +`DOCPROPERTY`, `AUTHOR`, `TITLE`, `SUBJECT`, `KEYWORDS`, `COMMENTS`, +`LASTSAVEDBY` are resolved from core properties. +`[Added in 2026.05.0]`. 
+ +```python +from docx import Document + +document = Document() +p1 = document.add_paragraph() +p1.add_run("Jump to ") +p1.add_simple_field(r'REF heading1 \h', text="the heading") +p1.add_run(".") + +document.add_heading("Introduction", level=1) +document.paragraphs[-1].add_bookmark("heading1") + +# resolve cross-refs in place (REF text ← bookmark text) +n = document.resolve_cross_references() +print(f"resolved {n} cross-references") + +# update-fields-on-open hint +document.settings.update_fields_on_open = True + +document.save("out.docx") +``` + +- `Paragraph.add_simple_field(instr, text=None)` — Append a `w:fldSimple`. `[Added in 2026.05.0]` +- `Paragraph.add_complex_field(instr, result_text=None)` — Append `begin`/`separate`/`end`. `[Added in 2026.05.0]` +- `Paragraph.fields` — Mixed list of simple and complex fields. `[Added in 2026.05.0]` +- `Field.instruction` / `Field.type` / `Field.result_text` / `Field.is_complex` / `Field.is_dirty` / `Field.mark_dirty()` / `Field.update_result_text(new_text)` / `Field.resolve(document)`. `[Added in 2026.05.0]` +- `Field.evaluate(context)` — Evaluate `IF` (with nested `{MERGEFIELD}`), `MERGEFIELD`, `HYPERLINK`, `= ` arithmetic formula, and `PAGE` / `NUMPAGES` / `DATE` / `TIME` placeholders against a caller-supplied mapping. `[Added in 2026.05.8]` +- `Document.resolve_cross_references()` — Walk the body, resolve `REF`/`PAGEREF`/`DOCPROPERTY`/core-property fields, return count updated. `[Added in 2026.05.0]` +- `Document.evaluate_fields(context)` — Batch-apply `Field.evaluate` across every field in the body; writes the evaluated text back in place and returns the number of fields updated. `[Added in 2026.05.8]` +- Field type detection: `docx.fields.WD_FIELD_TYPE` constants. 
`[Added in 2026.05.0]` + +```python +# data-driven field evaluation (mail-merge-style) +document = Document() +p = document.add_paragraph() +p.add_simple_field('IF {MERGEFIELD status} = "active" "Active" "Archived"', "?") + +n = document.evaluate_fields({"status": "active"}) +print(f"{n} field(s) updated") # → "Active" +``` + +--- + +## Table of contents + +`Document.add_table_of_contents()` emits a `TOC` complex field whose +*cached result text* previews the body's headings; Word rebuilds the real +TOC on open. Sibling helpers insert the TOC before or after a specific +paragraph, and List-of-Figures / List-of-Tables emit the matching +`TOC \c "Label"` fields. `[Added in 2026.05.0]`. + +```python +from docx import Document + +document = Document() +document.add_heading("Contents", level=1) +document.add_table_of_contents(levels=(1, 3)) + +document.add_heading("Chapter One", level=1) +document.add_heading("A sub-heading", level=2) +document.add_paragraph("Body text...") + +document.add_list_of_figures(caption_label="Figure") +document.add_list_of_tables(caption_label="Table") + +document.save("out.docx") +``` + +- `Document.add_table_of_contents(levels=(1, 3))` — Append a TOC. `[Added in 2026.05.0]` +- `Paragraph.insert_table_of_contents_before(levels=(1,3))` / `insert_table_of_contents_after(...)` — Place TOC adjacent to a paragraph. `[Added in 2026.05.0]` +- `Document.add_list_of_figures(caption_label="Figure")` / `Document.add_list_of_tables(caption_label="Table")` — `[Added in 2026.05.0]` +- `Document.include_sdt_flat` iteration flag on `iter_inner_content()` surfaces TOC-wrapper content. `[Added in 2026.05.0]` + +--- + +## Tracked changes + +Read and resolve tracked insertions, deletions, move revisions, and +formatting changes. Accept / reject can be applied per-change or +document-wide; a context-manager wraps new content as tracked insertions by +the named author; `revision_marks_text()` renders a text preview with +bracket markers. `[Added in 2026.05.0]`. 
+ +```python +from docx import Document + +document = Document() + +# write new content as tracked insertions by "Reviewer" +with document.tracked_changes(author="Reviewer"): + document.add_paragraph("Added under review.") + +# per-paragraph inspection +for p in document.paragraphs: + for tc in p.tracked_changes: + print(tc.type, tc.author, tc.date, repr(tc.text)) + +# CLI preview +print(document.revision_marks_text()) + +# accept everything in one shot +n = document.accept_all_changes() +print(f"resolved {n} changes") + +document.save("out.docx") +``` + +- `Document.tracked_changes(author, date=None)` — Context manager that wraps new runs in `w:ins`. `[Added in 2026.05.0]` +- `Document.accept_all_changes()` / `Document.reject_all_changes()` — Resolve every change in the body. `[Added in 2026.05.0]` +- `Document.revision_marks_text(open_ins="[+", close_ins="+]", open_del="[-", close_del="-]")` — Body-text preview with markers. `[Added in 2026.05.0]` +- `Paragraph.tracked_changes` / `Paragraph.revision_marks_text(...)` / `Paragraph.formatting_change` — Per-paragraph reads. `[Added in 2026.05.0]` +- `Run.formatting_change` / `_Cell.is_tracked_insertion` / `_Cell.is_tracked_deletion` / `_Cell.formatting_change` / `Table.formatting_change` / `Section.formatting_change` — Change detection on other types. `[Added in 2026.05.0]` +- `TrackedChange.author` / `.date` / `.text` / `.type` / `.accept()` / `.reject()`. `[Added in 2026.05.0]` +- `MoveRevision.name` / `MoveRevision.peer` — Move source ↔ destination pairing. `[Added in 2026.05.0]` +- `FormattingChange.author` / `.date` / `.old_properties` — `w:rPrChange` / `w:pPrChange` / `w:sectPrChange` reader. `[Added in 2026.05.0]` +- `Settings.track_revisions` / `Settings.rsid_root` / `Settings.rsids` — Revision-ID plumbing. `[Added in 2026.05.0]` + +--- + +## Content controls (SDT) + +Structured Document Tags (SDTs) — rich text, plain text, date, checkbox, +combo, dropdown, picture. 
Block-level and inline controls are both +supported, and custom-XML data binding can be attached or removed. +`[Added in 2026.05.0]`. + +```python +from docx import Document +from docx.content_controls import ContentControlType + +document = Document() + +# block-level rich-text placeholder +cc = document.add_content_control( + ContentControlType.RICH_TEXT, tag="description", title="Description", +) + +# inline checkbox +p = document.add_paragraph("I agree: ") +chk = p.add_content_control(ContentControlType.CHECKBOX, tag="agree") +chk.checked = True + +# data binding onto a customXml part +cc.set_data_binding( + xpath="/root/desc", + prefix_mappings="xmlns:ns0='http://example.com/schema'", + store_item_id="{ITEM-ID}", +) + +for control in document.content_controls: + print(control.type, control.tag, control.title) + +document.save("out.docx") +``` + +- `Document.add_content_control(type, tag=None, title=None)` — Block-level SDT. `[Added in 2026.05.0]` +- `Paragraph.add_content_control(type, tag=None, title=None)` — Inline SDT. `[Added in 2026.05.0]` +- `Document.content_controls` / `Paragraph.content_controls` — Collections. `[Added in 2026.05.0]` +- `ContentControl.type` / `.tag` / `.title` / `.sdt_id` / `.text` / `.checked` / `.element`. `[Added in 2026.05.0]` +- `ContentControl.data_binding` / `.set_data_binding(xpath, prefix_mappings="", store_item_id=None)` / `.remove_data_binding()`. `[Added in 2026.05.0]` +- `DataBinding.prefix_mappings` / `.xpath` / `.store_item_id`. `[Added in 2026.05.0]` +- Enum: `ContentControlType` (`RICH_TEXT`, `PLAIN_TEXT`, `DATE`, `CHECKBOX`, `COMBO_BOX`, `DROP_DOWN_LIST`, `PICTURE`). `[Added in 2026.05.0]` +- `Document.custom_xml_parts` — Read-only list of bound `CustomXmlPart` data sources. `[Added in 2026.05.0]` + +--- + +## Bibliography and citations + +A bibliography of citation sources is stored in a `/customXml/item{N}.xml` +part with a `<b:Sources>` root element.
python-docx exposes the read path +via `Document.bibliography` and the write path via `Document.add_citation` +plus `Paragraph.add_citation_reference`. The bibliography part (and its +sibling `itemProps{N}.xml` datastore part) is materialized lazily on first +use. `[Added in 2026.05.8]`. + +```python +from docx import Document + +document = Document() + +# Add a source. `tag` is the citation key used by references; `source_type` +# defaults to "Book". Extra kwargs become text-only children. +document.add_citation( + "smith2020", + title="Distributed Systems", + author="Smith, John", + year=2020, + city="London", + publisher="Acme", +) +document.add_citation( + "einstein1905", + source_type="JournalArticle", + title="Zur Elektrodynamik bewegter Koerper", + author="Einstein, Albert", + year=1905, +) + +# Insert a citation SDT that points at the source by tag. +p = document.add_paragraph("As argued in ") +p.add_citation_reference("smith2020") +p.add_run(", ...") + +# Read back. +for source in document.bibliography: + print(source.tag, source.author, source.year, source.title) + +hit = document.bibliography.get_by_tag("smith2020") +assert hit is not None and hit.year == "2020" + +document.save("out.docx") +``` + +- `Document.bibliography` — Returns a `Bibliography` proxy; lazily creates the customXml part. `[Added in 2026.05.8]` +- `Document.add_citation(tag, title=None, author=None, year=None, source_type="Book", **extra)` — Append a `Source` and return it. `[Added in 2026.05.8]` +- `Paragraph.add_citation_reference(tag, result_text=None, locale_id=1033)` — Insert a `<w:sdt>` with a `CITATION` field referencing `tag`. `[Added in 2026.05.8]` +- `Bibliography.sources` — List of every `Source`. `[Added in 2026.05.8]` +- `Bibliography.get_by_tag(tag)` — Lookup; returns `Source` or `None`. `[Added in 2026.05.8]` +- `Bibliography.selected_style` / `.style_name` — APA / MLA / etc. style selector.
`[Added in 2026.05.8]` +- `Source.tag` / `.title` / `.author` / `.year` / `.source_type` / `.element`. `[Added in 2026.05.8]` + +--- + +## Form fields + +Legacy `w:ffData` form fields (`FORMTEXT`, `FORMCHECKBOX`, `FORMDROPDOWN`) +can be authored directly and read back with a unified `FormField` +interface. `[Added in 2026.05.0]`. + +```python +from docx import Document + +document = Document() +p = document.add_paragraph("Name: ") +text_ff = p.add_text_form_field("name", default="Unknown", maxlength=32) + +p = document.add_paragraph("Subscribe: ") +chk_ff = p.add_checkbox_form_field("subscribe", checked=False) + +p = document.add_paragraph("Size: ") +dd_ff = p.add_dropdown_form_field( + "size", options=["Small", "Medium", "Large"], default_index=1, +) + +for ff in document.form_fields: + print(ff.type, ff.name, "=", ff.value) + +document.save("out.docx") +``` + +- `Paragraph.add_text_form_field(name, default="", maxlength=None)` — Add a `FORMTEXT`. `[Added in 2026.05.0]` +- `Paragraph.add_checkbox_form_field(name, checked=False)` — Add a `FORMCHECKBOX`. `[Added in 2026.05.0]` +- `Paragraph.add_dropdown_form_field(name, options, default_index=0)` — Add a `FORMDROPDOWN`. `[Added in 2026.05.0]` +- `Document.form_fields` / `Paragraph.form_fields` — Collections. `[Added in 2026.05.0]` +- `FormField.type` / `.name` / `.help_text` / `.status_text` / `.enabled` / `.calc_on_exit` / `.value` — Unified read. `[Added in 2026.05.0]` +- `FormField.text_input` / `FormField.checkbox` / `FormField.dropdown` — Typed views. `[Added in 2026.05.0]` +- `TextInputFormField.default` / `.max_length` / `.format`. `[Added in 2026.05.0]` +- `CheckboxFormField.default` / `.checked`. `[Added in 2026.05.0]` +- `DropdownFormField.options` / `.default_index` / `.result_index`. `[Added in 2026.05.0]` +- Enum: `WD_FORM_FIELD_TYPE`. 
`[Added in 2026.05.0]` + +--- + +## Watermarks + +Text and image watermarks are attached to a section's header via +`Section.add_text_watermark()` / `Section.add_image_watermark()`. +`[Added in 2026.05.0]`. + +```python +from docx import Document +from docx.shared import Inches + +document = Document() +section = document.sections[0] + +section.add_text_watermark( + text="CONFIDENTIAL", + font="Calibri", + color="C0C0C0", + size=72, +) + +# or an image watermark +# section.add_image_watermark("watermark.png", width=Inches(4)) + +wm = section.watermark +if wm is not None: + print(wm.type, wm.text) + +section.remove_watermark() +document.save("out.docx") +``` + +- `Section.add_text_watermark(text, font=None, size=None, color=None, bold=False, italic=False, semi_transparent=True)` — `[Added in 2026.05.0]` +- `Section.add_image_watermark(image_path_or_stream, width=None, height=None)` — `[Added in 2026.05.0]` +- `Section.remove_watermark()` / `Section.watermark` — `[Added in 2026.05.0]` +- `Watermark.type` / `Watermark.text` — Read-only introspection. `[Added in 2026.05.0]` + +--- + +## Captions + +Captions are paragraphs styled `"Caption"` that carry a `SEQ` field for +auto-numbering (`Figure 1`, `Table 7`, etc.). Helpers append or insert +captions relative to a figure or table. `[Added in 2026.05.0]`. + +```python +from docx import Document + +document = Document() +# picture followed by a caption below +picture_p = document.add_paragraph() +picture_p.add_run().add_picture("diagram.png") + +document.add_caption("Architecture overview", label="Figure") + +# table with caption above +tbl = document.add_table(rows=2, cols=2) +tbl_p = tbl._element.addprevious # conceptually +tbl._element.getparent() # paragraph helpers work on the surrounding paragraph +``` + +- `Document.add_caption(text, label="Figure", style="Caption")` — Append a numbered caption paragraph. 
`[Added in 2026.05.0]` +- `Paragraph.add_caption_before(text, label="Figure", style="Caption")` / `Paragraph.add_caption_after(...)` — Insert adjacent caption. `[Added in 2026.05.0]` +- `docx.captions.new_caption_paragraph(paragraph, text, label, style)` — Low-level helper. `[Added in 2026.05.0]` +- Caption sequences automatically include the `SEQ {label} \* ARABIC` field; Word renumbers on open. + +--- + +## Mail merge + +Mail-merge main-document settings are readable and writable via +`Settings.mail_merge`. `[Added in 2026.05.0]`. + +```python +from docx import Document +from docx.settings import WD_MAIL_MERGE_DESTINATION, WD_MAIL_MERGE_TYPE + +document = Document() +document.settings.enable_mail_merge( + main_document_type=WD_MAIL_MERGE_TYPE.FORM_LETTERS, + destination=WD_MAIL_MERGE_DESTINATION.NEW_DOCUMENT, +) + +mm = document.settings.mail_merge +print(mm.main_document_type, mm.destination) + +document.settings.disable_mail_merge() +document.save("out.docx") +``` + +- `Settings.mail_merge` — `MailMerge` proxy or `None`. `[Added in 2026.05.0]` +- `Settings.enable_mail_merge(main_document_type=..., destination=..., data_type=...)` — Turn it on. `[Added in 2026.05.0]` +- `Settings.disable_mail_merge()` — Remove the `w:mailMerge`. `[Added in 2026.05.0]` +- `MailMerge.main_document_type` / `.destination` / `.data_type` — Per-property reads and writes. `[Added in 2026.05.0]` +- Enums: `WD_MAIL_MERGE_TYPE`, `WD_MAIL_MERGE_DESTINATION`. `[Added in 2026.05.0]` + +--- + +## Document properties + +Core (Dublin-Core), custom (typed user-defined), and extended (application) +properties are all exposed. `CustomProperties` is a dict-like typed mapping; +`ExtendedProperties` covers `docProps/app.xml` (Company, Manager, Pages, +Words, TotalTime, AppVersion...).
+ +```python +from docx import Document + +document = Document() + +cp = document.core_properties +cp.author = "Ben" +cp.title = "Quarterly Report" + +# typed custom properties +document.custom_properties["ReviewerCount"] = 3 +document.custom_properties["IsDraft"] = True +document.custom_properties["ReleaseDate"] = "2026-05-01" + +# extended properties +ep = document.extended_properties +ep.set("Company", "Example Inc") +print(ep.get("Application")) + +document.save("out.docx") +``` + +- `Document.core_properties` — `CoreProperties` (author, title, subject, keywords, category, comments, content_status, identifier, language, version, created, last_modified_by, last_printed, modified, revision). +- `Document.custom_properties` — `CustomProperties` mapping. `[Added in 2026.05.0]` +- `CustomProperties.__getitem__` / `__setitem__` / `__delitem__` / `__contains__` / `__len__` / `__iter__` / `.add(name, value)` / `.get(name, default=None)` / `.names()` / `.items()` — Full mapping interface. Supports `str`, `int`, `float`, `bool`, and date strings. `[Added in 2026.05.0]`. `datetime.date` values serialise as `vt:date` (ISO-8601 `YYYY-MM-DD`) and `datetime.datetime` values as `vt:filetime`. `[Added in 2026.05.8]` +- `Document.extended_properties` — `ExtendedProperties` (`app.xml`) proxy. `[Added in 2026.05.0]` +- `ExtendedProperties.get(name)` / `.set(name, value)` / `.clear_all()` — Generic reads/writes; typed property accessors (`company`, `manager`, `pages`, `words`, `characters`, `total_time`, `application`, `app_version`, `template`, etc.) are generated from a declarative spec. `[Added in 2026.05.0]` + +--- + +## Settings + +`Document.settings` is a rich proxy over `word/settings.xml`. The fork adds +compatibility flags, doc-vars, theme-font language, mail merge, view, +spell/grammar toggles, auto-hyphenation, and explicit footnote/endnote +properties. Document protection is exposed as a structured object with +Word-compatible password hashing. 
+ +```python +from docx import Document +from docx.settings import WD_VIEW, WD_PROTECTION + +document = Document() +settings = document.settings + +settings.view = WD_VIEW.WEB +settings.track_revisions = True +settings.update_fields_on_open = True +settings.hide_spelling_errors = True +settings.auto_hyphenation = True +settings.compat_flags["allowSpaceOfSameStyleInTable"] = True +settings.doc_vars["GreetingName"] = "World" +settings.theme_font_language = ("en-US", "ja-JP", None) + +# protection (filling forms, tracked changes, comments, read-only) +settings.enable_protection(WD_PROTECTION.READ_ONLY, password="secret") + +document.save("out.docx") +``` + +- `Document.settings` — `Settings` proxy. +- `Settings.compatibility_mode` / `Settings.compat_settings` / `Settings.compat_flags` — Compatibility plumbing. `[Added in 2026.05.0]` for `compat_settings` and `compat_flags`. +- `Settings.default_tab_stop` / `Settings.zoom_percent` / `Settings.view` — Layout & view. `view` is `[Added in 2026.05.0]`. +- `Settings.track_revisions` / `Settings.rsid_root` / `Settings.rsids` — Track changes. `[Added in 2026.05.0]` for rsids. +- `Settings.update_fields_on_open` — Tell Word to refresh fields on open. `[Added in 2026.05.0]` +- `Settings.odd_and_even_pages_header_footer` / `Settings.even_and_odd_headers` — Odd/even header footer flag. +- `Settings.theme_font_language` — `(latin, east_asian, bidi)` tuple. `[Added in 2026.05.0]` +- `Settings.hide_spelling_errors` / `Settings.hide_grammatical_errors` / `Settings.auto_hyphenation` / `Settings.do_not_hyphenate_caps` / `Settings.consecutive_hyphen_limit` / `Settings.hyphenation_zone` — Proofing and hyphenation. `[Added in 2026.05.0]` +- `Settings.doc_vars` — `DocVars` dict-like (w:docVars). `[Added in 2026.05.0]` +- `Settings.mail_merge` / `.enable_mail_merge(...)` / `.disable_mail_merge()` — See [Mail merge](#mail-merge). 
`[Added in 2026.05.0]` +- `Settings.footnote_properties` / `Settings.endnote_properties` / `Settings.add_*` / `Settings.remove_*` — Document-level note properties. `[Added in 2026.05.0]` +- `Settings.document_protection` / `Settings.enable_protection(mode, password=None)` / `Settings.disable_protection()` — See [Permissions](#permissions-and-protection). `[Added in 2026.05.0]` +- `CompatSettings` / `CompatFlags` / `DocVars` — Dict-like subtype helpers. `[Added in 2026.05.0]` +- Enums: `WD_VIEW`, `WD_PROTECTION`, `WD_MAIL_MERGE_TYPE`, `WD_MAIL_MERGE_DESTINATION`. + +--- + +## Themes + +`Document.theme` exposes the `theme1.xml` part read-only. Theme colors and +theme fonts are accessible as typed structures. `[Added in 2026.05.0]`. + +```python +from docx import Document + +document = Document("branded.docx") +theme = document.theme +if theme is not None: + print("Theme:", theme.name) + print("Major (Headings) font:", theme.fonts.major_latin) + print("Minor (Body) font:", theme.fonts.minor_latin) + print("Accent 1 color:", theme.colors.accent_1) + print("Hyperlink color:", theme.colors.hyperlink) +``` + +- `Document.theme` — `Theme` proxy or `None`. `[Added in 2026.05.0]` +- `Theme.name` / `.colors` / `.fonts`. `[Added in 2026.05.0]` +- `ThemeColors.dark_1` / `.dark_2` / `.light_1` / `.light_2` / `.accent_1` ... `.accent_6` / `.hyperlink` / `.followed_hyperlink` / `ThemeColors[name]`. `[Added in 2026.05.0]` +- `ThemeFonts.major_latin` / `.minor_latin` / `.major_east_asian` / `.minor_east_asian` / `.major_cs` / `.minor_cs` / `.name`. `[Added in 2026.05.0]` + +--- + +## Permissions and protection + +Document-wide protection (read-only, filling-forms, comments, +tracked-changes) is controlled through `Settings.document_protection` and +its `enable_protection()` / `disable_protection()` helpers. Range-level +permissions (`w:permStart`/`w:permEnd`) restrict edits to a specific user +or group within the document. `[Added in 2026.05.0]` across the board. 
+ +```python +from docx import Document +from docx.settings import WD_PROTECTION + +document = Document() + +# global: only allow tracked-changes edits +document.settings.enable_protection(WD_PROTECTION.TRACKED_CHANGES, + password="s3cret!") + +# range-level: a paragraph only editable by "alex@example.com" +p = document.add_paragraph("Restricted section.") +p.add_permission_range(user="alex@example.com") + +for pr in document.permission_ranges: + print(pr.id, pr.user, pr.edit_group) + +document.save("out.docx") +``` + +- `Settings.document_protection` — `DocumentProtection` proxy. `[Added in 2026.05.0]` +- `DocumentProtection.mode` / `.enforce` / `.formatting_locked` / `.password_hash` / `.password_salt` / `.crypto_*` / `.spin_count` — Read/write. `.set_password(password)` hashes with Word's algorithm. `[Added in 2026.05.0]` +- `Settings.enable_protection(mode, password=None)` / `Settings.disable_protection()` — High-level shortcuts. `[Added in 2026.05.0]` +- `Paragraph.add_permission_range(name=None, user=None, edit_group=None)` — Wrap a paragraph in a `w:permStart`. `[Added in 2026.05.0]` +- `Paragraph.permission_ranges` / `Document.permission_ranges` — Collections. `[Added in 2026.05.0]` +- `PermissionRange.id` / `.user` / `.edit_group` / `.displaced_by_custom_xml` / `.delete()`. `[Added in 2026.05.0]` +- Enum: `WD_PROTECTION` (`NONE`, `READ_ONLY`, `COMMENTS`, `TRACKED_CHANGES`, `FORMS`). `[Added in 2026.05.0]` + +--- + +## Ink annotations + +Ink annotations (`` pointing at an `inkml` part) are +read-only — you can iterate them, read the raw ink-ML blob, and see how +many strokes they hold. `[Added in 2026.05.0]`. + +```python +from docx import Document + +document = Document("with-ink.docx") +for ink in document.ink_annotations: + print(ink.partname, ink.stroke_count, len(ink.blob), "bytes") +``` + +- `Document.ink_annotations` / `Paragraph.ink_annotations` — Iterators over `InkAnnotation`. 
`[Added in 2026.05.0]` +- `InkAnnotation.blob` / `.partname` / `.stroke_count` / `.paragraph`. `[Added in 2026.05.0]` + +--- + +## Embedded objects and attachments + +Embedded OLE objects (Excel sheets, PDFs, arbitrary files) can be added via +`Run.add_ole_object()`. `altChunk` attachments — arbitrary foreign payloads +(HTML, RTF, another docx) that Word merges on open — are added with +`Document.add_alt_chunk()`. Both are also exposed read-only as +`embedded_objects` / `attachments` / `alt_chunks`. `[Added in 2026.05.0]`. + +```python +from docx import Document + +document = Document() +p = document.add_paragraph("See attached: ") +r = p.add_run() +r.add_ole_object("model.xlsx", prog_id="Excel.Sheet.12") + +# HTML altChunk +document.add_alt_chunk( + "
<html><body><p>HTML content</p></body></html>
", + content_type="text/html", +) + +for ole in document.embedded_objects: + print(ole.prog_id, len(ole.blob)) + +for chunk in document.alt_chunks: + print(chunk.content_type, len(chunk.blob)) + +document.save("out.docx") +``` + +- `Run.add_ole_object(ole_path_or_stream, prog_id, icon_path_or_stream=None)` — Embed an OLE payload inline. `[Added in 2026.05.0]` +- `Document.embedded_objects` / `Paragraph.embedded_objects` — Collections of `EmbeddedObject`. `[Added in 2026.05.0]` +- `EmbeddedObject.blob` / `.embedded_partname` / `.prog_id` / `.r_id` / `.type` / `.paragraph`. `[Added in 2026.05.0]` +- `Document.add_alt_chunk(content, content_type="text/html")` — Append a `w:altChunk`. `[Added in 2026.05.0]` +- `Document.alt_chunks` — List of `AltChunk` proxies. `[Added in 2026.05.0]` +- `Document.attachments` — List of `Attachment` (same underlying `altChunk` elements, read-oriented). `[Added in 2026.05.0]` +- `Attachment.r_id` / `.content_type` / `.blob` / `.partname`. `[Added in 2026.05.0]` + +--- + +## Font table + +`fontTable.xml` describes the fonts referenced by the document and can +embed TTF bytes for private fonts. The fork exposes a read-only view of the +table plus `add_embedded_font()` for authoring. `[Added in 2026.05.0]`. + +```python +from docx import Document + +document = Document("with-fonts.docx") +ft = document.font_table +if ft is not None: + for meta in ft: + print(meta.name, meta.family, meta.embed_regular) + print("Calibri" in ft) + +# create an embedded-font slot when authoring +ft2 = document.font_table_or_new +``` + +- `Document.font_table` — `FontTable` or `None`. `[Added in 2026.05.0]` +- `Document.font_table_or_new` — Same, but creates an empty part if missing. `[Added in 2026.05.0]` +- `FontTable.__iter__` / `__len__` / `__contains__` / `__getitem__` / `.get(name)` / `.add_embedded_font(name, ttf_blob, style="regular")`. 
`[Added in 2026.05.0]` +- `FontMetadata.name` / `.family` / `.charset` / `.pitch` / `.panose` / `.alt_name` / `.embed_regular` / `.embed_bold` / `.embed_italic` / `.embed_bold_italic`. `[Added in 2026.05.0]` + +--- + +## Web settings + +`webSettings.xml` is exposed read-oriented via `Document.web_settings`. +`[Added in 2026.05.0]`. + +```python +from docx import Document + +document = Document("some.docx") +ws = document.web_settings +if ws is not None: + print(ws.encoding, ws.optimize_for_browser, ws.allow_png) +``` + +- `Document.web_settings` — `WebSettings` proxy or `None`. `[Added in 2026.05.0]` +- `WebSettings.encoding` / `.optimize_for_browser` / `.allow_png` / `.do_not_save_as_single_file`. `[Added in 2026.05.0]` + +--- + +## Glossary (building blocks) + +The glossary document (AutoText / Quick Parts / cover pages) is read-only +in `Document.glossary`. You can enumerate building blocks, filter by +category or gallery, and inspect each entry's paragraphs and tables. +`[Added in 2026.05.0]`. + +```python +from docx import Document + +document = Document("template-with-glossary.dotx") +g = document.glossary +if g is not None: + print("%d building blocks" % len(g)) + for bb in g: + print(bb.name, "→", bb.category.category_name, "/", bb.category.gallery) + print("categories:", g.categories) + print("galleries:", g.galleries) +``` + +- `Document.glossary` — `Glossary` or `None`. `[Added in 2026.05.0]` +- `Glossary.__iter__` / `__len__` / `__getitem__(name)` / `.building_blocks` / `.categories` / `.galleries` / `.by_category(name=None, gallery=None)`. `[Added in 2026.05.0]` +- `BuildingBlock.name` / `.category` / `.description` / `.guid` / `.paragraphs` / `.tables`. `[Added in 2026.05.0]` +- `BuildingBlockCategory.category_name` / `.gallery` / `.gallery_value`. `[Added in 2026.05.0]` + +--- + +## Digital signatures + +Digital signatures are detected and enumerated; no verification is +performed. `[Added in 2026.05.0]`. 
+ +```python +from docx import Document + +document = Document("signed.docx") +if document.is_signed: + for sig in document.signatures: + print(sig.partname, sig.signer, sig.signed_at) +``` + +- `Document.is_signed` — `True` when `_xmlsignatures/*` parts exist. `[Added in 2026.05.0]` +- `Document.signatures` — List of `SignatureInfo`. `[Added in 2026.05.0]` +- `SignatureInfo.partname` / `.blob` / `.signer` / `.signed_at`. `[Added in 2026.05.0]` + +--- + +## Accessibility + +`Document.validate_heading_structure()` returns a list of `HeadingIssue` +objects describing outline problems — skipped levels, multiple H1s, empty +headings, starting-below-H1. `InlineShape.alt_text` / `Table.alt_text` +expose the accessibility fields. `[Added in 2026.05.0]`. + +```python +from docx import Document + +document = Document("document.docx") + +# image alt text +for shape in document.inline_shapes: + shape.alt_text = shape.alt_text or "An image" + +# heading outline +issues = document.validate_heading_structure() +for issue in issues: + print(issue.code, "@", issue.paragraph_index, issue.message) +``` + +- `Document.validate_heading_structure()` — List of `HeadingIssue`. `[Added in 2026.05.0]` +- `HeadingIssue` — `code`, `message`, `paragraph_index`, `heading_level`, `heading_text`. `[Added in 2026.05.0]` +- `InlineShape.alt_text` / `.title` / `FloatingImage.alt_text` / `.title` — Accessibility metadata. `[Added in 2026.05.0]` +- `Table.alt_text` / `Table.alt_description` — Table alt text. `[Added in 2026.05.0]` + +--- + +## Document statistics + +`Document.statistics` returns a `DocumentStatistics` namedtuple with the +counts Word displays in its "Word Count" dialog. The body story is counted; +headers, footers, footnotes, endnotes, and comments are not. Pages are +sourced from the cached value in `docProps/app.xml` (python-docx does not +lay the document out). `[Added in 2026.05.0]`. 
+ +```python +from docx import Document + +document = Document("report.docx") +stats = document.statistics +print("paragraphs:", stats.paragraphs) +print("words: ", stats.words) +print("characters:", stats.characters) +print("characters (no spaces):", stats.characters_no_spaces) +print("pages: ", stats.pages) # may be None +``` + +- `Document.statistics` — `DocumentStatistics(paragraphs, words, characters, characters_no_spaces, pages)`. `[Added in 2026.05.0]` + +--- + +## Search and replace + +Plain-text and regex-based search + replace work against body paragraphs +(`search` / `replace` / `search_regex` / `replace_regex`) or across every +story in the document (`_all` variants — body plus headers, footers, +footnotes, endnotes, and comments). All variants preserve run formatting +of the first character's run. `[Added in 2026.05.0]`. + +```python +import re +from docx import Document + +document = Document() +document.add_paragraph("Hello world") +document.add_paragraph("Hello again") + +# plain text +matches = document.search("Hello", case_sensitive=False) +for m in matches: + print(m.paragraph_index, m.start, m.end) + +# replace in body only +document.replace("Hello", "Hi") + +# regex replace everywhere (headers / footers / footnotes too) +document.replace_regex_all(re.compile(r"\bHi\b"), "Hiya") + +document.save("out.docx") +``` + +- `Document.search(text, case_sensitive=True, whole_word=False)` — Body-only matches. `[Added in 2026.05.0]` +- `Document.search_all(text, case_sensitive=True, whole_word=False)` — Every story. `[Added in 2026.05.0]` +- `Document.search_regex(pattern, flags=0)` / `Document.search_regex_all(pattern, flags=0)` — Regex search. `[Added in 2026.05.0]` +- `Document.replace(old, new, case_sensitive=True, whole_word=False)` / `Document.replace_all(...)` — Body / all-stories replacement. `[Added in 2026.05.0]` +- `Document.replace_regex(pattern, replacement, flags=0)` / `Document.replace_regex_all(...)` — Regex replacement. 
`[Added in 2026.05.0]` +- `SearchMatch.paragraph` / `.paragraph_index` / `.run_indices` / `.start` / `.end` / `.location` — Match metadata; `location` identifies the story. `[Added in 2026.05.0]` +- Story location strings include `"body"`, `"table:0:row:1:col:2"`, `"header:section0:primary"`, `"footnote:2"`, `"endnote:3"`, `"comment:5"`. + +--- + +## Cross-document operations + +Whole documents can be appended, paragraph-by-paragraph copies can be +imported with their style / numbering / image dependencies, and individual +tables, headers, and footers can be copied between sections or documents. +`[Added in 2026.05.0]`. + +```python +from docx import Document + +merged = Document() +chapter1 = Document("chapter1.docx") +chapter2 = Document("chapter2.docx") + +# append whole bodies (images, styles, numbering, etc. all follow) +merged.append_document(chapter1) +merged.append_document(chapter2) + +# copy a single paragraph +another = Document("snippets.docx") +merged.append_paragraph(another.paragraphs[0]) + +# copy a single table (including styles + images) +merged.add_table_copy(another.tables[0]) + +# copy a header between sections +merged.sections[1].copy_header_from(merged.sections[0]) + +# import a style from another document +merged.styles.import_from(another, names=["BodyQuote"]) + +merged.save("book.docx") +``` + +- `Document.append_document(other)` / `Document.append_body(other)` — Append another document's body. Returns the number of block elements copied. `[Added in 2026.05.0]` +- `Document.append_paragraph(paragraph)` — Copy a single paragraph with dependencies. `[Added in 2026.05.0]` +- `Document.add_table_copy(other_table)` / `Document.add_table_from(other_table)` — Deep-copy a table. 
`[Added in 2026.05.0]` +- `Section.copy_header_from(other_section)` / `Section.copy_footer_from(other_section)` — `[Added in 2026.05.0]` +- `Styles.import_from(other_doc, names)` / `Styles.import_style(style)` / `Styles.import_builtin(name)` — Style import with `basedOn` / `next` / `link` resolution. `[Added in 2026.05.0]` + +--- + +## Packaging and I/O options + +`Document.save()` supports: +- A regular `.docx` / `.docm` save (default). +- A **reproducible** zip layout with a fixed timestamp, sorted members, and + no extra metadata — byte-identical output for the same content. + `[Added in 2026.05.0]` +- A **Flat-OPC** (`<pkg:package>`) single-XML serialisation. + `[Added in 2026.05.0]` +- Path objects (`os.PathLike`). `[Added in 2026.05.0]` +- `.docm` macro-enabled output (auto-detected from the loaded part + content-type). `[Added in 2026.05.0]` + +Opening supports: +- `.docx`, `.docm`, `.dotx`, `.dotm` packages (`Document()`). + `.dotx` / `.dotm` template discrimination is `[Added in 2026.05.0]`. +- Strict-OOXML packages translated on the fly. `[Added in 2026.05.0]` +- Flat-OPC input auto-detected. `[Added in 2026.05.0]` +- `recover=True` tolerating malformed XML with warnings on + `Document.recovery_warnings`. `[Added in 2026.05.0]` +- `huge_tree=True` relaxing lxml's XML-bomb safety limits. + `[Added in 2026.05.0]` +- `include_metadata=False` stripping the default template's core / + extended properties on load. `[Added in 2026.05.0]` +- `EncryptedDocumentError` raised for password-protected packages when + no `password=` is supplied. `[Added in 2026.05.0]` +- `password=` kwarg decrypts an ECMA-376 Agile-Encryption + (password-protected) `.docx` via the optional `python-ooxml-crypto` + dependency.
`[Added in 2026.05.10]` + +```python +from docx import Document + +# reproducible save +doc = Document() +doc.add_paragraph("This will be byte-identical on every save.") +doc.save("rep.docx", reproducible=True) + +# Flat-OPC +doc.save("rep.xml", flat_opc=True) + +# macro-enabled: open a .docm and save as .docm +macro = Document("macros.docm") +print(macro.has_macros) +macro.save("macros-out.docm") + +# recover mode +with open("bad.docx", "rb") as f: + broken = Document(f, recover=True) +print(broken.recovery_warnings) + +# password-protected (requires optional `python-ooxml-crypto`) +doc = Document() +doc.add_paragraph("confidential") +doc.save("protected.docx", password="hunter2") +reopened = Document("protected.docx", password="hunter2") +``` + +- `Document.save(path_or_stream, flat_opc=False, reproducible=False, password=None)` — Save options as above. `password=` encrypts the output using ECMA-376 Agile Encryption via `python-ooxml-crypto`. `[Added in 2026.05.10]` +- `Document.has_macros` — `True` when a VBA project is present. `[Added in 2026.05.0]` +- `docx.exceptions.EncryptedDocumentError` — `[Added in 2026.05.0]` +- `docx.exceptions.RmsProtectedDocumentError` — `[Added in 2026.05.10]` +- `docx.package.Package` — `Package.open(...)` / `.is_signed` / `.recovery_warnings` / `.signatures` — Low-level package access. +- Flat-OPC helpers: `docx.opc.flat_opc.write_flat_opc` / `is_flat_opc`. + +### Password-protected documents + +python-docx supports both reading and writing ECMA-376 Agile-Encryption +password-protected `.docx` files via the optional +[`python-ooxml-crypto`](https://github.com/loadfix/python-ooxml-crypto) +dependency. Install it with `pip install 'python-docx[encryption]'` (or +directly with `pip install python-ooxml-crypto`). 
+ +```python +from docx import Document +from docx.exceptions import EncryptedDocumentError + +# encrypt on save +doc = Document() +doc.add_paragraph("confidential") +doc.save("protected.docx", password="hunter2") + +# decrypt on load +reopened = Document("protected.docx", password="hunter2") + +# wrong-password / missing-password cases raise EncryptedDocumentError +try: + Document("protected.docx", password="wrong") +except EncryptedDocumentError as e: + print(e) +``` + +Azure RMS / AIP / IRM-wrapped files (whose payload is keyed to the user's +Microsoft 365 identity, not a password) cannot be decrypted by +python-ooxml-crypto and raise `docx.exceptions.RmsProtectedDocumentError` +(a subclass of `EncryptedDocumentError`). Delegate decryption to +Microsoft Office automation or the Microsoft Information Protection SDK +before opening such files with python-docx. `[Added in 2026.05.10]` + +--- + +## API concepts + +`python-docx` is organised in three layers: + +- **Document API** (`src/docx/document.py`, `src/docx/text/*.py`, + `src/docx/table.py`, `src/docx/section.py`, etc.) — proxy objects wrapping + OOXML elements. This is where the overwhelming majority of user code + lives. +- **Parts layer** (`src/docx/parts/*.py`) — `XmlPart` subclasses that own + the XML trees for each of the document's constituent parts (document, + numbering, styles, comments, footnotes, endnotes, chart, settings, + custom-xml, font-table, ...) and manage the relationships between them. +- **oxml layer** (`src/docx/oxml/*.py`) — `CT_*` classes extending + `lxml.etree.ElementBase` and mapping directly onto schema element names. + +`lxml` handles the XML parsing, serialisation, and XPath work beneath the +library. `docx.shared` carries `Length` subclasses (`Inches`, `Cm`, `Mm`, +`Pt`, `Emu`, `Twips`), `RGBColor`, `ElementProxy`, and `StoryChild`. 
+ +```python +from docx import Document +from docx.shared import Inches, Cm, Pt, RGBColor + +document = Document() +# any Length is just a typed int — freely interchangeable +width = Inches(2) +print(int(width), Cm(5.08)) # same length, different constructor +print(Pt(12), RGBColor(0x2E, 0x74, 0xB5)) +document.sections[0].left_margin = Cm(2.5) +document.save("out.docx") +``` + +- `docx.shared.Length` / `Inches` / `Cm` / `Mm` / `Pt` / `Emu` / `Twips` — Length constructors and arithmetic. +- `docx.shared.RGBColor` — `(r, g, b)` triple with `from_string()`, `rgb`, hex output. +- `docx.shared.ElementProxy` / `Parented` / `StoryChild` — Proxy base classes. +- `docx.opc.constants.CONTENT_TYPE` / `RELATIONSHIP_TYPE` — Content-type and rel-type constants used by the parts layer. +- `docx.oxml.ns.qn(tag)` — Clark-notation tag expansion; only needed when dropping into the oxml layer. + +--- + +*This file is generated and maintained by hand — see `HISTORY.rst` for the +full change log, `docs/user/*.rst` for narrative tutorials, and +`docs/api/*.rst` for per-class API reference pages.* diff --git a/HISTORY.rst b/HISTORY.rst index 69bba4161..72fc0f262 100644 --- a/HISTORY.rst +++ b/HISTORY.rst @@ -3,6 +3,508 @@ Release History --------------- +2026.05.10 — Password-protected read + write +++++++++++++++++++++++++++++++++++++++++++++ + +Released: 2026-05-10 + +python-docx now **reads and writes** password-protected ``.docx`` +files (ECMA-376 Agile Encryption — the scheme Word uses when a user +sets a password in the desktop app). Previous releases only *detected* +encrypted input and raised ``EncryptedDocumentError`` pointing users at +an external tool. This release delegates actual AES key derivation and +CFBF (OLE2) compound-document parsing to the new optional +``python-ooxml-crypto`` dependency, mirroring the read/write surface +python-pptx already ships (closes #327 upstream in the sibling repo; +unlocks the same workflow for python-docx). 
+ +- **``Document(path, password=...)``** decrypts an encrypted ``.docx`` + on open. Supplying the kwarg with no encryption is a no-op; omitting + it when the input is encrypted continues to raise + ``EncryptedDocumentError`` with the now-updated message pointing + callers at ``python-ooxml-crypto`` instead of the old + ``msoffcrypto-tool`` recommendation. +- **``Document.save(path, password=...)``** encrypts the output using + ECMA-376 Agile Encryption. ``flat_opc=True`` and ``password=`` are + mutually exclusive (Flat-OPC is an XML document, not a zip). + ``reproducible=True`` and ``password=`` compose normally — the + fixed-timestamp zip is built first and then encrypted. +- **New ``docx.exceptions.RmsProtectedDocumentError``** (subclass of + ``EncryptedDocumentError``) is raised when opening a file wrapped in + Azure RMS / AIP / IRM protection. The payload is keyed to the user's + Microsoft 365 identity rather than a password, so python-ooxml-crypto + cannot decrypt it — delegate to Microsoft Office automation or the + Microsoft Information Protection SDK before opening with python-docx. +- **New adapter module ``docx.opc._crypto``** with the public helpers + ``is_encrypted_stream``, ``is_rms_protected_stream``, + ``decrypt_stream``, and ``encrypt_bytes``. The adapter is the single + point where the optional ``ooxml_crypto`` import is resolved; every + error from that library is rewrapped as + ``EncryptedDocumentError`` with an actionable message. +- **Optional install extra.** ``pip install 'python-docx[encryption]'`` + pulls in ``python-ooxml-crypto``. The library keeps zero new + mandatory runtime dependencies; calling ``Document(path, + password=...)`` (or ``Document.save(..., password=...)``) without + the extra installed raises ``EncryptedDocumentError`` with the + install instructions. + + +2026.05.9 — Audit bug-fix round ++++++++++++++++++++++++++++++++ + +Released: 2026-05-05 + +Small targeted fixes surfaced by the 2026-05-05 audit. 
No new +feature surface; existing behaviour either gets a regression test +or a crisper error type. + +- **vt:date round-trip regression test.** The ``datetime.date`` + serialisation added in 2026.05.8 (commit ``c3edf01b``) now has a + full ``Document`` → ``custom.xml`` → reload regression test + (``tests/test_custom_properties.py::DescribeCustomProperties_RoundTrip``) + so the GitHub issue #171 round-trip behaviour stays locked in. +- **Typed exception on missing ``[Content_Types].xml``** (closes + #172). Loading a zip that happens to be a valid archive but lacks + the mandatory OPC content-types part used to leak a bare + ``KeyError('[Content_Types].xml')`` from ``zipfile.read``. + ``docx.opc.pkgreader.PackageReader.from_file`` now wraps it in + ``docx.opc.exceptions.PackageNotFoundError`` at the narrowest + possible scope, matching the corpus manifest + ``malformed-content-types-missing`` (whose ``forbidden_exception`` + clause explicitly rejected bare ``KeyError``). +- **Explicit ``__all__`` on 12 public submodules.** ``docx.table``, + ``docx.section``, ``docx.bookmarks``, ``docx.blkcntnr``, + ``docx.dml.color``, ``docx.drawing``, ``docx.equations``, + ``docx.styles.styles``, ``docx.styles.style``, + ``docx.text.paragraph``, ``docx.text.run``, + ``docx.text.pagebreak`` now declare the public surface so + internal ``CT_*`` / ``ST_*`` names can no longer be reached via + ``from docx.<module> import *``. Only star-imports are affected — + existing explicit imports continue to work. + + +2026.05.8 — New authoring APIs ++++++++++++++++++++++++++++++++ + +Released: 2026-05-05 + +Three independently-developed authoring feature branches landed in +this release, extending the fork's writer surface in areas previously +supported for *read* only (or not at all). + +SmartArt +~~~~~~~~ + +- New ``Document.add_smart_art(layout_name)`` returns a ``SmartArt`` + proxy. Built-in layouts: ``"list"``, ``"cycle"``, ``"process"``. 
+ Each call provisions the full quartet of SmartArt parts + (``diagrams/data{N}.xml``, ``layout{N}.xml``, ``quickStyle{N}.xml``, + ``colors{N}.xml``) from the templates under + ``src/docx/templates/smart_art/`` and wires the drawing into the + document body at the current insertion point. +- New ``SmartArt.add_node(text)`` appends a data-point node into the + underlying ````/```` with the text you + supply, picking up the layout's default style so the rendered shape + picks the right fill/line/font automatically. +- See ``FEATURES.md`` § "SmartArt" for the full snippet. + +Bibliography and citations +~~~~~~~~~~~~~~~~~~~~~~~~~~ + +- New ``Document.bibliography`` property returns a ``Bibliography`` + proxy (read + write). On first access it lazily provisions + ``/customXml/item{N}.xml`` (with a ```` root) plus the + matching ``itemProps{N}.xml`` and relates both to the document part. +- New ``Document.add_citation(tag, source_type, ...)`` adds a + ```` entry to the bibliography. ``tag`` is the key that + citation references resolve against. +- New ``Paragraph.add_citation_reference(tag)`` inserts an ``SDT`` + citation marker that Word reifies to ``(Author, Year)`` using the + current bibliography style. +- The save-time custom-XML drop heuristic now preserves freshly- + authored bibliography parts even without a ``w:dataBinding`` + (citations bind implicitly through matching ```` values). +- See ``FEATURES.md`` § "Bibliography and citations". + +Field evaluation +~~~~~~~~~~~~~~~~ + +- New ``Field.evaluate(context)`` and + ``Document.evaluate_fields(context)`` evaluate complex field codes + against a supplied context dict. Supported codes: + + - ``MERGEFIELD FieldName`` — substitutes ``context["FieldName"]``. + - ``IF cond op cond "then" "else"`` — boolean evaluation with + nested ``{MERGEFIELD}`` allowed on either side of the comparator. + - ``HYPERLINK "url"`` — resolves to the URL and updates the + displayed run so the cached result matches. 
+ - ``= `` — arithmetic formula evaluator (``+``, ``-``, ``*``, + ``/``, parentheses, numeric literals, and references to + ``context`` keys). + - ``PAGE`` / ``NUMPAGES`` / ``DATE`` / ``TIME`` — runtime-dynamic + placeholders pulled from the context or from ``datetime.now()``. +- Deferred (raised as ``FieldEvalError``): string-function formulas + (``=SUM()``, ``=AVERAGE()`` beyond arithmetic), nested ``IF``, + ``QUOTE``, ``FILLIN``, and the full date-picture/numeric-format + switch grammar. +- See ``FEATURES.md`` § "Complex-field evaluation". + + +2026.05.7 — Round-trip fidelity and performance fixes ++++++++++++++++++++++++++++++++++++++++++++++++++++++ + +Released: 2026-05-05 + +Reproducible-save fidelity +~~~~~~~~~~~~~~~~~~~~~~~~~~ + +- ``Document.save(..., reproducible=True)`` no longer mints + ``w:rsidR`` / ``w:rsidRDefault`` on paragraphs and runs that don't + already carry them (#168). Those attributes are session-scoped + churn markers; synthesising them from a constant-valued root made + the output reproducible but *not* faithful — round-tripping + ``bold-text.office.docx`` gained a spurious ``w:rsidR`` on its + single ````. ``w14:paraId`` / ``w14:textId`` continue to be + derived deterministically from paragraph content so repeated saves + remain byte-identical. + +Default template rebuild +~~~~~~~~~~~~~~~~~~~~~~~~ + +- ``src/docx/templates/default.docx`` has been rebuilt from the + ``default-docx-template/`` source tree (#169) so a fresh + ``Document()`` exposes the Word-2024 namespace set (``w15``, + ``w16``, ``w16cex``, ``w16cid``, ``w16du``, ``w16sdtdh``, + ``w16sdtfl``, ``w16se``, ``cx``–``cx8``, ``aink``, ``am3d``, + ``oel``) plus the matching ``mc:Ignorable`` list. The unzipped + tree was updated in 2026.05.2 but the zipped blob was not + regenerated — ``Document()`` was still loading the pre-2026.05.2 + namespace set at runtime. 
+- New ``scripts/rebuild_default_template.py`` deterministically + rebuilds the zipped blob from the source tree so future template + edits cannot drift out of sync silently. + +Narrow part-drop heuristics to preserve Word-authored data +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +The 2026.05.4 "word-mimicry phase 3" release introduced aggressive +drop heuristics that silently destroyed optional parts from +Word-authored files on round-trip (#167). This release narrows the +policy so parts that shipped in the source package are preserved +verbatim — dropping happens only when python-docx itself created +the part. + +- ``Unmarshaller._unmarshal_parts`` now flags every part it loads with + ``_loaded_from_package = True``. Save-time heuristics consult this + flag and preserve any part that shipped in the source package, + regardless of whether python-docx can statically prove it is + referenced. +- **``word/stylesWithEffects.xml``** — was dropped unconditionally. + Now dropped only when python-docx created the part itself (it never + does today, but the policy is symmetric with the others). +- **``customXml/*``** — was dropped whenever no ```` was + present. That false-negatived on customXml used by Power BI, + bibliography sources, and Office Add-in backing data. Now preserved + whenever the source package shipped it. +- **``docProps/thumbnail.jpeg``** — was dropped unconditionally at + the package level. Now preserved whenever the source package + shipped it. Library-authored documents still skip the thumbnail. +- **``word/numbering.xml``** — the style-indirect heuristic now walks + the ``w:basedOn`` chain when resolving which styles declare + ````, catching user-defined styles rooted in a numbering + style (the common "My Bullet → List Bullet" pattern). Dropped only + when python-docx authored the part and the document uses no + numbering at all. 
+ +Found by W5-A / W5-E / W6-A audits: every Word-authored corpus fixture +round-tripped through the 2026.05.4 drop heuristics lost at least one +of these four parts. + +CustomProperties accepts datetime.date (vt:date) +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +- ``CustomProperties`` now accepts ``datetime.date`` values (distinct + from ``datetime.datetime``) and serialises them as ``vt:date`` + (ISO-8601 ``YYYY-MM-DD``) per ECMA-376 Part 1 §22.4.2.7 (#173). + On read a ``vt:date`` element deserialises back to a plain + ``datetime.date``; ``datetime.datetime`` values continue to + round-trip as ``vt:filetime`` (ISO-8601 with trailing ``Z``). +- Surfaced by Wave 3-B: only ``python-xlsx`` previously mapped + ``date`` to ``vt:date``; ``python-docx`` and ``python-pptx`` only + recognised ``datetime``. + +O(N^2) indexing on _Rows[i] and BlockItemContainer.paragraphs[i] +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +- ``_Rows.__getitem__`` no longer constructs a ``_Row`` proxy for + every row in the table on each access (#170). It now reads the + single requested ```` out of ``self._tbl.tr_lst`` and wraps + only that element, dropping a naive ``for i in range(N): rows[i]`` + loop from ~1.46 ms/access to ~0.54 ms/access at N = 2000. +- ``BlockItemContainer.paragraphs`` now returns a lightweight + ``_ParagraphsView(Sequence[Paragraph])`` that memoises the + underlying ``p_lst`` on first access and wraps only the ```` + the caller requests. The view supports ``len()``, indexed and + sliced access, iteration, ``list(...)`` coercion, ``in``, + ``.index(…)``, truthiness, and equality against a + ``list[Paragraph]``. +- New ``tests/test_indexing_perf.py`` enforces a < 1 ms/access + ceiling at N = 5000 (paragraphs) / N = 2000 (rows). + + +2026.05.6 — Section.vertical_alignment property +++++++++++++++++++++++++++++++++++++++++++++++++ + +Released: 2026-05-05 + +- Add ``Section.vertical_alignment`` property + setter. 
+- Add ``WD_VERTICAL_ALIGNMENT`` enum (``TOP`` / ``CENTER`` / ``BOTH`` + / ``BOTTOM``) mapping to OOXML ``ST_VerticalJc``. +- Plumbed through ``CT_SectPr.vAlign``, following the existing + ``Section.orientation`` pattern. +- 12 parametrised unit tests in ``tests/test_section.py``. + +Surfaced by the ``docx/vertical-alignment`` parameterised family in +``loadfix/ooxml-reference-corpus`` — section-level cases previously +required ``OxmlElement("w:vAlign")`` fallback. + + +2026.05.5 — Document.add_comment accepts date= +++++++++++++++++++++++++++++++++++++++++++++++ + +Released: 2026-05-04 + +- ``Document.add_comment()`` now forwards an optional + ``date: datetime`` kwarg to the underlying comments collection, + mirroring ``Comments.add_comment(date=...)``. + + +2026.05.4 — Word-mimicry phase 3: omit unused optional parts +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ + +Released: 2026-05-04 + +python-docx now omits unused optional parts on save, matching Word's +"emit the minimum" behaviour for library-authored files. The default +template still carries these parts — they are pruned at save time +only when the document doesn't actually reference them. + +- **`word/numbering.xml`** — dropped unless the document uses numbering + directly (a paragraph with ````) or via a numbering-bearing + style (``List Bullet``, ``List Number``, etc.). The check reads + ``styles.xml`` to resolve style→numPr links. +- **`word/stylesWithEffects.xml`** — dropped unconditionally. This is + a Word 2013-compat duplicate of ``styles.xml``; python-docx never + produces effect-style content. +- **``customXml/``** items — dropped unless a content control's + ```` references custom XML. +- **``docProps/thumbnail.jpeg``** — dropped unconditionally at the + package level. python-docx has no renderer, so any thumbnail it + ships would be stale. 
+ +Rel removal happens in the before_marshal hook (for document-rooted +parts) and at package save (for the package-level thumbnail rel), +which cascades automatically: ``[Content_Types].xml``, ``_rels/.rels``, +and ``word/_rels/document.xml.rels`` all rebuild from the pruned +rels graph without additional bookkeeping. + +Concrete result on the corpus bold-text feature: the machine-generated +fixture now ships exactly the same 11 parts as the Word-authored +companion. The three-way diff's "only in machine" column is empty for +the simple-text feature pack; residual ``word/document.xml`` +differences are only the locale-default page size / margins (A4 vs +US Letter, by design out of scope). + +Full suite: 5004 pass / 6 skip. Corpus conformance: 5/5 pass. + + +2026.05.3 — Word-mimicry phase 2: paragraph-mark format mirror ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ + +Released: 2026-05-04 + +- ``DocumentPart.before_marshal()`` now also mirrors a single-run + paragraph's ``<w:rPr>`` formatting onto the paragraph mark via + ``<w:pPr><w:rPr>``. This matches Word's "keep typing in bold" + convention: when a paragraph ends in a bold/italic/coloured run, + the paragraph mark inherits that formatting so text typed past the + end continues in the same shape. +- Mirrored properties: b, bCs, i, iCs, u, strike, dstrike, caps, + smallCaps, color, sz, szCs, rFonts, vertAlign. Explicitly excludes + lang, spacing, border, shading — Word does not mirror these onto + paragraph marks. +- Only applied to paragraphs that have exactly one direct ``<w:r>`` + child (the common one-run-per-paragraph case). Multi-run and + hyperlinked paragraphs are left alone to avoid surprising behaviour. +- Existing ``<w:pPr><w:rPr>`` content is preserved; only missing + mirror properties are added. 
+ + +2026.05.2 — Word-mimicry phase 1: namespace decls, paraId, rsid +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ + +Released: 2026-05-04 + +Narrow the XML python-docx emits toward the shape Microsoft Word itself +writes, so loadfix/ooxml-reference-corpus three-way diffs surface real +semantic differences instead of tooling-version noise. + +- The default ``word/document.xml`` template now carries the full + namespace set Word 2024 declares (cx, cx1-cx8, aink, am3d, oel, w15, + w16, w16cid, w16cex, w16se, w16du, w16sdtdh, w16sdtfl) plus the + matching ``mc:Ignorable`` list. +- New ``DocumentPart.before_marshal()`` hook stamps Word-style + identifiers on every paragraph that lacks them just before + serialization: ``w14:paraId``, ``w14:textId``, ``w:rsidR``, + ``w:rsidRDefault``. Runs get ``w:rsidR``. A session-wide + ``w:rsidRoot`` is generated per save call and recorded in + ``word/settings.xml``'s ``<w:rsids>`` table via the new + ``Settings.add_rsids()`` method. +- Existing identifiers are preserved on round-trip; only missing ones + are minted. +- Reproducible-save mode (``Document.save(..., reproducible=True)``) + derives identifiers deterministically from paragraph content, so + repeated saves of the same document remain byte-identical. + +Why: diffing python-docx output against Word-authored reference files +previously showed hundreds of lines of rsid/paraId/namespace churn +that obscured real bold/italic/layout differences. Post-fix, the noise +collapses and only behavioural divergences remain visible. + + +2026.05.1 — bCs/iCs correctness fix +++++++++++++++++++++++++++++++++++++ + +Released: 2026-05-04 + +- Fix: setting ``font.bold = True`` now also emits ``<w:bCs/>`` + (complex-script bold); setting ``font.italic = True`` emits + ``<w:iCs/>``. Previously only ``<w:b/>`` / ``<w:i/>`` were emitted, + which silently dropped bold/italic on Arabic, Hebrew, and Thai runs + when Word reopened the file. Mirrors the behavior Word itself writes. 
+ Surfaced by the three-way comparison pipeline in + ``loadfix/ooxml-reference-corpus/features/docx/bold-text.json``. + + The ``cs_bold`` / ``cs_italic`` properties continue to work + independently; callers that need divergent values can still set them + explicitly after setting bold/italic. + + +2026.05.0 — first release as independent fork ++++++++++++++++++++++++++++++++++++++++++++++ + +Released: 2026-05-02 + +This release marks the project's split from upstream +``python-openxml/python-docx``. Versioning switches to CalVer +(YYYY.MM.patch) from this point forward. The previous upstream line +stops at ``1.2.0`` (2025-06-16); everything below is new to this fork. + +All 100+ features below shipped as part of this initial independent +release. Subsequent CalVer releases will have their own entries. + +Phase A — Footnotes and endnotes + - Add Document.footnotes and Footnotes / Footnote / FootnoteProperties (#1, #3, #17, #46, #48, #56, #82) + - Add Document.endnotes mirror API (#17, #96) + - Add Section.footnote_properties / endnote_properties (#17) + +Phase B — Tracked changes + - Add read of tracked insertions and deletions (#53) + - Add accept / reject tracked changes (#7) + - Add read of formatting changes (#8) + - Add move revisions (w:moveFrom / w:moveTo) (#134) + - Add cell and row-level tracked changes (#135) + - Add revision_marks_text() for CLI previews (#163) + +Phase C — Bookmarks and fields + - Add bookmarks create / read / delete (#52) + - Add simple and complex field codes (#10) + - Add REF / PAGEREF cross-reference resolution (#115) + +Phase D — Miscellaneous OOXML feature coverage + - D.1 Hyperlink creation API (#97) + - D.2 Comment replies (threaded) (#67) + - D.3 Extended document settings + DocumentProtection (#66, #125) + - D.4 Custom document properties (#14) + - D.6 Cell shading and background color (#63) + - D.7 Paragraph borders (#109) + - D.9 Numbering style control (#22) + - D.10 Search and replace with formatting preservation (#91) + - 
D.13 Insert paragraph / table at arbitrary position (#26) + - D.14 Content controls (SDTs) (#27) + - D.15 Row.height setter (#28) + - D.16 Row.allow_break_across_pages (#51) + - D.17 Floating images with wp:anchor positioning (#30) + - D.19 Multi-column section layout (#60) + - D.20 Font.shading — run-level background color (#33) + - D.22 SVG image support (#76) + - D.23 Watermark support (text and image) (#36) + - D.24 .docm macro-enabled file support (#65) + - D.26 Table autofit and column-width control (#39) + - D.27 DrawingML shapes and text-box content access (#75) + +Other feature additions + - Charts read + add_chart() (#111) + - SmartArt detection and node text (#112) + - Equation read + minimal create API (#113) + - Add Run.add_symbol and Run.symbols (#114) + - Add Section.page_borders (#121) + - Add Section.line_numbering (#122) + - Add Section.document_grid (#147) + - Add Section.first_page / other_pages_paper_source (#146) + - Add Section.text_direction / right_to_left (#148) + - Add Section odd/even page header-footer (#149) + - Add Font.border_* properties (#120) + - Add Font.language / east_asian_language / bidi_language (#160) + - Add East Asian typography (kinsoku, word_wrap, east_asian_layout) (#128) + - Add RTL / bidi on Paragraph and Run (#127) + - Add paragraph_format.frame for text frames (#126) + - Add ParagraphBorders / Border (#109) + - Add read-only ruby (#129) + - Add read-only ink (#139) + - Add read-only embedded OLE objects (#140) + - Add read-only grouped shapes (#138) + - Add read-only SmartArt (#112) + - Add read-only Document.glossary (#132, #133) + - Add read-only Document.theme (#117) + - Add read-only Document.web_settings (#157) + - Add Document.font_table (#119) + - Add Document.background_color (#118) + - Add Document.statistics (#161) + - Add Document.search_regex / replace_regex / search_all / replace_all (#153, #154) + - Add Document.add_table_of_contents (#116) + - Add caption helpers (#141) + - Add permission ranges 
(#124) + - Add Settings.mail_merge (#130) + - Add Settings.compat_flags / compat_settings (#156) + - Add Settings.view (#164) + - Add Style.link_style / next_style / is_redefined (#162) + - Add Table.borders / _Cell.borders (#102) + - Add Cell.margins (#143) + - Add Table.style_flags (#144) + - Add Cell.text_direction (#142) + - Add Cell.is_merge_origin / merge_origin (#145) + - Add _Row.is_header (#93) + - Add Run.split (#94) + - Add Paragraph.delete / Run.delete / Table.delete (#50) + - Add alt_text / title on InlineShape and FloatingImage (#158) + - Add stable_id on Paragraph / Run / Table / Cell (#155) + - Add Paragraph.insert_paragraph_before arbitrary positioning (#26) + - Add legacy form fields (#123) + - Add heading-structure accessibility validator (#159) + +Reliability / safety + - Add recover=True mode for malformed .docx (#151) + - Add EncryptedDocumentError for password-protected .docx (#152) + - Add digital signature detection (#150) + +Dev / tooling + - Add py.typed, improve public types + - Add AI-agent CI pipeline (Product / Develop / Review / Security / Revise + / Merge / Debug / Watchdog) + - Add interop-validate behave scenarios wiring loadfix/ooxml-validate as a round-trip fidelity check. + + 1.2.0 (2025-06-16) ++++++++++++++++++ diff --git a/MANIFEST.in b/MANIFEST.in index b2d3fadcf..b96d7c09d 100644 --- a/MANIFEST.in +++ b/MANIFEST.in @@ -1,5 +1,4 @@ -include HISTORY.rst LICENSE README.rst tox.ini -include requirements*.txt +include HISTORY.rst LICENSE README.md tox.ini graft src/docx/templates graft features graft tests diff --git a/Makefile b/Makefile index 2b2fb4121..979ea18e7 100644 --- a/Makefile +++ b/Makefile @@ -29,8 +29,7 @@ build: uv build clean: - # find . -type f -name \*.pyc -exec rm {} \; - fd -e pyc -I -x rm + find . -type f -name '*.pyc' -delete rm -rf dist *.egg-info .coverage .DS_Store cleandocs: @@ -43,7 +42,7 @@ docs: $(MAKE) -C docs html install: - pip install -Ue . + uv pip install -e . 
opendocs: open docs/.build/html/index.html diff --git a/README.md b/README.md index c35cf0200..f232cc340 100644 --- a/README.md +++ b/README.md @@ -1,26 +1,139 @@ # python-docx -*python-docx* is a Python library for reading, creating, and updating Microsoft Word 2007+ (.docx) files. +A Python library for reading, creating, and updating Microsoft Word +2007+ (`.docx`) files. + +This repository is a fork of [python-docx](https://github.com/python-openxml/python-docx) +by Steve Canny. It builds on their original work by extending coverage +to 100+ additional OOXML features — footnotes and endnotes, tracked +changes, bookmarks, fields, content controls, charts, equations, +SmartArt, watermarks, digital signatures, accessibility tooling, and +cross-document operations. Forked at upstream `1.2.0` (2025-06-16). +Credit for the foundational library goes to the original author. ## Installation ``` -pip install python-docx +pip install git+https://github.com/loadfix/python-docx.git ``` -## Example +Requires Python 3.9+. + +Not yet published to PyPI. Install from source only. + +## Usage ```python ->>> from docx import Document +from docx import Document + +document = Document() +document.add_paragraph("It was a dark and stormy night.") +document.save("dark-and-stormy.docx") + +document = Document("dark-and-stormy.docx") +print(document.paragraphs[0].text) +# It was a dark and stormy night. +``` + +The package is imported as `docx`, matching upstream. Existing +upstream code runs unchanged against this fork. + +## API + +See [`FEATURES.md`](FEATURES.md) for the full catalogue — 43 sections +covering every public capability, with fork additions marked +`[Added in 2026.05.0]`. 
+ +Summary of areas extended beyond upstream `1.2.0`: + +- Footnotes, endnotes, and their numbering properties +- Tracked changes (read, accept, reject, insertions, deletions, moves, + formatting changes, cell/row changes, revision IDs) +- Bookmarks (create, read, delete, cross-paragraph) +- Fields (simple, complex, REF/PAGEREF cross-references, DOCPROPERTY + resolution, table of contents, list of figures/tables) +- Content controls (SDTs: rich text, plain text, date, checkbox, combo, + dropdown, picture; custom XML data binding) +- Bibliography and citations (`Document.bibliography`, + `Document.add_citation`, `Paragraph.add_citation_reference` — backed by + the `customXml/item{N}.xml` + `itemProps{N}.xml` part pair) +- Form fields (text input, checkbox, dropdown) +- Charts (read + create for bar/line/pie; `Chart.replace_data()`) +- SmartArt (read + create for list/cycle/process layout families) +- Equations (OMML read + builders for identifier, fraction, superscript, + subscript, radical) +- Watermarks, captions, ink annotations, embedded OLE objects, alt-chunks +- Tables (borders, shading, margins, autofit, merged-cell helpers, style + flags, caption/description, indent, row height, header rows, + cross-document copy, CRUD on rows/columns/cells) +- Sections (page borders, line numbering, document grid, paper source, + columns, text direction, odd/even and first-page header/footer, + copy between sections) +- Images (PNG/JPEG/GIF/BMP/TIFF/SVG/WebP/EMF/WMF/EPS; linked, floating, + outline, crop, opacity, shadow, alt text, delete, replace) +- Shapes (preset DrawingML shapes, text boxes, canvas) +- Numbering (custom definitions, restart, rendered list labels) +- Styles (cross-document import, builtin latent materialisation, + document-default font, next-paragraph auto-apply) +- Fonts (cs size, character scale, ligatures, shading, borders, + language, East Asian layout, symbols, ruby) +- Accessibility (alt text, heading-structure validation) +- Search and replace 
(plain, regex, across tables/headers/footers/footnotes) +- Cross-document operations (`append_document`, `add_table_copy`, + `copy_header_from`) +- Packaging (`.dotx` / `.dotm` templates, Strict OOXML translation, + Flat-OPC read/write, reproducible save, `huge_tree` opt-in, recover + mode, password-protected read/write via optional + `python-ooxml-crypto`, `os.PathLike` support) +- Settings and metadata (compat flags, view, mail merge, + `Document.extended_properties`, doc vars, page stats, spell/grammar + toggles, auto-hyphenation, timezone-aware comments) +- Themes, web settings, font table (with font embedding), glossary, + digital-signature detection ->>> document = Document() ->>> document.add_paragraph("It was a dark and stormy night.") - ->>> document.save("dark-and-stormy.docx") +API and user-guide documentation lives under `docs/` and builds with +Sphinx. The theme is Furo. ->>> document = Document("dark-and-stormy.docx") ->>> document.paragraphs[0].text -'It was a dark and stormy night.' ``` +pip install Sphinx furo +python -m sphinx -b html docs docs/_build/html +``` + +## Status + +Unstable. Not yet published to PyPI. Current version: `2026.05.10` +(the first release as an independent fork was `2026.05.0`). Versioning is CalVer +(`YYYY.MM.patch`). Public API tracks upstream `1.2.0` for the +inherited surface; fork additions are considered experimental until +the next calendar release. + +## Contributing + +Issues and pull requests are tracked at +<https://github.com/loadfix/python-docx/issues>. Please file issues +against this fork; upstream's tracker is for upstream-shared concerns +only. + +When contributing: + +- Run the tests: `pytest tests/ -q` and `uv run behave features/`. +- Keep `FEATURES.md` current when adding, modifying, or removing public + API (see `CLAUDE.md` for contributor conventions). +- Consult `spec/` (XSD schemas and the ISO/IEC 29500 PDFs) for + authoritative element ordering and cardinality when implementing new + `CT_*` classes. + +## License + +MIT. See `LICENSE`. 
Inherited from upstream `python-openxml/python-docx`. + +## Related projects + +Part of a family of document-rendering libraries: -More information is available in the [python-docx documentation](https://python-docx.readthedocs.org/en/latest/) +- [docxjs](https://github.com/loadfix/docxjs) — browser-side DOCX → HTML renderer (TypeScript) +- [pptxjs](https://github.com/loadfix/pptxjs) — browser-side PPTX → HTML renderer (TypeScript) +- [xlsxjs](https://github.com/loadfix/xlsxjs) — browser-side XLSX → HTML renderer (TypeScript) +- [python-pptx](https://github.com/loadfix/python-pptx) — Python PPTX parser/generator +- [python-xlsx](https://github.com/loadfix/python-xlsx) — Python XLSX parser/generator +- [ooxml-validate](https://github.com/loadfix/ooxml-validate) — Python/.NET OOXML validator (wraps Microsoft Open XML SDK + LibreOffice) diff --git a/TODO.md b/TODO.md new file mode 100644 index 000000000..2b1797859 --- /dev/null +++ b/TODO.md @@ -0,0 +1,119 @@ +# TODO + +Fork-specific feature backlog across the loadfix OOXML trio. Each item is +a candidate for a future implementation wave. Grouped by repo. + +## Audit findings 2026-05-05 + +- [ ] **Remove shipped `Section.vertical_alignment` entry from "Conformance gaps".** Already resolved by commit `1657c0ef`; move to a "Resolved" block or delete the stale open entry below. +- [x] **Close GitHub issue #171 (vt:date custom-properties).** Round-trip verified end-to-end in 2026.05.9; added `DescribeCustomProperties_RoundTrip` regression test covering the `Document` → `custom.xml` → reload pipeline. Issue closed. +- [x] **Close GitHub issue #172 (bare `KeyError('[Content_Types].xml')` on missing part).** Wrapped at the narrowest scope in `PackageReader.from_file` as `PackageNotFoundError` in 2026.05.9. Corpus manifest `malformed-content-types-missing` now passes. Issue closed. +- [x] **Bump README "Current version" string.** Currently reads `2026.05.0`; should be `2026.05.8`. 
+- [x] **Git-tag all untagged releases.** Only `v2026.05.0` has a git tag; `v2026.05.1` through `v2026.05.8` have HISTORY.rst entries but no git tags. Add all 8 missing tags. +- [x] **Land W11-D `UPSTREAM_SYNC.md` onto master.** Commit `d2d5cdcf` lives only on branch `feat/w11-d-upstream-sync`; merged to master via merge commit `721b7753` so upstream divergence is documented on the canonical tree. +- [x] **Seal submodule oxml leakage.** Added explicit `__all__` to 12 public submodules (`docx.table`, `section`, `bookmarks`, `blkcntnr`, `dml.color`, `drawing`, `equations`, `styles.styles`, `styles.style`, `text.paragraph`, `text.run`, `text.pagebreak`) in 2026.05.9. The `docx.oxml.*` subpackage is deliberately left alone; it's internal by convention and does not re-export via star. +- [x] **Fix dev-extras portability.** `pyproject.toml` declared `ooxml-validate @ file:///home/ben/code/ooxml-validate`, a host-specific absolute path. Moved out of `[dev]` into a new opt-in `[conformance]` extra pointing at the GitHub VCS URL so `pip install -e '.[dev]'` works for external contributors. +- [x] **Delete obsolete `.travis.yml`.** 243 bytes of dead config; `.github/` was deliberately removed. +- [x] **Move scratch audit artefacts to `audits/`.** DOCS_AUDIT.md (49KB), DOCS_SIBLING_AUDIT.md (47KB), FEATURES_AUDIT.md (42KB), TEST_AUDIT.md (30KB), INTEROP_REPORT.md (17KB), SCALE_NOTES.md (5KB), real-world-audit-findings.md (11KB) accumulating at repo root — move to an `audits/` subdirectory or delete after resolution. +- [x] **Prune merged remote branches.** ~14 feature branches on origin (`feat/w10-*`, `fix/w8-*`, `chore/overnight-*`, `worktree-agent-*`) merged but not deleted. +- [x] **Fix API-addition version markers in FEATURES.md.** Many entries uniformly say `[Added in 2026.05.0]` regardless of the release that actually introduced them; audit and correct to reflect .1–.8 where appropriate. 
+ +## docx + +_No open docx items — bibliography and complex-field evaluation both +shipped in 2026.05.8 (see "Completed items" below)._ + +## pptx + +- **Transitions authoring.** Read-side exists; no API to set a + transition type on a slide (fade/push/wipe/etc.) programmatically. +- **Animation timelines.** `timing.xml` is the hardest OOXML format + to author. Read support partial; no write support. +- **Slide master cascade edit.** Edit layout + master theme inheritance + and propagate to slides. + +## xlsx + +- **Formula evaluation.** Workbook reads the string `=SUM(A1:A10)` but + does not evaluate it. Need a minimal calc engine covering the common + ~50 functions. +- **Pivot table create.** Read-side exists via `pivot/builder.py`; + writer is scaffold-only. Complete the builder so new pivots can be + authored. +- **Conditional formatting writer.** Read exists; create API partial + (rule.py added accessors but limited coverage). + +## Cross-series + +- **MS Word / PowerPoint / Excel interop testing.** Test each library + against a corpus of real-world files authored by Office. Save-reload- + save cycles to detect fidelity loss. Particular attention to charts, + equations, images, and content that uses `mc:AlternateContent`. +- **Periodic upstream sync.** Both `scanny/*` and `openpyxl` keep + moving. Decide a cadence for pulling in upstream bug fixes without + reverting fork additions. + +--- + +## Completed items (for reference) + +All 2026.05.0 feature work. See `HISTORY.rst` and `FEATURES.md` for the +shipped surface. + +### Performance fixes + +- **W11-A: O(N^2) indexing on `_Rows[i]` and `Document.paragraphs[i]`** + (closed 2026-05-05 on `fix/w11-a-indexing-perf`). `_Rows.__getitem__` + and `BlockItemContainer.paragraphs` both materialised the entire + child list on every call; a naive indexed loop was O(N^2). 
Replaced + `_Rows.__getitem__` with direct `tr_lst[idx]` access and replaced + `BlockItemContainer.paragraphs` with a lazy + `_ParagraphsView(Sequence[Paragraph])` that memoises `p_lst` on + first access. Cached-idiom access dropped from ~1.53 ms/access to + ~0.0007 ms/access at N=5 000 paragraphs. See `SCALE_NOTES.md` for + methodology and post-fix numbers. + +### Authoring features (2026.05.8) + +- **Bibliography / citation support.** `Document.bibliography` + (read + write), `Document.add_citation(tag, ...)`, `Paragraph.add_citation_reference(tag)`, + and the backing `/customXml/item{N}.xml` part with a `<b:Sources>` root plus a + sibling `itemProps{N}.xml`. See `FEATURES.md` § "Bibliography and citations". +- **SmartArt authoring.** `Document.add_smart_art(layout_name)` and + `SmartArt.add_node(text)` with three built-in layouts (list, cycle, + process). See `FEATURES.md` § "SmartArt". +- **Complex-field evaluation.** `Field.evaluate(context)` and + `Document.evaluate_fields(context)` now evaluate `IF` (with nested + `{MERGEFIELD}`), `MERGEFIELD`, `HYPERLINK`, `= <expression>` arithmetic + formulas, and the runtime-dynamic `PAGE` / `NUMPAGES` / `DATE` / + `TIME` placeholders. Deferred: string-function formulas (`=SUM()`, + `=AVERAGE()`, etc. beyond arithmetic), nested `IF`, `QUOTE`, `FILLIN`, + and the full date-picture/numeric-format switch grammar. + +--- + +## Conformance gaps (auto-filed from corpus 2026-05-04 overnight run) + +The 950-case OOXML reference corpus +(`loadfix/ooxml-reference-corpus` built against python-docx at +`d75cfc7`) surfaced one authoring-side API gap. Linked to the driving +corpus manifest on GitHub with an actionable fix hypothesis. + +- **`Section.vertical_alignment` property is missing.** Driving + manifest: + [features/docx/vertical-alignment.json](https://github.com/loadfix/ooxml-validate/blob/master/features/docx/vertical-alignment.json) + (P25 finding). 
Authoring a section with vertical alignment other + than the `top` default currently requires falling back to raw + `OxmlElement("w:vAlign")` access on `Section._sectPr` because the + `Section` proxy (in `src/docx/section.py`) exposes no accessor for + `w:vAlign`. Fix: add a `Section.vertical_alignment` property with + getter+setter backed by a new `WD_SECTION_VERTICAL_ALIGNMENT` enum + (values `TOP=0`, `CENTER=1`, `BOTH=2`, `BOTTOM=3`, mapping to XML + `top` / `center` / `both` / `bottom` respectively; see + `spec/xsd/wml.xsd` `ST_VerticalJc`). Plumb it through `CT_SectPr` + in `src/docx/oxml/section.py` as a `ZeroOrOne("w:vAlign", + successors=(...))` following the existing successor-ordering + pattern used by siblings like `titlePg` / `docGrid`. Add unit tests + under `tests/unit/test_section.py` following the existing + `Describe*` / `it_*` BDD convention, a behave scenario under + `features/sct-*.feature`, and the FEATURES.md entry. diff --git a/UPSTREAM_SYNC.md b/UPSTREAM_SYNC.md new file mode 100644 index 000000000..34110cdae --- /dev/null +++ b/UPSTREAM_SYNC.md @@ -0,0 +1,58 @@ +# Upstream sync catalogue — python-docx + +Governed by ADR 004 in the +[ooxml-reference-corpus](https://github.com/loadfix/ooxml-reference-corpus/blob/master/docs/adr/004-upstream-sync.md) +repo. See that ADR for cadence, tooling, and disposition vocabulary. + +## Tracking + +- **Upstream project.** `python-openxml/python-docx` + (`https://github.com/python-openxml/python-docx.git`). + Note: the original upstream `scanny/python-docx` has been deleted; + `python-openxml/python-docx` is the surviving canonical repo and + carries the `v1.2.0` tag. +- **Fork baseline tag.** `v1.2.0` +- **Fork baseline SHA.** `e45454602b53e8e572b179ccf1c91093ec9f4ed7` + (upstream `master` HEAD as of 2026-05-05; the release commit subject + is `release: prepare v1.2.0 release`, dated 2025-06-16). +- **Fork divergent commit count.** 246 (fork additions on top of the + baseline, as of 2026-05-05). 
+- **Upstream remote** is **not** configured in a default clone. + Maintainers add it on demand during a sweep: + ```bash + git remote add upstream https://github.com/python-openxml/python-docx.git + git fetch upstream + git log --no-merges --oneline e4545460..upstream/master + ``` + +## Sync status (sweep: 2026-05-05) + +As of this sweep `upstream/master` is **at the baseline SHA** — no +new commits have landed upstream since v1.2.0. There is nothing to +evaluate and nothing to pull. + +Upstream activity since baseline: + +| short-sha | date | subject | tier | disposition | rationale | +|------------|------------|---------------------------------------|--------|-------------|-----------| +| *(none)* | — | — | — | — | `upstream/master` == fork baseline SHA | + +Non-master upstream branches observed (not pulled): + +- `feature/bookmarks` — experimental; fork has its own bookmark + authoring surface (Phase D). Disposition: `blocked-by-fork-divergence`. +- `feature/header` — experimental; fork ships header/footer APIs. + Disposition: `blocked-by-fork-divergence`. + +## Next sweep due + +**2026-08-03** (first Monday of August 2026). + +If the upstream repository continues to move away from `scanny` in +this period, re-verify the canonical URL at the start of the sweep +(GitHub redirects are not stable for long-dead forks). + +## History + +- 2026-05-05 — initial catalogue created by Wave 11-D. Baseline + confirmed; zero upstream divergence. diff --git a/audits/DOCS_AUDIT.md b/audits/DOCS_AUDIT.md new file mode 100644 index 000000000..94388f717 --- /dev/null +++ b/audits/DOCS_AUDIT.md @@ -0,0 +1,967 @@ +# Sphinx Documentation Audit + +This report surveys the state of the Sphinx-based reference docs under `docs/` +against the current `loadfix/python-docx` source tree at commit `50c2078` +(`master`). It is a companion to `TEST_AUDIT.md`. 
The goal is to document what +is currently shipped, what has drifted since upstream, and to give a +prioritised punch-list of concrete follow-ups. + +This report is **advisory only** — no `.rst`, `.py`, or workflow files were +modified while producing it. + +--- + +## 1. Summary + +- `docs/` contains **75 reStructuredText files, 12 005 lines total** + (`find docs -name '*.rst' | xargs wc -l`). Of those: + - 11 API-reference pages under `docs/api/` (538 lines, `document.rst` the + biggest at 117) + - 16 enum pages + 1 index under `docs/api/enum/` (982 lines) + - 12 user-guide pages under `docs/user/` (2337 lines) + - 36 developer analysis pages under `docs/dev/analysis/` (8148 lines) — + these are pre-existing XSD / feature analyses, largely untouched since + upstream. +- Build system: **Sphinx with the `armstrong` HTML theme** vendored under + `docs/_themes/armstrong` (`docs/conf.py:236`). The CLAUDE.md note saying + the theme is `alabaster` is **incorrect** — `alabaster` appears only in + `requirements-docs.txt:3` as a pin that is never read because the active + theme is `armstrong`. +- Configuration: `docs/conf.py` — Sphinx 1.0-style, pinned to + `Sphinx==1.8.6` / `Jinja2==2.11.3` / `MarkupSafe==0.23` in + `requirements-docs.txt`. None of those install on Python ≥3.10 without + patches. +- Last meaningful doc-tree commit: **`4fbe1f6` "docs: add Comments docs" + (2025-06-11)**, five days before upstream's `1.2.0` release. There have + been **zero docs commits since the fork began shipping the Phase A / B / C + / D feature additions** in this fork (`git log --since=2025-06-12 -- + docs/` is empty). Every one of the ~55 new features (fork-specific + commits `#1`..`#165`) relies exclusively on docstrings for discovery. +- HISTORY.rst (repo root) stops at `1.2.0 (2025-06-16)`. It contains no + entries for any fork phase — Phase A, B, C, or D. + +### Top-3 findings + +1. 
**29 new proxy modules ship with zero `docs/api/*.rst` coverage.** Every + module listed in section 4.1 below (`accessibility`, `captions`, `chart`, + `content_controls`, `custom_properties`, `custom_xml`, `embedded_objects`, + `equations`, `fields`, `font_table`, `form_fields`, `glossary`, `ids`, + `ink`, `numbering`, `permissions`, `ruby`, `signatures`, `smart_art`, + `statistics`, `theme`, `toc`, `tracked_changes`, `watermark`, + `web_settings`, plus `bookmarks`, `footnotes`, `endnotes`, `search`) is + invisible to an autodoc build — they are neither imported by + `docs/api/*.rst` nor wired into `docs/index.rst`'s toctree. +2. **`docs/conf.py:69-203` `rst_epilog` is the single-largest source of + build errors.** A non-strict Sphinx 6 build emits **101 warnings**, **96 of + which are `Undefined substitution referenced:`** for 54 distinct + `|ClassName|` substitutions that were never added to `rst_epilog` when + the new proxy modules landed. These show up in almost every rendered + docstring — for example, 27 `Document.*` properties render "Undefined + substitution referenced" boxes instead of class cross-references. +3. **Zero user-guide narrative for any fork feature.** Of the thirty-plus + feature areas shipped in Phases A/B/C/D (tracked changes, footnotes, + endnotes, fields, bookmarks, content controls, form fields, numbering, + captions, TOC, watermarks, …), only one — comments — got a + `docs/user/*.rst` page. That page was delivered by upstream. + +### Sphinx build result + +A build with **Sphinx 6.2.1** (the oldest version that still installs on +Python 3.11) completes with exit-status 0 but emits 101 warnings when +`conf.py` is left untouched. The strict `-W` variant fails at the +configuration step because `intersphinx_mapping`'s value is in the +pre-Sphinx-1.0 format. Concrete numbers are in section 3. + +--- + +## 2. 
Docs layout + +| Path | Purpose | +|---|---| +| `docs/index.rst` (114 lines) | Landing page: the "What it can do" code sample and the three top-level toctrees (User Guide, API Documentation, Contributor Guide). | +| `docs/conf.py` (394 lines) | Sphinx config. Theme, extensions (`autodoc`, `intersphinx`, `todo`, `coverage`, `viewcode`), and the big `rst_epilog` substitutions block (lines 69-203). | +| `docs/user/*.rst` (12 files, 2337 lines) | Narrative user guide — quickstart, install, documents, tables, text, sections, headers/footers, api-concepts, styles-understanding, styles-using, comments, shapes. | +| `docs/api/*.rst` (10 top-level pages, 538 lines) | API reference. One page per major module group: `document`, `settings`, `style`, `text`, `table`, `section`, `comments`, `shape`, `dml`, `shared`. Typically `.. autoclass:: Foo :members:`. | +| `docs/api/enum/*.rst` (16 enum pages + `index.rst`, 982 lines) | Hand-written enum reference pages (title, alias, intro, flat list of members). Not autogenerated — `:ref:` targets in docstrings point at these. | +| `docs/dev/analysis/*.rst` (36 files, ~8100 lines) | XSD / feature analyses written during upstream development. Not user-facing; linked from the "Contributor Guide" toctree. | +| `docs/_themes/armstrong/` | Vendored HTML theme. A fork of the old "armstrong" sidebar theme. | +| `docs/_static/img/` | Four PNGs: `comment-parts.png`, `example-docx-01.png`, `hdrftr-01.png`, `hdrftr-02.png`. All four are referenced and present on disk (no broken `.. image::` links). | +| `docs/_templates/` | Empty (the directory is registered in `conf.py:42` but nothing lives there). | + +--- + +## 3. Build health + +### 3.1 Prerequisites + +`requirements-docs.txt` pins: + +``` +Sphinx==1.8.6 +Jinja2==2.11.3 +MarkupSafe==0.23 +alabaster<0.7.14 +-e . +``` + +None of these install on modern Python (`MarkupSafe==0.23` fails with +`ImportError: cannot import name 'Mapping' from 'collections'` on 3.10+). 
+On Read-the-Docs it builds because `.readthedocs.yaml` still targets +`python: "3.9"` — but a local `make html` on any modern checkout fails +before the first file is read. + +### 3.2 Build command and result + +Running with a modern Sphinx 6: + +``` +python -m sphinx -b html docs docs/_build/html +``` + +- **Exit status: 0** (build succeeded). +- **Warnings emitted: 101.** + +Strict mode (`-W`) fails immediately: + +``` +Failed to read intersphinx_mapping[http://docs.python.org/3/], ignored: +SphinxWarning('The pre-Sphinx 1.0 intersphinx_mapping format is deprecated +and will be removed in Sphinx 8. [...]') +``` + +Source of the problem: `docs/conf.py:393` + +```python +intersphinx_mapping = {"http://docs.python.org/3/": None} +``` + +The modern format is `intersphinx_mapping = {"python": ("http://docs.python.org/3/", None)}`. + +### 3.3 Warning breakdown + +| Count | Category | +|---:|---| +| 96 | `ERROR: Undefined substitution referenced:` — a `|SomeClass|` substitution used in a docstring that has no entry in `conf.py`'s `rst_epilog` (see 3.4 below) | +| 3 | `WARNING: undefined label:` — `wdtextdirection` (×2, `docx.section.Section.text_direction`, `docx.table._Cell.text_direction`), `wdborderstyle` (×1, `docx.text.run.Font.border_style`). No `docs/api/enum/WdTextDirection.rst` / `WdBorderStyle.rst` exist. | +| 1 | `ERROR: Unknown target name: "container"` — `docs/user/comments.rst:134` uses Markdown-style `_block-item container_` emphasis; Sphinx parses the second `_container_` as a reference target. | +| 1 | `WARNING: The pre-Sphinx 1.0 'intersphinx_mapping' format is deprecated` — `docs/conf.py:393` | + +### 3.4 First ten warnings (representative) + +``` +WARNING: The pre-Sphinx 1.0 'intersphinx_mapping' format is deprecated [...] 
+ERROR: Undefined substitution referenced: "EndnoteProperties" + (src/docx/document.py::Document.add_endnote_properties) +ERROR: Undefined substitution referenced: "FootnoteProperties" + (src/docx/document.py::Document.add_footnote_properties) +ERROR: Undefined substitution referenced: "Bookmarks" + (src/docx/document.py::Document.bookmarks) +ERROR: Undefined substitution referenced: "Chart" + (src/docx/document.py::Document.charts) +ERROR: Undefined substitution referenced: "ContentControl" + (src/docx/document.py::Document.content_controls) +ERROR: Undefined substitution referenced: "CustomProperties" + (src/docx/document.py::Document.custom_properties) +ERROR: Undefined substitution referenced: "CustomXmlPart" + (src/docx/document.py::Document.custom_xml_parts) +ERROR: Undefined substitution referenced: "EmbeddedObject" + (src/docx/document.py::Document.embedded_objects) +ERROR: Undefined substitution referenced: "Endnotes" + (src/docx/document.py::Document.endnotes) +``` + +### 3.5 Missing `|Name|` substitutions + +The full set of substitutions referenced in docstrings but not declared in +`conf.py`'s `rst_epilog` (alphabetical): + +``` +Bookmarks, CellBorders, CellMargins, CellShading, Chart, ContentControl, +CustomProperties, CustomXmlPart, DocumentGrid, DocumentStatistics, Drawing, +EastAsianLayout, EmbeddedObject, EndnoteProperties, Endnotes, Equation, +Field, FloatingImage, FontTable, FootnoteProperties, Footnotes, +FormattingChange, FormField, Glossary, HeadingIssue, InkAnnotation, Level, +LineNumbering, MailMerge, MoveRevision, Numbering, PageBorder, PageBorders, +ParagraphBorders, PermissionRange, RubyAnnotation, SearchMatch, +SectionColumns, SignatureInfo, SmartArt, Symbol, TableBorders, +TableStyleFlags, TextFrame, Theme, TrackedChange, Watermark, WD_ANCHOR_H, +WD_ANCHOR_V, WD_BORDER_STYLE, WD_TABLE_AUTOFIT, WD_VIEW, WD_WRAP_TYPE, +WebSettings +``` + +54 symbols total. Adding them to `rst_epilog` is a mechanical change — +one `.. 
|X| replace:: :class:`.X`` line each — and will immediately +retire 96 of the 101 build warnings. + +### 3.6 Broken / stale refs + +- `:ref:`wdtextdirection`` referenced from `Section.text_direction` and + `_Cell.text_direction` — no `docs/api/enum/WdTextDirection.rst` page. +- `:ref:`wdborderstyle`` referenced from `Font.border_style` — no + `docs/api/enum/WdBorderStyle.rst` page. +- `docs/user/comments.rst:134` uses Markdown-style italic + `_block-item container_`, which Sphinx interprets as a malformed + reference. (One-character RST fix: replace with `*block-item container*` + or quote it.) + +### 3.7 Duplicated labels + +None detected by this build. + +### 3.8 Cleanup + +`docs/_build/` was removed after the build completed. + +--- + +## 4. API reference gaps + +### 4.1 New proxy modules with **no** `docs/api/*.rst` page + +Every module below exists in `src/docx/` as of `50c2078`, but none is +referenced from any page under `docs/api/`. Verified with +`grep -rn 'docx\.<module>' docs/api/`, which returns zero matches for each. + +| Module | Issue(s) | Public surface | Suggested rst | Summary | +|---|---|---|---|---| +| `src/docx/accessibility.py` | #159 | `HeadingIssue` (class, ln 32); `validate_heading_structure` (fn, ln 64) | `docs/api/accessibility.rst` | Heading-structure validator: flags skipped levels, multiple `Heading 1`s, empty headings. | +| `src/docx/bookmarks.py` | #52, #82 | `Bookmarks` (ln 14), `Bookmark` (ln 42) | `docs/api/bookmarks.rst` | `w:bookmarkStart` / `w:bookmarkEnd` create / read / delete. | +| `src/docx/captions.py` | #141 | `new_caption_paragraph` (fn, ln 39) | `docs/api/captions.rst` | Builds a `SEQ`-field caption paragraph styled `Caption`. Module is function-only. | +| `src/docx/chart.py` | #111 | `WD_CHART_TYPE` (enum, ln 20), `ChartSeries` (ln 73), `Chart` (ln 96), `_chart_type_for` (private) | `docs/api/chart.rst` | Read + minimal create for embedded charts. 
| +| `src/docx/content_controls.py` | #27, #131 | `ContentControlType` (enum, ln 16), `ContentControl` (ln 68), `DataBinding` (ln 263), `new_sdt` (fn, ln 330) | `docs/api/content-controls.rst` | Structured document tags (rich-text, plain-text, date, checkbox, combo, dropdown, picture). | +| `src/docx/custom_properties.py` | #14, #82 | `CustomProperties` (ln 29) | `docs/api/custom-properties.rst` | `docProps/custom.xml` typed name/value pairs. | +| `src/docx/custom_xml.py` | #131 | `CustomXmlPart` (ln 33), `iter_custom_xml_parts` (fn, ln 129) | `docs/api/custom-xml.rst` | Custom XML data parts backing data-bound SDTs. | +| `src/docx/embedded_objects.py` | #140 | `EmbeddedObject` (ln 27) | `docs/api/embedded-objects.rst` | Read-only OLE object info (Excel, equations, …). | +| `src/docx/endnotes.py` | #17, #82, #96 | `Endnotes` (ln 19), `Endnote` (ln 58), `EndnoteProperties` (ln 139) | `docs/api/endnotes.rst` | Mirror of the footnotes API. | +| `src/docx/equations.py` | #113 | `Equation` (ln 36), `build_identifier`, `build_fraction`, `build_superscript`, `build_subscript`, `build_radical` (builder fns, ln 123-197) | `docs/api/equations.rst` | Read OMML; minimal build helpers. | +| `src/docx/fields.py` | #10, #115 | `WD_FIELD_TYPE` (ln 33), `Field` (ln 58) | `docs/api/fields.rst` | Simple + complex field codes; REF / PAGEREF resolution. | +| `src/docx/font_table.py` | #119 | `FontTable` (ln 22), `FontMetadata` (ln 68) | `docs/api/font-table.rst` | Read-only `word/fontTable.xml`. | +| `src/docx/footnotes.py` | #3, #17, #46, #48, #56, #82 | `Footnotes` (ln 19), `Footnote` (ln 58), `FootnoteProperties` (ln 139) | `docs/api/footnotes.rst` | High-level footnotes API. | +| `src/docx/form_fields.py` | #123 | `WD_FORM_FIELD_TYPE` (ln 38), `TextInputFormField` (ln 96), `CheckboxFormField` (ln 127), `DropdownFormField` (ln 151), `FormField` (ln 183) | `docs/api/form-fields.rst` | Legacy `w:ffData` form fields. 
| +| `src/docx/glossary.py` | #132, #133 | `Glossary` (ln 36), `BuildingBlock` (ln 154), `BuildingBlockCategory` (ln 243) | `docs/api/glossary.rst` | Read-only glossary document (AutoText / Quick Parts / cover pages). | +| `src/docx/ids.py` | #155 | `compute_stable_id` (fn, ln 56) — the API is `stable_id` on Paragraph, Run, Table, Cell | `docs/api/stable-ids.rst` | Pragmatic mostly-stable identifiers via `w:rsidR` + element-hash. | +| `src/docx/ink.py` | #139 | `InkAnnotation` (ln 25) | `docs/api/ink.rst` | Read-only ink / stylus annotations (InkML). | +| `src/docx/numbering.py` | #22, #82, #108 | `Numbering` (ln 119), `NumberingDefinition` (ln 221), `Level` (ln 269) | `docs/api/numbering.rst` | List-numbering control: restart, custom definitions, nested levels, `apply_to`. | +| `src/docx/permissions.py` | #124 | `PermissionRange` (ln 13) | `docs/api/permissions.rst` | `w:permStart` / `w:permEnd` ranges. | +| `src/docx/ruby.py` | #129 | `RubyAnnotation` (ln 13) | `docs/api/ruby.rst` | Read-only ruby (phonetic) annotations. | +| `src/docx/search.py` | #82, #91, #153, #154 | `SearchMatch` (ln 17), `search_paragraphs`, `replace_in_paragraphs`, `search_paragraphs_regex`, `replace_in_paragraphs_regex` (fns at ln 113, 147, 246, 281) | `docs/api/search.rst` | Text search / replace with regex + all-stories variants. | +| `src/docx/signatures.py` | #150 | `SignatureInfo` (ln 32) | `docs/api/signatures.rst` | Detection + minimal metadata for digital signatures (no verify). | +| `src/docx/smart_art.py` | #112 | `SmartArtNode` (ln 41), `SmartArt` (ln 85), `smart_art_for_drawing` (fn, ln 223) | `docs/api/smart-art.rst` | Read-only SmartArt diagram detection + node text. | +| `src/docx/statistics.py` | #161 | `DocumentStatistics` (NamedTuple, ln 28), `compute_statistics` (fn, ln 47) | `docs/api/statistics.rst` | Word / character / paragraph counts matching Word's dialog. 
| +| `src/docx/theme.py` | #117 | `Theme` (ln 48), `ThemeColors` (ln 89), `ThemeFonts` (ln 199) | `docs/api/theme.rst` | Read-only `word/theme/theme1.xml` access. | +| `src/docx/toc.py` | #116 | `build_toc_instruction` (fn, ln 103), `populate_toc_paragraph` (fn, ln 151) — public surface is `Document.add_table_of_contents()` | `docs/api/toc.rst` | TOC field builder. | +| `src/docx/tracked_changes.py` | #7, #8, #53, #134, #135, #163 | `TrackedChange` (ln 30), `MoveRevision` (ln 97), `FormattingChange` (ln 160) | `docs/api/tracked-changes.rst` | Insertions, deletions, moves, formatting changes; accept / reject; cell / row changes. | +| `src/docx/watermark.py` | #36 | `Watermark` (ln 17) | `docs/api/watermark.rst` | Read side of text / image watermarks. Section.add_text_watermark / add_image_watermark live on Section. | +| `src/docx/web_settings.py` | #157 | `WebSettings` (ln 27) | `docs/api/web-settings.rst` | Read-only `word/webSettings.xml`. | + +29 modules, no page each. All would need to be added to the +`docs/index.rst` "API Documentation" toctree once the `.rst` stubs exist. + +### 4.2 Existing API pages that are stale + +Most existing API pages use `.. autoclass:: X :members:`, which means new +methods on already-documented classes appear automatically. The gaps +below are new **top-level** proxies that share a module with an already- +documented class but have no directive pointing at them, plus a handful +of new narrative sections that an autogenerated page will not cover. + +#### `docs/api/document.rst` (117 lines) + +Uses `:members:` on `docx.document.Document` so new methods auto-render. +What is missing is the **return-type substitutions** (see 3.5) and the +human-written narrative on the new return objects. 
New `Document` +properties / methods added in this fork that now render as raw +`|Name|` placeholders: + +- `Document.bookmarks` (`src/docx/document.py:326`) +- `Document.charts` (ln 333), `Document.add_chart(...)` (Phase D charts) +- `Document.content_controls` (ln 385) +- `Document.endnotes` (ln 394), `Document.add_endnote_properties` (ln 529), + `Document.endnote_properties` +- `Document.equations` (ln 399) +- `Document.form_fields` (ln 420) +- `Document.signatures` (ln 455) +- `Document.font_table` (ln 466) +- `Document.footnotes` (ln 477), `Document.add_footnote_properties` (ln 516), + `Document.footnote_properties` +- `Document.custom_properties` (ln 539) +- `Document.custom_xml_parts` (ln 549) +- `Document.numbering` (ln 561) +- `Document.ink_annotations` (ln 571) +- `Document.embedded_objects` (ln 588) +- `Document.smart_art` (ln 608) +- `Document.replace` / `replace_all` / `replace_regex` / `replace_regex_all` + (ln 667-761) +- `Document.search` / `search_all` / `search_regex` / `search_regex_all` + (ln 829-895) +- `Document.validate_heading_structure` (ln 916) +- `Document.statistics` (ln 934) +- `Document.glossary` (ln 957) +- `Document.theme` (ln 970) +- `Document.web_settings` (ln 981) +- `Document.add_table_of_contents` (ln 261) + +Additionally, these exist but do not appear in the API pages because no +narrative points at them and their return types have no `.. autoclass::` +elsewhere. Even when autodoc picks them up, a reader cannot click through. + +#### `docs/api/settings.rst` (13 lines) + +Page is **one autoclass line** (`docs/api/settings.rst:9`) with +`:inherited-members:`. New proxy types exposed from `Settings` but with no +dedicated section: + +- `DocumentProtection` (`src/docx/settings.py:446`) — Phase D.3, #125 +- `CompatSettings` (ln 674), `CompatFlags` (ln 754) — #156 +- `MailMerge` (ln 847) — #130 +- `WD_VIEW` (enum) — #164 — `Settings.view` references this, but + `docs/api/enum/WdView.rst` does not exist. 
+- `WD_PROTECTION` (enum) — referenced by `DocumentProtection.protection_type`; + no enum page. + +These need their own `.. autoclass::` blocks so the "Document Protection" +and "Mail Merge" object surfaces are reachable. + +#### `docs/api/section.rst` (41 lines) + +Uses `:members:` on `Section` so instance methods auto-render, but the +following **new proxy classes** referenced from `Section` have no +`.. autoclass::` block anywhere: + +- `Column` (`src/docx/section.py:824`) — #60 +- `SectionColumns` (ln 855) — `Section.columns` returns this +- `PageBorder` (ln 940), `PageBorders` (ln 1046) — #121 +- `LineNumbering` (ln 1125) — #122 +- `DocumentGrid` (ln 1187) — #147 + +New `Section` methods that do auto-render, but with broken substitution +/ enum refs: + +- `Section.columns`, `Section.set_columns(...)` (Phase D.19) +- `Section.page_borders`, `Section.set_page_border(...)`, + `Section.remove_page_borders(...)` (ln 294-342) +- `Section.line_numbering`, `Section.set_line_numbering`, + `Section.remove_line_numbering` (ln 343-385) +- `Section.first_page_paper_source`, `Section.other_pages_paper_source` + (ln 387-441) — #146 +- `Section.document_grid`, `Section.set_document_grid`, + `Section.remove_document_grid` (ln 443-482) — #147 +- `Section.right_to_left`, `Section.text_direction` (ln 494-521) +- `Section.footnote_properties` / `.add_footnote_properties` / + `.remove_footnote_properties` (ln 548-575) +- `Section.endnote_properties` / `.add_endnote_properties` / + `.remove_endnote_properties` (ln 578-605) +- `Section.add_text_watermark(...)`, `Section.add_image_watermark(...)`, + `Section.remove_watermark`, `Section.watermark` (ln 609-763) +- `Section.different_odd_and_even_pages_header_footer`, + `Section.different_first_page_header_footer` (ln 78-127) +- `Section.first_page_header` / `.first_page_footer` / + `.even_page_header` / `.even_page_footer` (ln 129-168) +- `Section.formatting_change` (ln 91) + +#### `docs/api/table.rst` (55 lines) + +Uses `:members:` + 
`:inherited-members:`, so `Table` / `_Cell` / `_Row` +method additions auto-render. Missing `.. autoclass::` blocks for new +proxy types in the same module: + +- `CellShading` (`src/docx/table.py:832`) — #63 +- `BorderElement` (ln 898), `TableBorders` (ln 997) — #102 +- `TableStyleFlags` (ln 1068) — #144 +- `CellBorders` (ln 1148), `CellMargins` (ln 1204) — #102, #143 + +New methods that render with undefined substitutions: + +- `Table.borders`, `_Cell.borders`, `_Cell.margins`, `_Cell.shading`, + `Table.style_flags`, `Table.autofit` setter (#39), `Table.column_width` + helpers, `_Cell.is_merge_origin` / `.merge_origin` (#145), + `_Row.height` / `.height_rule` (#28), `_Row.allow_break_across_pages` + (#51), `_Row.is_header` (#93), `_Row.grid_cols_before` / + `grid_cols_after`, `_Cell.grid_span`, `_Cell.text_direction` (#142), + `CT_Tc.grid_offset` (low-level). + +#### `docs/api/text.rst` (63 lines) + +Uses `:members:` on all proxy classes. Missing `.. autoclass::` for new +types in the same module: + +- `docx.text.run._Text` (`src/docx/text/run.py:377`) — internal but + referenced from the `|_Text|` substitution in `conf.py:198` +- `docx.text.symbol.Symbol` (`src/docx/text/symbol.py:11`) — #114 +- `docx.text.font.EastAsianLayout` (`src/docx/text/font.py:889`) — #128 +- `docx.text.parfmt.ParagraphBorders` (`src/docx/text/parfmt.py:452`), + `docx.text.parfmt.Border` (ln 492), `docx.text.parfmt.TextFrame` + (ln 602) — #126, Phase D.7 + +Auto-rendering but with broken substitutions / labels: + +- `Font.shading_color` (#20 / #33), `Font.border_*` (#120), + `Font.language` / `east_asian_language` / `bidi_language` (#160), + `Font.character_spacing`, `Font.kerning` (#19 / #95), + `Font.highlight_color` +- `Run.add_symbol`, `Run.symbols` (#114) +- `Run.split(offset)` (#34 / #94) +- `Run.bidi`, `Paragraph.bidi` (#127) +- `ParagraphFormat.frame` (#126) +- `Paragraph.insert_paragraph_before`, `Paragraph.delete`, `Run.delete`, + `Table.delete` (#50) +- `Paragraph.stable_id`, 
`Run.stable_id` (#155) +- `Paragraph.rsid`, `Run.rsid` (#136) + +#### `docs/api/shape.rst` (31 lines) + +Missing `.. autoclass::` for: + +- `docx.shape.FloatingImage` (`src/docx/shape.py:132`) — #30 +- Any of the WD_ANCHOR / WD_WRAP enum pages (all missing — + see section 5). + +`InlineShape` declares `:members: height, type, width` — an explicit +allowlist — so `InlineShape.alt_text`, `.title` (#158), and any future +additions will *not* render. + +#### `docs/api/comments.rst` (27 lines) + +Up to date for 1.2.0 — the only fork-scope question is whether +`CommentReplies` (Phase D.2, #67) now merits its own subsection. `Comments` +and `Comment` both use `:members:` so new methods auto-render, but +thread-reply narrative is missing. + +### 4.3 `docs/api/style.rst` minor gaps + +`docs/api/style.rst:21` uses `:members:` on `BaseStyle` / subclasses. New +style attrs from #162 (`Style.link_style`, `Style.next_style`, +`Style.is_redefined`) auto-render but have no narrative section. No missing +classes here. + +--- + +## 5. Enum coverage gaps + +`docs/api/enum/` contains 16 hand-written pages (see section 2). Each page +is short (20-70 lines). `src/docx/enum/*.py` defines **37 enum classes** +(listed below). 16 have a page, **21 do not**. 
+ +### 5.1 Enums present in `src/docx/enum/` (37 total) + +From `grep -E '^class (WD|MSO)_' src/docx/enum/*.py`: + +| Enum | File:line | Doc page | Status | +|---|---|---|---| +| `MSO_COLOR_TYPE` | `dml.py:6` | `MsoColorType.rst` | covered | +| `MSO_THEME_COLOR_INDEX` | `dml.py:30` | `MsoThemeColorIndex.rst` | covered | +| `WD_ALIGN_PARAGRAPH` (alias of `WD_PARAGRAPH_ALIGNMENT`) | `text.py:10` | `WdAlignParagraph.rst` | covered | +| `WD_ANCHOR_H` | `shape.py:33` | — | **missing** | +| `WD_ANCHOR_V` | `shape.py:45` | — | **missing** | +| `WD_BORDER_DISPLAY` | `section.py:6` | — | **missing** | +| `WD_BORDER_OFFSET_FROM` | `section.py:29` | — | **missing** | +| `WD_BORDER_STYLE` | `text.py:274` | — | **missing** (referenced by `Font.border_style`) | +| `WD_BREAK_TYPE` | `text.py:70` | — | **missing** | +| `WD_BUILDING_BLOCK_GALLERY` | `text.py:752` | — | **missing** | +| `WD_BUILTIN_STYLE` | `style.py:6` | `WdBuiltinStyle.rst` | covered | +| `WD_CELL_VERTICAL_ALIGNMENT` | `table.py:6` | `WdCellVerticalAlignment.rst` | covered | +| `WD_COLOR_INDEX` | `text.py:92` | `WdColorIndex.rst` | covered | +| `WD_DOC_GRID_TYPE` | `section.py:74` | — | **missing** | +| `WD_DRAWING_TYPE` | `shape.py:22` | — | **missing** | +| `WD_ENDNOTE_POSITION` | `text.py:564` | — | **missing** | +| `WD_FOOTNOTE_POSITION` | `text.py:547` | — | **missing** | +| `WD_FOOTNOTE_RESTART` | `text.py:531` | — | **missing** | +| `WD_FRAME_DROP_CAP` | `text.py:689` | — | **missing** | +| `WD_FRAME_H_ALIGN` | `text.py:705` | — | **missing** | +| `WD_FRAME_H_ANCHOR` | `text.py:632` | — | **missing** | +| `WD_FRAME_V_ALIGN` | `text.py:727` | — | **missing** | +| `WD_FRAME_V_ANCHOR` | `text.py:648` | — | **missing** | +| `WD_FRAME_WRAP` | `text.py:664` | — | **missing** | +| `WD_HEADER_FOOTER_INDEX` | `section.py:108` | — | **missing** | +| `WD_INLINE_SHAPE_TYPE` | `shape.py:6` | — | **missing** | +| `WD_LINE_NUMBERING_RESTART` | `section.py:49` | — | **missing** | +| `WD_LINE_SPACING` | `text.py:159` | 
`WdLineSpacing.rst` | covered | +| `WD_MAIL_MERGE_DATA_TYPE` | `text.py:944` | — | **missing** | +| `WD_MAIL_MERGE_DESTINATION` | `text.py:925` | — | **missing** | +| `WD_MAIL_MERGE_TYPE` | `text.py:900` | — | **missing** | +| `WD_NUMBER_FORMAT` | `text.py:476` | — | **missing** | +| `WD_ORIENTATION` | `section.py:132` | `WdOrientation.rst` | covered | +| `WD_PARAGRAPH_ALIGNMENT` | `text.py:10` | `WdAlignParagraph.rst` | covered (as alias) | +| `WD_PROTECTION` | `text.py:605` | — | **missing** | +| `WD_ROW_HEIGHT_RULE` | `table.py:51` | `WdRowHeightRule.rst` | covered | +| `WD_SECTION_START` | `section.py:157` | `WdSectionStart.rst` | covered | +| `WD_SHADING_PATTERN` | `table.py:110` | — | **missing** | +| `WD_SHAPE` | `shape.py:74` | — | **missing** | +| `WD_STYLE_TYPE` | `style.py:426` | `WdStyleType.rst` | covered | +| `WD_TAB_ALIGNMENT` | `text.py:208` | `WdTabAlignment.rst` | covered | +| `WD_TAB_LEADER` | `text.py:247` | `WdTabLeader.rst` | covered | +| `WD_TABLE_ALIGNMENT` / `WD_ROW_ALIGNMENT` alias | `table.py:85` | `WdRowAlignment.rst` | covered (shared) | +| `WD_TABLE_AUTOFIT` | `table.py:249` | — | **missing** | +| `WD_TABLE_DIRECTION` | `table.py:348` | `WdTableDirection.rst` | covered | +| `WD_TEXT_DIRECTION` | `table.py:287` | — | **missing** (`wdtextdirection` label referenced by `Section.text_direction` and `_Cell.text_direction`) | +| `WD_UNDERLINE` | `text.py:380` | `WdUnderline.rst` | covered | +| `WD_VIEW` | `text.py:577` | — | **missing** | +| `WD_WRAP_TYPE` | `shape.py:57` | — | **missing** | + +### 5.2 21 enum pages to add + +Stub pages needed (pattern from `docs/api/enum/WdAlignParagraph.rst` — +title, alias, one-line intro, bulleted member list). Each is 20-40 lines. 
+ +``` +WdAnchorH.rst (WD_ANCHOR_H) +WdAnchorV.rst (WD_ANCHOR_V) +WdBorderDisplay.rst (WD_BORDER_DISPLAY) +WdBorderOffsetFrom.rst (WD_BORDER_OFFSET_FROM) +WdBorderStyle.rst (WD_BORDER_STYLE) ← fixes :ref:`wdborderstyle` +WdBreakType.rst (WD_BREAK_TYPE) +WdBuildingBlockGallery.rst (WD_BUILDING_BLOCK_GALLERY) +WdDocGridType.rst (WD_DOC_GRID_TYPE) +WdDrawingType.rst (WD_DRAWING_TYPE) +WdEndnotePosition.rst (WD_ENDNOTE_POSITION) +WdFootnotePosition.rst (WD_FOOTNOTE_POSITION) +WdFootnoteRestart.rst (WD_FOOTNOTE_RESTART) +WdFrameDropCap.rst (WD_FRAME_DROP_CAP) +WdFrameHAlign.rst (WD_FRAME_H_ALIGN) +WdFrameHAnchor.rst (WD_FRAME_H_ANCHOR) +WdFrameVAlign.rst (WD_FRAME_V_ALIGN) +WdFrameVAnchor.rst (WD_FRAME_V_ANCHOR) +WdFrameWrap.rst (WD_FRAME_WRAP) +WdHeaderFooterIndex.rst (WD_HEADER_FOOTER_INDEX) +WdInlineShapeType.rst (WD_INLINE_SHAPE_TYPE) +WdLineNumberingRestart.rst (WD_LINE_NUMBERING_RESTART) +WdMailMergeDataType.rst (WD_MAIL_MERGE_DATA_TYPE) +WdMailMergeDestination.rst (WD_MAIL_MERGE_DESTINATION) +WdMailMergeType.rst (WD_MAIL_MERGE_TYPE) +WdNumberFormat.rst (WD_NUMBER_FORMAT) +WdProtection.rst (WD_PROTECTION) +WdShadingPattern.rst (WD_SHADING_PATTERN) +WdShape.rst (WD_SHAPE) +WdTableAutofit.rst (WD_TABLE_AUTOFIT) +WdTextDirection.rst (WD_TEXT_DIRECTION) ← fixes :ref:`wdtextdirection` +WdView.rst (WD_VIEW) +WdWrapType.rst (WD_WRAP_TYPE) +``` + +32 entries (more than 21 because several enums that aren't yet warned +about — `WD_FRAME_DROP_CAP`, mail-merge family, etc. — are used in +public signatures that currently document their parameter types as raw +enum names). + +`docs/api/enum/index.rst` has a 16-entry toctree that would need to grow +accordingly. + +### 5.3 `WD_TABLE_ALIGNMENT` note + +`WdRowAlignment.rst` exists; `WdTableAlignment.rst` does not. In +`src/docx/enum/table.py:85` the class is `WD_TABLE_ALIGNMENT`, aliased to +`WD_ROW_ALIGNMENT`. The page title and alias name should be flipped (the +canonical name is `WD_TABLE_ALIGNMENT`), or duplicated. Minor. 
+ +--- + +## 6. User-guide gaps + +`docs/user/` has 12 pages totalling 2337 lines: + +``` +api-concepts.rst 31 lines +comments.rst 168 ← only fork-era narrative +documents.rst 94 +hdrftr.rst 166 +install.rst 38 ← out of date (section 7.3) +quickstart.rst 328 ← pre-fork (section 7.2) +sections.rst 121 +shapes.rst 27 +styles-understanding.rst 382 +styles-using.rst 391 +tables.rst 202 +text.rst 389 +``` + +Per the feature list (121 distinct fork-scope `feat:` commits, grouped by +`^Phase` subject or `#NNN`), the following narrative pages should exist +and do not: + +### 6.1 Missing user-guide topics + +| Feature area | Issues | Suggested page | +|---|---|---| +| Tracked changes — accept/reject, inspect, move revisions, cell/row changes, formatting changes | #7, #8, #53, #134, #135, #163 | `docs/user/track-changes.rst` | +| Fields — simple, complex, REF, PAGEREF, add_field | #10, #115 | `docs/user/fields.rst` | +| Content controls (SDTs) + data binding | #27, #131 | `docs/user/content-controls.rst` | +| Footnotes + endnotes — add, delete, modify, content, properties | #3, #17, #46, #48, #56, #96 | `docs/user/footnotes.rst` | +| Bookmarks — create, read, delete | #52 | `docs/user/bookmarks.rst` | +| Numbering / lists — apply_to, custom definitions, restart, nested | #22, #108 | `docs/user/numbering.rst` | +| Tables — autofit, borders, cell margins, banded rows, text direction, merged-cell helpers, style flags | #15/#102, #39, #63, #143, #144, #145, #142 | `docs/user/tables-advanced.rst` (or extend `tables.rst`) | +| Section — page borders, line numbering, document grid, paper source, columns, formatting changes, odd/even headers, RTL | #19, #60, #121, #122, #146, #147, #148, #149 | `docs/user/sections-advanced.rst` (or extend `sections.rst`) | +| Charts — read + minimal create | #111 | `docs/user/charts.rst` | +| Watermarks (text + image) | #36 | `docs/user/watermarks.rst` | +| Captions | #141 | `docs/user/captions.rst` | +| Table of contents | #116 | 
`docs/user/toc.rst` | +| Form fields (legacy) | #123 | `docs/user/form-fields.rst` | +| Permission ranges + document protection | #124, #125 | `docs/user/permissions.rst` | +| Glossary / building blocks | #132, #133 | `docs/user/glossary.rst` | +| Themes | #117 | `docs/user/themes.rst` | +| Mail merge | #130 | `docs/user/mail-merge.rst` | +| Custom document properties + custom XML | #14, #82, #131 | `docs/user/custom-properties.rst` | +| Font — shading, borders, language, East Asian layout, symbols, ruby | #19, #20/#33, #114, #120, #128, #129, #160 | `docs/user/text-advanced.rst` | +| Paragraph — frames, RTL, insert-at-position | #26, #126, #127 | (extend `text.rst`) | +| Accessibility — alt text, heading validation, language tags | #158, #159, #160 | `docs/user/accessibility.rst` | +| Statistics (word count) | #161 | `docs/user/statistics.rst` | +| Search — regex, all-stories | #91, #153, #154 | `docs/user/search.rst` | +| Drawing — floating images, shape creation, group shapes, SVG, alt text | #30, #75, #76, #137, #138, #158 | `docs/user/drawing.rst` (or extend `shapes.rst`, currently 27 lines) | +| Equations (OMML) | #113 | `docs/user/equations.rst` | +| Digital signatures, encrypted/recoverable docs, macro-enabled (.docm) | #150, #151, #152, #65 | `docs/user/document-safety.rst` | + +### 6.2 Feature areas already covered (partial check) + +- Comments — `docs/user/comments.rst` (168 lines, covers 1.2.0 scope; + missing threaded-reply narrative for #67). +- Shapes — `docs/user/shapes.rst` exists but is **27 lines** and predates + all of Phase D shape work. + +--- + +## 7. Front-page + quickstart + +### 7.1 `docs/index.rst` + +- The "What it can do" code sample (lines 18-65) hasn't been touched since + upstream — it still demonstrates only `add_heading`, `add_paragraph`, + `add_picture`, `add_table`, `add_page_break`, and `Inches`. None of the + fork-era features appear. +- The "API Documentation" toctree (lines 91-104) lists 11 pages. 
**There + is no entry for any of the 29 new modules catalogued in section 4.1.** + Even if the `.rst` stubs existed, a reader would not find them. +- Toctree does not have a `:maxdepth:` that would show second-level + anchors — new features stay hidden from the sidebar navigator. + +### 7.2 `docs/user/quickstart.rst` + +- 328 lines, last touched in upstream's 1.2.0 cycle. None of the fork + features appear — no example of a footnote, tracked change, bookmark, + content control, field, watermark, TOC, or form field. +- Sections that are still accurate for 1.2.0 but lag the fork: "Adding a + paragraph", "Adding a heading", "Adding a page break", "Adding a + picture", "Adding a table", "Applying a character style". +- Feature additions that a modern quickstart should mention (one paragraph + each, not exhaustive): footnotes, comments, tracked changes, search / + replace, stable_id, statistics. + +### 7.3 `docs/user/install.rst` + +- `docs/user/install.rst:37` claims: + + ``` + * Python 2.6, 2.7, 3.3, or 3.4 + * lxml >= 2.3.2 + ``` + + Both lines are **wrong**. The current `pyproject.toml` has + `requires-python = ">=3.9"` and a `lxml>=3.1.0` runtime dep. The + `easy_install` paragraph (lines 16-19) is obsolete. +- The file still recommends `python setup.py install` (lines 21-27) — this + has been ineffective since the project switched to a `pyproject.toml` + build. + +--- + +## 8. HISTORY.rst + +`HISTORY.rst` (repo root) stops at `1.2.0 (2025-06-16)`. Every +fork-specific commit — ~121 distinct `feat:` entries — is absent. 
Before +any release cut, the following release-note skeleton would be needed, +grouped by rough phase: + +``` +1.3.0.dev0 (unreleased) ++++++++++++++++++++++++ + +Phase A — Footnotes and endnotes + - Add Document.footnotes and Footnotes / Footnote / FootnoteProperties (#1, #3, #17, #46, #48, #56, #82) + - Add Document.endnotes mirror API (#17, #96) + - Add Section.footnote_properties / endnote_properties (#17) + +Phase B — Tracked changes + - Add read of tracked insertions and deletions (#53) + - Add accept / reject tracked changes (#7) + - Add read of formatting changes (#8) + - Add move revisions (w:moveFrom / w:moveTo) (#134) + - Add cell and row-level tracked changes (#135) + - Add revision_marks_text() for CLI previews (#163) + +Phase C — Bookmarks and fields + - Add bookmarks create / read / delete (#52) + - Add simple and complex field codes (#10) + - Add REF / PAGEREF cross-reference resolution (#115) + +Phase D — Miscellaneous OOXML feature coverage + - D.1 Hyperlink creation API (#97) + - D.2 Comment replies (threaded) (#67) + - D.3 Extended document settings + DocumentProtection (#66, #125) + - D.4 Custom document properties (#14) + - D.6 Cell shading and background color (#63) + - D.7 Paragraph borders (#109) + - D.9 Numbering style control (#22) + - D.10 Search and replace with formatting preservation (#91) + - D.13 Insert paragraph / table at arbitrary position (#26) + - D.14 Content controls (SDTs) (#27) + - D.15 Row.height setter (#28) + - D.16 Row.allow_break_across_pages (#51) + - D.17 Floating images with wp:anchor positioning (#30) + - D.19 Multi-column section layout (#60) + - D.20 Font.shading — run-level background color (#33) + - D.22 SVG image support (#76) + - D.23 Watermark support (text and image) (#36) + - D.24 .docm macro-enabled file support (#65) + - D.26 Table autofit and column-width control (#39) + - D.27 DrawingML shapes and text-box content access (#75) + +Other feature additions + - Charts read + add_chart() (#111) + - SmartArt 
detection and node text (#112) + - Equation read + minimal create API (#113) + - Add Run.add_symbol and Run.symbols (#114) + - Add Section.page_borders (#121) + - Add Section.line_numbering (#122) + - Add Section.document_grid (#147) + - Add Section.first_page / other_pages_paper_source (#146) + - Add Section.text_direction / right_to_left (#148) + - Add Section odd/even page header-footer (#149) + - Add Font.border_* properties (#120) + - Add Font.language / east_asian_language / bidi_language (#160) + - Add East Asian typography (kinsoku, word_wrap, east_asian_layout) (#128) + - Add RTL / bidi on Paragraph and Run (#127) + - Add paragraph_format.frame for text frames (#126) + - Add ParagraphBorders / Border (#109) + - Add read-only ruby (#129) + - Add read-only ink (#139) + - Add read-only embedded OLE objects (#140) + - Add read-only grouped shapes (#138) + - Add read-only SmartArt (#112) + - Add read-only Document.glossary (#132, #133) + - Add read-only Document.theme (#117) + - Add read-only Document.web_settings (#157) + - Add Document.font_table (#119) + - Add Document.background_color (#118) + - Add Document.statistics (#161) + - Add Document.search_regex / replace_regex / search_all / replace_all (#153, #154) + - Add Document.add_table_of_contents (#116) + - Add caption helpers (#141) + - Add permission ranges (#124) + - Add Settings.mail_merge (#130) + - Add Settings.compat_flags / compat_settings (#156) + - Add Settings.view (#164) + - Add Style.link_style / next_style / is_redefined (#162) + - Add Table.borders / _Cell.borders (#102) + - Add Cell.margins (#143) + - Add Table.style_flags (#144) + - Add Cell.text_direction (#142) + - Add Cell.is_merge_origin / merge_origin (#145) + - Add _Row.is_header (#93) + - Add Run.split (#94) + - Add Paragraph.delete / Run.delete / Table.delete (#50) + - Add alt_text / title on InlineShape and FloatingImage (#158) + - Add stable_id on Paragraph / Run / Table / Cell (#155) + - Add Paragraph.insert_paragraph_before 
arbitrary positioning (#26) + - Add legacy form fields (#123) + - Add heading-structure accessibility validator (#159) + +Reliability / safety + - Add recover=True mode for malformed .docx (#151) + - Add EncryptedDocumentError for password-protected .docx (#152) + - Add digital signature detection (#150) + +Dev / tooling + - Add py.typed, improve public types + - Add AI-agent CI pipeline (Product / Develop / Review / Security / Revise + / Merge / Debug / Watchdog) +``` + +Numbers above are taken directly from `git log --oneline --all --grep='^feat'`. +Writing the real HISTORY entries should take ~2 hours if done as one pass. + +--- + +## 9. Other issues + +### 9.1 Broken image links + +**None.** All four `.. image::` references point at files that exist in +`docs/_static/img/`: + +``` +_static/img/comment-parts.png used in docs/user/comments.rst:16 + dev/analysis/features/comments.rst:15 +_static/img/example-docx-01.png used in docs/index.rst:14 +_static/img/hdrftr-01.png used in docs/user/hdrftr.rst:66 +_static/img/hdrftr-02.png used in docs/user/hdrftr.rst:92 +``` + +### 9.2 References to removed modules / classes + +Grepping the `.rst` tree for stale `.. currentmodule::` or dotted paths +against the current source tree: + +``` +grep -rn 'docx\.' docs/api/*.rst | cut -d: -f3 | grep -oE 'docx\.[a-z_.]+' | sort -u +``` + +returns 18 distinct dotted paths, every one of which resolves under +`src/docx/`. No dead references. + +### 9.3 `.. todo::` / `.. deprecated::` markers + +`grep -rn '.. todo::\|.. deprecated::' docs/ src/docx/` returns zero hits. +There are no dangling to-dos in docstrings or RST files. + +### 9.4 `docs/conf.py` + +Observations: + +- No `autodoc_default_options` is set. Every `.. autoclass::` directive + must spell out `:members:` / `:inherited-members:` individually. 
This is + why pages like `docs/api/text.rst` need 10 `:members:` tokens — a + project-wide `autodoc_default_options = {"members": True, "undoc-members": False, "show-inheritance": True}` would halve the size of each page. +- `intersphinx_mapping = {"http://docs.python.org/3/": None}` (line 393) + is in Sphinx 1.0 format — deprecated in Sphinx 4, breaks in Sphinx 8. +- `html_theme = "armstrong"` (line 236) points at a vendored fork of a + long-abandoned theme (`docs/_themes/armstrong/`). `alabaster` (the note + in CLAUDE.md) is pinned in `requirements-docs.txt` but never consumed. + The project would benefit from migrating to a maintained theme + (`furo`, `pydata-sphinx-theme`, or `sphinx_rtd_theme`). +- `exclude_patterns = [".build"]` (line 208) — the actual build dir is + `_build`, not `.build`, so this exclusion does nothing. +- `copyright = "2013, Steve Canny"` (line 55) has not been updated in + 12 years. +- Typo in docstring of `add_endnote_properties` etc. would not be fixed + by any conf change — the `|EndnoteProperties|` etc. substitutions are + simply absent from `rst_epilog` (section 3.5). +- `sphinx.ext.todo` is enabled but never used (`grep -rn '.. todo::' docs/` + returns 0). +- `sphinx.ext.coverage` is enabled but has no `coverage_*` options set. + +--- + +## 10. Recommendations + +Prioritised punch list. Effort labels: **S** = < 2 hr, **M** = 1/2 day, +**L** = 1-2 days. + +### Sphinx build hygiene + +1. **S — Add the 53 missing `|Name|` substitutions to `docs/conf.py:rst_epilog`.** + One line per symbol (`` .. |Foo| replace:: :class:`.Foo` ``). Retires + 96 of the 101 build warnings. See section 3.5. ~30 min. +2. **S — Fix `intersphinx_mapping` format** (`docs/conf.py:393`). Change + `{"http://docs.python.org/3/": None}` to + `{"python": ("https://docs.python.org/3/", None)}`. Unblocks `-W`. ~5 min. +3. **S — Fix `docs/user/comments.rst:134`**: convert the Markdown underscore + emphasis to RST emphasis. ~1 min. +4. 
**S — Update `requirements-docs.txt`** to modern Sphinx (≥5, <8), + drop `MarkupSafe==0.23` / `Jinja2==2.11.3` pins, drop `alabaster` + pin (unused). ~10 min. +5. **S — Fix `exclude_patterns`** in `conf.py:208` (`.build` → `_build`). + +### New API stubs + +6. **S — Add missing enum reference pages (21-32 files).** Follow + `docs/api/enum/WdAlignParagraph.rst` pattern. Each page is 20-40 + lines of title / alias / one-line intro / member list. Update + `docs/api/enum/index.rst` toctree. See section 5.2. ~2 hr total + (scriptable from `enum/base.py`'s `DocsPageFormatter`, which already + emits exactly this format). +7. **S — Add autoclass stubs for the 29 new proxy modules** (section 4.1). + Each is a ~10-line file (`.. _X_api:`, title, `.. currentmodule::`, + `.. autoclass:: Foo :members:`). Add to `docs/index.rst` toctree. ~2 hr. + +### Existing API pages + +8. **M — Update `document.rst`, `settings.rst`, `section.rst`, `table.rst`, + `text.rst`, `shape.rst`** to surface the new return-type classes and + narrative sections per section 4.2. Most `:members:` directives already + cover method additions; the missing pieces are new top-level classes + in the same module (`CellShading`, `TableBorders`, `LineNumbering`, + `DocumentGrid`, `PageBorder`, `PageBorders`, `FloatingImage`, + `EastAsianLayout`, `TextFrame`, `ParagraphBorders`, `Symbol`, etc.) plus + re-ordered subsection titles. ~1 day of careful writing to match the + existing narrative tone. +9. **S — Drop the explicit `:members: height, type, width` allowlist on + `docs/api/shape.rst:31`.** New `InlineShape.alt_text` and `.title` + (#158) will start rendering automatically. ~2 min. +10. **S — Rename `docs/api/enum/WdRowAlignment.rst`** (or duplicate it) to + expose `WD_TABLE_ALIGNMENT` as the canonical name. See 5.3. ~10 min. + +### Conf-file improvements + +11. **S — Add `autodoc_default_options`** in `conf.py` so new autoclass + directives inherit `members: True`, `show-inheritance: True`. 
Shrinks + each API page, makes new features visible by default. ~15 min. +12. **M — Migrate theme** from vendored `armstrong` to a maintained theme + (`furo` or `pydata-sphinx-theme`). Delete `docs/_themes/armstrong/`. + ~1 hr + QA pass. + +### User guide + +13. **L — Write user-guide narratives for fork feature areas.** Section 6.1 + lists 26 missing topics. Pair each with a 100-200 line page in the + existing narrative style (compare `docs/user/comments.rst`, 168 + lines). Prioritise in this order based on commit frequency: tracked + changes → fields → content controls → footnotes → bookmarks → + numbering → watermarks → search → captions → TOC → form fields → + charts → equations → rest. ~1-2 weeks at normal pace. +14. **S — Rewrite `docs/user/install.rst`.** Drop Python 2.x and easy_install, + state the `requires-python = ">=3.9"` requirement, update `lxml` to current floor, + mention `pip install python-docx`. ~20 min. +15. **M — Modernise `docs/user/quickstart.rst`.** Add short-paragraph + sections for the five or six most common fork features (footnote, + bookmark, find/replace, comment, tracked change, stable_id). ~2 hr. +16. **S — Update the `docs/index.rst` "What it can do" code sample** to + include at least one fork-era call (e.g. a `document.footnotes.add()` + line). ~10 min. + +### History / release prep + +17. **M — Write `HISTORY.rst` entries** for the unreleased version. The + skeleton in section 8 can be pasted in and polished. ~2 hr. + +### Longer-term + +18. **L — Audit every public docstring** for OOXML-term consistency + (`w:rPr`, `w:tc`, ...) and add "Added in version x.y.z" directives + so the release-notes skeleton can be auto-generated from code. + ~1 week, on and off. +19. **M — Add a `docs/user/api-concepts.rst` extension** covering the + three-layer proxy / part / oxml architecture (already described in + `CLAUDE.md` but not visible to end-users). ~3 hr. +20. 
**S — Replace `sphinx.ext.todo`** (unused) with `sphinx.ext.napoleon` + (Google / NumPy docstring support). The codebase mostly uses plain + reStructuredText docstrings but a few newer modules use NumPy-style + Returns sections that currently render as plain paragraphs. + +--- + +## Appendix A — counts at a glance + +| Metric | Count | +|---:|---| +| `.rst` files under `docs/` | 75 | +| Lines across all `.rst` files | 12 005 | +| `.rst` files under `docs/api/` (top level) | 10 | +| `.rst` files under `docs/api/enum/` (incl. index) | 17 | +| `.rst` files under `docs/user/` | 12 | +| `.rst` files under `docs/dev/analysis/` | 36 | +| Python modules in `src/docx/` (top level) | 44 | +| Python submodule files under `src/docx/{text,styles,enum,drawing,...}` | additional 29 | +| Enum classes defined in `src/docx/enum/` | 37 | +| Enum classes with a `docs/api/enum/*.rst` page | 16 | +| `\|Substitution\|` tokens defined in `conf.py:rst_epilog` | 74 | +| `\|Substitution\|` tokens referenced but undefined | 53 | +| Distinct `feat:` commits in `git log --all` (fork scope) | 121 | +| Doc commits since 2025-06-11 | 0 | +| Sphinx build warnings (non-strict) | 101 | + +## Appendix B — Sphinx build command used + +``` +python -m sphinx -b html docs docs/_build/html +``` + +Ran with Sphinx 6.2.1 inside a throwaway Python 3.11 virtualenv. +Build result: `build succeeded, 101 warnings.` Exit status 0. +`docs/_build` and the virtualenv were removed after the run. 
diff --git a/audits/DOCS_SIBLING_AUDIT.md b/audits/DOCS_SIBLING_AUDIT.md new file mode 100644 index 000000000..0a5a7e805 --- /dev/null +++ b/audits/DOCS_SIBLING_AUDIT.md @@ -0,0 +1,962 @@ +# Sibling Documentation Audit — loadfix OOXML Series + +Observational audit of three sibling projects under the `loadfix` org that +together aim at a uniform OOXML-in-Python experience: + +- `loadfix/python-docx` — Word `.docx` (this repo) +- `loadfix/python-pptx` — PowerPoint `.pptx` +- `loadfix/python-xlsx` — Excel `.xlsx` (fork of `openpyexcel`, which in turn + is a snapshot of `openpyxl` ~2.5.14) + +The audit looks only at *documentation surfaces* — READMEs, Sphinx builds, +`docs/` trees, history/changelog files, contributor-facing files, docstring +quality and build health. No runtime or test-coverage claims are made here. +The audit is read-only against the two sibling repos; only this file is +written. + +--- + +## 1. Summary + +The three projects present three very different levels of documentation +maturity and consistency: + +- **python-docx (this repo)** is the most mature documentation surface of + the three. Sphinx builds with `furo` (modern theme), exactly one Sphinx + warning (`undefined label: 'wdoutlinelvl'`), a 37-page user guide + mirroring a 38-file API reference, a 49-file enum reference, a freshly + written fork-centric `README.md` using Markdown, an 81 KB `FEATURES.md` + acting as the authoritative fork-feature inventory, a 12 KB `HISTORY.rst` + ordered by phase (A/B/C/D), and a pre-napoleon hook in `conf.py` that + warrants its own ~80-line comment block explaining why it exists. A + previous run noted **17 build warnings** for this repo; current grep of + `/tmp/sphinx-docx.log` shows 1 WARNING and 16 ERRORs all of the + `Undefined substitution referenced` class, total 17 — the previous + agent counted these together. 
+ +- **python-pptx** is in a middle state: an inherited Sphinx 1.8.6 / + Jinja2 2.11.3 / `alabaster<0.7.14` / `armstrong` theme stack, an + `rst_epilog`-based substitution catalogue that is showing its age + (many class references are no longer resolved, hence 140 warnings — of + which **127 are `ERROR: Undefined substitution referenced`** of the + form `|ErrorBars|`, `|AnimationEffect|`, `|PathGeometry|`, `|Path|`, + `|Sound|`, `|Section|`, and so on). Docstring coverage in `src/pptx` is + larger than docx's but the reference tree is thin (14 API pages, 36 + enum pages, 19 user-guide pages). `README.rst` is 25 lines and still + matches the upstream form — no fork narrative. `HISTORY.rst` is 1768 + lines and still active (an unreleased block at the top describes recent + refactors). There is no `FEATURES.md`, no `CONTRIBUTING.md` (only + `CLAUDE.md`), and the user guide has no fork-feature pages. + +- **python-xlsx** is the outlier. Its Sphinx build **fails outright**: + `doc/conf.py` line 25 does `import openpyxl`, but the actual top-level + package in this fork is `xlsx` (under `src/xlsx/`) with a legacy + `openpyexcel/` tree alongside — nothing named `openpyxl` exists to + import, so Sphinx dies before the first source file is read + (`ModuleNotFoundError: No module named 'openpyxl'`). There is no + `requirements-docs.txt`, no `furo`/`alabaster` pin, no `docs/` + directory (the docs live in `doc/` — a different convention — with 39 + `.rst` files as a flat list, no `api/` and no `user/` split). There is + a `TODO.md` (582 lines), `CONTRIBUTING.md` (17 lines, stub), `CLAUDE.md` + (334 lines, much more prose than python-docx's 166 lines), a 23 KB + `README.md`, and a `doc/changes.rst` (1376 lines — still the openpyxl + 2.5.14 changelog, not a fork-era history). No `FEATURES.md`. + +Net: docx has by far the most *current* documentation, xlsx the most +*broken*, pptx the most *inherited*. The three do not yet look like a +series. + + + +## 2. 
Layout matrix + +Raw numbers, gathered from filesystem state as of 2026-05-02. + +| Surface | python-docx | python-pptx | python-xlsx | +|--------------------------------------|---------------------|---------------------|------------------------------------------| +| Top-level README | `README.md` (131 L) | `README.rst` (25 L) | `README.md` (190 L) | +| Top-level HISTORY | `HISTORY.rst` (438 L) | `HISTORY.rst` (1768 L) | `doc/changes.rst` (1376 L) — openpyxl-era | +| Top-level CHANGELOG | (none, in HISTORY) | (none, in HISTORY) | (none, openpyxl-era `changes.rst`) | +| Top-level FEATURES | `FEATURES.md` (1791 L / 81 KB) | (absent) | (absent; `TODO.md` 582 L) | +| Top-level CLAUDE.md | 166 L (7.3 KB) | 256 L (15 KB) | 334 L (23 KB) | +| Top-level CONTRIBUTING | (absent) | (absent) | `CONTRIBUTING.md` (17 L — stub) | +| Top-level LICENSE | `LICENSE` (MIT) | `LICENSE` (MIT) | `LICENCE.md` (British spelling) | +| Top-level AUTHORS | (absent) | (absent) | `AUTHORS.md` | +| Sphinx source dir | `docs/` | `docs/` | `doc/` (singular) | +| Sphinx config | `docs/conf.py` (627 L) | `docs/conf.py` (587 L) | `doc/conf.py` (314 L) | +| Sphinx theme | `furo` | `armstrong` | `nature` (or `default`) | +| Total `.rst` files under docs/ | 162 | 180 | 39 (flat) | +| `docs/api/*.rst` (non-enum) | 38 | 14 | 0 (no api/ dir) | +| `docs/api/enum/*.rst` | 49 | 36 | 0 (no enum/ dir) | +| `docs/user/*.rst` | 38 | 19 | 0 (no user/ dir; flat `.rst` instead) | +| `docs/dev/` directory | `docs/dev/analysis/` | `docs/dev/` (6 files) | (absent) | +| `docs/community/` | (absent) | `docs/community/` (3 files) | (absent) | +| `requirements-docs.txt` | yes (3 lines, Sphinx>=6,<8 / furo / -e .) 
| yes (5 lines, Sphinx==1.8.6 pinned) | **absent** | +| Sphinx build result | success, 17 warnings | success, 140 warnings | **fails** (ModuleNotFoundError) | +| `versionadded::` in source | 885 occurrences | 0 | 0 | +| Public `.py` modules in src top-lvl | 46 (`src/docx/*.py`) | 15 (`src/pptx/*.py`) | ~5 (`src/xlsx/*.py`) plus `openpyexcel/` | +| `spec/` directory | yes (ISO-IEC-29500 PDFs + xsd + rnc + styles.xml) | yes (ISO-IEC-29500 tree + gen_spec) | (absent) | + +Line-count and file-count figures are raw `wc -l` / `ls | wc -l` — not +weighted by content quality. + + + +## 3. Surface-by-surface comparison + +### 3.1 README / top-of-project landing + +**python-docx — `README.md` (131 lines, Markdown).** +Opens with a two-sentence description, then a "Based on..." paragraph that +is explicit about the fork provenance ("Based on python-openxml/python-docx +by Steve Canny and contributors. Forked at upstream `1.2.0` (2025-06-16) +and extended with 100+ additional OOXML features — footnotes and +endnotes, tracked changes, bookmarks, fields, content controls, charts, +equations, SmartArt, watermarks, digital signatures, accessibility +tooling, cross-document operations, and more."). Subsequent sections +appear to cover "Status" (unstable, not yet on PyPI, CalVer +`2026.05.0`), "Installation" from source, and a short narrative pitch. +The tone is fork-aware and version-dated. + +**python-pptx — `README.rst` (25 lines, reStructuredText).** +Opens with the traditional upstream blurb: *python-pptx is a Python +library for creating, reading, and updating PowerPoint (.pptx) files.* +No mention of the fork, no mention of `loadfix`, no mention of CalVer, +no mention of `2026.05.0`. Essentially the upstream README preserved +verbatim. The README is also `include`d into `docs/index.rst`, so the +Sphinx landing page inherits the same upstream-flavoured intro. 
+ +**python-xlsx — `README.md` (190 lines, Markdown).** +Opens with the same shape as `python-docx`'s README — "A Python library +for reading, creating, and updating Microsoft Excel 2007+ (.xlsx / +.xlsm) files" — followed by a multi-step "Based on openpyxl via +sciris/openpyexcel. Forked from openpyexcel (which tracks openpyxl +~2.5.14, circa 2019) and extended with modern Excel 365 capabilities..." +paragraph naming dynamic arrays, threaded comments, rich text on +cells, sparklines, modern chart types (treemap, sunburst, funnel, +waterfall, box-whisker, histogram, map), SHA-512 protection, encrypted +file I/O, first-class shape models. Status section says "Unstable. Not +yet published to PyPI. Install from source only. Current version: +`2026.05.0` (first release as an independent fork)." — identical +wording to docx's. + +Observation: the docx and xlsx READMEs are visibly the work of the same +hand, using the same CalVer pitch, same "Unstable" status boilerplate, +and same "Based on ... Forked at / from ..." pattern. pptx's README has +not been updated for the fork at all. + + + +### 3.2 Sphinx build setup + +**python-docx — Sphinx >=6,<8, `furo` theme.** +- `requirements-docs.txt` is three lines: `Sphinx>=6,<8`, `furo`, `-e .`. +- `docs/conf.py` is 627 lines. The top of the file loads `docx.__version__`. + Extensions enabled: `sphinx.ext.autodoc`, `sphinx.ext.intersphinx`, + `sphinx.ext.napoleon`, `sphinx.ext.viewcode`. Napoleon is configured + with Google- and NumPy-style docstring support plus `include_special_with_doc`. +- The largest novelty is a ~80-line `setup(app)` function that installs a + *pre-napoleon snapshot / post-napoleon restore* hook at priorities 100 + and 900. 
The comment block explains it is there to prevent napoleon + from mis-parsing attribute docstrings that reference OOXML element + names like `w:moveFrom` — napoleon would otherwise split on the colon + and emit ~42 bogus "Unknown target name: 'w:moveFrom'" docutils + warnings across 11 files. +- `intersphinx_mapping` is the modern `{'python': ('https://docs.python.org/3/', None)}` form. + +**python-pptx — Sphinx 1.8.6, `armstrong` theme.** +- `requirements-docs.txt` pins `Sphinx==1.8.6`, `Jinja2==2.11.3`, + `MarkupSafe==0.23`, `alabaster<0.7.14`, `-e .` — a 2019-era Sphinx + stack preserved verbatim. `armstrong` is a custom theme (no evidence + of a `_themes/` directory vendoring it, so it is expected to be pulled + from PyPI — the build log reports the theme is resolved). +- `docs/conf.py` is 587 lines. Extensions enabled: autodoc, doctest, + inheritance_diagram, intersphinx, todo, coverage, ifconfig, viewcode — + a broader set than docx but no napoleon. Also monkey-patches + `sphinx.environment.BuildEnvironment.warn_node` to suppress "nonlocal + image URI found:" warnings for an old travis-ci status badge. +- The bulk of `conf.py` is a massive `rst_epilog` declaring dozens of + `|ClassName|` substitutions — the source of the 127 "Undefined + substitution referenced" errors in the current build, since the + substitution table has not kept pace with renamed/new classes + (`ErrorBars`, `AnimationEffect`, `PathGeometry`, `Path`, `Sound`, + `Section`, `_HeaderFooter`, `Transition`, `SmartArt`, …). + +**python-xlsx — openpyxl-era Sphinx config, build fails.** +- No `requirements-docs.txt`. Docs dependencies are not separately + declared anywhere the audit located. +- `doc/conf.py` (singular `doc/`, not `docs/`) is 314 lines and still + attributed to "openpyxl documentation build configuration". The module + import at line 25 reads `import openpyxl`, but the top-level package + name in this fork is `xlsx` (under `src/xlsx/`) with an `openpyexcel/` + sibling tree. 
Nothing ever renames `openpyxl` to match, so Sphinx + aborts: `ModuleNotFoundError: No module named 'openpyxl'`. +- Theme selection is `'default'` in the `'on_rtd'` branch, else + `'nature'` — pre-ReadTheDocs-era. +- Extensions: `sphinx.ext.autodoc`, `ifconfig`, `viewcode`, `doctest`, + `coverage`. No napoleon, no intersphinx. +- The module-patch dance at top of `conf.py` (AliasProxyGet / + NumberFormatGet / StyleDescriptorGet monkey-patches behind + `APIDOC=True`) is a flag that `openpyxl` had its own autodoc + workarounds that were never ported either. + +Observation: the three configs span three Sphinx generations (1.8.6, +openpyxl-era pre-RTD, and 7.x + furo). Only docx is on the current +stack. + + + +### 3.3 docs/ layout + +**python-docx — `docs/` (plural).** +- Top-level files: `index.rst`, `conf.py`, `Makefile`, `_static/`, + `api/`, `dev/`, `user/`. +- `docs/api/` has 38 non-enum `.rst` files plus an `enum/` subtree of + 49 files — so 87 reference pages in total. +- `docs/user/` has 38 user-guide pages (see §3.5 for the list). +- `docs/dev/` contains an `analysis/` subtree — analysis notes for + reverse-engineered features, inherited from upstream and extended. +- No `docs/community/`, no FAQ, no support page. +- `docs/_static/` holds images referenced by `index.rst`. + +**python-pptx — `docs/` (plural).** +- Top-level files: `index.rst`, `conf.py`, `Makefile`, `_static/`, + `_templates/`, `api/`, `community/`, `dev/`. +- `docs/api/` has 14 non-enum `.rst` files plus 36 enum pages — 50 + reference pages total. About 57% the size of docx's reference tree. +- `docs/user/` has 19 user-guide pages. About half the size of docx's + user guide. +- `docs/dev/` has 6 files: `analysis/` (subtree), `development_practices.rst`, + `philosophy.rst`, `resources/`, `runtests.rst`, `security.rst`, + `xmlchemy.rst`. Richer than docx's `dev/`. +- `docs/community/` has `faq.rst`, `support.rst`, `updates.rst` — + community-facing pages absent from docx and xlsx. 
+- `docs/_templates/` is present (docx does not have one). + +**python-xlsx — `doc/` (singular).** +- Flat structure: no `api/` or `user/` split, no `enum/` subtree. +- 39 `.rst` files at the top of `doc/`, including `index.rst`, + `tutorial.rst`, `usage.rst`, `changes.rst`, `development.rst`, + `worksheet_tables.rst`, `filters.rst`, `formula.rst`, `formatting.rst`, + `charts/` (subdirectory — one of only two subdirs), plus + `comments.rst`, `defined_names.rst`, `editing_worksheets.rst`, + `optimized.rst`, `pandas.rst`, `performance.rst`, `pivot.rst`, + `print_settings.rst`, `protection.rst`, `styles.rst`, `validation.rst`, + `windows-development.rst`, `worksheet_properties.rst`. +- Also contains Python example source alongside the RSTs (`example.py`, + `filters.py`, `format_merged_cells.py`, `table.py`) and PNG image + assets (`filters.png`, `logo.png`, `table.png`) mixed into the same + flat directory rather than isolated under `_static/` — the `_static/` + dir exists too, so the layout is half-converted. +- `changes.rst`, `read_performance.txt`, `write_performance.txt` are + all in `doc/` — a `txt` with perf numbers committed alongside docs. + +Observation: docx and pptx share the `docs/` name, an `api/` / +`api/enum/` split, a `user/` dir, and a Makefile. xlsx does none of +that. + + + +### 3.4 API reference coverage + +**python-docx — 38 non-enum API pages + 49 enum pages = 87 files.** + +Non-enum API pages (`docs/api/*.rst`): +`accessibility`, `bookmarks`, `captions`, `chart`, `comments`, +`content-controls`, `custom-properties`, `custom-xml`, `dml`, `document`, +`embedded-objects`, `endnotes`, `equations`, `fields`, `font-table`, +`footnotes`, `form-fields`, `glossary`, `ink`, `numbering`, `permissions`, +`ruby`, `search`, `section`, `settings`, `shape`, `shared`, `signatures`, +`smart-art`, `stable-ids`, `statistics`, `style`, `table`, `text`, +`theme`, `toc`, `tracked-changes`, `watermark`, `web-settings`. 
+ +This mirrors almost 1:1 the fork's phase-A/B/C/D feature list. Every +fork-added subsystem has its own reference page. + +Enum pages (49) include every upstream `WD_*` enum plus the fork's +new additions — e.g. `WdAnchorH`, `WdAnchorV`, `WdBorderDisplay`, +`WdBorderOffsetFrom`, `WdBorderStyle`, `WdBreakType`, +`WdBuildingBlockGallery`, `WdBuiltinStyle`, `WdCellVerticalAlignment`, +`WdColorIndex`, `WdDocGridType`, `WdDrawingType`, `WdEndnotePosition`, +`WdFootnotePosition`, `WdFootnoteRestart`, `WdFrameDropCap`, +`WdFrameHAlign`, plus two MSO enums (`MsoColorType`, +`MsoThemeColorIndex`). Both fork-specific (e.g. `WdAnchorH`, +`WdFootnoteRestart`) and inherited enums are documented alongside. + +**python-pptx — 14 non-enum API pages + 36 enum pages = 50 files.** + +Non-enum API pages (`docs/api/*.rst`): +`action`, `chart-data`, `chart`, `comments`, `dml`, `exc`, `image`, +`placeholders`, `presentation`, `shapes`, `slides`, `table`, `text`, +`util`. + +This is substantially smaller than docx's 38 pages and looks closer to +the upstream python-pptx 1.0.x reference tree. Recent Wave-5 fork +additions — animation, sections, transitions, smart-art, tags — do not +yet have dedicated API pages despite having source modules, which +matches the pattern of "Undefined substitution referenced" errors +piling up in the Sphinx build (§3.11). + +Enum pages (36) include the MSO/PP/XL trinity expected by the upstream +python-pptx design — `MsoAutoShapeType`, `MsoAutoSize`, `MsoColorType`, +`MsoConnectorType`, `MsoFillType`, `MsoLanguageId`, `MsoLineDashStyle`, +`MsoLineEndLength`, `MsoLineEndType`, `MsoLineEndWidth`, +`MsoPatternType`, `MsoShapeType`, `MsoTextStrikeType`, +`MsoTextUnderlineType`, `MsoThemeColorIndex`, `MsoVerticalAnchor`, +`PpActionType`, `PpAutoNumberScheme`, `PpMediaType`, plus +`ExcelNumFormat` (unusual — a cross-sibling name appearing in the pptx +enum tree) and others. 
+ +**python-xlsx — no API reference tree at all.** + +No `api/` directory, no `enum/` subtree, no per-class reference pages. +The top-level `.rst` files are all task/topic-oriented (tutorial, +usage, filters, formula, pivot, …) — autodoc usage across them has not +been spot-checked in this audit, but the flat layout means there is no +place dedicated to class-by-class reference. + + + +### 3.5 User-guide coverage + +**python-docx — 38 user-guide pages in `docs/user/`.** + +Full list: `accessibility`, `api-concepts`, `bookmarks`, `captions`, +`charts`, `comments`, `content-controls`, `custom-properties`, +`document-safety`, `documents`, `drawing`, `endnotes`, `equations`, +`fields`, `footnotes`, `form-fields`, `glossary`, `hdrftr`, `install`, +`mail-merge`, `numbering`, `permissions`, `quickstart`, `search`, +`sections`, `sections-advanced`, `shapes`, `statistics`, +`styles-understanding`, `styles-using`, `tables`, `tables-advanced`, +`text`, `text-advanced`, `themes`, `toc`, `track-changes`, `watermarks`. + +Pattern: for the big subsystems (`text`, `tables`, `sections`) there +are both a core page and an `-advanced` page. Near-1:1 parity between +user-guide pages and API reference pages — almost every API area has a +prose companion. + +**python-pptx — 19 user-guide pages in `docs/user/`.** + +Full list: `autoshapes`, `charts`, `comments`, `concepts`, `install`, +`intro`, `math-equations`, `media`, `notes`, `ole-objects`, +`placeholders-understanding`, `placeholders-using`, `presentations`, +`quickstart`, `slides`, `table`, `text`, `understanding-shapes`, +`use-cases`. + +No dedicated page for: animation, sections, transitions, smart-art, +tags, custom properties, extended properties, field manipulation, +accessibility. For a fork that has added substantial new capability +(`pptx.animation`, `pptx.slide.AnimationEffectView`, Wave-5 animation +API, section API #256), the user-guide has not grown correspondingly. 
+ +**python-xlsx — no `user/` dir; 39 flat `.rst` files.** + +The flat layout mixes reference-ish (`comments.rst`, `styles.rst`, +`protection.rst`) with task-oriented (`editing_worksheets.rst`, +`optimized.rst`, `performance.rst`, `read_performance.txt`, +`write_performance.txt`, `windows-development.rst`) and a `charts/` +subdir plus `tutorial.rst` and `usage.rst`. The content is mostly +openpyxl heritage; there are no fork-era pages documenting dynamic +arrays, threaded comments, rich text on cells, sparklines, or modern +chart types (which the README pitches as the fork's main value +proposition). + + + +### 3.6 HISTORY / CHANGELOG + +**python-docx — `HISTORY.rst`, 438 lines.** + +Opens with a bold fork-transition header: + +> `2026.05.0 (unreleased) — first release as independent fork` +> +> This release marks the project's split from upstream +> `python-openxml/python-docx`. Versioning switches to CalVer +> (YYYY.MM.patch) from this point forward. The previous upstream line +> stops at `1.2.0` (2025-06-16); everything below is new to this fork. + +The changelog is organised by **development phase** (`Phase A — +Footnotes and endnotes`, `Phase B — Tracked changes`, `Phase C — +Bookmarks and fields`, `Phase D — Numbering / lists / misc`) with +issue numbers next to each bullet (e.g. `(#1, #3, #17, #46, #48, #56, +#82)`). This is a format unique to docx in the series — it reads like +an engineering phase plan collapsed into a release-note shape. + +**python-pptx — `HISTORY.rst`, 1768 lines.** + +Opens with an `Unreleased` section that reads like an active working +log: refactor notes, issue verifications, deprecation warnings for +`pptx.slide.AnimationEffect → AnimationEffectView`, docs closes, API +resolution notes ("`#357 (customize pie-chart slice colors) resolved +by …`"). Very different tone from docx — granular, incremental, +issue-tracked, written for contributors as much as users. 
No +"first release as independent fork" banner — the fork transition is +not called out anywhere visible at the top. + +**python-xlsx — `doc/changes.rst`, 1376 lines.** + +Starts with `2.5.14 (2019-01-23)` — *this is still the openpyxl 2.5.14 +changelog*. Issue links point at `bitbucket.org/openpyxl/openpyxl` — +Bitbucket URLs for an upstream that moved off Bitbucket. There is no +fork-era release banner, no `2026.05.0`, no CalVer. The fork's own +history appears to live only in git log, not in a changelog file. +There is also no `HISTORY.rst` at project root — `changes.rst` inside +`doc/` is the only changelog surface. + +Observation: docx is "phase-banded CalVer", pptx is "engineering +working-log", xlsx is "upstream snapshot not updated". Three formats, +three audiences. + + + +### 3.7 Contributor docs (CLAUDE.md, CONTRIBUTING) + +**python-docx — `CLAUDE.md`, 166 lines.** + +A compact, code-heavy reference. Opens one-line: "python-docx fork +(loadfix/python-docx) — extending python-docx with footnotes, +endnotes, track changes, fields, bookmarks, and other missing OOXML +capabilities." Immediately acknowledges the sibling series and tells +the reader that when implementing a cross-sibling feature they should +"consult the sibling repos for naming and API-shape precedent". Then +it launches into the three-layer architecture diagram and worked +code snippets — `CT_Footnote`, `FootnotesPart`, `ZeroOrOne` / +`ZeroOrMore` with `successors=(...)`. Testing conventions (Describe, +`it_*`/`its_*`/`they_*`, `cxml.element(...)` snippets). No +`CONTRIBUTING.md`, no `AUTHORS`. + +**python-pptx — `CLAUDE.md`, 256 lines.** + +More prose-oriented. Same "Guidance for Claude Code (and other AI +assistants)" framing as xlsx. Numbered sections (`## 1. Project +summary`). Mentions the sibling series explicitly: "python-pptx is +part of a family of Python libraries for reading/writing Office Open +XML formats. 
Each targets a different Office application but shares +the same design philosophy (lxml-backed, no Office install required, +round-trip fidelity, src-layout + strict tooling)". Lists the two +siblings under "Sibling projects" but names them `scanny/python-docx` +and (presumably) `sciris/...` — **upstream names, not `loadfix/` fork +names**. No `CONTRIBUTING.md`. + +**python-xlsx — `CLAUDE.md`, 334 lines. `CONTRIBUTING.md`, 17 lines.** + +The most extensive CLAUDE.md of the three. Same numbered-sections +format and opening line as pptx. Section 1 is richer than the other +two — describes the double-fork history (`python-xlsx → openpyexcel +→ openpyxl (~2.5.14, circa 2019)`), names remotes (`origin` is +`loadfix/python-xlsx`), identifies descriptor-layer inheritance, +notes the post-fork focus areas, confirms MIT/Expat, `2026.05.0` +CalVer, and runtime deps `jdcal` / `et_xmlfile`. Most detailed of +the three. + +`CONTRIBUTING.md` exists but is a 17-line stub — no counterpart in +docx or pptx. `AUTHORS.md` exists alongside — a format absent from +the others. + +Observation: the three CLAUDE.md files share a family resemblance +(sibling-series acknowledgement, three-layer architecture diagram, +lxml note, Python version note) but disagree on whether the sibling +org is `loadfix/` (docx) or upstream names (pptx). xlsx's is the +longest and most prose-heavy. + + + +### 3.8 Inline / docstring quality + +Measured by two proxies — counts of `versionadded::` directives and +Sphinx's own complaints about docstring content. + +**`versionadded::` directives in `src/`** +- python-docx: **885 occurrences** +- python-pptx: **0 occurrences** +- python-xlsx: **0 occurrences** + +docx is the only project that tracks per-feature addition version in +the docstring. Given the "first release as independent fork" nature of +the 2026.05.0 cut, almost all 885 entries are plausibly tagged +`.. 
versionadded:: 2026.05.0` and will become historically meaningful +after the second release — but the habit is already instrumented. + +**Sphinx's complaints** +- docx: 17 warnings (16 are `Undefined substitution referenced` for + class names, one is an `undefined label: 'wdoutlinelvl'`). The class + substitutions go unresolved because docx does not use an + `rst_epilog` substitution table at all — the few `|AltChunk|`, + `|Attachment|`, `|ExtendedProperties|`, `|DocVars|`, `|ImagePart|`, + `|Image|`, `|StoryPart|`, `|TableCellMargins|`, `|IndexError|`, + `|FloatingImage|` references are ad-hoc and don't have definitions. +- pptx: 140 warnings, of which 127 are `Undefined substitution + referenced` caused by the `rst_epilog` substitution table in + `conf.py` going stale relative to class renames and new additions + (big offenders: `|ErrorBars|` ×26, `|AnimationEffect|` ×11, + `|ErrorBarType|` ×9, `|ErrorBarInclude|` ×9, `|ErrorBarDirection|` + ×9, `|PathGeometry|` ×8, `|Path|` ×8). Non-substitution warnings: 4 + `Title underline too short`, 2 `Explicit markup ends without a blank + line`, 1 `document isn't included in any toctree`, 1 `Literal block + ends without a blank line`, 1 `Malformed table`. +- xlsx: build fails before docstrings are ever read, so docstring + quality cannot be sampled this way. + +**napoleon-compatibility evidence (docx only)** + +docx's `conf.py` explicitly annotates — in prose — that ~11 files +reference OOXML element names using single-backtick inline code +(e.g. `\`w:moveFrom\``) and that this would produce ~42 bogus +docutils warnings without the pre-napoleon/post-napoleon hook. This +is a strong indicator that docx has been through at least one +deliberate docstring-cleanup pass; pptx has no equivalent hook and +no equivalent cleanup. 
+ + + +### 3.9 Spec / reference material + +**python-docx — `spec/` at project root.** +Contains the ISO/IEC 29500 parts 1–4 as PDFs (`ISO-IEC-29500-1.pdf` +through `ISO-IEC-29500-4.pdf`), an `xsd/` tree (presumably the XML +schema definitions referenced during element implementation), an +`rnc/` tree (Relax-NG-compact equivalents), and a `styles.xml` +reference file. The spec tree is not exposed through Sphinx — it's a +developer-side resource checked in alongside source. + +**python-pptx — `spec/` at project root.** +Organised one level deeper: `ISO-IEC-29500-1/`, `ISO-IEC-29500-2/`, +`ISO-IEC-29500-3/`, `ISO-IEC-29500-4/`, plus `gen_spec/`. The per-part +directories likely contain the PDFs plus extracted/searchable +artifacts — richer than docx's flat PDFs. `gen_spec/` suggests tooling +for regenerating spec-derived test fixtures or docs, which docx does +not appear to have. + +**python-xlsx — no `spec/` at project root.** +No ISO-IEC-29500 PDFs, no schema tree, no RNC tree checked in. The +project carries its OOXML-aware descriptor layer (`src/xlsx/descriptors/`) +but not the normative spec alongside it. Given the fork targets Excel +365 features that are ECMA-376-second-edition-plus, the absence of the +spec tree is conspicuous. + + + +### 3.10 FEATURES.md-equivalent + +**python-docx — `FEATURES.md`, 1791 lines (81 KB).** +Plus a companion `FEATURES_AUDIT.md`, 41 KB. These two files between +them appear to be the authoritative inventory of what the fork has +delivered (`FEATURES.md`) and what's been validated against upstream +intent (`FEATURES_AUDIT.md`). This is a documentation surface docx has +uniquely invested in. 
+ +**python-pptx — no FEATURES.md.** +The closest equivalent is the `Feature Support` list inside +`docs/index.rst`, which is inherited from upstream: round-trip PPTX, +add slides, populate text placeholders, add image, add textbox, add +table, add auto shapes, toggle bullet formatting, add/manipulate +column/bar/line/pie charts, discover 2016+ extended charts and +preserve on round-trip, core document properties, header/footer/slide +number/date placeholder toggles, etc. The list does not advertise +fork-specific work (animation API, section API, transitions, smart-art, +tags, threaded comments). + +**python-xlsx — no FEATURES.md, but `TODO.md` (582 lines).** +Role is different from docx's `FEATURES.md`: it is forward-looking +(what remains to do), not retrospective (what has been delivered). As +such it plays the part of a project board rather than a feature +inventory. The README's "extended with ... dynamic arrays and spill +semantics, threaded comments, rich text on cells, sparklines, modern +chart types..." pitch is the de-facto feature inventory for the fork. + +Observation: only docx has a retrospective feature inventory, and it +has two (the main file + an audit). This is one of the biggest +documentation-surface divergences in the series. + + + +### 3.11 Build health + +**python-docx — build succeeds, 17 warnings.** + +From `/tmp/sphinx-docx.log`: `build succeeded, 17 warnings.` + +The warning shape (per the partial log sampled and the full sphinx log): +- 16 `ERROR: Undefined substitution referenced` for class names used + with `|Name|` syntax that has no substitution definition — the + offenders are `|AltChunk|` (×2), `|Attachment|`, `|ExtendedProperties|`, + `|DocVars|`, `|ImagePart|` (multiple), `|Image|`, `|StoryPart|` + (×2), `|TableCellMargins|` (×2), `|IndexError|`, and a handful + more. All in docstrings in `src/docx/document.py`, `settings.py`, + `shape.py`, `table.py`. 
+- 1 `WARNING: undefined label: 'wdoutlinelvl'` in + `src/docx/text/parfmt.py` — a dangling `:ref:` target. +- 1 deprecation hint (apparently not counted among the 17) about `intersphinx_mapping` format (though + `conf.py` already uses the new form — this appears to be a false + positive from Sphinx 7 detection). + +All 17 are cosmetic / dangling — no broken autodoc, no missing +modules, no malformed tables. + +**python-pptx — build succeeds, 140 warnings.** + +From `/tmp/sphinx-pptx.log`: `build succeeded, 140 warnings.` + +The warning shape is dominated by the `rst_epilog` drift described in +§3.2: +- 127 `ERROR: Undefined substitution referenced` — top offenders are + `|ErrorBars|` (26), `|AnimationEffect|` (11), + `|XL_ERROR_BAR_TYPE|` (9), `|XL_ERROR_BAR_INCLUDE|` (9), + `|XL_ERROR_BAR_DIRECTION|` (9), `|PathGeometry|` (8), `|Path|` (8), + `|Sound|` (6), `|Section|` (5), `|Comments|` (4), `|Audio|` (4), + `|_HeaderFooter|` (3), `|Transition|` (3), + `|ConnectorAdjustmentCollection|` (3), `|SmartArt|` (2), + `|PROG_ID|` (2), `|CommentAuthors|` (2), `|CommentAuthor|` (2), + `|Comment|` (2), and a long tail of singletons + (`|_Field|`, `|XL_CROSS_BETWEEN|`, `|XL_AXIS_POSITION|`, + `|TagsPart|`, `|SlideTags|`, `|Sections|`, `|Movie|`, `|MediaPart|`, + `|LinePlot|`, `|ExtendedPropertiesPart|`, `|CustomProperties|`, + `|AnimationEffectView|`, `|bool|`). +- 4 `Title underline too short` warnings. +- 2 `Explicit markup ends without a blank line; unexpected unindent`. +- 1 `document isn't included in any toctree`. +- 1 `Literal block ends without a blank line; unexpected unindent`. +- 1 `Malformed table`. + +Every one of those substitution errors is a visible render defect in +the HTML output (literal `|ClassName|` will appear in prose where a +cross-reference should be). 
+ +**python-xlsx — build fails.** + +Error text (from `/tmp/sphinx-xlsx.log`): + +``` +Running Sphinx v7.4.7 + +Configuration error: +There is a programmable error in your configuration file: + +Traceback (most recent call last): + File "/tmp/sphinx-venv-xlsx/lib/python3.14/site-packages/sphinx/config.py", + line 529, in eval_config_file + exec(code, namespace) + File "/home/ben/code/python-xlsx/doc/conf.py", line 25, in + import openpyxl +ModuleNotFoundError: No module named 'openpyxl' +``` + +Root cause: the Sphinx config still expects an `openpyxl` top-level +package, but the fork renamed the package. `src/xlsx/` and +`openpyexcel/` coexist at project root; neither is named `openpyxl`. +The `conf.py` reads `release = openpyxl.__version__` and uses +`openpyxl.__author__` in its `copyright`, so even if the import were +shimmed, multiple downstream lines would need to follow. + +No partial build output is produced — the tool exits at configuration +stage, before source files are read. That means no HTML landing page, +no rendered tutorials, no published reference. The README's claim +that the project is installable and usable is plausibly independent +of this — the Sphinx docs surface is simply broken. + +Secondary risk: there is no `requirements-docs.txt`, so there is no +pinned way to reproduce this build reliably even once the import is +fixed. + + + +### 3.12 Code samples + +**python-docx.** The README has no inline code block surfaced in the +first 20 lines sampled. The Sphinx `docs/index.rst` landing page, +however, carries an embedded quickstart snippet inside a two-column +layout — an example image on the left, Python code on the right. 
The +code goes beyond upstream's quickstart by adding fork-only calls: + +```python +# -- fork feature: attach a footnote to a run -- +document.footnotes.add(p.runs[0], 'Footnote body text.') + +# -- fork feature: attach a comment to a range of runs -- +document.add_comment( + runs=p.runs, + text='A reviewer comment.', + author='Editor', + ... +) +``` + +The code-sample strategy here is *lead with fork-value immediately on +the landing page*. + +**python-pptx.** `docs/index.rst` relies on `include:: ../README.rst` +for its intro — so the Sphinx landing page is essentially the 25-line +upstream README. A `Feature Support` bullet list (inherited from +upstream) gives functional scope but no `.py` sample. There is a +`lab/` directory at project root, suggesting experimentation fixtures; +not examined in this audit. + +**python-xlsx.** `doc/` contains working `.py` files alongside the +`.rst` documents — `example.py`, `filters.py`, `format_merged_cells.py`, +`table.py`. These are presumably referenced from the corresponding +`.rst` pages via `.. literalinclude::` or similar. This is a different +and arguably more honest approach (the example source is executable +and linted), but it puts `.py` files in a `doc/` tree — architecturally +unusual. + + + +## 4. Divergences worth aligning + +The following are observed differences across the three projects that +stand out as *avoidable* — places where the siblings disagree on +format or presence of a surface without any obvious per-project reason. + +1. **`docs/` vs `doc/`.** docx and pptx use `docs/` (plural). xlsx + uses `doc/` (singular). One of the two conventions is inherited + from openpyxl's 2019-era layout; the other is from the + docx/pptx upstream lineage. + +2. **README format.** docx and xlsx use `README.md`. pptx uses + `README.rst`. Consequence: pptx's `docs/index.rst` can and does + `include:: ../README.rst`; docx's `index.rst` cannot do the same + trivially. + +3. 
**LICENSE filename.** docx and pptx use `LICENSE`. xlsx uses + `LICENCE.md` (British spelling + `.md` extension). Packaging + tools that look for `LICENSE` or `LICENSE.*` may miss it. + +4. **Sphinx stack generation.** docx is on `Sphinx>=6,<8` + `furo`. + pptx is pinned at `Sphinx==1.8.6` + `alabaster<0.7.14` + `armstrong`. + xlsx's `conf.py` was written for an openpyxl-era Sphinx (pre-RTD). + Three generations of Sphinx in one "series". + +5. **`requirements-docs.txt` presence.** docx and pptx have one. xlsx + does not. This is what prevented reproducing the xlsx build + reliably. + +6. **`versionadded::` usage.** docx has 885 occurrences in `src/`. + pptx and xlsx have zero. Only docx will be able to auto-render a + "new in 2026.05" badge in its docs. + +7. **Fork banner in README.** docx and xlsx both lead with "A Python + library for reading, creating, and updating Microsoft 2007+ + files." followed by a "Based on ... Forked at / from ... + Unstable. Not yet published to PyPI. Current version: 2026.05.0" + block. pptx's `README.rst` retains the upstream opening + ("*python-pptx* is a Python library for creating, reading, and + updating PowerPoint (.pptx) files.") and never mentions the fork. + +8. **Fork-era changelog.** docx `HISTORY.rst` starts with a "first + release as independent fork" banner. pptx `HISTORY.rst` starts + with an `Unreleased` section that reads as a contributor log and + has no fork-transition marker. xlsx's `doc/changes.rst` starts + at `2.5.14 (2019-01-23)` — it's the unmodified openpyxl log. + +9. **Sibling-org names.** docx's `CLAUDE.md` names the sibling org + as `loadfix/python-docx`, `loadfix/python-pptx`, + `loadfix/python-xlsx`. pptx's `CLAUDE.md` names them with + upstream maintainers (`scanny/python-docx`, etc.). The three + CLAUDE.md files do not agree on what this series is called. + +10. **`api/` and `api/enum/` subtrees.** docx and pptx both split + reference docs into `api/` + `api/enum/`. 
xlsx has no such + split — all 39 `.rst` files are flat. There is no class-by-class + reference surface for xlsx. + +11. **`FEATURES.md`.** Only docx has one. For a series whose pitch is + "these forks add N new features", the absence of retrospective + feature inventories on two of the three projects matters. + +12. **`spec/` presence.** docx and pptx both check in ISO-IEC-29500 + spec PDFs alongside source. xlsx does not. For a project whose + value proposition is fidelity to Excel-365 OOXML parts, this is + surprising. + +13. **Napoleon docstring handling.** Only docx has configured + Napoleon (and even installed a hook to tame it against OOXML + colon-in-backtick attribute docstrings). pptx uses autodoc + without Napoleon. xlsx uses autodoc without Napoleon. Docstring + conventions in the codebase are therefore different. + +14. **Build health.** docx: 17 cosmetic warnings. pptx: 140 mostly- + substitution errors (127 of them visible as literal `|Name|` in + rendered HTML). xlsx: does not build at all. + +15. **CONTRIBUTING.md / AUTHORS.md.** xlsx has both. docx and pptx + have neither. (xlsx's `CONTRIBUTING.md` is a 17-line stub; the + presence is the point, not the content depth.) + + + +## 5. Conventions that should be identical across the series + +These are the things that the audit found are *already* identical in +at least two of the three projects, and which therefore implicitly +define the intended series-wide convention. + +- **Top-level layout skeleton:** `src/<pkg>/`, `tests/`, `features/` + (behave acceptance), `spec/`, `docs/`, `CLAUDE.md`, `HISTORY.rst`, + `LICENSE`, `MANIFEST.in`, `Makefile`, `pyproject.toml`, `tox.ini`, + `requirements-*.txt`. docx and pptx match this in full. xlsx + diverges on `doc/` (not `docs/`), `LICENCE.md` (not `LICENSE`), + absence of `spec/`, absence of `features/`, presence of legacy + `openpyexcel/` directory alongside `src/xlsx/`, and `pytest.ini` + + `setup.cfg` + `setup.py` living alongside a nominally modern layout. 
+ +- **`docs/` structure:** `docs/index.rst`, `docs/conf.py`, + `docs/Makefile`, `docs/_static/`, `docs/api/`, `docs/api/enum/`, + `docs/user/`, `docs/dev/`. docx and pptx match. xlsx does not have + any of these subdirs. + +- **API-reference organisation:** one `.rst` per proxy class/topic + under `docs/api/`, one `.rst` per enum under `docs/api/enum/`, + filename matches class name (docx: `WdAnchorH.rst`; pptx: + `MsoAutoShapeType.rst`). xlsx absent. + +- **User-guide page topic naming:** both docx and pptx have + `quickstart`, `install`, `charts`, `comments`, `text`, `table` + (sometimes `tables`). docx additionally splits `-advanced` for the + larger sections. xlsx has `tutorial.rst` / `usage.rst` instead — + different convention, no "quickstart" page. + +- **Sphinx autodoc + intersphinx:** all three enable `autodoc` and + `viewcode`. docx adds `intersphinx` and `napoleon`; pptx adds + `intersphinx`, `doctest`, `coverage`, `inheritance_diagram`, + `todo`, `ifconfig`; xlsx adds `doctest`, `coverage`, `ifconfig`. + The minimum common set is `autodoc` + `viewcode`. + +- **CalVer version `2026.05.0`:** docx and xlsx. pptx: not confirmed + by the audit — pptx `HISTORY.rst` opens with `Unreleased` without a + CalVer label, so either pptx is still on semver or has not yet cut + the first CalVer tag. + +- **Language framing of the CLAUDE.md intro:** "Guidance for Claude + Code (and other AI assistants) working in this repository" — + pptx and xlsx open identically. docx opens differently + ("python-docx fork (loadfix/python-docx) — extending python-docx + with ..."). If docx is the "canonical" one in terms of fork + maturity, its CLAUDE.md intro is the odd one out stylistically. + +- **Three-layer architecture diagram in CLAUDE.md:** docx, pptx, and + xlsx all describe their architecture as a three-layer stack + (Document API / Parts Layer / oxml Layer over lxml). + + + +## 6. 
What this repo (python-docx) should consider + +Observations specific to this repo (no prescription of action — just +what the audit noticed). + +- The 17 Sphinx warnings are all cosmetic (dangling `|Name|` + substitutions + one stale `:ref:` target, `wdoutlinelvl`). They are + the smallest warning count in the series and individually small, + but they are visible in rendered HTML as literal `|AltChunk|`, + `|Attachment|`, `|ImagePart|` text. Fixing them would bring the + build to zero-warning, leaving pptx as the lone warning + carrier. + +- `docs/conf.py` includes an 80-line prose explanation of the + pre-/post-napoleon hook. The explanation is thorough and worth + preserving; it also makes this `conf.py` the obvious template for + porting to pptx if/when pptx moves off Sphinx 1.8.6. + +- `CLAUDE.md` is 166 lines — the shortest of the three, by a large + margin. It is also the only one that opens with fork narrative + rather than the "Guidance for Claude Code" boilerplate. If the + series aims for consistent AI-assistant entry points, this repo's + opening section is stylistically distinct. + +- `FEATURES.md` (1791 lines) and `FEATURES_AUDIT.md` (41 KB) are + unique to this repo. They represent substantial curation effort; + the audit file suggests a second round of self-verification has + already been done. + +- No `CONTRIBUTING.md` and no `AUTHORS` file at project root. xlsx + has both (even if stub-sized). For an "independent fork" status, + the lack of a contribute-here doorway is a notable surface gap. + +- `docs/user/` has 38 pages and covers almost every fork-era + subsystem in prose — the most complete user guide in the series. + +- `docs/dev/` has only `analysis/` underneath it. pptx's `docs/dev/` + has `development_practices.rst`, `philosophy.rst`, `security.rst`, + `xmlchemy.rst` in addition to `analysis/`. These documents describe + the *how and why* of contributing — surface absent here. 
+ +- The README is 131 lines of Markdown, fork-focused. It does not + appear to point readers at the Sphinx docs (no "Full docs at + readthedocs/..." line visible in the first 20 lines). The Sphinx + build is unpublished as far as the audit could tell. + + + +## 7. What the series as a whole should consider + +Observations about the three-project ensemble, not any one member. + +- **There is not yet a single series-level landing page** that names + all three projects and routes users to the right one. docx's + `CLAUDE.md` names the series `loadfix/python-{docx,pptx,xlsx}` but + pptx's `CLAUDE.md` names the siblings with upstream-maintainer + prefixes (`scanny/...`, `sciris/...`). A reader arriving from + search has no way to discover the other two projects from the + README of any one of them. + +- **Three different Sphinx theme choices** (`furo`, `armstrong`, + `nature`) across three projects with identical intended audience. + The rendered HTML will look unrelated. + +- **Three different changelog formats** — phase-banded CalVer (docx), + engineering-working-log with `Unreleased` section (pptx), + openpyxl-2019-snapshot (xlsx). A user tracking across the three + sees three entirely different release narratives. + +- **Three different fork-transition postures** in the READMEs — + explicit "first release as independent fork" banner (docx, xlsx + with nearly identical wording) vs. no fork mention whatsoever + (pptx). + +- **Spec material included in docx and pptx, absent in xlsx.** The + ISO-IEC-29500 PDFs plus `xsd/`, `rnc/` trees anchor the OOXML + implementation in both existing `spec/` dirs; xlsx's work on Excel + 365 features is done without this anchor. + +- **`versionadded::` annotations are docx-only (885 of them).** The + two siblings cannot render "new in 2026.05.0" badges because no + directives exist in their source. Cross-sibling consistency on + fork-era version tagging would require pptx and xlsx to backfill. 
+ +- **Build health is uneven.** docx 17 warnings (mostly cosmetic), + pptx 140 warnings (mostly substitution drift, visible in HTML), + xlsx build fails at `conf.py`. For a reader trying to tell whether + a given library is production-grade, the rendered-docs quality is + the first contact point, and the three contact points are in three + different states. + +- **Enum documentation convention is present in docx and pptx + (one-file-per-enum under `api/enum/`) but absent in xlsx.** The + convention is consistent between the two that use it. + +- **Behave acceptance tests** (`features/` dir) are in docx and pptx. + xlsx has no `features/` dir — it uses pytest-only, though `tests/` + does exist. Testing philosophy is not uniform, and that shows up + indirectly in documentation (what kind of examples the project + treats as normative). + +- **`CLAUDE.md` line counts** — 166 (docx), 256 (pptx), 334 (xlsx). + xlsx's is the richest, docx's is the terse "shape-only" version. + For AI agents bouncing between the three repos, the guidance-depth + is not symmetric. + +- **Documentation-as-source (executable `.py` examples in `doc/`)** + is an xlsx-only convention; the other two put examples as + RST-inline snippets. This is a cross-sibling style divergence at + the smallest scale. + +- **Each project's docstring substitution strategy differs.** docx + writes inline class references directly (`` :class:`.Name` ``) and + gets substitution drift only in a handful of places (~17). pptx + relies on `rst_epilog` in `conf.py` declaring `|ClassName|` style + shortcuts, and that list has gone substantially stale relative to + the code (~127 undefined). xlsx does neither because it does not + build. The three projects have three different docstring styles as + a side effect. 
+ diff --git a/audits/FEATURES_AUDIT.md b/audits/FEATURES_AUDIT.md new file mode 100644 index 000000000..7842b2de3 --- /dev/null +++ b/audits/FEATURES_AUDIT.md @@ -0,0 +1,769 @@ +# Behave Acceptance-Tests Audit (`features/`) + +This report surveys the state of the `features/` behave acceptance-tests suite +on the `loadfix/python-docx` fork with three aims: + +1. Document what the suite covers today. +2. Map every shipped fork-era feature to "has behave coverage?" / "no behave + coverage". +3. Propose prioritised follow-ups. + +All measurements were taken at commit `50c2078` (`master`, 2026-05-01). + +--- + +## 1. Summary + +The project uses **behave** (Gherkin BDD) as its acceptance-test framework, +living entirely under `features/`. The configuration is minimal: there is no +`behave.ini`, no `.behaverc`, no tags, and no wiring into CI; contributors run +it locally with `uv run behave features/`. + +- **67 `.feature` files** (2570 lines total) +- **239 `Scenario`/`Scenario Outline` blocks** in the source files, expanding to + **650 scenarios** at run-time (the outlines produce 411 additional rows from + `Examples` tables) +- **22 step-definition modules** under `features/steps/` (4103 lines) +- **53 fixture files** under `features/steps/test_files/` (`.docx`, `.png`, + `.jpg`, `.jpeg`, `.tif`, `.bmp`, `.gif`) +- **1856 steps** executed end-to-end in ~2.0 s + +**The suite has not been meaningfully extended in this fork.** The five most +recent commits touching `features/` are: + +| SHA | Subject | Date | +|---|---|---| +| `874c1d5` | fix: ensure run.add_picture() produces Word-compatible inline images (#31) (#78) | 2026-04-05 | +| `a809d6c` | comments: add Comment.text | 2025-06-09 | +| `66da522` | xfail: acceptance test for Document.add_comment() | 2025-06-09 | +| `761f4cc` | comments: add Comment.author, .initials setters | 2025-06-09 | +| `8ac9fc4` | comments: add Comments.add_comment() | 2025-06-09 | + +The only topic area given fresh behave coverage in this 
fork is **comments** +(`features/cmt-mutations.feature`, `features/cmt-props.feature`, +`features/doc-add-comment.feature`, `features/doc-comments.feature` — 18 +scenarios). Everything else in `features/` pre-dates the fork. + +Approximately **55 Microsoft-Word features** have shipped in this fork since +the June 2025 comments work landed. **None** of them have acceptance coverage. +The behave suite as it stands describes only the upstream API surface; the +loadfix extensions (footnotes, endnotes, bookmarks, fields, tracked changes, +content controls, charts, etc.) are exclusively covered by pytest units in +`tests/`. + +--- + +## 2. Layout + +``` +features/ +├── *.feature # 67 Gherkin spec files +├── environment.py # behave hooks (before_all only) +├── _scratch/ # run output (gitignored) +└── steps/ + ├── *.py # 22 step-definition modules + └── test_files/ # 53 fixture files (.docx + image files) +``` + +### `features/environment.py` (10 lines) + +Only `before_all(context)` is defined: creates `features/_scratch/` if it does +not exist. There is no `after_scenario` cleanup, no tags wiring, no shared +setup. Adding new features that need per-scenario state teardown will require +enlarging this file. + +### `features/_scratch/` + +Not tracked. `.gitignore` has a line `_scratch/` that correctly covers both +`features/_scratch/` and any other `_scratch/` directory. At the time of +writing `features/_scratch/test_out.docx` exists locally (leftover from a run) +but is correctly ignored. **No hygiene action needed.** + +### Step modules (`features/steps/*.py`) + +22 modules, 4103 lines total. 
Listed in roughly descending order of size with a one-line scope +summary: + +| Module | Lines | Feature areas | +|---|---:|---| +| `table.py` | 558 | Tables, rows, columns, cells — spans, props, add/access | +| `styles.py` | 548 | Style access, add/delete, latent styles, style props | +| `text.py` | 322 | Run properties (breaks, char style, inner content) and add-picture | +| `comments.py` | 284 | Comments API (`cmt-*`, `doc-comments`, `doc-add-comment`) | +| `section.py` | 265 | Section iteration, `sct-*.feature`, odd/first-page header/footer | +| `paragraph.py` | 256 | Paragraph access, inner content, set-text, insert, style | +| `document.py` | 259 | `Document.add_*`, `Document.sections/styles/inline_shapes/tables` | +| `font.py` | 227 | Font property matrix (colour, highlight, bold, italic, etc.) | +| `parfmt.py` | 210 | `ParagraphFormat` on/off props, line spacing, alignment | +| `shape.py` | 151 | InlineShape access and size | +| `tabstops.py` | 141 | Tab-stop collection and props | +| `pagebreak.py` | 135 | Rendered page-break splitting (`pbk-split-para`) | +| `hdrftr.py` | 134 | Header/footer iteration, linked-to-previous | +| `coreprops.py` | 117 | `CoreProperties` read/write (title, author, created, etc.) | +| `hyperlink.py` | 116 | Hyperlink properties and fragments | +| `block.py` | 100 | `BlockItemContainer` iteration (`blk-*`) | +| `image.py` | 74 | Pure image-file characterisation (dimensions / DPI / MIME) | +| `api.py` | 59 | `docx.Document` open/save API | +| `settings.py` | 55 | `Settings` object read access | +| `helpers.py` | 34 | `test_docx()` / `test_file()` path helpers + `bool_vals` maps | +| `numbering.py` | 32 | Only one Given for accessing `document.part.numbering_part` | +| `shared.py` | 26 | Trivial `Given a blank document` / `Given a document` impl | + +Two modules are effectively shim-only: `numbering.py` (32 lines, one step) and +`shared.py` (26 lines). 
+ +### `features/steps/test_files/` + +53 fixtures: +- **40 `.docx` files** used by `test_docx(name)` in step modules — one file per + "preset scenario state" (e.g. `tbl-2x2-table.docx`, `par-known-styles.docx`) +- **13 image files** (`.jpg`, `.jpeg`, `.png`, `.tif`, `.bmp`, `.gif`) used by + `img-characterize-image.feature` and a few `run.add_picture` steps + +One fixture (`doc-odd-even-hdrs.docx`) is tracked but not referenced from any +step module or feature — see §6. + +--- + +## 3. Current coverage + +### 3.1 Scenarios per step-module domain + +Counted at the source-file level (before `Scenario Outline` expansion). Feature +files are mapped to step modules by filename prefix / content. + +| Step module | Feature files covered | Scenarios | +|---|---|---:| +| `table.py` | tbl-*.feature (11 files) | 36 | +| `styles.py` | sty-*.feature (8 files) | 33 | +| `document.py` | doc-access-*, doc-add-*, doc-styles (9 files) | 24 | +| `font.py` | txt-font-*, run-access-font, sty-access-font (4 files) | 21 | +| `comments.py` | cmt-*, doc-add-comment, doc-comments (4 files) | 18 | +| `section.py` | sct-section (1 file) | 17 | +| `text.py` | run-access-inner-content, run-add-*, run-char-style, run-clear-run, run-enum-props, txt-add-break (7 files) | 15 | +| `parfmt.py` | par-access-parfmt, txt-parfmt-props (2 files) | 13 | +| `paragraph.py` | par-access-*, par-add-run, par-*-prop, par-clear, par-insert, par-set-text (7 files) | 13 | +| `tabstops.py` | tab-access-tabs, tab-tabstop-props | 11 | +| `hdrftr.py` | hdr-header-footer | 10 | +| `block.py` | blk-* (3 files) | 7 | +| `hyperlink.py` | hlk-props | 6 | +| `shape.py` | shp-inline-shape-access, shp-inline-shape-size | 4 | +| `pagebreak.py` | pbk-split-para | 4 | +| `coreprops.py` | doc-coreprops | 3 | +| `settings.py` | doc-settings | 3 | +| `api.py` | api-open-document | 2 | +| `numbering.py` | num-access-numbering-part | 1 | +| `image.py` | img-characterize-image (1 scenario outline × 11 rows) | 1 | + +### 3.2 
Scenario-type split + +- `Scenario:` blocks: **115** +- `Scenario Outline:` blocks: **124** +- Total source blocks: **239** +- Runtime expansion: **650** scenarios (outlines contribute 535; each outline + has 1..N Examples rows, averaging ~4.3) + +Heavy use of `Scenario Outline` + `Examples` keeps the feature files compact +and makes them look smaller on paper than they are at runtime. + +--- + +## 4. Build health + +``` +$ uv run behave features/ 2>&1 | tail -5 +67 features passed, 0 failed, 0 skipped +650 scenarios passed, 0 failed, 0 skipped +1856 steps passed, 0 failed, 0 skipped, 0 undefined +Took 0m2.010s +``` + +- **0 failed / 0 skipped / 0 undefined.** Clean green. +- **Run time**: 2.0 s. Fast enough that adding dozens of new feature files + would not materially hurt the developer loop. +- **No tags in use.** `grep -rE "^\s*@" features/*.feature` produces no + output — there are no `@xfail`, `@wip`, `@slow`, or topical tags anywhere in + the suite. +- **No undefined / pending steps.** Every Gherkin phrase in the suite resolves + to exactly one step implementation. +- **No xfail / skip markers.** All `xfail:` commits from June 2025 were + resolved by the follow-up implementation commits the same day. + +The suite has zero observable warts — but its size has not kept pace with the +production code, which is the problem this audit exists to quantify. + +--- + +## 5. Coverage gaps — feature-by-feature matrix + +This section walks every shipped fork-era issue (as labelled `word-feature-gap` +or `phase-*` on GitHub) and reports whether behave coverage exists. **Most +entries are "No".** Issue numbers, commit SHAs, and suggested `.feature` +filenames follow. + +Column key: +- **Existing .feature?** — answer from keyword-grep across `features/*.feature` +- **Suggested filename** — 3-letter prefix + kebab-case, matching the + repository's established naming convention (`cmt-`, `hlk-`, `par-`, etc.) 
+- **Effort**: **S** = reuse existing fixtures/steps, **M** = one new `.docx` + fixture + one new step module or extension, **L** = multiple fixtures or + complex setup (e.g. numeric data for charts, mail-merge data source) + +### 5.1 Phase A — Footnotes / endnotes + +| # | Title | Commit | Existing .feature? | Suggested | Effort | +|---:|---|---|:---:|---|:---:| +| 1 | Phase A.1: Footnotes Part class and relationship management | `5d0178e` | No | (covered by A.2/A.3/A.4) | — | +| 2 | Phase A.2: High-level footnotes API — document.footnotes.add() | `d589c71` | No | `fnt-add-footnote.feature` | M | +| 3 | Phase A.3: Read and iterate existing footnotes | `904ccf4` | No | `fnt-read-footnotes.feature` | M | +| 4 | Phase A.4: Delete and modify footnotes | `86dcafa` | No | `fnt-mutate-footnotes.feature` | M | +| 5 | Phase A.5: Endnotes support (mirror footnotes API) | `9390293` | No | `end-*.feature` (mirror `fnt-*`) | M | +| 17 | Phase A.6: Footnote and endnote properties (numbering, restart, position) | `8bf6011` | No | `fnt-numbering-props.feature` | M | + +Sources: `src/docx/footnotes.py`, `src/docx/oxml/footnotes.py`, +`src/docx/parts/footnotes.py`, `src/docx/endnotes.py`, +`src/docx/parts/endnotes.py`. Grep of `features/` for `footnote`, `endnote` +returns **zero hits**. + +Scenarios would cover: `Document.footnotes.add()`, `Footnote.text` read, +`Footnote.delete()`, `Footnotes.__iter__`, restart-numbering + format +attributes on the footnote properties element. One shared `fnt-has-footnotes.docx` +fixture could serve all of A.2/A.3/A.4/A.6. + +### 5.2 Phase B — Track changes + +| # | Title | Commit | Existing .feature? 
| Suggested | Effort | +|---:|---|---|:---:|---|:---:| +| 6 | Phase B.1: Read tracked insertions and deletions | `caff0e6` | No | `trk-read-ins-del.feature` | M | +| 7 | Phase B.2: Accept and reject tracked changes | `25c5951` | No | `trk-accept-reject.feature` | L | +| 8 | Phase B.3: Read formatting track changes (rPrChange, pPrChange, sectPrChange) | `1e7d64a` | No | `trk-format-changes.feature` | M | +| 134 | Move revisions (`w:moveFrom`, `w:moveTo`) | `aef523c` | No | `trk-move-revisions.feature` | M | +| 135 | Cell and row-level tracked changes (`w:cellIns`, `w:cellDel`, `w:trPrChange`, `w:tcPrChange`) | `dfa7daf` | No | `trk-table-changes.feature` | M | +| 136 | Revision IDs (`w:rsid`, `w:rsidRoot`) | `28c05dc` | No | `trk-rsid.feature` | S | +| 163 | Revision marks viewer mode (`revision_marks_text()`) | `18ca8af` | No | `trk-marks-text.feature` | S | + +Sources: `src/docx/tracked_changes.py`, `src/docx/oxml/tracked_changes.py`. +Grep of `features/` for `track`, `tracked`, `revision`, `w:ins`, `w:del` +returns **zero hits**. + +Accept/reject (#7) is an **L** because it deserves a full matrix of +scenarios — insertion vs deletion, inside-run vs whole-run vs whole-paragraph, +accept-all vs reject-all vs specific-revision — and needs distinct "before" +and "after" fixture pairs to assert round-tripping. + +### 5.3 Phase C — Fields / bookmarks / cross-references + +| # | Title | Commit | Existing .feature? 
| Suggested | Effort | +|---:|---|---|:---:|---|:---:| +| 9 | Phase C.1: Bookmarks — create, read, delete | `19db82e` | No (only "linkedBookmark" in hyperlink fixture) | `bmk-create-read.feature` | M | +| 10 | Phase C.2: Simple and complex field codes | `c708ad5` | No | `fld-simple.feature`, `fld-complex.feature` | M | +| 115 | Cross-references (`REF`/`PAGEREF` resolution) | `42e76f5` | No | `fld-cross-ref.feature` | M | +| 116 | Table of Contents generation | `cdf178c` | No | `toc-generate.feature` | M | + +Sources: `src/docx/bookmarks.py`, `src/docx/fields.py`, +`src/docx/oxml/fields.py`, `src/docx/toc.py`. Grep of `features/` for +`bookmark` finds only an example-table value (`linkedBookmark`) inside +`hlk-props.feature`, not bookmark coverage. + +### 5.4 Content controls + custom XML + +| # | Title | Commit | Existing .feature? | Suggested | Effort | +|---:|---|---|:---:|---|:---:| +| 27 | Phase D.14: Content controls (structured document tags) | `fda39ef` | No | `sdt-content-controls.feature` | M | +| 131 | Custom XML data binding (`w:dataBinding`) | `c079126` | No | `sdt-data-binding.feature` | M | + +Sources: `src/docx/content_controls.py`, `src/docx/oxml/content_controls.py`, +`src/docx/parts/custom_xml.py`. Grep of `features/` for `content.control`, +`sdt`, `custom_xml` returns **zero hits**. + +### 5.5 Custom properties + +| # | Title | Commit | Existing .feature? | Suggested | Effort | +|---:|---|---|:---:|---|:---:| +| 14 | Phase D.4: Custom document properties | `adc0485` | No | `doc-customprops.feature` | S | + +Sources: `src/docx/custom_properties.py`, +`src/docx/parts/custom_properties.py`. Existing `doc-coreprops.feature` +(3 scenarios) is the closest analogue; a sibling `doc-customprops.feature` with +typed-value round-trip scenarios would fit alongside it. The existing +`coreprops.py` step module is a natural home for the new steps — hence **S**. + +### 5.6 Numbering + +| # | Title | Commit | Existing .feature? 
| Suggested | Effort | +|---:|---|---|:---:|---|:---:| +| 22 | Phase D.9: Numbering style control (restart, custom lists, nested lists) | `738258a` | No (only `num-access-numbering-part.feature`, 1 scenario, read-only) | `num-define-and-apply.feature` | M | + +Sources: `src/docx/numbering.py`, `src/docx/oxml/numbering.py`. The existing +`num-access-numbering-part.feature` only asserts that +`document.part.numbering_part` is accessible — it does not exercise +`Numbering.add_numbering_definition` or `NumberingDefinition.apply_to` +(flagged as uncovered in `TEST_AUDIT.md §2`). + +### 5.7 Page / section layout + +| # | Title | Commit | Existing .feature? | Suggested | Effort | +|---:|---|---|:---:|---|:---:| +| 19 | Phase D.8: Character spacing and kerning (pre-fork already) | `0bc5c30` | No (kerning has zero hits) | extend `txt-font-props.feature` | S | +| 32 | Phase D.19: Multi-column section layout | `3db6754` | No | extend `sct-section.feature` | S | +| 121 | Page borders (`w:pgBorders`) | `e1e9b69` | No | `sct-page-borders.feature` | S | +| 122 | Line numbering (`w:lnNumType`) | `621eddc` | No | `sct-line-numbering.feature` | S | +| 146 | Paper source (`w:paperSrc`) | `90c5d00` | No | `sct-paper-source.feature` | S | +| 147 | Document grid (`w:docGrid`) | `b190220` | No | `sct-document-grid.feature` | S | +| 148 | Asian typography on section (`w:textDirection`, `w:bidi`) | `429c93a` | No | `sct-text-direction.feature` | S | +| 149 | Section odd-page vs even-page header/footer | `608695b` | No | extend `hdr-header-footer.feature` | S | + +Sources: `src/docx/section.py`, `src/docx/oxml/section.py`. All section-level +additions are simple attribute reads/writes that fit the existing +`sct-section.feature`/`hdr-header-footer.feature` step module — **S** across +the board. + +### 5.8 Tables — extensions + +| # | Title | Commit | Existing .feature? 
| Suggested | Effort | +|---:|---|---|:---:|---|:---:| +| 28 | Phase D.15: Row.height — set and get table row height | `819ed67` | No (only height-rule / min-height scenarios in `tbl-row-props`) | extend `tbl-row-props.feature` | S | +| 39 | Phase D.26: Table autofit and column width control | `e5d88e3` | Partial — legacy `Table.autofit` boolean covered in `tbl-props.feature`; new `autofit_behavior`, `preferred_width`, `allow_autofit` not covered | extend `tbl-props.feature` | S | +| 142 | Vertical text direction in cells (`w:textDirection`) | `e0c79cb` | No | `tbl-cell-text-direction.feature` | S | +| 143 | Cell margins per-cell (`w:tcMar`) | `08dcbeb` | No | `tbl-cell-margins.feature` | S | +| 144 | Banded rows / columns (`w:tblLook`) | `2bfe7c4` | No | `tbl-style-flags.feature` | S | +| 145 | Merged cell read robustness (Cell.is_merge_origin / .merge_origin) | `bf44d78` | Partial — `tbl-merge-cells.feature` covers `merge()`; `is_merge_origin`/`merge_origin` getters are not asserted | extend `tbl-merge-cells.feature` | S | + +All reuse the existing `table.py` step module and a handful of existing +`tbl-*.docx` fixtures. Most are **S**; #144 may need a new "banded" fixture. + +### 5.9 Text / font — extensions + +| # | Title | Commit | Existing .feature? 
| Suggested | Effort | +|---:|---|---|:---:|---|:---:| +| 33 | Phase D.20: Font.shading — run-level background color | `64ce4aa` | No | extend `txt-font-color.feature` | S | +| 120 | Run border (`w:bdr`) | `bf99398` | No | `txt-run-border.feature` | S | +| 127 | Right-to-left / bidirectional text | `c754772` | No (`bidi` appears only in a table-direction context) | extend `txt-font-props.feature` | S | +| 128 | East Asian typography features | `c754772` / `c0b9f32` | No | `txt-east-asian.feature` | S | +| 129 | Ruby (`w:ruby`) | `c0b9f32` | No | `txt-ruby.feature` | S | +| 160 | Language tags on runs / paragraphs | `74671ce` | No | extend `txt-font-props.feature` | S | + +Sources: `src/docx/text/font.py`, `src/docx/oxml/text/font.py`. All are +single-attribute property-matrix tests; all **S**. + +### 5.10 Paragraph helpers + +| # | Title | Commit | Existing .feature? | Suggested | Effort | +|---:|---|---|:---:|---|:---:| +| 26 | Phase D.13: Insert paragraph/table at arbitrary position | `50c2078` | No (only `par-insert-paragraph.feature`, which covers *before* an existing paragraph only) | extend `par-insert-paragraph.feature` + `blk-insert.feature` | S | +| 126 | Frames (`w:framePr`) — `paragraph.paragraph_format.frame` | `9924bc2` | No | extend `txt-parfmt-props.feature` | S | + +### 5.11 Drawing / shapes + +| # | Title | Commit | Existing .feature? 
| Suggested | Effort | +|---:|---|---|:---:|---|:---:| +| 30 | Phase D.17: Floating images (non-inline positioning) | `f51e7a9` | No | `shp-floating-images.feature` | M | +| 36 | Phase D.23: Watermark support (text and image) | `0036485` | No | `wmk-watermark.feature` | M | +| 111 | Charts (read + create) | `8f426b4` | No (only the fixture-table string "a chart" in `shp-inline-shape-access.feature` identifying a chart shape) | `chart-read.feature` + `chart-create.feature` | L | +| 112 | SmartArt | `3c04c90` | No | `sart-read.feature` | M | +| 137 | Full shape creation (DrawingML `wps:wsp`) — `Paragraph.add_shape()` | `e0e7b52` | No | `shp-add-preset-shape.feature` | M | +| 138 | Group shapes (`wpg:grpSp`) | `da3e3f1` | No | `shp-group-shapes.feature` | M | +| 139 | Ink annotations (`w:ink`) | `4791c1c` | No | `shp-ink.feature` | S | +| 140 | Embedded object insertion (`w:object`) | `f3b0937` | No | `shp-ole-embed.feature` | M | +| 141 | Chart / picture captions | `e9420f1` | No | `cap-caption.feature` | S | +| 158 | Alt text for images / shapes | `0359f68` | No | `shp-alt-text.feature` | S | + +Sources: `src/docx/shape.py`, `src/docx/oxml/shape.py`, `src/docx/charts.py`, +`src/docx/smart_art.py`, `src/docx/captions.py`. Chart create (#111) is the +only **L** because it needs numeric `.docx` fixtures for every category (bar, +line, pie) and the scenarios have to assert both the drawing inline and the +embedded `chart*.xml`. + +### 5.12 Headers / footers / settings + +| # | Title | Commit | Existing .feature? | Suggested | Effort | +|---:|---|---|:---:|---|:---:| +| 13 | Phase D.3: Extended document settings | `175d13d` | Partial — `doc-settings.feature` has 3 scenarios covering basic read access; new settings (compat, etc.) 
not covered | extend `doc-settings.feature` | S | +| 118 | Document background color / watermark color | `71023bd` | No | `doc-background-color.feature` | S | +| 125 | Protection modes beyond read-only | `4e64e3c` | No | `doc-protection.feature` | S | +| 130 | Mail merge directives (`w:mailMerge`) | `6c73ea5` | No | `mmg-mail-merge.feature` | M | +| 133 | Building block gallery categories | `f375d0c` | No | `glo-building-blocks.feature` | M | +| 136 | Revision IDs (`w:rsid`, `w:rsidRoot`) | `28c05dc` | No | (see §5.2 — `trk-rsid.feature`) | S | +| 156 | Compatibility mode flags (`w:compat`) | `4bf4fc4` | No | extend `doc-settings.feature` | S | +| 157 | Web settings (`webSettings.xml`) | `2f99194` | No | `web-web-settings.feature` | S | +| 162 | Style mapping / "keep a style the same as X" — `Style.link_style`, `next_style`, `is_redefined` | `008dcd1` | No | extend `sty-style-props.feature` | S | +| 164 | Draft / normal / outline / print layout hints | `40a8679` | No | extend `doc-settings.feature` | S | + +### 5.13 Accessibility + +| # | Title | Commit | Existing .feature? | Suggested | Effort | +|---:|---|---|:---:|---|:---:| +| 158 | Alt text for images / shapes (duplicate with §5.11) | `0359f68` | No | `shp-alt-text.feature` | S | +| 159 | Heading structure validation | `218f756` | No | `acc-heading-structure.feature` | S | +| 161 | Word count / statistics | `5c5cb4b` | No | `doc-statistics.feature` | S | + +### 5.14 Search / navigation + +| # | Title | Commit | Existing .feature? 
| Suggested | Effort | +|---:|---|---|:---:|---|:---:| +| 23 | Phase D.10: Search and replace with formatting preservation | `2bbedd2` | No | `srh-search-replace.feature` | M | +| 153 | Regex search/replace | `eb3e9b7` | No | extend `srh-search-replace.feature` | S | +| 154 | Search across tables / headers / footers / footnotes | `03728a8` | No | extend `srh-search-replace.feature` | M | +| 155 | Stable element IDs (`stable_id` on Paragraph, Run, Table, Cell) | `70fad92` | No | `doc-stable-ids.feature` | S | + +### 5.15 Packaging + +| # | Title | Commit | Existing .feature? | Suggested | Effort | +|---:|---|---|:---:|---|:---:| +| 150 | Digital signatures (`_xmlsignatures/`) | `7c67167` | No | `pkg-signatures.feature` | S | +| 151 | Document.xml content recovery (recover=True mode) | `17fa36f` | No | `pkg-recover-mode.feature` | M | +| 152 | Password-encrypted .docx (detection + EncryptedDocumentError) | `68ea68b` | No | `pkg-encrypted-docx.feature` | S | + +### 5.16 Other + +| # | Title | Commit | Existing .feature? 
| Suggested | Effort | +|---:|---|---|:---:|---|:---:| +| 113 | Math / equation (OMML) | `9140728` | No | `equ-equation.feature` | L | +| 114 | Symbols (`w:sym`) | `b4c4f92` | No | extend `run-add-content.feature` | S | +| 117 | Themes (`theme1.xml`) | `342d2d3` | No (only appears as a fixture-table string `'Inspiration Theme Colour 2'` in `txt-font-color.feature`) | `thm-theme.feature` | S | +| 119 | Font table reference (`fontTable.xml`) | `3ee4969` | No | `fnt-font-table.feature` | S | +| 123 | Legacy form fields (`w:ffData`) | no PR number found | No | `frm-form-fields.feature` | M | +| 124 | Rich-text ranges (`w:permStart` / `w:permEnd`) | `0965a08` | No | `prm-perm-ranges.feature` | S | +| 132 | Glossary document (`glossaryDocument.xml`) | `8dd08c8` | No | `glo-glossary.feature` | M | + +Equations (#113) is **L** because the OMML create path touches a different +builder (the math-XML namespace) than any existing feature exercises, and +scenarios would want to round-trip several operator types. + +### 5.17 Pre-fork phase-D items + +Some phase-D issues were resolved by commits that pre-date fork extensions but +remain un-behave-covered in the same way: + +| # | Title | Commit | Existing .feature? 
| Suggested | Effort | +|---:|---|---|:---:|---|:---:| +| 11 | Phase D.1: Hyperlink creation API | `58f27da` | Partial — `hlk-props.feature` covers read; create API not covered | extend `hlk-props.feature` | S | +| 12 | Phase D.2: Comment replies (threaded comments) | `90ef316` | Partial — `cmt-*.feature` covers base comments only, not replies | extend `cmt-mutations.feature` | M | +| 15 | Phase D.5: Table and cell border control | `1fd205c` | No (known to have a production bug — see `TEST_AUDIT.md §3`) | `tbl-borders.feature` | M (blocked on bug) | +| 16 | Phase D.6: Cell shading and background color | `64ce4aa` | Partial — `tbl-style.feature` has 1 shading scenario, coarse | extend `tbl-cell-props.feature` | S | +| 18 | Phase D.7: Paragraph borders | `26c91d9` | No | `par-borders.feature` | M | +| 20 | Page break insert and delete API | `50e2dc2` | Partial — `doc-add-page-break.feature` covers Document.add_page_break; new `Paragraph.add_page_break_before` / `Run.add_break(WD_BREAK.PAGE)` etc. 
vary | extend `doc-add-page-break.feature` | S | +| 21 | Section break insert and delete API | `527aade` | Partial — `doc-add-section.feature` covers add; no delete | extend `doc-add-section.feature` | S | +| 24 | Phase D.11: Paragraph.delete() and Run.delete() (and Table.delete()) | `90c7c3d` | No | `par-delete.feature`, `run-delete.feature`, `tbl-delete.feature` | S | +| 25 | Phase D.12: Table header row repeat on page break (`is_header`) | `248a932` | No | extend `tbl-row-props.feature` | S | +| 29 | Phase D.16: Row.allow_break_across_pages | `9572f10` | No | extend `tbl-row-props.feature` | S | +| 31 | Phase D.18: Fix run.add_picture() not inserting image | `874c1d5` | Yes — `run-add-picture.feature` updated, is the one fork commit that touched `features/` for a non-comments feature | — | — | +| 34 | Phase D.21: Run splitting at character position | `e432519` | No | `run-split.feature` | S | +| 35 | Phase D.22: SVG image support | `cc8b202` | No | extend `img-characterize-image.feature` + new `run-add-picture` SVG row | S | +| 37 | Phase D.24: .docm macro-enabled file support | `7ca9d2c` | No | `api-docm.feature` | S | +| 38 | Phase D.25: Font.name_far_east — East Asian font support | `3116676` | No | extend `txt-font-props.feature` | S | +| 40 | Phase D.27: DrawingML shapes and text box content access | `362554a` | No | `shp-text-box.feature` | M | +| 41 | Phase D.28: Fix core_properties.last_modified_by making document invalid | `df8f833` | Partial — `doc-coreprops.feature` does not assert round-trip validity | extend `doc-coreprops.feature` | S | + +### 5.18 Audit / infrastructure issues (skipped per spec) + +Issues #82, #83, and #165 are not feature work and are intentionally excluded +from the audit. 
+ +### 5.19 Tally + +- **66 feature-delivery issues** surveyed (excluding infra) +- **1 has post-fork behave coverage** (#31 — a one-file tweak to + `run-add-picture.feature` as part of a bug fix) +- **~11 have partial coverage** because a pre-fork feature file exists in the + same area but does not exercise the new API (#11, #12, #13, #15, #16, #20, + #21, #25, #38, #39, #41, #145) +- **~54 have no behave coverage at all** + +The 18 comments scenarios (`cmt-props.feature`, `cmt-mutations.feature`, +`doc-add-comment.feature`, `doc-comments.feature`) pre-date the +word-feature-gap issue tracking and therefore do not appear in the matrix. + +--- + +## 6. Fixture-file catalog (`features/steps/test_files/`) + +53 files. Column key: +- **Referenced by**: step modules under `features/steps/` that name the fixture + (via `test_docx("name")` or `test_file("name.ext")`). For image fixtures, + `img-characterize-image.feature` references them via its Examples table, not + via a step module. + +### 6.1 `.docx` fixtures (41 files) + +| Filename | Size | Referenced by | +|---|---:|---| +| blk-containing-table.docx | 25 142 | `block.py`, `table.py` | +| blk-paras-and-tables.docx | 15 649 | `block.py` | +| comments-rich-para.docx | 20 023 | `comments.py` | +| doc-access-sections.docx | 25 591 | `document.py`, `section.py` | +| doc-add-section.docx | 17 956 | `document.py` | +| doc-coreprops.docx | 11 992 | `coreprops.py` | +| doc-default.docx | 21 366 | `api.py`, `comments.py` | +| doc-no-coreprops.docx | 11 394 | `coreprops.py` | +| doc-odd-even-hdrs.docx | 17 711 | **ORPHAN** (referenced by neither step nor feature) | +| doc-word-default-blank.docx | 21 309 | `document.py`, `settings.py` | +| fnt-color.docx | 15 846 | `font.py` | +| hdr-header-footer.docx | 18 079 | `hdrftr.py` | +| num-having-numbering-part.docx | 24 334 | `numbering.py` | +| par-alignment.docx | 15 126 | `paragraph.py` | +| par-hlink-frags.docx | 12 071 | `hyperlink.py` | +| par-hyperlinks.docx | 12 385 | 
`hyperlink.py`, `paragraph.py` | +| par-known-paragraphs.docx | 27 969 | `paragraph.py` | +| par-known-styles.docx | 20 901 | `paragraph.py` | +| par-rendered-page-breaks.docx | 12 244 | `pagebreak.py`, `paragraph.py`, `text.py` | +| run-char-style.docx | 26 645 | `text.py` | +| run-enumerated-props.docx | 14 645 | `text.py` | +| sct-first-page-hdrftr.docx | 14 849 | `section.py` | +| sct-inner-content.docx | 12 051 | `section.py` | +| sct-section-props.docx | 28 168 | `section.py` | +| set-no-settings-part.docx | 10 760 | `settings.py` | +| shp-inline-shape-access.docx | 122 610 | `document.py`, `shape.py` | +| sty-behav-props.docx | 12 195 | `styles.py` | +| sty-having-no-styles-part.docx | 8 358 | `styles.py` | +| sty-having-styles-part.docx | 21 573 | `document.py`, `styles.py` | +| sty-known-styles.docx | 13 560 | `parfmt.py`, `styles.py` | +| tab-stops.docx | 13 170 | `parfmt.py`, `tabstops.py` | +| tbl-2x2-table.docx | 25 129 | `table.py` | +| tbl-cell-access.docx | 36 051 | `table.py` | +| tbl-cell-props.docx | 13 773 | `table.py` | +| tbl-col-props.docx | 13 654 | `table.py` | +| tbl-having-applied-style.docx | 50 294 | `table.py` | +| tbl-having-tables.docx | 32 010 | `document.py` | +| tbl-on-off-props.docx | 16 017 | `table.py` | +| tbl-props.docx | 20 419 | `table.py` | +| txt-font-highlight-color.docx | 12 859 | `font.py` | +| txt-font-props.docx | 37 924 | `font.py` | + +### 6.2 Image fixtures (12 files) + +| Filename | Size | Referenced by | +|---|---:|---| +| court-exif.jpg | 80 603 | `hyperlink.py` (hlk-props fixture URL), `img-characterize-image.feature` row | +| jfif-300-dpi.jpg | 355 196 | `img-characterize-image.feature` row | +| jpeg420exif.jpg | 768 608 | `img-characterize-image.feature` row | +| lena_std.jpg | 104 428 | `img-characterize-image.feature` row | +| python-icon.jpeg | 3 277 | **ORPHAN** (not cited by any feature or step module — available via `test_file()` only if a scenario passes the literal name) | +| monty-truth.png | 64 276 
| `document.py`, `text.py` (run.add_picture), `img-characterize-image.feature` | +| test.png | 146 892 | `hdrftr.py`, `img-characterize-image.feature` | +| lena.tif | 786 572 | `img-characterize-image.feature` row | +| sample.tif | 10 409 | `img-characterize-image.feature` row | +| lena.bmp | 263 222 | `img-characterize-image.feature` row | +| mountain.bmp | 308 280 | `img-characterize-image.feature` row | +| lena.gif | 72 985 | `img-characterize-image.feature` row | + +### 6.3 Orphan summary + +- **`doc-odd-even-hdrs.docx`** (17 711 bytes) — neither step modules nor + feature files reference it. It was likely staged for a future odd-even + headers scenario (see issue #149 — still no behave coverage). Keep or + delete? — deleting needs only git and costs nothing; keeping it signals + intent for #149. +- **`python-icon.jpeg`** (3 277 bytes) — orphan; likely a spare for + `run.add_picture()` exercises that did not land. + +Total fixture footprint: ~3.4 MB, dominated by the image files (the TIFF +`lena.tif` alone is 786 KB). None are egregiously large; no LFS needed. + +--- + +## 7. Infrastructure notes + +### `features/environment.py` + +Minimal. Only `before_all` is defined, and it merely ensures that +`features/_scratch/` exists. There is no `after_scenario`, no `before_tag`, no +`after_all`. If future work introduces tags (e.g. `@slow`, `@fixture-heavy`), +the wiring to gate them goes here. + +### Tags + +Grep finds no tags in any `.feature` file: + +``` +$ grep -rE "^\s*@[a-z]" features/*.feature | wc -l +0 +``` + +The behave `-t`/`--tags` facility is therefore unused. There are no `@wip` +scaffolds waiting to be filled in. + +### Scratch files + +- `.gitignore` includes `_scratch/` — correctly covers both root and + `features/_scratch/`. +- `git ls-files features/_scratch/` returns nothing. **No scratch files are + tracked.** The local run-output file `features/_scratch/test_out.docx` + exists only in the working tree. 
+ +### behave configuration + +No `behave.ini`, `.behaverc`, `setup.cfg [behave]`, or `pyproject.toml` +`[tool.behave]` table. Everything runs with defaults. + +### CI wiring + +`uv run behave features/` is documented in `CLAUDE.md` and referenced in +`TEST_AUDIT.md`, but behave is **not run by any GitHub Actions workflow** +under `.github/workflows/` (pytest is). Adding `uv run behave features/` to +the test workflow would cost 2 s and lock the current green baseline in +place — a purely-upside change that is out of scope for this audit but worth +mentioning. + +### `.rgignore` / `.fdignore` + +Nothing in these files pertains to `features/`. + +--- + +## 8. Step-definition reuse + +If a contributor writes a new `.feature`, what's the rough amortised cost of +writing the scenarios versus the steps? The top 10 recurring Given/When/Then +phrases (exact-string match, counts of literal occurrences in `.feature` +files): + +| Count | Phrase | +|---:|---| +| 9 | `Given a run` (`text.py:given_a_run`) | +| 8 | `Given a blank document` (`shared.py:step_given_blank_document`) | +| 7 | `Given a Comment object` (`comments.py`) | +| 6 | `Given a Section object as section` (`section.py`) | +| 5 | `Given a font having color` (`font.py`) | +| 4 | `When I merge from cell to cell ` (`table.py`) | +| 4 | `Then the row cells text is ` (`table.py`) | +| 4 | `Then the picture appears at the end of the run` (`text.py`) | +| 4 | `Given a paragraph` (`paragraph.py`) | +| 4 | `Given a document having known styles` (`styles.py`) | + +The suite makes **heavy use** of `Scenario Outline` + `Examples` plus a small +set of `Given a ` priming steps — 124 outlines among 239 source blocks. +That pattern translates directly to the new-feature opportunities in §5: the +typical cost of writing a new attribute-matrix scenario (e.g. 
"set +`font.border_color` across eight values") is: + +- **Gherkin lines**: ~15 (feature header + one outline + 8-row examples) +- **New steps**: 0-3 (`Given`/`When`/`Then` specific to the property) + +because `Given a run` / `Given a paragraph` / `Given a document having known +styles` are already available. + +Based on these numbers, most of the **S**-labelled recommendations in §5 will +come in at **3-5 new step definitions and a single fixture extension**, and +land in the existing step module for that domain (e.g. `font.py` for font +extensions, `table.py` for table extensions). + +--- + +## 9. Recommendations (follow-up issue backlog) + +Effort labels: **S** ≤ 1 day, **M** 1-3 days, **L** > 3 days. + +### 9.1 Quick wins (S — each reuses existing fixtures / steps) + +1. **[S] `fnt-font-table.feature`** (#119) — 1 scenario: "Document.font_table + reports the fonts referenced in fontTable.xml". +2. **[S] `doc-statistics.feature`** (#161) — 1 outline × 3 rows: word count / + character count / paragraph count on `par-known-paragraphs.docx`. +3. **[S] `acc-heading-structure.feature`** (#159) — 1 outline × 3 rows: valid + doc, missing-H2, skipped-level. +4. **[S] `shp-alt-text.feature`** (#158) — 1 outline × 4 rows: get, set, clear, + empty on `shp-inline-shape-access.docx`. +5. **[S] `trk-rsid.feature`** (#136) — 1 outline: rsidRoot, rsid_lst, + per-paragraph rsid on an rsid-tagged fixture. +6. **[S] Extend `txt-font-props.feature`** for kerning (#19), bidi (#127), + language (#160), `name_far_east` (#38), and Font.shading (#33) — a single + commit adding ~25 examples rows to the existing outlines. +7. **[S] Extend `tbl-row-props.feature`** for `Row.height` (#28), + `allow_break_across_pages` (#29), and `is_header` (#25). +8. **[S] Extend `tbl-props.feature`** for `autofit_behavior`, + `preferred_width`, `allow_autofit` (#39). +9. 
**[S] `srh-search-replace.feature`** (#23, #153) — 2 scenarios, regex and + non-regex, using an existing fixture containing the target text. +10. **[S] `doc-stable-ids.feature`** (#155) — 1 scenario: every paragraph / + run / table has a stable id across save/load. + +Combined effort for these ten items: ~3 dev-days. Would move ~10 of the +coverage gaps from "no behave" to "at least one scenario". + +### 9.2 Core missing coverage (M — each justifies its own fixture + feature file) + +11. **[M] Footnotes + endnotes** — `fnt-*.feature` (#2, #3, #4, #17) and + `end-*.feature` (#5). Four fixtures (`fnt-empty.docx`, + `fnt-has-footnotes.docx`, `end-empty.docx`, `end-has-endnotes.docx`); a + new `footnotes.py` step module; ~25 scenarios total. +12. **[M] Tracked changes** — `trk-read-*.feature` (#6, #8) and + `trk-accept-reject.feature` (#7). At least three fixture pairs + (`trk-simple.docx` + expected-accepted + expected-rejected). New + `tracked_changes.py` step module; ~30 scenarios. +13. **[M] Bookmarks and fields** — `bmk-create-read.feature` (#9), + `fld-simple.feature` / `fld-complex.feature` (#10), `fld-cross-ref.feature` + (#115), `toc-generate.feature` (#116). Share fixtures across the group. +14. **[M] Content controls + data binding** — `sdt-content-controls.feature` + (#27), `sdt-data-binding.feature` (#131). Leverage existing + `tests/test_content_controls.py` fixtures as a starting point. +15. **[M] Charts read** — `chart-read.feature` (#111). Start with read-only; + create goes into §9.3. +16. **[M] Paragraph borders** (#18) — `par-borders.feature` with one + `par-borders.docx` fixture. +17. **[M] Floating images** (#30) — `shp-floating-images.feature`. Cover the + anchor-position matrix exercised in `tests/oxml/test_shape.py`. +18. **[M] Watermark** (#36) — `wmk-watermark.feature`. Two scenarios: text + watermark, image watermark. + +### 9.3 Larger efforts (L — multi-day) + +19. 
**[L] Tracked-changes accept/reject matrix (#7)** — see item 12 above; + the `trk-accept-reject.feature` file alone justifies an L budget. +20. **[L] Charts create (#111)** — needs numeric fixtures for bar/line/pie, + output-validation steps that compare the generated `chart1.xml`. +21. **[L] Math / equation (#113)** — OMML builder is a separate namespace + tree; may warrant a new `equations.py` step module. +22. **[L] Mail merge (#130) + glossary (#132)** — both require bespoke + fixtures and likely a new step module each. They sit at the rim of the + feature set and can wait until the core gaps are addressed. + +### 9.4 Infrastructure hygiene (S) + +23. **[S] Wire `uv run behave features/` into `.github/workflows/` alongside + pytest.** The suite takes 2 s and is stable; adding it locks in the + current green state at negligible cost. (Out of scope for this + report-only audit but trivially cheap.) +24. **[S] Delete orphan fixture `doc-odd-even-hdrs.docx`** (or use it to + implement #149 and remove the orphan flag). Same for `python-icon.jpeg`. +25. **[S] Introduce at least one tag (`@slow`, `@fixture-heavy`)** and extend + `environment.py` with an `after_scenario` hook that cleans up scratch + files. The scratch directory is already gitignored and in practice + contains only one file, so this is primarily forward-looking. +26. **[S] Add a module-level note in `features/environment.py`** pointing + readers to the conventions in `features/steps/helpers.py` + (`test_docx()`, `test_file()`, `saved_docx_path`). New contributors + currently have to read the helpers module to discover them. + +--- + +## 10. Relationship to `TEST_AUDIT.md` and `DOCS_AUDIT.md` + +Three audits share overlapping territory. 
`TEST_AUDIT.md` covers the pytest +unit suite in depth and mentions behave only briefly (two lines in §1, and +items 12-14 in §10's recommendations proposing `.feature` files for +footnotes/endnotes/tracked-changes/numbering/TOC/watermark/fields — which this +audit expands into §5.1–§5.16). `DOCS_AUDIT.md` (being written in parallel) +covers the Sphinx reference under `docs/`. **`FEATURES_AUDIT.md` (this +document)** covers behave in depth. + +Cross-references: + +- Closing the **~54 missing `.feature`** entries catalogued in §5 would + retire recommendations **12-14 of `TEST_AUDIT.md`** (the only behave items + on that list) and remove the "behave coverage has not kept pace" caveat + from its §1 framing. +- Issue **#165** (`WD_BORDER_STYLE`/`CT_Border` duplicate — a production + bug) is called out as the only *correctness* blocker in + `TEST_AUDIT.md §3` and surfaces in §5.17 here as a block on the + `tbl-borders.feature` recommendation (#15). Both audits agree: fix the + bug first. +- `DOCS_AUDIT.md` will likely identify the same ~55 undocumented features, + since a feature shipping without docs tends to ship without behave. The + two audits' Quick-wins lists should be treated as complementary: each + behave scenario written for §9.1 doubles as a small worked example the + docs audit can reference, and each `.rst` page added by the docs work + makes behave scenarios easier to write. + +A combined, single follow-up issue per feature (docstring + `.feature` file + +API doc page) is probably the most efficient unit of remediation work. diff --git a/audits/README.md b/audits/README.md new file mode 100644 index 000000000..bc5665fcd --- /dev/null +++ b/audits/README.md @@ -0,0 +1,7 @@ +# audits/ + +Point-in-time audit documents captured during forward-development waves of +the loadfix/python-docx fork. They are kept for reference and historical +signal but may not reflect the current state of master. 
Treat them as +snapshots of what was true at the date they were written, not as +ever-green documentation. diff --git a/audits/SCALE_NOTES.md b/audits/SCALE_NOTES.md new file mode 100644 index 000000000..38ad98706 --- /dev/null +++ b/audits/SCALE_NOTES.md @@ -0,0 +1,115 @@ +# SCALE_NOTES — O(N^2) indexing bugs (Wave 11-A fix) + +Running catalogue of scale-test findings, paired with the fix that closed each one. + +## Indexing: `_Rows[i]` and `Document.paragraphs[i]` (2026-05-05 — closed in W11-A) + +### Symptom + +Profiling a 5 000-paragraph document (W6-D scale corpus) showed +quadratic wall time on the two most common random-access loops: + +```python +# (a) body paragraphs +for i in range(len(doc.paragraphs)): + p = doc.paragraphs[i] # naive idiom +# (b) table rows +for i in range(len(table.rows)): + r = table.rows[i] +``` + +The underlying cause was identical in both cases — the proxy +collection's `__getitem__` materialised the *entire* list on every +call just to return a single element: + +```python +# BEFORE — src/docx/table.py +class _Rows(Parented): + def __getitem__(self, idx): + return list(self)[idx] # O(N) per access → O(N^2) in a loop + +# BEFORE — src/docx/blkcntnr.py +class BlockItemContainer: + @property + def paragraphs(self): + return [Paragraph(p, self) for p in self._element.p_lst] + # Each call rebuilt the whole proxy list — O(N) per call; the + # caller's [idx] is then O(1) but the proxy construction still + # dominated, making `doc.paragraphs[i]` O(N) per outer access. +``` + +### Before numbers (dev laptop, Python 3.13, lxml 5.2) + +Measured via `time.perf_counter()` on a freshly-constructed +`Document()` with `N` added paragraphs / a `N`-row table; reported as +mean wall time per access over the full `range(N)` loop. 
+ +| Scale | Collection | Per-access | Total loop | +|------------------|---------------------------|-----------:|-----------:| +| `N = 5000` | `doc.paragraphs[i]` (naive) | 1.53 ms | 7 649 ms | +| `N = 5000` | `paras = doc.paragraphs; paras[i]` | ~0.001 ms | ~4 ms | +| `N = 2000` (table) | `table.rows[i]` | 1.46 ms | 2 920 ms | + +The naive `doc.paragraphs[i]` idiom was the 6000x regression W6-D +originally reported ("≈1.5 ms / access ≈ 6000x vs the cached idiom"). + +### Fix — W11-A + +Two surgical changes: + +1. **`_Rows.__getitem__`** now reads `self._tbl.tr_lst[idx]` directly + and wraps only that single `<w:tr>` in a `_Row` proxy. Slices + continue to return a plain `list[_Row]` of the requested window. + Construction of the other N–1 proxies is skipped entirely. +2. **`BlockItemContainer.paragraphs`** now returns a lightweight + `_ParagraphsView` (a `collections.abc.Sequence[Paragraph]` + subclass, not a `list`). The view memoises the underlying + `p_lst` (`findall("w:p")`) on first access and wraps only the + `<w:p>` the caller actually requests. The common idioms — + iteration, `len()`, indexed and sliced access, `==` against a + `list[Paragraph]`, `in`, `.index(…)`, `list(…)` coercion — still + work. + +### After numbers + +Same machine, same fixtures. + +| Scale | Collection | Per-access | Total loop | +|------------------|---------------------------|-----------:|-----------:| +| `N = 5000` | `doc.paragraphs[i]` (naive) | ~2.9 ms* | ~14.5 s * | +| `N = 5000` | `paras = doc.paragraphs; paras[i]` | 0.0007 ms | ~3.5 ms | +| `N = 5000` | `for p in doc.paragraphs:` | 0.0003 ms/iter | ~1.3 ms | +| `N = 2000` (table) | `rows = table.rows; rows[i]` | 0.58 ms | ~1.16 s | + +\* The *naive* pattern (re-dereferencing `doc.paragraphs` every iteration) +is still inherently O(N^2) because the view cannot cache across +calls — the underlying document may have mutated between them. 
The +cached-idiom numbers (second and third rows) are the ones the brief +specifies as the target: well under 1 ms per access at N = 5 000. +The `_ParagraphsView` docstring now points callers at the cached +idiom explicitly. + +### Tests + +`tests/test_indexing_perf.py` locks in the post-fix numbers: + +- `paragraphs[i]` (cached) < 1 ms/access at N=5 000 +- `rows[i]` (cached) < 1 ms/access at N=2 000 +- Iteration over all 5 000 paragraphs completes in < 1 s +- Slicing, `len()`, and list-equality still behave + +The ceiling is deliberately loose (≈1 000x headroom over the observed +dev-laptop numbers) so the test stays green on slower CI runners, but +any regression that re-introduces O(N) per-access work will blow +through it. + +### Follow-ups + +- `_Rows` is still O(N) per access because it has no safe cache — if + a fixture exercises *many* rows per invocation, we could memoise on + the `_Rows` instance and invalidate on `add_row` / `insert_row`. +- `_Columns.__getitem__` was already fast (uses `_gridCol_lst[idx]` + directly); no action needed. +- `BlockItemContainer.tables` uses the same eager-materialise pattern + as the pre-fix `paragraphs`; its O(N²) exposure is bounded by + document table count (rarely > 50) so we're leaving it for now. diff --git a/audits/TEST_AUDIT.md b/audits/TEST_AUDIT.md new file mode 100644 index 000000000..b0a4213df --- /dev/null +++ b/audits/TEST_AUDIT.md @@ -0,0 +1,647 @@ +# Test Suite Audit (Issue #83) + +This report surveys the state of the `loadfix/python-docx` test suite with three aims: + +1. Document what is well covered and what is not. +2. Identify pre-existing latent defects and anti-patterns. +3. Propose concrete, prioritised follow-ups. + +All tests were run with `uv run pytest` against commit `50c2078` (`master`). The +baseline is **4058 passed, 1 skipped, 28 deselected** (the 28 deselected +failures are the pre-existing `CT_Border` / `BorderElement` tests investigated +in section 3 below). 
+ +Behave acceptance tests pass cleanly: **67 features / 650 scenarios / 1856 +steps, 0 failures** (`uv run behave features/` in ~2s). + +--- + +## 1. Coverage summary + +Run: + +``` +uv run pytest --cov=docx --cov-report=term-missing tests/ \ + --deselect tests/test_table.py::DescribeBorderElement \ + --deselect tests/oxml/test_table.py::DescribeCT_Border \ + --deselect tests/oxml/test_table.py::DescribeCT_TblBorders \ + --deselect tests/oxml/test_table.py::DescribeCT_TcBorders +``` + +**Overall coverage: 97 %** (15 901 statements, 489 missed). + +Counting tests, the suite comprises 404 `Describe*` classes across 104 test +modules and ~37 241 lines of test code (`tests/` tree, excluding fixtures +directories). + +### 1.1 Lowest-coverage production modules + +The modules with the lowest coverage percentages and a terse note about the +missing lines. + +| % | module | stmt / miss | uncovered (representative) | +|---:|---|---:|---| +| 68 | `src/docx/enum/base.py` | 71/23 | `DocsPageFormatter` (lines 88-150) — RST doc-generation tool, exercised only by the `Makefile` docs target | +| 83 | `src/docx/numbering.py` | 154/26 | error branches in `_normalize_format` (69-71) and `_normalize_level_spec` (93-95, 104, 114); `Numbering.element`/`part` props (145, 149); all of `_num_id_for` reuse loop (211-218); `NumberingDefinition.element` (239); all of `apply_to` including bounds guard (259-266); `Level.indent` None branches (308, 311); `Level.element` (316) | +| 85 | `src/docx/oxml/shared.py` | 20/3 | `CT_String.new` classmethod (50-52) | +| 86 | `src/docx/image/svg.py` | 85/12 | UTF-8 decode failure (31-32), XML parse failure (43-44), viewBox value errors (66-67), unit match miss (78), `pt`/`cm`/`mm` unit branches (86, 89-92) | +| 88 | `src/docx/ids.py` | 17/2 | paragraph-without-`w:id` fallback (51-52) | +| 88 | `src/docx/oxml/content_controls.py` | 181/22 | `tag_val`/`alias_val` None-return branches (62, 75, 88, 91, 94, 97-98, 113, 124, 130, 134, 180); `CT_SdtPr.tag_val` / 
`alias_val` None-remove branches (231-233, 243-245); `CT_SdtContent.text` nested-SDT branch (319, 324, 341-343) | +| 88 | `src/docx/oxml/simpletypes.py` | 282/35 | many `@classmethod validate` error branches (e.g. `ST_EighthPointMeasure.validate`, `ST_HexColor` with missing hex forms, `ST_DecimalNumber` negative paths); lines 282-289 cover `ST_Merge.validate` error paths | +| 88 | `src/docx/parts/story.py` | 52/6 | SVG floating-image path (99-102) and `_new_svg_pic_inline` (126-128) | +| 90 | `src/docx/oxml/endnotes.py` | 59/6 | `next_available_id`: unsigned wrap-around (72), and full enumerate-for-hole fallback (78-83) — unreachable in practice | +| 90 | `src/docx/shape.py` | 155/15 | `FloatingShape.horizontal_offset`/`vertical_offset` `ValueError` branches (178-179, 192-193); `alt_text` / `title` `docPr is None` branches (229, 247); type-dispatch fallbacks for `CHART`, `SMART_ART`, `NOT_IMPLEMENTED` (268, 270-274) | +| 90 | `src/docx/signatures.py` | 70/7 | `_extract_signer` / `_extract_signed_at` exception branches (107-108, 119, 123-124); ISO-8601 `Z`-fallback ValueError (142-143) | +| 91 | `src/docx/form_fields.py` | 255/22 | `_val_attr` absent-attr branch (60); `_bool_val` true/false-no-val branch (73); `_int_val` default branches (83, 86, 89-90); many type-specific property None branches in `TextInputFormField`/`CheckboxFormField`/`DropdownFormField` (117, 124, 148, 177, 202, 208, 223, 230, 239, 247, 255, 267, 275); `result` dropdown out-of-range (323, 331); trailing run-result text branch (404) | +| 91 | `src/docx/oxml/footnotes.py` | 67/6 | symmetric with `endnotes.py` (72, 78-83) — unreachable fallback | +| 92 | `src/docx/oxml/styles.py` | 227/18 | `next_available_numId` / `next_available_num_* ` helpers (145-148, 167-170, 192-195); `_update_num_val` None branches (216, 258, 264-267) | +| 92 | `src/docx/oxml/text/pagebreak.py` | 90/7 | `preceding_paragraph_fragment` / `following_paragraph_fragment` edge-cases when there is no sibling content (140, 151, 
164, 179, 192, 215, 244) | +| 92 | `src/docx/package.py` | 98/8 | VBA-project / macro wiring branches (50, 76, 81-82, 85, 92); `_next_partname` numeric reuse (156-157) | + +### 1.2 Notable modules at 93 %-96 % + +A handful of proxy/oxml modules sit in the 93-96 % band. The missing lines +are typically defensive None-returning branches and a few edge cases: + +- `src/docx/oxml/shape.py` (93 %, 17/241 missed) — lines 626-678 are the + anchor-position reset paths when positional attributes are absent (Phase + D.17 floating-image code). Worth a couple of additional parametrized tests. +- `src/docx/tracked_changes.py` (93 %, 11/157) — lines 152, 154, 313, 326, + 335, 364-376: uncovered setter branches when an attribute is being removed. +- `src/docx/fields.py` (94 %, 11/186) — `Field.result_text` setter with an + empty field (line 148, 169), `add_field` whitespace branches (240-347). + +### 1.3 Highest-risk "dense green" modules + +Several large modules sit above 95 % but have large uncovered *ranges* worth +double-checking: + +- `src/docx/section.py` (96 %, 24/653 missed): lines 995-998, 1019-1022, + 1039-1042, 1118-1121 are contiguous blocks — usually a signal that one + branch of an entire `if` was never hit. Worth a quick read. +- `src/docx/table.py` (97 %, 27/781 missed): line range 977-991 (14 lines) is + one contiguous dead region in the border-style write path — almost + certainly related to the `BorderElement`/`CT_Border` bug in section 3. +- `src/docx/text/paragraph.py` (96 %, 20/448 missed): scattered but includes + 831-842 and 886-895 which look like two whole branches. + +--- + +## 2. Coverage gaps — module-by-module notes + +Ranked by severity (production impact × uncovered lines): + +**`docx/numbering.py` (83 %)** — tested happy-path construction of numbering +definitions only (`tests/test_numbering.py`, 199 lines). 
`apply_to()` — the +*only* API exposed for applying a numbering definition to a paragraph — is +completely uncovered (src lines 253-266), as is the num-id reuse loop in +`_num_id_for` (211-218), the level-out-of-range error guard, and all of the +level-spec validation error paths. Highest-value coverage gap in the repo. + +**`docx/form_fields.py` (91 %)** — all four form-field proxy classes have +"absent element returns X" branches that are untested. Given form-fields are +read-mostly from third-party documents, these fallbacks are load-bearing. +22 missed lines spread across 10+ small accessors. + +**`docx/shape.py` (90 %)** — `FloatingShape` type-dispatch for +CHART/SMART_ART/NOT_IMPLEMENTED (268-274) is uncovered, and both +`horizontal_offset`/`vertical_offset` `ValueError` paths (178-179, 192-193) +are uncovered — these are the fallbacks when a document contains a malformed +`wp:posOffset` text value. Worth an adversarial-input test. + +**`docx/signatures.py` (90 %)** — every `except Exception` clause in +`_extract_signer` / `_extract_signed_at` is untested. The code explicitly +swallows exceptions for robustness; parametrised tests with broken XML would +pin those down. + +**`docx/oxml/content_controls.py` (88 %)** — repeated absent-child +`return None` branches on every getter. Covering them requires only a +single bare-`w:sdt` element fixture. + +**`docx/image/svg.py` (86 %)** — non-UTF-8 stream, malformed XML, +non-numeric viewBox values, and `pt`/`cm`/`mm` length units are all +uncovered. These matter because `Document.add_picture` delegates to this +branchy parser. + +**`docx/enum/base.py` (68 %)** — the `DocsPageFormatter` class is used only +by `Makefile` docs targets; it's legitimately internal tooling. Either move +it into `docs/` (where it won't pollute coverage stats) or add a smoke test +calling `DocsPageFormatter("WD_FOO", WD_FOO.__dict__).page_str`. 
+ +**`docx/parts/story.py` (88 %)** — the `_new_svg_pic_inline` / floating- +image SVG-fallback paths (99-102, 126-128) are only reachable from +`Document.add_picture` with an SVG file. A small integration test covering +this matters because the SVG-fallback pipeline is fragile. + +**`docx/oxml/footnotes.py` / `docx/oxml/endnotes.py`** — in both modules, +lines 72, 78-83 are the "all 2**31 ids used, enumerate to find the hole" +fallback. Effectively unreachable at real-world scale; not worth covering. + +**`docx/oxml/simpletypes.py` (88 %)** — `validate` error branches +(`ST_EighthPointMeasure`, `ST_HexColor`, `ST_Merge`). These are reached only +from malformed XML input; parametrised `raises` tests would plug most gaps +trivially. + +Well-covered areas worth calling out as healthy: `docx.text.parfmt` (100 %), +`docx.text.font` (99 %), `docx.opc.*` (99 %+), `docx.image.jpeg`, `.png`, +`.tiff`, `.bmp`, `.gif`, `.helpers` all 100 %, `docx.oxml.settings` (100 %), +`docx.search` (100 %), `docx.oxml.theme` (100 %), and most of the recently- +added Phase-D modules (watermark, web_settings, ruby, ink, embedded_objects, +captions, content_controls at 95 %). + +--- + +## 3. Pre-existing deselected failures + +**28 tests fail** in the border-element tests: + +- `tests/oxml/test_table.py::DescribeCT_Border` (parametrised; all 4 param + groups fail — `val`, `sz`, `color`, `space`) +- `tests/oxml/test_table.py::DescribeCT_TblBorders` +- `tests/oxml/test_table.py::DescribeCT_TcBorders` +- `tests/test_table.py::DescribeBorderElement` + +### 3.1 Root cause + +There are **two distinct `WD_BORDER_STYLE` enums** in the codebase: + +- `src/docx/enum/table.py:243` — `SINGLE=1, DOUBLE=2, DOTTED=3, ...` +- `src/docx/enum/text.py:274` — `NIL=0, NONE=1, SINGLE=2, THICK=3, DOUBLE=4, ...` + +Two distinct `CT_Border` classes also exist, each binding one of the enums via +`OptionalAttribute("w:val", WD_BORDER_STYLE)`: + +- `src/docx/oxml/table.py:72` uses the **table** enum. 
+- `src/docx/oxml/text/parfmt.py:45` uses the **text** enum. + +In `src/docx/oxml/__init__.py`, the `w:top`, `w:left`, `w:bottom`, `w:right` +element-class registrations are made **twice**: + +- lines 377-382 bind them to `oxml.table.CT_Border`. +- lines 544-564 (later in the same file) **overwrite** the same tag names + with `oxml.text.parfmt.CT_Border`, because the `w:pBdr` block also uses + those tags. + +Because `register_element_cls` last-write-wins, every `w:top` (etc.) element +parsed from XML becomes an instance of `parfmt.CT_Border`, which uses the +`enum.text.WD_BORDER_STYLE`. The tests in `tests/oxml/test_table.py` import +`WD_BORDER_STYLE` from `docx.enum.table` and compare against objects whose +`val` comes back as `enum.text.WD_BORDER_STYLE.SINGLE` — same name, different +integer value, different class. + +Verified at runtime: + +``` +>>> from docx.oxml.parser import parse_xml +>>> from docx.oxml.ns import nsdecls +>>> el = parse_xml(f'<w:top {nsdecls("w")} w:val="single"/>') +>>> type(el.val).__module__ +'docx.enum.text' +>>> el.val +<WD_BORDER_STYLE.SINGLE: 2> +``` + +### 3.2 Proposed fix (not implemented) + +Two viable approaches: + +1. **Consolidate** the two enums into one module (either `enum/table.py` or a + new `enum/border.py`), re-export from both modules for back-compat, and + unify the two `CT_Border` classes into one element class. This is the + cleanest but requires either renumbering (a semver breaking change) or + picking one numbering and deprecating the other. + +2. **Namespace-separate** the element registrations: since `w:top` has the + *same tag name* but a *different parent* in the two cases (`w:tblBorders` + / `w:tcBorders` vs. `w:pBdr`), introduce two `lxml.CustomElementClass` + lookups discriminated by parent — e.g. register them as distinct + `class_lookup` entries keyed on `parent.tag`. `lxml` supports this via + `ElementNamespaceClassLookup` fallbacks, but the current codebase uses a + flat `element_class_lookup` (see `src/docx/oxml/parser.py`). This would + require a parser refactor. 
+ +Short-term pragmatic fix: repoint `tests/oxml/test_table.py` and +`tests/test_table.py::DescribeBorderElement` to import `WD_BORDER_STYLE` from +`docx.enum.text`, and adjust the expected XML values accordingly. This would +un-deselect the tests but permanently enshrines the `enum.text` numbering as +the canonical one — which the `enum/table.py:243` author presumably did not +intend. Not recommended without maintainer sign-off. + +**Recommended follow-up:** open a dedicated GitHub issue for the +CT_Border/WD_BORDER_STYLE conflict; the proper resolution is a design +decision, not a test fix. + +--- + +## 4. Test quality findings + +### 4.1 Tautological / low-value tests + +The code review spot-checked ~20 test modules for tautologies (tests that +only assert what a mock was configured to return). The suite is, on the +whole, **clean** — the cxml helper plus real-element fixtures keep most +tests grounded in behaviour. A few borderline cases: + +- **`tests/test_document.py:231-235`** — `it_provides_access_to_the_comments` + sets `document_part_.comments = comments_` and then asserts + `document.comments is comments_`. This is a pure "the property forwards" + test; value is low but not zero (confirms the getter isn't hardcoded). + Pattern repeats ~5 times in that file (e.g. `its_core_properties`, + `its_settings`). + +- **`tests/test_section.py:1011-1026`** — the `_get_or_add_definition` test + cluster mocks four of the method's collaborators (`_has_definition_prop_`, + `_prior_headerfooter_prop_`, `_add_definition_`, and returns a mocked + `header_part_`) and then asserts the correct collaborator was called. + This is an "interaction test" — brittle to internal refactoring. Worth + leaving as-is for legacy code but not a pattern to propagate. 
+ +- **`tests/test_accessibility.py:217-249`** — + `DescribeDocument_validate_heading_structure::it_calls_the_module_function_with_document_paragraphs` + patches `docx.accessibility.validate_heading_structure` and asserts the + mock was called with the right arguments *and* `result is mock.return_value`. + This is a pure-delegation test; the `return_value` check is a tautology. + Dropping the `result == mock.return_value` assertion would tighten it. + +### 4.2 Over-mocked tests + +- **`tests/test_section.py:980-1026`** — `DescribeBaseHeaderFooter::_get_or_add_definition` + replaces four collaborators with property/method mocks. Refactoring the + class will break these tests even if behaviour is preserved. + +- **`tests/test_custom_xml.py:270-279`** — a class-level `import pytest` + inside the test class followed by a `document_part_` fixture nested within + the class. Awkward placement; should be a top-level fixture or (better) a + conftest fixture (see section 7). + +### 4.3 Order-dependent tests + +Spot-check found **none**. The `fake_parent` and `blank_document` fixtures in +`tests/conftest.py` are function-scoped, and the cxml helpers produce fresh +elements per test. The `tmp_docx_path` fixture uses `tempfile.mkstemp` with +per-test cleanup. + +### 4.4 Misc test-hygiene notes + +- **`tests/oxml/test__init__.py:147-148`** — contains a stray `class + CustElmCls(BaseOxmlElement): pass` inside a test module. It's a fixture + for the tests above but is placed in an out-of-the-way "static fixture" + comment block at the bottom; looks accidental. + +- **`tests/opc/test_pkgreader.py:479-481`** — `try/except: pass` block. On + inspection it is *intentionally* swallowing an expected `TypeError` — but + `pytest.raises(TypeError)` would be clearer. + +- **`tests/helpers/libreoffice.py:57`, `helpers/validate.py:158`, + `helpers/schema.py:63,108`, `helpers/roundtrip.py:37,59`** — all use + `try/except` but these are *helpers*, not test assertions; appropriate. 
+ +- **`tests/test_strategy.py:305,351,369`** — `try/.../finally: os.unlink` + patterns where a context manager or `tmp_docx_path` fixture would be + cleaner. These could be collapsed to the existing fixture. + +--- + +## 5. Outdated patterns + +**`unittest.TestCase` usage:** **none**. Only `unittest.mock` is imported, +which is the pytest-idiomatic use. The codebase is cleanly pytest-native. + +- `tests/test_custom_xml.py:8` — `from unittest.mock import MagicMock` — + fine; could use the repo's own `unitutil.mock` helpers instead (only + place `MagicMock` is imported directly). +- `tests/test_accessibility.py:222` — `from unittest.mock import patch` + inside a method. Should be at module top. +- `tests/unitutil/mock.py:6` — appropriate (utility wrapper). + +**`setUp`/`tearDown`:** **none**. + +**Try/except and try/finally where a pytest idiom (`pytest.raises`, a fixture) would be cleaner:** + +- `tests/test_docm.py:32-48`, `50-64` — both wrap `tempfile.NamedTemporaryFile` + output in `try:/finally: os.unlink(tmp_path)` to clean up. Trivial: use the + existing `tmp_docx_path` fixture (rename for `.docm`) or + `tmp_path_factory`. + +- `tests/test_strategy.py:302-319` — two `tempfile.mkstemp` calls with a + `try:/finally: os.unlink`. Could use `tmp_path_factory` fixture. + +No true "`try/except: fail('expected exception')`" anti-patterns were found +in the test suite. + +**Deprecated pytest idioms:** none found. `pytest.mark.parametrize` is used +consistently; `pytest.raises` is the norm for exception assertions. + +--- + +## 6. Behave acceptance tests + +Location: `features/` (70 feature files plus `steps/` with 22 step files, +~4103 lines). All 67 features / 650 scenarios / 1856 steps pass in ~2s. 
+ +### 6.1 Coverage map + +Features are grouped by subject prefix: + +| prefix | topic | files | notes | +|---|---|---:|---| +| `api-` | top-level `docx.Document` API | 1 | smoke | +| `blk-` | `BlockItemContainer` | 3 | core; fine | +| `cmt-` | Comments | 2 | Phase D | +| `doc-` | Document API (sections, add-X, collections, settings, comments) | 12 | strong | +| `hdr-` | Header/Footer | 1 | ok | +| `hlk-` | Hyperlink | 1 | ok | +| `img-` | Image characterisation | 1 | ok | +| `num-` | Numbering | 1 | only `num-access-numbering-part.feature` — **no coverage** of Phase-D.9 `apply_to` / `add_numbering_definition` | +| `par-` | Paragraph | 9 | strong | +| `pbk-` | Page break | 1 | ok | +| `run-` | Run | 6 | strong | +| `sct-` | Section | 1 | ok | +| `shp-` | Inline shape | 2 | ok | +| `sty-` | Styles | 7 | strong | +| `tab-` | Tabs / tab-stops | 2 | ok | +| `tbl-` | Table | 9 | strong | +| `txt-` | Text/font | 4 | strong | + +### 6.2 Gaps — acceptance tests that do **not** exist + +No behave coverage for the following newer features, despite each being a +headline Phase-D/Phase-B addition: + +- **Footnotes** (`docx.footnotes`) — no `fnt-*.feature`. +- **Endnotes** (`docx.endnotes`) — no `ent-*.feature`. +- **Bookmarks** (`docx.bookmarks`) — no `bkm-*.feature`. +- **Tracked changes** (`docx.tracked_changes`) — no `trk-*.feature`. +- **Fields** (`docx.fields`, legacy form fields) — no `fld-*.feature`. +- **Table of contents** (`docx.toc`) — no `toc-*.feature`. +- **Watermarks** (`docx.watermark`) — no `wmk-*.feature`. +- **Content controls / SDTs** (`docx.content_controls`) — no `sdt-*.feature`. +- **Custom XML parts** (`docx.custom_xml`) — no behave coverage. +- **Custom properties** (`docx.custom_properties`) — no behave coverage. +- **Ruby** (`docx.ruby`) — no behave coverage. +- **Ink annotations** (`docx.ink`) — no behave coverage. +- **Digital signatures** (`docx.signatures`) — no behave coverage. 
+- **Numbering.add_numbering_definition** / `apply_to` — only the legacy + "access numbering part" feature exists. +- **Floating images** (Phase D.17) — covered only by `shp-inline-shape-*.feature`. +- **Watermark** (Phase D.23), **Table autofit / column widths** (Phase D.26), + **Insert paragraph/table at position** (Phase D.13) — all untested by + behave. + +### 6.3 Behave fixture state + +`features/steps/test_files/` contains the test-fixture `.docx` files. Size +(number of files) is reasonable and the content is tracked in git. +`features/environment.py` is a minimal `before_feature`/`after_feature` +boilerplate — **not** a hook for setup that requires external tools, so +behave runs in-process and quickly (~2 s total). + +No obvious stale scenarios were detected. The 650 scenarios all exercise the +pre-fork legacy API surface; nothing newer than 2019 pytest-port era. + +### 6.4 Recommendation + +Behave is **healthy but stale**. Rather than retrofit scenarios for every +Phase-D feature (expensive and low-marginal-value given the strong unit+XML +tests), pick 2-3 flagship features (footnotes, tracked changes, TOC) and add +a small `*-props.feature` + `*-mutations.feature` pair for each, mirroring +the comments pattern (`cmt-props.feature`, `cmt-mutations.feature`). + +--- + +## 7. Flaky-test risk analysis + +Grep for common flake sources: + +### 7.1 Wall-clock dependency + +- **`tests/test_comments.py:117,122`** — `datetime.now(...)` bracketing the + under-test call. Safe: the test asserts the timestamp falls in the + `[before, after]` interval, which is robust to clock jitter. +- **`tests/opc/parts/test_coreprops.py:41`** — `dt.datetime.now(...) - core_properties.modified` + used to assert recency. Similar shape; safe. + +No `time.sleep` calls in `tests/`. No `monotonic()`, `perf_counter`, or +explicit deadline checks. + +### 7.2 Filesystem state + +`tempfile.mkstemp`/`NamedTemporaryFile` usage in: + +- `tests/conftest.py:29-36` — the `tmp_docx_path` fixture. 
Correct: closes + the fd, yields, unlinks after. +- `tests/test_docm.py:32-48,53-64` — two inlined variants; should use the + shared fixture. +- `tests/test_strategy.py:302-319` — inlined `mkstemp` with + `try/finally: os.unlink`; should use the shared fixture. +- `tests/helpers/roundtrip.py`, `helpers/libreoffice.py` — helpers, not + tests. + +No tests depend on the CWD. No tests depend on environment variables. + +### 7.3 External resources + +- **LibreOffice-backed tests** (`test_strategy.py::DescribeLayer5_LibreOfficeValidation`) + — gated behind `pytest.mark.libreoffice` and `is_libreoffice_available()` + checks; correctly `pytest.skip`s on machines without LibreOffice. 1 skip + in the baseline run (likely this fixture under CI). +- **Reference-doc-dependent tests** (`test_strategy.py::DescribeLayer4_ReferenceComparison`) + — gated behind `ref_docx_exists()` and skip cleanly when the reference + `.docx` file is absent. The `tests/ref-docs/` directory currently + contains only a README listing "planned" reference files. + +### 7.4 Network + +Zero tests touch the network. `grep -rn "urlopen\|requests\|http" tests/` +returns nothing test-relevant. + +### 7.5 Summary + +Flake risk: **low**. The two real concerns are (1) the redundant temp-file +boilerplate in `test_docm.py` and `test_strategy.py` (cleanliness, not +flakiness), and (2) the LibreOffice-gated tests, which are already +pragmatically skipped. + +--- + +## 8. Missing conftest fixtures — duplication hotspots + +The most-duplicated per-class fixture patterns across `tests/`: + +| fixture shape | approx. duplicate count | suggestion | +|---|---:|---| +| `def document_part_(self, request): return instance_mock(request, DocumentPart)` | 25+ (e.g. 
`test_document.py`, `test_custom_xml.py:276`, `test_section.py:1039`, `parts/test_story.py:137`) | promote to `tests/conftest.py` as `document_part_`| +| `def parent_(self, request): return instance_mock(request, Table)` / `...Paragraph)` / `...BlockItemContainer)` | 12+ (e.g. `test_table.py:1120`, `test_blkcntnr.py:150`) | possibly per-subpackage conftest (table/block) | +| `def paragraph_(self, request): return instance_mock(request, Paragraph)` | 10+ (e.g. `test_blkcntnr.py:150`, `text/test_paragraph.py:*`) | conftest in `tests/text/` | +| `def part_(self, request): return instance_mock(request, XmlPart)` | 6+ (e.g. `opc/test_package.py:233`, `test_custom_properties.py:174`) | package-local conftest in `tests/opc/` | +| `def paragraph_format_(self, request): ...` | 3+ (`styles/test_style.py:742`) | local conftest | + +Additionally: + +- `tests/conftest.py` already exposes `fake_parent`, `tmp_docx_path`, + `blank_document`. A natural extension is: + - `document_part_` (mock of `DocumentPart`) + - `paragraph_` (mock of `Paragraph`) + - `run_` (mock of `Run`) + - `part_` (mock of generic `XmlPart`) + +These would not break any tests (the local overrides would still win); they +would remove ~200 lines of boilerplate from the test suite. + +**Builder helpers** are already consolidated in +`tests/unitutil/cxml.py` (cxml element / xml expressions) and +`tests/unitutil/mock.py` (class_mock / instance_mock / method_mock / +property_mock). Those are in good shape. + +--- + +## 9. Dead code / skipped tests + +- **`tests/ref-docs/`** — documented but the directory contains no + reference `.docx` files. All Layer-4 tests in `test_strategy.py` skip. + Either commit the reference files (see `tests/ref-docs/README.md`), or + remove the Layer-4 tests. + +- **`tests/test_strategy.py::DescribeLayer5_LibreOfficeValidation`** — + only runs when LibreOffice is available. CI either needs to install + libreoffice-headless or mark this class as "local dev only". 
+ +- **`tests/oxml/test__init__.py:147-148`** — the `class CustElmCls` stub is + still in the module; harmless but worth a comment explaining why. + +No commented-out tests or orphaned `def test_*` lines were detected. + +--- + +## 10. Recommendations (follow-up issue backlog) + +Effort labels: **S** ≤ 1 day, **M** 1-3 days, **L** > 3 days. + +### Correctness / blocker + +1. **[L] Resolve the duplicate `WD_BORDER_STYLE` / `CT_Border` conflict.** + Re-read section 3 — either consolidate the enums or parent-discriminate + the element-class registration. Re-enable the 28 deselected tests once + the underlying bug is fixed. This is the only item that represents a + *production bug*, not just a test gap. + +2. **[S] Move `BorderElement`, `BordersCollection`, and any other + table-border writer code into a single module importing the canonical + `WD_BORDER_STYLE`.** Part of #1, small when #1 is accepted. + +### Coverage fills (mostly small, high ROI) + +3. **[M] `docx.numbering.apply_to` has zero tests.** Add unit tests + covering: paragraph-to-definition attach, level range validation, + matching-num-id reuse vs. new-num-id creation. Also cover the + positional/mapping `LevelSpec` error paths (`_normalize_format` + TypeError; short positional tuple ValueError). + +4. **[S] `docx.form_fields.*FormField` read-path None branches.** Add a + single "bare `w:ffData`" XML fixture and parametrise across + TextInput/Checkbox/Dropdown properties. Covers ~20 of the missed + lines. + +5. **[S] `docx.image.svg` parametric tests for units (`pt`, `in`, `cm`, + `mm`), non-UTF-8 streams, and malformed XML.** Each case is a one-line + fixture feeding `Svg.from_stream`. + +6. **[S] `docx.shape.FloatingShape` `alt_text`/`title`/type-dispatch + branches** (`tests/test_shape.py`) — add parametrised tests for CHART, + SMART_ART, NOT_IMPLEMENTED; assert alt_text returns None when `docPr` + absent. + +7. 
**[S] `docx.signatures` malformed-XML paths** — feed a non-XML string + and broken ``/`` fragments and assert the + `except Exception` clauses return None rather than propagating. + +8. **[M] `docx.oxml.content_controls` None-branch parametric coverage** + (`tests/test_content_controls.py`). Ten or so one-liner parametrised + cases would lift the module to ~99 %. + +9. **[S] `docx.parts.story._new_svg_pic_inline` floating-image SVG test.** + One integration test calling `Document.add_picture("sample.svg")` with + a floating anchor. + +10. **[S] `docx.oxml.simpletypes.validate` error branches.** Drive through + `pytest.raises` for each simpletype's `validate` method. 30+ tests, + each one-liner. + +11. **[S] `docx.enum.base.DocsPageFormatter` smoke test** — a single test + instantiating it against `WD_PARAGRAPH_ALIGNMENT.__dict__` and + asserting the returned string starts with `.. _`. + +### Behave fills + +12. **[M] Add behave coverage for footnotes, endnotes, and tracked + changes.** Mirror `cmt-props.feature` + `cmt-mutations.feature`. + ~150 lines of `.feature` files + ~200 lines of steps per topic. + +13. **[S] Add a `num-define-and-apply.feature` for + `Numbering.add_numbering_definition` and `NumberingDefinition.apply_to`.** + Hooks straight into the Phase-D.9 gap flagged in section 2. + +14. **[S] Add `toc-*.feature`, `wmk-*.feature`, `fld-*.feature`.** + Smoke-coverage-only; each 1-2 scenarios. + +### Test infrastructure + +15. **[S] Promote common mock fixtures to `tests/conftest.py`.** Add + `document_part_`, `paragraph_`, `run_`, `part_` fixtures. Delete the + local duplicates in `test_document.py`, `test_section.py`, + `test_custom_xml.py`, `test_blkcntnr.py`, etc. Expected diff: -200 + lines of boilerplate. + +16. **[S] Replace ad-hoc `tempfile.mkstemp` in `test_docm.py` and + `test_strategy.py` with the existing `tmp_docx_path` fixture (or add + `tmp_docm_path`).** + +17. 
**[S] Populate `tests/ref-docs/` with the planned reference files** + (comments-simple, comments-threaded, comments-multi-author, + comments-formatted). Either commit them or remove the Layer-4 scaffolding. + +18. **[S] Add `libreoffice-headless` to the CI runner**, or remove the + Layer-5 tests. Today, the 2 Layer-5 tests always skip in CI. + +### Hygiene + +19. **[S] Fix the 28 deselected tests after #1 lands** — they will start + passing; remove the `--deselect` lines from CI configs. + +20. **[S] Move `from unittest.mock import patch` from + `tests/test_accessibility.py:222` to the module top.** Trivial. + +21. **[S] Convert `try/except: pass` in `tests/opc/test_pkgreader.py:479-481` + to `pytest.raises(TypeError)`.** Trivial; improves readability. + +22. **[S] Audit `tests/test_document.py`'s "forward-and-assert" tests** + (the `comments`, `core_properties`, `settings` group). Consider + collapsing to a single parametrised "proxies expose the expected + `part` attributes" test. + +--- + +## Appendix A — full coverage output + +See `pyproject.toml` for test configuration. To reproduce: + +``` +uv pip install pytest-cov +uv run pytest --cov=docx --cov-report=term-missing tests/ \ + --deselect tests/test_table.py::DescribeBorderElement \ + --deselect tests/oxml/test_table.py::DescribeCT_Border \ + --deselect tests/oxml/test_table.py::DescribeCT_TblBorders \ + --deselect tests/oxml/test_table.py::DescribeCT_TcBorders +``` + +Expected outcome: `4058 passed, 1 skipped, 28 deselected in ~33s`, overall +**97 %** line coverage. + +To reproduce behave: + +``` +uv run behave features/ +``` + +Expected: `67 features passed, 0 failed, 0 skipped / 650 scenarios passed / +1856 steps passed` in ~2s. 
diff --git a/audits/branch-pruning-2026-05-05.md b/audits/branch-pruning-2026-05-05.md new file mode 100644 index 000000000..a8968679b --- /dev/null +++ b/audits/branch-pruning-2026-05-05.md @@ -0,0 +1,26 @@ +# Branch pruning — 2026-05-05 + +Remote branches deleted from `origin` as part of the Wave-C audit cleanup. +Every branch listed here was strictly merged into `origin/master` at time +of deletion (verified with `git branch -r --merged origin/master`), so no +work was lost. The commit history is preserved via the merge commits +already on master. + +## Pruned branches (7) + +- `origin/feat/w10-a-smartart-authoring` — merged in `0c873592` +- `origin/feat/w10-b-bibliography-authoring` — merged in `19221e52` +- `origin/feat/w10-f-field-eval` — merged in `100c9449` +- `origin/fix/w11-a-indexing-perf` — merged in `9773d977` +- `origin/fix/w8-a-part-drop-narrowing` — merged in `46f92e0f` +- `origin/fix/w8-b-reproducible-fixes` — merged in `30e16052` +- `origin/fix/w8-e-api-gaps` — merged in `71355efb` + +## Intentionally retained + +- `origin/feat/w11-d-upstream-sync` — retained per audit policy (handled by Wave-D). +- `origin/feat/w1-e-conformance-ci` — parked per policy (conformance CI still in design). +- `origin/agent/issue-28`, `origin/develop`, `origin/fix/overnight-n4-section-valign` + — merged, but outside the Wave-C safe-to-prune patterns (`feat/w10-*`, + `fix/w8-*`, `fix/w11-a-*`, `chore/overnight-*`, `worktree-agent-*`). + Left alone for a future hygiene pass. 
diff --git a/corpus-manifests/docx/bibliography-authoring.json b/corpus-manifests/docx/bibliography-authoring.json new file mode 100644 index 000000000..875c32cd9 --- /dev/null +++ b/corpus-manifests/docx/bibliography-authoring.json @@ -0,0 +1,131 @@ +{ + "$schema": "../manifest.schema.json", + "id": "docx/bibliography-authoring", + "title": "Bibliography and citation authoring", + "format": "docx", + "category": "references", + "summary": "A document with one bibliography source (`smith2020`) stored in `customXml/item{N}.xml` under ``, and a matching inline `` citation reference in `word/document.xml` carrying a `` marker and a complex `CITATION` field instruction.", + "spec": { + "source": "ecma-376-5-part-1", + "clause": "22.9 (bibliography) and 17.5.2 (structured document tags)", + "element": "b:Sources / w:sdt / w:citation", + "rnc_reference": "spec/ecma-376-5/part-1/rnc/wml.rnc", + "xsd_reference": "spec/ecma-376-5/part-1/xsd/wml.xsd", + "notes": "The bibliography namespace is `http://schemas.openxmlformats.org/officeDocument/2006/bibliography` (`b:` prefix). A bibliography part is a customXml data part at `/customXml/item{N}.xml` with a `` root; each source is a `` child carrying ``, ``, ``, ``, and one or more `` entries. Citation references in the document body are `` content controls whose `` carries a `` type marker; `` holds a complex-field instruction of the form ` CITATION \\l `. Word tolerates implicit binding by `` value; explicit `` is not required for the citation to round-trip through Word." 
+ }, + "fixtures": { + "machine": "docx/bibliography-authoring", + "office": "docx/bibliography-authoring" + }, + "generator": { + "python": "scripts/gen_bibliography_authoring.py" + }, + "assertions": [ + { + "id": "bibliography-part-exists", + "part": "customXml/item1.xml", + "namespaces": { + "b": "http://schemas.openxmlformats.org/officeDocument/2006/bibliography" + }, + "xpath": "/b:Sources", + "must": "exist", + "description": "A `<b:Sources>` root element must be present in `/customXml/item1.xml`." + }, + { + "id": "bibliography-source-tag-present", + "part": "customXml/item1.xml", + "namespaces": { + "b": "http://schemas.openxmlformats.org/officeDocument/2006/bibliography" + }, + "xpath": "//b:Source/b:Tag[normalize-space()='smith2020']", + "must": "exist", + "description": "A `<b:Source>` whose `<b:Tag>` text equals `smith2020` must exist." + }, + { + "id": "bibliography-source-title-present", + "part": "customXml/item1.xml", + "namespaces": { + "b": "http://schemas.openxmlformats.org/officeDocument/2006/bibliography" + }, + "xpath": "//b:Source[b:Tag='smith2020']/b:Title[normalize-space()='Distributed Systems']", + "must": "exist", + "description": "The `smith2020` source must carry the expected `<b:Title>`." + }, + { + "id": "bibliography-source-year-present", + "part": "customXml/item1.xml", + "namespaces": { + "b": "http://schemas.openxmlformats.org/officeDocument/2006/bibliography" + }, + "xpath": "//b:Source[b:Tag='smith2020']/b:Year[normalize-space()='2020']", + "must": "exist", + "description": "The `smith2020` source must carry the expected `<b:Year>`." + }, + { + "id": "bibliography-itemprops-schemaref", + "part": "customXml/itemProps1.xml", + "namespaces": { + "ds": "http://schemas.openxmlformats.org/officeDocument/2006/customXml" + }, + "xpath": "//ds:schemaRef[@ds:uri='http://schemas.openxmlformats.org/officeDocument/2006/bibliography']", + "must": "exist", + "description": "The sibling `itemProps1.xml` must declare the bibliography schemaRef URI." 
+ }, + { + "id": "citation-sdt-marker-present", + "part": "word/document.xml", + "namespaces": { + "w": "http://schemas.openxmlformats.org/wordprocessingml/2006/main" + }, + "xpath": "//w:sdt/w:sdtPr[w:citation]", + "must": "exist", + "description": "A `<w:sdt>` with a `<w:citation/>` marker inside its `<w:sdtPr>` must exist (the citation reference produced by `Paragraph.add_citation_reference`)." + }, + { + "id": "citation-sdt-instrText-contains-tag", + "part": "word/document.xml", + "namespaces": { + "w": "http://schemas.openxmlformats.org/wordprocessingml/2006/main" + }, + "xpath": "//w:sdt[w:sdtPr/w:citation]//w:instrText[contains(normalize-space(), 'CITATION smith2020')]", + "must": "exist", + "description": "The citation SDT must carry a `CITATION <tag>` complex-field instruction referencing `smith2020`." + }, + { + "id": "customxml-rel-retained", + "part": "word/_rels/document.xml.rels", + "namespaces": { + "r": "http://schemas.openxmlformats.org/package/2006/relationships" + }, + "xpath": "//r:Relationship[@Type='http://schemas.openxmlformats.org/officeDocument/2006/relationships/customXml']", + "must": "exist", + "description": "A `customXml` relationship from `word/document.xml` to the bibliography part must be retained in the output package." + } + ], + "render_assertions": [ + { + "id": "citation-rendered-in-body", + "kind": "css_selector", + "selector": ".docx-wrapper *", + "must": "match-text", + "value": "(smith2020)", + "description": "Renderers that do not resolve CITATION fields should fall back to the cached result text — python-docx emits `(smith2020)` by default when no explicit `result_text` is supplied." 
+ } + ], + "authoring": { + "library": "python-docx", + "apis": [ + "Document.bibliography", + "Document.add_citation", + "Paragraph.add_citation_reference", + "Bibliography.sources", + "Bibliography.get_by_tag", + "Source.tag", + "Source.title", + "Source.author", + "Source.year", + "Source.source_type" + ], + "notes": "This manifest exercises the authoring surface shipped in python-docx 2026.05.7. The generator script builds a fresh document, calls `Document.add_citation` twice (sources `smith2020` and `einstein1905`), wires an inline citation reference for each into a paragraph, and saves to the machine-fixture output directory." + } +} diff --git a/corpus-manifests/scripts/gen_bibliography_authoring.py b/corpus-manifests/scripts/gen_bibliography_authoring.py new file mode 100644 index 000000000..f93d29e5f --- /dev/null +++ b/corpus-manifests/scripts/gen_bibliography_authoring.py @@ -0,0 +1,69 @@ +"""Generator for the `docx/bibliography-authoring` corpus manifest. + +Builds a fresh document carrying two `<b:Sources>/<b:Source>` entries +(``smith2020`` and ``einstein1905``) in the bibliography part plus one inline +`<w:sdt>` citation reference in the body for each of those tags. + +Usage:: + + python corpus-manifests/scripts/gen_bibliography_authoring.py [OUTPUT_PATH] + +The output path may be omitted; it defaults to +``fixtures/docx/bibliography-authoring.docx`` relative to the current +working directory (matching the corpus-repo convention). + +The generator deliberately uses only the public python-docx API so that the +manifest doubles as an executable contract for the authoring surface. 
+""" + +from __future__ import annotations + +import pathlib +import sys + +from docx import Document + + +def build_document(): + document = Document() + + # -- one primary source, reached from the body by tag -- + document.add_citation( + "smith2020", + title="Distributed Systems", + author="Smith, John", + year=2020, + city="London", + publisher="Acme", + ) + # -- a second source, exercising source_type + field aliasing -- + document.add_citation( + "einstein1905", + source_type="JournalArticle", + title="Zur Elektrodynamik bewegter Koerper", + author="Einstein, Albert", + year=1905, + ) + + p = document.add_paragraph("As argued in ") + p.add_citation_reference("smith2020") + p.add_run(", ... and again by ") + p.add_citation_reference("einstein1905") + p.add_run(".") + + return document + + +def main(argv: "list[str]") -> int: + out = pathlib.Path( + argv[1] if len(argv) > 1 else "fixtures/docx/bibliography-authoring.docx" + ) + out.parent.mkdir(parents=True, exist_ok=True) + document = build_document() + document.save(str(out)) + print(f"wrote {out}") + return 0 + + +if __name__ == "__main__": + sys.exit(main(sys.argv)) diff --git a/docs/_templates/sidebarlinks.html b/docs/_templates/sidebarlinks.html deleted file mode 100644 index 7f261bd71..000000000 --- a/docs/_templates/sidebarlinks.html +++ /dev/null @@ -1,6 +0,0 @@ -

Useful Links

- diff --git a/docs/_themes/armstrong/LICENSE b/docs/_themes/armstrong/LICENSE deleted file mode 100644 index 337e8b2a2..000000000 --- a/docs/_themes/armstrong/LICENSE +++ /dev/null @@ -1,25 +0,0 @@ -Copyright (c) 2011 Bay Citizen & Texas Tribune - -Original ReadTheDocs.org code -Copyright (c) 2010 Charles Leifer, Eric Holscher, Bobby Grace - -Permission is hereby granted, free of charge, to any person -obtaining a copy of this software and associated documentation -files (the "Software"), to deal in the Software without -restriction, including without limitation the rights to use, -copy, modify, merge, publish, distribute, sublicense, and/or sell -copies of the Software, and to permit persons to whom the -Software is furnished to do so, subject to the following -conditions: - -The above copyright notice and this permission notice shall be -included in all copies or substantial portions of the Software. - -THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT -HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, -WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR -OTHER DEALINGS IN THE SOFTWARE. 
diff --git a/docs/_themes/armstrong/layout.html b/docs/_themes/armstrong/layout.html deleted file mode 100644 index d7b8fbb14..000000000 --- a/docs/_themes/armstrong/layout.html +++ /dev/null @@ -1,48 +0,0 @@ -{% extends "basic/layout.html" %} - -{% set script_files = script_files + [pathto("_static/searchtools.js", 1)] %} - -{% block htmltitle %} -{{ super() }} - - - -{% endblock %} - -{% block footer %} - - - -{% if theme_analytics_code %} - - -{% endif %} - -{% endblock %} diff --git a/docs/_themes/armstrong/rtd-themes.conf b/docs/_themes/armstrong/rtd-themes.conf deleted file mode 100644 index 5930488d7..000000000 --- a/docs/_themes/armstrong/rtd-themes.conf +++ /dev/null @@ -1,65 +0,0 @@ -[theme] -inherit = default -stylesheet = rtd.css -pygment_style = default -show_sphinx = False - -[options] -show_rtd = True - -white = #ffffff -almost_white = #f8f8f8 -barely_white = #f2f2f2 -dirty_white = #eeeeee -almost_dirty_white = #e6e6e6 -dirtier_white = #dddddd -lighter_gray = #cccccc -gray_a = #aaaaaa -gray_9 = #999999 -light_gray = #888888 -gray_7 = #777777 -gray = #666666 -dark_gray = #444444 -gray_2 = #222222 -black = #111111 -light_color = #e8ecef -light_medium_color = #DDEAF0 -medium_color = #8ca1af -medium_color_link = #86989b -medium_color_link_hover = #a6b8bb -dark_color = #465158 - -h1 = #000000 -h2 = #465158 -h3 = #6c818f - -link_color = #444444 -link_color_decoration = #CCCCCC - -medium_color_hover = #697983 -green_highlight = #8ecc4c - - -positive_dark = #609060 -positive_medium = #70a070 -positive_light = #e9ffe9 - -negative_dark = #900000 -negative_medium = #b04040 -negative_light = #ffe9e9 -negative_text = #c60f0f - -ruler = #abc - -viewcode_bg = #f4debf -viewcode_border = #ac9 - -highlight = #ffe080 - -code_background = #eeeeee - -background = #465158 -background_link = #ffffff -background_link_half = #ffffff -background_text = #eeeeee -background_text_link = #86989b diff --git a/docs/_themes/armstrong/static/rtd.css_t 
b/docs/_themes/armstrong/static/rtd.css_t deleted file mode 100644 index 578946ab9..000000000 --- a/docs/_themes/armstrong/static/rtd.css_t +++ /dev/null @@ -1,781 +0,0 @@ -/* - * rtd.css - * ~~~~~~~~~~~~~~~ - * - * Sphinx stylesheet -- sphinxdoc theme. Originally created by - * Armin Ronacher for Werkzeug. - * - * Customized for ReadTheDocs by Eric Pierce & Eric Holscher - * - * :copyright: Copyright 2007-2010 by the Sphinx team, see AUTHORS. - * :license: BSD, see LICENSE for details. - * - */ - -/* RTD colors - * light blue: {{ theme_light_color }} - * medium blue: {{ theme_medium_color }} - * dark blue: {{ theme_dark_color }} - * dark grey: {{ theme_grey_color }} - * - * medium blue hover: {{ theme_medium_color_hover }}; - * green highlight: {{ theme_green_highlight }} - * light blue (project bar): {{ theme_light_color }} - */ - -@import url("basic.css"); - -/* PAGE LAYOUT -------------------------------------------------------------- */ - -body { - font: 100%/1.5 "ff-meta-web-pro-1","ff-meta-web-pro-2",Arial,"Helvetica Neue",sans-serif; - text-align: center; - color: black; - background-color: {{ theme_background }}; - padding: 0; - margin: 0; -} - -div.document { - text-align: left; - background-color: {{ theme_light_color }}; -} - -div.bodywrapper { - background-color: {{ theme_white }}; - border-left: 1px solid {{ theme_lighter_gray }}; - border-bottom: 1px solid {{ theme_lighter_gray }}; - margin: 0 0 0 16em; -} - -div.body { - margin: 0; - padding: 0.5em 1.3em; - max-width: 55em; - min-width: 20em; -} - -div.related { - font-size: 1em; - background-color: {{ theme_background }}; -} - -div.documentwrapper { - float: left; - width: 100%; - background-color: {{ theme_light_color }}; -} - - -/* HEADINGS --------------------------------------------------------------- */ - -h1 { - margin: 0; - padding: 0.7em 0 0.3em 0; - font-size: 1.5em; - line-height: 1.15; - color: {{ theme_h1 }}; - clear: both; -} - -h2 { - margin: 2em 0 0.2em 0; - font-size: 1.35em; - 
padding: 0; - color: {{ theme_h2 }}; -} - -h3 { - margin: 1em 0 -0.3em 0; - font-size: 1.2em; - color: {{ theme_h3 }}; -} - -div.body h1 a, div.body h2 a, div.body h3 a, div.body h4 a, div.body h5 a, div.body h6 a { - color: black; -} - -h1 a.anchor, h2 a.anchor, h3 a.anchor, h4 a.anchor, h5 a.anchor, h6 a.anchor { - display: none; - margin: 0 0 0 0.3em; - padding: 0 0.2em 0 0.2em; - color: {{ theme_gray_a }} !important; -} - -h1:hover a.anchor, h2:hover a.anchor, h3:hover a.anchor, h4:hover a.anchor, -h5:hover a.anchor, h6:hover a.anchor { - display: inline; -} - -h1 a.anchor:hover, h2 a.anchor:hover, h3 a.anchor:hover, h4 a.anchor:hover, -h5 a.anchor:hover, h6 a.anchor:hover { - color: {{ theme_gray_7 }}; - background-color: {{ theme_dirty_white }}; -} - - -/* LINKS ------------------------------------------------------------------ */ - -/* Normal links get a pseudo-underline */ -a { - color: {{ theme_link_color }}; - text-decoration: none; - border-bottom: 1px solid {{ theme_link_color_decoration }}; -} - -/* Links in sidebar, TOC, index trees and tables have no underline */ -.sphinxsidebar a, -.toctree-wrapper a, -.indextable a, -#indices-and-tables a { - color: {{ theme_dark_gray }}; - text-decoration: none; - border-bottom: none; -} - -/* Most links get an underline-effect when hovered */ -a:hover, -div.toctree-wrapper a:hover, -.indextable a:hover, -#indices-and-tables a:hover { - color: {{ theme_black }}; - text-decoration: none; - border-bottom: 1px solid {{ theme_black }}; -} - -/* Footer links */ -div.footer a { - color: {{ theme_background_text_link }}; - text-decoration: none; - border: none; -} -div.footer a:hover { - color: {{ theme_medium_color_link_hover }}; - text-decoration: underline; - border: none; -} - -/* Permalink anchor (subtle grey with a red hover) */ -div.body a.headerlink { - color: {{ theme_lighter_gray }}; - font-size: 1em; - margin-left: 6px; - padding: 0 4px 0 4px; - text-decoration: none; - border: none; -} -div.body 
a.headerlink:hover { - color: {{ theme_negative_text }}; - border: none; -} - - -/* NAVIGATION BAR --------------------------------------------------------- */ - -div.related ul { - height: 2.5em; -} - -div.related ul li { - margin: 0; - padding: 0.65em 0; - float: left; - display: block; - color: {{ theme_background_link_half }}; /* For the >> separators */ - font-size: 0.8em; -} - -div.related ul li.right { - float: right; - margin-right: 5px; - color: transparent; /* Hide the | separators */ -} - -/* "Breadcrumb" links in nav bar */ -div.related ul li a { - order: none; - background-color: inherit; - font-weight: bold; - margin: 6px 0 6px 4px; - line-height: 1.75em; - color: {{ theme_background_link }}; - text-shadow: 0 1px rgba(0, 0, 0, 0.5); - padding: 0.4em 0.8em; - border: none; - border-radius: 3px; -} -/* previous / next / modules / index links look more like buttons */ -div.related ul li.right a { - margin: 0.375em 0; - background-color: {{ theme_medium_color_hover }}; - text-shadow: 0 1px rgba(0, 0, 0, 0.5); - border-radius: 3px; - -webkit-border-radius: 3px; - -moz-border-radius: 3px; -} -/* All navbar links light up as buttons when hovered */ -div.related ul li a:hover { - background-color: {{ theme_medium_color }}; - color: {{ theme_white }}; - text-decoration: none; - border-radius: 3px; - -webkit-border-radius: 3px; - -moz-border-radius: 3px; -} -/* Take extra precautions for tt within links */ -a tt, -div.related ul li a tt { - background: inherit !important; - color: inherit !important; -} - - -/* SIDEBAR ---------------------------------------------------------------- */ - -div.sphinxsidebarwrapper { - padding: 0; -} - -div.sphinxsidebar { - margin: 0; - margin-left: -100%; - float: left; - top: 3em; - left: 0; - padding: 0 1em; - width: 14em; - font-size: 1em; - text-align: left; - background-color: {{ theme_light_color }}; -} - -div.sphinxsidebar img { - max-width: 12em; -} - -div.sphinxsidebar h3, div.sphinxsidebar h4 { - margin: 1.2em 0 0.3em 
0; - font-size: 1em; - padding: 0; - color: {{ theme_gray_2 }}; - font-family: "ff-meta-web-pro-1", "ff-meta-web-pro-2", "Arial", "Helvetica Neue", sans-serif; -} - -div.sphinxsidebar h3 a { - color: {{ theme_grey_color }}; -} - -div.sphinxsidebar ul, -div.sphinxsidebar p { - margin-top: 0; - padding-left: 0; - line-height: 130%; - background-color: {{ theme_light_color }}; -} - -/* No bullets for nested lists, but a little extra indentation */ -div.sphinxsidebar ul ul { - list-style-type: none; - margin-left: 1.5em; - padding: 0; -} - -/* A little top/bottom padding to prevent adjacent links' borders - * from overlapping each other */ -div.sphinxsidebar ul li { - padding: 1px 0; -} - -/* A little left-padding to make these align with the ULs */ -div.sphinxsidebar p.topless { - padding-left: 0 0 0 1em; -} - -/* Make these into hidden one-liners */ -div.sphinxsidebar ul li, -div.sphinxsidebar p.topless { - white-space: nowrap; - overflow: hidden; -} -/* ...which become visible when hovered */ -div.sphinxsidebar ul li:hover, -div.sphinxsidebar p.topless:hover { - overflow: visible; -} - -/* Search text box and "Go" button */ -#searchbox { - margin-top: 2em; - margin-bottom: 1em; - background: {{ theme_dirtier_white }}; - padding: 0.5em; - border-radius: 6px; - -moz-border-radius: 6px; - -webkit-border-radius: 6px; -} -#searchbox h3 { - margin-top: 0; -} - -/* Make search box and button abut and have a border */ -input, -div.sphinxsidebar input { - border: 1px solid {{ theme_gray_9 }}; - float: left; -} - -/* Search textbox */ -input[type="text"] { - margin: 0; - padding: 0 3px; - height: 20px; - width: 144px; - border-top-left-radius: 3px; - border-bottom-left-radius: 3px; - -moz-border-radius-topleft: 3px; - -moz-border-radius-bottomleft: 3px; - -webkit-border-top-left-radius: 3px; - -webkit-border-bottom-left-radius: 3px; -} -/* Search button */ -input[type="submit"] { - margin: 0 0 0 -1px; /* -1px prevents a double-border with textbox */ - height: 22px; - color: 
{{ theme_dark_gray }}; - background-color: {{ theme_light_color }}; - padding: 1px 4px; - font-weight: bold; - border-top-right-radius: 3px; - border-bottom-right-radius: 3px; - -moz-border-radius-topright: 3px; - -moz-border-radius-bottomright: 3px; - -webkit-border-top-right-radius: 3px; - -webkit-border-bottom-right-radius: 3px; -} -input[type="submit"]:hover { - color: {{ theme_white }}; - background-color: {{ theme_green_highlight }}; -} - -div.sphinxsidebar p.searchtip { - clear: both; - padding: 0.5em 0 0 0; - background: {{ theme_dirtier_white }}; - color: {{ theme_gray }}; - font-size: 0.9em; -} - -/* Sidebar links are unusual */ -div.sphinxsidebar li a, -div.sphinxsidebar p a { - background: {{ theme_light_color }}; /* In case links overlap main content */ - border-radius: 3px; - -moz-border-radius: 3px; - -webkit-border-radius: 3px; - border: 1px solid transparent; /* To prevent things jumping around on hover */ - padding: 0 5px 0 5px; -} -div.sphinxsidebar li a:hover, -div.sphinxsidebar p a:hover { - color: {{ theme_black }}; - text-decoration: none; - border: 1px solid {{ theme_light_gray }}; -} - -/* Tweak any link appearing in a heading */ -div.sphinxsidebar h3 a { -} - - - - -/* OTHER STUFF ------------------------------------------------------------ */ - -cite, code, tt { - font-family: 'Consolas', 'Deja Vu Sans Mono', - 'Bitstream Vera Sans Mono', monospace; - font-size: 0.95em; - letter-spacing: 0.01em; -} - -tt { - background-color: {{ theme_code_background }}; - color: {{ theme_dark_gray }}; -} - -tt.descname, tt.descclassname, tt.xref { - border: 0; -} - -hr { - border: 1px solid {{ theme_ruler }}; - margin: 2em; -} - -pre, #_fontwidthtest { - font-family: 'Consolas', 'Deja Vu Sans Mono', - 'Bitstream Vera Sans Mono', monospace; - margin: 1em 2em; - font-size: 0.95em; - letter-spacing: 0.015em; - line-height: 120%; - padding: 0.5em; - border: 1px solid {{ theme_lighter_gray }}; - background-color: {{ theme_code_background }}; - border-radius: 
6px; - -moz-border-radius: 6px; - -webkit-border-radius: 6px; -} - -pre a { - color: inherit; - text-decoration: underline; -} - -td.linenos pre { - padding: 0.5em 0; -} - -div.quotebar { - background-color: {{ theme_almost_white }}; - max-width: 250px; - float: right; - padding: 2px 7px; - border: 1px solid {{ theme_lighter_gray }}; -} - -div.topic { - background-color: {{ theme_almost_white }}; -} - -table { - border-collapse: collapse; - margin: 0 -0.5em 0 -0.5em; -} - -table td, table th { - padding: 0.2em 0.5em 0.2em 0.5em; -} - - -/* ADMONITIONS AND WARNINGS ------------------------------------------------- */ - -/* Shared by admonitions, warnings and sidebars */ -div.admonition, -div.warning, -div.sidebar { - font-size: 0.9em; - margin: 2em; - padding: 0; - /* - border-radius: 6px; - -moz-border-radius: 6px; - -webkit-border-radius: 6px; - */ -} -div.admonition p, -div.warning p, -div.sidebar p { - margin: 0.5em 1em 0.5em 1em; - padding: 0; -} -div.admonition pre, -div.warning pre, -div.sidebar pre { - margin: 0.4em 1em 0.4em 1em; -} -div.admonition p.admonition-title, -div.warning p.admonition-title, -div.sidebar p.sidebar-title { - margin: 0; - padding: 0.1em 0 0.1em 0.5em; - color: white; - font-weight: bold; - font-size: 1.1em; - text-shadow: 0 1px rgba(0, 0, 0, 0.5); -} -div.admonition ul, div.admonition ol, -div.warning ul, div.warning ol, -div.sidebar ul, div.sidebar ol { - margin: 0.1em 0.5em 0.5em 3em; - padding: 0; -} - - -/* Admonitions and sidebars only */ -div.admonition, div.sidebar { - border: 1px solid {{ theme_positive_dark }}; - background-color: {{ theme_positive_light }}; -} -div.admonition p.admonition-title, -div.sidebar p.sidebar-title { - background-color: {{ theme_positive_medium }}; - border-bottom: 1px solid {{ theme_positive_dark }}; -} - - -/* Warnings only */ -div.warning { - border: 1px solid {{ theme_negative_dark }}; - background-color: {{ theme_negative_light }}; -} -div.warning p.admonition-title { - background-color: {{ 
theme_negative_medium }}; - border-bottom: 1px solid {{ theme_negative_dark }}; -} - - -/* Sidebars only */ -div.sidebar { - max-width: 200px; -} - - - -div.versioninfo { - margin: 1em 0 0 0; - border: 1px solid {{ theme_lighter_gray }}; - background-color: {{ theme_light_medium_color }}; - padding: 8px; - line-height: 1.3em; - font-size: 0.9em; -} - -.viewcode-back { - font-family: 'Lucida Grande', 'Lucida Sans Unicode', 'Geneva', - 'Verdana', sans-serif; -} - -div.viewcode-block:target { - background-color: {{ theme_viewcode_bg }}; - border-top: 1px solid {{ theme_viewcode_border }}; - border-bottom: 1px solid {{ theme_viewcode_border }}; -} - -dl { - margin: 1em 0 2.5em 0; -} - -/* Highlight target when you click an internal link */ -dt:target { - background: {{ theme_highlight }}; -} -/* Don't highlight whole divs */ -div.highlight { - background: transparent; -} -/* But do highlight spans (so search results can be highlighted) */ -span.highlight { - background: {{ theme_highlight }}; -} - -div.footer { - background-color: {{ theme_background }}; - color: {{ theme_background_text }}; - padding: 0 2em 2em 2em; - clear: both; - font-size: 0.8em; - text-align: center; -} - -p { - margin: 0.8em 0 0.5em 0; -} - -.section p img { - margin: 1em 2em; -} - - -/* MOBILE LAYOUT -------------------------------------------------------------- */ - -@media screen and (max-width: 600px) { - - h1, h2, h3, h4, h5 { - position: relative; - } - - ul { - padding-left: 1.75em; - } - - div.bodywrapper a.headerlink, #indices-and-tables h1 a { - color: {{ theme_almost_dirty_white }}; - font-size: 80%; - float: right; - line-height: 1.8; - position: absolute; - right: -0.7em; - visibility: inherit; - } - - div.bodywrapper h1 a.headerlink, #indices-and-tables h1 a { - line-height: 1.5; - } - - pre { - font-size: 0.7em; - overflow: auto; - word-wrap: break-word; - white-space: pre-wrap; - } - - div.related ul { - height: 2.5em; - padding: 0; - text-align: left; - } - - div.related ul li { 
- clear: both; - color: {{ theme_dark_color }}; - padding: 0.2em 0; - } - - div.related ul li:last-child { - border-bottom: 1px dotted {{ theme_medium_color }}; - padding-bottom: 0.4em; - margin-bottom: 1em; - width: 100%; - } - - div.related ul li a { - color: {{ theme_dark_color }}; - padding-right: 0; - } - - div.related ul li a:hover { - background: inherit; - color: inherit; - } - - div.related ul li.right { - clear: none; - padding: 0.65em 0; - margin-bottom: 0.5em; - } - - div.related ul li.right a { - color: {{ theme_white }}; - padding-right: 0.8em; - } - - div.related ul li.right a:hover { - background-color: {{ theme_medium_color }}; - } - - div.body { - clear: both; - min-width: 0; - word-wrap: break-word; - } - - div.bodywrapper { - margin: 0 0 0 0; - } - - div.sphinxsidebar { - float: none; - margin: 0; - width: auto; - } - - div.sphinxsidebar input[type="text"] { - height: 2em; - line-height: 2em; - width: 70%; - } - - div.sphinxsidebar input[type="submit"] { - height: 2em; - margin-left: 0.5em; - width: 20%; - } - - div.sphinxsidebar p.searchtip { - background: inherit; - margin-bottom: 1em; - } - - div.sphinxsidebar ul li, div.sphinxsidebar p.topless { - white-space: normal; - } - - .bodywrapper img { - display: block; - margin-left: auto; - margin-right: auto; - max-width: 100%; - } - - div.documentwrapper { - float: none; - } - - div.admonition, div.warning, pre, blockquote { - margin-left: 0em; - margin-right: 0em; - } - - .body p img { - margin: 0; - } - - #searchbox { - background: transparent; - } - - .related:not(:first-child) li { - display: none; - } - - .related:not(:first-child) li.right { - display: block; - } - - div.footer { - padding: 1em; - } - - .rtd_doc_footer .badge { - float: none; - margin: 1em auto; - position: static; - } - - .rtd_doc_footer .badge.revsys-inline { - margin-right: auto; - margin-bottom: 2em; - } - - table.indextable { - display: block; - width: auto; - } - - .indextable tr { - display: block; - } - - 
.indextable td { - display: block; - padding: 0; - width: auto !important; - } - - .indextable td dt { - margin: 1em 0; - } - - ul.search { - margin-left: 0.25em; - } - - ul.search li div.context { - font-size: 90%; - line-height: 1.1; - margin-bottom: 1; - margin-left: 0; - } - -} diff --git a/docs/_themes/armstrong/theme.conf b/docs/_themes/armstrong/theme.conf deleted file mode 100644 index 5930488d7..000000000 --- a/docs/_themes/armstrong/theme.conf +++ /dev/null @@ -1,65 +0,0 @@ -[theme] -inherit = default -stylesheet = rtd.css -pygment_style = default -show_sphinx = False - -[options] -show_rtd = True - -white = #ffffff -almost_white = #f8f8f8 -barely_white = #f2f2f2 -dirty_white = #eeeeee -almost_dirty_white = #e6e6e6 -dirtier_white = #dddddd -lighter_gray = #cccccc -gray_a = #aaaaaa -gray_9 = #999999 -light_gray = #888888 -gray_7 = #777777 -gray = #666666 -dark_gray = #444444 -gray_2 = #222222 -black = #111111 -light_color = #e8ecef -light_medium_color = #DDEAF0 -medium_color = #8ca1af -medium_color_link = #86989b -medium_color_link_hover = #a6b8bb -dark_color = #465158 - -h1 = #000000 -h2 = #465158 -h3 = #6c818f - -link_color = #444444 -link_color_decoration = #CCCCCC - -medium_color_hover = #697983 -green_highlight = #8ecc4c - - -positive_dark = #609060 -positive_medium = #70a070 -positive_light = #e9ffe9 - -negative_dark = #900000 -negative_medium = #b04040 -negative_light = #ffe9e9 -negative_text = #c60f0f - -ruler = #abc - -viewcode_bg = #f4debf -viewcode_border = #ac9 - -highlight = #ffe080 - -code_background = #eeeeee - -background = #465158 -background_link = #ffffff -background_link_half = #ffffff -background_text = #eeeeee -background_text_link = #86989b diff --git a/docs/_themes/armstrong/theme.conf.orig b/docs/_themes/armstrong/theme.conf.orig deleted file mode 100644 index a74a8a2ce..000000000 --- a/docs/_themes/armstrong/theme.conf.orig +++ /dev/null @@ -1,66 +0,0 @@ -[theme] -inherit = default -stylesheet = rtd.css -pygment_style = default 
-show_sphinx = False - -[options] -show_rtd = True - -white = #ffffff -almost_white = #f8f8f8 -barely_white = #f2f2f2 -dirty_white = #eeeeee -almost_dirty_white = #e6e6e6 -dirtier_white = #DAC6AF -lighter_gray = #cccccc -gray_a = #aaaaaa -gray_9 = #999999 -light_gray = #888888 -gray_7 = #777777 -gray = #666666 -dark_gray = #444444 -gray_2 = #222222 -black = #111111 -light_color = #EDE4D8 -light_medium_color = #DDEAF0 -medium_color = #8ca1af -medium_color_link = #634320 -medium_color_link_hover = #261a0c -dark_color = rgba(160, 109, 52, 1.0) - -h1 = #1f3744 -h2 = #335C72 -h3 = #638fa6 - -link_color = #335C72 -link_color_decoration = #99AEB9 - -medium_color_hover = rgba(255, 255, 255, 0.25) -medium_color = rgba(255, 255, 255, 0.5) -green_highlight = #8ecc4c - - -positive_dark = rgba(51, 77, 0, 1.0) -positive_medium = rgba(102, 153, 0, 1.0) -positive_light = rgba(102, 153, 0, 0.1) - -negative_dark = rgba(51, 13, 0, 1.0) -negative_medium = rgba(204, 51, 0, 1.0) -negative_light = rgba(204, 51, 0, 0.1) -negative_text = #c60f0f - -ruler = #abc - -viewcode_bg = #f4debf -viewcode_border = #ac9 - -highlight = #ffe080 - -code_background = rgba(0, 0, 0, 0.075) - -background = rgba(135, 57, 34, 1.0) -background_link = rgba(212, 195, 172, 1.0) -background_link_half = rgba(212, 195, 172, 0.5) -background_text = rgba(212, 195, 172, 1.0) -background_text_link = rgba(171, 138, 93, 1.0) diff --git a/docs/api/accessibility.rst b/docs/api/accessibility.rst new file mode 100644 index 000000000..dd4a3bec1 --- /dev/null +++ b/docs/api/accessibility.rst @@ -0,0 +1,19 @@ + +.. _accessibility_api: + +Accessibility +============= + +.. currentmodule:: docx.accessibility + + +|HeadingIssue| objects +---------------------- + +.. autoclass:: HeadingIssue() + + +Heading validation +------------------ + +.. 
autofunction:: validate_heading_structure diff --git a/docs/api/bookmarks.rst b/docs/api/bookmarks.rst new file mode 100644 index 000000000..493019db9 --- /dev/null +++ b/docs/api/bookmarks.rst @@ -0,0 +1,19 @@ + +.. _bookmarks_api: + +Bookmarks +========= + +.. currentmodule:: docx.bookmarks + + +|Bookmarks| objects +------------------- + +.. autoclass:: Bookmarks() + + +Bookmark objects +---------------- + +.. autoclass:: Bookmark() diff --git a/docs/api/captions.rst b/docs/api/captions.rst new file mode 100644 index 000000000..16199a71a --- /dev/null +++ b/docs/api/captions.rst @@ -0,0 +1,16 @@ + +.. _captions_api: + +Captions +======== + +.. currentmodule:: docx.captions + +The :mod:`docx.captions` module provides helpers for adding ``SEQ``-field +caption paragraphs styled with the ``Caption`` built-in style. + + +Functions +--------- + +.. autofunction:: new_caption_paragraph diff --git a/docs/api/chart.rst b/docs/api/chart.rst new file mode 100644 index 000000000..a247e2fc0 --- /dev/null +++ b/docs/api/chart.rst @@ -0,0 +1,27 @@ + +.. _chart_api: + +Charts +====== + +.. currentmodule:: docx.chart + + +|Chart| objects +--------------- + +.. autoclass:: Chart() + + +Chart series +------------ + +.. autoclass:: ChartSeries() + + +Chart enumerations +------------------ + +.. autoclass:: WD_CHART_TYPE() + :members: + :undoc-members: diff --git a/docs/api/content-controls.rst b/docs/api/content-controls.rst new file mode 100644 index 000000000..0a68353df --- /dev/null +++ b/docs/api/content-controls.rst @@ -0,0 +1,36 @@ + +.. _content_controls_api: + +Content controls (SDTs) +======================= + +.. currentmodule:: docx.content_controls + +Structured-document-tag support for rich-text, plain-text, date, checkbox, +combo-box, dropdown, and picture content controls. + + +|ContentControl| objects +------------------------ + +.. autoclass:: ContentControl() + + +Data binding +------------ + +.. 
autoclass:: DataBinding() + + +Content-control type +-------------------- + +.. autoclass:: ContentControlType() + :members: + :undoc-members: + + +Constructors +------------ + +.. autofunction:: new_sdt diff --git a/docs/api/custom-properties.rst b/docs/api/custom-properties.rst new file mode 100644 index 000000000..2399c1ef5 --- /dev/null +++ b/docs/api/custom-properties.rst @@ -0,0 +1,13 @@ + +.. _custom_properties_api: + +Custom document properties +========================== + +.. currentmodule:: docx.custom_properties + + +|CustomProperties| objects +-------------------------- + +.. autoclass:: CustomProperties() diff --git a/docs/api/custom-xml.rst b/docs/api/custom-xml.rst new file mode 100644 index 000000000..5b5f2a127 --- /dev/null +++ b/docs/api/custom-xml.rst @@ -0,0 +1,19 @@ + +.. _custom_xml_api: + +Custom XML parts +================ + +.. currentmodule:: docx.custom_xml + + +|CustomXmlPart| objects +----------------------- + +.. autoclass:: CustomXmlPart() + + +Helpers +------- + +.. autofunction:: iter_custom_xml_parts diff --git a/docs/api/embedded-objects.rst b/docs/api/embedded-objects.rst new file mode 100644 index 000000000..b5c45e5d1 --- /dev/null +++ b/docs/api/embedded-objects.rst @@ -0,0 +1,13 @@ + +.. _embedded_objects_api: + +Embedded OLE objects +==================== + +.. currentmodule:: docx.embedded_objects + + +|EmbeddedObject| objects +------------------------ + +.. autoclass:: EmbeddedObject() diff --git a/docs/api/endnotes.rst b/docs/api/endnotes.rst new file mode 100644 index 000000000..497eecbbd --- /dev/null +++ b/docs/api/endnotes.rst @@ -0,0 +1,29 @@ + +.. _endnotes_api: + +Endnotes +======== + +.. currentmodule:: docx.endnotes + + +|Endnotes| objects +------------------ + +.. autoclass:: Endnotes() + :inherited-members: + :exclude-members: part + + +Endnote objects +--------------- + +.. autoclass:: Endnote() + :inherited-members: + :exclude-members: part + + +|EndnoteProperties| objects +--------------------------- + +.. 
autoclass:: EndnoteProperties() diff --git a/docs/api/enum/WdAnchorH.rst b/docs/api/enum/WdAnchorH.rst new file mode 100644 index 000000000..b34cf4ad6 --- /dev/null +++ b/docs/api/enum/WdAnchorH.rst @@ -0,0 +1,26 @@ +.. _WdAnchorH: + +``WD_ANCHOR_H`` +=============== + +Specifies the horizontal anchor used for positioning a floating shape or image. + +Example:: + + from docx.enum.shape import WD_ANCHOR_H + + floating_image.horizontal_anchor = WD_ANCHOR_H.MARGIN + +---- + +PAGE + Horizontal position is measured relative to the page edge. + +MARGIN + Horizontal position is measured relative to the page margin. + +COLUMN + Horizontal position is measured relative to the column. + +CHARACTER + Horizontal position is measured relative to a character anchor. diff --git a/docs/api/enum/WdAnchorV.rst b/docs/api/enum/WdAnchorV.rst new file mode 100644 index 000000000..971581c2c --- /dev/null +++ b/docs/api/enum/WdAnchorV.rst @@ -0,0 +1,26 @@ +.. _WdAnchorV: + +``WD_ANCHOR_V`` +=============== + +Specifies the vertical anchor used for positioning a floating shape or image. + +Example:: + + from docx.enum.shape import WD_ANCHOR_V + + floating_image.vertical_anchor = WD_ANCHOR_V.PARAGRAPH + +---- + +PAGE + Vertical position is measured relative to the page edge. + +MARGIN + Vertical position is measured relative to the page margin. + +PARAGRAPH + Vertical position is measured relative to the anchor paragraph. + +LINE + Vertical position is measured relative to a line of text. diff --git a/docs/api/enum/WdBorderDisplay.rst b/docs/api/enum/WdBorderDisplay.rst new file mode 100644 index 000000000..a9b933511 --- /dev/null +++ b/docs/api/enum/WdBorderDisplay.rst @@ -0,0 +1,23 @@ +.. _WdBorderDisplay: + +``WD_BORDER_DISPLAY`` +===================== + +Specifies which pages of a section display a page border. 
+ +Example:: + + from docx.enum.section import WD_BORDER_DISPLAY + + section.page_borders.display = WD_BORDER_DISPLAY.FIRST_PAGE + +---- + +ALL_PAGES + Border is displayed on every page. + +FIRST_PAGE + Border is displayed only on the first page. + +NOT_FIRST_PAGE + Border is displayed on every page except the first. diff --git a/docs/api/enum/WdBorderOffsetFrom.rst b/docs/api/enum/WdBorderOffsetFrom.rst new file mode 100644 index 000000000..522cc2c8a --- /dev/null +++ b/docs/api/enum/WdBorderOffsetFrom.rst @@ -0,0 +1,20 @@ +.. _WdBorderOffsetFrom: + +``WD_BORDER_OFFSET_FROM`` +========================= + +Specifies the reference point used when measuring the offset of a page border. + +Example:: + + from docx.enum.section import WD_BORDER_OFFSET_FROM + + section.page_borders.offset_from = WD_BORDER_OFFSET_FROM.PAGE + +---- + +TEXT + Border is positioned relative to the text extents. + +PAGE + Border is positioned relative to the page edge. diff --git a/docs/api/enum/WdBorderStyle.rst b/docs/api/enum/WdBorderStyle.rst new file mode 100644 index 000000000..f8c81993b --- /dev/null +++ b/docs/api/enum/WdBorderStyle.rst @@ -0,0 +1,95 @@ +.. _WdBorderStyle: + +``WD_BORDER_STYLE`` +=================== + +Specifies the line style for a paragraph, run, table, or page border. + +Example:: + + from docx.enum.text import WD_BORDER_STYLE + + run.font.border_style = WD_BORDER_STYLE.DOUBLE + +---- + +NIL + No border. + +NONE + No border. + +SINGLE + A single line. + +THICK + A single thick line. + +DOUBLE + A double line. + +DOTTED + A dotted line. + +DASHED + A dashed line. + +DOT_DASH + An alternating dot-dash line. + +DOT_DOT_DASH + An alternating dot-dot-dash line. + +TRIPLE + A triple line. + +THIN_THICK_SMALL_GAP + A thin-thick line with a small gap. + +THICK_THIN_SMALL_GAP + A thick-thin line with a small gap. + +THIN_THICK_THIN_SMALL_GAP + A thin-thick-thin line with a small gap. + +THIN_THICK_MEDIUM_GAP + A thin-thick line with a medium gap. 
+ +THICK_THIN_MEDIUM_GAP + A thick-thin line with a medium gap. + +THIN_THICK_THIN_MEDIUM_GAP + A thin-thick-thin line with a medium gap. + +THIN_THICK_LARGE_GAP + A thin-thick line with a large gap. + +THICK_THIN_LARGE_GAP + A thick-thin line with a large gap. + +THIN_THICK_THIN_LARGE_GAP + A thin-thick-thin line with a large gap. + +WAVE + A wavy line. + +DOUBLE_WAVE + A double wavy line. + +DASH_SMALL_GAP + A dashed line with a small gap. + +DASH_DOT_STROKED + A dash-dot stroked line. + +THREE_D_EMBOSS + A 3D embossed line. + +THREE_D_ENGRAVE + A 3D engraved line. + +OUTSET + An outset line. + +INSET + An inset line. diff --git a/docs/api/enum/WdBreakType.rst b/docs/api/enum/WdBreakType.rst new file mode 100644 index 000000000..efefa7656 --- /dev/null +++ b/docs/api/enum/WdBreakType.rst @@ -0,0 +1,47 @@ +.. _WdBreakType: + +``WD_BREAK_TYPE`` +================= + +Specifies the type of break inserted into the text flow. + +Example:: + + from docx.enum.text import WD_BREAK_TYPE + + run.add_break(WD_BREAK_TYPE.PAGE) + +---- + +LINE + A line break. + +LINE_CLEAR_LEFT + Line break, clearing text wrap on the left. + +LINE_CLEAR_RIGHT + Line break, clearing text wrap on the right. + +LINE_CLEAR_ALL + Line break, clearing text wrap on both sides. + +PAGE + A page break. + +COLUMN + A column break. + +SECTION_CONTINUOUS + A continuous section break. + +SECTION_EVEN_PAGE + A section break that begins on the next even page. + +SECTION_NEXT_PAGE + A section break that begins on the next page. + +SECTION_ODD_PAGE + A section break that begins on the next odd page. + +TEXT_WRAPPING + A text-wrapping break. diff --git a/docs/api/enum/WdBuildingBlockGallery.rst b/docs/api/enum/WdBuildingBlockGallery.rst new file mode 100644 index 000000000..8142d1296 --- /dev/null +++ b/docs/api/enum/WdBuildingBlockGallery.rst @@ -0,0 +1,119 @@ +.. 
_WdBuildingBlockGallery: + +``WD_BUILDING_BLOCK_GALLERY`` +============================= + +Specifies the gallery a building block (AutoText / Quick Part) belongs to. + +Example:: + + from docx.enum.text import WD_BUILDING_BLOCK_GALLERY + + block.gallery = WD_BUILDING_BLOCK_GALLERY.AUTO_TEXT + +---- + +PLACEHOLDER + Placeholder gallery. + +ANY + Matches any gallery. + +DEFAULT + Default gallery. + +DOC_PARTS + Generic document-parts gallery. + +COVER_PAGES + Cover-page gallery. + +EQUATIONS + Equation gallery. + +FOOTERS + Footer gallery. + +HEADERS + Header gallery. + +PAGE_NUMBERS + Page-number gallery. + +TABLES + Table gallery. + +WATERMARKS + Watermark gallery. + +AUTO_TEXT + AutoText gallery. + +TEXT_BOXES + Text-box gallery. + +PAGE_NUMBERS_BOTTOM + Page-number (bottom of page) gallery. + +PAGE_NUMBERS_TOP + Page-number (top of page) gallery. + +BIBLIOGRAPHIES + Bibliography gallery. + +QUICK_PARTS + Quick Parts gallery. + +CUSTOM_QUICK_PARTS + Custom Quick Parts gallery. + +CUSTOM_COVER_PAGES + Custom cover-page gallery. + +CUSTOM_EQUATIONS + Custom equation gallery. + +CUSTOM_FOOTERS + Custom footer gallery. + +CUSTOM_HEADERS + Custom header gallery. + +CUSTOM_PAGE_NUMBERS + Custom page-number gallery. + +CUSTOM_TABLES + Custom table gallery. + +CUSTOM_WATERMARKS + Custom watermark gallery. + +CUSTOM_AUTO_TEXT + Custom AutoText gallery. + +CUSTOM_TEXT_BOXES + Custom text-box gallery. + +CUSTOM_PAGE_NUMBERS_BOTTOM + Custom page-number (bottom) gallery. + +CUSTOM_PAGE_NUMBERS_TOP + Custom page-number (top) gallery. + +CUSTOM_BIBLIOGRAPHIES + Custom bibliography gallery. + +CUSTOM_1 + Generic custom gallery 1. + +CUSTOM_2 + Generic custom gallery 2. + +CUSTOM_3 + Generic custom gallery 3. + +CUSTOM_4 + Generic custom gallery 4. + +CUSTOM_5 + Generic custom gallery 5. 
diff --git a/docs/api/enum/WdDocGridType.rst b/docs/api/enum/WdDocGridType.rst new file mode 100644 index 000000000..ab9451997 --- /dev/null +++ b/docs/api/enum/WdDocGridType.rst @@ -0,0 +1,26 @@ +.. _WdDocGridType: + +``WD_DOC_GRID_TYPE`` +==================== + +Specifies the type of document grid applied to a section. + +Example:: + + from docx.enum.section import WD_DOC_GRID_TYPE + + section.document_grid.type = WD_DOC_GRID_TYPE.LINES + +---- + +DEFAULT + No document grid is applied. + +LINES + Grid specifies lines per page only. + +LINES_AND_CHARS + Grid specifies both lines per page and characters per line. + +SNAP_TO_CHARS + Grid snaps characters to a fixed-width column. diff --git a/docs/api/enum/WdDrawingType.rst b/docs/api/enum/WdDrawingType.rst new file mode 100644 index 000000000..adb47d4b0 --- /dev/null +++ b/docs/api/enum/WdDrawingType.rst @@ -0,0 +1,33 @@ +.. _WdDrawingType: + +``WD_DRAWING_TYPE`` +=================== + +Identifies the kind of DrawingML content contained in a ``w:drawing`` element. + +Example:: + + from docx.enum.shape import WD_DRAWING_TYPE + + if drawing.type == WD_DRAWING_TYPE.PICTURE: + ... + +---- + +SHAPE + A DrawingML shape. + +TEXT_BOX + A text box. + +GROUP + A group of shapes. + +CHART + An embedded chart. + +DIAGRAM + A SmartArt diagram. + +PICTURE + A picture. diff --git a/docs/api/enum/WdEndnotePosition.rst b/docs/api/enum/WdEndnotePosition.rst new file mode 100644 index 000000000..5333547c1 --- /dev/null +++ b/docs/api/enum/WdEndnotePosition.rst @@ -0,0 +1,20 @@ +.. _WdEndnotePosition: + +``WD_ENDNOTE_POSITION`` +======================= + +Specifies the position of endnotes in the document. + +Example:: + + from docx.enum.text import WD_ENDNOTE_POSITION + + section.endnote_properties.position = WD_ENDNOTE_POSITION.END_OF_SECTION + +---- + +END_OF_DOCUMENT + Endnotes appear at the end of the document. + +END_OF_SECTION + Endnotes appear at the end of each section. 
diff --git a/docs/api/enum/WdFootnotePosition.rst b/docs/api/enum/WdFootnotePosition.rst new file mode 100644 index 000000000..82e4eda57 --- /dev/null +++ b/docs/api/enum/WdFootnotePosition.rst @@ -0,0 +1,20 @@ +.. _WdFootnotePosition: + +``WD_FOOTNOTE_POSITION`` +======================== + +Specifies the position of footnotes on the page. + +Example:: + + from docx.enum.text import WD_FOOTNOTE_POSITION + + section.footnote_properties.position = WD_FOOTNOTE_POSITION.BOTTOM_OF_PAGE + +---- + +BOTTOM_OF_PAGE + Footnotes appear at the bottom of the page. + +BENEATH_TEXT + Footnotes appear immediately beneath the body text on the page. diff --git a/docs/api/enum/WdFootnoteRestart.rst b/docs/api/enum/WdFootnoteRestart.rst new file mode 100644 index 000000000..569e8443d --- /dev/null +++ b/docs/api/enum/WdFootnoteRestart.rst @@ -0,0 +1,23 @@ +.. _WdFootnoteRestart: + +``WD_FOOTNOTE_RESTART`` +======================= + +Specifies when footnote numbering restarts. + +Example:: + + from docx.enum.text import WD_FOOTNOTE_RESTART + + section.footnote_properties.restart = WD_FOOTNOTE_RESTART.EACH_SECTION + +---- + +CONTINUOUS + Continuous numbering throughout the document. + +EACH_SECTION + Numbering restarts at the beginning of each section. + +EACH_PAGE + Numbering restarts at the beginning of each page. diff --git a/docs/api/enum/WdFrameDropCap.rst b/docs/api/enum/WdFrameDropCap.rst new file mode 100644 index 000000000..dcaf2765f --- /dev/null +++ b/docs/api/enum/WdFrameDropCap.rst @@ -0,0 +1,23 @@ +.. _WdFrameDropCap: + +``WD_FRAME_DROP_CAP`` +===================== + +Specifies whether a text frame is a drop-cap frame and where it is located. + +Example:: + + from docx.enum.text import WD_FRAME_DROP_CAP + + paragraph.paragraph_format.frame.drop_cap = WD_FRAME_DROP_CAP.DROP + +---- + +NONE + Not a drop-cap frame. + +DROP + Drop-cap frame dropped into the paragraph text. + +MARGIN + Drop-cap frame positioned in the margin. 
diff --git a/docs/api/enum/WdFrameHAlign.rst b/docs/api/enum/WdFrameHAlign.rst new file mode 100644 index 000000000..68c95b3fc --- /dev/null +++ b/docs/api/enum/WdFrameHAlign.rst @@ -0,0 +1,29 @@ +.. _WdFrameHAlign: + +``WD_FRAME_H_ALIGN`` +==================== + +Specifies the horizontal alignment of a text frame. + +Example:: + + from docx.enum.text import WD_FRAME_H_ALIGN + + paragraph.paragraph_format.frame.horizontal_align = WD_FRAME_H_ALIGN.CENTER + +---- + +LEFT + Frame is left-aligned. + +CENTER + Frame is center-aligned. + +RIGHT + Frame is right-aligned. + +INSIDE + Frame is aligned to the inside of the page (for facing pages). + +OUTSIDE + Frame is aligned to the outside of the page (for facing pages). diff --git a/docs/api/enum/WdFrameHAnchor.rst b/docs/api/enum/WdFrameHAnchor.rst new file mode 100644 index 000000000..fd11482a6 --- /dev/null +++ b/docs/api/enum/WdFrameHAnchor.rst @@ -0,0 +1,23 @@ +.. _WdFrameHAnchor: + +``WD_FRAME_H_ANCHOR`` +===================== + +Specifies the horizontal anchor of a text frame. + +Example:: + + from docx.enum.text import WD_FRAME_H_ANCHOR + + paragraph.paragraph_format.frame.horizontal_anchor = WD_FRAME_H_ANCHOR.MARGIN + +---- + +TEXT + Horizontal position is relative to the text of the paragraph. + +MARGIN + Horizontal position is relative to the page margin. + +PAGE + Horizontal position is relative to the page edge. diff --git a/docs/api/enum/WdFrameVAlign.rst b/docs/api/enum/WdFrameVAlign.rst new file mode 100644 index 000000000..974da75e9 --- /dev/null +++ b/docs/api/enum/WdFrameVAlign.rst @@ -0,0 +1,32 @@ +.. _WdFrameVAlign: + +``WD_FRAME_V_ALIGN`` +==================== + +Specifies the vertical alignment of a text frame. + +Example:: + + from docx.enum.text import WD_FRAME_V_ALIGN + + paragraph.paragraph_format.frame.vertical_align = WD_FRAME_V_ALIGN.TOP + +---- + +INLINE + Frame is positioned inline with the surrounding text. + +TOP + Frame is top-aligned. + +CENTER + Frame is center-aligned vertically. 
+ +BOTTOM + Frame is bottom-aligned. + +INSIDE + Frame is aligned to the inside of the page (for facing pages). + +OUTSIDE + Frame is aligned to the outside of the page (for facing pages). diff --git a/docs/api/enum/WdFrameVAnchor.rst b/docs/api/enum/WdFrameVAnchor.rst new file mode 100644 index 000000000..a0e01abd3 --- /dev/null +++ b/docs/api/enum/WdFrameVAnchor.rst @@ -0,0 +1,23 @@ +.. _WdFrameVAnchor: + +``WD_FRAME_V_ANCHOR`` +===================== + +Specifies the vertical anchor of a text frame. + +Example:: + + from docx.enum.text import WD_FRAME_V_ANCHOR + + paragraph.paragraph_format.frame.vertical_anchor = WD_FRAME_V_ANCHOR.PAGE + +---- + +TEXT + Vertical position is relative to the text of the paragraph. + +MARGIN + Vertical position is relative to the page margin. + +PAGE + Vertical position is relative to the page edge. diff --git a/docs/api/enum/WdFrameWrap.rst b/docs/api/enum/WdFrameWrap.rst new file mode 100644 index 000000000..c4b35186d --- /dev/null +++ b/docs/api/enum/WdFrameWrap.rst @@ -0,0 +1,32 @@ +.. _WdFrameWrap: + +``WD_FRAME_WRAP`` +================= + +Specifies how text wraps around a text frame. + +Example:: + + from docx.enum.text import WD_FRAME_WRAP + + paragraph.paragraph_format.frame.wrap = WD_FRAME_WRAP.AROUND + +---- + +AUTO + Default wrapping; the application decides how text wraps around the frame. + +NOT_BESIDE + Text does not wrap beside the frame. + +AROUND + Text wraps around the frame. + +NONE + Text does not wrap around the frame. + +TIGHT + Text wraps tightly around the frame. + +THROUGH + Text wraps through the frame. diff --git a/docs/api/enum/WdHeaderFooterIndex.rst b/docs/api/enum/WdHeaderFooterIndex.rst new file mode 100644 index 000000000..4618b1777 --- /dev/null +++ b/docs/api/enum/WdHeaderFooterIndex.rst @@ -0,0 +1,24 @@ +.. _WdHeaderFooterIndex: + +``WD_HEADER_FOOTER_INDEX`` +========================== + +Identifies a header or footer in a section by its logical role.
+ +Example:: + + from docx.enum.section import WD_HEADER_FOOTER_INDEX + + header = section.header_for(WD_HEADER_FOOTER_INDEX.FIRST_PAGE) + +---- + +PRIMARY + Primary header/footer - used on odd pages and on pages not covered by + the other indexes. + +FIRST_PAGE + Header/footer used on the first page of the section. + +EVEN_PAGE + Header/footer used on even pages of a recto/verso section. diff --git a/docs/api/enum/WdInlineShapeType.rst b/docs/api/enum/WdInlineShapeType.rst new file mode 100644 index 000000000..cbce631f9 --- /dev/null +++ b/docs/api/enum/WdInlineShapeType.rst @@ -0,0 +1,30 @@ +.. _WdInlineShapeType: + +``WD_INLINE_SHAPE_TYPE`` +======================== + +Identifies the kind of content carried by an :class:`.InlineShape`. + +Example:: + + from docx.enum.shape import WD_INLINE_SHAPE_TYPE + + if inline_shape.type == WD_INLINE_SHAPE_TYPE.PICTURE: + ... + +---- + +CHART + The inline shape is a chart. + +LINKED_PICTURE + The inline shape is a linked picture (external reference). + +PICTURE + The inline shape is an embedded picture. + +SMART_ART + The inline shape is a SmartArt diagram. + +NOT_IMPLEMENTED + The inline shape is of a kind not currently recognized by ``python-docx``. diff --git a/docs/api/enum/WdLineNumberingRestart.rst b/docs/api/enum/WdLineNumberingRestart.rst new file mode 100644 index 000000000..24264e557 --- /dev/null +++ b/docs/api/enum/WdLineNumberingRestart.rst @@ -0,0 +1,23 @@ +.. _WdLineNumberingRestart: + +``WD_LINE_NUMBERING_RESTART`` +============================= + +Specifies when automatic line numbering restarts in a section. + +Example:: + + from docx.enum.section import WD_LINE_NUMBERING_RESTART + + section.line_numbering.restart = WD_LINE_NUMBERING_RESTART.NEW_PAGE + +---- + +CONTINUOUS + Line numbering continues from the previous section. + +NEW_SECTION + Line numbering restarts at the beginning of each section. + +NEW_PAGE + Line numbering restarts at the beginning of each page.
diff --git a/docs/api/enum/WdMailMergeDataType.rst b/docs/api/enum/WdMailMergeDataType.rst new file mode 100644 index 000000000..e3ab9ab81 --- /dev/null +++ b/docs/api/enum/WdMailMergeDataType.rst @@ -0,0 +1,32 @@ +.. _WdMailMergeDataType: + +``WD_MAIL_MERGE_DATA_TYPE`` +=========================== + +Specifies the type of data source used by a mail-merge operation. + +Example:: + + from docx.enum.text import WD_MAIL_MERGE_DATA_TYPE + + settings.mail_merge.data_type = WD_MAIL_MERGE_DATA_TYPE.SPREADSHEET + +---- + +TEXT_FILE + Delimited text file (CSV / TSV). + +DATABASE + Microsoft Access or similar database. + +SPREADSHEET + Excel spreadsheet. + +QUERY + Word query file. + +ODBC + ODBC-connected data source. + +NATIVE + Native Word data source. diff --git a/docs/api/enum/WdMailMergeDestination.rst b/docs/api/enum/WdMailMergeDestination.rst new file mode 100644 index 000000000..52b00adbc --- /dev/null +++ b/docs/api/enum/WdMailMergeDestination.rst @@ -0,0 +1,26 @@ +.. _WdMailMergeDestination: + +``WD_MAIL_MERGE_DESTINATION`` +============================= + +Specifies the destination of mail-merge output. + +Example:: + + from docx.enum.text import WD_MAIL_MERGE_DESTINATION + + settings.mail_merge.destination = WD_MAIL_MERGE_DESTINATION.NEW_DOCUMENT + +---- + +NEW_DOCUMENT + Produce a new Word document containing the merged output. + +PRINTER + Send output directly to the printer. + +EMAIL + Email each merged record. + +FAX + Fax each merged record. diff --git a/docs/api/enum/WdMailMergeType.rst b/docs/api/enum/WdMailMergeType.rst new file mode 100644 index 000000000..0536bdfb6 --- /dev/null +++ b/docs/api/enum/WdMailMergeType.rst @@ -0,0 +1,32 @@ +.. _WdMailMergeType: + +``WD_MAIL_MERGE_TYPE`` +====================== + +Specifies the type of mail-merge operation. + +Example:: + + from docx.enum.text import WD_MAIL_MERGE_TYPE + + settings.mail_merge.merge_type = WD_MAIL_MERGE_TYPE.FORM_LETTERS + +---- + +CATALOG + Catalog-style merge (all records on one page). 
+ +ENVELOPES + Envelope printing merge. + +MAILING_LABELS + Mailing-label printing merge. + +FORM_LETTERS + Form-letter merge (one letter per record). + +EMAIL + Email-message merge. + +FAX + Fax merge. diff --git a/docs/api/enum/WdNumberFormat.rst b/docs/api/enum/WdNumberFormat.rst new file mode 100644 index 000000000..88e3c15da --- /dev/null +++ b/docs/api/enum/WdNumberFormat.rst @@ -0,0 +1,50 @@ +.. _WdNumberFormat: + +``WD_NUMBER_FORMAT`` +==================== + +Specifies a numeric format used for numbering list items, footnotes, or endnotes. + +Example:: + + from docx.enum.text import WD_NUMBER_FORMAT + + level.number_format = WD_NUMBER_FORMAT.UPPER_ROMAN + +---- + +DECIMAL + Decimal numbers (1, 2, 3 ...). + +ARABIC + Alias for ``DECIMAL`` (Arabic numerals: 1, 2, 3 ...). + +UPPER_ROMAN + Uppercase Roman numerals (I, II, III ...). + +LOWER_ROMAN + Lowercase Roman numerals (i, ii, iii ...). + +UPPER_LETTER + Uppercase letters (A, B, C ...). + +LOWER_LETTER + Lowercase letters (a, b, c ...). + +ORDINAL + Ordinal numbers (1st, 2nd, 3rd ...). + +CARDINAL_TEXT + Cardinal text (One, Two, Three ...). + +ORDINAL_TEXT + Ordinal text (First, Second ...). + +CHICAGO + Chicago Manual of Style footnote marks (``*``, dagger, double dagger, section). + +BULLET + Bullet character (not numbered). + +NONE + No number. diff --git a/docs/api/enum/WdProtection.rst b/docs/api/enum/WdProtection.rst new file mode 100644 index 000000000..f194e8d9b --- /dev/null +++ b/docs/api/enum/WdProtection.rst @@ -0,0 +1,26 @@ +.. _WdProtection: + +``WD_PROTECTION`` +================= + +Specifies the type of editing protection applied to a document. + +Example:: + + from docx.enum.text import WD_PROTECTION + + settings.document_protection.protection_type = WD_PROTECTION.READ_ONLY + +---- + +READ_ONLY + The document is read-only; no edits are permitted. + +COMMENTS + Only comments may be inserted or modified. + +TRACKED_CHANGES + Any edit is permitted, but is recorded as a tracked change. 
+ +FORMS + Only form-field content may be edited. diff --git a/docs/api/enum/WdShadingPattern.rst b/docs/api/enum/WdShadingPattern.rst new file mode 100644 index 000000000..661c9bffe --- /dev/null +++ b/docs/api/enum/WdShadingPattern.rst @@ -0,0 +1,59 @@ +.. _WdShadingPattern: + +``WD_SHADING_PATTERN`` +====================== + +Specifies the background pattern applied to a cell or run. + +Example:: + + from docx.enum.table import WD_SHADING_PATTERN + + cell.shading.pattern = WD_SHADING_PATTERN.SOLID + +---- + +CLEAR + No pattern, just background fill color. + +SOLID + Solid pattern (foreground color fills entire area). + +HORZ_STRIPE + Horizontal stripe pattern. + +VERT_STRIPE + Vertical stripe pattern. + +REVERSE_DIAG_STRIPE + Reverse diagonal stripe pattern. + +DIAG_STRIPE + Diagonal stripe pattern. + +HORZ_CROSS + Horizontal cross pattern. + +DIAG_CROSS + Diagonal cross pattern. + +THIN_HORZ_STRIPE + Thin horizontal stripe pattern. + +THIN_VERT_STRIPE + Thin vertical stripe pattern. + +THIN_REVERSE_DIAG_STRIPE + Thin reverse diagonal stripe pattern. + +THIN_DIAG_STRIPE + Thin diagonal stripe pattern. + +THIN_HORZ_CROSS + Thin horizontal cross pattern. + +THIN_DIAG_CROSS + Thin diagonal cross pattern. + +NIL + No shading. diff --git a/docs/api/enum/WdShape.rst b/docs/api/enum/WdShape.rst new file mode 100644 index 000000000..6e1c14a3d --- /dev/null +++ b/docs/api/enum/WdShape.rst @@ -0,0 +1,29 @@ +.. _WdShape: + +``WD_SHAPE`` +============ + +Identifies the preset geometry of a DrawingML shape. + +Example:: + + from docx.enum.shape import WD_SHAPE + + shape.preset_geometry = WD_SHAPE.OVAL + +---- + +RECTANGLE + Rectangle shape. + +ROUNDED_RECTANGLE + Rounded-rectangle shape. + +OVAL + Oval (ellipse) shape. + +ARROW_RIGHT + Right-arrow shape. + +CALLOUT_ROUNDED_RECTANGLE + Rounded-rectangle callout shape. 
diff --git a/docs/api/enum/WdTableAutofit.rst b/docs/api/enum/WdTableAutofit.rst new file mode 100644 index 000000000..67808f414 --- /dev/null +++ b/docs/api/enum/WdTableAutofit.rst @@ -0,0 +1,24 @@ +.. _WdTableAutofit: + +``WD_TABLE_AUTOFIT`` +==================== + +Specifies the autofit behavior for a table. + +Example:: + + from docx.enum.table import WD_TABLE_AUTOFIT + + table = document.add_table(3, 3) + table.autofit_behavior = WD_TABLE_AUTOFIT.AUTOFIT_TO_CONTENTS + +---- + +AUTOFIT_TO_WINDOW + Column widths adjust automatically so the table fills the window width. + +AUTOFIT_TO_CONTENTS + Column widths adjust automatically based on cell contents. + +FIXED_WIDTH + Column widths are fixed regardless of cell contents. diff --git a/docs/api/enum/WdTextDirection.rst b/docs/api/enum/WdTextDirection.rst new file mode 100644 index 000000000..e4b3cd643 --- /dev/null +++ b/docs/api/enum/WdTextDirection.rst @@ -0,0 +1,35 @@ +.. _WdTextDirection: + +``WD_TEXT_DIRECTION`` +===================== + +Specifies the direction in which text flows within a table cell or section. + +Example:: + + from docx.enum.table import WD_TEXT_DIRECTION + + table = document.add_table(3, 3) + table.cell(0, 0).text_direction = WD_TEXT_DIRECTION.TB_RL + +---- + +LR_TB + Left-to-right, top-to-bottom (default horizontal orientation). + +TB_RL + Top-to-bottom, right-to-left. Rotates text 90 degrees clockwise so it + reads top-to-bottom along the right edge of the cell. + +BT_LR + Bottom-to-top, left-to-right. Rotates text 90 degrees counter-clockwise + so it reads bottom-to-top along the left edge of the cell. + +LR_TB_V + Left-to-right horizontal flow with vertical glyph layout. + +TB_RL_V + Top-to-bottom, right-to-left vertical flow with vertical glyph layout. + +TB_LR_V + Top-to-bottom, left-to-right vertical flow with vertical glyph layout. 
diff --git a/docs/api/enum/WdView.rst b/docs/api/enum/WdView.rst new file mode 100644 index 000000000..d8df66b0e --- /dev/null +++ b/docs/api/enum/WdView.rst @@ -0,0 +1,35 @@ +.. _WdView: + +``WD_VIEW`` +=========== + +Specifies the initial view mode Word uses when opening the document. + +Example:: + + from docx.enum.text import WD_VIEW + + settings.view = WD_VIEW.PRINT + +---- + +NONE + No view mode is specified. + +PRINT + Print layout view (Word's default editing view). + +OUTLINE + Outline view, showing document headings and hierarchy. + +MASTER_PAGES + Master-pages (master document) view. + +NORMAL + Normal (draft) view, emphasizing text flow over layout. + +WEB + Web layout view, showing the document as it would appear in a browser. + +READING + Full-screen reading view optimized for reading. diff --git a/docs/api/enum/WdWrapType.rst b/docs/api/enum/WdWrapType.rst new file mode 100644 index 000000000..0f9d24d89 --- /dev/null +++ b/docs/api/enum/WdWrapType.rst @@ -0,0 +1,32 @@ +.. _WdWrapType: + +``WD_WRAP_TYPE`` +================ + +Specifies how text wraps around a floating shape or image. + +Example:: + + from docx.enum.shape import WD_WRAP_TYPE + + floating_image.wrap_type = WD_WRAP_TYPE.SQUARE + +---- + +SQUARE + Text wraps around the bounding box of the shape. + +TIGHT + Text wraps tightly around the shape contour. + +THROUGH + Text wraps through the shape, filling available concavities. + +TOP_AND_BOTTOM + Text flows above and below the shape only. + +BEHIND + Shape floats behind the text. + +IN_FRONT + Shape floats in front of the text. 
diff --git a/docs/api/enum/index.rst b/docs/api/enum/index.rst index ce76e7f51..6ac240f3c 100644 --- a/docs/api/enum/index.rst +++ b/docs/api/enum/index.rst @@ -11,16 +11,48 @@ can be found here: MsoColorType MsoThemeColorIndex WdAlignParagraph + WdAnchorH + WdAnchorV + WdBorderDisplay + WdBorderOffsetFrom + WdBorderStyle + WdBreakType + WdBuildingBlockGallery WdBuiltinStyle WdCellVerticalAlignment WdColorIndex + WdDocGridType + WdDrawingType + WdEndnotePosition + WdFootnotePosition + WdFootnoteRestart + WdFrameDropCap + WdFrameHAlign + WdFrameHAnchor + WdFrameVAlign + WdFrameVAnchor + WdFrameWrap + WdHeaderFooterIndex + WdInlineShapeType + WdLineNumberingRestart WdLineSpacing + WdMailMergeDataType + WdMailMergeDestination + WdMailMergeType + WdNumberFormat WdOrientation + WdProtection WdRowAlignment WdRowHeightRule WdSectionStart + WdShadingPattern + WdShape WdStyleType WdTabAlignment - WdTabLeader + WdTableAutofit WdTableDirection + WdTabLeader + WdTextDirection WdUnderline + WdView + WdWrapType diff --git a/docs/api/equations.rst b/docs/api/equations.rst new file mode 100644 index 000000000..7a7e565ca --- /dev/null +++ b/docs/api/equations.rst @@ -0,0 +1,27 @@ + +.. _equations_api: + +Equations (Office Math) +======================= + +.. currentmodule:: docx.equations + + +|Equation| objects +------------------ + +.. autoclass:: Equation() + + +Builder helpers +--------------- + +.. autofunction:: build_identifier + +.. autofunction:: build_fraction + +.. autofunction:: build_superscript + +.. autofunction:: build_subscript + +.. autofunction:: build_radical diff --git a/docs/api/fields.rst b/docs/api/fields.rst new file mode 100644 index 000000000..7d1f7b459 --- /dev/null +++ b/docs/api/fields.rst @@ -0,0 +1,21 @@ + +.. _fields_api: + +Fields +====== + +.. currentmodule:: docx.fields + + +|Field| objects +--------------- + +.. autoclass:: Field() + + +Field-type constants +-------------------- + +.. 
autoclass:: WD_FIELD_TYPE() + :members: + :undoc-members: diff --git a/docs/api/font-table.rst b/docs/api/font-table.rst new file mode 100644 index 000000000..5d273a7e5 --- /dev/null +++ b/docs/api/font-table.rst @@ -0,0 +1,19 @@ + +.. _font_table_api: + +Font table +========== + +.. currentmodule:: docx.font_table + + +|FontTable| objects +------------------- + +.. autoclass:: FontTable() + + +FontMetadata objects +-------------------- + +.. autoclass:: FontMetadata() diff --git a/docs/api/footnotes.rst b/docs/api/footnotes.rst new file mode 100644 index 000000000..04e567eb1 --- /dev/null +++ b/docs/api/footnotes.rst @@ -0,0 +1,29 @@ + +.. _footnotes_api: + +Footnotes +========= + +.. currentmodule:: docx.footnotes + + +|Footnotes| objects +------------------- + +.. autoclass:: Footnotes() + :inherited-members: + :exclude-members: part + + +Footnote objects +---------------- + +.. autoclass:: Footnote() + :inherited-members: + :exclude-members: part + + +|FootnoteProperties| objects +---------------------------- + +.. autoclass:: FootnoteProperties() diff --git a/docs/api/form-fields.rst b/docs/api/form-fields.rst new file mode 100644 index 000000000..f3e54c336 --- /dev/null +++ b/docs/api/form-fields.rst @@ -0,0 +1,39 @@ + +.. _form_fields_api: + +Form fields (legacy) +==================== + +.. currentmodule:: docx.form_fields + + +|FormField| objects +------------------- + +.. autoclass:: FormField() + + +Text-input form fields +---------------------- + +.. autoclass:: TextInputFormField() + + +Checkbox form fields +-------------------- + +.. autoclass:: CheckboxFormField() + + +Dropdown form fields +-------------------- + +.. autoclass:: DropdownFormField() + + +Form-field types +---------------- + +.. autoclass:: WD_FORM_FIELD_TYPE() + :members: + :undoc-members: diff --git a/docs/api/glossary.rst b/docs/api/glossary.rst new file mode 100644 index 000000000..08ca647bd --- /dev/null +++ b/docs/api/glossary.rst @@ -0,0 +1,24 @@ + +.. 
_glossary_api: + +Glossary document +================= + +.. currentmodule:: docx.glossary + + +|Glossary| objects +------------------ + +.. autoclass:: Glossary() + + +Building blocks +--------------- + +.. autoclass:: BuildingBlock() + :inherited-members: + :exclude-members: part + + +.. autoclass:: BuildingBlockCategory() diff --git a/docs/api/ink.rst b/docs/api/ink.rst new file mode 100644 index 000000000..ba2b88543 --- /dev/null +++ b/docs/api/ink.rst @@ -0,0 +1,13 @@ + +.. _ink_api: + +Ink annotations +=============== + +.. currentmodule:: docx.ink + + +|InkAnnotation| objects +----------------------- + +.. autoclass:: InkAnnotation() diff --git a/docs/api/numbering.rst b/docs/api/numbering.rst new file mode 100644 index 000000000..8a87cc4b3 --- /dev/null +++ b/docs/api/numbering.rst @@ -0,0 +1,25 @@ + +.. _numbering_api: + +Numbering and lists +=================== + +.. currentmodule:: docx.numbering + + +|Numbering| objects +------------------- + +.. autoclass:: Numbering() + + +Numbering definitions +--------------------- + +.. autoclass:: NumberingDefinition() + + +|Level| objects +--------------- + +.. autoclass:: Level() diff --git a/docs/api/permissions.rst b/docs/api/permissions.rst new file mode 100644 index 000000000..186eab0c5 --- /dev/null +++ b/docs/api/permissions.rst @@ -0,0 +1,13 @@ + +.. _permissions_api: + +Permission ranges +================= + +.. currentmodule:: docx.permissions + + +|PermissionRange| objects +------------------------- + +.. autoclass:: PermissionRange() diff --git a/docs/api/ruby.rst b/docs/api/ruby.rst new file mode 100644 index 000000000..ce6a3b8f6 --- /dev/null +++ b/docs/api/ruby.rst @@ -0,0 +1,13 @@ + +.. _ruby_api: + +Ruby (phonetic) annotations +=========================== + +.. currentmodule:: docx.ruby + + +|RubyAnnotation| objects +------------------------ + +.. 
autoclass:: RubyAnnotation() diff --git a/docs/api/search.rst b/docs/api/search.rst new file mode 100644 index 000000000..b7da6b87c --- /dev/null +++ b/docs/api/search.rst @@ -0,0 +1,41 @@ + +.. _search_api: + +Search and replace +================== + +.. currentmodule:: docx.search + +High-level ``Document.search`` / ``Document.replace`` methods delegate to the +functions in this module. Use the functions directly when operating on a +list of paragraphs returned by another API. + + +|SearchMatch| objects +--------------------- + +.. autoclass:: SearchMatch() + + +Searching +--------- + +.. autofunction:: search_paragraphs + +.. autofunction:: search_paragraphs_regex + +.. autofunction:: search_all_paragraphs + +.. autofunction:: search_all_paragraphs_regex + + +Replacing +--------- + +.. autofunction:: replace_in_paragraphs + +.. autofunction:: replace_in_paragraphs_regex + +.. autofunction:: replace_in_all_paragraphs + +.. autofunction:: replace_in_all_paragraphs_regex diff --git a/docs/api/section.rst b/docs/api/section.rst index e2d547c75..763cf54ef 100644 --- a/docs/api/section.rst +++ b/docs/api/section.rst @@ -14,7 +14,6 @@ Provides access to section properties such as margins and page orientation. .. currentmodule:: docx.section .. autoclass:: Sections - :members: |Section| objects @@ -22,7 +21,6 @@ Provides access to section properties such as margins and page orientation. .. autoclass:: Section - :members: |_Header| and |_Footer| objects @@ -30,12 +28,46 @@ Provides access to section properties such as margins and page orientation. .. autoclass:: _Header() - :inherited-members: - :members: + :inherited-members: :exclude-members: part .. autoclass:: _Footer() - :inherited-members: - :members: + :inherited-members: :exclude-members: part + + +|Column| objects +---------------- + +.. autoclass:: Column + + +|SectionColumns| objects +------------------------ + +.. autoclass:: SectionColumns + + +|PageBorder| objects +-------------------- + +.. 
autoclass:: PageBorder + + +|PageBorders| objects +--------------------- + +.. autoclass:: PageBorders + + +|LineNumbering| objects +----------------------- + +.. autoclass:: LineNumbering + + +|DocumentGrid| objects +---------------------- + +.. autoclass:: DocumentGrid diff --git a/docs/api/settings.rst b/docs/api/settings.rst index 509b925b5..6758d579f 100644 --- a/docs/api/settings.rst +++ b/docs/api/settings.rst @@ -7,7 +7,30 @@ Document |Settings| objects .. currentmodule:: docx.settings .. autoclass:: Settings() - :members: :inherited-members: :exclude-members: part + + +|DocumentProtection| objects +---------------------------- + +.. autoclass:: DocumentProtection() + + +CompatSettings objects +---------------------- + +.. autoclass:: CompatSettings() + + +CompatFlags objects +------------------- + +.. autoclass:: CompatFlags() + + +|MailMerge| objects +------------------- + +.. autoclass:: MailMerge() diff --git a/docs/api/shape.rst b/docs/api/shape.rst index 200b34977..45ebdd14e 100644 --- a/docs/api/shape.rst +++ b/docs/api/shape.rst @@ -11,7 +11,6 @@ Shape-related objects ---------------------- .. autoclass:: InlineShapes - :members: :exclude-members: add_picture @@ -28,4 +27,9 @@ have built-in units conversion properties, e.g.:: 1.0 .. autoclass:: InlineShape - :members: height, type, width + + +|FloatingImage| objects +----------------------- + +.. autoclass:: FloatingImage diff --git a/docs/api/signatures.rst b/docs/api/signatures.rst new file mode 100644 index 000000000..aba6c7587 --- /dev/null +++ b/docs/api/signatures.rst @@ -0,0 +1,13 @@ + +.. _signatures_api: + +Digital signatures +================== + +.. currentmodule:: docx.signatures + + +|SignatureInfo| objects +----------------------- + +.. autoclass:: SignatureInfo() diff --git a/docs/api/smart-art.rst b/docs/api/smart-art.rst new file mode 100644 index 000000000..9e25ea6b8 --- /dev/null +++ b/docs/api/smart-art.rst @@ -0,0 +1,25 @@ + +.. _smart_art_api: + +SmartArt +======== + +.. 
currentmodule:: docx.smart_art + + +|SmartArt| objects +------------------ + +.. autoclass:: SmartArt() + + +SmartArt nodes +-------------- + +.. autoclass:: SmartArtNode() + + +Helpers +------- + +.. autofunction:: smart_art_for_drawing diff --git a/docs/api/stable-ids.rst b/docs/api/stable-ids.rst new file mode 100644 index 000000000..ad393cf67 --- /dev/null +++ b/docs/api/stable-ids.rst @@ -0,0 +1,18 @@ + +.. _stable_ids_api: + +Stable identifiers +================== + +.. currentmodule:: docx.ids + +The :mod:`docx.ids` module provides pragmatic, mostly-stable identifiers for +paragraphs, runs, tables, and cells. The high-level API surface is the +``stable_id`` property on each of those proxy classes; the helper below is +exposed for advanced use-cases. + + +Functions +--------- + +.. autofunction:: compute_stable_id diff --git a/docs/api/statistics.rst b/docs/api/statistics.rst new file mode 100644 index 000000000..82dc28067 --- /dev/null +++ b/docs/api/statistics.rst @@ -0,0 +1,19 @@ + +.. _statistics_api: + +Document statistics +=================== + +.. currentmodule:: docx.statistics + + +|DocumentStatistics| objects +---------------------------- + +.. autoclass:: DocumentStatistics() + + +Functions +--------- + +.. autofunction:: compute_statistics diff --git a/docs/api/table.rst b/docs/api/table.rst index 6f27670fa..d69ff28fb 100644 --- a/docs/api/table.rst +++ b/docs/api/table.rst @@ -14,7 +14,6 @@ Table objects are constructed using the ``add_table()`` method on |Document|. --------------- .. autoclass:: Table - :members: :exclude-members: table @@ -22,8 +21,7 @@ Table objects are constructed using the ``add_table()`` method on |Document|. ------------------------ .. autoclass:: _Cell - :inherited-members: - :members: + :inherited-members: :exclude-members: part @@ -31,25 +29,57 @@ Table objects are constructed using the ``add_table()`` method on |Document|. -------------- .. autoclass:: _Row - :members: |_Column| objects ----------------- ..
autoclass:: _Column - :members: |_Rows| objects --------------- .. autoclass:: _Rows - :members: |_Columns| objects ------------------ .. autoclass:: _Columns - :members: + + +|CellShading| objects +--------------------- + +.. autoclass:: CellShading + + +|TableBorders| objects +---------------------- + +.. autoclass:: TableBorders + + +BorderElement objects +--------------------- + +.. autoclass:: BorderElement + + +|TableStyleFlags| objects +------------------------- + +.. autoclass:: TableStyleFlags + + +|CellBorders| objects +--------------------- + +.. autoclass:: CellBorders + + +|CellMargins| objects +--------------------- + +.. autoclass:: CellMargins diff --git a/docs/api/text.rst b/docs/api/text.rst index f76e3ba33..f02a7acac 100644 --- a/docs/api/text.rst +++ b/docs/api/text.rst @@ -9,49 +9,72 @@ Text-related objects ------------------- .. autoclass:: docx.text.paragraph.Paragraph() - :members: |ParagraphFormat| objects ------------------------- .. autoclass:: docx.text.parfmt.ParagraphFormat() - :members: + + +|ParagraphBorders| objects +-------------------------- + +.. autoclass:: docx.text.parfmt.ParagraphBorders() + + +Border objects +-------------- + +.. autoclass:: docx.text.parfmt.Border() + + +|TextFrame| objects +------------------- + +.. autoclass:: docx.text.parfmt.TextFrame() |Hyperlink| objects ------------------- .. autoclass:: docx.text.hyperlink.Hyperlink() - :members: |Run| objects ------------- .. autoclass:: docx.text.run.Run() - :members: |Font| objects -------------- .. autoclass:: docx.text.run.Font() - :members: + + +|EastAsianLayout| objects +------------------------- + +.. autoclass:: docx.text.font.EastAsianLayout() + + +|Symbol| objects +---------------- + +.. autoclass:: docx.text.symbol.Symbol() |RenderedPageBreak| objects --------------------------- .. autoclass:: docx.text.pagebreak.RenderedPageBreak() - :members: |TabStop| objects ----------------- .. 
autoclass:: docx.text.tabstops.TabStop() - :members: |TabStops| objects @@ -61,3 +84,4 @@ Text-related objects :members: clear_all .. automethod:: docx.text.tabstops.TabStops.add_tab_stop(position, alignment=WD_TAB_ALIGNMENT.LEFT, leader=WD_TAB_LEADER.SPACES) + :no-index: diff --git a/docs/api/theme.rst b/docs/api/theme.rst new file mode 100644 index 000000000..6b89a2b43 --- /dev/null +++ b/docs/api/theme.rst @@ -0,0 +1,25 @@ + +.. _theme_api: + +Theme +===== + +.. currentmodule:: docx.theme + + +|Theme| objects +--------------- + +.. autoclass:: Theme() + + +Theme colors +------------ + +.. autoclass:: ThemeColors() + + +Theme fonts +----------- + +.. autoclass:: ThemeFonts() diff --git a/docs/api/toc.rst b/docs/api/toc.rst new file mode 100644 index 000000000..a04055030 --- /dev/null +++ b/docs/api/toc.rst @@ -0,0 +1,18 @@ + +.. _toc_api: + +Table of contents +================= + +.. currentmodule:: docx.toc + +High-level TOC creation lives on :meth:`.Document.add_table_of_contents`. The +helpers below are exposed for advanced use-cases. + + +Functions +--------- + +.. autofunction:: build_toc_instruction + +.. autofunction:: populate_toc_paragraph diff --git a/docs/api/tracked-changes.rst b/docs/api/tracked-changes.rst new file mode 100644 index 000000000..630887188 --- /dev/null +++ b/docs/api/tracked-changes.rst @@ -0,0 +1,25 @@ + +.. _tracked_changes_api: + +Tracked changes +=============== + +.. currentmodule:: docx.tracked_changes + + +|TrackedChange| objects +----------------------- + +.. autoclass:: TrackedChange() + + +|MoveRevision| objects +---------------------- + +.. autoclass:: MoveRevision() + + +|FormattingChange| objects +-------------------------- + +.. autoclass:: FormattingChange() diff --git a/docs/api/watermark.rst b/docs/api/watermark.rst new file mode 100644 index 000000000..cddcb8a43 --- /dev/null +++ b/docs/api/watermark.rst @@ -0,0 +1,18 @@ + +.. _watermark_api: + +Watermark +========= + +.. 
currentmodule:: docx.watermark + +The read side of text and image watermarks. +``Section.add_text_watermark()`` and ``Section.add_image_watermark()`` +create watermarks; the :class:`.Watermark` object below exposes the +properties of an existing watermark. + + +|Watermark| objects +------------------- + +.. autoclass:: Watermark() diff --git a/docs/api/web-settings.rst b/docs/api/web-settings.rst new file mode 100644 index 000000000..ecbf3cfde --- /dev/null +++ b/docs/api/web-settings.rst @@ -0,0 +1,13 @@ + +.. _web_settings_api: + +Web settings +============ + +.. currentmodule:: docx.web_settings + + +|WebSettings| objects +--------------------- + +.. autoclass:: WebSettings() diff --git a/docs/conf.py b/docs/conf.py index 883ecb81d..b2661f503 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -33,13 +33,88 @@ extensions = [ "sphinx.ext.autodoc", "sphinx.ext.intersphinx", - "sphinx.ext.todo", - "sphinx.ext.coverage", + "sphinx.ext.napoleon", "sphinx.ext.viewcode", ] +# -- Napoleon (Google / NumPy docstring support) ---------------------------- +napoleon_google_docstring = True +napoleon_numpy_docstring = True +napoleon_include_init_with_doc = False +napoleon_include_private_with_doc = False +napoleon_include_special_with_doc = True + + +def setup(app): + r"""Install a pre-napoleon hook that disables napoleon's attribute-style + parsing for property/attribute/data docstrings. + + ---------------------------------------------------------------------- + Why this hook exists + ---------------------------------------------------------------------- + This fork's public API is heavily documented in RST style. Property and + attribute docstrings frequently quote OOXML element / attribute names + using single-backtick inline code — e.g.:: + + Read/write. Corresponds to the `w:moveFrom` element on this run. + + Sphinx-napoleon's ``_parse_attribute_docstring`` path treats concise + attribute docstrings as ``"type : description"`` and splits on the + first colon it finds. 
Because ``w:moveFrom`` contains a colon *inside* + a backtick span, napoleon misinterprets ``w:moveFrom`` as the + ``:type:`` field, producing ~42 bogus ``docutils`` warnings of the + form "Unknown target name: 'w:moveFrom'" (and similar) across + ``chart.py``, ``content_controls.py``, ``document.py``, + ``permissions.py``, ``ruby.py``, ``section.py``, ``styles/style.py``, + ``table.py``, ``text/paragraph.py``, ``text/run.py``, and + ``tracked_changes.py``. + + ---------------------------------------------------------------------- + How it works + ---------------------------------------------------------------------- + We register two ``autodoc-process-docstring`` listeners around + napoleon's own listener (which sits at the default priority of 500): + + * ``_snapshot`` runs at priority 100 — before napoleon — and takes a + copy of the raw docstring lines for every ``property``/ + ``attribute``/``data`` object. + * ``_restore`` runs at priority 900 — after napoleon has mangled + the lines — and writes the pristine snapshot back in place, + undoing the ``type : description`` split. + + The net effect is: napoleon is a no-op for attribute-style docstrings + (which are already valid RST), and still parses Google/NumPy + method/function/class docstrings normally. + + ---------------------------------------------------------------------- + Do not remove without fixing the root cause + ---------------------------------------------------------------------- + The alternative is to rewrite every single-backtick OOXML reference + in those ~11 files to use double-backtick literals (``w:moveFrom``), + so the napoleon ``:``-split never triggers. That's a large mechanical + refactor for a purely cosmetic docs-parsing issue that this hook + already handles cleanly. If you remove this hook, run + ``cd docs && python -m sphinx -b html . _build/html`` and confirm the + build is still warning-free first. 
+ """ + snapshots: dict = {} + + def _snapshot(app, what, name, obj, options, lines): + if what in ("property", "attribute", "data"): + snapshots[name] = list(lines) + + def _restore(app, what, name, obj, options, lines): + if what in ("property", "attribute", "data") and name in snapshots: + lines[:] = snapshots.pop(name) + + # Napoleon's listener registers at default priority (500). Use 100 for + # snapshot (runs first) and 900 for restore (runs last). + app.connect("autodoc-process-docstring", _snapshot, priority=100) + app.connect("autodoc-process-docstring", _restore, priority=900) + + # Add any paths that contain templates here, relative to this directory. -templates_path = ["_templates"] +# templates_path = ["_templates"] # The suffix of source filenames. source_suffix = ".rst" @@ -77,16 +152,36 @@ .. |_Body| replace:: :class:`._Body` +.. |Bookmark| replace:: :class:`.Bookmark` + +.. |Bookmarks| replace:: :class:`.Bookmarks` + +.. |Border| replace:: :class:`.Border` + +.. |BuildingBlock| replace:: :class:`.BuildingBlock` + +.. |BuildingBlockCategory| replace:: :class:`.BuildingBlockCategory` + .. |_Cell| replace:: :class:`._Cell` +.. |CellBorders| replace:: :class:`.CellBorders` + +.. |CellMargins| replace:: :class:`.CellMargins` + +.. |CellShading| replace:: :class:`.CellShading` + .. |_CharacterStyle| replace:: :class:`.CharacterStyle` .. |CharacterStyle| replace:: :class:`.CharacterStyle` +.. |Chart| replace:: :class:`.Chart` + .. |Cm| replace:: :class:`.Cm` .. |ColorFormat| replace:: :class:`.ColorFormat` +.. |Column| replace:: :class:`.Column` + .. |_Column| replace:: :class:`._Column` .. |_Columns| replace:: :class:`._Columns` @@ -95,38 +190,92 @@ .. |Comments| replace:: :class:`.Comments` +.. |CompatFlags| replace:: :class:`.CompatFlags` + +.. |CompatSettings| replace:: :class:`.CompatSettings` + +.. |ContentControl| replace:: :class:`.ContentControl` + +.. |DataBinding| replace:: :class:`.DataBinding` + +.. 
|DocumentProtection| replace:: :class:`.DocumentProtection` + .. |CoreProperties| replace:: :class:`.CoreProperties` +.. |CustomProperties| replace:: :class:`.CustomProperties` + +.. |CustomXmlPart| replace:: :class:`.CustomXmlPart` + .. |datetime| replace:: :class:`.datetime.datetime` .. |Document| replace:: :class:`.Document` +.. |DocumentGrid| replace:: :class:`.DocumentGrid` + .. |DocumentPart| replace:: :class:`.DocumentPart` +.. |DocumentStatistics| replace:: :class:`.DocumentStatistics` + .. |docx| replace:: ``python-docx`` +.. |Drawing| replace:: :class:`.Drawing` + +.. |EastAsianLayout| replace:: :class:`.EastAsianLayout` + +.. |EmbeddedObject| replace:: :class:`.EmbeddedObject` + +.. |EndnoteProperties| replace:: :class:`.EndnoteProperties` + +.. |Endnote| replace:: :class:`.Endnote` + +.. |Endnotes| replace:: :class:`.Endnotes` + .. |Emu| replace:: :class:`.Emu` +.. |Equation| replace:: :class:`.Equation` + .. |False| replace:: :class:`False` +.. |Field| replace:: :class:`.Field` + .. |float| replace:: :class:`.float` +.. |FloatingImage| replace:: :class:`.FloatingImage` + .. |Font| replace:: :class:`.Font` +.. |FontTable| replace:: :class:`.FontTable` + .. |_Footer| replace:: :class:`._Footer` .. |FooterPart| replace:: :class:`.FooterPart` +.. |FootnoteProperties| replace:: :class:`.FootnoteProperties` + +.. |Footnote| replace:: :class:`.Footnote` + +.. |Footnotes| replace:: :class:`.Footnotes` + +.. |FormattingChange| replace:: :class:`.FormattingChange` + +.. |FormField| replace:: :class:`.FormField` + +.. |Glossary| replace:: :class:`.Glossary` + .. |_Header| replace:: :class:`._Header` .. |HeaderPart| replace:: :class:`.HeaderPart` +.. |HeadingIssue| replace:: :class:`.HeadingIssue` + .. |Hyperlink| replace:: :class:`.Hyperlink` .. |ImageParts| replace:: :class:`.ImageParts` .. |Inches| replace:: :class:`.Inches` +.. |InkAnnotation| replace:: :class:`.InkAnnotation` + .. |InlineShape| replace:: :class:`.InlineShape` .. 
|InlineShapes| replace:: :class:`.InlineShapes` @@ -135,22 +284,42 @@ .. |int| replace:: :class:`.int` +.. |KeyError| replace:: :exc:`.KeyError` + .. |_LatentStyle| replace:: :class:`._LatentStyle` .. |LatentStyles| replace:: :class:`.LatentStyles` .. |Length| replace:: :class:`.Length` +.. |Level| replace:: :class:`.Level` + +.. |LineNumbering| replace:: :class:`.LineNumbering` + +.. |MailMerge| replace:: :class:`.MailMerge` + +.. |MoveRevision| replace:: :class:`.MoveRevision` + .. |None| replace:: :class:`.None` +.. |Numbering| replace:: :class:`.Numbering` + +.. |NumberingDefinition| replace:: :class:`.NumberingDefinition` + .. |NumberingPart| replace:: :class:`.NumberingPart` .. |_NumberingStyle| replace:: :class:`._NumberingStyle` .. |OpcPackage| replace:: :class:`.OpcPackage` +.. |PageBorder| replace:: :class:`.PageBorder` + +.. |PageBorders| replace:: :class:`.PageBorders` + .. |Paragraph| replace:: :class:`.Paragraph` +.. |ParagraphBorders| replace:: :class:`.ParagraphBorders` + .. |ParagraphFormat| replace:: :class:`.ParagraphFormat` .. |_ParagraphStyle| replace:: :class:`.ParagraphStyle` @@ -159,6 +328,8 @@ .. |Part| replace:: :class:`.Part` +.. |PermissionRange| replace:: :class:`.PermissionRange` + .. |Pt| replace:: :class:`.Pt` .. |_Relationship| replace:: :class:`._Relationship` @@ -173,22 +344,38 @@ .. |_Rows| replace:: :class:`._Rows` +.. |RubyAnnotation| replace:: :class:`.RubyAnnotation` + .. |Run| replace:: :class:`.Run` +.. |SearchMatch| replace:: :class:`.SearchMatch` + .. |Section| replace:: :class:`.Section` +.. |SectionColumns| replace:: :class:`.SectionColumns` + .. |Sections| replace:: :class:`.Sections` .. |Settings| replace:: :class:`.Settings` +.. |SignatureInfo| replace:: :class:`.SignatureInfo` + +.. |SmartArt| replace:: :class:`.SmartArt` + .. |str| replace:: :class:`.str` .. |Styles| replace:: :class:`.Styles` .. |StylesPart| replace:: :class:`.StylesPart` +.. |Symbol| replace:: :class:`.Symbol` + .. 
|Table| replace:: :class:`.Table` +.. |TableBorders| replace:: :class:`.TableBorders` + +.. |TableStyleFlags| replace:: :class:`.TableStyleFlags` + .. |_TableStyle| replace:: :class:`._TableStyle` .. |TabStop| replace:: :class:`.TabStop` @@ -197,15 +384,61 @@ .. |_Text| replace:: :class:`._Text` +.. |TextFrame| replace:: :class:`.TextFrame` + +.. |Theme| replace:: :class:`.Theme` + +.. |ThemeColors| replace:: :class:`.ThemeColors` + +.. |ThemeFonts| replace:: :class:`.ThemeFonts` + +.. |TrackedChange| replace:: :class:`.TrackedChange` + .. |True| replace:: :class:`True` .. |ValueError| replace:: :class:`ValueError` + +.. |Watermark| replace:: :class:`.Watermark` + +.. |WD_ANCHOR_H| replace:: :class:`.WD_ANCHOR_H` + +.. |WD_ANCHOR_V| replace:: :class:`.WD_ANCHOR_V` + +.. |WD_BORDER_STYLE| replace:: :class:`.WD_BORDER_STYLE` + +.. |WD_BUILDING_BLOCK_GALLERY| replace:: :class:`.WD_BUILDING_BLOCK_GALLERY` + +.. |WD_DOC_GRID_TYPE| replace:: :class:`.WD_DOC_GRID_TYPE` + +.. |WD_LINE_NUMBERING_RESTART| replace:: :class:`.WD_LINE_NUMBERING_RESTART` + +.. |WD_PROTECTION| replace:: :class:`.WD_PROTECTION` + +.. |WD_SHADING_PATTERN| replace:: :class:`.WD_SHADING_PATTERN` + +.. |WD_TABLE_AUTOFIT| replace:: :class:`.WD_TABLE_AUTOFIT` + +.. |WD_VIEW| replace:: :class:`.WD_VIEW` + +.. |WD_WRAP_TYPE| replace:: :class:`.WD_WRAP_TYPE` + +.. |WebSettings| replace:: :class:`.WebSettings` """ # List of patterns, relative to source directory, that match files and # directories to ignore when looking for source files. -exclude_patterns = [".build"] +exclude_patterns = ["_build"] + +# -- Autodoc ----------------------------------------------------------------- + +# Default options applied to every ``.. autoclass::`` / ``.. automodule::`` +# directive. Individual directives can still override these with their own +# option list. +autodoc_default_options = { + "members": True, + "show-inheritance": True, +} # The reST default role (used for this markup: `text`) to use for all # documents. 
@@ -233,15 +466,16 @@ # The theme to use for HTML and HTML Help pages. See the documentation for # a list of builtin themes. -html_theme = "armstrong" +html_theme = "furo" # Theme options are theme-specific and customize the look and feel of a theme # further. For a list of options available for each theme, see the # documentation. -# html_theme_options = {} - -# Add any paths that contain custom themes here, relative to this directory. -html_theme_path = ["_themes"] +html_theme_options = { + "source_repository": "https://github.com/loadfix/python-docx/", + "source_branch": "master", + "source_directory": "docs/", +} # The name for this set of Sphinx documents. If None, it defaults to # " v documentation". @@ -273,8 +507,8 @@ # html_use_smartypants = True # Custom sidebar templates, maps document names to template names. +# Furo supplies its own sidebar out of the box; keep defaults. # html_sidebars = {} -html_sidebars = {"**": ["localtoc.html", "relations.html", "sidebarlinks.html", "searchbox.html"]} # Additional templates that should be rendered to pages, maps page names to # template names. @@ -390,4 +624,4 @@ # Example configuration for intersphinx: refer to the Python standard library. 
-intersphinx_mapping = {"http://docs.python.org/3/": None} +intersphinx_mapping = {"python": ("https://docs.python.org/3/", None)} diff --git a/docs/index.rst b/docs/index.rst index aee0acfbf..585c87a94 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -30,35 +30,40 @@ Here's an example of what |docx| can do: p.add_run(' and some ') p.add_run('italic.').italic = True + # -- fork feature: attach a footnote to a run -- + document.footnotes.add(p.runs[0], 'Footnote body text.') + + # -- fork feature: attach a comment to a range of runs -- + document.add_comment( + runs=p.runs, + text='A reviewer comment.', + author='Editor', + initials='ED', + ) + document.add_heading('Heading, level 1', level=1) document.add_paragraph('Intense quote', style='Intense Quote') document.add_paragraph( 'first item in unordered list', style='List Bullet' ) - document.add_paragraph( - 'first item in ordered list', style='List Number' - ) document.add_picture('monty-truth.png', width=Inches(1.25)) - records = ( - (3, '101', 'Spam'), - (7, '422', 'Eggs'), - (4, '631', 'Spam, spam, eggs, and spam') - ) - table = document.add_table(rows=1, cols=3) hdr_cells = table.rows[0].cells hdr_cells[0].text = 'Qty' hdr_cells[1].text = 'Id' hdr_cells[2].text = 'Desc' - for qty, id, desc in records: + for qty, id, desc in ((3, '101', 'Spam'), (7, '422', 'Eggs')): row_cells = table.add_row().cells row_cells[0].text = str(qty) row_cells[1].text = id row_cells[2].text = desc + # -- fork feature: search and replace across every story -- + document.replace_all('Spam', 'Ham') + document.add_page_break() document.save('demo.docx') @@ -75,14 +80,40 @@ User Guide user/quickstart user/documents user/tables + user/tables-advanced user/text + user/text-advanced user/sections + user/sections-advanced user/hdrftr user/api-concepts user/styles-understanding user/styles-using user/comments + user/bookmarks + user/captions + user/charts + user/content-controls + user/custom-properties + user/document-safety + 
user/endnotes + user/equations + user/fields + user/footnotes + user/form-fields + user/glossary + user/mail-merge + user/numbering + user/permissions + user/search user/shapes + user/drawing + user/accessibility + user/statistics + user/toc + user/track-changes + user/themes + user/watermarks API Documentation @@ -101,6 +132,35 @@ API Documentation api/shape api/dml api/shared + api/accessibility + api/bookmarks + api/captions + api/chart + api/content-controls + api/custom-properties + api/custom-xml + api/embedded-objects + api/endnotes + api/equations + api/fields + api/font-table + api/footnotes + api/form-fields + api/glossary + api/ink + api/numbering + api/permissions + api/ruby + api/search + api/signatures + api/smart-art + api/stable-ids + api/statistics + api/theme + api/toc + api/tracked-changes + api/watermark + api/web-settings api/enum/index diff --git a/docs/user/accessibility.rst b/docs/user/accessibility.rst new file mode 100644 index 000000000..d5e1f7253 --- /dev/null +++ b/docs/user/accessibility.rst @@ -0,0 +1,109 @@ +.. _accessibility: + +Checking document accessibility +=============================== + +Screen readers and other assistive technologies rely on a clean *heading +outline* to navigate a document. Word's own Accessibility Checker flags common +problems — skipped heading levels, multiple top-level headings, empty heading +paragraphs — and |docx| provides a small API that surfaces the same class of +issues so that build pipelines, CMS imports, and validation scripts can catch +them before a document is published. + +The main entry point is :meth:`.Document.validate_heading_structure`, which +returns a list of |HeadingIssue| objects describing each problem it detects. +When the document's outline is clean the list is empty. + + +What counts as a heading +------------------------ + +A paragraph is considered a heading when its style name matches ``"Heading N"`` +for ``N`` in ``1`` through ``9``, case-insensitively. 
Paragraphs with any other +style (including ``Title``, ``Subtitle``, or custom styles that merely *look* +like headings) are ignored by the validator. This matches how Word builds its +document outline from the built-in heading styles. + + +Detected issues +--------------- + +Each |HeadingIssue| carries three attributes: + +``paragraph`` + The offending |Paragraph|, so callers can jump straight to the problem. + +``kind`` + A short string identifier. One of: + + * ``"skipped_level"`` — a heading skips one or more outline levels + (e.g. a ``Heading 3`` that directly follows a ``Heading 1`` without an + intervening ``Heading 2``). + * ``"multiple_h1"`` — the document contains more than one top-level + heading. Only the *second* and later ``Heading 1`` paragraphs are + flagged; the first is considered canonical. + * ``"empty_heading"`` — a heading paragraph has no visible text after + whitespace has been stripped. + * ``"no_h1"`` — the first heading in the document is below + ``Heading 1`` (e.g. the outline starts at ``Heading 2``). + +``message`` + A human-readable description of the problem suitable for displaying to + the author. These strings are not meant to be parsed; prefer branching on + ``kind`` and building your own messages when you need i18n or tight UX + control. + + +Running the validator +--------------------- + +Validation happens over the body's paragraphs (tables, headers, footers, and +comment text are not scanned). A minimal example:: + + >>> from docx import Document + >>> document = Document("report.docx") + >>> issues = document.validate_heading_structure() + >>> for issue in issues: + ... 
print(f"{issue.kind}: {issue.message}") + skipped_level: Heading 3 follows Heading 1; Heading 2 is missing + multiple_h1: Document contains more than one Heading 1; exactly one + top-level heading is recommended + +Issues are returned in document order, which makes them convenient to feed +straight into a linter-style report or to highlight inline in an editor. + + +Building an accessibility gate +------------------------------ + +A common pattern is to fail the build when *any* heading issues are present:: + + issues = document.validate_heading_structure() + if issues: + for issue in issues: + print(f"{issue.kind}: {issue.message}") + raise SystemExit(1) + +Or to selectively allow certain categories (for instance, tolerating multiple +``Heading 1`` paragraphs during a migration):: + + BLOCKING = {"skipped_level", "empty_heading", "no_h1"} + blocking = [i for i in document.validate_heading_structure() if i.kind in BLOCKING] + if blocking: + raise SystemExit(1) + + +Working directly with the function +---------------------------------- + +:meth:`.Document.validate_heading_structure` is a thin wrapper around +:func:`docx.accessibility.validate_heading_structure`, which accepts *any* +iterable of |Paragraph| objects. This is handy for validating just one +section of a document, a comment, or a filtered view:: + + from docx.accessibility import validate_heading_structure + + issues = validate_heading_structure(p for p in document.paragraphs if p.text) + +The function is pure — it does not modify the paragraphs it inspects — so it +is safe to call repeatedly during a larger document-building workflow. diff --git a/docs/user/api-concepts.rst b/docs/user/api-concepts.rst index 2ebe734a3..ecc6972a6 100644 --- a/docs/user/api-concepts.rst +++ b/docs/user/api-concepts.rst @@ -29,3 +29,204 @@ inline objects must be created individually, you'll need the block-item reference to do it. ... add example here as API solidifies ... 
+ + +Architecture: proxy, part, oxml +------------------------------- + +Most |docx| users never need to look below the friendly ``Document``, +``Paragraph``, ``Run``, and ``Table`` APIs. That's by design. When a question +gets more interesting however — "how is this actually stored on disk?", +"why does my edit round-trip that way?", or "how can I reach a feature the +high-level API doesn't yet cover?" — it helps to understand the three layers +the library is built on. + +|docx| is organized as a stack:: + + Document API (src/docx/document.py, src/docx/text/*.py, ...) + | Proxy objects wrapping oxml elements + Parts Layer (src/docx/parts/*.py) + | XmlPart subclasses owning XML trees and relationships + oxml Layer (src/docx/oxml/*.py) + | CT_* element classes extending lxml.etree.ElementBase + lxml (XML parsing and serialization) + +Each layer is narrow and does exactly one job: + +* **Document API (proxy).** The classes you import from |docx| — ``Document``, + ``Paragraph``, ``Run``, ``Table``, ``Section``, ``Footnote`` — are *proxy + objects*. They hold no content of their own; they wrap a single oxml element + and expose an ergonomic, Pythonic interface over it. Proxies inherit from + ``ElementProxy``, ``StoryChild``, or ``BlockItemContainer`` depending on + what they wrap. + +* **Parts Layer.** A `.docx` file is really a ZIP package (OPC) containing + several XML *parts* — the main document, styles, numbering, comments, + footnotes, and so on — joined together by relationships. Each part is + represented by a subclass of ``XmlPart`` (or ``StoryPart``), which owns the + parsed XML tree for that part and knows how to find related parts. The + parts layer is where lazy creation happens: a footnotes part, for example, + is only attached to the document when something first asks for it. + +* **oxml Layer.** The ``CT_*`` classes in ``src/docx/oxml/`` are thin + subclasses of ``lxml.etree.ElementBase``. They are the XML; they don't wrap + anything. 
They give element types a name (``CT_Footnote``, ``CT_Paragraph``, + ``CT_R``), provide typed accessors for child elements and attributes, and + enforce OOXML schema ordering when new children are inserted. + +Looking at it from the other direction: every proxy object holds its +underlying XML element in a ``_element`` (often aliased ``_p``, ``_r``, ``_tc``, +etc.) attribute. Every such element is an instance of a ``CT_*`` class. The +part that owns it is reachable through its ancestors. You can always move +between layers — the split is organizational, not a security boundary. + + +A concrete pair: ``Footnote`` and ``CT_Footnote`` +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Here is a proxy class and the oxml element it wraps, reduced to the essentials. +The oxml class describes the ``<w:footnote>`` element, and the proxy class +gives application code a friendly API over it:: + + # src/docx/oxml/footnotes.py + from docx.oxml.xmlchemy import ( + BaseOxmlElement, RequiredAttribute, ZeroOrMore, ZeroOrOne, + ) + from docx.oxml.simpletypes import ST_DecimalNumber + + class CT_Footnote(BaseOxmlElement): + """``<w:footnote>`` element.""" + pPr = ZeroOrOne("w:pPr", successors=("w:r",)) + r = ZeroOrMore("w:r", successors=()) + id = RequiredAttribute("w:id", ST_DecimalNumber) + + # src/docx/footnotes.py + class Footnote(BlockItemContainer): + """Proxy for a single ``<w:footnote>`` element.""" + + @property + def footnote_id(self): + return self._element.id + +Two things are worth noting: + +* The proxy's ``footnote_id`` delegates straight to ``self._element.id``. + The proxy adds no storage; it only translates attribute access into + operations on the underlying element. + +* The ``CT_Footnote`` class says nothing about "the Python API"; it is + exclusively a description of XML shape. A different proxy could be layered + on top of the same element class without any changes to oxml. 
+ + +``BaseOxmlElement``, ``ZeroOrOne``, ``ZeroOrMore``, and successors +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +The oxml layer is built with an internal helper package called +``xmlchemy``. It provides descriptors that turn child-element and attribute +declarations on a ``CT_*`` class into Pythonic accessors: + +* ``ZeroOrOne(tag, successors=(...))`` — declares an optional single child. + It generates a read-only attribute with that name, plus + ``_add_<name>()``, ``get_or_add_<name>()``, ``_remove_<name>()``, and + ``_insert_<name>()`` helpers. + +* ``ZeroOrMore(tag, successors=(...))`` — declares a repeating child. + It generates a ``<name>_lst`` property returning a list, plus + ``add_<name>()`` and ``_insert_<name>()`` helpers. + +* ``OneAndOnlyOne`` / ``OneOrMore`` — variants with different cardinality + semantics. + +* ``RequiredAttribute`` / ``OptionalAttribute`` — typed attribute descriptors + that validate values through an ``ST_*`` simpletype. + +The ``successors`` tuple is important. OOXML is position-sensitive: a +``<w:pPr>`` that appears *after* the runs of a paragraph is not the same +thing as one that appears before them — Word will reject or silently mangle +malformed ordering. ``successors`` names the sibling tags that, if present, +must come *after* the element being inserted. ``xmlchemy`` uses it to pick a +correct insertion point when adding children. + +Getting ``successors`` right therefore requires consulting the schema. +The canonical source for element ordering lives in the ``spec/`` folder at +the repository root: + +* ``spec/xsd/wml.xsd`` — WordprocessingML (paragraphs, runs, tables, sections, + footnotes, comments, etc.). +* ``spec/xsd/dml-wordprocessingDrawing.xsd`` — inline and anchor drawing + wrappers used for images and shapes inside a Word document. +* ``spec/xsd/shared-math.xsd`` — Office Math (OMML). 
+* ``spec/rnc/*.rnc`` — RELAX NG Compact equivalents of the same grammars, + substantially easier to read than the XSDs when you only want the ordering. + +When the XSD and observed Word behaviour disagree, treat Word's behaviour +as authoritative for interoperability and the XSD as authoritative for what +the spec permits. + + +Registering elements and relationships +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Declaring a ``CT_*`` class is not enough on its own — lxml needs to know +that the tag ``w:footnote`` should be parsed into an instance of +``CT_Footnote``. That mapping is installed by ``register_element_cls`` in +``src/docx/oxml/__init__.py``:: + + register_element_cls("w:footnote", CT_Footnote) + +Without that line, parsing a footnote element would give you a generic lxml +element and the descriptors on ``CT_Footnote`` would never run. + +Adjacent constants live in ``src/docx/opc/constants.py``. That module +defines both content types (``CT.WML_FOOTNOTES``, ``CT.WML_COMMENTS``, etc.) +and relationship types (``RT.FOOTNOTES``, ``RT.COMMENTS``) used when a part +attaches itself to the package. New part classes — defined under +``src/docx/parts/`` and registered on ``PartFactory.part_type_for`` in +``src/docx/__init__.py`` — reach for these constants rather than hard-coding +URIs. + + +Reaching into ``_element``: when (and when not) to do it +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Because |docx|'s proxies are deliberately thin, the escape hatch is simple: +every proxy exposes its underlying oxml element, and every oxml element is +itself an lxml element. If the high-level API does not yet cover a feature +you need, you can always drop down:: + + # paragraph.pPr is a CT_PPr (lxml element). From here, any + # WordprocessingML that can appear under w:pPr is reachable. 
+ pPr = paragraph._p.get_or_add_pPr() + shd = pPr.makeelement(qn("w:shd"), {qn("w:fill"): "FFFF00"}) + pPr.append(shd) + +This is fully supported — it is the same API the library's own proxies are +implemented with. A few rules of thumb: + +* **Prefer the public API where it exists.** The proxies exist precisely + because OOXML has many sharp edges (ordering, reserved IDs, namespace + aliases, schema transitions). When they cover your case, use them. + +* **Drop to oxml for feature gaps.** If you need a paragraph property + python-docx does not yet surface — ruby text, conditional formatting, + a Word 2013+ extension — call ``paragraph._p`` (or ``.element``) and + manipulate the tree directly. This is a legitimate and common pattern. + +* **Treat oxml as semi-public.** ``CT_*`` class names, attribute descriptors, + and the ``_p`` / ``_r`` / ``_tc`` / ``_element`` accessors are stable + enough to build on. Deep internals of ``xmlchemy`` (the descriptor + implementation itself) are not. + +* **Respect schema ordering.** When you insert a new child from oxml, use + ``_insert_<name>()`` on its parent (if one is generated) rather than a bare + ``append()``. That insertion uses the ``successors`` tuple described above + and keeps Word happy. + +* **Namespace everything.** Use ``docx.oxml.ns.qn("w:tag")`` to build + Clark-notation tag names — never hard-coded strings — so that the right + namespace URI is always attached. + +With that shape in mind, the rest of the user guide — individual topics like +comments, footnotes, numbering, or track changes — can be read as a tour of +the proxy APIs layered on top of this same three-tier foundation. diff --git a/docs/user/bookmarks.rst b/docs/user/bookmarks.rst new file mode 100644 index 000000000..2ec4e9914 --- /dev/null +++ b/docs/user/bookmarks.rst @@ -0,0 +1,137 @@ +.. _bookmarks: + +Working with Bookmarks +====================== + +Word allows *bookmarks* to be defined on ranges of text in a document. 
A bookmark +names a specific range so that it can be navigated to (``Insert > Bookmark``) or +cross-referenced elsewhere — most notably by ``REF`` and ``PAGEREF`` fields, which +render the text or page number of the referenced bookmark. + +A bookmark can be added to the main document, and bookmarks may also appear in +headers, footers, footnotes, and table cells. *python-docx* currently models +bookmarks that live in the main document body. + +**Bookmark Anatomy.** Each bookmark is a *range* delimited by two empty marker +elements in the XML, ``<w:bookmarkStart>`` and ``<w:bookmarkEnd>``. Both +markers carry the same integer ``w:id`` attribute; the ``<w:bookmarkStart>`` +additionally carries the bookmark's ``w:name``. The start marker is placed +immediately before the first run in the range and the end marker immediately +after the last run in the range. + +Like a comment reference, a bookmark range must begin and end at a *run* +boundary. A range can start in one paragraph and end in a later paragraph, but +it must always enclose a contiguous sequence of runs. + +**Bookmark Names.** Bookmark names are strings. They must be unique within a +document — Word silently repairs duplicates on load. Names beginning with an +underscore (for example ``_Ref12345``) are conventionally *hidden* bookmarks +used internally by Word to back features such as automatic cross-references. +*python-docx* does not treat hidden bookmarks specially: they appear in the +``document.bookmarks`` collection alongside user-visible bookmarks. + +**Bookmark IDs.** Each bookmark is identified by a non-negative integer ``id`` +that is unique within the document. IDs are assigned automatically when a +bookmark is added via the *python-docx* API; the next available ID is chosen by +scanning existing ``w:bookmarkStart`` elements in the document body. 
+ + +Adding a bookmark +----------------- + +The simplest case is bookmarking a whole paragraph:: + + >>> from docx import Document + >>> document = Document() + >>> paragraph = document.add_paragraph("Hello, world.") + + >>> bookmark = paragraph.add_bookmark("intro") + >>> bookmark + + >>> bookmark.name + 'intro' + >>> bookmark.bookmark_id + 0 + +To bookmark a specific range of runs within a paragraph, pass `start_run` and +`end_run`:: + + >>> paragraph = document.add_paragraph("The ") + >>> paragraph.add_run("middle").bold = True + >>> paragraph.add_run(" run is special.") + + >>> bookmark = paragraph.add_bookmark( + ... "middle", + ... start_run=paragraph.runs[1], + ... end_run=paragraph.runs[1], + ... ) + >>> bookmark.name + 'middle' + +When only `start_run` is supplied, `end_run` defaults to `start_run`, so the +bookmark wraps that single run. When both `start_run` and `end_run` are +``None``, the bookmark wraps the whole paragraph. + +.. note:: + The :meth:`.Paragraph.add_bookmark` method only bookmarks runs inside the + paragraph it is called on. To create a bookmark that spans multiple + paragraphs you currently need to drop down to the element level and insert + the ``<w:bookmarkStart>`` and ``<w:bookmarkEnd>`` markers yourself. + + +Accessing the bookmarks collection +---------------------------------- + +The collection of bookmarks in a document is accessed via the +:attr:`.Document.bookmarks` property:: + + >>> bookmarks = document.bookmarks + >>> bookmarks + + >>> len(bookmarks) + 2 + +The collection is iterable and yields |Bookmark| objects in document order:: + + >>> for bm in bookmarks: + ... print(bm.name, bm.bookmark_id) + intro 0 + middle 1 + + +Looking up a bookmark by name +----------------------------- + +Bookmarks are typically referenced by name. 
:meth:`.Bookmarks.get` returns the +matching bookmark, or ``None`` when no bookmark with that name is present:: + + >>> bookmarks.get("intro") + + >>> bookmarks.get("not_a_bookmark") is None + True + +The collection also supports ``in`` for a quick presence check:: + + >>> "intro" in bookmarks + True + >>> "not_a_bookmark" in bookmarks + False + + +Deleting a bookmark +------------------- + +A bookmark can be removed from the document by calling +:meth:`.Bookmark.delete`. This removes both the ``<w:bookmarkStart>`` and the +matching ``<w:bookmarkEnd>`` marker, leaving the bookmarked text in place:: + + >>> bookmark = bookmarks.get("intro") + >>> bookmark.delete() + + >>> "intro" in bookmarks + False + >>> len(bookmarks) + 1 + +Deleting a bookmark is safe even when its start and end markers live in +different paragraphs: both are found by ``w:id`` and removed. diff --git a/docs/user/captions.rst b/docs/user/captions.rst new file mode 100644 index 000000000..6a3cff978 --- /dev/null +++ b/docs/user/captions.rst @@ -0,0 +1,130 @@ +.. _captions: + +Captions for figures, tables, and equations +=========================================== + +Word's **References > Insert Caption** menu produces a caption paragraph of +the form + +.. parsed-literal:: + + Figure 1: A diagram of the system + +where ``Figure`` is a label, ``1`` is an auto-number maintained by Word, and +``A diagram of the system`` is the caption text. The auto-number is driven +by a :ref:`SEQ ` field so that adding, deleting, or reordering +captions automatically renumbers them when the document is next opened. + +|docx| provides three entry points for authoring captions: + +- :meth:`.Document.add_caption` — append a caption at the end of the document. +- :meth:`.Paragraph.add_caption_before` — insert a caption directly above an + existing paragraph (typical for tables, where the caption sits above the + table). +- :meth:`.Paragraph.add_caption_after` — insert a caption directly below an + existing paragraph (typical for figures). 
+ + +Anatomy of a caption +-------------------- + +Every caption emitted by |docx| has the same structure:: + + + + Figure + + 1 + + : + A diagram of the system + + +The ``1`` inside the ``w:fldSimple`` is a *cached* field result: when Word +reopens the document it will replace that value with the correct +auto-number. For non-Word consumers (spell-checkers, text extractors) the +cached result gives a reasonable fallback. + + +Adding a caption at the end of the document +------------------------------------------- + +:: + + >>> from docx import Document + >>> document = Document() + >>> caption = document.add_caption("A diagram of the system", label="Figure") + >>> caption.style.name + 'Caption' + >>> caption.text + 'Figure 1: A diagram of the system' + +The returned |Paragraph| is the freshly-appended caption; modify its runs in +the usual way to add bold, italics, or other formatting. + + +Captioning tables and figures in place +-------------------------------------- + +Captions rarely belong at the end of the document. The more common pattern +is to add the figure or table first, then attach a caption immediately above +or below using the paragraph-level helpers:: + + >>> p = document.add_paragraph() + >>> p.add_run().add_picture("diagram.png") + >>> p.add_caption_after("A diagram of the system", label="Figure") + + >>> heading = document.add_paragraph() + >>> heading.add_caption_before("Quarterly results", label="Table") + >>> heading.add_run().add_table(...) # hypothetical + +Both helpers return the inserted caption paragraph so the caller can chain +additional mutations. + + +Label grouping +-------------- + +Each distinct `label` argument defines an independent numbering sequence. +Word maintains one counter per SEQ identifier; so the first ``Figure`` +caption is numbered ``1``, the first ``Table`` caption is also ``1``, and +the second ``Figure`` caption is ``2``. ``label`` is what controls the +counter, not the paragraph style. 
+ +Common labels are ``"Figure"``, ``"Table"``, and ``"Equation"``, but any +string Word will accept as a SEQ identifier is permitted — callers adding +localised captions can pass ``"Figura"``, ``"Таблица"``, or similar. + + +Customising the paragraph style +------------------------------- + +The `style` parameter defaults to ``"Caption"``, which is the built-in +Word style for captions. A different style can be supplied when a document +uses a custom caption style:: + + >>> document.add_caption( + ... "Performance metrics", + ... label="Figure", + ... style="FigureCaption", + ... ) + +The style must already be defined in the document; |docx| does not +synthesise it for you. + + +Limitations +----------- + +- |docx| does not implement a layout engine, so the cached ``1`` inside the + SEQ field is always emitted literally. Word will rewrite this to the + correct auto-number on open. +- There is no ``paragraph.caption`` read-side accessor; to enumerate + captions, filter paragraphs by their style name:: + + captions = [p for p in document.paragraphs if p.style.name == "Caption"] + +- Cross-references to a caption (``Figure 1`` as a clickable link elsewhere + in the document) require a bookmark and a REF field; neither is created + by :meth:`add_caption`. Use :meth:`Paragraph.add_bookmark` and the + :mod:`docx.fields` module to wire those up manually. diff --git a/docs/user/charts.rst b/docs/user/charts.rst new file mode 100644 index 000000000..8517e7778 --- /dev/null +++ b/docs/user/charts.rst @@ -0,0 +1,196 @@ +.. _charts: + +Working with Charts +=================== + +Word documents can embed *charts* authored in the drawing-markup chart format +(DrawingML ``c:chartSpace``). A chart might be a column chart summarizing +quarterly sales, a pie chart displaying market share, or any of the other +chart kinds Word supports. 
This page describes how to access charts already +present in a document and read their properties; building a new chart from +scratch is covered separately (see the forward reference at the end). + +**Chart anatomy.** Each chart in a document lives in its own *chart part* +(for example ``/word/charts/chart1.xml``) and is referenced from the document +body by a ``<c:chart>`` element nested inside a ``<w:drawing>``. +The drawing may be *inline* (flows with surrounding text) or *floating* +(anchored). Both are surfaced uniformly by |docx|. + +A single chart-part contains one ``c:chartSpace`` XML tree. Inside it are a +*title* (optional), a *plot area* containing one chart-kind element +(``c:barChart``, ``c:lineChart``, ``c:pieChart``, etc.) and one or more +*series* (``c:ser``), and an optional *legend*. Each series carries a name +(its label in the legend), a list of numeric *values*, and a list of +*categories* (x-axis labels for bar / line / column charts, slice labels for +pie charts). + +**Chart kinds.** |docx| exposes a subset of Word's chart-type enumeration in +:class:`docx.chart.WD_CHART_TYPE`. The read side recognizes ``BAR``, +``BAR_STACKED``, ``COLUMN``, ``COLUMN_STACKED``, ``LINE``, ``PIE``, +``DOUGHNUT``, ``SCATTER``, and ``AREA``. Charts authored with a chart-kind +outside this list still appear in :attr:`.Document.charts` but report a +|None| ``chart_type``. + +**Scope.** The current API is deliberately narrow: it lets you *enumerate* +charts, ask each chart what kind it is and what its title reads, and iterate +its series to pull out the raw numbers. It does not expose axis formatting, +data-label styling, per-point color, or other presentation details; those +live in the underlying XML and can be reached via ``chart.part.element`` if +you need them. + + +Detecting charts in a document +------------------------------ + +Every chart referenced from the document body is surfaced through the +:attr:`.Document.charts` property. 
The list is empty when the document +contains no charts, so you can test for the presence of charts with a simple +truthiness check:: + + >>> from docx import Document + >>> document = Document("quarterly-report.docx") + >>> if document.charts: + ... print("document contains %d chart(s)" % len(document.charts)) + document contains 3 chart(s) + +Both *inline* and *floating* chart references are discovered, and duplicate +references to the same chart part (rare, but legal) are de-duplicated so each +chart appears exactly once. Broken references — those whose target part is +missing or of the wrong content-type — are silently skipped rather than +raising. + + +The Document.charts collection +------------------------------ + +:attr:`.Document.charts` returns a plain Python ``list`` of |Chart| objects +in document order:: + + >>> charts = document.charts + >>> charts + [, + , + ] + >>> len(charts) + 3 + >>> charts[0] + + +Because it is a list, all the usual sequence operations are available — +indexing, slicing, iteration, and passing through ``len()``. Note that the +value is *recomputed* each time the property is accessed, so if you plan to +hit it repeatedly in a loop you should bind it to a local name. + + +Reading the chart type +---------------------- + +Every |Chart| exposes its chart kind via :attr:`.Chart.chart_type`, which +returns a :class:`docx.chart.WD_CHART_TYPE` member (or |None| when the +underlying chart kind is outside the enumerated subset):: + + >>> from docx.chart import WD_CHART_TYPE + >>> chart = document.charts[0] + >>> chart.chart_type + + >>> chart.chart_type is WD_CHART_TYPE.COLUMN + True + +The distinction between ``BAR`` and ``COLUMN`` (both authored with +``c:barChart`` in the XML) is decided by the ``c:barDir`` child: a *bar* +direction yields :class:`docx.chart.WD_CHART_TYPE`\ ``.BAR`` while *col* +yields :class:`docx.chart.WD_CHART_TYPE`\ ``.COLUMN``. The stacked variants +are likewise distinguished by the ``c:grouping`` value. 
+ + +Reading the chart title +----------------------- + +:attr:`.Chart.title` returns the concatenated text of the chart's title, +pulled from the ``c:title/c:tx/c:rich`` subtree. It returns |None| when no +title element is present:: + + >>> chart.title + 'Quarterly Sales' + >>> untitled_chart = document.charts[2] + >>> untitled_chart.title is None + True + +Because the title text is built by concatenating every ``a:t`` descendant in +document order, rich-text titles with emphasized runs (for example a bold +word followed by a regular word) still round-trip as their plain-text +equivalent. + + +Iterating chart series +---------------------- + +Each chart owns an ordered list of series. :attr:`.Chart.series` returns a +list of :class:`docx.chart.ChartSeries`, one per ``c:ser`` element in the +plot area:: + + >>> for series in chart.series: + ... print(series.name) + East + West + +A :class:`docx.chart.ChartSeries` exposes three read-only properties: + +- ``name`` — the series label (an empty string when no ``c:tx`` is set). +- ``values`` — a list of ``float`` read from the series' value cache. +- ``categories`` — the list of category labels (as strings) associated with + this series. For charts with a shared category axis every series reports + the same list; for scatter / pie charts the list may differ. + +Pulling the numeric data out of a chart is therefore a one-liner per series:: + + >>> east = chart.series[0] + >>> east.name + 'East' + >>> east.values + [10.0, 20.0, 15.0, 25.0] + >>> east.categories + ['Q1', 'Q2', 'Q3', 'Q4'] + +When the chart XML does not carry a numeric cache (for example because the +authoring tool wrote only ``c:ref`` formulas and no ``c:numCache``), the +``values`` and ``categories`` lists will be empty. This is expected for +charts whose data source is an external embedded spreadsheet that was +subsequently removed. 
+ +For convenience, :attr:`.Chart.categories` is a shortcut equivalent to +``chart.series[0].categories`` for the common case of a shared category +axis, returning an empty list when the chart has no series. + + +Presence of a legend +-------------------- + +:attr:`.Chart.has_legend` is a ``bool`` that reports whether the chart has a +``c:legend`` element. This is useful when mirroring an existing chart's +formatting into a new document:: + + >>> chart.has_legend + True + + +Creating a new chart (forward reference) +---------------------------------------- + +|Document| exposes a narrow creation API as :meth:`.Document.add_chart`, +which takes a :class:`docx.chart.WD_CHART_TYPE`, a list of category labels, +and a mapping of series names to value lists:: + + >>> from docx.chart import WD_CHART_TYPE + >>> chart = document.add_chart( + ... WD_CHART_TYPE.COLUMN, + ... categories=["Q1", "Q2", "Q3", "Q4"], + ... series_data={"East": [10, 20, 15, 25], "West": [12, 18, 14, 22]}, + ... ) + +Only a subset of chart kinds are supported on the create side — +``BAR``, ``BAR_STACKED``, ``COLUMN``, ``COLUMN_STACKED``, ``LINE``, and +``PIE`` — and the chart is always appended to the document body as an +inline drawing. A more complete chart-authoring API, including titles, +legends, axis configuration, and placement control, is being developed in +a subsequent phase and will have its own user-guide page. diff --git a/docs/user/comments.rst b/docs/user/comments.rst index 869d6f5f1..ff4148f6b 100644 --- a/docs/user/comments.rst +++ b/docs/user/comments.rst @@ -131,7 +131,7 @@ The comments collection supports random access to a comment by its id:: Adding rich content to a comment -------------------------------- -A comment is a _block-item container_, just like the document body or a table cell, so +A comment is a *block-item container*, just like the document body or a table cell, so it can contain any content that can appear in those places. 
It does not contain page-layout sections and cannot contain a comment reference, but it can contain multiple paragraphs and/or tables, and runs within paragraphs can have emphasis such as bold or diff --git a/docs/user/content-controls.rst b/docs/user/content-controls.rst new file mode 100644 index 000000000..0fb8add61 --- /dev/null +++ b/docs/user/content-controls.rst @@ -0,0 +1,237 @@ +.. _content_controls: + +Working with Content Controls +============================= + +A *content control*, known in the OOXML schema as a *structured document tag* (SDT), +is a region of a Word document whose *kind of content* is declared up-front. You can +think of it as a typed placeholder. Word uses content controls to build form-like +documents where users can fill in answers, pick items from a list, or tick a box +without being able to edit the surrounding template text. + +Each content control has a *type* (rich text, plain text, date, checkbox, combo box, +drop-down list, or picture), optional metadata (`tag`, `title`, and an integer `id`), +some *content* (runs or paragraphs held under a `w:sdtContent` element), and may +optionally be *data-bound* to an XML payload stored elsewhere in the package. + +.. note:: + + *python-docx* surfaces the structure and metadata of content controls, not Word's + interactive behaviors. For example, a combo box's list of choices is carried + through the XML untouched, but evaluating a data binding's XPath or enforcing + editability locks is out of scope. + + +Content-control anatomy +----------------------- + +In the XML, every content control looks like this:: + + + + + + + + + + + + + + +- An SDT is **block-level** when it is a direct child of ``w:body`` or a table + cell. Its `sdtContent` holds whole paragraphs. +- An SDT is **inline** when it is a child of a ``w:p``. Its `sdtContent` holds + runs. 
+- The *type* is determined by a marker element inside ``w:sdtPr`` (``w:text``, + ``w:date``, ``w:comboBox``, ``w:dropDownList``, ``w:picture``, + ``w14:checkbox``). A rich-text SDT has no marker. + + +Creating content controls +------------------------- + +Block-level content controls are added with :meth:`.Document.add_content_control`, +which appends a new `w:sdt` to the document body (just before any trailing section +properties):: + + >>> from docx import Document + >>> from docx.content_controls import ContentControlType + + >>> document = Document() + >>> cc = document.add_content_control( + ... ContentControlType.PLAIN_TEXT, + ... tag="customer", + ... title="Customer", + ... ) + >>> cc + + >>> cc.type + + >>> cc.tag, cc.title + ('customer', 'Customer') + >>> cc.sdt_id + 1872943104 + +Inline content controls are added via :meth:`.Paragraph.add_content_control`, which +appends the new `w:sdt` to that paragraph:: + + >>> paragraph = document.add_paragraph("Hello, ") + >>> inline = paragraph.add_content_control( + ... ContentControlType.RICH_TEXT, tag="greeting" + ... ) + >>> inline.text = "world" + +The full set of supported types is enumerated by :class:`.ContentControlType`: +``RICH_TEXT``, ``PLAIN_TEXT``, ``DATE``, ``CHECKBOX``, ``COMBO_BOX``, ``DROPDOWN``, +and ``PICTURE``. The rich-text flavor is the OOXML default and carries no explicit +marker element inside ``w:sdtPr``. + +.. warning:: + + The ``PICTURE`` type is surfaced for introspection only — *python-docx* does not + yet provide a high-level API for assigning an image to a picture SDT. + + +Reading and modifying a content control +--------------------------------------- + +The :class:`.ContentControl` proxy exposes the metadata and content through simple +Python attributes:: + + >>> cc.tag = "billing_customer" + >>> cc.title = "Billing customer" + >>> cc.type + + >>> cc.sdt_id # read-only + 1872943104 + >>> cc.text = "Acme Co." + >>> cc.text + 'Acme Co.' 
+ +Assigning to :attr:`.ContentControl.text` replaces the current `sdtContent` with a +single run (inline SDTs) or a single paragraph holding one run (block SDTs). To add +multiple paragraphs, runs with custom formatting, images, or tables, reach through +to the underlying XML via ``cc.element``. + +Checkbox content controls carry an extra :attr:`.ContentControl.checked` property:: + + >>> cbx = document.add_content_control(ContentControlType.CHECKBOX, tag="ok") + >>> cbx.checked = True + >>> cbx.checked + True + + +Placeholder text +---------------- + +Word displays placeholder text inside an empty content control. In the XML this is +represented by a ``w:sdtPr/w:showingPlcHdr`` flag referencing a glossary document +entry. *python-docx* does not yet expose a first-class API for placeholder entries; +if you need to set a placeholder, you can do so by writing the initial text directly +into the control's content:: + + >>> cc = document.add_content_control(ContentControlType.PLAIN_TEXT) + >>> cc.text = "Click here to enter customer name" + +When a user opens the document in Word and begins typing, the placeholder text will +be replaced with their input. This approach works for all supported types and is +the technique used by the behave fixtures that accompany this guide. + + +Iterating content controls +-------------------------- + +:attr:`.Document.content_controls` returns the block-level content controls found in +the main document story, in document order:: + + >>> document.content_controls + [, ...] + >>> for cc in document.content_controls: + ... print(cc.type, cc.tag, cc.title) + +Only top-level `w:sdt` elements that are direct children of ``w:body`` are surfaced +here. 
Inline content controls appear under :attr:`.Paragraph.content_controls` on +the enclosing paragraph, which likewise yields controls in document order:: + + >>> paragraph.content_controls + [] + +Content controls nested inside table cells, headers, footers, or other stories are +not part of these collections. Walk the underlying XML tree (``.xpath(".//w:sdt")`` +on the relevant element) to reach them if you need to. + + +Data binding +------------ + +A *data binding* ties the visible content of an SDT to an XPath expression over a +*custom XML data part* elsewhere in the package. Binding metadata is carried on the +control's ``w:sdtPr/w:dataBinding`` child:: + + >>> cc = document.add_content_control(ContentControlType.PLAIN_TEXT, tag="customer") + >>> binding = cc.set_data_binding( + ... xpath="/ns0:order[1]/ns0:customer[1]", + ... prefix_mappings="xmlns:ns0='http://example.com/orders'", + ... store_item_id="{11111111-2222-3333-4444-555555555555}", + ... ) + >>> binding + + >>> binding.xpath + '/ns0:order[1]/ns0:customer[1]' + >>> binding.prefix_mappings + "xmlns:ns0='http://example.com/orders'" + >>> binding.store_item_id + '{11111111-2222-3333-4444-555555555555}' + +Each attribute is read/write; reassigning :attr:`.DataBinding.xpath` or +:attr:`.DataBinding.store_item_id` updates the XML in place. A content control has +at most one data binding. Reading :attr:`.ContentControl.data_binding` on an unbound +control returns |None|. Use :meth:`.ContentControl.remove_data_binding` to clear it:: + + >>> cc.remove_data_binding() + >>> cc.data_binding is None + True + +.. note:: + + *python-docx* does **not** evaluate the binding — it stores the XPath verbatim + and leaves resolution to Word. If you need the bound value, fetch the + corresponding :class:`.CustomXmlPart` and run the XPath yourself. + + +Custom XML data parts +--------------------- + +Data-bound content controls reference an XML payload stored in a sibling package +part: the *custom XML data part*. 
A typical document has one or more of these at +``/customXml/item{N}.xml``, each with a companion ``/customXml/itemProps{N}.xml`` +part that declares a ``{GUID}``-formatted *store-item id* (and optional schema +references). + +Those parts are surfaced read-only as :class:`.CustomXmlPart` proxies through +:attr:`.Document.custom_xml_parts`:: + + >>> for part in document.custom_xml_parts: + ... print(part.partname, part.item_id, part.schema_refs) + /customXml/item1.xml {EF278816-EC6F-A645-907D-7F25AECB1D4A} ['http://schemas.openxmlformats.org/officeDocument/2006/bibliography'] + /customXml/item2.xml {11111111-2222-3333-4444-555555555555} ['http://example.com/orders'] + +To resolve a binding to its backing part, match +:attr:`.DataBinding.store_item_id` to :attr:`.CustomXmlPart.item_id`:: + + >>> target_id = cc.data_binding.store_item_id + >>> part = next(p for p in document.custom_xml_parts if p.item_id == target_id) + >>> part.root_element.tag + '{http://example.com/orders}order' + +Each proxy exposes :attr:`~docx.custom_xml.CustomXmlPart.blob` (raw bytes), +:attr:`~docx.custom_xml.CustomXmlPart.root_element` (parsed lxml element or |None| +on parse failure), :attr:`~docx.custom_xml.CustomXmlPart.item_id`, and +:attr:`~docx.custom_xml.CustomXmlPart.schema_refs`. The collection is read-only — +authoring new custom XML data parts is outside the scope of the current release and +requires working directly with the underlying OPC package. + +See :ref:`content_controls_api` and :ref:`custom_xml_api` for the full API +reference. diff --git a/docs/user/custom-properties.rst b/docs/user/custom-properties.rst new file mode 100644 index 000000000..de9aa0e9d --- /dev/null +++ b/docs/user/custom-properties.rst @@ -0,0 +1,138 @@ +.. 
_custom_properties: + +Custom document properties +========================== + +Beyond the fixed Dublin-Core "core" properties exposed by +:attr:`.Document.core_properties` (title, author, subject, ...), Word also +supports user-defined, typed **custom properties** stored in the +``docProps/custom.xml`` part. Examples of custom properties a document might +carry: a project code, a document revision number, a workflow status, a +client identifier, a budget figure. + +|docx| surfaces these through :attr:`.Document.custom_properties`, which +returns a |CustomProperties| collection behaving like a Python ``dict``: +membership testing, indexed access, iteration, deletion, and a handful of +convenience methods. + +A lazily-created ``custom.xml`` part is added to the document the first time +:attr:`custom_properties` is accessed, so callers never need to check whether +one already exists. + + +Supported value types +--------------------- + +Each custom property has a single, statically-typed value. Five Python types +are supported; each maps to a VT (Variant Type) element defined by the Office +``customProperties`` schema: + +============================ =================== ============================================== +Python type OOXML serialisation Notes +============================ =================== ============================================== +``str`` ``vt:lpwstr`` Any length; Unicode. +``int`` ``vt:i4`` 32-bit signed integer. +``float`` ``vt:r8`` IEEE-754 double. +``bool`` ``vt:bool`` Stored as ``"true"``/``"false"``. +``datetime.datetime`` ``vt:filetime`` Serialised as ISO-8601 with a ``Z`` suffix. +============================ =================== ============================================== + +Assigning any other type (``list``, ``dict``, ``bytes``, custom objects, ...) +raises :class:`TypeError`. 
+ + +Reading custom properties +------------------------- + +:: + + >>> from docx import Document + >>> document = Document("contract.docx") + >>> document.custom_properties["Project"] + 'Apollo' + >>> document.custom_properties["Budget"] + 99.95 + >>> len(document.custom_properties) + 5 + +The collection also supports membership testing, iteration, and dict-style +``get()``:: + + >>> "Project" in document.custom_properties + True + >>> "Unknown" in document.custom_properties + False + >>> document.custom_properties.get("Unknown") # returns None + >>> document.custom_properties.get("Unknown", "-") # returns '-' + '-' + >>> list(document.custom_properties) # iteration yields names + ['Project', 'Priority', 'Budget', 'Approved', 'Reviewed'] + +Use :meth:`.CustomProperties.names` to obtain a concrete list of property +names, or :meth:`.CustomProperties.items` to get ``(name, value)`` pairs — +both preserve document order:: + + >>> document.custom_properties.names() + ['Project', 'Priority', 'Budget', 'Approved', 'Reviewed'] + >>> document.custom_properties.items() + [('Project', 'Apollo'), ('Priority', 5), ('Budget', 99.95), ...] + + +Adding and updating properties +------------------------------ + +Subscript assignment is the primary authoring API. If the named property does +not yet exist, it is created; if it does, its value (and serialised type) is +replaced:: + + >>> document.custom_properties["Project"] = "Gemini" + >>> document.custom_properties["Priority"] = 9 + +|CustomProperties| also offers :meth:`.CustomProperties.add`, which raises +:class:`ValueError` if the name is already in use — useful when the caller +wants to refuse accidental overwrites:: + + >>> document.custom_properties.add("Owner", "alice@example.com") + >>> document.custom_properties.add("Owner", "bob@example.com") + Traceback (most recent call last): + ... 
+ ValueError: a custom property named 'Owner' already exists + + +Deleting properties +------------------- + +Use the ``del`` statement with the property name:: + + >>> del document.custom_properties["Priority"] + >>> "Priority" in document.custom_properties + False + +Deleting a property that doesn't exist raises :class:`KeyError`, mirroring +standard Python dict semantics. + + +Preserving order +---------------- + +Custom properties are stored in document order, not by any particular sort +key. Iteration, :meth:`.CustomProperties.names`, and +:meth:`.CustomProperties.items` all walk the underlying +``custom.xml/property`` children in their XML order. Adding a new property +appends it at the end; overwriting an existing property keeps its current +position. + + +Interoperability notes +---------------------- + +Word's UI exposes custom properties via **File > Info > Properties > Advanced +Properties > Custom**. Properties authored through |docx| show up in that +dialog with their declared types intact, and can be referenced from Word +fields (for example ``DOCPROPERTY "Project"``) or from other Office +applications reading the same document. + +Some third-party readers ignore custom-property types and treat every value +as text. If your downstream tooling depends on the serialised type being +preserved, round-trip your document through the target reader once as part +of your test suite to confirm behaviour. diff --git a/docs/user/document-safety.rst b/docs/user/document-safety.rst new file mode 100644 index 000000000..6e2e6978e --- /dev/null +++ b/docs/user/document-safety.rst @@ -0,0 +1,174 @@ +.. 
_document_safety: + +Document safety: corruption, encryption, macros, signatures +=========================================================== + +Beyond "happy-path" reads, |docx| provides a handful of APIs that surface +the *safety* attributes of a document: whether its XML parts survived +parsing, whether it is password-encrypted, whether it carries VBA macros, +and whether it bears a digital signature. These matter when a tool has to +decide whether to load, process, forward, or reject a document it received +from somewhere else. + +The core package does not execute VBA or cryptographically verify +signatures — it only inspects what the package contains. Reading *or +writing* password-protected files is supported via the optional +``python-ooxml-crypto`` dependency (see :ref:`encrypted-documents` below); +without that extra installed, :class:`.EncryptedDocumentError` is raised +when encryption is detected. + + +Recover mode for malformed documents +------------------------------------ + +When a ``.docx`` has been truncated, had an editor partially rewrite its +XML, or otherwise lost well-formedness, the default +:func:`docx.Document` loader raises :class:`lxml.etree.XMLSyntaxError`. +Passing ``recover=True`` switches lxml into its recovering parser, which +salvages whatever is well-formed and records the parse errors on +:attr:`.Document.recovery_warnings`:: + + >>> from docx import Document + >>> from lxml import etree + + >>> try: + ... document = Document("corrupt.docx") + ... except etree.XMLSyntaxError as e: + ... print(f"default open failed: {e}") + + >>> document = Document("corrupt.docx", recover=True) + >>> len(document.recovery_warnings) + 1 + >>> document.recovery_warnings[0] + ':10:24:FATAL:PARSER:ERR_TAG_NOT_FINISHED: ...' + +The readable prefix of the document is available through the normal API. 
+Content after the corruption boundary is dropped; in extreme cases where +lxml cannot recover *any* elements from a part, |docx| substitutes an +empty stub for that part so the rest of the package still loads. + +Recover mode never masks unrelated failures. If the physical package is +not a zip file, :class:`docx.opc.exceptions.PackageNotFoundError` still +propagates; if the file is an encrypted OLE compound file, +:class:`docx.exceptions.EncryptedDocumentError` still propagates. The +``recover=True`` flag only relaxes XML parsing. + + +.. _encrypted-documents: + +Password-encrypted documents +---------------------------- + +Word stores password-protected documents as OLE compound files (CFBF), not +as regular ZIP packages. The ZIP-based OPC reader cannot process them; the +naive error would be a confusing ``BadZipFile`` from the standard library. + +|docx| short-circuits that by peeking at the first eight bytes of the file +and — when no ``password=`` is supplied — raising +:class:`.EncryptedDocumentError` if they match the OLE signature +``D0 CF 11 E0 A1 B1 1A E1``:: + + >>> from docx import Document + >>> from docx.exceptions import EncryptedDocumentError + >>> try: + ... Document("secret.docx") + ... except EncryptedDocumentError as e: + ... print(e) + Document is password-protected (encrypted .docx detected). Pass + `password=...` to `Document(...)` to decrypt it, or install the + optional 'python-ooxml-crypto' package + (https://github.com/loadfix/python-ooxml-crypto). + +Recover mode does **not** bypass this check — the file is not just +malformed XML, it is an entirely different format. 
+ +Decrypting on open and encrypting on save +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Install the optional ``python-ooxml-crypto`` dependency +(``pip install 'python-docx[encryption]'``) and pass ``password=`` through +the public API; |docx| delegates AES key derivation and CFBF parsing to +the dependency:: + + from docx import Document + + # decrypt an existing protected file + document = Document("secret.docx", password="s3cret") + + # encrypt on save (ECMA-376 Agile Encryption — the scheme Word writes) + document.add_paragraph("confidential") + document.save("protected.docx", password="s3cret") + +Supplying the wrong password raises :class:`.EncryptedDocumentError` with +a ``"password does not match"`` message. Azure RMS / AIP / IRM-wrapped +files (whose payload is keyed to the user's Microsoft 365 identity rather +than a password) raise :class:`.RmsProtectedDocumentError` — a subclass +of :class:`.EncryptedDocumentError` — because ``python-ooxml-crypto`` +cannot decrypt them; those files need Microsoft Office automation or the +Microsoft Information Protection SDK as a preprocessing step. + +When the optional extra is not installed, the call still raises a +helpful :class:`.EncryptedDocumentError` pointing at the install +instructions — so code paths stay callable without the extra on hand. + + +Macro-enabled documents (.docm) +------------------------------- + +``.docm`` documents are OOXML packages whose main document part uses the +macro-enabled content type +(``application/vnd.ms-word.document.macroEnabled.main+xml``) and carry a +``vbaProject`` relationship pointing at a binary ``vbaProject.bin`` part. + +|docx| loads them seamlessly — no special flag is required — and surfaces +the VBA relationship through :attr:`.Document.has_macros`:: + + >>> document = Document("form.docm") + >>> document.has_macros + True + >>> Document("plain.docx").has_macros + False + +|docx| does not read or author VBA. 
The ``vbaProject.bin`` part is left +untouched on save; if you inspect or swap VBA code, use a dedicated tool +and then pass the resulting bytes back to |docx|. + +.. note:: + + VBA projects are an execution vector. Treat a positive + :attr:`has_macros` result as a security signal unless the document + came from a trusted source. + + +Digital signatures +------------------ + +A signed OOXML package includes: + +- A package-level relationship of type + ``.../digital-signature/origin`` targeting + ``/_xmlsignatures/origin.sigs``; +- One or more ``digital-signature/signature`` relationships from the origin + part, each targeting a ``/_xmlsignatures/sigN.xml`` part holding an + XML-DSig document (optionally with XAdES extensions carrying the signing + time and signer identity). + +|docx| surfaces both the presence and the minimal metadata:: + + >>> document = Document("contract.docx") + >>> document.is_signed + True + >>> for sig in document.signatures: + ... print(sig.partname, sig.signer, sig.signed_at) + /_xmlsignatures/sig1.xml CN=Alice Example 2024-04-01 12:34:56+00:00 + +Each :class:`.SignatureInfo` exposes :attr:`partname`, :attr:`blob` +(the raw XML bytes), :attr:`signer` (the ``X509SubjectName``), and +:attr:`signed_at` (the XAdES ``SigningTime``, or |None| when absent). The +full signature XML is available through :attr:`blob` for callers that want +to perform their own cryptographic verification. + +|docx| does not verify signatures — signature validation is a +cryptographic operation outside |docx|'s scope. Consumers that rely on +signed documents should pass the :attr:`blob` to a library such as +``signxml`` and check the result before proceeding. diff --git a/docs/user/documents.rst b/docs/user/documents.rst index ecdfefab1..8f860d146 100644 --- a/docs/user/documents.rst +++ b/docs/user/documents.rst @@ -68,7 +68,7 @@ Opening a 'file-like' document save to a file-like object. 
This can be handy when you want to get the source or target document over a network connection or from a database and don't want to (or aren't allowed to) interact with the file system. In practice this means -you can pass an open file or StringIO/BytesIO stream object to open or save +you can pass an open file or BytesIO stream object to open or save a document like so:: f = open('foobar.docx', 'rb') @@ -78,11 +78,11 @@ a document like so:: # or with open('foobar.docx', 'rb') as f: - source_stream = StringIO(f.read()) + source_stream = BytesIO(f.read()) document = Document(source_stream) source_stream.close() ... - target_stream = StringIO() + target_stream = BytesIO() document.save(target_stream) The ``'rb'`` file open mode parameter isn't required on all operating diff --git a/docs/user/drawing.rst b/docs/user/drawing.rst new file mode 100644 index 000000000..a53e0ee3d --- /dev/null +++ b/docs/user/drawing.rst @@ -0,0 +1,351 @@ +.. _drawing: + +Working with DrawingML shapes +============================= + +Word documents carry graphical content on a separate *drawing layer* alongside +the text layer. In addition to pictures, the drawing layer may host +*preset-geometry shapes* (rectangles, arrows, callouts, and similar), *group +shapes* that bundle multiple shapes together, *text frames* embedded in +shapes, *ink annotations* authored with a stylus, and *embedded OLE objects* +such as Excel workbooks or PDF files. For a conceptual introduction to the +two layers see :doc:`shapes`; this page is the fork-era companion and +documents the |docx| APIs for each of these drawing-layer features. + +The drawing layer is expressed in OOXML as ``<w:drawing>`` elements nested +inside run elements. A single ``w:drawing`` may wrap an *inline* object +(``wp:inline`` — flows as a character glyph) or an *anchored / floating* +object (``wp:anchor`` — placed at arbitrary coordinates).
The nested +``a:graphicData`` element carries one of ``pic:pic`` (a picture), +``c:chart`` (a chart reference), ``dgm:*`` (a SmartArt diagram), +``wpg:grpSp`` (a group of shapes), or ``wps:wsp`` (a DrawingML shape, +optionally with a ``wps:txbx`` text frame). + +The :class:`docx.drawing.Drawing` proxy is the uniform entry point: +:attr:`.Drawing.type` returns a :class:`.WD_DRAWING_TYPE` member +(``PICTURE``, ``CHART``, ``DIAGRAM``, ``GROUP``, ``TEXT_BOX``, or +``SHAPE``), and the remaining methods on the proxy give you access to the +content-specific API. + + +Floating (anchored) images +-------------------------- + +A *floating image* is a picture wrapped in ``wp:anchor`` rather than +``wp:inline``. Unlike an inline picture — which behaves like a large glyph +— a floating image is positioned relative to a page, margin, column, or +paragraph, and surrounding text wraps around it according to a configurable +wrap style. + +|docx| adds floating images via :meth:`.Paragraph.add_floating_image` and +exposes existing anchors via :attr:`.Paragraph.floating_images`. The proxy +type is :class:`docx.shape.FloatingImage`:: + + >>> from docx import Document + >>> from docx.enum.shape import WD_ANCHOR_H, WD_ANCHOR_V, WD_WRAP_TYPE + >>> from docx.shared import Inches + + >>> document = Document() + >>> paragraph = document.add_paragraph("Text surrounding the picture.") + >>> floating = paragraph.add_floating_image( + ... "logo.png", + ... width=Inches(1.5), + ... position={ + ... "h_anchor": WD_ANCHOR_H.PAGE, + ... "v_anchor": WD_ANCHOR_V.PAGE, + ... "horizontal": Inches(2), + ... "vertical": Inches(3), + ... "wrap": WD_WRAP_TYPE.SQUARE, + ... }, + ... ) + >>> floating.horizontal_anchor, floating.vertical_anchor + (, ) + >>> floating.horizontal_offset, floating.vertical_offset + (1828800, 2743200) + +The ``position`` dict is optional; when omitted the image is anchored at +``COLUMN``/``PARAGRAPH`` with :class:`.WD_WRAP_TYPE.SQUARE` and zero +offsets. 
Only the keys you supply are overridden; unspecified keys fall back +to the default. The :attr:`.FloatingImage.position` property returns a dict +in the same shape that you can feed back into a subsequent call on a +different anchor. + +Each horizontal/vertical anchor choice corresponds to a +``wp:positionH/@relativeFrom`` or ``wp:positionV/@relativeFrom`` token from +the OOXML grammar. The ``wrap`` entry maps onto ``wp:wrapSquare``, +``wp:wrapTight``, ``wp:wrapThrough``, ``wp:wrapTopAndBottom``, or +``wp:wrapNone`` (``BEHIND``/``IN_FRONT`` are both ``wp:wrapNone`` with +different ``behindDoc`` attributes). + +Floating images are enumerated per paragraph:: + + >>> for paragraph in document.paragraphs: + ... for fi in paragraph.floating_images: + ... print(fi.wrap_type, fi.offset) + + +Preset-geometry shapes +---------------------- + +|docx| can add DrawingML preset shapes — the kind of geometric primitives +you reach via ``Insert > Shapes`` in Word — inline to a paragraph:: + + >>> from docx.enum.shape import WD_SHAPE + >>> from docx.shared import Inches + + >>> paragraph = document.add_paragraph() + >>> shape = paragraph.add_shape( + ... WD_SHAPE.ROUNDED_RECTANGLE, + ... width=Inches(2), + ... height=Inches(1), + ... text="Click me", + ... ) + >>> shape.shape_type + + >>> shape.text + 'Click me' + +:meth:`.Paragraph.add_shape` returns a :class:`docx.drawing.WordprocessingShape` +proxy wrapping the newly-created ``wps:wsp`` element. The proxy exposes +:attr:`.WordprocessingShape.name`, :attr:`.WordprocessingShape.shape_type` +(a member of :class:`.WD_SHAPE`), and a read/write +:attr:`.WordprocessingShape.text` property. + +The implemented :class:`.WD_SHAPE` members cover rectangles +(``RECTANGLE`` / ``ROUNDED_RECTANGLE``), ovals (``OVAL``), arrows +(``ARROW_RIGHT``), and a rounded-rectangle callout +(``CALLOUT_ROUNDED_RECTANGLE``). 
Shapes authored in Word with other preset +geometries round-trip correctly: a read via +:attr:`.Paragraph.drawings` reports them as :class:`.WD_DRAWING_TYPE.SHAPE`, +and :attr:`.WordprocessingShape.shape_type` returns |None| when the preset +token does not correspond to a known enum member. + +``add_shape`` validates its ``shape_type`` argument:: + + >>> paragraph.add_shape("rect") + Traceback (most recent call last): + ... + TypeError: shape_type must be a WD_SHAPE member, got 'rect' + + +Group shapes +------------ + +Word can combine several shapes into a *group* — a single unit that can be +selected, moved, or resized as a whole. The underlying element is +``wpg:grpSp`` and groups may nest arbitrarily. |docx| models groups read-only +through :class:`docx.drawing.GroupShape`:: + + >>> from docx.drawing import GroupShape, WordprocessingShape + + >>> for paragraph in document.paragraphs: + ... for drawing in paragraph.drawings: + ... if drawing.is_group: + ... group = drawing.group_shape + ... print(group.name) + ... for child in group.shapes: + ... print(" ", type(child).__name__) + +Each child returned by :attr:`.GroupShape.shapes` is a +:class:`.WordprocessingShape`, a :class:`docx.drawing.Picture`, or a nested +:class:`.GroupShape`. Unsupported child element types (for example +``wpg:graphicFrame``) are omitted from the list so calling code can assume +every entry is one of the three proxy classes. + +Use :attr:`.Drawing.group_shapes` to access every top-level group on a +drawing; :attr:`.Drawing.group_shape` returns just the first (which is what +Word writes for a single selection) or |None| if the drawing is not a group. + + +Text box (shape text-frame) content +----------------------------------- + +A ``wps:wsp`` shape may carry a text frame (``wps:txbx/w:txbxContent``) — +this is what Word exposes as "Edit Text" on a shape. When a shape contains +text its :attr:`.Drawing.type` reports :class:`.WD_DRAWING_TYPE.TEXT_BOX` +instead of ``SHAPE``. 
+ +Two access paths are available. :attr:`.Drawing.text` returns a single +concatenated string (multiple paragraphs are separated by ``\n``), and +:attr:`.Drawing.paragraphs` returns the |Paragraph| objects inside the text +frame so the full run-level API is available:: + + >>> drawing = paragraph.drawings[0] + >>> drawing.type + + >>> drawing.text + 'First line\nSecond line\nThird line' + >>> [p.text for p in drawing.paragraphs] + ['First line', 'Second line', 'Third line'] + +:meth:`.Paragraph.add_shape` accepts an optional ``text`` argument; when +supplied, a minimal text frame containing that string is attached to the new +``wps:wsp``. :attr:`.WordprocessingShape.text` is read/write — assigning a +string replaces the existing text-frame content:: + + >>> shape = paragraph.add_shape(WD_SHAPE.RECTANGLE, text="Initial") + >>> shape.text + 'Initial' + >>> shape.text = "Replaced" + >>> shape.text + 'Replaced' + + +Ink annotations +--------------- + +Word on touch-enabled devices can record stylus-drawn *ink annotations*. +They are stored as separate ``word/ink/ink*.xml`` parts in the +`InkML <https://www.w3.org/TR/InkML/>`_ format and referenced from the +document body by a ``<w:contentPart>`` element inside a run. + +|docx| exposes ink annotations read-only via +:attr:`.Document.ink_annotations` and :attr:`.Paragraph.ink_annotations`. +The proxy type is :class:`docx.ink.InkAnnotation`:: + + >>> for annotation in document.ink_annotations: + ... print(annotation.partname, annotation.stroke_count) + /word/ink/ink1.xml 2 + /word/ink/ink2.xml 1 + +:attr:`.InkAnnotation.blob` returns the raw InkML XML bytes so you can pass +them to a downstream parser or renderer. :attr:`.InkAnnotation.stroke_count` +reports the number of ``inkml:trace`` elements in the part — this is a +structural count, not a glyph count; it counts both direct children of +``inkml:ink`` and traces nested inside ``inkml:traceGroup``.
+ +``w:contentPart`` references whose relationship target is missing from the +package, or whose target part is of the wrong type, are silently skipped +rather than raising an error. This keeps repair-mode loads of damaged documents from +crashing when a stray ink reference was left behind after a part was +dropped. python-docx does not support *creating* or *modifying* ink +annotations; the API is deliberately read-only. + + +Embedded OLE objects +-------------------- + +Word supports embedding OLE objects — Excel workbooks, PDF documents, +mathematical equations, and so on — directly into a document. Each object is +stored as a separate part (usually under ``word/embeddings/``) whose content +type is ``application/vnd.openxmlformats-officedocument.oleObject``. The +reference comes from an ``<o:OLEObject>`` element inside a ``<w:object>`` +element inside a run. + +|docx| exposes embedded objects read-only via +:attr:`.Document.embedded_objects` and +:attr:`.Paragraph.embedded_objects`. The proxy type is +:class:`docx.embedded_objects.EmbeddedObject`:: + + >>> for obj in document.embedded_objects: + ... print(obj.prog_id, obj.type, len(obj.blob)) + Excel.Sheet.12 Embed 16 + +:attr:`.EmbeddedObject.prog_id` is the ProgID token identifying the object's +type (``Excel.Sheet.12``, ``AcroExch.Document``, ``Equation.DSMT4``, etc.). +:attr:`.EmbeddedObject.type` is either ``"Embed"`` (the binary lives in the +package) or ``"Link"`` (the binary lives at a file-system or URL target). +:attr:`.EmbeddedObject.blob` returns the raw OLE bytes. + +A reference whose relationship id cannot be resolved — for example because +the target part was dropped or is of the wrong type — still produces an +:class:`.EmbeddedObject`, but its :attr:`.EmbeddedObject.blob` returns +``b""`` and its :attr:`.EmbeddedObject.embedded_partname` returns |None|. +Callers that care can filter on ``if obj.blob:``. Creation and modification +are intentionally not supported.
+ + +Accessibility: alt text and titles +---------------------------------- + +Every inline picture, floating picture, preset shape, and group in a Word +document has an accessibility-facing *description* (alt text) and an +optional *title*. These map onto the ``@descr`` and ``@title`` attributes of +the ``wp:docPr`` element inside the ``wp:inline`` or ``wp:anchor``. For +assistive technologies the description is read in place of the image when +the text layer is dictated aloud. + +|docx| exposes both attributes as read/write properties on +:class:`.InlineShape` and :class:`.FloatingImage`:: + + >>> shape = document.inline_shapes[0] + >>> shape.alt_text = "A pencil-drawing of a mountain peak" + >>> shape.title = "Mountain peak" + >>> shape.alt_text + 'A pencil-drawing of a mountain peak' + +Either attribute can be assigned |None| to clear it. When the underlying +XML attribute is absent the getter returns |None|; for floating images +whose ``wp:docPr`` element itself is absent the getter still returns +|None| and the setter creates the element on demand. + +Setting alt text is the single most effective accessibility fix available +for a document containing graphical content; aim to populate +``alt_text`` on every decorative or informational picture in a document you +generate. + + +SVG pictures +------------ + +|docx| accepts SVG (Scalable Vector Graphics) files as input to +:meth:`.Run.add_picture`. Word renders SVG natively in recent versions; for +compatibility with older consumers the library also stores a small PNG +*fallback* so the image is visible even when the reader does not understand +SVG. + +Both are referenced from the same ``pic:pic`` element: the fallback PNG is +the primary ``a:blip/@r:embed`` and the SVG is attached via an +``asvg:svgBlip`` extension element. Round-tripping an SVG preserves the +original bytes — the fallback is generated once at write time. 
+ +:: + + >>> paragraph = document.add_paragraph() + >>> run = paragraph.add_run() + >>> run.add_picture("diagram.svg", width=Inches(3)) + +SVG pixel dimensions are inferred from the ``width``/``height`` or +``viewBox`` attributes on the root ``<svg>`` element. Unitless values are +treated as CSS pixels; ``in`` / ``cm`` / ``mm`` / ``pt`` units are converted +to pixels at 96 DPI (the CSS reference density). An SVG whose dimensions +cannot be parsed falls back to the SVG spec default of 300 x 150. + +Floating placement (``add_floating_image``) does not implement the PNG +fallback path and treats SVG like any other image — it relies on the +consumer to render SVG directly. If you need SVG on the drawing layer with +wide-compatibility fallback, add it inline. + + +Iterating drawings generically +------------------------------ + +:attr:`.Paragraph.drawings` returns a :class:`.Drawing` proxy for every +``w:drawing`` descendant of the paragraph, regardless of what the drawing +wraps. Use :attr:`.Drawing.type` to branch on the content kind:: + + >>> from docx.enum.shape import WD_DRAWING_TYPE + + >>> for paragraph in document.paragraphs: + ... for drawing in paragraph.drawings: + ... if drawing.type is WD_DRAWING_TYPE.PICTURE: + ... image = drawing.image + ... ... + ... elif drawing.type is WD_DRAWING_TYPE.CHART: + ... chart = drawing.chart + ... ... + ... elif drawing.type is WD_DRAWING_TYPE.GROUP: + ... group = drawing.group_shape + ... ... + ... elif drawing.type is WD_DRAWING_TYPE.TEXT_BOX: + ... print(drawing.text) + ... elif drawing.type is WD_DRAWING_TYPE.SHAPE: + ... # bare wps:wsp with no text frame + ... ...
+ +The dedicated collections — :attr:`.Document.inline_shapes`, +:attr:`.Document.charts`, :attr:`.Document.ink_annotations`, +:attr:`.Document.embedded_objects` — are the right tool for single-kind +surveys; :attr:`.Paragraph.drawings` is the right tool when you need +position-aware (paragraph-scoped) enumeration or when you want to handle +every drawing kind in one pass. diff --git a/docs/user/endnotes.rst b/docs/user/endnotes.rst new file mode 100644 index 000000000..d1880f08f --- /dev/null +++ b/docs/user/endnotes.rst @@ -0,0 +1,159 @@ +.. _endnotes: + +Working with Endnotes +===================== + +Word allows *endnotes* to be added to a document. An endnote is a piece of +reference material whose body appears at the end of the document (or end of a +section) while a numbered marker is inserted into the running text where the +citation occurs. Endnotes are generally used for citations or extended +remarks that would otherwise distract from the flow of the main document. + +The procedure is simple: + +- You place the cursor at the spot where you want the endnote reference mark + to appear. +- You press the *Insert Endnote* button on the References toolbar. +- You type or paste in the endnote text, which is stored in a separate + *endnotes part* at the bottom of the document. + +**Endnote Anatomy.** Each endnote has two parts, the *endnote-reference* and +the *endnote-content*: + +The **endnote-reference**, sometimes *endnote-anchor*, is the small +superscript mark placed into the main document where the endnote was +inserted. It is a single ``<w:endnoteReference>`` element carrying the *id* +of the endnote it anchors, wrapped in a run styled with the +"EndnoteReference" character style. + +The **endnote-content**, sometimes just *endnote*, is whatever content was +typed or pasted in. The content for each endnote lives in a separate endnote +object, and these endnote objects are stored in a separate *endnotes part* +(part-name ``word/endnotes.xml``), not in the main document.
Each endnote is +assigned a unique id when it is created, allowing the endnote reference to +be associated with its content and vice versa. + +**Reserved Ids.** Endnote ids 0 and 1 are reserved — they identify the +*separator* and *continuation-separator* markers that Word uses to draw the +horizontal rule between the document body and the endnotes area. User +endnotes added by *python-docx* receive ids starting at 2. These reserved +entries are filtered out of iteration and are not counted by ``len()``. + +**Endnote Content.** Although most endnotes contain a single paragraph of +plain text, an endnote is a *block-item container* — it can contain multiple +paragraphs and tables, and runs within paragraphs can carry character +emphasis such as bold or italic, embedded hyperlinks, and images. + +**Endnote Properties.** Document-level endnote numbering is controlled by a +``<w:endnotePr>`` element which lives inside the settings part. Through the +|EndnoteProperties| proxy you can configure: + +- *number_format* — the numeral style used for endnote marks (Arabic, + Roman, lowercase letters, Chicago-style marks, etc.) +- *start_number* — the first number used for automatic numbering. +- *restart_rule* — when numbering resets (continuous or at each section). +- *position* — where the endnote body appears (end of document or end of + section). + +**Applicability.** Endnotes can only be added to the main document body. +An endnote cannot be added to a header, a footer, a footnote, a comment, or +nested inside another endnote. In general the *python-docx* API will not +allow these operations, but if you outsmart it the resulting endnote will +either be silently removed or trigger a repair error when the document is +loaded by Word.
+ + +Adding an endnote +----------------- + +A simple example is adding an endnote anchored to a run:: + + >>> from docx import Document + >>> document = Document() + >>> paragraph = document.add_paragraph("Hello, world.") + >>> run = paragraph.runs[-1] + + >>> endnote = document.endnotes.add(run, text="See the appendix for details.") + >>> endnote + + >>> endnote.endnote_id + 2 + >>> endnote.text + 'See the appendix for details.' + +The :meth:`.Endnotes.add` call inserts a ``<w:endnoteReference>`` into the +supplied run, styled with the "EndnoteReference" character style, and +creates a new ``<w:endnote>`` element in the endnotes part whose first +paragraph carries the ``EndnoteText`` paragraph style. If ``text`` is +provided, it is added as a run in that paragraph following the auto-number +mark. + + +Accessing and iterating the Endnotes collection +----------------------------------------------- + +The endnotes collection is accessed via the :attr:`.Document.endnotes` +property:: + + >>> endnotes = document.endnotes + >>> endnotes + + >>> len(endnotes) + 1 + +The |Endnotes| object is iterable over user endnotes; the reserved +separator entries are skipped:: + + >>> for endnote in document.endnotes: + ... print(endnote.endnote_id, endnote.text) + 2 See the appendix for details. + + +Adding rich content to an endnote +--------------------------------- + +An endnote is a *block-item container*, just like the document body or a +table cell, so it can contain any content those places can. The methods for +adding this content are the same as those used for the document and table +cells:: + + >>> endnote = document.endnotes.add(run, text="") + >>> endnote.add_paragraph("A longer citation follows.") + >>> end_para = endnote.paragraphs[0] + >>> end_para.add_run(" See p. 42.").italic = True + + +Deleting an endnote +------------------- + +To remove an endnote from the document, call :meth:`.Endnote.delete`.
This +removes both the ``<w:endnote>`` element from the endnotes part and the +``<w:endnoteReference>`` run from the main document body:: + + >>> endnote = document.endnotes.add(paragraph.runs[-1], text="Temporary note.") + >>> endnote.delete() + +After calling :meth:`.Endnote.delete` the |Endnote| object is *defunct* and +should not be used further. + + +Configuring endnote numbering and position +------------------------------------------ + +Document-level endnote properties are accessed via +:attr:`.Document.endnote_properties`. The property returns |None| when no +``w:endnotePr`` element exists in the document settings; use +:meth:`.Document.add_endnote_properties` to add one and configure it:: + + >>> from docx.enum.text import ( + ... WD_ENDNOTE_POSITION, WD_FOOTNOTE_RESTART, WD_NUMBER_FORMAT, + ... ) + >>> props = document.add_endnote_properties() + >>> props.number_format = WD_NUMBER_FORMAT.LOWER_ROMAN + >>> props.restart_rule = WD_FOOTNOTE_RESTART.EACH_SECTION + >>> props.position = WD_ENDNOTE_POSITION.END_OF_SECTION + >>> props.start_number = 1 + +All four properties are read/write. Assigning |None| removes the +corresponding child element from the ``w:endnotePr`` so that Word falls +back to its default behaviour. diff --git a/docs/user/equations.rst b/docs/user/equations.rst new file mode 100644 index 000000000..db29da766 --- /dev/null +++ b/docs/user/equations.rst @@ -0,0 +1,288 @@ +.. _equations: + +Working with equations +====================== + +Word stores mathematical expressions as *Office Math* (OMML, the ``m:`` namespace) +rather than as text runs. An equation lives in one of two container elements: + +- ``<m:oMath>`` — an inline equation, embedded in a run-level position + inside a paragraph, flowing with the surrounding text; +- ``<m:oMathPara>`` — a *display-mode* equation, which Word centers on its + own line and renders at a larger size. ``m:oMathPara`` always wraps + exactly one ``m:oMath`` element and carries its own formatting in + ``m:oMathParaPr``.
+ +*python-docx* provides a read-only |Equation| proxy over either element, plus a +small family of *builder* functions that emit OMML XML strings for the most +common single-node idioms (identifiers, fractions, sub/superscripts, radicals). +Import/export for LaTeX or MathML is intentionally out of scope — the OMML XML +string is the exchange format. + + +Reading equations from a document +--------------------------------- + +The document-level :attr:`.Document.equations` property returns every top-level +equation found by walking the document body:: + + >>> from docx import Document + >>> document = Document("has-equations.docx") + >>> document.equations + [, ...] + >>> len(document.equations) + 2 + +Each entry is an |Equation| proxy. The walk yields ``m:oMathPara`` wrappers +whole; an inline ``m:oMath`` nested inside a ``m:oMathPara`` is represented +once, by the enclosing wrapper, not as two separate equations. Equations inside +headers, footers, footnotes, endnotes, and comments are **not** included in +this collection — they belong to the corresponding story container, and are +accessible via :attr:`.Paragraph.equations` on paragraphs within those stories. + +Paragraph-level access mirrors the document-level shape:: + + >>> paragraph = document.paragraphs[1] + >>> [e.text for e in paragraph.equations] + ['x'] + +.. note:: + + The paragraph and document equation walks are *read-only iterators*. They + reflect the current XML tree; they do not expose add/remove operations + directly. Creating an equation is always done by appending OMML XML via + :meth:`.Paragraph.add_equation` (see below). + + +Inspecting an equation +---------------------- + +The |Equation| proxy exposes three read-only properties over the wrapped +``m:oMath`` / ``m:oMathPara`` element: + +* :attr:`~.Equation.text` — a best-effort, *flattened* plain-text rendering + that concatenates every descendant ``m:t`` element's text. 
Structure + (fractions, sub/superscripts, radicals) is stripped, which is usually good + enough for search indexing or quick previews but loses the mathematical + meaning. Use :attr:`~.Equation.raw_xml` when fidelity matters. +* :attr:`~.Equation.raw_xml` — the serialized OMML XML for this equation, as + UTF-8 bytes, with all namespace declarations preserved. Callers who want to + reason about the tree should hand these bytes to their own XML parser. +* :attr:`~.Equation.is_display_mode` — |True| when the wrapped element is + ``m:oMathPara``, |False| when it is a bare inline ``m:oMath``:: + + >>> equation = document.equations[0] + >>> equation.text + 'x' + >>> equation.raw_xml[:48] + b'e' + >>> equation = Equation.from_omml_xml(xml) + >>> equation.text + 'e' + +:meth:`.Equation.from_omml_xml` raises :class:`ValueError` when the root +element is neither ``m:oMath`` nor ``m:oMathPara``. Namespace declarations for +the ``m:`` prefix must be present on the root element (or an ancestor); the +caller is responsible for including them. + + +Appending an equation to a paragraph +------------------------------------ + +:meth:`.Paragraph.add_equation` parses an OMML XML string and appends the +resulting element to the paragraph, returning the wrapping |Equation|:: + + >>> paragraph = document.add_paragraph("The variable ") + >>> xml = 'x' + >>> equation = paragraph.add_equation(xml) + >>> equation.text + 'x' + >>> equation.is_display_mode + False + +Passing ``display_mode=True`` wraps a bare ``m:oMath`` in an ``m:oMathPara`` +before appending, turning it into a centered display equation. If the supplied +XML is already an ``m:oMathPara``, it is appended unchanged regardless of the +flag:: + + >>> equation = paragraph.add_equation(xml, display_mode=True) + >>> equation.is_display_mode + True + + +Builder helpers +--------------- + +Hand-authoring OMML XML is possible but verbose. 
The +:mod:`docx.equations` module ships a small family of builder functions that +each return a complete, parseable ``m:oMath`` fragment with the namespace +declaration already in place. Their output is suitable for passing directly to +:meth:`.Paragraph.add_equation` or :meth:`.Equation.from_omml_xml`:: + + >>> from docx.equations import ( + ... build_identifier, build_fraction, + ... build_superscript, build_subscript, build_radical, + ... ) + +These helpers cover the everyday shapes. When you need nested structure — +a fraction whose numerator is itself a superscript, for instance — hand-author +the OMML or compose the builders' output with an XML tree library of your +choice. + + +build_identifier -- plain identifiers and literals +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +:func:`~docx.equations.build_identifier` wraps a short text span in a single +```` run inside an ``m:oMath`` element. It is the +correct building block for a single-letter variable, a Greek letter, or a +short keyword:: + + >>> build_identifier("x") + '...x' + >>> build_identifier("χ") # Greek chi + '...χ' + +The `text` argument is XML-escaped, so identifiers that happen to contain +characters like ``<`` or ``&`` are safe:: + + >>> build_identifier("a...a<b' + +Word does not italicize the glyph based on the identifier's content; styling +is controlled by the ```` run-property child, which this builder does +not emit. The default Word rendering for ```` inside a run with no +explicit ```` is italic, matching convention +for mathematical variables. + + +build_fraction -- stacked fractions with a horizontal bar +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +:func:`~docx.equations.build_fraction` emits an ``m:f`` element (a "fraction") +containing an ``m:num`` (numerator) and ``m:den`` (denominator), each wrapped +around a single run. 
The ``m:fPr`` child carries ```` to +select the stacked horizontal-bar appearance (``"bar"`` is the default; other +types are ``"lin"`` for linear *a/b*, ``"noBar"`` for stacked without a bar, +and ``"skw"`` for skewed):: + + >>> build_fraction("a", "b") + ' + a + b + ' + +Both arguments are wrapped as a single ``m:r``/``m:t`` run — the builder does +not parse its inputs. To nest a fraction inside a fraction, or to place a +superscript in the numerator, hand-author the OMML around the builder's +output. + +The flattened :attr:`.Equation.text` of a fraction concatenates the numerator +and denominator text in that order:: + + >>> equation = Equation.from_omml_xml(build_fraction("a", "b")) + >>> equation.text + 'ab' + + +build_superscript -- exponents +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +:func:`~docx.equations.build_superscript` emits an ``m:sSup`` element (a +"script-super") with ``m:e`` (the base) and ``m:sup`` (the exponent) children. +Each is wrapped around a single run:: + + >>> build_superscript("x", "2") + ' + x + 2 + ' + + +build_subscript -- subscripts +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +:func:`~docx.equations.build_subscript` is the mirror image of +:func:`~docx.equations.build_superscript`: it emits an ``m:sSub`` element +("script-sub") with ``m:e`` (the base) and ``m:sub`` (the subscript):: + + >>> build_subscript("x", "i") + ' + x + i + ' + +For an identifier that carries both a subscript *and* a superscript +simultaneously (``x_i^2``), hand-author an ``m:sSubSup`` element — the +builders do not cover that case. + + +build_radical -- square roots and nth roots +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +:func:`~docx.equations.build_radical` emits an ``m:rad`` element (a "radical") +with an optional ``m:deg`` (degree) and a required ``m:e`` (the radicand). 
+When `degree_text` is |None| (the default) an empty ``<m:deg/>`` is written, +which Word renders as a square-root glyph:: + + >>> build_radical("x") + '<m:oMath xmlns:m="..."><m:rad><m:deg/> + <m:e><m:r><m:t>x</m:t></m:r></m:e> + </m:rad></m:oMath>' + >>> build_radical("x", "3") # cube root + '<m:oMath xmlns:m="..."><m:rad> + <m:deg><m:r><m:t>3</m:t></m:r></m:deg> + <m:e><m:r><m:t>x</m:t></m:r></m:e> + </m:rad></m:oMath>' + + +Composing builders with add_equation +------------------------------------ + +The intended workflow is to build an OMML fragment with one of the helpers, +then hand it to :meth:`.Paragraph.add_equation`. The example below composes +a sentence with an inline fraction equation:: + + >>> from docx import Document + >>> from docx.equations import build_fraction + >>> document = Document() + >>> paragraph = document.add_paragraph("The ratio is ") + >>> equation = paragraph.add_equation(build_fraction("a", "b")) + >>> equation.text + 'ab' + >>> document.save("ratio.docx") + + +Limitations +----------- + +The builder helpers are intentionally minimal. They each return a single +top-level mathematical node wrapped in ``m:oMath``, suitable for the most +common idioms but not for nesting. In particular: + +- Each builder's argument becomes a single ``<m:r>`` text run. Numerators, + denominators, bases, exponents, degrees, and radicands cannot themselves + contain further builder output directly — the ``str`` arguments are not + parsed. To nest (a fraction whose numerator is a superscript, for instance), + hand-author the OMML. +- No run properties (``<m:rPr>``) are emitted. Style (italic/roman, color, + size) is inherited from the paragraph. +- Combined sub-and-super (``m:sSubSup``), matrices (``m:m``), delimiters + (``m:d``), and accent marks (``m:acc``) are not covered by a builder. Read + access via |Equation| still works for these elements — their text content + is flattened into :attr:`~.Equation.text` — but creating them requires + hand-authored OMML. +- LaTeX or MathML input/output is out of scope. The OMML XML string remains + the authoritative exchange format. 
diff --git a/docs/user/fields.rst b/docs/user/fields.rst new file mode 100644 index 000000000..6f05b8bc1 --- /dev/null +++ b/docs/user/fields.rst @@ -0,0 +1,213 @@ +.. _fields: + +Working with Fields +=================== + +Word supports so-called *field codes* — short instructions such as ``PAGE``, +``DATE``, ``AUTHOR``, or ``REF bookmark1 \h`` that Word evaluates at display +time to produce some rendered text. The rendered result is cached in the +document alongside the instruction; Word refreshes the cache when you press +*F9* or when the document is reopened. + +A field therefore has two observable pieces: + +* the **instruction** — what the field will evaluate, e.g. ``"PAGE"`` or + ``"REF FavouriteValue \\h"`` +* the **result text** — the cached rendered value, e.g. ``"7"`` or + ``"The quoted value is forty-two."`` + +WordprocessingML represents fields in two different ways and *python-docx* +exposes both forms behind a single :class:`~docx.fields.Field` proxy. + + +Simple vs. complex fields +------------------------- + +**Simple fields** use a single ``<w:fldSimple>`` block. The instruction is +stored in the ``w:instr`` attribute and the rendered result is held in one or +more ``<w:r>`` run children:: + + <w:fldSimple w:instr="DATE"> + <w:r><w:t>2025-01-02</w:t></w:r> + </w:fldSimple> + +**Complex fields** split the same information across a sequence of runs +delimited by three ``<w:fldChar>`` markers — ``begin``, ``separate``, and +``end``. The instruction lives in an ``<w:instrText>`` element between +``begin`` and ``separate``; the rendered result is the plain text of the runs +between ``separate`` and ``end``:: + + <w:r><w:fldChar w:fldCharType="begin"/></w:r> + <w:r><w:instrText>PAGE</w:instrText></w:r> + <w:r><w:fldChar w:fldCharType="separate"/></w:r> + <w:r><w:t>7</w:t></w:r> + <w:r><w:fldChar w:fldCharType="end"/></w:r> + +Word prefers the complex form for anything non-trivial (fields with switches, +nested fields, form-field controls), but consumer software must handle both. +*python-docx* reads either and you can choose which to write. + +**Applicability.** Fields occur inside paragraphs. A simple field is an +inline child of ``w:p`` (a sibling of its runs); a complex field is a sequence of ``w:r`` runs +inside a ``w:p``. 
Fields are supported in the document body, inside table +cells, and inside headers, footers, and comments. + + +The Field proxy +--------------- + +Every field — simple or complex — is wrapped by the same +:class:`~docx.fields.Field` class. The three read-only properties that matter +day-to-day are :attr:`~docx.fields.Field.instruction`, +:attr:`~docx.fields.Field.type`, and :attr:`~docx.fields.Field.result_text`:: + + >>> from docx import Document + >>> document = Document("my-doc.docx") + >>> paragraph = document.paragraphs[2] + + >>> field = paragraph.fields[0] + >>> field + + >>> field.is_complex + False + >>> field.instruction + 'DATE' + >>> field.type + 'DATE' + >>> field.result_text + '2025-01-02' + +:attr:`~docx.fields.Field.type` is the convenient shorthand — it is just the +first whitespace-delimited token of :attr:`~docx.fields.Field.instruction`. + + +Field-type constants +-------------------- + +The :class:`~docx.fields.WD_FIELD_TYPE` class collects common field-type +tokens as string constants, purely to help with autocomplete and typo +avoidance:: + + >>> from docx.fields import WD_FIELD_TYPE + >>> WD_FIELD_TYPE.PAGE + 'PAGE' + >>> WD_FIELD_TYPE.REF + 'REF' + +These are plain strings (not a real :class:`enum.Enum`) because the set of +field-type tokens used in real-world documents is open-ended; custom field +types simply round-trip as whatever string appears in the document. 
+ + +Adding a field to a paragraph +----------------------------- + +You can append either a simple or a complex field to an existing paragraph:: + + >>> paragraph = document.add_paragraph("Today is ") + + >>> # -- simple form: one <w:fldSimple> element -- + >>> field = paragraph.add_simple_field(WD_FIELD_TYPE.DATE, "2025-01-02") + >>> field.is_complex + False + >>> field.result_text + '2025-01-02' + + >>> # -- complex form: begin/separate/end run sequence -- + >>> field = paragraph.add_complex_field(WD_FIELD_TYPE.PAGE, "7") + >>> field.is_complex + True + >>> field.result_text + '7' + +The `text` / `result_text` parameter is optional. When omitted, the field is +written with no cached result — Word (or another consumer) will populate it +the first time the field is evaluated. + + +Iterating fields in a document +------------------------------ + +Every paragraph exposes its fields in document order via +:attr:`Paragraph.fields <docx.text.paragraph.Paragraph.fields>`. To walk +every field in the body, iterate the paragraphs:: + + >>> for paragraph in document.paragraphs: + ... for field in paragraph.fields: + ... print(field.type, repr(field.result_text)) + DATE '2025-01-02' + PAGE '7' + REF 'The quoted value is forty-two.' + +:attr:`Paragraph.fields` returns both simple and complex fields in a single +flat list ordered by XML position. Fields inside tables, headers, footers, or +comments are reached by iterating the paragraphs of those containers directly. + + +Updating the rendered result +---------------------------- + +:meth:`Field.update_result_text() <docx.fields.Field.update_result_text>` +replaces the cached result text in place without disturbing the instruction:: + + >>> field = paragraph.fields[0] + >>> field.update_result_text("42") + >>> field.result_text + '42' + +For a simple field this rewrites the run(s) inside the ``<w:fldSimple>`` +element. For a complex field it replaces the runs between the ``separate`` +and ``end`` markers with a single new run. 
If a complex field has no +``separate`` marker the call is a no-op — there is nowhere to write the +rendered text. + + +Cross-reference resolution (REF / PAGEREF) +------------------------------------------ + +``REF`` fields point at a bookmark elsewhere in the document; ``PAGEREF`` +fields reference the page number that a bookmark falls on. *python-docx* +cannot compute real page numbers — it has no layout engine — but it can +resolve ``REF`` fields against the bookmark's current text using +:meth:`Field.resolve() <docx.fields.Field.resolve>`:: + + >>> paragraph = document.add_paragraph("The quoted value is forty-two.") + >>> paragraph.add_bookmark( + ... "FavouriteValue", + ... start_run=paragraph.runs[0], + ... end_run=paragraph.runs[0], + ... ) + + >>> ref_para = document.add_paragraph("As noted earlier: ") + >>> ref_field = ref_para.add_complex_field("REF FavouriteValue \\h") + >>> ref_field.resolve(document) + 'The quoted value is forty-two.' + +:meth:`Field.resolve` is best-effort and never raises. For field types it +does not understand (``PAGE``, ``DATE``, ``SEQ``, custom fields, …) it simply +returns the existing :attr:`~docx.fields.Field.result_text`. For a +``PAGEREF`` whose cached result is empty it returns ``"?"``. + +To rewrite every ``REF`` and ``PAGEREF`` result in the document body in one +go, use :meth:`Document.resolve_cross_references +<docx.document.Document.resolve_cross_references>`:: + + >>> updated = document.resolve_cross_references() + >>> updated + 1 + +The return value is the number of fields whose cached result was actually +changed. Fields whose cached result already matched the bookmark text — or +whose bookmark could not be found — are skipped. + + +A note about form fields +------------------------ + +A **form field** is a specific kind of complex field whose ``begin`` marker +carries a ``<w:ffData>`` child describing a text input, checkbox, or dropdown. 
+Form fields are presented through a dedicated +:class:`~docx.form_fields.FormField` proxy and are accessible via +:attr:`Document.form_fields <docx.document.Document.form_fields>`, not via the +``fields`` collection. Non-form complex fields (``PAGE``, ``REF``, …) appear +only in :attr:`Paragraph.fields` and the two collections are disjoint. diff --git a/docs/user/footnotes.rst b/docs/user/footnotes.rst new file mode 100644 index 000000000..e11fa99d1 --- /dev/null +++ b/docs/user/footnotes.rst @@ -0,0 +1,209 @@ +.. _footnotes: + +Working with Footnotes +====================== + +Word allows *footnotes* to be attached to running prose. A footnote appears as a small +superscript reference mark in the body (usually a number, asterisk, or Roman numeral) +paired with a separate block of text that Word renders at the bottom of the page, where +the reader can consult it without losing their place. + +The procedure, from the Word UI, is simple: + +- You place the insertion cursor where you want the reference mark to appear +- You press the *Insert Footnote* button (References toolbar) +- You type or paste in the footnote content + +**Footnote Anatomy.** Each footnote has two parts, a *footnote-reference* and a +*footnote-content*: + +The **footnote-reference** is an empty marker element (``<w:footnoteReference>``) +inserted inside a run at the point in the body where the superscript number should +appear. The reference carries the numeric ``id`` of the footnote it points at, and +*python-docx* styles the containing run with the ``FootnoteReference`` character style +so Word displays the mark as a superscript number. + +The **footnote-content** is the prose that appears at the bottom of the page. Each +footnote's content is stored in a separate ``w:footnote`` element in the *footnotes +part* (part-name ``word/footnotes.xml``), not in the main document body. The two halves +are tied together by the shared ``id`` attribute. 
+ +**Reserved Ids.** The footnotes part always contains at least two ids that are not real +footnotes: ``id=0`` is the *separator* (the horizontal line between body text and +footnotes) and ``id=1`` is the *continuation separator* used when a footnote overflows +to the next page. User-added footnotes start at ``id=2`` and are assigned sequentially. +*python-docx* hides these reserved ids from iteration and from the ``Document.footnotes`` +length. + +**Applicability.** Footnotes can be added only in the main document body. The +*python-docx* API does not currently support adding footnotes inside comments, +headers, or footers, and it does not support endnotes being anchored to footnote text. + + +Adding a footnote +----------------- + +A simple example is anchoring a footnote to the first run of a paragraph:: + + >>> from docx import Document + >>> document = Document() + >>> paragraph = document.add_paragraph("The rain in Spain.") + + >>> footnote = document.footnotes.add( + ... paragraph.runs[0], + ... "A common saying about Iberian weather.", + ... ) + >>> footnote + + >>> footnote.footnote_id + 2 + >>> footnote.text + 'A common saying about Iberian weather.' + +Note that :meth:`.Footnotes.add` takes a single |Run| (not a range), because a +footnote has a point of insertion rather than a range of selected text. The +``FootnoteReference`` marker is inserted at the end of that run. If you need the +reference to appear in the middle of a run, split the run first so that the +insertion point lies on a run boundary. + + +Reading footnotes from a document +--------------------------------- + +The footnotes collection is reached via the :attr:`.Document.footnotes` property:: + + >>> document = Document("has-footnotes.docx") + >>> footnotes = document.footnotes + >>> footnotes + + >>> len(footnotes) + 3 + +The collection is iterable and yields |Footnote| objects for every user footnote in +document order. 
The separator and continuation-separator entries are filtered out:: + + >>> for footnote in document.footnotes: + ... print(footnote.footnote_id, footnote.text) + 2 A common saying about Iberian weather. + 3 As of the loadfix fork. + 4 Ids 0 and 1 are reserved for separators. + + +Inspecting a footnote +--------------------- + +A |Footnote| is a *block-item container*, just like a document body or a table +cell, so it exposes the same paragraph-access API. Each footnote contains at +least one paragraph, styled ``FootnoteText``, whose first run carries the +auto-number mark that Word renders in front of the footnote text:: + + >>> footnote = document.footnotes[0] + >>> footnote.footnote_id + 2 + >>> len(footnote.paragraphs) + 1 + >>> footnote.paragraphs[0].style.name + 'FootnoteText' + >>> footnote.text + 'A common saying about Iberian weather.' + +The :attr:`.Footnote.text` property concatenates the text of every paragraph in +the footnote, joined by newlines. All emphasis and character-level styling is +stripped; use ``Footnote.paragraphs`` to walk the runs yourself if you need +richer access. + + +Adding rich content to a footnote +--------------------------------- + +Because a footnote is a block-item container, you can add additional paragraphs +and runs to it just like you would to the document body:: + + >>> footnote = document.footnotes.add(paragraph.runs[0], "First line.") + >>> second_para = footnote.add_paragraph("Second line.") + >>> second_para.style.name + 'FootnoteText' + >>> footnote.paragraphs[0].add_run(" (emphasised)").italic = True + +:meth:`.Footnote.add_paragraph` applies the ``FootnoteText`` paragraph style by +default so the added paragraph blends in visually with the footnote's existing +content. 
+ + +Modifying and deleting footnotes +-------------------------------- + +:meth:`.Footnote.clear` drops every run after the initial auto-number mark, +leaving a single empty paragraph you can populate fresh:: + + >>> footnote.clear() + >>> footnote.text + '' + >>> len(footnote.paragraphs) + 1 + +:meth:`.Footnote.delete` removes the footnote outright. Both the +``w:footnote`` element in the footnotes part and every ``w:footnoteReference`` +in the document body that targets it are removed; runs that contained only the +reference are cleaned up:: + + >>> len(document.footnotes) + 3 + >>> document.footnotes[0].delete() + >>> len(document.footnotes) + 2 + +After calling :meth:`~.Footnote.delete` the |Footnote| object is "defunct" and +should not be used further. + + +Footnote numbering properties +----------------------------- + +Footnote numbering is configured at the document level via a |FootnoteProperties| +object. Access it via :attr:`.Document.footnote_properties`; the property is +|None| when no ``w:footnotePr`` element is present in the settings part, in +which case Word applies its defaults (Arabic numerals starting at 1, continuous +numbering, footnotes at the bottom of the page):: + + >>> document.footnote_properties is None + True + >>> props = document.add_footnote_properties() + >>> props + + +|FootnoteProperties| exposes four read/write properties, each backed by a child +element of ``w:footnotePr``. Assigning |None| removes the underlying element, +restoring Word's default for that aspect:: + + >>> from docx.enum.text import ( + ... WD_FOOTNOTE_POSITION, + ... WD_FOOTNOTE_RESTART, + ... WD_NUMBER_FORMAT, + ... ) + >>> props.number_format = WD_NUMBER_FORMAT.LOWER_ROMAN + >>> props.start_number = 1 + >>> props.restart_rule = WD_FOOTNOTE_RESTART.EACH_SECTION + >>> props.position = WD_FOOTNOTE_POSITION.BENEATH_TEXT + +* :attr:`~.FootnoteProperties.number_format` — a :ref:`WdNumberFormat` member + selecting the glyph family for the reference marks. 
Common choices are + ``DECIMAL`` (1, 2, 3 ...), ``UPPER_ROMAN``, ``LOWER_ROMAN``, ``UPPER_LETTER``, + and ``CHICAGO`` (the ``*``, ``†``, ``‡``, ``§`` cycle). +* :attr:`~.FootnoteProperties.start_number` — the integer at which numbering + begins. Usually ``1``; set higher when continuing a numbering scheme across + documents. +* :attr:`~.FootnoteProperties.restart_rule` — a :ref:`WdFootnoteRestart` + member that controls whether numbering runs continuously + (``CONTINUOUS``), restarts at each section (``EACH_SECTION``), or restarts + on every page (``EACH_PAGE``). +* :attr:`~.FootnoteProperties.position` — a :ref:`WdFootnotePosition` member + that places footnotes either at the page bottom (``BOTTOM_OF_PAGE``, the + default) or immediately beneath the last line of body text + (``BENEATH_TEXT``). + +Section-level overrides are also supported via +:attr:`.Section.footnote_properties` and :meth:`.Section.add_footnote_properties`, +which accept the same |FootnoteProperties| API. When a section defines its own +``w:footnotePr`` it takes precedence over the document-level element for that +section. diff --git a/docs/user/form-fields.rst b/docs/user/form-fields.rst new file mode 100644 index 000000000..c6bcf978f --- /dev/null +++ b/docs/user/form-fields.rst @@ -0,0 +1,197 @@ +.. _form_fields: + +Working with Legacy Form Fields +=============================== + +Word supports two families of in-document form controls. The modern family, +*content controls* (Structured Document Tags, or SDTs), was introduced in +Word 2007. The older family — *legacy form fields* — predates them and is +still widely used, especially by templates authored for older Word versions +and by documents produced by legal, accounting, and government tooling. + +*python-docx* exposes the legacy family via the ``docx.form_fields`` module. + +**Legacy Form-Field Anatomy.** A legacy form field is a *complex field* whose +``begin`` ``w:fldChar`` carries a ``w:ffData`` child. 
The ``w:ffData`` +element holds the form field's metadata (name, help text, enabled flag, +calc-on-exit flag) and a type-specific options block: + +- ``w:textInput`` for a *text-input* field (``FORMTEXT``) — a free-form + text entry with an optional default value, max length, and format. +- ``w:checkBox`` for a *checkbox* field (``FORMCHECKBOX``) — a boolean + state with a default and a current ``checked`` state. +- ``w:ddList`` for a *dropdown* field (``FORMDROPDOWN``) — a list of + options with a default-selection index and a result-selection index. + +Each form field is bracketed by the usual complex-field markers: a ``begin`` +fldChar, an ``instrText`` run (``FORMTEXT``, ``FORMCHECKBOX``, or +``FORMDROPDOWN``), a ``separate`` fldChar, a *result* region (runs carrying +the rendered value), and an ``end`` fldChar. + +**Form-Field Name.** Each form field has a *name* (``w:ffData/w:name``) that +acts as the programmatic identifier used by Word VBA macros and by ``REF`` +fields elsewhere in the document to retrieve the field's value. Names do not +need to be unique, but Word tooling typically treats them as such. + +**Read vs. Mutate.** The :class:`FormField` proxy is read-oriented: it +exposes a type discriminator, the shared metadata (name, help text, status +text, enabled, calc-on-exit), a ``value`` derivation, and per-type views +(:class:`TextInputFormField`, :class:`CheckboxFormField`, +:class:`DropdownFormField`) that are *read-only* projections over the +corresponding ``w:ffData`` child. + +To *create* new form fields, three paragraph-level convenience methods are +provided — :meth:`Paragraph.add_text_form_field`, +:meth:`Paragraph.add_checkbox_form_field`, and +:meth:`Paragraph.add_dropdown_form_field` — each of which appends a +complete complex-field sequence to the paragraph and returns a +:class:`FormField` proxy. + +**Applicability.** Legacy form fields render and round-trip correctly in +Word. 
They can appear in the document body, in table cells, and in headers +and footers. The :attr:`Document.form_fields` collection walks *top-level +body paragraphs only* — to access form fields nested inside table cells, +headers, footers, footnotes, or endnotes, iterate the enclosing +paragraphs and read their :attr:`Paragraph.form_fields` collections. + + +Accessing the form-fields collection +------------------------------------ + +The top-level collection is accessed via :attr:`Document.form_fields`:: + + >>> from docx import Document + >>> document = Document("application-form.docx") + >>> fields = document.form_fields + >>> len(fields) + 3 + >>> [ff.name for ff in fields] + ['FullName', 'Subscribe', 'Country'] + +Each member is a :class:`FormField` proxy. The +:attr:`FormField.type` property returns a :class:`WD_FORM_FIELD_TYPE` +enum member that discriminates the three field families:: + + >>> from docx.form_fields import WD_FORM_FIELD_TYPE + >>> fields[0].type + + >>> fields[0].type is WD_FORM_FIELD_TYPE.TEXT + True + +The shared metadata is exposed on the proxy itself:: + + >>> ff = fields[0] + >>> ff.name + 'FullName' + >>> ff.help_text + '' + >>> ff.enabled + True + >>> ff.calc_on_exit + False + + +Text-input form fields +---------------------- + +Type-specific attributes are exposed via a narrow read-only view. For text +inputs, use :attr:`FormField.text_input`:: + + >>> ff = fields[0] # -- the FullName text input -- + >>> ti = ff.text_input + >>> ti.default + 'Jane Doe' + >>> ti.max_length + 40 + >>> ti.format + '' + +A ``max_length`` of |None| indicates no limit (the ``w:maxLength`` element +is absent or its value is ``0``, the OOXML "no-limit" sentinel). 
+ +The current rendered value is available on the proxy itself via +:attr:`FormField.value`, which returns the concatenated text of the runs +between the ``separate`` and ``end`` markers:: + + >>> ff.value + 'Jane Doe' + + +Checkbox form fields +-------------------- + +Checkbox views are exposed via :attr:`FormField.checkbox`:: + + >>> ff = fields[1] # -- the Subscribe checkbox -- + >>> cb = ff.checkbox + >>> cb.default + True + >>> cb.checked + True + +When ``w:checked`` is absent but ``w:default`` is present, ``checked`` +returns the default — mirroring Word's runtime behaviour. For checkboxes, +:attr:`FormField.value` returns the boolean ``checked`` state directly:: + + >>> ff.value + True + + +Dropdown form fields +-------------------- + +Dropdown views are exposed via :attr:`FormField.dropdown`:: + + >>> ff = fields[2] # -- the Country dropdown -- + >>> dd = ff.dropdown + >>> dd.options + ['US', 'UK', 'AU'] + >>> dd.default_index + 1 + >>> dd.result_index + 1 + +``default_index`` and ``result_index`` are 0-based. When ``w:result`` is +absent, ``result_index`` falls back to ``default_index``. For dropdowns, +:attr:`FormField.value` returns the *selected option string* (the entry at +``result_index``), or the empty string when the index is out of range:: + + >>> ff.value + 'UK' + + +Adding form fields +------------------ + +Form fields are appended to a paragraph via three convenience methods on +:class:`Paragraph`. Each returns a :class:`FormField` proxy for the newly +added field:: + + >>> from docx import Document + >>> document = Document() + + >>> p = document.add_paragraph("Name: ") + >>> p.add_text_form_field(name="FullName", default="Jane Doe", maxlength=40) + + + >>> p = document.add_paragraph("Subscribe? ") + >>> p.add_checkbox_form_field(name="Subscribe", checked=True) + + + >>> p = document.add_paragraph("Country: ") + >>> p.add_dropdown_form_field( + ... name="Country", options=["US", "UK", "AU"], default_index=1, + ... 
) + + + >>> document.save("application-form.docx") + +Each method emits a complete complex-field sequence — the ``begin`` run +(with the ``w:ffData`` attached to its ``w:fldChar``), the ``instrText`` +run, the ``separate`` run, a *result* run, and the ``end`` run. The +rendered result text is seeded so Word displays the initial value +immediately without a field update. + +For the type-specific attributes of these methods — such as ``maxlength`` +for text inputs, or ``default_index`` for dropdowns — see the API +documentation at :ref:`form_fields_api`. diff --git a/docs/user/glossary.rst b/docs/user/glossary.rst new file mode 100644 index 000000000..a7294161a --- /dev/null +++ b/docs/user/glossary.rst @@ -0,0 +1,185 @@ +.. _glossary: + +Working with the Glossary Document +================================== + +Word stores its *AutoText*, *Quick Parts*, *cover-page*, *header/footer*, and +similar reusable snippets in a dedicated part of the package called the +*glossary document*. At load time these show up in Word's *Insert > Quick +Parts* and *Insert > Cover Page* galleries. On disk they live under +``word/glossary/document.xml`` alongside the main document part, and each +individual snippet is a ``w:docPart`` element — a *building block*. + +python-docx exposes this part **read-only**. The glossary document is almost +always authored by Word itself — for example, it's the part that carries the +built-in cover pages and headers — so this release surfaces it for inspection +without providing any creation or mutation API. Documents created via +``Document()`` with the default template do **not** ship with a glossary +part, so ``document.glossary`` returns |None| for them. + +**What you get:** + +- Discover whether a document has a glossary part at all. +- Iterate the building blocks it contains, in document order. +- Read each block's metadata (name, description, GUID, category, gallery). +- Walk the paragraphs and tables that make up each block's body. 
+- Filter and aggregate blocks by gallery and/or category name. + + +Accessing the glossary +---------------------- + +The glossary is reached via the :attr:`.Document.glossary` property:: + + >>> from docx import Document + >>> document = Document("briefing-with-cover-pages.docx") + >>> glossary = document.glossary + >>> glossary + + +For a document without a glossary part the same property returns |None|:: + + >>> Document().glossary is None + True + +It is therefore worth a ``None`` check before working with the proxy:: + + >>> glossary = document.glossary + >>> if glossary is None: + ... print("no glossary part") + ... else: + ... print(f"{len(glossary)} building blocks") + ... + 7 building blocks + + +Iterating building blocks +------------------------- + +The |Glossary| proxy behaves like a read-only collection. It supports +``len()``, iteration, and indexed lookup by building-block name:: + + >>> len(glossary) + 7 + >>> for block in glossary: + ... print(block.name) + ... + Austere Cover Page + Banded Cover Page + Default Quick Part + ... + +The :attr:`.Glossary.building_blocks` property returns the same sequence as +a list, in document order. Indexed lookup is by **name** (exact, case +sensitive); a |KeyError| is raised when no building block with that name +exists:: + + >>> block = glossary["Austere Cover Page"] + >>> block.name + 'Austere Cover Page' + >>> glossary["Does Not Exist"] + Traceback (most recent call last): + ... + KeyError: 'Does Not Exist' + + +Building-block metadata +----------------------- + +Each |BuildingBlock| exposes the metadata Word writes into +``w:docPart/w:docPartPr``:: + + >>> block = glossary["Austere Cover Page"] + >>> block.name + 'Austere Cover Page' + >>> block.description + 'Cover page with bold title and heading frame.' + >>> block.guid + '{12345678-90AB-CDEF-1234-567890ABCDEF}' + +Any of these may be |None| when the underlying metadata slot is absent, so +guard accordingly when working with arbitrary documents. 
+ +The *category* of a building block is available as a |BuildingBlockCategory| +proxy. The proxy is always returned — even when Word has not written a +``w:category`` element — but its slots will both be |None| in that case:: + + >>> block = glossary["Austere Cover Page"] + >>> block.category + BuildingBlockCategory(gallery='coverPg', category_name='Built-In') + >>> block.category.gallery + 'coverPg' + >>> block.category.category_name + 'Built-In' + +The ``gallery`` slot is the raw XML string that Word writes. For the +well-known galleries it can be mapped to a :ref:`WdBuildingBlockGallery` +enum member via :attr:`.BuildingBlockCategory.gallery_value`:: + + >>> from docx.enum.text import WD_BUILDING_BLOCK_GALLERY + >>> block.category.gallery_value is WD_BUILDING_BLOCK_GALLERY.COVER_PAGES + True + +Unknown or vendor-specific gallery values return |None| from +``gallery_value``; the raw string is still available via +:attr:`.BuildingBlockCategory.gallery` for manual inspection. + + +Reading the body of a building block +------------------------------------ + +A building block's content — the paragraphs and tables Word inserts when the +user picks the snippet from a gallery — is modelled as a block-item +container:: + + >>> block = glossary["Banded Cover Page"] + >>> for paragraph in block.paragraphs: + ... print(paragraph.text) + ... + Document Title + 2026-05-02 + >>> len(block.tables) + 1 + +When the block has no ``w:docPartBody`` element — a legitimate state for +placeholder entries in the glossary — both properties return empty lists. + + +Filtering and aggregating +------------------------- + +The |Glossary| proxy includes a few convenience accessors for bulk +inspection. :meth:`.Glossary.by_category` filters building blocks by +gallery, category name, or both — passing neither is equivalent to +:attr:`.Glossary.building_blocks`:: + + >>> from docx.enum.text import WD_BUILDING_BLOCK_GALLERY + >>> [b.name for b in glossary.by_category( + ... 
gallery=WD_BUILDING_BLOCK_GALLERY.COVER_PAGES + ... )] + ['Austere Cover Page', 'Banded Cover Page'] + >>> [b.name for b in glossary.by_category(category_name="Built-In")] + ['Austere Cover Page', 'Banded Cover Page'] + +The ``gallery`` argument also accepts a raw XML string, which is useful +when a document uses a gallery value that is not modelled by the enum:: + + >>> glossary.by_category(gallery="custom1") + [] + +Two more properties return deduplicated views of the set as a whole. +:attr:`.Glossary.galleries` returns each raw gallery value once, in +first-seen order:: + + >>> glossary.galleries + ['coverPg', 'quickParts'] + +:attr:`.Glossary.categories` returns one |BuildingBlockCategory| per unique +``(gallery, category_name)`` pair, again in first-seen order. Entries where +both slots are |None| are dropped:: + + >>> for cat in glossary.categories: + ... print(cat) + ... + BuildingBlockCategory(gallery='coverPg', category_name='Built-In') + BuildingBlockCategory(gallery='quickParts', category_name='General') diff --git a/docs/user/install.rst b/docs/user/install.rst index 49bbed0a0..56d406533 100644 --- a/docs/user/install.rst +++ b/docs/user/install.rst @@ -6,33 +6,16 @@ Installing .. note:: python-docx versions 0.3.0 and later are not API-compatible with prior versions. -|docx| is hosted on PyPI, so installation is relatively simple, and just -depends on what installation utilities you have installed. 
- -|docx| may be installed with ``pip`` if you have it available:: +|docx| is hosted on PyPI, so installation with ``pip`` is straightforward:: pip install python-docx -|docx| can also be installed using ``easy_install``, although this is -discouraged:: - - easy_install python-docx - -If neither ``pip`` nor ``easy_install`` is available, it can be installed -manually by downloading the distribution from PyPI, unpacking the tarball, -and running ``setup.py``:: - - tar xvzf python-docx-{version}.tar.gz - cd python-docx-{version} - python setup.py install - -|docx| depends on the ``lxml`` package. Both ``pip`` and ``easy_install`` -will take care of satisfying those dependencies for you, but if you use this -last method you will need to install those yourself. +|docx| depends on the ``lxml`` package. ``pip`` will install it automatically +along with any other runtime dependencies. Dependencies ------------ -* Python 2.6, 2.7, 3.3, or 3.4 -* lxml >= 2.3.2 +* Python 3.9+ +* lxml >= 4.9.1 diff --git a/docs/user/mail-merge.rst b/docs/user/mail-merge.rst new file mode 100644 index 000000000..3684363bf --- /dev/null +++ b/docs/user/mail-merge.rst @@ -0,0 +1,198 @@ +.. _mail_merge: + +Working with Mail Merge +======================= + +Word supports *mail merge*, a feature in which a single *main document* is +combined with records drawn from an external *data source* to produce one +personalised output per record. Typical outputs include form letters, email +messages, envelopes, mailing labels, and faxes. *python-docx* does not +*execute* a mail merge, but it does expose the configuration block Word stores +inside ``word/settings.xml`` so that callers can read, construct, or remove it +programmatically. 
+ +When you open a mail-merge main document in Word, Word uses these stored settings +to know: + +- what kind of merge to run (form letter, email, label, ...), +- where the merged output should go (a new document, a printer, email, ...), +- how to reach the data source (a connection string, an ODBC DSN, an Excel + sheet, ...), and +- which rows to select from that data source (the stored query). + +The configuration also records which record is currently "active" in Word's +preview, whether Word should display merged values instead of field +placeholders, and a handful of Boolean flags governing behaviour at merge +time. + +**Scope.** *python-docx* surfaces the ``w:mailMerge`` element, its sub-elements, +and the three mail-merge enumerations. It does **not** create merge fields in +the document body, fetch data from external sources, or produce merged output +— the actual merge is still performed by Word (or by your own code reading the +configuration back out). + + +Accessing the mail-merge configuration +-------------------------------------- + +Every document exposes a :class:`.Settings` object via +:attr:`.Document.settings`. When the document has a ``w:mailMerge`` element, +:attr:`.Settings.mail_merge` returns a |MailMerge| proxy; when it does not, the +attribute is |None|:: + + >>> from docx import Document + >>> document = Document("contacts-form-letter.docx") + >>> document.settings.mail_merge + + + >>> blank = Document() + >>> blank.settings.mail_merge is None + True + + +Enabling mail merge +------------------- + +Use :meth:`.Settings.enable_mail_merge` to create (or replace) the +``w:mailMerge`` block. Every argument is optional: ``main_document_type`` +defaults to ``WD_MAIL_MERGE_TYPE.FORM_LETTERS``, and any other argument left as +|None| is simply omitted from the XML. + +:: + + >>> from docx import Document + >>> from docx.enum.text import ( + ... WD_MAIL_MERGE_DATA_TYPE, + ... WD_MAIL_MERGE_DESTINATION, + ... WD_MAIL_MERGE_TYPE, + ... 
) + >>> document = Document() + >>> mail_merge = document.settings.enable_mail_merge( + ... main_document_type=WD_MAIL_MERGE_TYPE.EMAIL, + ... destination=WD_MAIL_MERGE_DESTINATION.EMAIL, + ... data_type=WD_MAIL_MERGE_DATA_TYPE.SPREADSHEET, + ... connect_string="Provider=Microsoft.ACE.OLEDB.12.0;Data Source=contacts.xlsx", + ... query="SELECT FirstName, Email FROM [Sheet1$]", + ... mail_subject="Quarterly update", + ... address_field_name="Email", + ... ) + +``enable_mail_merge()`` returns the |MailMerge| proxy so that additional +properties can be assigned in the same statement or on a subsequent line. +Calling it on a document that is already configured replaces the previous +``w:mailMerge`` element. + +The simplest possible call produces a form-letter merge with no data source +attached:: + + >>> document.settings.enable_mail_merge() + + +|MailMerge| properties +---------------------- + +Every |MailMerge| property is read/write and represents a single ``w:mailMerge`` +child element. Assigning |None| (or |False| for the Boolean flags) removes the +underlying element. + +.. rubric:: Typed, scalar properties + +``main_document_type`` + A :ref:`WdMailMergeType` member identifying the merge kind. Reading returns + |None| when the ``w:mainDocumentType`` child is absent. + +``destination`` + A :ref:`WdMailMergeDestination` member describing where merged output is + sent. + +``data_type`` + A :ref:`WdMailMergeDataType` member describing the data-source kind. + Unknown XML values read back as |None| rather than raising. + +``connect_string`` + The raw connection string used to reach the data source (for example, an + OLE DB or ODBC connection string). A plain |str| or |None|. + +``query`` + The SQL-style query Word executes against the data source to select and + order records. A plain |str| or |None|. + +``mail_subject`` + The subject line used for email-destination merges. 
+ +``address_field_name`` + The name of the column inside the data source containing the recipient + address (typically an email address column for email merges). + +``active_record`` + The 1-based index of the record selected in Word's preview, as an |int|. + Values that cannot be parsed as an integer read back as |None|. + +``check_errors`` + Integer code controlling Word's error-reporting mode during merge. + +.. rubric:: Boolean flags + +The remaining properties correspond to on/off child elements. Each reads as +|True| when present and |False| when absent. + +``link_to_query`` + Preserves the association between the stored query and the data source. + +``do_not_suppress_blank_lines`` + Keeps blank output lines that would otherwise be suppressed when merge + fields resolve to empty strings. + +``mail_as_attachment`` + Sends the merged document as an email attachment rather than as the email + body. + +``view_merged_data`` + Tells Word to show merged field values rather than field placeholders when + the document is opened. + +Example of reading and updating properties:: + + >>> mail_merge = document.settings.mail_merge + >>> mail_merge.main_document_type + + >>> mail_merge.active_record + 3 + >>> mail_merge.view_merged_data + True + >>> mail_merge.mail_subject = "Updated subject" + >>> mail_merge.mail_as_attachment = True + + +Disabling mail merge +-------------------- + +Call :meth:`.Settings.disable_mail_merge` to remove the ``w:mailMerge`` element +entirely. After the call, :attr:`.Settings.mail_merge` is |None|. 
The method is +idempotent — calling it on a document that has no ``w:mailMerge`` element is a +no-op:: + + >>> document.settings.disable_mail_merge() + >>> document.settings.mail_merge is None + True + + +Mail-merge enumerations +----------------------- + +Three enumerations live in :mod:`docx.enum.text`: + +``WD_MAIL_MERGE_TYPE`` + Selects the main-document kind: ``CATALOG``, ``ENVELOPES``, + ``MAILING_LABELS``, ``FORM_LETTERS`` (the default), + ``EMAIL``, and ``FAX``. + +``WD_MAIL_MERGE_DESTINATION`` + Selects the destination for the merged output: ``NEW_DOCUMENT``, + ``PRINTER``, ``EMAIL``, and ``FAX``. + +``WD_MAIL_MERGE_DATA_TYPE`` + Selects the data-source kind: ``TEXT_FILE``, ``DATABASE``, + ``SPREADSHEET``, ``QUERY``, ``ODBC``, and ``NATIVE``. + +See :ref:`WdMailMergeType`, :ref:`WdMailMergeDestination`, and +:ref:`WdMailMergeDataType` for the full enum reference. diff --git a/docs/user/numbering.rst b/docs/user/numbering.rst new file mode 100644 index 000000000..fdb6326a1 --- /dev/null +++ b/docs/user/numbering.rst @@ -0,0 +1,169 @@ +.. _numbering: + +Numbering and list formatting +============================= + +Bulleted and numbered lists in WordprocessingML are not authored per-paragraph +the way they are in the UI. Instead, Word stores **numbering definitions** +in the ``word/numbering.xml`` part and each paragraph that participates in a +list *references* a definition by id. This indirection is what lets Word +renumber automatically when paragraphs are inserted, deleted, or moved. + +|docx| provides a three-layer proxy API over the numbering part: + +- :class:`.Numbering` — the top-level collection, available as + :attr:`.Document.numbering`. +- :class:`.NumberingDefinition` — a single ``w:abstractNum`` element that + describes the visual format of a list (its level text, indentation, number + format, font). +- :class:`.Level` — one ``w:lvl`` child of a |NumberingDefinition|, one per + indent level (levels 0 through 8 are permitted by the spec). 
+ + +Anatomy of a list +----------------- + +The numbering part holds two kinds of element: + +- ``w:abstractNum`` describes formatting (indentation, number format, level + text pattern, font). +- ``w:num`` is a concrete *instance* that points at a ``w:abstractNum`` and + can optionally override its starting value. + +A paragraph joins a list by carrying two child elements inside its ``w:numPr``: + +- ``w:numId`` — the id of a ``w:num`` instance. +- ``w:ilvl`` — the integer indent level (``0`` through ``8``). + +|docx| hides the abstract/instance distinction behind +:meth:`.NumberingDefinition.apply_to`: you describe the *formatting* you +want, and python-docx allocates (or reuses) a matching ``w:num`` instance +internally. + + +Reading existing lists +---------------------- + +:: + + >>> from docx import Document + >>> document = Document("report.docx") + >>> numbering = document.numbering + >>> len(numbering) + 10 + >>> for definition in numbering: + ... print(definition.abstract_num_id, [lvl.number_format for lvl in definition.levels]) + +Each |NumberingDefinition| exposes the set of levels it declares:: + + >>> definition = numbering.definitions[-1] + >>> for level in definition.levels: + ... print(level.ilvl, level.number_format, level.text, level.start, level.indent) + 0 WD_NUMBER_FORMAT.DECIMAL %1. 5 228600 + 1 WD_NUMBER_FORMAT.LOWER_LETTER %2) 1 457200 + 2 WD_NUMBER_FORMAT.BULLET • 1 685800 + +:attr:`.Level.indent` is a :class:`.Length` (EMU); the other accessors are +plain integers, strings, or enumeration members. + + +Building a new numbering definition +----------------------------------- + +Use :meth:`.Numbering.add_numbering_definition` to create a definition from a +sequence of per-level specifications. Each spec can be either a mapping or a +positional tuple:: + + >>> from docx.enum.text import WD_NUMBER_FORMAT + >>> from docx.shared import Inches + >>> definition = document.numbering.add_numbering_definition([ + ... { + ... 
"format": WD_NUMBER_FORMAT.DECIMAL, + ... "text": "%1.", + ... "indent": Inches(0.25), + ... "start": 1, + ... }, + ... { + ... "format": "lowerLetter", # string forms are accepted + ... "text": "%2)", + ... "indent": Inches(0.5), + ... }, + ... { + ... "format": "bullet", + ... "text": "•", + ... "indent": Inches(0.75), + ... "font": "Symbol", # required for bullet glyphs + ... }, + ... ]) + +``format`` accepts a :class:`.WD_NUMBER_FORMAT` member or a raw OOXML token string +(``"decimal"``, ``"bullet"``, ``"lowerLetter"``, ``"upperRoman"``, etc.). +``text`` is a :class:`str` template using ``%N`` placeholders where ``N`` is +1-based — ``%1.%2`` on level 1 produces ``"1.a"``, ``"1.b"``, and so on. +``indent`` accepts either a |Length| or a bare integer count of twips. +``font`` sets the ``w:rFonts`` on the level's run properties; it is usually +required for bullet lists that reference ``"•"`` or other non-Latin glyphs, +since Word's default body font often does not ship those shapes. + + +Applying a definition to paragraphs +----------------------------------- + +:meth:`.NumberingDefinition.apply_to` joins a paragraph to the list and +selects its indent level:: + + >>> p1 = document.add_paragraph("First point") + >>> p2 = document.add_paragraph("Sub-point") + >>> p3 = document.add_paragraph("Another sub-point") + >>> definition.apply_to(p1, level=0) + >>> definition.apply_to(p2, level=1) + >>> definition.apply_to(p3, level=1) + +Levels run 0 through 8; :meth:`apply_to` raises :class:`ValueError` for any +other value. The same definition can be applied to any number of paragraphs; +Word's numbering engine automatically renumbers them when the document is +opened. + + +Restart numbering +----------------- + +The ``start`` key in the level spec sets the first number the list emits. +This is persisted on the ``w:start`` child of the ``w:lvl`` element:: + + >>> definition = document.numbering.add_numbering_definition([ + ... 
{"format": WD_NUMBER_FORMAT.DECIMAL, "text": "%1.", "start": 5}, + ... ]) + >>> definition.levels[0].start + 5 + + +Nested definitions +------------------ + +A "nested" list is simply a definition that declares more than one level. +Paragraphs at different levels within the same list reference the same +``w:num`` instance but with different ``w:ilvl`` values. The three-level +example above demonstrates the common pattern of decimal → lower-letter → +bullet for a technical outline. + + +Reading a paragraph's list membership +------------------------------------- + +:attr:`.Paragraph.list_format` +returns a :class:`.ListFormat` named tuple of +``(numbering_definition, level)``. The definition is |None| for paragraphs +outside any list. + + +Limitations +----------- + +- |docx| does not compute the *rendered* number for a paragraph — that is + the job of Word's numbering engine, which runs at layout time. +- Per-instance starting-number overrides on ``w:num`` (the ``w:lvlOverride`` + mechanism) are not exposed by the proxy API; use the + :attr:`~.NumberingDefinition.element` escape hatch for direct XML access. +- Modifying a level's formatting on an existing definition is not supported + — create a new definition instead. diff --git a/docs/user/permissions.rst b/docs/user/permissions.rst new file mode 100644 index 000000000..5524ab563 --- /dev/null +++ b/docs/user/permissions.rst @@ -0,0 +1,167 @@ +.. _permissions: + +Permissions and document protection +=================================== + +Word supports two complementary mechanisms for restricting edits: + +- **Document protection** is a document-wide setting that locks the whole + document into a mode — read-only, comments-only, tracked-changes, or + forms-only — and optionally secures the lock with a password hash. +- **Permission ranges** carve out portions of a protected document that + specific users or groups *are* allowed to edit. 
They are the escape + hatch that makes "editable form in a locked template" workflows work. + +|docx| surfaces both: document protection through +:attr:`.Settings.document_protection` and the convenience helpers +:meth:`.Settings.enable_protection`/:meth:`.Settings.disable_protection`, +and permission ranges through :meth:`.Paragraph.add_permission_range` and +:attr:`.Document.permission_ranges`. + + +Document protection +------------------- + +Enabling protection +~~~~~~~~~~~~~~~~~~~ + +:meth:`.Settings.enable_protection` is the recommended entry point. It +creates the ``w:documentProtection`` element if absent, sets the mode, +enforces the restriction, and optionally hashes a password:: + + >>> from docx import Document + >>> from docx.enum.text import WD_PROTECTION + >>> document = Document() + >>> dp = document.settings.enable_protection( + ... WD_PROTECTION.COMMENTS, + ... password="s3cret", + ... enforce=True, + ... ) + >>> dp.mode + + >>> dp.enforce + True + +The supported modes are the members of :class:`.WD_PROTECTION`: + +================================= ==================== =========================================== +Member XML value Behaviour +================================= ==================== =========================================== +``WD_PROTECTION.READ_ONLY`` ``readOnly`` Document is read-only. +``WD_PROTECTION.COMMENTS`` ``comments`` Only comments may be added or modified. +``WD_PROTECTION.TRACKED_CHANGES`` ``trackedChanges`` All edits are recorded as tracked changes. +``WD_PROTECTION.FORMS`` ``forms`` Only form-field content may be edited. +================================= ==================== =========================================== + + +Password hashing +~~~~~~~~~~~~~~~~ + +When a `password` is supplied, |docx| generates a random 16-byte salt and +hashes the password using Word's SHA-1 scheme with 100,000 iterations +(ISO/IEC 29500-1 §17.15.1.28). 
The resulting hash and salt are stored in +``@w:hash`` / ``@w:salt`` along with the algorithm metadata +(``cryptProviderType=rsaAES``, ``cryptAlgorithmSid=4``, ...). + +Word's own implementation has historically had subtle variations across +versions; callers who need Word itself to accept the password at open time +should verify against their target Word release. For *detection* use cases +(reporting "this document is password-protected") the stored fields are +sufficient. + + +Disabling protection +~~~~~~~~~~~~~~~~~~~~ + +:meth:`.Settings.disable_protection` clears the mode and enforce flag but +leaves the ``w:documentProtection`` element in place so external tooling +that keyed off its presence still sees it:: + + >>> document.settings.disable_protection() + >>> document.settings.document_protection.mode is None + True + >>> document.settings.document_protection.enforce + False + + +Fine-grained read access +~~~~~~~~~~~~~~~~~~~~~~~~ + +:attr:`.Settings.document_protection` returns a |DocumentProtection| proxy +exposing every underlying attribute individually: :attr:`mode`, +:attr:`enforce`, :attr:`formatting_locked`, :attr:`password_hash`, +:attr:`password_salt`, :attr:`crypto_provider_type`, +:attr:`crypto_algorithm_class`, :attr:`crypto_algorithm_type`, +:attr:`crypto_algorithm_sid`, and :attr:`spin_count`. + + +Permission ranges +----------------- + +A permission range is delimited by ``w:permStart`` and ``w:permEnd`` markers +embedded in the body; between them the specified user or group may edit +even when the document is otherwise locked. 
+ +Adding a permission range +~~~~~~~~~~~~~~~~~~~~~~~~~ + +:meth:`.Paragraph.add_permission_range` wraps the calling paragraph in the +necessary markers:: + + >>> p1 = document.add_paragraph("Editable by everyone.") + >>> p2 = document.add_paragraph("Editable by Alice.") + >>> p1.add_permission_range(edit_group="everyone") + + >>> p2.add_permission_range(user="alice@example.com") + +At least one of `edit_group` and `user` should typically be supplied — the +former for group restrictions (``"everyone"``, ``"current"``, or a named +group) and the latter for a single principal. The `name` parameter accepted +for API symmetry with :meth:`add_bookmark` is not persisted; ``w:permStart`` +has no ``@w:name`` attribute. + + +Enumerating permission ranges +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +:attr:`.Document.permission_ranges` returns every permission range in the +document body in document order:: + + >>> for pr in document.permission_ranges: + ... print(pr.id, pr.user, pr.edit_group) + 0 None everyone + 1 alice@example.com None + +Each |PermissionRange| exposes: + +- :attr:`.PermissionRange.id` — the integer identifier linking the + matching ``permStart``/``permEnd`` pair. +- :attr:`.PermissionRange.user`, :attr:`.PermissionRange.edit_group`, + :attr:`.PermissionRange.displaced_by_custom_xml` — the corresponding + attributes on the underlying ``w:permStart``. + + +Deleting a permission range +~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +:meth:`.PermissionRange.delete` removes both the start and end markers from +the document body:: + + >>> document.permission_ranges[0].delete() + +The body content between the markers is left untouched. + + +Scope and caveats +----------------- + +- |docx| does not *enforce* the restrictions — that is Word's job at open + time. Anything calling ``python-docx`` can freely modify every paragraph + regardless of the protection mode, because the XML is just data to + python-docx. +- Permission ranges added to the document body only cover that body. 
+ Ranges inside headers, footers, footnotes, or endnotes are not exposed + via :attr:`.Document.permission_ranges`; reach for the paragraph-level + :attr:`.Paragraph.permission_ranges` accessor in those containers. +- ``w:permStart`` IDs are assigned sequentially from zero; |docx| does not + attempt to interleave them with custom IDs set by other tooling. diff --git a/docs/user/quickstart.rst b/docs/user/quickstart.rst index 0d6982ee0..9f2c7ce7f 100644 --- a/docs/user/quickstart.rst +++ b/docs/user/quickstart.rst @@ -66,6 +66,28 @@ If you specify a level of 0, a "Title" paragraph is added. This can be handy to start a relatively short document that doesn't have a separate title page. +Adding a bookmark +----------------- + +Bookmarks let you name a location (or a range) in the document so that +hyperlinks, cross-references, and fields can target it later. |docx| exposes +:meth:`Paragraph.add_bookmark`, which inserts ``<w:bookmarkStart>`` / +``<w:bookmarkEnd>`` markers and returns a |Bookmark| proxy. The bookmark ID is +allocated automatically so you don't have to track ``@w:id`` values yourself:: + + from docx import Document + + document = Document() + paragraph = document.add_paragraph('See the appendix for details.') + + bookmark = paragraph.add_bookmark('appendix_intro') + print(bookmark.name) # -> 'appendix_intro' + +With no ``start_run`` / ``end_run`` arguments, the bookmark wraps the entire +paragraph. Pass runs explicitly to anchor the bookmark to a specific range. +See :ref:`bookmarks` for the full API. 
+ + Adding a page break ------------------- @@ -136,11 +158,15 @@ You can also add rows to a table incrementally like so:: This can be very handy for the variable length table scenario we mentioned above:: + from collections import namedtuple + + Record = namedtuple('Record', 'qty id desc') + # get table data ------------- items = ( - (7, '1024', 'Plush kittens'), - (3, '2042', 'Furbees'), - (1, '1288', 'French Poodle Collars, Deluxe'), + Record(7, '1024', 'Plush kittens'), + Record(3, '2042', 'Furbees'), + Record(1, '1288', 'French Poodle Collars, Deluxe'), ) # add table ------------------ @@ -156,7 +182,7 @@ above:: for item in items: cells = table.add_row().cells cells[0].text = str(item.qty) - cells[1].text = item.sku + cells[1].text = item.id cells[2].text = item.desc @@ -226,7 +252,7 @@ a paragraph at once. It's a lot like CSS styles if you know what those are. You can apply a paragraph style right when you create a paragraph:: - document.add_paragraph('Lorem ipsum dolor sit amet.', style='ListBullet') + document.add_paragraph('Lorem ipsum dolor sit amet.', style='List Bullet') This particular style causes the paragraph to appear as a bullet, a very handy thing. You can also apply a style afterward. These two lines are equivalent to @@ -301,6 +327,31 @@ make your code simpler if you're building the paragraph up from runs anyway:: paragraph.add_run(' sit amet.') +Adding a footnote +----------------- + +A footnote is a small superscript marker in the body paired with commentary +that Word renders at the bottom of the page. 
In |docx|, footnotes hang off a +run: you choose which run the reference mark is inserted into, and optionally +pass the footnote body as a string:: + + from docx import Document + + document = Document() + paragraph = document.add_paragraph('The rain in Spain.') + + footnote = document.footnotes.add( + paragraph.runs[0], + 'A common saying about Iberian weather.', + ) + print(footnote.footnote_id) # -> 2 (ids 0/1 are reserved) + +The call creates the ``word/footnotes.xml`` part on demand, assigns the next +available id, and inserts a ``<w:footnoteReference>`` inside the anchoring +run. For richer footnote content (extra paragraphs, formatting, tables), use +the returned |Footnote| object — see :ref:`footnotes`. + + Applying a character style -------------------------- @@ -326,3 +377,110 @@ the same result as the lines above:: run.style = 'Emphasis' As with a paragraph style, the style name is as it appears in the Word UI. + + +Adding a comment +---------------- + +Word comments annotate a range of runs with a side-margin note carrying an +author, initials, and timestamp. The fork adds :meth:`Document.add_comment`, +which takes either a single run or a sequence of runs as the comment's +anchor. Only the first and last run of a sequence are used to delimit the +range, so ``paragraph.runs`` is a convenient input:: + + from docx import Document + + document = Document() + paragraph = document.add_paragraph('The rain in Spain falls mainly in the plain.') + + comment = document.add_comment( + paragraph.runs, + text='Check the citation for this claim.', + author='Jane Reviewer', + initials='JR', + ) + print(comment.comment_id) # -> 0 + +The comment text argument handles the common single-sentence case inline. +For richer comment bodies (multiple paragraphs, formatting, replies), drive +the returned |Comment| object with ``.add_paragraph()`` / ``.add_run()`` — +see :ref:`comments`. 
+ + +Searching and replacing text +---------------------------- + +When generating output from a template, a surprisingly large share of the +work is "find this placeholder and swap in a value". |docx| provides +:meth:`Document.search` / :meth:`Document.replace` for top-level body +paragraphs, and :meth:`Document.search_all` / :meth:`Document.replace_all` +which additionally walk tables, headers and footers, footnotes, endnotes, +and comments:: + + from docx import Document + + document = Document() + document.add_paragraph('Hello {{NAME}}, welcome to {{COMPANY}}.') + + replaced = document.replace_all('{{NAME}}', 'Ada') + replaced += document.replace_all('{{COMPANY}}', 'Analytical Engines Ltd.') + print(replaced) # -> 2 + +Both methods preserve the run formatting of the first matched character, so +bold or styled placeholders keep their look after substitution. Regex +variants (:meth:`Document.replace_regex`, :meth:`Document.replace_regex_all`) +are available for pattern-based work; see :ref:`search_replace`. + + +Reading tracked changes +----------------------- + +When a document has been edited with *Track Changes* turned on, Word records +each insertion, deletion, and move as a revision element inside the affected +paragraph. |docx| exposes those as :attr:`Paragraph.tracked_changes`, a list +of |TrackedChange| proxies carrying the author, date, and inserted/deleted +text:: + + from docx import Document + + document = Document('reviewed.docx') + + for paragraph in document.paragraphs: + for change in paragraph.tracked_changes: + print(change.type, change.author, repr(change.text)) + +Once you've inspected the revisions, resolve them in bulk with +:meth:`Document.accept_all_changes` or :meth:`Document.reject_all_changes`, +which flatten ``w:ins`` / ``w:del`` / ``w:*Change`` markup into plain +content. See :ref:`track_changes` for move-revision pairing and formatting +changes. 
+ + +Computing a stable paragraph ID +------------------------------- + +Word does not attach a durable identifier to paragraphs, which makes it +awkward to correlate the same paragraph across a save/reload cycle. +:attr:`Paragraph.stable_id` computes a 16-character hex digest derived from +the paragraph's ``w:rsidR``, its position within its parent, and its text +content — so it survives a round-trip as long as the paragraph keeps the +same position and text:: + + import io + from docx import Document + + document = Document() + paragraph = document.add_paragraph('Lorem ipsum dolor sit amet.') + before = paragraph.stable_id + + buffer = io.BytesIO() + document.save(buffer) + buffer.seek(0) + + reloaded = Document(buffer) + assert reloaded.paragraphs[0].stable_id == before + +The value is recomputed on each access and is never persisted on the +element, so editing the paragraph's text or moving it to a different parent +will change the result. Treat ``stable_id`` as a within-session correlator, +not a permanent document ID. diff --git a/docs/user/search.rst b/docs/user/search.rst new file mode 100644 index 000000000..f78dca3ab --- /dev/null +++ b/docs/user/search.rst @@ -0,0 +1,188 @@ +.. _search_replace: + +Searching and Replacing Text +============================ + +Word's *Find* and *Replace* dialogs let a user locate text in a document and, optionally, +swap it for something else. *python-docx* exposes a similar capability programmatically +through a small set of methods on the |Document| object and a helper class called +|SearchMatch|. + +The feature is designed around three axes: + +- **Plain text vs regular expression.** ``search`` / ``replace`` match a literal + substring; ``search_regex`` / ``replace_regex`` accept a Python regular expression + (either a string or a pre-compiled :class:`re.Pattern`). 
+- **Body-only vs every story.** The bare names (``search``, ``replace``, + ``search_regex``, ``replace_regex``) look only at top-level paragraphs of the + document body. The ``*_all`` variants (``search_all``, ``replace_all``, + ``search_regex_all``, ``replace_regex_all``) additionally walk body tables, each + section's non-inherited headers and footers, footnotes, endnotes, and comments. +- **Query vs mutate.** The ``search*`` methods are pure queries — they return a list + of |SearchMatch| objects. The ``replace*`` methods return the number of replacements + performed and mutate the document in place. + + +Plain text search +----------------- + +Call :meth:`.Document.search` with a literal substring to scan the body paragraphs:: + + >>> from docx import Document + >>> document = Document("example.docx") + >>> matches = document.search("Invoice") + >>> len(matches) + 3 + >>> matches[0] + <...SearchMatch...> + +By default, matching is case-sensitive and unanchored. The two optional flags +follow Word's Find dialog conventions:: + + >>> document.search("invoice", case_sensitive=False) # also matches "Invoice" + >>> document.search("Total", whole_word=True) # skips "SubTotal" etc. + +Passing an empty string returns an empty list rather than raising. + + +Regular-expression search +------------------------- + +Use :meth:`.Document.search_regex` when a literal substring is not expressive enough. +The ``pattern`` argument may be a string or an already-compiled :class:`re.Pattern`:: + + >>> import re + >>> document.search_regex(r"INV-\d+") + [<...SearchMatch...>, <...SearchMatch...>] + >>> document.search_regex(re.compile(r"\bID: [A-F0-9]+\b", re.IGNORECASE)) + +When ``pattern`` is a string, any ``flags`` you pass are applied at compile time. When +``pattern`` is already compiled, ``flags`` is silently ignored — the existing +compiled flags win. Note that this differs from :func:`re.search`, which raises +:class:`ValueError` when given both a compiled pattern and non-zero flags. + +Zero-width matches (e.g. 
``r"^"`` or lookarounds) are reported by ``search_regex``, +but they are *skipped* by ``replace_regex`` because there is no obvious run to host +an empty replacement. + + +The SearchMatch object +---------------------- + +Every hit is returned as a |SearchMatch| carrying: + +- :attr:`~.SearchMatch.paragraph` — the |Paragraph| that contains the match. +- :attr:`~.SearchMatch.paragraph_index` — the paragraph's index within its *story*. + For body-only searches, this is the index into ``document.paragraphs``. For + cross-story searches, it is the index into the paragraph list of the specific + story identified by :attr:`~.SearchMatch.location`. +- :attr:`~.SearchMatch.run_indices` — a sorted list of run indices that overlap + the match. A match that lives in a single run reports ``[n]``; one that spans + several has ``[n, n+1, ...]``. +- :attr:`~.SearchMatch.start` / :attr:`~.SearchMatch.end` — character offsets + into ``paragraph.text`` (the reconstructed plain-text form), using Python + half-open interval semantics. ``paragraph.text[match.start : match.end]`` + reproduces the matched text. +- :attr:`~.SearchMatch.location` — story identifier, populated by the ``_all`` + helpers and |None| for body-only searches. See below. + + +Matches that span several runs +------------------------------ + +Word stores runs of text with uniform formatting. A single paragraph that reads +"the **quick** brown fox" is three runs: ``"the "``, ``"quick"``, and +``" brown fox"``. A search term like ``"e qui"`` therefore spans two runs. + +*python-docx* handles this transparently: + +- During *search*, the match is reported once with + :attr:`~.SearchMatch.run_indices` listing every run it crosses. +- During *replace*, the replacement text is written into the **first** run of the + span (inheriting that run's formatting), and any matched characters in + subsequent runs are trimmed away. 
Fully-consumed middle runs are left in + place as empty runs so their formatting still exists for Word if needed. + +That last point is important for preserving bold/italic/color applied inside the +match: whichever formatting the *first* matched character had is what the +replacement text will inherit. + + +Replacing text +-------------- + +:meth:`.Document.replace` mutates the document in place and returns the number of +replacements made:: + + >>> n = document.replace("SpamCo", "EggCorp") + >>> n + 4 + >>> document.save("example.docx") + +The same ``case_sensitive`` and ``whole_word`` flags available on +:meth:`~.Document.search` are honoured here. Passing ``old_text=""`` returns +``0`` without touching the document. + +:meth:`.Document.replace_regex` follows :func:`re.sub` semantics for the +``replacement`` argument: backreferences such as ``\1`` or ``\g<name>`` are +expanded per match:: + + >>> document.replace_regex(r"INV-(\d+)", r"Invoice #\1") + 2 + + +Searching every story with ``*_all`` +------------------------------------ + +:meth:`~.Document.search` and :meth:`~.Document.replace` only consider the top-level +body paragraphs. Use the ``*_all`` variants to reach content that Word treats as +separate streams: + +- the document body, tagged ``"body"``; +- paragraphs inside body-level tables, tagged + ``"table:<n>:row:<r>:col:<c>"``; +- each section's *primary*, *even-page*, and *first-page* headers and footers + (unless the section inherits the previous one's, in which case the inherited + definition is visited only once), tagged + ``"header:section<N>:primary"``, ``"footer:section<N>:even_page"``, etc.; +- footnote paragraphs, tagged ``"footnote:<id>"``; +- endnote paragraphs, tagged ``"endnote:<id>"``; +- comment paragraphs, tagged ``"comment:<id>"``. 
+ +:: + + >>> matches = document.search_all("Confidential") + >>> {m.location for m in matches} + {'body', 'header:section0:primary', 'footnote:2'} + +Tables nested inside *other* stories (a table inside a header, or a table inside +a body-table cell) are not recursively descended; the top-level cell text is +still searchable but doubly-nested tables are skipped. This matches the +invariant documented for :func:`docx.search._iter_all_paragraphs`. + +The regex and replace variants behave identically with respect to story +coverage:: + + >>> document.search_regex_all(r"\bTODO\b") + >>> document.replace_all("v1.0", "v2.0") + >>> document.replace_regex_all(r"\bDRAFT\b", r"FINAL") + +The replace variants return the *total* number of replacements performed across +every story. + + +Working directly on a paragraph list +------------------------------------ + +The four :mod:`docx.search` module-level functions — +:func:`~docx.search.search_paragraphs`, +:func:`~docx.search.search_paragraphs_regex`, +:func:`~docx.search.replace_in_paragraphs`, and +:func:`~docx.search.replace_in_paragraphs_regex` — take a ``list[Paragraph]`` +directly. They are handy when you already have a subset of paragraphs (for +example, the paragraphs inside one table cell) and only want to operate on +those:: + + >>> from docx.search import search_paragraphs + >>> cell = document.tables[0].cell(1, 1) + >>> search_paragraphs(list(cell.paragraphs), "TBD") + [<...SearchMatch...>] diff --git a/docs/user/sections-advanced.rst b/docs/user/sections-advanced.rst new file mode 100644 index 000000000..435df568a --- /dev/null +++ b/docs/user/sections-advanced.rst @@ -0,0 +1,337 @@ +.. _sections-advanced: + +Advanced Section Features +========================= + +The :ref:`sections` page covers the section properties that appear on every +Word document — page size, margins, orientation, section start type, and the +primary header/footer pair. 
This page covers the less-common *per-section* +settings that |docx| also exposes: + +* page borders, +* line numbering, +* printer paper-source hints, +* the East Asian document grid, +* text direction and right-to-left flow, +* distinct odd/even and first-page header/footer definitions, and +* multi-column page layout. + +Every feature documented here lives on the |Section| object. Unless otherwise +noted, all setters are safe to call on a freshly-opened section without first +ensuring the underlying XML element exists — |docx| creates and removes the +element as needed. + + +Page borders +------------ + +.. currentmodule:: docx.section + +Word can draw a decorative border around the printable area of each page in a +section. The :attr:`Section.page_borders` property returns a |PageBorders| +proxy exposing the four edges (``top``, ``bottom``, ``left``, ``right``) plus +the ``display`` and ``offset_from`` attributes. Each edge is a |PageBorder| +object with ``style``, ``width``, ``color``, and ``space`` attributes:: + + >>> from docx import Document + >>> from docx.enum.section import WD_BORDER_DISPLAY, WD_BORDER_OFFSET_FROM + >>> from docx.enum.text import WD_BORDER_STYLE + >>> from docx.shared import Pt, RGBColor + + >>> document = Document() + >>> section = document.sections[0] + >>> section.page_borders.top.style # no border defined + None + +The :meth:`Section.set_page_border` convenience method writes every +attribute of a single edge in one call. Any argument left as ``None`` leaves +the corresponding XML attribute untouched:: + + >>> section.set_page_border( + ... "top", + ... style=WD_BORDER_STYLE.DOUBLE, + ... width=Pt(1.5), + ... color=RGBColor(0x00, 0x66, 0xCC), + ... space=Pt(24), + ... 
) + + +Individual attributes can also be assigned directly on the edge proxy:: + + >>> section.page_borders.bottom.style = WD_BORDER_STYLE.SINGLE + >>> section.page_borders.bottom.width = Pt(1) + +Assigning ``None`` to any attribute clears it; this is how you "reset" an +edge without deleting the whole |PageBorders| element:: + + >>> section.page_borders.top.color = None + +The |PageBorders| proxy also carries two whole-element attributes: + +* :attr:`PageBorders.display` — a :ref:`WdBorderDisplay` member specifying + on which pages the border is drawn (``ALL_PAGES``, ``FIRST_PAGE``, + ``NOT_FIRST_PAGE``); +* :attr:`PageBorders.offset_from` — a :ref:`WdBorderOffsetFrom` member + specifying whether the ``space`` attribute on each edge is measured from + the text extents (``TEXT``) or from the page edge (``PAGE``). + +:: + + >>> section.page_borders.display = WD_BORDER_DISPLAY.ALL_PAGES + >>> section.page_borders.offset_from = WD_BORDER_OFFSET_FROM.PAGE + +To remove every page-border definition from a section, call +:meth:`Section.remove_page_borders`. The call is a no-op when the section +has no borders defined:: + + >>> section.remove_page_borders() + + +Line numbering +-------------- + +Line numbers are displayed in the margin alongside each numbered line. They +are commonly used in legal documents and screenplays. +:attr:`Section.line_numbering` returns a |LineNumbering| proxy when the +section has a ``<w:lnNumType>`` element, or ``None`` when it does not:: + + >>> from docx.enum.section import WD_LINE_NUMBERING_RESTART + >>> from docx.shared import Pt + + >>> section.line_numbering + None + +Use :meth:`Section.set_line_numbering` to create or update the element. +Arguments left as ``None`` leave any existing attribute unchanged:: + + >>> ln = section.set_line_numbering( + ... count_by=1, # every line + ... start=1, + ... distance=Pt(20), # 20pt from the text + ... restart=WD_LINE_NUMBERING_RESTART.NEW_PAGE, + ... 
) + >>> ln.count_by, ln.start, ln.restart + (1, 1, NEW_PAGE (2)) + +The four attributes are also individually settable after the fact:: + + >>> section.line_numbering.count_by = 5 # only every 5th line + >>> section.line_numbering.restart = WD_LINE_NUMBERING_RESTART.CONTINUOUS + +To turn line numbering off for a section, call +:meth:`Section.remove_line_numbering`. The call is a no-op when the section +has no line numbering defined:: + + >>> section.remove_line_numbering() + >>> section.line_numbering is None + True + + +Paper source (printer tray) +--------------------------- + +Word exposes a printer-tray hint on each section so that, for example, the +first sheet of a multi-page document can be drawn from a letterhead tray +while the remaining sheets come from a standard-paper tray. |docx| surfaces +the hint as two properties on |Section|: + +* :attr:`Section.first_page_paper_source` — the tray number used for the + first page of the section; +* :attr:`Section.other_pages_paper_source` — the tray number used for + subsequent pages. + +Both return ``int`` or ``None``:: + + >>> section.first_page_paper_source, section.other_pages_paper_source + (None, None) + >>> section.first_page_paper_source = 7 + >>> section.other_pages_paper_source = 15 + >>> section.first_page_paper_source, section.other_pages_paper_source + (7, 15) + +Clearing a value by assigning ``None`` removes the underlying XML attribute. +When both values are cleared the enclosing ``<w:paperSrc>`` element is +removed from the ``<w:sectPr>``:: + + >>> section.first_page_paper_source = None + >>> section.other_pages_paper_source = None + +.. note:: + Tray numbers are printer-specific. Word doesn't validate the value against + any printer's supported bins — the integer is carried through and passed + to the printer driver at print time. 
+ + +East Asian document grid +------------------------ + +The ``<w:docGrid>`` element controls the East Asian character grid for a +section: whether text is laid out against a grid of lines, or a grid of both +lines and characters, and what the pitch of that grid is. + +:attr:`Section.document_grid` returns a |DocumentGrid| proxy or ``None``:: + + >>> from docx.enum.section import WD_DOC_GRID_TYPE + + >>> dg = section.document_grid + >>> dg.type, dg.line_pitch, dg.char_space + (None, 360, None) + +:meth:`Section.set_document_grid` creates or updates the element:: + + >>> section.set_document_grid( + ... type=WD_DOC_GRID_TYPE.LINES_AND_CHARS, + ... line_pitch=312, + ... char_space=0, + ... ) + + +:meth:`Section.remove_document_grid` deletes the element entirely. Typical +Western-language documents do not need a document grid; the default template +created by ``Document()`` already writes a minimal ``<w:docGrid>`` carrying +only ``linePitch``. + + +Text direction and right-to-left +-------------------------------- + +Two properties on |Section| together control text-flow direction: + +* :attr:`Section.text_direction` — a :ref:`WdTextDirection` member or + ``None``. Maps to the ``<w:textDirection>`` child of ``<w:sectPr>``. Use + this to rotate section body text 90° for East Asian vertical layouts. +* :attr:`Section.right_to_left` — ``True`` when this section flows + right-to-left (e.g. for Arabic or Hebrew body text). Maps to the + ``<w:bidi>`` child. + +:: + + >>> from docx.enum.table import WD_TEXT_DIRECTION + + >>> section.text_direction + None + >>> section.right_to_left + False + >>> section.text_direction = WD_TEXT_DIRECTION.TB_RL + >>> section.right_to_left = True + >>> section.text_direction, section.right_to_left + (TB_RL (1), True) + +Assigning ``None`` to :attr:`Section.text_direction` removes the +``<w:textDirection>`` element. Assigning ``False`` or ``None`` to +:attr:`Section.right_to_left` removes the ``<w:bidi>`` element. + +.. 
note:: + :attr:`Section.right_to_left` is orthogonal to the individual paragraph + or run *bidi* settings. Setting it ``True`` affects the default column + order, gutter placement, and default paragraph direction for the whole + section; run-level and paragraph-level RTL settings still apply on top. + + +Odd, even, and first-page headers & footers +------------------------------------------- + +Every section carries three pairs of header/footer slots: + +================ ======================================================== +``header`` primary — used for every page unless overridden +``first_page_*`` used for the first page of the section when enabled +``even_page_*`` used for even-numbered pages when enabled +================ ======================================================== + +Each slot is a |_Header| or |_Footer| proxy accessed through the +corresponding :class:`Section` property (:attr:`Section.header`, +:attr:`Section.first_page_header`, :attr:`Section.even_page_header`, plus +the ``_footer`` variants). + +Two toggles control whether the non-primary slots are honored by Word: + +* :attr:`Section.different_first_page_header_footer` is a **per-section** + flag mapped to ``<w:titlePg>``. It enables the *first-page* header and + footer for only the section it is set on. +* :attr:`Section.different_odd_and_even_pages_header_footer` is a + **document-level** flag mapped to ``<w:evenAndOddHeaders>`` in the + settings part. Setting it affects every section in the document. It is + surfaced on |Section| purely for discoverability — any section exposes + the same underlying document-wide value. 
+ +:: + + >>> section.different_first_page_header_footer = True + >>> section.first_page_header.paragraphs[0].text = "First-page header" + >>> section.first_page_footer.paragraphs[0].text = "First-page footer" + + >>> section.different_odd_and_even_pages_header_footer = True + >>> section.even_page_header.paragraphs[0].text = "Even-page header" + >>> section.even_page_footer.paragraphs[0].text = "Even-page footer" + + >>> section.header.paragraphs[0].text = "Odd-page header" + >>> section.footer.paragraphs[0].text = "Odd-page footer" + +Like the primary header/footer, each slot's ``is_linked_to_previous`` +property controls whether the slot inherits its content from the +corresponding slot in the preceding section. Setting ``is_linked_to_previous += False`` creates an empty definition in this section that you can then +populate; setting it ``True`` drops the definition (if any) so the slot +inherits again. + + +Multi-column layout +------------------- + +The :attr:`Section.columns` property returns a |SectionColumns| proxy +backed by the ``<w:cols>`` element. It behaves like a sequence of +|Column| objects and also carries three whole-element attributes: + +* :attr:`SectionColumns.count` — number of columns (defaults to 1); +* :attr:`SectionColumns.equal_width` — ``True`` when every column has the + same width (defaults to ``True``); +* :attr:`SectionColumns.space` — the gutter between columns when they are + equal-width. + +A brand-new section has no ``<w:cols>`` element and reports a single +column:: + + >>> from docx.shared import Inches, Pt + + >>> cols = section.columns + >>> cols.count, cols.equal_width, cols.space, len(cols) + (1, True, None, 0) + +To lay out three equal-width columns with an 18-point gutter, just assign +the three attributes:: + + >>> cols.count = 3 + >>> cols.equal_width = True + >>> cols.space = Pt(18) + +Unequal columns are expressed as a sequence of explicit ``<w:col>`` +children. 
Set ``equal_width`` to ``False`` and then populate the sequence +individually:: + + >>> cols.count = 2 + >>> cols.equal_width = False + >>> cols[0].width = Inches(2.5) + >>> cols[0].space = Inches(0.5) + >>> cols[1].width = Inches(4.0) + +Each :class:`Column` exposes two properties — :attr:`Column.width` and +:attr:`Column.space` — both of which accept |Length| values or ``None``. + +.. note:: + The ``<w:cols>`` element does not *store* an explicit + ``<w:col>`` child for equal-width columns; Word computes per-column + widths from ``count`` and the section's content width. Adding ``<w:col>`` + children only makes sense together with ``equal_width = False``. + + +API reference +------------- + +The classes used on this page are documented in the :doc:`../api/section` +reference. The enumerations are documented in +:doc:`../api/enum/WdBorderDisplay`, :doc:`../api/enum/WdBorderOffsetFrom`, +:doc:`../api/enum/WdBorderStyle`, +:doc:`../api/enum/WdLineNumberingRestart`, +:doc:`../api/enum/WdDocGridType`, and +:doc:`../api/enum/WdTextDirection`. diff --git a/docs/user/shapes.rst b/docs/user/shapes.rst index 5dcefbf61..5b8a917be 100644 --- a/docs/user/shapes.rst +++ b/docs/user/shapes.rst @@ -25,3 +25,7 @@ issue tracker. The ``Document.add_picture()`` method adds a specified picture to the end of the document in a paragraph of its own. However, by digging a little deeper into the API you can place text on either side of the picture in its paragraph, or both. + +For fork-era additions — floating (anchored) images, DrawingML preset shapes, +group shapes, text-frame content, ink annotations, embedded OLE objects, +accessibility alt text and titles, and SVG input — see :doc:`drawing`. diff --git a/docs/user/statistics.rst b/docs/user/statistics.rst new file mode 100644 index 000000000..56df829c2 --- /dev/null +++ b/docs/user/statistics.rst @@ -0,0 +1,104 @@ +.. 
_statistics: + +Document statistics +=================== + +Word's **Review > Word Count** dialog summarizes the text content of a +document with four figures: *Pages*, *Words*, *Characters (no spaces)*, and +*Characters (with spaces)*, plus a *Paragraphs* count. |docx| provides an +equivalent summary through :attr:`.Document.statistics`, which returns a +|DocumentStatistics| named tuple. + +Page counts are deliberately omitted because they depend on Word's pagination +engine, which does not run when a document is authored programmatically. +Everything else is computed directly from the body XML without requiring Word +to open the file. + + +Accessing the statistics +------------------------ + +:: + + >>> from docx import Document + >>> document = Document("report.docx") + >>> stats = document.statistics + >>> stats + DocumentStatistics(paragraphs=42, words=3128, characters=19204, + characters_no_spaces=16091) + +The returned object is a |DocumentStatistics|, which is a +:class:`collections.namedtuple` subclass. Callers can destructure it directly:: + + paragraphs, words, characters, characters_no_spaces = document.statistics + +Or access fields by name:: + + print(f"{document.statistics.words} words") + + +Field reference +--------------- + +``paragraphs`` + The count of non-empty body paragraphs. A paragraph is considered + non-empty when it contains at least one non-whitespace character. This + matches Word's behavior of excluding the "spacing" paragraphs that + consist solely of whitespace or are entirely empty. The equivalent in + Word's Word Count dialog is labeled **Paragraphs**. + +``words`` + The count of whitespace-delimited tokens in the body text. A "word" is + defined with :meth:`str.split` semantics — any run of non-whitespace + characters surrounded by whitespace or string boundaries counts as one + token. This corresponds to **Words** in Word's dialog. 
+ +``characters`` + The total count of characters in the body text, *including* spaces and + other whitespace. This corresponds to **Characters (with spaces)** in + Word's dialog. + +``characters_no_spaces`` + The total count of characters in the body text, *excluding* any + whitespace (spaces, tabs, and line breaks). This corresponds to + **Characters (no spaces)** in Word's dialog. + + +Scope of the counts +------------------- + +Only the main document story (the ``w:body``) is inspected. Text in headers, +footers, footnotes, endnotes, and comments is *not* included in the counts. +This matches Word's default "Word Count" behavior. Paragraphs nested inside +tables or block-level structured-document tags (content controls) *are* +included, because those paragraphs are part of the body story. + +Because :attr:`.Document.statistics` is a read-only property, each access +recomputes the counts from the current state of the document. It is therefore +safe to call before and after content edits to observe how a change affects +the overall word or character count. + + +Building a simple report +------------------------ + +A typical use is producing a short summary for a pipeline log:: + + stats = document.statistics + print( + f"{stats.paragraphs:>5} paragraphs\n" + f"{stats.words:>5} words\n" + f"{stats.characters:>5} characters\n" + f"{stats.characters_no_spaces:>5} characters (no spaces)" + ) + +Or enforcing an editorial policy — for instance, rejecting a submission that +falls below a minimum word count:: + + MIN_WORDS = 500 + if document.statistics.words < MIN_WORDS: + raise SystemExit(f"Document must contain at least {MIN_WORDS} words") + +The underlying helper, :func:`docx.statistics.compute_statistics`, accepts a +``w:body`` element directly and is useful when you need the same counts for +something other than a top-level |Document|. 
diff --git a/docs/user/styles-using.rst b/docs/user/styles-using.rst index 93dd7a344..d952f408c 100644 --- a/docs/user/styles-using.rst +++ b/docs/user/styles-using.rst @@ -23,6 +23,14 @@ name:: >>> styles['Normal'] +.. note:: Looking up a style name that is not defined in the document raises + |KeyError|. This includes built-in styles that Word knows about but which + have not yet been materialized in the document's ``styles.xml`` part (so + called *latent styles*). If you need to use a latent style, define it + first via :meth:`~docx.styles.styles.Styles.add_style` or promote the + latent definition. Catch |KeyError| if you want to fall back to a default + style when the requested one is missing. (upstream#170) + .. note:: Built-in styles are stored in a WordprocessingML file using their English name, e.g. 'Heading 1', even though users working on a localized version of Word will see native language names in the UI, e.g. 'Kop 1'. diff --git a/docs/user/tables-advanced.rst b/docs/user/tables-advanced.rst new file mode 100644 index 000000000..b1126bc9b --- /dev/null +++ b/docs/user/tables-advanced.rst @@ -0,0 +1,278 @@ +.. _tables_advanced: + +Advanced Table Formatting +========================= + +The :ref:`tables` chapter covers the basics of creating a table, populating +it with text, and iterating over its rows and columns. This chapter covers +the *formatting* capabilities that were added to *python-docx* by the fork's +Phase D work: borders, shading, cell margins, autofit layout, row +properties, table-style conditional flags, cell text direction, and merge +introspection. + +All of the proxies documented here are accessed off an existing |Table|, +|_Row|, or |_Cell|. They are created lazily and assigning a value to any +property writes the appropriate ``w:tblPr``, ``w:tcPr``, or ``w:trPr`` +child on demand. + + +Borders +------- + +Both tables and cells expose a *borders* proxy. 
On a |Table|, the proxy +covers six edges — ``top``, ``bottom``, ``left``, ``right``, ``inside_h`` +(the horizontal rules between rows), and ``inside_v`` (the vertical rules +between columns). On a |_Cell|, only the four outer edges are available. + +Each edge is a :class:`~docx.table.BorderElement` with three read/write +properties: ``style`` (a :class:`~docx.enum.table.WD_BORDER_STYLE` member), +``width`` (a |Length|), and ``color`` (an |RGBColor|). A fourth property, +``space``, controls the gap between the border and the cell content. + +Reading a border edge when nothing has been set yields ``None`` on every +property:: + + >>> from docx import Document + >>> document = Document() + >>> table = document.add_table(rows=3, cols=3) + >>> table.borders.top.style + >>> table.borders.top.style is None + True + +Assigning a value to any property creates the underlying ``w:tblBorders`` +(or ``w:tcBorders``) element and the specific edge on demand:: + + >>> from docx.enum.table import WD_BORDER_STYLE + >>> from docx.shared import Pt, RGBColor + >>> table.borders.top.style = WD_BORDER_STYLE.SINGLE + >>> table.borders.top.width = Pt(0.5) + >>> table.borders.top.color = RGBColor(0x00, 0x00, 0x00) + +To clear an edge, assign ``None``:: + + >>> table.borders.top.style = None + +The :meth:`.Table.set_borders` convenience method lets you apply a +consistent border treatment across several edges in one call. It is +particularly handy for the APA-7 "horizontal-only" table style:: + + >>> table.set_borders(top=True, bottom=True, inside_h=True) + +``set_borders`` always writes to all six edges: edges passed as ``True`` +are set to the supplied style/width/color (defaulting to +``WD_BORDER_STYLE.SINGLE``, ``Pt(0.5)``, and black), while those left as +``False`` are explicitly set to ``WD_BORDER_STYLE.NONE`` so the table +style's defaults do not show through. 
+ +Cell-level borders work the same way and override the table-level values +for that one cell:: + + >>> cell = table.cell(0, 0) + >>> cell.borders.left.style = WD_BORDER_STYLE.THICK + >>> cell.borders.left.width = Pt(1) + >>> cell.borders.left.color = RGBColor(0xFF, 0x00, 0x00) + + +Cell shading +------------ + +The :attr:`._Cell.shading` property returns a +:class:`~docx.table.CellShading` proxy with two properties: ``fill_color`` +(an |RGBColor|) and ``pattern`` (a +:class:`~docx.enum.table.WD_SHADING_PATTERN` member). Setting +``fill_color`` is all that's required for the common "solid background +color" case:: + + >>> cell = table.cell(0, 0) + >>> cell.shading.fill_color = RGBColor(0xCC, 0xFF, 0xAA) + >>> cell.shading.pattern + + +When ``fill_color`` is assigned without an explicit ``pattern``, +``WD_SHADING_PATTERN.CLEAR`` is written as the pattern value (this is the +Word default and is what tells Word to render the fill color as a solid +background). Assigning ``None`` to ``fill_color`` removes the attribute +without disturbing ``pattern``, and vice versa. + + +Per-cell margins +---------------- + +Every cell inherits its padding from the table defaults, but individual +cells can override each edge by assigning to :attr:`._Cell.margins`:: + + >>> from docx.shared import Inches + >>> cell.margins.top = Inches(0.05) + >>> cell.margins.start = Inches(0.08) + +The four edges are ``top``, ``bottom``, ``start`` (leading edge), and +``end`` (trailing edge). Reading an edge that has no explicit override +returns ``None``, not the table default. + +Two convenience methods on |_Cell| round out the API: + +* :meth:`._Cell.set_margins` writes only the edges you pass; edges you + omit are left untouched:: + + >>> cell.set_margins(top=Inches(0.05), end=Inches(0.08)) + +* :meth:`._Cell.remove_margins` clears the ``w:tcMar`` element entirely, + restoring full inheritance from the table defaults. 
+ +Assigning ``None`` to an individual edge removes just that edge, and when +the last edge is cleared the empty ``w:tcMar`` is removed automatically +to keep the XML tidy. + + +Table autofit layout +-------------------- + +OOXML distinguishes two interacting concepts that together decide how +column widths behave: ``w:tblLayout`` (``fixed`` vs. ``autofit``) and +``w:tblW`` (the preferred total width, which may be ``dxa``, ``pct``, or +``auto``). *python-docx* exposes three complementary properties on |Table|: + +* :attr:`.Table.autofit_behavior` — a tri-state + :class:`~docx.enum.table.WD_TABLE_AUTOFIT` enum that combines both + concerns into a single, intention-revealing setter. +* :attr:`.Table.allow_autofit` — a narrow boolean view of the + ``w:tblLayout`` child. Writing ``True`` removes any explicit + ``w:tblLayout``; writing ``False`` writes ``w:type="fixed"``. +* :attr:`.Table.preferred_width` — the total table width as a |Length| + (mapping to ``w:tblW`` with ``@w:type="dxa"``), or ``None`` when the + preferred width is absent or expressed as a percentage. + +The three :class:`~docx.enum.table.WD_TABLE_AUTOFIT` members map as +follows:: + + FIXED_WIDTH — w:tblLayout/@w:type="fixed" is written. + AUTOFIT_TO_CONTENTS — no w:tblLayout; w:tblW set to "auto". + AUTOFIT_TO_WINDOW — no w:tblLayout; w:tblW set to "5000 pct" + (i.e. 100% of the window). 
+ +Typical usage:: + + >>> from docx.enum.table import WD_TABLE_AUTOFIT + >>> from docx.shared import Inches + >>> table.autofit_behavior = WD_TABLE_AUTOFIT.FIXED_WIDTH + >>> table.preferred_width = Inches(4) + +If all you care about is flipping the ``w:tblLayout`` bit without +touching the preferred width, use ``allow_autofit`` directly:: + + >>> table.allow_autofit = False # fixed layout, w:tblW untouched + + +Row properties +-------------- + +Three row-level properties are most likely to matter when laying out a +table for print: + +* :attr:`._Row.height` and :attr:`._Row.height_rule` — the row's + minimum/exact height in EMU and whether it is a minimum (``AT_LEAST``), + exact (``EXACT``), or unconstrained (``AUTO``) value. Either property + reads as ``None`` when no explicit value is set. + +* :attr:`._Row.allow_break_across_pages` — when ``False``, the row cannot + split across a page break; Word will push the entire row to the next + page instead. Defaults to ``True``. + +* :attr:`._Row.is_header` — when ``True``, the row repeats at the top of + each page the table spans. Only the first N consecutive rows can be + header rows (a Word limitation). + +Example: mark the first row as a repeating header and keep every row +intact across page breaks:: + + >>> header_row = table.rows[0] + >>> header_row.is_header = True + >>> for row in table.rows: + ... row.allow_break_across_pages = False + + +Table style conditional flags +----------------------------- + +Table styles can define different formatting for the first row, last row, +first column, last column, banded rows, and banded columns. Which of those +conditional formats get applied is controlled by six flags on the table's +``w:tblLook`` element, exposed by :attr:`.Table.style_flags`: + +* ``first_row``, ``last_row``, ``first_column``, ``last_column`` — enable + the matching conditional formatting from the table style. +* ``no_horizontal_banding``, ``no_vertical_banding`` — *suppress* banding. 
+ That is, ``no_horizontal_banding == False`` means banded rows are + active. + +When ``w:tblLook`` is absent, every flag reads as ``False``. Writing any +flag creates ``w:tblLook`` on demand:: + + >>> flags = table.style_flags + >>> flags.first_row = True + >>> flags.first_column = True + >>> flags.first_row + True + >>> flags.no_horizontal_banding + False + +Banded rows are the Word default, so you typically only need to touch +``no_horizontal_banding`` to *suppress* row banding on a style that +normally provides it. + + +Cell text direction +------------------- + +:attr:`._Cell.text_direction` takes a member of +:class:`~docx.enum.table.WD_TEXT_DIRECTION`. The two most common values +for rotated-heading cells are ``TB_RL`` (text reads top-to-bottom, +rotated 90 degrees clockwise) and ``BT_LR`` (bottom-to-top, rotated 90 +degrees counter-clockwise):: + + >>> from docx.enum.table import WD_TEXT_DIRECTION + >>> heading = table.cell(0, 0) + >>> heading.text_direction = WD_TEXT_DIRECTION.TB_RL + +Reading the property when no explicit direction is set returns ``None``. +Assigning ``None`` removes the ``w:textDirection`` child, restoring +inheritance. + + +Merged-cell introspection +------------------------- + +Two properties on |_Cell| make it possible to work with merged regions +without dropping to the XML layer: + +* :attr:`._Cell.is_merge_origin` is a tri-state ``bool | None``: + + * ``None`` — the cell is not part of any merged region. + * ``True`` — the cell is the *origin* (top-left) of a merged region + (either ``w:vMerge/@w:val="restart"`` or a horizontal-only span with + ``w:gridSpan > 1``). + * ``False`` — the cell is a *continuation* of a vertically merged + region (``w:vMerge`` without an explicit ``@w:val="restart"``). + +* :attr:`._Cell.merge_origin` walks up any ``w:vMerge`` continuations + and returns the cell containing the actual content of the merge. If + the cell is already the origin (or not merged), it returns itself. 
+ +Example: collect the distinct content cells of a table, ignoring +continuations of vertical spans:: + + >>> seen = set() + >>> content_cells = [] + >>> for row in table.rows: + ... for cell in row.cells: + ... origin = cell.merge_origin + ... key = id(origin._tc) + ... if key in seen: + ... continue + ... seen.add(key) + ... content_cells.append(origin) + +Accessing cells via :meth:`.Table.cell` already resolves continuations +for you — the returned |_Cell| is always the origin cell. The raw +``w:tc`` elements surface only when you iterate over +``row._tr.tc_lst`` directly. diff --git a/docs/user/text-advanced.rst b/docs/user/text-advanced.rst new file mode 100644 index 000000000..5a9938fe7 --- /dev/null +++ b/docs/user/text-advanced.rst @@ -0,0 +1,313 @@ +.. _text_advanced: + +Advanced Text Formatting +======================== + +This guide covers text-formatting features that go beyond the ``bold`` / +``italic`` / ``underline`` basics described in :doc:`text`. +Everything here lives on the |Font| object exposed by ``run.font`` or on the +|ParagraphFormat| exposed by ``paragraph.paragraph_format``. + +All properties described here follow the same tri-state convention used +elsewhere in *python-docx*: reading |None| means the attribute is absent and +the effective value is inherited from the style hierarchy. Assigning |None| +removes the direct setting so the style default re-applies. + + +Run shading (background color) +------------------------------ + +``Font.shading_color`` provides a read/write |RGBColor| for the *run-level* +background fill. It maps to ``w:rPr/w:shd@w:fill`` with ``w:val="clear"``. It +is different from ``Font.highlight_color``, which selects from a fixed palette +of highlighter colors (``WD_COLOR_INDEX.YELLOW`` etc.) 
and maps to +``w:rPr/w:highlight``:: + + >>> from docx import Document + >>> from docx.shared import RGBColor + >>> from docx.enum.text import WD_COLOR_INDEX + >>> run = Document().add_paragraph().add_run("shaded text") + + >>> run.font.shading_color is None + True + >>> run.font.shading_color = RGBColor(0xFF, 0xFF, 0x00) + >>> run.font.shading_color + RGBColor(0xff, 0xff, 0x00) + + >>> run.font.highlight_color = WD_COLOR_INDEX.BRIGHT_GREEN + >>> run.font.shading_color, run.font.highlight_color + (RGBColor(0xff, 0xff, 0x00), BRIGHT_GREEN (4)) + +Setting ``shading_color`` to |None| removes the ``w:shd`` element entirely. +The two properties are independent, so both may be set on the same run. + + +Run borders +----------- + +Word can draw a box around a single run. The border is controlled by the +``w:rPr/w:bdr`` element and is exposed on |Font| as four symmetrical +properties plus a convenience :meth:`~docx.text.font.Font.remove_border` +method:: + + >>> from docx.enum.text import WD_BORDER_STYLE + >>> from docx.shared import Pt, RGBColor + >>> font = run.font + + >>> font.border_style = WD_BORDER_STYLE.SINGLE + >>> font.border_color = RGBColor(0xFF, 0x00, 0x00) + >>> font.border_width = Pt(1.5) + >>> font.border_space = Pt(4) + +* ``border_style`` — a :ref:`WdBorderStyle` member (``SINGLE``, ``DOUBLE``, + ``DASHED``, ``DOTTED``, and more). +* ``border_color`` — an |RGBColor| or |None|. Reading returns |None| when the + XML stores ``w:color="auto"`` so assigning a real color is distinguishable + from inheritance. +* ``border_width`` — a |Length|. Word stores this as eighth-points; use + |Pt| to get the right units (``Pt(0.5)``, ``Pt(1)``, ``Pt(1.5)`` etc.). +* ``border_space`` — a |Length| controlling the padding between the border + and the text, typically entered in points. + +Assigning |None| to any individual property clears just that attribute +while leaving the others intact. 
To clear the whole border in one call use +:meth:`Font.remove_border`:: + + >>> font.remove_border() + >>> font.border_style, font.border_color, font.border_width, font.border_space + (None, None, None, None) + + +Kerning and character spacing +----------------------------- + +Two closely related Font properties expose character-metric adjustments: + +* ``Font.kerning`` is the *minimum* font size, in points, for which the Word + rendering engine will perform automatic kerning. Set it with |Pt|:: + + >>> font.kerning = Pt(10) + >>> font.kerning.pt + 10.0 + + Assigning |None| removes the ``w:kern`` element. + +* ``Font.character_spacing`` is a fixed horizontal offset between characters + in the run. Positive values expand the tracking, negative values condense + it:: + + >>> font.character_spacing = Pt(1) # wider + >>> font.character_spacing = Pt(-0.5) # tighter + >>> font.character_spacing = None # back to inheritance + + +Language tags and East Asian fonts +---------------------------------- + +A run's ``w:rPr/w:lang`` element carries up to three BCP-47 language tags, +each surfaced as an independent property on |Font|: + +* ``Font.language`` — primary (Latin-script) language, e.g. ``"en-US"``. +* ``Font.east_asian_language`` — East Asian language, e.g. ``"ja-JP"``. +* ``Font.bidi_language`` — complex-script (right-to-left) language, e.g. + ``"ar-SA"``. + +Because all three attributes share the same element, assigning |None| to an +individual property clears only the corresponding attribute. To drop the +entire ``w:lang`` element use :meth:`Font.remove_language`:: + + >>> font.language = "en-US" + >>> font.east_asian_language = "ja-JP" + >>> font.bidi_language = "ar-SA" + >>> font.remove_language() + >>> font.language, font.east_asian_language, font.bidi_language + (None, None, None) + +Each script can also use a different typeface. 
``Font.name`` drives the +primary (ASCII / high-ANSI) face, and ``Font.name_far_east`` sets the East +Asian face that appears in CJK (Chinese / Japanese / Korean) runs. A legacy +alias ``Font.name_east_asia`` is kept for symmetry with ECMA-376 +terminology; both spellings read and write the same attribute:: + + >>> font.name_far_east = "MS Mincho" + >>> font.name_east_asia + 'MS Mincho' + + +East Asian typography +--------------------- + +``Font.east_asian_layout`` returns an |EastAsianLayout| proxy when the run +carries a ``w:rPr/w:eastAsianLayout`` child, or |None| otherwise. The proxy +exposes three booleans and an integer id: + +* ``two_lines_in_one`` — collapses two adjacent characters into a single + double-glyph (``w:combine``). +* ``vertical_alignment`` — rotates the run for vertical layout + (``w:vert``). +* ``compressed`` — when vertical, compress the run (``w:vertCompress``). +* ``id`` — numeric id Word uses to group related layout runs. + +Create or update the element with :meth:`Font.set_east_asian_layout`; drop it +with :meth:`Font.remove_east_asian_layout`:: + + >>> font.set_east_asian_layout(two_lines_in_one=True, id=1) + + >>> font.east_asian_layout.two_lines_in_one + True + + >>> font.remove_east_asian_layout() + >>> font.east_asian_layout is None + True + +Two paragraph-level toggles complete the East Asian story. Both live on +|ParagraphFormat| and are tri-state (|True| / |False| / |None|): + +* ``kinsoku`` (``w:kinsoku``) — apply kinsoku shori line-break rules so that + certain punctuation characters may not begin or end a line. +* ``word_wrap`` (``w:wordWrap``) — |True| wraps Latin text on word + boundaries (the default); |False| allows breaks inside a word to keep a + tight right edge, which is typical in Japanese layout. + + +Ruby (phonetic) annotations +--------------------------- + +A *ruby annotation* pairs a run of base text with a smaller above-the-line +annotation. Japanese furigana is the most common example. 
+ +*python-docx* exposes existing ruby annotations as read-only +|RubyAnnotation| objects via :attr:`Run.ruby_annotations`. The API does not +yet *create* ruby runs:: + + >>> document = Document("sample-with-ruby.docx") + >>> run = document.paragraphs[0].runs[0] + >>> for ruby in run.ruby_annotations: + ... print(f"{ruby.base_text!r} ↑ {ruby.ruby_text!r}") + '日本' ↑ 'にほん' + '東京' ↑ 'とうきょう' + + >>> ruby = run.ruby_annotations[0] + >>> ruby.alignment, ruby.language + ('distributeSpace', 'ja-JP') + +``alignment`` is the raw value of ``w:rubyPr/w:rubyAlign@w:val`` (typical +values include ``distributeLetter``, ``distributeSpace``, ``center``, +``left``, ``right``, ``rightVertical``). ``language`` is the value of +``w:rubyPr/w:lid@w:val``, usually a BCP-47 tag. + +The base text of a ``w:ruby`` also contributes to ``Run.text``, so +``paragraph.text`` stays readable even for paragraphs that contain ruby +markup. + + +Right-to-left (bidi) layout +--------------------------- + +Right-to-left rendering is controlled at *two* independent scopes; do not +confuse them with the section-level ``w:bidi`` that flips an entire page +layout. + +**Run-level RTL.** ``Font.right_to_left`` (boolean) corresponds to +``w:rPr/w:rtl``. Setting it to |True| causes the run to be rendered +right-to-left using the complex-script (CS) font:: + + >>> run = document.add_paragraph().add_run("שלום") + >>> run.font.right_to_left = True + +``Font.rtl`` exposes the same element as a *tri-state* (|True| / |False| / +|None|), following the style-inheritance convention used by other boolean +Font properties. + +**Paragraph-level RTL.** ``ParagraphFormat.right_to_left`` controls +``w:pPr/w:bidi``. Flipping it reverses the visual order of any runs the +paragraph contains and mirrors paragraph-level indents:: + + >>> p = document.add_paragraph("مرحبا") + >>> p.paragraph_format.right_to_left = True + +Assigning |False| or |None| removes the ``w:bidi`` element. 
+ + +Symbols (glyphs from a named font) +---------------------------------- + +``Run.add_symbol`` appends a ``w:sym`` element that draws its glyph from a +named font rather than from the run's main typeface. Word uses this to +render Wingdings, bullet glyphs that are not standard Unicode, and similar +special characters:: + + >>> run = document.add_paragraph().add_run() + >>> sym = run.add_symbol(0xF0E0, "Wingdings") + >>> sym.char_hex, sym.font + ('F0E0', 'Wingdings') + +The ``char_code`` argument accepts either an integer (``0xF0E0``) or a hex +string (``"F0E0"``, ``"0xf0e0"``). The XML always stores it as a 4-character +uppercase hex string; ``Symbol.char_hex`` returns that canonical form. + +All symbols in a run are iterable via :attr:`Run.symbols`:: + + >>> run.add_symbol(0xF0E1, "Wingdings") + >>> [s.char_hex for s in run.symbols] + ['F0E0', 'F0E1'] + >>> sym.delete() + + +Paragraph text frames +--------------------- + +A *text frame* is an absolutely-positioned text container, the legacy +predecessor of the modern text box. A frame is attached to a paragraph via a +``w:pPr/w:framePr`` element and carries a dozen size, position, and layout +attributes. + +``ParagraphFormat.frame`` returns a read-only |TextFrame| proxy when the +element is present, or |None| otherwise. Use +:meth:`ParagraphFormat.set_frame` to create or update a frame, and +:meth:`ParagraphFormat.remove_frame` to detach it:: + + >>> from docx.enum.text import ( + ... WD_FRAME_H_ANCHOR, WD_FRAME_V_ANCHOR, WD_FRAME_WRAP + ... ) + >>> from docx.shared import Inches + + >>> p = document.add_paragraph("Floating paragraph.") + >>> frame = p.paragraph_format.set_frame( + ... width=Inches(3), + ... height=Inches(1), + ... horizontal_position=Inches(0.5), + ... vertical_position=Inches(0.75), + ... horizontal_anchor=WD_FRAME_H_ANCHOR.PAGE, + ... vertical_anchor=WD_FRAME_V_ANCHOR.MARGIN, + ... wrap=WD_FRAME_WRAP.AROUND, + ... 
) + >>> frame.width, frame.height + (2743200, 914400) + +Any keyword argument left at its default of |None| is left unchanged when the +frame already exists, so :meth:`set_frame` doubles as an in-place update:: + + >>> p.paragraph_format.set_frame(width=Inches(4)) + >>> p.paragraph_format.frame.width + 3657600 + +The TextFrame properties cover the full attribute surface of ``w:framePr``: + +* ``width`` / ``height`` — |Length|. +* ``horizontal_position`` / ``vertical_position`` — |Length|. +* ``horizontal_anchor`` / ``vertical_anchor`` — members of + :ref:`WdFrameHAnchor` and :ref:`WdFrameVAnchor`. +* ``wrap`` — a :ref:`WdFrameWrap` member. +* ``drop_cap`` and ``lines`` — for drop-cap frames, via + :ref:`WdFrameDropCap`. +* ``horizontal_alignment`` / ``vertical_alignment`` — :ref:`WdFrameHAlign` + and :ref:`WdFrameVAlign`. + +Assigning |None| to any individual attribute clears just that attribute on +the existing ``w:framePr`` element. To drop the element entirely use +:meth:`~docx.text.parfmt.ParagraphFormat.remove_frame`:: + + >>> p.paragraph_format.remove_frame() + >>> p.paragraph_format.frame is None + True diff --git a/docs/user/themes.rst b/docs/user/themes.rst new file mode 100644 index 000000000..1e04a131d --- /dev/null +++ b/docs/user/themes.rst @@ -0,0 +1,153 @@ +.. _themes: + +Themes +====== + +Every Word document ships with a **theme** — a coordinated set of colours +and fonts that style elements refer to rather than hard-coding values. The +theme lives in ``word/theme/theme1.xml`` and is referenced indirectly from +every paragraph and character style. Changing the theme therefore +re-skins the whole document without touching individual paragraphs. + +|docx| exposes the theme **read-only**: :attr:`.Document.theme` returns a +|Theme| proxy when the document has a theme relationship, or |None| when it +doesn't (hand-authored packages can legally omit the theme part). + +Theme authoring is not supported. 
If you need a different theme, design it +in Word and use that document as your template. + + +Retrieving the theme +-------------------- + +:: + + >>> from docx import Document + >>> document = Document() + >>> theme = document.theme + >>> theme.name + 'Office Theme' + >>> theme.colors.name + 'Office' + >>> theme.fonts.name + 'Office' + +:attr:`.Theme.name`, :attr:`.ThemeColors.name`, and :attr:`.ThemeFonts.name` +return the human-readable names assigned to the theme as a whole and to its +two schemes; they are all nullable strings. + + +Colour scheme +------------- + +Word's colour scheme has twelve slots named with short OOXML tokens. |docx| +exposes them both by token and by accessor: + +``dk1`` + :attr:`.ThemeColors.dark_1` — dark body text (usually black). + +``lt1`` + :attr:`.ThemeColors.light_1` — light background (usually white). + +``dk2`` + :attr:`.ThemeColors.dark_2` — secondary dark colour. + +``lt2`` + :attr:`.ThemeColors.light_2` — secondary light colour. + +``accent1`` … ``accent6`` + :attr:`.ThemeColors.accent_1` through + :attr:`.ThemeColors.accent_6` — additional accents used by charts, + tables, and shape fills. + +``hlink`` + :attr:`.ThemeColors.hyperlink` — unvisited hyperlink. + +``folHlink`` + :attr:`.ThemeColors.followed_hyperlink` — visited hyperlink. + +Every accessor returns an |RGBColor| or |None| (when the slot is missing or +its value cannot be resolved to RGB — for example, an ``a:sysClr`` without +a ``lastClr`` fallback):: + + >>> theme.colors.dark_1 + RGBColor(0x00, 0x00, 0x00) + >>> theme.colors.accent_1 + RGBColor(0x4F, 0x81, 0xBD) + >>> theme.colors.hyperlink + RGBColor(0x00, 0x00, 0xFF) + +The subscript form uses the OOXML token directly — handy when the slot name +is coming from data rather than a hard-coded attribute:: + + >>> theme.colors["accent1"] + RGBColor(0x4F, 0x81, 0xBD) + >>> theme.colors["hlink"] + RGBColor(0x00, 0x00, 0xFF) + >>> theme.colors["bogus"] + Traceback (most recent call last): + ... 
+ KeyError: 'bogus' + +A |None| value means the slot is defined *but* its child element cannot be +resolved to an RGB triple; a :class:`KeyError` means the token is not one +of the twelve legal slot names. This lets callers distinguish "slot omitted" +from "typo in slot name". + + +Font scheme +----------- + +The font scheme pairs two typeface bundles: + +- **Major** fonts, used for headings (``a:majorFont``); +- **Minor** fonts, used for body text (``a:minorFont``). + +Each bundle nests three slots for the primary script regions: + +==================== =========================================== =================================== +Slot Accessor OOXML element +==================== =========================================== =================================== +Major Latin :attr:`.ThemeFonts.major_latin` ``a:majorFont/a:latin/@typeface`` +Minor Latin :attr:`.ThemeFonts.minor_latin` ``a:minorFont/a:latin/@typeface`` +Major East Asian :attr:`.ThemeFonts.major_east_asian` ``a:majorFont/a:ea/@typeface`` +Minor East Asian :attr:`.ThemeFonts.minor_east_asian` ``a:minorFont/a:ea/@typeface`` +Major Complex Script :attr:`.ThemeFonts.major_cs` ``a:majorFont/a:cs/@typeface`` +Minor Complex Script :attr:`.ThemeFonts.minor_cs` ``a:minorFont/a:cs/@typeface`` +==================== =========================================== =================================== + +Each accessor returns the typeface string, or |None| when the slot is +missing:: + + >>> theme.fonts.major_latin + 'Calibri' + >>> theme.fonts.minor_latin + 'Cambria' + + +When theme is |None| +-------------------- + +A document built from hand-authored parts — or a document that has had its +theme deliberately stripped — will cause :attr:`.Document.theme` to return +|None|. Guard callers accordingly:: + + >>> theme = document.theme + >>> if theme is None: + ... print("no theme") + ... else: + ... 
print(theme.colors.accent_1) + + +Interoperability +---------------- + +- Styles reference the theme indirectly through ``w:themeColor`` on + ``w:color`` and ``w:themeFont`` on ``w:rFonts``. Changing theme colours in + Word updates every style that uses them automatically; changing an + individual style's colour overrides the theme reference for that style + only. +- Some Word features (theme-aware tables, for example) use theme tint and + shade attributes to derive new colours from the twelve base slots. |docx| + exposes only the base slots; tinted/shaded variants must be computed by + the caller. diff --git a/docs/user/toc.rst b/docs/user/toc.rst new file mode 100644 index 000000000..24ac83d48 --- /dev/null +++ b/docs/user/toc.rst @@ -0,0 +1,169 @@ +.. _toc: + +Working with a Table of Contents +================================ + +Word documents commonly begin with a *table of contents* (TOC) — a navigable +list of the document's headings with their page numbers. In Word, a TOC is +not a static piece of prose: it is a **field**, a small instruction that Word +evaluates every time the document is opened (or when the user presses *F9* +to update fields). The field scans the document for headings and renders +itself as an up-to-date list. + +*python-docx* lets you write that field into a document. It cannot evaluate +the field — it has no layout engine and therefore no real page numbers — so +what it writes is the instruction Word needs plus a *cached preview* that +consumers who do not evaluate fields (raw-XML tools, a Word session where the +user declined the "update fields?" prompt) can still display sensibly. 
+ + +Adding a TOC at the end of the document +--------------------------------------- + +The simplest way to add a TOC is to append one to the end of the document:: + + >>> from docx import Document + >>> document = Document() + >>> document.add_heading("Chapter One", level=1) + >>> document.add_paragraph("Body of chapter one.") + >>> document.add_heading("Section 1.1", level=2) + >>> document.add_paragraph("Body of section 1.1.") + + >>> toc = document.add_table_of_contents() + >>> toc + + +The returned object is the newly-appended |Paragraph|. It now carries one +complex field of type ``TOC``:: + + >>> field = toc.fields[0] + >>> field.type + 'TOC' + >>> field.is_complex + True + +When Word opens the file it offers to update fields. Accepting rebuilds the +TOC against the current document state and inserts the real page numbers. +Declining leaves the cached preview visible. + + +Choosing which heading levels appear +------------------------------------ + +``add_table_of_contents`` accepts a ``levels`` keyword — a +``(min_level, max_level)`` tuple that selects which ``"Heading N"`` styles +feed into the TOC. The default ``(1, 3)`` matches Word's own default and +includes H1 through H3:: + + >>> # H1 only + >>> document.add_table_of_contents(levels=(1, 1)) + + >>> # H2 and H3, skipping top-level chapter titles + >>> document.add_table_of_contents(levels=(2, 3)) + + >>> # every heading level Word supports + >>> document.add_table_of_contents(levels=(1, 9)) + +The range is validated. ``1 <= min_level <= max_level <= 9`` must hold; +otherwise a |ValueError| is raised. A paragraph is treated as a heading only +when its style name matches ``"Heading N"`` (case-insensitive) for ``N`` in +1..9. Paragraphs styled *Title*, *Subtitle*, or custom heading styles do +not contribute. + + +Inserting a TOC at a specific position +-------------------------------------- + +A TOC is often placed near the start of the document rather than at the end. 
+Use the paragraph-level insertion methods to place a TOC relative to an +existing paragraph:: + + >>> anchor = document.paragraphs[0] + >>> toc = anchor.insert_table_of_contents_before() + >>> document.paragraphs[0] is toc + True + + >>> toc_after = anchor.insert_table_of_contents_after() + +Both methods accept the same ``levels`` keyword as +``Document.add_table_of_contents``. The preview text scans the entire +document body regardless of where the TOC is inserted — Word itself rebuilds +the list on open, so the cached preview covers *all* headings even if some +appear after the TOC paragraph. + + +What ends up in the XML +----------------------- + +A TOC is a *complex field*: three ```` markers (``begin``, +``separate``, ``end``) wrap an ```` instruction and a cached +result. The generated paragraph looks approximately like: + +.. code-block:: xml + + + + + TOC \o "1-3" \h \z \u + + + Chapter One 1 + + Section 1.1 2 + + + +The instruction switches match what Word writes when a TOC is inserted from +the *References* ribbon: + +* ``\o "min-max"`` — build from outline levels ``min..max`` +* ``\h`` — render entries as clickable hyperlinks +* ``\z`` — hide the tab leader and page number in web view +* ``\u`` — include paragraphs with an applied outline level, not only those + using the built-in heading styles + +You can read the instruction and result back via the |Field| proxy:: + + >>> field = toc.fields[0] + >>> field.instruction + ' TOC \\o "1-3" \\h \\z \\u ' + >>> field.result_text + 'Chapter One\t1\nSection 1.1\t2' + +Each line of ``result_text`` has the form ``"{heading text}\t{index}"``. The +trailing integer is a **1-based position** in the filtered heading list, not +a page number. *python-docx* does not paginate, so it cannot compute page +numbers; Word discards the cached numbers and recomputes real ones when it +next updates the field. 
+ + +Verifying the result in Word +---------------------------- + +Because the TOC is a field, what you see in Word depends on whether fields +are up to date: + +1. Open the document in Word. Word prompts *"This document contains fields + that may refer to other files. Do you want to update the fields?"*. +2. Click **Yes**. Word scans the document, rebuilds the TOC entries, and + inserts the real page numbers. The result matches what you would see if + you inserted a TOC via *References > Table of Contents* in the Word UI. +3. Click **No** and the cached preview written by *python-docx* is shown + instead — heading text is correct, but the trailing integers are heading + indexes rather than page numbers. + +You can also force an update at any time: click inside the TOC and press +*F9*, or right-click and choose *Update Field*. + + +API reference +------------- + +* :meth:`docx.document.Document.add_table_of_contents` — append a TOC to the + body. +* :meth:`docx.text.paragraph.Paragraph.insert_table_of_contents_before` and + :meth:`~docx.text.paragraph.Paragraph.insert_table_of_contents_after` — + insert a TOC relative to an existing paragraph. +* :mod:`docx.toc` — lower-level helpers + (:func:`~docx.toc.build_toc_instruction`, + :func:`~docx.toc.populate_toc_paragraph`) exposed for advanced use. diff --git a/docs/user/track-changes.rst b/docs/user/track-changes.rst new file mode 100644 index 000000000..49b9834b3 --- /dev/null +++ b/docs/user/track-changes.rst @@ -0,0 +1,376 @@ +.. _track_changes: + +Working with Tracked Changes +============================ + +Word's *track changes* feature records every edit made to a document so that a +reviewer can later *accept* or *reject* each edit. 
When track-changes is on, +insertions, deletions, and formatting edits no longer silently modify the +document — they are wrapped in revision elements that carry the author's name, +the date and time of the edit, and (for moves) a pairing name that links the +source to the destination. + +*python-docx* exposes the **read-side** of this model: collections of proxies +describing every tracked change in the document, plus a small preview helper +that renders paragraphs as plain strings with bracketed markers for inserted +and deleted runs. Accepting or rejecting changes is also available via +:meth:`Document.accept_all_changes` and :meth:`Document.reject_all_changes`; +finer-grained per-change accept/reject is planned for a future release. + +.. note:: + Everything described below is read-only. *python-docx* does not yet expose + authoring methods for creating tracked changes (e.g. a hypothetical + ``Paragraph.add_tracked_insertion(...)``). Tracked-change content is + expected to originate from Word itself (or from a workflow that edits the + XML directly). If you need to synthesize track-change wrappers in tests or + fixtures, drop down to the element level via ``lxml`` and the + ``OxmlElement`` helpers used by the library internally. 
+ + +Run-level tracked changes +------------------------- + +Two run-level wrappers capture the most common edits: + +* ```` — one or more runs that were **inserted** by a reviewer +* ```` — one or more runs that were **deleted** (their text lives in + ```` rather than ````) + +Every paragraph exposes these as |TrackedChange| objects via +:attr:`.Paragraph.tracked_changes`:: + + >>> from docx import Document + >>> document = Document("review-draft.docx") + + >>> paragraph = document.paragraphs[1] + >>> paragraph.tracked_changes + [, + ] + +Each proxy exposes the authorship metadata and the text of the change:: + + >>> change = paragraph.tracked_changes[0] + >>> change.type + 'deletion' + >>> change.author + 'Bob' + >>> change.date + datetime.datetime(2025, 4, 10, 9, 0, tzinfo=datetime.timezone.utc) + >>> change.text + 'brown' + +The :attr:`.TrackedChange.type` property reports one of four string values: + +* ``"insertion"`` — a ```` wrapper +* ``"deletion"`` — a ```` wrapper +* ``"move_from"`` — the source side of a move revision (see below) +* ``"move_to"`` — the destination side of a move revision + +To iterate every tracked change in the document body:: + + >>> for paragraph in document.paragraphs: + ... for change in paragraph.tracked_changes: + ... print(f"{change.type:10s} {change.author:12s} {change.text!r}") + deletion Bob 'brown' + insertion Alice 'nimble' + insertion Carol ', cruel world' + +A paragraph with no tracked changes returns an empty list. + + +Move revisions +-------------- + +When a reviewer drags a selection of text from one paragraph to another with +track-changes on, Word records it as a **move revision**: the source is marked +```` (structurally a deletion whose text uses ``w:delText``) and +the destination is marked ```` (structurally an insertion with +plain ``w:t``). Both wrappers carry a shared ``@w:name`` attribute pairing +them. 
+ +*python-docx* surfaces these as |MoveRevision|, a subclass of |TrackedChange| +that adds a ``name`` property and a ``peer`` lookup:: + + >>> source_para = document.paragraphs[1] + >>> move_from = source_para.tracked_changes[0] + >>> type(move_from).__name__ + 'MoveRevision' + >>> move_from.type + 'move_from' + >>> move_from.name + 'pair1' + + >>> peer = move_from.peer + >>> peer.type + 'move_to' + >>> peer.name + 'pair1' + +``.peer`` walks up to the document root and searches the opposite side +(``w:moveTo`` when called on a ``w:moveFrom`` and vice versa) for the first +element whose ``@w:name`` matches. It returns |None| when the element has no +``@w:name`` or when no peer is found (unpaired halves can appear in +intermediate editing states). + +.. note:: + Word also emits paragraph-level range markers + ````, ````, + ````, and ```` to bracket moves + that span paragraph boundaries. These are *range* markers rather than + run wrappers, so they are not exposed as |TrackedChange| proxies. + They round-trip unchanged; callers needing to work with them can iterate + the underlying XML. + + +Formatting changes +------------------ + +When a reviewer changes *formatting* — bold, alignment, section margins, +table or cell properties — Word records the prior state in a +*formatting-change* element rather than mutating the properties in place. 
+The revision is appended as a child of the relevant properties element: + +======================= ============================ +Revision element Parent element +======================= ============================ +``w:rPrChange`` ``w:rPr`` (on a run) +``w:pPrChange`` ``w:pPr`` (on a paragraph) +``w:sectPrChange`` ``w:sectPr`` (on a section) +``w:tblPrChange`` ``w:tblPr`` (on a table) +``w:tcPrChange`` ``w:tcPr`` (on a cell) +``w:trPrChange`` ``w:trPr`` (on a row) +======================= ============================ + +Each revision element carries the same ``w:id`` / ``w:author`` / ``w:date`` +metadata as the run-level wrappers, plus a single nested properties element +holding the *pre-revision* state. *python-docx* surfaces every one of them +through a |FormattingChange| proxy, accessible as a ``formatting_change`` +property on the corresponding object: + +* :attr:`.Run.formatting_change` — run formatting (``w:rPrChange``) +* :attr:`.Paragraph.formatting_change` — paragraph formatting (``w:pPrChange``) +* :attr:`.Section.formatting_change` — section formatting (``w:sectPrChange``) +* :attr:`.Table.formatting_change` — table-level formatting (``w:tblPrChange``) +* :attr:`._Cell.formatting_change` — cell formatting (``w:tcPrChange``) +* :attr:`._Row.formatting_change` — row formatting (``w:trPrChange``) + +All six return |None| when the corresponding tracked revision is not present, +making the property easy to use as a predicate:: + + >>> paragraph = document.paragraphs[1] + >>> change = paragraph.formatting_change + >>> change is None + False + >>> change.author + 'Bob' + >>> change.date + datetime.datetime(2025, 4, 10, 9, 5, tzinfo=datetime.timezone.utc) + +The prior formatting is available via ``old_properties``:: + + >>> old_pPr = change.old_properties + >>> old_pPr.tag + '{http://schemas.openxmlformats.org/wordprocessingml/2006/main}pPr' + +``old_properties`` returns the raw oxml element (``w:rPr``, ``w:pPr``, +``w:sectPr``, ``w:tblPr``, ``w:tcPr``, or 
``w:trPr`` depending on the revision +kind). Callers that need to inspect specific properties can iterate its +children — there is no high-level proxy wrapping the historical state. + + +Table cell and row revisions +---------------------------- + +Word records **whole-cell** insertions and deletions via empty marker +elements inside the cell's ``w:tcPr``: + +* ```` — the cell was inserted by a tracked change +* ```` — the cell was deleted by a tracked change + +*python-docx* exposes these as boolean flags on |_Cell|:: + + >>> table = document.tables[0] + >>> cell = table.cell(0, 1) + >>> cell.is_tracked_insertion + True + >>> cell.is_tracked_deletion + False + + >>> deleted = table.cell(1, 0) + >>> deleted.is_tracked_deletion + True + +Both flags are |False| when the cell has no ``w:tcPr`` or when neither +marker element is present. Row- and table-level property revisions use +``w:trPrChange`` and ``w:tblPrChange`` and are surfaced via the same +``formatting_change`` property described above. + + +revision_marks_text() preview +----------------------------- + +For a quick terminal preview of tracked changes in running prose, +:meth:`.Paragraph.revision_marks_text` renders the paragraph as a plain +string with bracketed markers around inserted and deleted content:: + + >>> paragraph = document.paragraphs[1] + >>> paragraph.revision_marks_text() + 'The quick [-brown-][+nimble+] fox jumps.' + +The default markers are CLI-friendly ``[+`` / ``+]`` for insertions and +``[-`` / ``-]`` for deletions. When the paragraph contains no tracked +changes the returned string matches :attr:`.Paragraph.text` exactly. + +All four markers can be overridden:: + + >>> paragraph.revision_marks_text( + ... open_ins="", close_ins="", + ... open_del="", close_del="", + ... ) + 'The quick brownnimble fox jumps.' + +Pass ANSI escape sequences for styled terminal output — for example +``"\033[32m"`` / ``"\033[0m"`` for green insertions and +``"\033[31m"`` / ``"\033[0m"`` for red deletions. 
+ +:meth:`.Document.revision_marks_text` calls the paragraph-level helper on +each top-level paragraph and joins the results with a blank-line separator +(``"\n\n"``). Tables in the body are skipped — this helper is meant as a +*quick preview* of prose, not a full-fidelity renderer:: + + >>> print(document.revision_marks_text()) + Tracked insertions and deletions + + The quick [-brown-][+nimble+] fox jumps. + + Goodbye[+, cruel world+]. + + Nothing to see here. + + +Revision-save IDs (rsid) +------------------------ + +Word stamps every paragraph and run with a **revision-save ID** — an +8-character hex string identifying the editing session during which the +element was last modified. The full set of session IDs lives in the +document settings under ``w:rsids``, with a single ``w:rsidRoot`` naming +the *first* session ever recorded for the document. + +*python-docx* reads these values as plain strings. + +Document-level ids are on |Settings|:: + + >>> document.settings.rsid_root + '00CAFE00' + >>> document.settings.rsids + ['00A1B2C3', '00DEAD00', '00BEEF00'] + +Per-element ids are on |Paragraph| and |Run|:: + + >>> paragraph = document.paragraphs[1] + >>> paragraph.rsid + '00A1B2C3' + + >>> run = paragraph.runs[0] + >>> run.rsid + '00DEAD00' + +Both return |None| when the element has no ``@w:rsidR`` attribute. RSIDs +are primarily useful to downstream *diff / merge* tooling: two runs with +the same RSID were last touched in the same editing session, so RSIDs +correlate edits across saves even when the text itself is unchanged. For a +stronger identifier that also accounts for position and content, see the +``stable_id`` property on |Run| and |Paragraph|. 
+ + +Accepting or rejecting changes +------------------------------ + +Two document-level helpers apply every tracked revision in the body at +once:: + + >>> n = document.accept_all_changes() + >>> n + 5 + + >>> n = document.reject_all_changes() + +Accepting an insertion keeps the inserted content and removes the +``w:ins`` wrapper; accepting a deletion removes the wrapper *and* its +content. Rejecting does the opposite. Formatting revisions are resolved in +the same pass: accepting keeps the post-edit properties and discards the +change record; rejecting restores the pre-edit properties from +``old_properties``. Cell-level revisions (``w:cellIns``, ``w:cellDel``) +remove or preserve the enclosing cell as appropriate. + +Both helpers return the count of change elements resolved. + +.. note:: + Per-change ``TrackedChange.accept()`` and ``TrackedChange.reject()`` + methods and fine-grained filtering (by author, date range, or type) are + planned for a future release. For now, prefer + :meth:`.Document.accept_all_changes` / + :meth:`.Document.reject_all_changes` when you need to resolve changes, + and iterate |TrackedChange| objects when you need to inspect them. + + +What ends up in the XML +----------------------- + +A run-level insertion looks approximately like: + +.. code-block:: xml + + + The quick + + nimble + + fox jumps. + + +A deletion is structurally identical but uses ```` in place of +````: + +.. code-block:: xml + + + brown + + +A move revision pairs two wrappers via ``@w:name``: + +.. code-block:: xml + + + + moved text + + + + + moved text + + +A formatting revision nests the *old* properties element inside the change +wrapper: + +.. code-block:: xml + + + + + + + + + +API reference +------------- + +The tracked-changes proxies live in :mod:`docx.tracked_changes`; see +:ref:`tracked_changes_api` for the generated class documentation. 
+Relevant methods on |Document| are +:meth:`.Document.accept_all_changes`, :meth:`.Document.reject_all_changes`, +and :meth:`.Document.revision_marks_text`. diff --git a/docs/user/watermarks.rst b/docs/user/watermarks.rst new file mode 100644 index 000000000..938b36c6c --- /dev/null +++ b/docs/user/watermarks.rst @@ -0,0 +1,132 @@ +.. _watermarks: + +Page watermarks +=============== + +Word supports two kinds of watermark — text and image — both rendered in the +*default page header* as VML shapes. |docx| exposes them per-section through +four members on :class:`.Section`: + +- :meth:`.Section.add_text_watermark` — attach a text watermark (``"DRAFT"``, + ``"CONFIDENTIAL"``, etc.). +- :meth:`.Section.add_image_watermark` — attach an image watermark + (company logo, classification stamp, ...). +- :attr:`.Section.watermark` — read the current watermark back as a + |Watermark| proxy, or |None| if none is present. +- :meth:`.Section.remove_watermark` — clear any existing watermark. + + +Why the default header? +----------------------- + +Word represents watermarks as VML ``v:shape`` elements living inside a header +paragraph. Painting them in the header is how they end up on every page of +the section without being part of the body text. |docx| follows the same +convention: adding a watermark detaches the section's header from the +previous section (so section-specific watermarks remain independent) and +paints the shape into the default header. + +As a consequence, if a section's header is linked to the previous section, +calling :meth:`add_text_watermark` or :meth:`add_image_watermark` +automatically sets ``is_linked_to_previous = False`` on that section's +:attr:`.Section.header` before adding the shape. + + +Adding a text watermark +----------------------- + +:: + + >>> from docx import Document + >>> from docx.shared import Pt, RGBColor + >>> document = Document() + >>> section = document.sections[0] + >>> watermark = section.add_text_watermark( + ... 
text="DRAFT", + ... font="Arial", + ... size=Pt(80), + ... color=RGBColor(0x80, 0x80, 0x80), + ... layout="diagonal", + ... ) + >>> watermark.type, watermark.text + ('text', 'DRAFT') + +The `font`, `size`, `color`, and `layout` parameters are optional. Omitting +them yields Word's standard draft-watermark look — 72-point Calibri in +silver (``#C0C0C0``) drawn diagonally across the page. + +Only ``"diagonal"`` and ``"horizontal"`` are accepted for `layout`. Any +other value raises :class:`ValueError`. + + +Adding an image watermark +------------------------- + +:: + + >>> from docx.shared import Inches + >>> watermark = section.add_image_watermark( + ... "logo.png", + ... width=Inches(3), + ... height=Inches(2), + ... ) + >>> watermark.type + 'image' + >>> watermark.text is None + True + +`image_path` may be a filesystem path or a file-like object (any stream +:class:`docx.image.Image` accepts). When both `width` and `height` are +omitted the image's native dimensions are used; otherwise the image is +scaled to the supplied dimensions while preserving aspect ratio. + +Either call replaces any watermark already present — there is no +``add_additional_watermark`` variant because Word itself does not support +stacked watermarks in a single section. + + +Reading the current watermark +----------------------------- + +:attr:`.Section.watermark` returns the |Watermark| proxy for the section's +default header, or |None| when: + +- the header is still linked to the previous section, or +- the header does not contain a watermark shape. + +:: + + >>> document.sections[0].watermark + + >>> document.sections[0].watermark.type + 'text' + + +Removing a watermark +-------------------- + +:meth:`.Section.remove_watermark` strips every watermark paragraph from the +section's default header. 
It is safe to call on a section that has no +watermark; the method is a no-op in that case:: + + >>> section.remove_watermark() + >>> section.watermark is None + True + + +Scope and limitations +--------------------- + +- Watermarks are per-section. A document with multiple sections can have a + different watermark in each; use :meth:`add_text_watermark` / + :meth:`add_image_watermark` on each |Section| individually. +- |docx| writes VML watermark shapes — this matches what Word has always + done for watermarks, including Word 365. Other word processors' support + for VML varies; LibreOffice in particular renders VML watermarks with + reduced fidelity. +- Colour is specified as RGB; Word's "washout" / semi-transparent effect is + not separately exposed, but supplying a light grey such as ``RGBColor(0xC0, + 0xC0, 0xC0)`` approximates it. +- Font rotation beyond the ``diagonal`` / ``horizontal`` presets is not + parameterised; callers needing more control can modify the returned shape + element directly. 
diff --git a/features/acc-heading-structure.feature b/features/acc-heading-structure.feature new file mode 100644 index 000000000..36dcb4fba --- /dev/null +++ b/features/acc-heading-structure.feature @@ -0,0 +1,17 @@ +Feature: Validate document heading structure + In order to author accessible documents + As a developer using python-docx + I need to detect common heading-outline issues such as missing or skipped headings + + + Scenario Outline: Document.validate_heading_structure() + Given a document with heading outline + When I call document.validate_heading_structure() + Then the result is a list of HeadingIssue objects + And the first reported issue has kind "" + + Examples: heading-outline cases + | outline | expected-issues | first-kind | + | a valid | 0 | | + | a missing-H2 | 1 | no_h1 | + | a skipped-level | 1 | skipped_level | diff --git a/features/api-docm.feature b/features/api-docm.feature new file mode 100644 index 000000000..d2f7bff4f --- /dev/null +++ b/features/api-docm.feature @@ -0,0 +1,19 @@ +Feature: Open macro-enabled .docm documents and detect their VBA project + In order to let callers decide whether to trust macro-enabled documents before processing them + As a developer using python-docx + I need Document() to load the macroEnabled content type and Document.has_macros to detect VBA + + + Scenario: Loading a .docm document succeeds + Given a macro-enabled .docm document + Then the document loads without error + + + Scenario: Document.has_macros reports presence of a vbaProject relationship + Given a macro-enabled .docm document + Then document.has_macros is True + + + Scenario: Document.has_macros is False for plain .docx + Given a fresh default document + Then document.has_macros is False diff --git a/features/blk-insert-at-position.feature b/features/blk-insert-at-position.feature new file mode 100644 index 000000000..0127cdd3f --- /dev/null +++ b/features/blk-insert-at-position.feature @@ -0,0 +1,50 @@ +Feature: Insert a paragraph or 
table at an arbitrary position + In order to add new block-level content anywhere in the document + As a developer using python-docx + I need position-aware insert helpers on Paragraph and Table + + + Scenario: Paragraph.insert_paragraph_after inserts directly after a reference paragraph + Given a document containing five paragraphs + When I insert a paragraph after the third paragraph + Then the document contains six paragraphs + And the fourth paragraph text is "inserted-after" + + + Scenario: Paragraph.insert_paragraph_before inserts directly before a reference paragraph + Given a document containing five paragraphs + When I insert a paragraph before the fourth paragraph + Then the document contains six paragraphs + And the fourth paragraph text is "inserted-before" + + + Scenario: Paragraph.insert_table_after inserts a table directly after the reference paragraph + Given a document containing five paragraphs + When I insert a 2x2 table after the third paragraph + Then the document contains one table + And the inserted table has two rows and two columns + + + Scenario: Paragraph.insert_table_before inserts a table directly before the reference paragraph + Given a document containing five paragraphs + When I insert a 2x2 table before the fourth paragraph + Then the document contains one table + And the inserted table has two rows and two columns + + + Scenario: Table.insert_paragraph_after inserts a paragraph directly after a reference table + Given a document containing three tables + When I insert a paragraph after the second table + Then the paragraph after the second table has text "after-table" + + + Scenario: Table.insert_paragraph_before inserts a paragraph directly before a reference table + Given a document containing three tables + When I insert a paragraph before the second table + Then the paragraph before the second table has text "before-table" + + + Scenario: Table.insert_table_after inserts a new table directly after a reference table + Given a 
document containing three tables + When I insert a 2x2 table after the second table + Then the document contains four tables diff --git a/features/bmk-create-read.feature b/features/bmk-create-read.feature new file mode 100644 index 000000000..3a015cdcc --- /dev/null +++ b/features/bmk-create-read.feature @@ -0,0 +1,73 @@ +Feature: Create and read bookmarks + In order to mark and cross-reference ranges of text in a document + As a developer using python-docx + I need to create bookmarks and access the document's bookmarks collection + + + Scenario: Access document bookmarks collection + Given a document having bookmarks + Then document.bookmarks is a Bookmarks object + + + Scenario: Bookmarks.__len__() + Given a document having bookmarks + Then len(document.bookmarks) == 3 + + + Scenario: Bookmarks.__iter__() + Given a document having bookmarks + Then iterating document.bookmarks yields 3 Bookmark objects + + + Scenario: Bookmarks.get() by name + Given a document having bookmarks + When I call document.bookmarks.get("bm_intro") + Then the result is a Bookmark object named "bm_intro" + + + Scenario: Bookmarks.get() returns None for unknown name + Given a document having bookmarks + When I call document.bookmarks.get("does_not_exist") + Then the result is None + + + Scenario: Bookmarks containment check by name + Given a document having bookmarks + Then "bm_intro" in document.bookmarks + And "does_not_exist" not in document.bookmarks + + + Scenario: Paragraph.add_bookmark() wrapping the whole paragraph + Given a fresh document with one paragraph of text + When I assign bookmark = paragraph.add_bookmark("intro") + Then bookmark.name == "intro" + And bookmark.bookmark_id == 0 + And len(document.bookmarks) == 1 + + + Scenario: Paragraph.add_bookmark() wrapping an existing run + Given a paragraph with three runs + When I add a bookmark named "middle" around the middle run + Then bookmark.name == "middle" + And the bookmark wraps only the middle run + And 
len(document.bookmarks) == 1 + + + Scenario: Paragraph.add_bookmark() across adjacent runs + Given a paragraph with three runs + When I add a bookmark named "tail" around the second and third runs + Then bookmark.name == "tail" + And the bookmark wraps the last two runs + And len(document.bookmarks) == 1 + + + Scenario: Bookmark spanning multiple paragraphs + Given a document having bookmarks + When I call document.bookmarks.get("bm_span") + Then the result is a Bookmark object named "bm_span" + And the bookmarkStart and bookmarkEnd for "bm_span" are in different paragraphs + + + Scenario: Bookmark IDs are unique in the document + Given a document having bookmarks + Then every bookmark has a unique bookmark_id diff --git a/features/bmk-mutate.feature b/features/bmk-mutate.feature new file mode 100644 index 000000000..f83ec2a9b --- /dev/null +++ b/features/bmk-mutate.feature @@ -0,0 +1,39 @@ +Feature: Mutate bookmarks + In order to revise the bookmarks in a document + As a developer using python-docx + I need to delete and rename bookmarks + + + Scenario: Bookmark.delete() removes both markers + Given a document having bookmarks + When I delete the bookmark named "bm_intro" + Then len(document.bookmarks) == 2 + And "bm_intro" not in document.bookmarks + And no bookmarkStart with that id remains in the body + And no bookmarkEnd with that id remains in the body + + + Scenario: Bookmark.delete() works for a cross-paragraph bookmark + Given a document having bookmarks + When I delete the bookmark named "bm_span" + Then len(document.bookmarks) == 2 + And "bm_span" not in document.bookmarks + And no bookmarkStart with that id remains in the body + And no bookmarkEnd with that id remains in the body + + + Scenario: Deleting one bookmark preserves the others + Given a document having bookmarks + When I delete the bookmark named "bm_middle" + Then len(document.bookmarks) == 2 + And "bm_intro" in document.bookmarks + And "bm_span" in document.bookmarks + + + Scenario: Rename 
a bookmark + Given a document having bookmarks + When I rename the bookmark "bm_intro" to "bm_renamed" + Then len(document.bookmarks) == 3 + And "bm_renamed" in document.bookmarks + And "bm_intro" not in document.bookmarks + And the bookmark "bm_renamed" keeps its original bookmark_id diff --git a/features/cap-caption.feature b/features/cap-caption.feature new file mode 100644 index 000000000..cf28bd6ed --- /dev/null +++ b/features/cap-caption.feature @@ -0,0 +1,41 @@ +Feature: Add caption paragraphs to a document + In order to auto-number figures, tables, and equations using Word's SEQ fields + As a developer using python-docx + I need Document.add_caption, Paragraph.add_caption_before, and Paragraph.add_caption_after + + + Scenario: Document.add_caption returns a Caption-styled paragraph + Given a fresh default document + When I call document.add_caption("A diagram", label="Figure") + Then the returned paragraph style name is "Caption" + And the returned paragraph text is "Figure 1: A diagram" + + + Scenario Outline: add_caption supports arbitrary labels + Given a fresh default document + When I call document.add_caption("", label=" @@ -724,6 +725,7 @@ +
diff --git a/src/docx/templates/default-docx-template/word/stylesWithEffects.xml b/src/docx/templates/default-docx-template/word/stylesWithEffects.xml index 91c1734e7..1a1de0949 100644 --- a/src/docx/templates/default-docx-template/word/stylesWithEffects.xml +++ b/src/docx/templates/default-docx-template/word/stylesWithEffects.xml @@ -641,6 +641,7 @@ +
@@ -680,6 +681,7 @@ + diff --git a/src/docx/templates/default-endnotes.xml b/src/docx/templates/default-endnotes.xml new file mode 100644 index 000000000..7bb180dd3 --- /dev/null +++ b/src/docx/templates/default-endnotes.xml @@ -0,0 +1,32 @@ + + + + + + + + + + + + + + + + + + + + + + + diff --git a/src/docx/templates/default-footnotes.xml b/src/docx/templates/default-footnotes.xml new file mode 100644 index 000000000..27b2f07f0 --- /dev/null +++ b/src/docx/templates/default-footnotes.xml @@ -0,0 +1,32 @@ + + + + + + + + + + + + + + + + + + + + + + + diff --git a/src/docx/templates/default.docx b/src/docx/templates/default.docx index c22ff3620..cbfba408f 100644 Binary files a/src/docx/templates/default.docx and b/src/docx/templates/default.docx differ diff --git a/src/docx/templates/smart_art/colors1.xml b/src/docx/templates/smart_art/colors1.xml new file mode 100644 index 000000000..b7242a471 --- /dev/null +++ b/src/docx/templates/smart_art/colors1.xml @@ -0,0 +1,2 @@ + + \ No newline at end of file diff --git a/src/docx/templates/smart_art/cycle/data.xml b/src/docx/templates/smart_art/cycle/data.xml new file mode 100644 index 000000000..4e2dbb606 --- /dev/null +++ b/src/docx/templates/smart_art/cycle/data.xml @@ -0,0 +1,2 @@ + + diff --git a/src/docx/templates/smart_art/layout1.xml b/src/docx/templates/smart_art/layout1.xml new file mode 100644 index 000000000..357229b3c --- /dev/null +++ b/src/docx/templates/smart_art/layout1.xml @@ -0,0 +1,2 @@ + + \ No newline at end of file diff --git a/src/docx/templates/smart_art/list/data.xml b/src/docx/templates/smart_art/list/data.xml new file mode 100644 index 000000000..910b96f50 --- /dev/null +++ b/src/docx/templates/smart_art/list/data.xml @@ -0,0 +1,2 @@ + + diff --git a/src/docx/templates/smart_art/process/data.xml b/src/docx/templates/smart_art/process/data.xml new file mode 100644 index 000000000..6c51febc9 --- /dev/null +++ b/src/docx/templates/smart_art/process/data.xml @@ -0,0 +1,2 @@ + + diff --git 
a/src/docx/templates/smart_art/quickStyle1.xml b/src/docx/templates/smart_art/quickStyle1.xml new file mode 100644 index 000000000..7b557cc66 --- /dev/null +++ b/src/docx/templates/smart_art/quickStyle1.xml @@ -0,0 +1,2 @@ + + \ No newline at end of file diff --git a/src/docx/text/font.py b/src/docx/text/font.py index 0439f4547..b015537b0 100644 --- a/src/docx/text/font.py +++ b/src/docx/text/font.py @@ -5,11 +5,13 @@ from typing import TYPE_CHECKING, Any from docx.dml.color import ColorFormat +from docx.enum.table import WD_SHADING_PATTERN from docx.enum.text import WD_UNDERLINE -from docx.shared import ElementProxy, Emu +from docx.shared import ElementProxy, Emu, RGBColor if TYPE_CHECKING: - from docx.enum.text import WD_COLOR_INDEX + from docx.enum.text import WD_BORDER_STYLE, WD_COLOR_INDEX + from docx.oxml.text.font import CT_EastAsianLayout from docx.oxml.text.run import CT_R from docx.shared import Length @@ -35,6 +37,26 @@ def all_caps(self) -> bool | None: def all_caps(self, value: bool | None) -> None: self._set_bool_prop("caps", value) + @property + def character_spacing(self) -> Length | None: + """Read/write. + + |Length| value specifying the spacing between characters. Positive values expand + the spacing, negative values condense it. |None| indicates the value is inherited + from the style hierarchy. + + .. versionadded:: 2026.05.0 + """ + rPr = self._element.rPr + if rPr is None: + return None + return rPr.spacing_val + + @character_spacing.setter + def character_spacing(self, value: int | Length | None) -> None: + rPr = self._element.get_or_add_rPr() + rPr.spacing_val = None if value is None else Emu(value) + @property def bold(self) -> bool | None: """Read/write. @@ -46,13 +68,185 @@ def bold(self) -> bool | None: @bold.setter def bold(self, value: bool | None) -> None: self._set_bool_prop("b", value) + # Mirror to the complex-script bold toggle. 
Word emits both + # <w:b/> and <w:bCs/> together; omitting <w:bCs/> silently drops + # bold on Arabic/Hebrew/Thai runs when Word reopens the file.
versionadded:: 2026.05.0 + """ + rPr = self._element.rPr + if rPr is None: + return None + bdr = rPr.bdr + if bdr is None: + return None + return bdr.space + + @border_space.setter + def border_space(self, value: Length | None) -> None: + if value is None: + rPr = self._element.rPr + if rPr is None: + return + bdr = rPr.bdr + if bdr is None: + return + bdr.space = None + return + rPr = self._element.get_or_add_rPr() + bdr = rPr.get_or_add_bdr() + bdr.space = value + + @property + def border_style(self) -> WD_BORDER_STYLE | None: + """Border style as a member of :ref:`WdBorderStyle`, or |None| if not set. + + Read/write. Maps to ``w:rPr/w:bdr/@w:val``. Returns |None| when the + ``w:bdr`` element or the attribute is absent. Assigning a + |WD_BORDER_STYLE| member creates the ``w:bdr`` child if necessary. + Assigning |None| clears the attribute. + + .. versionadded:: 2026.05.0 + """ + rPr = self._element.rPr + if rPr is None: + return None + bdr = rPr.bdr + if bdr is None: + return None + return bdr.val + + @border_style.setter + def border_style(self, value: WD_BORDER_STYLE | None) -> None: + if value is None: + rPr = self._element.rPr + if rPr is None: + return + bdr = rPr.bdr + if bdr is None: + return + bdr.val = None + return + rPr = self._element.get_or_add_rPr() + bdr = rPr.get_or_add_bdr() + bdr.val = value + + @property + def border_width(self) -> Length | None: + """Border width as a |Length| value, or |None| if not set. + + Read/write. Maps to ``w:rPr/w:bdr/@w:sz`` which is measured in + eighth-points. Returns |None| when the ``w:bdr`` element or the + attribute is absent. Assigning a |Length| or |Pt| value creates the + ``w:bdr`` child if necessary. Assigning |None| clears the attribute. + + .. 
versionadded:: 2026.05.0 + """ + rPr = self._element.rPr + if rPr is None: + return None + bdr = rPr.bdr + if bdr is None: + return None + return bdr.sz + + @border_width.setter + def border_width(self, value: Length | None) -> None: + if value is None: + rPr = self._element.rPr + if rPr is None: + return + bdr = rPr.bdr + if bdr is None: + return + bdr.sz = None + return + rPr = self._element.get_or_add_rPr() + bdr = rPr.get_or_add_bdr() + bdr.sz = value + + def remove_border(self) -> None: + """Remove the entire ``w:rPr/w:bdr`` child element, if present. + + Clears all run-border state in a single call. Has no effect when no + ``w:bdr`` element is present. + + .. versionadded:: 2026.05.0 + """ + rPr = self._element.rPr + if rPr is None: + return + rPr._remove_bdr() # pyright: ignore[reportPrivateUsage] @property def color(self): """A |ColorFormat| object providing a way to get and set the text color for this - font.""" + font. + + Read-only property returning a |ColorFormat|; assignments set the run's + RGB color. Assigning an |RGBColor| is equivalent to + ``font.color.rgb = value``. Assigning |None| clears any direct color + (``w:rPr/w:color``). + + .. versionadded:: 2026.05.0 + Assignment shorthand for ``font.color.rgb = ``. + """ return ColorFormat(self._element) + @color.setter + def color(self, value: RGBColor | None) -> None: + ColorFormat(self._element).rgb = value + @property def complex_script(self) -> bool | None: """Read/write tri-state value. @@ -155,6 +349,212 @@ def italic(self) -> bool | None: @italic.setter def italic(self, value: bool | None) -> None: self._set_bool_prop("i", value) + # Mirror to the complex-script italic toggle. See bold setter + # for the rationale — Word drops italic on complex-script runs + # if only is present. + self._set_bool_prop("iCs", value) + + @property + def kerning(self) -> Length | None: + """Read/write. + + |Length| value specifying the minimum font size for which kerning is automatically + adjusted. 
|None| indicates kerning is not specified (inherited from style + hierarchy). + + .. versionadded:: 2026.05.0 + """ + rPr = self._element.rPr + if rPr is None: + return None + return rPr.kern_val + + @kerning.setter + def kerning(self, value: int | Length | None) -> None: + rPr = self._element.get_or_add_rPr() + rPr.kern_val = None if value is None else Emu(value) + + @property + def language(self) -> str | None: + """Primary (Latin-script) language tag for this run. + + BCP-47 language tag (e.g. ``"en-US"``) or |None| when unset. Maps to + ``w:rPr/w:lang/@w:val``. Assigning a string creates the ``w:lang`` + child if necessary. Assigning |None| clears only the ``w:val`` + attribute — use :meth:`remove_language` to remove the entire + ``w:lang`` element. + + .. versionadded:: 2026.05.0 + """ + rPr = self._element.rPr + if rPr is None: + return None + lang = rPr.lang + if lang is None: + return None + return lang.val + + @language.setter + def language(self, value: str | None) -> None: + if value is None: + rPr = self._element.rPr + if rPr is None: + return + lang = rPr.lang + if lang is None: + return + lang.val = None + return + rPr = self._element.get_or_add_rPr() + lang = rPr.get_or_add_lang() + lang.val = value + + @property + def east_asian_language(self) -> str | None: + """East Asian language tag for this run. + + BCP-47 language tag or |None| when unset. Maps to + ``w:rPr/w:lang/@w:eastAsia``. Assigning a string creates the + ``w:lang`` child if necessary. Assigning |None| clears only the + ``w:eastAsia`` attribute — use :meth:`remove_language` to remove the + entire ``w:lang`` element. + + .. 
versionadded:: 2026.05.0 + """ + rPr = self._element.rPr + if rPr is None: + return None + lang = rPr.lang + if lang is None: + return None + return lang.eastAsia + + @east_asian_language.setter + def east_asian_language(self, value: str | None) -> None: + if value is None: + rPr = self._element.rPr + if rPr is None: + return + lang = rPr.lang + if lang is None: + return + lang.eastAsia = None + return + rPr = self._element.get_or_add_rPr() + lang = rPr.get_or_add_lang() + lang.eastAsia = value + + @property + def bidi_language(self) -> str | None: + """Complex-script (bidirectional) language tag for this run. + + BCP-47 language tag or |None| when unset. Maps to + ``w:rPr/w:lang/@w:bidi``. Assigning a string creates the ``w:lang`` + child if necessary. Assigning |None| clears only the ``w:bidi`` + attribute — use :meth:`remove_language` to remove the entire + ``w:lang`` element. + + .. versionadded:: 2026.05.0 + """ + rPr = self._element.rPr + if rPr is None: + return None + lang = rPr.lang + if lang is None: + return None + return lang.bidi + + @bidi_language.setter + def bidi_language(self, value: str | None) -> None: + if value is None: + rPr = self._element.rPr + if rPr is None: + return + lang = rPr.lang + if lang is None: + return + lang.bidi = None + return + rPr = self._element.get_or_add_rPr() + lang = rPr.get_or_add_lang() + lang.bidi = value + + def remove_language(self) -> None: + """Remove the entire ``w:rPr/w:lang`` child element, if present. + + Clears all language-tag state in a single call. Has no effect when no + ``w:lang`` element is present. + + .. versionadded:: 2026.05.0 + """ + rPr = self._element.rPr + if rPr is None: + return + rPr._remove_lang() # pyright: ignore[reportPrivateUsage] + + @property + def east_asian_layout(self) -> EastAsianLayout | None: + """|EastAsianLayout| proxy for this run's ``w:eastAsianLayout``, or |None|. + + Returns |None| when the run has no ``w:rPr/w:eastAsianLayout`` child. 
+ Use :meth:`set_east_asian_layout` to create or update the element and + :meth:`remove_east_asian_layout` to drop it entirely. + + .. versionadded:: 2026.05.0 + """ + rPr = self._element.rPr + if rPr is None: + return None + eal = rPr.eastAsianLayout + if eal is None: + return None + return EastAsianLayout(eal) + + def set_east_asian_layout( + self, + *, + id: int | None = None, + two_lines_in_one: bool | None = None, + vertical_alignment: bool | None = None, + compressed: bool | None = None, + ) -> EastAsianLayout: + """Create or update the ``w:eastAsianLayout`` element on this run. + + Any keyword argument left at its default of |None| is left unchanged + when the element already exists. To clear an attribute, use the + corresponding setter on the returned |EastAsianLayout| (e.g. + ``layout.two_lines_in_one = None``) or call + :meth:`remove_east_asian_layout` to drop the element entirely. + + .. versionadded:: 2026.05.0 + """ + rPr = self._element.get_or_add_rPr() + eal = rPr.get_or_add_eastAsianLayout() + layout = EastAsianLayout(eal) + if id is not None: + layout.id = id + if two_lines_in_one is not None: + layout.two_lines_in_one = two_lines_in_one + if vertical_alignment is not None: + layout.vertical_alignment = vertical_alignment + if compressed is not None: + layout.compressed = compressed + return layout + + def remove_east_asian_layout(self) -> None: + """Remove the ``w:rPr/w:eastAsianLayout`` element, if present. + + Has no effect when no ``w:rPr`` or no ``w:eastAsianLayout`` child is + present. + + .. 
versionadded:: 2026.05.0 + """ + rPr = self._element.rPr + if rPr is None: + return + if rPr.eastAsianLayout is None: + return + rPr._remove_eastAsianLayout() # pyright: ignore[reportPrivateUsage] @property def imprint(self) -> bool | None: @@ -181,6 +581,59 @@ def math(self) -> bool | None: def math(self, value: bool | None) -> None: self._set_bool_prop("oMath", value) + @property + def name_cs(self) -> str | None: + """The Complex Script typeface name for this |Font|. + + Causes Complex Script text it controls to appear in the named font. |None| + indicates the typeface is inherited from the style hierarchy. + + .. versionadded:: 2026.05.0 + """ + rPr = self._element.rPr + if rPr is None: + return None + return rPr.rFonts_cs + + @name_cs.setter + def name_cs(self, value: str | None) -> None: + rPr = self._element.get_or_add_rPr() + rPr.rFonts_cs = value + + @property + def name_east_asia(self) -> str | None: + """The East Asian typeface name for this |Font|. + + Causes East Asian text it controls to appear in the named font. |None| indicates + the typeface is inherited from the style hierarchy. Alias for `name_far_east`. + + .. versionadded:: 2026.05.0 + """ + return self.name_far_east + + @name_east_asia.setter + def name_east_asia(self, value: str | None) -> None: + self.name_far_east = value + + @property + def name_far_east(self) -> str | None: + """The East Asian typeface name for this |Font|. + + Causes East Asian (CJK) text it controls to appear in the named font. |None| + indicates the typeface is inherited from the style hierarchy. + + .. versionadded:: 2026.05.0 + """ + rPr = self._element.rPr + if rPr is None: + return None + return rPr.rFonts_eastAsia + + @name_far_east.setter + def name_far_east(self, value: str | None) -> None: + rPr = self._element.get_or_add_rPr() + rPr.rFonts_eastAsia = value + @property def name(self) -> str | None: """The typeface name for this |Font|. 
@@ -198,6 +651,11 @@ def name(self, value: str | None) -> None: rPr = self._element.get_or_add_rPr() rPr.rFonts_ascii = value rPr.rFonts_hAnsi = value + # -- mirror the ascii / hAnsi name onto the complex-script slot so + # -- bidi (RTL) runs use the same typeface (upstream #510, #430, #973). + # -- Callers that want a different CS font can explicitly set + # -- :attr:`Font.name_cs` afterwards. -- + rPr.rFonts_cs = value @property def no_proof(self) -> bool | None: @@ -238,6 +696,72 @@ def rtl(self) -> bool | None: def rtl(self, value: bool | None) -> None: self._set_bool_prop("rtl", value) + @property + def right_to_left(self) -> bool: + """|True| when the run is flagged for right-to-left (bidi) rendering. + + Maps to ``w:rPr/w:rtl``. Returns |False| when the element is absent. + Assigning |True| inserts ``w:rtl``; assigning |False| or |None| removes + it. When |True|, the run is rendered right-to-left using the + complex-script (CS) font—appropriate for Arabic, Hebrew, or Farsi text. + + .. versionadded:: 2026.05.0 + """ + rPr = self._element.rPr + if rPr is None: + return False + rtl = rPr.rtl + if rtl is None: + return False + return rtl.val + + @right_to_left.setter + def right_to_left(self, value: bool | None) -> None: + rPr = self._element.get_or_add_rPr() + if value in (None, False): + rPr._remove_rtl() # pyright: ignore[reportPrivateUsage] + else: + rtl = rPr.get_or_add_rtl() + rtl.val = True + + @property + def shading_color(self) -> RGBColor | None: + """Run-level background (shading) color as an |RGBColor|, or |None| if not set. + + Read/write. Reads the ``w:fill`` attribute of ``w:rPr/w:shd``. Returns |None| + when ``w:shd`` is absent or its ``w:fill`` is missing or set to ``"auto"``. + + Assigning an |RGBColor| writes ``w:rPr/w:shd`` with ``w:val="clear"`` and + ``w:fill="RRGGBB"``. Assigning |None| removes the ``w:shd`` child. Distinct + from :attr:`highlight_color`, which is a predefined palette applied as + ``w:highlight``. + + .. 
versionadded:: 2026.05.0 + """ + rPr = self._element.rPr + if rPr is None: + return None + shd = rPr.shd + if shd is None: + return None + fill = shd.fill + if fill is None or not isinstance(fill, RGBColor): + return None + return fill + + @shading_color.setter + def shading_color(self, value: RGBColor | None) -> None: + if value is None: + rPr = self._element.rPr + if rPr is None: + return + rPr._remove_shd() # pyright: ignore[reportPrivateUsage] + return + rPr = self._element.get_or_add_rPr() + shd = rPr.get_or_add_shd() + shd.val = WD_SHADING_PATTERN.CLEAR + shd.fill = value + @property def shadow(self) -> bool | None: """Read/write tri-state value. @@ -251,6 +775,102 @@ def shadow(self) -> bool | None: def shadow(self, value: bool | None) -> None: self._set_bool_prop("shadow", value) + @property + def cs_size(self) -> Length | None: + """Complex-script (RTL / bidi) font height in English Metric Units. + + Maps to ``w:rPr/w:szCs``. Returns |None| when ``w:szCs`` is absent + (inherited from the style hierarchy). Assigning |None| removes the + attribute. + + Word uses ``w:szCs`` for Arabic / Hebrew / Farsi glyph sizing and + leaves them at the default when only ``w:sz`` is set. The main + :attr:`size` setter also writes ``w:szCs`` for symmetry; use + ``cs_size`` to override the complex-script size independently. + + .. versionadded:: 2026.05.0 + """ + rPr = self._element.rPr + if rPr is None: + return None + return rPr.szCs_val + + @cs_size.setter + def cs_size(self, emu: int | Length | None) -> None: + rPr = self._element.get_or_add_rPr() + rPr.szCs_val = None if emu is None else Emu(emu) + + @property + def character_scale(self) -> int | None: + """Horizontal character-scale percentage (``w:rPr/w:w/@w:val``). + + Integer percent, e.g. ``100`` for normal width, ``200`` for double- + width, ``50`` for half-width. Returns |None| when ``w:w`` is absent + (inherited). Assigning |None| removes the element. + + .. 
versionadded:: 2026.05.0 + """ + rPr = self._element.rPr + if rPr is None: + return None + return rPr.w_val + + @character_scale.setter + def character_scale(self, value: int | None) -> None: + rPr = self._element.get_or_add_rPr() + rPr.w_val = value + + @property + def ligatures(self) -> str | None: + """OpenType ligature style (``w:rPr/w14:ligatures/@w14:val``). + + String value such as ``"none"``, ``"standard"``, + ``"standardContextual"``, ``"historical"``, ``"discretional"``, + ``"all"``, or combinations like ``"standardContextualHistorical"``. + Returns |None| when ``w14:ligatures`` is absent. Assigning |None| + removes the element. + + .. versionadded:: 2026.05.0 + """ + rPr = self._element.rPr + if rPr is None: + return None + return rPr.ligatures_val + + @ligatures.setter + def ligatures(self, value: str | None) -> None: + rPr = self._element.get_or_add_rPr() + rPr.ligatures_val = value + + def copy_to(self, target: "Font") -> None: + """Replace `target`'s ``w:rPr`` children with a deep copy of this ``w:rPr``. + + When this font has no ``w:rPr``, `target`'s ``w:rPr`` children (and + attributes) are cleared but the element is preserved. When this font + does have an ``w:rPr``, the target's ``w:rPr`` is ensured to exist + and its contents are replaced — the target run's character + formatting becomes identical to this run's. + + .. 
versionadded:: 2026.05.0 + """ + from copy import deepcopy + + source_rPr = self._element.rPr + target_rPr = target._element.get_or_add_rPr() + # -- clear target's existing children and attributes -- + for child in list(target_rPr): + target_rPr.remove(child) + for attr_name in list(target_rPr.attrib): + del target_rPr.attrib[attr_name] + if source_rPr is None: + return + # -- copy attributes -- + for attr_name, attr_value in source_rPr.attrib.items(): + target_rPr.set(attr_name, attr_value) + # -- deep-copy children -- + for child in source_rPr: + target_rPr.append(deepcopy(child)) + @property def size(self) -> Length | None: """Font height in English Metric Units (EMU). @@ -275,7 +895,13 @@ def size(self) -> Length | None: @size.setter def size(self, emu: int | Length | None) -> None: rPr = self._element.get_or_add_rPr() - rPr.sz_val = None if emu is None else Emu(emu) + length = None if emu is None else Emu(emu) + rPr.sz_val = length + # -- also write ``w:szCs`` so complex-script / bidi (RTL) runs inherit + # -- the same size. Word uses ``w:szCs`` for Arabic / Hebrew / Farsi + # -- glyphs and leaves them at the default when only ``w:sz`` is set + # -- (upstream #510, #430, #973). -- + rPr.szCs_val = length @property def small_caps(self) -> bool | None: @@ -426,3 +1052,74 @@ def _set_bool_prop(self, name: str, value: bool | None): """Assign `value` to the boolean child `name` of `w:rPr`.""" rPr = self._element.get_or_add_rPr() rPr._set_bool_val(name, value) # pyright: ignore[reportPrivateUsage] + + +class EastAsianLayout: + """Proxy for a run-level ``w:eastAsianLayout`` element. + + Provides read/write access to the East Asian typography attributes + (``@w:id``, ``@w:combine``, ``@w:vert``, ``@w:vertCompress``). Accessed + via :attr:`Font.east_asian_layout`. + + .. 
versionadded:: 2026.05.0 + """ + + def __init__(self, eastAsianLayout: CT_EastAsianLayout): + self._element = eastAsianLayout + + @property + def id(self) -> int | None: + """Unique identifier (``w:eastAsianLayout/@w:id``), or |None|. + + .. versionadded:: 2026.05.0 + """ + return self._element.id + + @id.setter + def id(self, value: int | None) -> None: + self._element.id = value + + @property + def two_lines_in_one(self) -> bool | None: + """|True| when two lines are rendered as one combined glyph. + + Maps to ``w:eastAsianLayout/@w:combine``. Returns |None| when the + attribute is absent. + + .. versionadded:: 2026.05.0 + """ + return self._element.combine + + @two_lines_in_one.setter + def two_lines_in_one(self, value: bool | None) -> None: + self._element.combine = value + + @property + def vertical_alignment(self) -> bool | None: + """|True| when the run is laid out vertically. + + Maps to ``w:eastAsianLayout/@w:vert``. Returns |None| when the + attribute is absent. + + .. versionadded:: 2026.05.0 + """ + return self._element.vert + + @vertical_alignment.setter + def vertical_alignment(self, value: bool | None) -> None: + self._element.vert = value + + @property + def compressed(self) -> bool | None: + """|True| when vertical text is compressed. + + Maps to ``w:eastAsianLayout/@w:vertCompress``. Returns |None| when + the attribute is absent. + + .. 
versionadded:: 2026.05.0 + """ + return self._element.vertCompress + + @compressed.setter + def compressed(self, value: bool | None) -> None: + self._element.vertCompress = value diff --git a/src/docx/text/hyperlink.py b/src/docx/text/hyperlink.py index a23df1c74..1d4233cf6 100644 --- a/src/docx/text/hyperlink.py +++ b/src/docx/text/hyperlink.py @@ -9,12 +9,14 @@ from typing import TYPE_CHECKING +from docx.opc.constants import RELATIONSHIP_TYPE as RT from docx.shared import Parented from docx.text.run import Run if TYPE_CHECKING: import docx.types as t from docx.oxml.text.hyperlink import CT_Hyperlink + from docx.styles.style import CharacterStyle class Hyperlink(Parented): @@ -43,6 +45,47 @@ def address(self) -> str: rId = self._hyperlink.rId return self._parent.part.rels[rId].target_ref if rId else "" + @address.setter + def address(self, value: str | None) -> None: + """Assign the external URL for this hyperlink. + + Assigning a non-empty string creates (or reuses) an external relationship + of type ``HYPERLINK`` on the owning part and writes its ``rId`` to + ``w:hyperlink/@r:id``. Assigning |None| or an empty string removes the + ``r:id`` attribute, leaving the hyperlink as an internal/anchor-only + link. The ``w:anchor`` attribute is not affected. + + .. versionadded:: 2026.05.0 + """ + if value: + rId = self._parent.part.relate_to(value, RT.HYPERLINK, is_external=True) + self._hyperlink.rId = rId + else: + self._hyperlink.rId = None + + def add_run( + self, text: str | None = None, style: str | CharacterStyle | None = None + ) -> Run: + """Append a run containing `text` to this hyperlink and return it. + + `text` becomes the run's visible text; tab (``\\t``), newline (``\\n``), + and carriage-return (``\\r``) characters are mapped to the appropriate + XML forms. When `text` is |None| the new run is empty. When `style` is + provided it is applied to the new run as a character style. 
+ + This supports multi-run hyperlinks where parts of the link text need + different formatting (e.g. a word in bold within the link text). + + .. versionadded:: 2026.05.0 + """ + r = self._hyperlink.add_r() + run = Run(r, self._parent) + if text: + run.text = text + if style is not None: + run.style = style + return run + @property def contains_page_break(self) -> bool: """True when the text of this hyperlink is broken across page boundaries. @@ -79,6 +122,18 @@ def fragment(self) -> str: """ return self._hyperlink.anchor or "" + @fragment.setter + def fragment(self, value: str | None) -> None: + """Assign the ``w:anchor`` value for this hyperlink. + + Assigning a non-empty string sets the ``w:anchor`` attribute (the + "named anchor" or URI fragment). Assigning |None| or an empty string + removes the attribute. The external ``r:id`` / address is not affected. + + .. versionadded:: 2026.05.0 + """ + self._hyperlink.anchor = value if value else None + @property def runs(self) -> list[Run]: """List of |Run| instances in this hyperlink. diff --git a/src/docx/text/pagebreak.py b/src/docx/text/pagebreak.py index 0977ccea9..645001168 100644 --- a/src/docx/text/pagebreak.py +++ b/src/docx/text/pagebreak.py @@ -12,6 +12,9 @@ from docx.text.paragraph import Paragraph +__all__ = ["RenderedPageBreak"] + + class RenderedPageBreak(Parented): """A page-break inserted by Word during page-layout for print or display purposes. 
diff --git a/src/docx/text/paragraph.py b/src/docx/text/paragraph.py index 234ea66cb..cf454e5dc 100644 --- a/src/docx/text/paragraph.py +++ b/src/docx/text/paragraph.py @@ -2,38 +2,280 @@ from __future__ import annotations -from typing import TYPE_CHECKING, Iterator, List, cast +import os +from typing import IO, TYPE_CHECKING, cast +from collections.abc import Iterator +from docx.drawing import Drawing +from docx.enum.section import WD_SECTION_START +from docx.enum.shape import WD_ANCHOR_H, WD_ANCHOR_V, WD_SHAPE, WD_WRAP_TYPE from docx.enum.style import WD_STYLE_TYPE +from docx.enum.text import WD_BREAK +from docx.fields import Field +from docx.form_fields import FormField +from docx.opc.constants import RELATIONSHIP_TYPE as RT +from docx.oxml.drawing import CT_Drawing +from docx.oxml.shape import CT_Anchor +from docx.oxml.table import CT_Tbl from docx.oxml.text.run import CT_R -from docx.shared import StoryChild +from docx.shape import FloatingImage +from docx.shared import Inches, StoryChild from docx.styles.style import ParagraphStyle from docx.text.hyperlink import Hyperlink from docx.text.pagebreak import RenderedPageBreak from docx.text.parfmt import ParagraphFormat +from docx.tracked_changes import MoveRevision, TrackedChange from docx.text.run import Run if TYPE_CHECKING: import docx.types as t + from docx.bookmarks import Bookmark + from docx.content_controls import ContentControl, ContentControlType from docx.enum.text import WD_PARAGRAPH_ALIGNMENT + from docx.embedded_objects import EmbeddedObject + from docx.equations import Equation + from docx.ink import InkAnnotation + from docx.oxml.content_controls import CT_Sdt + from docx.oxml.document import CT_Body + from docx.oxml.math import CT_OMath, CT_OMathPara from docx.oxml.text.paragraph import CT_P + from docx.permissions import PermissionRange + from docx.section import Section + from docx.shared import Length from docx.styles.style import CharacterStyle + from docx.table import Table as _Table + from 
docx.styles.style import _TableStyle # pyright: ignore[reportPrivateUsage] + + +__all__ = ["Paragraph"] class Paragraph(StoryChild): """Proxy object wrapping a `<w:p>` element.""" def __init__(self, p: CT_P, parent: t.ProvidesStoryPart): - super(Paragraph, self).__init__(parent) + super().__init__(parent) self._p = self._element = p - def add_run(self, text: str | None = None, style: str | CharacterStyle | None = None) -> Run: + @property + def element(self) -> CT_P: + """Public alias for the underlying ``w:p`` element. + + Exposes :attr:`_element` / :attr:`_p` without the private-attribute + access warnings generated by pyright. Use this when you need to drop + into the oxml layer (closes upstream#1445). + + .. versionadded:: 2026.05.0 + """ + return self._p + + def add_bookmark( + self, + name: str, + start_run: Run | None = None, + end_run: Run | None = None, + ) -> Bookmark: + """Add a bookmark to this paragraph and return it. + + `name` is the bookmark name, which must be unique within the document. + + When `start_run` and `end_run` are both |None|, the bookmark wraps the entire + paragraph content. When `start_run` is provided, the bookmark starts before that + run. When `end_run` is provided, the bookmark ends after that run. When only + `start_run` is provided, `end_run` defaults to `start_run`. + + .. 
versionadded:: 2026.05.0 + """ + from docx.bookmarks import Bookmark + + body = self._get_body() + bookmark_id = self._next_bookmark_id(body) + + if start_run is None and end_run is None: + self._p.add_bookmark(bookmark_id, name) + else: + if start_run is None: + start_run = end_run + if end_run is None: + end_run = start_run + assert start_run is not None + assert end_run is not None + start_run._r.insert_bookmark_start_before(bookmark_id, name) + end_run._r.insert_bookmark_end_after(bookmark_id) + + bookmarkStart = self._p.xpath(f".//w:bookmarkStart[@w:id='{bookmark_id}']") + return Bookmark(bookmarkStart[0], body) + + def add_permission_range( + self, + name: str | None = None, + user: str | None = None, + edit_group: str | None = None, + ) -> PermissionRange: + """Add a permission range wrapping this paragraph and return it. + + `user` is the single-user restriction (`w:ed`), and `edit_group` + is a group restriction (`w:edGrp`, e.g. ``"everyone"`` or + ``"current"``). At least one should typically be supplied. + + `name` is accepted for symmetry with ``add_bookmark()`` but is not + persisted on the element — `w:permStart` has no `@w:name` attribute in + OOXML; it is kept in the signature purely for call-site readability. + + .. versionadded:: 2026.05.0 + """ + from docx.permissions import PermissionRange + from docx.oxml.permissions import CT_PermStart + + body = self._get_body() + perm_id = self._next_permission_range_id(body) + + self._p.add_permission_range(perm_id, edit_group=edit_group, user=user) + + permStart = self._p.xpath(f".//w:permStart[@w:id='{perm_id}']")[0] + return PermissionRange(cast(CT_PermStart, permStart), body) + + @property + def permission_ranges(self) -> list[PermissionRange]: + """List of |PermissionRange| objects rooted at `w:permStart` in this paragraph. + + +.. 
versionadded:: 2026.05.0 + +""" + from docx.permissions import PermissionRange + from docx.oxml.permissions import CT_PermStart + + body = self._get_body() + return [ + PermissionRange(cast(CT_PermStart, ps), body) + for ps in self._p.xpath(".//w:permStart") + ] + + @staticmethod + def _next_permission_range_id(body) -> int: + """Return the next available `w:permStart/@w:id` in the document body.""" + used_ids = [int(x) for x in body.xpath(".//w:permStart/@w:id")] + return max(used_ids, default=-1) + 1 + + def _get_body(self) -> CT_Body: + """Return the w:body ancestor element.""" + from docx.oxml.document import CT_Body + + ancestor = self._p.getparent() + while ancestor is not None and not isinstance(ancestor, CT_Body): + ancestor = ancestor.getparent() + if ancestor is None: + raise ValueError("paragraph is not contained in a document body") + return ancestor + + @staticmethod + def _next_bookmark_id(body) -> int: + """Return the next available bookmark ID in the document body.""" + used_ids = [int(x) for x in body.xpath(".//w:bookmarkStart/@w:id")] + return max(used_ids, default=-1) + 1 + + def add_hyperlink( + self, + url: str | None = None, + text: str | None = None, + style: str | CharacterStyle | None = "Hyperlink", + anchor: str | None = None, + ) -> Hyperlink: + """Append a hyperlink to this paragraph and return a |Hyperlink| object. + + `url` is the target URL for an external hyperlink (e.g. "https://example.com"). + `text` is the visible link text; defaults to `url` or `anchor` when not provided. + `style` is the character style for the hyperlink run, defaulting to "Hyperlink". + `anchor` is a bookmark name for an internal document link. + + Either `url` or `anchor` must be provided, but not both. + + .. 
versionadded:: 2026.05.0 + """ + if url is None and anchor is None: + raise ValueError("Either url or anchor must be provided") + if url is not None and anchor is not None: + raise ValueError("Only one of url or anchor may be provided, not both") + + display_text = text if text is not None else (url or anchor or "") + + rId = None + if url is not None: + rId = self.part.relate_to(url, RT.HYPERLINK, is_external=True) + + rPr = None + if style is not None: + from docx.oxml.ns import qn + from docx.oxml.parser import OxmlElement + + style_id = self.part.get_style_id(style, WD_STYLE_TYPE.CHARACTER) + if style_id is not None: + rPr = OxmlElement("w:rPr") + rStyle = OxmlElement("w:rStyle") + rStyle.set(qn("w:val"), style_id) + rPr.append(rStyle) + + hyperlink_elm = self._p.add_hyperlink(rId, anchor, display_text, rPr) + return Hyperlink(hyperlink_elm, self) + + def insert_hyperlink_at( + self, + run: Run, + url: str | None = None, + anchor: str | None = None, + start: int | None = None, + end: int | None = None, + ) -> Hyperlink: + """Wrap (part of) `run` in a new hyperlink and return the |Hyperlink|. + + `run` must be a direct child |Run| of this paragraph (i.e. appearing in + :attr:`runs`). Exactly one of `url` (external) or `anchor` (internal + bookmark) must be provided. When `start` and/or `end` are given, the + run is split at those character offsets so only the selected slice + becomes the hyperlink content; otherwise the entire run is wrapped. + + Run formatting is preserved on every piece produced by splitting. This + method does not apply the "Hyperlink" character style automatically — + assign it via :attr:`Run.style` on the returned hyperlink's runs if + desired. + + .. 
versionadded:: 2026.05.0 + """ + if (url is None) == (anchor is None): + raise ValueError("Exactly one of url or anchor must be provided") + + target_run = run + text_len = len(run.text) + + # -- split off trailing portion first so `start` offset stays valid -- + if end is not None and end < text_len: + target_run, _tail = target_run.split(end) + if start is not None and start > 0: + _head, target_run = target_run.split(start) + + return target_run.make_hyperlink(url=url, anchor=anchor) + + def add_run( + self, + text: str | None = None, + style: str | CharacterStyle | None = None, + track_author: str | None = None, + ) -> Run: """Append run containing `text` and having character-style `style`. `text` can contain tab (``\\t``) characters, which are converted to the appropriate XML form for a tab. `text` can also include newline (``\\n``) or carriage return (``\\r``) characters, each of which is converted to a line break. When `text` is `None`, the new run is empty. + + If `track_author` is supplied (or if an enclosing + :meth:`Document.tracked_changes` context is active), the new run is + wrapped in a `w:ins` tracked-insertion marker attributed to that + author. Closes upstream#1025. + + .. versionadded:: 2026.05.0 + Added ``track_author`` keyword argument. """ r = self._p.add_r() run = Run(r, self) @@ -41,8 +283,498 @@ def add_run(self, text: str | None = None, style: str | CharacterStyle | None = run.text = text if style: run.style = style + _maybe_wrap_tracked_run(r, track_author, self) return run + def add_text(self, text: str) -> Run: + """Append `text` to the last run of this paragraph, or create a new run. + + When the paragraph already contains at least one run, a ``w:t`` element + containing `text` is appended to that last run. The run's existing + character formatting (``w:rPr``) is preserved. When the paragraph has no + runs, a new run is created and `text` assigned to it. Returns the run + that now holds the appended text. 
+ + Unlike :meth:`add_run`, this method does not split ``\\t``, ``\\n`` or + ``\\r`` characters into separate elements — the entire `text` is placed + in a single ``w:t`` element, with ``xml:space="preserve"`` applied if + the text has leading or trailing whitespace. + + .. versionadded:: 2026.05.0 + """ + runs = self._p.xpath("./w:r") + if runs: + r = runs[-1] + r.add_t(text) + return Run(cast(CT_R, r), self) + # -- no existing run; create one and set its text -- + return self.add_run(text) + + def add_content_control( + self, + type: ContentControlType, + tag: str | None = None, + title: str | None = None, + ) -> ContentControl: + """Append an inline content control (structured document tag) to this paragraph. + + `type` is a :class:`ContentControlType` member. `tag` becomes the programmatic + `w:sdtPr/w:tag/@w:val` value, and `title` becomes `w:sdtPr/w:alias/@w:val`. + Returns the newly appended |ContentControl|. + + .. versionadded:: 2026.05.0 + """ + from docx.content_controls import ContentControl, new_sdt + + sdt = new_sdt(type, tag=tag, title=title, inline=True) + self._p.append(sdt) + return ContentControl(sdt) + + def add_citation_reference( + self, + tag: str, + result_text: "str | None" = None, + locale_id: int = 1033, + ) -> ContentControl: + """Append an inline citation SDT referencing the bibliography entry `tag`. + + Inserts a ``<w:sdt>`` carrying a ``<w:citation/>`` type marker and + a complex ``CITATION`` field instruction (``CITATION <tag> \\l <lcid>``) + inside ``<w:sdtContent>``. The rendered (cached) text falls back to + ``(tag)`` when `result_text` is not supplied. + + `locale_id` is the LCID used in the field-code ``\\l`` switch; + ``1033`` is ``en-US`` and matches Word's default. + + Does **not** verify that the bibliography actually contains a + matching source — produce the source first with + :meth:`Document.add_citation`. + + .. 
versionadded:: 2026.05.7 + """ + from docx.content_controls import ContentControl + from docx.oxml.ns import qn + from docx.oxml.parser import OxmlElement + + sdt = OxmlElement("w:sdt") + sdtPr = OxmlElement("w:sdtPr") + sdt.append(sdtPr) + # -- allocate a (negative, Word-style) random SDT id -- + import random as _random + + id_elm = OxmlElement("w:id") + id_elm.set(qn("w:val"), str(-_random.randint(1, 2_000_000_000))) + sdtPr.append(id_elm) + citation_marker = OxmlElement("w:citation") + sdtPr.append(citation_marker) + + sdtContent = OxmlElement("w:sdtContent") + sdt.append(sdtContent) + + display = result_text if result_text is not None else f"({tag})" + instr = f" CITATION {tag} \\l {locale_id} " + + # -- build the five-run complex field inside sdtContent directly -- + def _r() -> object: + return OxmlElement("w:r") + + # -- begin -- + r_begin = _r() + fld_begin = OxmlElement("w:fldChar") + fld_begin.set(qn("w:fldCharType"), "begin") + r_begin.append(fld_begin) + sdtContent.append(r_begin) + + # -- instrText -- + r_instr = _r() + instr_text = OxmlElement("w:instrText") + instr_text.set(qn("xml:space"), "preserve") + instr_text.text = instr + r_instr.append(instr_text) + sdtContent.append(r_instr) + + # -- separate -- + r_sep = _r() + fld_sep = OxmlElement("w:fldChar") + fld_sep.set(qn("w:fldCharType"), "separate") + r_sep.append(fld_sep) + sdtContent.append(r_sep) + + # -- result text run -- + r_text = _r() + t_elm = OxmlElement("w:t") + t_elm.set(qn("xml:space"), "preserve") + t_elm.text = display + r_text.append(t_elm) + sdtContent.append(r_text) + + # -- end -- + r_end = _r() + fld_end = OxmlElement("w:fldChar") + fld_end.set(qn("w:fldCharType"), "end") + r_end.append(fld_end) + sdtContent.append(r_end) + + self._p.append(sdt) + return ContentControl(cast("CT_Sdt", sdt)) + + def add_page_break(self) -> Paragraph: + """Append a page-break run to this paragraph and return self. + + .. 
versionadded:: 2026.05.0 + """ + run = self.add_run() + run.add_break(WD_BREAK.PAGE) + return self + + def add_simple_field(self, instr: str, text: str | None = None) -> Field: + """Append a ``<w:fldSimple>`` field to this paragraph and return a |Field|. + + `instr` is the field instruction (e.g. ``"PAGE"`` or ``"REF bookmark1 \\h"``). + `text` is the optional current rendered result, added as a single run + inside the fldSimple element. + + .. versionadded:: 2026.05.0 + """ + fldSimple = self._p.add_fldSimple(instr, text) + return Field.for_simple(fldSimple) + + def add_complex_field(self, instr: str, result_text: str | None = None) -> Field: + """Append a complex field (begin/separate/end) to this paragraph. + + Returns a |Field| wrapping the run that contains the ``begin`` + ``<w:fldChar>`` marker. `instr` is the field instruction (e.g. + ``"PAGE"``) and `result_text`, if provided, is added as a plain + ``<w:r>`` run between the ``separate`` and ``end`` markers. + + .. versionadded:: 2026.05.0 + """ + begin_run = self._p.add_complex_field(instr, result_text) + return Field.for_complex(begin_run) + + def add_text_form_field( + self, + name: str, + default: str = "", + maxlength: int | None = None, + ) -> FormField: + """Append a legacy text form field (``FORMTEXT``) and return it. + + `name` becomes the form field's ``w:name/@w:val`` — the programmatic + identifier used by Word macros and REF fields to retrieve the value. + `default` is the initial value; it is written both to + ``w:textInput/w:default`` and used as the rendered result text so + Word displays it immediately without a field update. `maxlength` is + the character limit (|None| means no limit). + + .. 
versionadded:: 2026.05.0 + """ + from docx.form_fields import _append_form_field, new_text_form_field_ffData + + ffData = new_text_form_field_ffData(name, default=default, maxlength=maxlength) + begin_run = _append_form_field( + self._p, " FORMTEXT ", ffData, result_text=default + ) + return FormField(begin_run) + + def add_checkbox_form_field( + self, + name: str, + checked: bool = False, + ) -> FormField: + """Append a legacy checkbox form field (``FORMCHECKBOX``) and return it. + + `name` becomes the form field's ``w:name/@w:val``. `checked` sets both + the default and current checked state. The rendered result region of + the complex field is left empty — Word shows a checkbox glyph for + ``FORMCHECKBOX`` regardless of the result runs. + + .. versionadded:: 2026.05.0 + """ + from docx.form_fields import _append_form_field, new_checkbox_form_field_ffData + + ffData = new_checkbox_form_field_ffData(name, checked=checked) + begin_run = _append_form_field(self._p, " FORMCHECKBOX ", ffData, result_text="") + return FormField(begin_run) + + def add_dropdown_form_field( + self, + name: str, + options: list[str], + default_index: int = 0, + ) -> FormField: + """Append a legacy dropdown form field (``FORMDROPDOWN``) and return it. + + `name` becomes the form field's ``w:name/@w:val``. `options` are the + list entries the dropdown offers, in display order. `default_index` is + the 0-based index of the option that is initially selected; it is + written to both ``w:default`` and ``w:result``. The rendered result + text is set to the option at `default_index` when that index is in + range, so Word displays the initial value immediately. + + .. 
versionadded:: 2026.05.0 + """ + from docx.form_fields import _append_form_field, new_dropdown_form_field_ffData + + ffData = new_dropdown_form_field_ffData( + name, options=options, default_index=default_index + ) + initial_text = ( + options[default_index] + if 0 <= default_index < len(options) + else "" + ) + begin_run = _append_form_field( + self._p, " FORMDROPDOWN ", ffData, result_text=initial_text + ) + return FormField(begin_run) + + def add_equation( + self, omml_xml: str | bytes, display_mode: bool = False + ) -> Equation: + """Append an OMML equation to this paragraph and return the |Equation|. + + `omml_xml` is an OMML XML string (or bytes) whose root element is + either ``m:oMath`` or ``m:oMathPara``. Namespace declarations for the + ``m`` prefix must be present on the root. When `display_mode` is + |True| and the root is a bare ``m:oMath``, it is wrapped in + ``m:oMathPara`` to render in display mode. + + Raises :class:`ValueError` when the root element is neither + ``m:oMath`` nor ``m:oMathPara``. + + .. versionadded:: 2026.05.0 + """ + from docx.equations import Equation, _make_equation_element + + element = _make_equation_element(omml_xml, display_mode=display_mode) + self._p.append(element) + return Equation(cast("CT_OMath | CT_OMathPara", element)) + + @property + def equations(self) -> list[Equation]: + """List of |Equation| objects for each OMML expression in this paragraph. + + Includes both paragraph-level ``m:oMathPara`` wrappers and loose + inline ``m:oMath`` descendants. Each ``m:oMath`` nested inside an + ``m:oMathPara`` is represented once — by the enclosing + ``m:oMathPara`` — so an equation is not counted twice. + + .. 
versionadded:: 2026.05.0 + """ + from docx.equations import Equation + + result: list[Equation] = [] + # -- top-level (or oMathPara-wrapped) matches first -- + for el in self._p.xpath( + ".//m:oMathPara | .//m:oMath[not(ancestor::m:oMathPara)]" + ): + result.append(Equation(cast("CT_OMath | CT_OMathPara", el))) + return result + + def add_shape( + self, + shape_type, + width: Length | None = None, + height: Length | None = None, + text: str | None = None, + ): + """Append an inline `wps:wsp` DrawingML shape to this paragraph. + + `shape_type` is a :class:`docx.enum.shape.WD_SHAPE` member identifying + the preset geometry. `width` and `height` are |Length| values; they + default to 2 inches by 1 inch when omitted. When `text` is provided the + shape gets a minimal text frame containing that string. + + Returns a :class:`docx.drawing.WordprocessingShape` proxy for the newly + created shape. + + .. versionadded:: 2026.05.0 + """ + from docx.drawing import WordprocessingShape + from docx.oxml.drawing import new_inline_shape_drawing + + if not isinstance(shape_type, WD_SHAPE): + raise TypeError( + "shape_type must be a WD_SHAPE member, got %r" % (shape_type,) + ) + + cx = int(width) if width is not None else int(Inches(2)) + cy = int(height) if height is not None else int(Inches(1)) + + story_part = self.part + shape_id = story_part.next_id + name = "%s %d" % (_shape_name_for(shape_type), shape_id) + + drawing = new_inline_shape_drawing( + shape_type.value, cx, cy, shape_id, name, text=text + ) + + run = self.add_run() + run._r.append(drawing) + + wsp = drawing.xpath( + ".//wp:inline/a:graphic/a:graphicData/wps:wsp" + )[0] + return WordprocessingShape(wsp, self) + + def add_floating_shape( + self, + image_path_or_stream: str | IO[bytes], + x: int | Length = 0, + y: int | Length = 0, + width: int | Length | None = None, + height: int | Length | None = None, + h_anchor: WD_ANCHOR_H | str = WD_ANCHOR_H.COLUMN, + v_anchor: WD_ANCHOR_V | str = WD_ANCHOR_V.PARAGRAPH, + wrap: 
WD_WRAP_TYPE | str = WD_WRAP_TYPE.SQUARE, + ) -> FloatingImage: + """Add a floating image anchored at explicit coordinates and return it. + + `x` / `y` are horizontal / vertical offsets (EMU, or |Length|). + `h_anchor` / `v_anchor` are the horizontal / vertical frame of + reference; accepted as |WD_ANCHOR_H| / |WD_ANCHOR_V| members or the + matching OOXML attribute strings (e.g. ``"page"``). `wrap` is the + text-wrap style as a |WD_WRAP_TYPE| member or its string value. + + This is a coordinate-first counterpart to :meth:`add_floating_image`: + use this method when you want to place a shape at a specific x/y + offset (upstream #1414) rather than fall back to square-wrap with a + zero offset. + + .. versionadded:: 2026.05.0 + """ + return self.add_floating_image( + image_path_or_stream, + width=width, + height=height, + position={ + "h_anchor": h_anchor, + "v_anchor": v_anchor, + "horizontal": int(x), + "vertical": int(y), + "wrap": wrap, + }, + ) + + def add_floating_image( + self, + image_path_or_stream: "str | os.PathLike[str] | IO[bytes]", + width: int | Length | None = None, + height: int | Length | None = None, + position: dict | None = None, + ) -> FloatingImage: + """Add a floating (anchored) image to this paragraph and return it. + + `image_path_or_stream` is a ``str`` path, an :class:`os.PathLike` (e.g. + :class:`pathlib.Path`), or a binary file-like object for the image. + `width` and `height` work the same way as for `add_picture`. + + `position` is an optional dict that may contain any of these keys: + - `horizontal`: horizontal offset (int EMU or |Length|) + - `vertical`: vertical offset (int EMU or |Length|) + - `h_anchor`: |WD_ANCHOR_H| member (defaults to `COLUMN`) + - `v_anchor`: |WD_ANCHOR_V| member (defaults to `PARAGRAPH`) + - `wrap`: |WD_WRAP_TYPE| member (defaults to `SQUARE`) + + .. versionadded:: 2026.05.0 + + .. versionchanged:: 2026.05.0 + Accepts :class:`os.PathLike` path arguments. 
+ """ + if isinstance(image_path_or_stream, os.PathLike): + image_path_or_stream = os.fspath(image_path_or_stream) + anchor = self.part.new_pic_anchor(image_path_or_stream, width, height) + + # -- apply optional positioning overrides -- + if position is not None: + h_anchor = position.get("h_anchor", WD_ANCHOR_H.COLUMN) + v_anchor = position.get("v_anchor", WD_ANCHOR_V.PARAGRAPH) + horizontal = position.get("horizontal", 0) + vertical = position.get("vertical", 0) + wrap = position.get("wrap", WD_WRAP_TYPE.SQUARE) + + if isinstance(h_anchor, WD_ANCHOR_H): + h_anchor_value = h_anchor.value + else: + h_anchor_value = str(h_anchor) + if isinstance(v_anchor, WD_ANCHOR_V): + v_anchor_value = v_anchor.value + else: + v_anchor_value = str(v_anchor) + if isinstance(wrap, WD_WRAP_TYPE): + wrap_value = wrap.value + else: + wrap_value = str(wrap) + + anchor.set_horizontal_position(h_anchor_value, int(horizontal)) + anchor.set_vertical_position(v_anchor_value, int(vertical)) + anchor.set_wrap(wrap_value) + + # -- append the anchor inside a new run's `w:drawing` -- + run = self.add_run() + run._r.add_drawing(anchor) + return FloatingImage(anchor) + + @property + def fields(self) -> list[Field]: + """List of |Field| objects for each field in this paragraph. + + Includes both simple (``w:fldSimple``) and complex (``w:fldChar``) + fields, in document order. + + .. versionadded:: 2026.05.0 + """ + result: list[Field] = [] + for kind, el in self._p.iter_field_elements(): + if kind == "simple": + result.append(Field.for_simple(el)) + else: + result.append(Field.for_complex(el)) + return result + + @property + def form_fields(self) -> list[FormField]: + """List of |FormField| objects for each legacy form field in this paragraph. + + A legacy form field is a complex field whose ``begin`` ``w:fldChar`` + carries a ``w:ffData`` child. Returned in document order. Complex + fields without ``w:ffData`` (e.g. ``PAGE``, ``REF``) are ignored — + those remain accessible via :attr:`fields`. 
+ + .. versionadded:: 2026.05.0 + """ + result: list[FormField] = [] + begin_runs = self._p.xpath( + "./w:r[w:fldChar[@w:fldCharType='begin' and w:ffData]]" + ) + for r in begin_runs: + result.append(FormField(cast(CT_R, r))) + return result + + @property + def floating_images(self) -> list[FloatingImage]: + """A |FloatingImage| instance for each `wp:anchor` in this paragraph. + + .. versionadded:: 2026.05.0 + """ + return [ + FloatingImage(cast(CT_Anchor, a)) + for a in self._p.xpath(".//w:r/w:drawing/wp:anchor") + ] + + @property + def content_controls(self) -> list[ContentControl]: + """List of inline |ContentControl| objects in this paragraph, in document order. + + .. versionadded:: 2026.05.0 + """ + from docx.content_controls import ContentControl + + return [ + ContentControl(cast("CT_Sdt", sdt)) for sdt in self._p.xpath("./w:sdt") + ] + @property def alignment(self) -> WD_PARAGRAPH_ALIGNMENT | None: """A member of the :ref:`WdParagraphAlignment` enumeration specifying the @@ -66,16 +798,172 @@ def clear(self): self._p.clear_content() return self + def delete(self) -> None: + """Remove this paragraph from the document. + + The paragraph element is removed from its parent. After calling this method, + this |Paragraph| object is "defunct" and should not be used further. + + .. versionadded:: 2026.05.0 + """ + p = self._p + parent = p.getparent() + if parent is None: + return + parent.remove(p) + + def clear_page_breaks(self) -> None: + """Remove all page-break ``w:br`` elements (``w:type="page"``) from this paragraph. + + If a run contains only a page break and no other content, the entire run is + removed. If a run contains other content alongside the page break, only the + ``w:br`` element is removed. Does nothing when no page breaks are present. + + ..
versionadded:: 2026.05.0 + """ + for br in self._p.xpath('.//w:br[@w:type="page"]'): + r = br.getparent() + r.remove(br) + # --- remove the run if it's now empty (no child elements and no text) --- + if len(r) == 0 and not r.text: + r.getparent().remove(r) + + @property + def has_section_break(self) -> bool: + """``True`` if this paragraph contains a section break (``w:sectPr`` in its + ``w:pPr``). + + .. versionadded:: 2026.05.0 + """ + pPr = self._p.pPr + if pPr is None: + return False + return pPr.sectPr is not None + @property def contains_page_break(self) -> bool: """`True` when one or more rendered page-breaks occur in this paragraph.""" return bool(self._p.lastRenderedPageBreaks) @property - def hyperlinks(self) -> List[Hyperlink]: + def has_page_break(self) -> bool: + """`True` if this paragraph contains at least one page-break ``w:br`` (``w:type="page"``). + + .. versionadded:: 2026.05.0 + """ + return bool(self._p.xpath('.//w:br[@w:type="page"]')) + + @property + def drawings(self) -> list[Drawing]: + """A |Drawing| instance for each `w:drawing` element in this paragraph. + + .. versionadded:: 2026.05.0 + """ + return [ + Drawing(cast(CT_Drawing, d), self) + for d in self._p.xpath(".//w:drawing") + ] + + @property + def ink_annotations(self) -> list[InkAnnotation]: + """List of |InkAnnotation| objects for each ``w:contentPart`` in this paragraph. + + Returns an empty list when the paragraph contains no ink annotations. Read-only + — python-docx does not support creating or modifying ink annotations. + + A ``w:contentPart`` whose relationship cannot be resolved (for example because + the referenced part is missing from the package) is silently skipped rather + than raising. + + ..
versionadded:: 2026.05.0 + """ + from docx.ink import InkAnnotation + from docx.oxml.ns import qn + from docx.parts.ink import InkPart + + result: list[InkAnnotation] = [] + part = self.part + for cp in self._p.xpath(".//w:contentPart"): + rId = cp.get(qn("r:id")) + if not rId: + continue + try: + ink_part = part.related_parts[rId] + except KeyError: + continue + if not isinstance(ink_part, InkPart): + continue + result.append(InkAnnotation(self, ink_part)) + return result + + @property + def embedded_objects(self) -> list[EmbeddedObject]: + """List of |EmbeddedObject| for each ``w:object/o:OLEObject`` in this paragraph. + + Returns an empty list when the paragraph contains no embedded OLE + objects. Read-only — python-docx does not support creating or + modifying embedded objects. + + An ``o:OLEObject`` whose ``r:id`` cannot be resolved (for example when + the referenced part is missing from the package or is of an unexpected + type) still produces an |EmbeddedObject|, but its + :attr:`EmbeddedObject.blob` returns ``b""`` and + :attr:`EmbeddedObject.embedded_partname` returns |None|. + + .. 
versionadded:: 2026.05.0 + """ + from docx.embedded_objects import EmbeddedObject + from docx.oxml.ns import qn + from docx.parts.embedded_object import EmbeddedObjectPart + + result: list[EmbeddedObject] = [] + part = self.part + for ole_elm in self._p.xpath(".//w:object/o:OLEObject"): + rId = ole_elm.get(qn("r:id")) + embedded_part: EmbeddedObjectPart | None = None + if rId: + candidate = part.related_parts.get(rId) + if isinstance(candidate, EmbeddedObjectPart): + embedded_part = candidate + result.append(EmbeddedObject(self, ole_elm, embedded_part)) + return result + + @property + def hyperlinks(self) -> list[Hyperlink]: """A |Hyperlink| instance for each hyperlink in this paragraph.""" return [Hyperlink(hyperlink, self) for hyperlink in self._p.hyperlink_lst] + def insert_section_break( + self, start_type: WD_SECTION_START = WD_SECTION_START.NEW_PAGE + ) -> Section: + """Insert a section break in this paragraph and return the new |Section|. + + `start_type` is a member of :ref:`WdSectionStart` and defaults to + ``WD_SECTION.NEW_PAGE``. If this paragraph already contains a section break, + its type is replaced rather than a new one being added. + + .. versionadded:: 2026.05.0 + """ + from docx.section import Section as SectionCls + + pPr = self._p.get_or_add_pPr() + sectPr = pPr.get_or_add_sectPr() + sectPr.start_type = start_type + return SectionCls(sectPr, self.part) + + def remove_section_break(self) -> None: + """Remove the section break from this paragraph, if one is present. + + Calling this on a paragraph that has no section break is a no-op. + + .. 
versionadded:: 2026.05.0 + """ + pPr = self._p.pPr + if pPr is None: + return + if pPr.sectPr is not None: + pPr._remove_sectPr() + def insert_paragraph_before( self, text: str | None = None, style: str | ParagraphStyle | None = None ) -> Paragraph: @@ -91,6 +979,190 @@ def insert_paragraph_before( paragraph.style = style return paragraph + def insert_paragraph_after( + self, text: str | None = None, style: str | ParagraphStyle | None = None + ) -> Paragraph: + """Return a newly created paragraph, inserted directly after this paragraph. + + If `text` is supplied, the new paragraph contains that text in a single run. If + `style` is provided, that style is assigned to the new paragraph. The new + paragraph is inserted into the same parent element as this paragraph (which + may be a body, cell, header/footer, or other block-level container). + + .. versionadded:: 2026.05.0 + """ + from docx.oxml.parser import OxmlElement + + new_p = cast("CT_P", OxmlElement("w:p")) + self._p.addnext(new_p) + paragraph = Paragraph(new_p, self._parent) + if text: + paragraph.add_run(text) + if style is not None: + paragraph.style = style + return paragraph + + def add_caption_before( + self, + text: str, + label: str = "Figure", + style: str = "Caption", + ) -> Paragraph: + """Insert a caption paragraph directly before this paragraph and return it. + + This is the common shape for a caption that sits *above* a figure + or table. The inserted paragraph has the standard caption structure: + ``"{label} N: {text}"`` where ``N`` is produced by a + ``SEQ {label} \\* ARABIC`` field. See + :meth:`docx.document.Document.add_caption` for details. + + .. 
versionadded:: 2026.05.0 + """ + from docx.captions import new_caption_paragraph + + paragraph = self.insert_paragraph_before() + return new_caption_paragraph(paragraph, text, label=label, style=style) + + def add_caption_after( + self, + text: str, + label: str = "Figure", + style: str = "Caption", + ) -> Paragraph: + """Insert a caption paragraph directly after this paragraph and return it. + + This is the common shape for a caption that sits *below* a figure + or table. The inserted paragraph has the standard caption structure: + ``"{label} N: {text}"`` where ``N`` is produced by a + ``SEQ {label} \\* ARABIC`` field. See + :meth:`docx.document.Document.add_caption` for details. + + .. versionadded:: 2026.05.0 + """ + from docx.captions import new_caption_paragraph + + paragraph = self.insert_paragraph_after() + return new_caption_paragraph(paragraph, text, label=label, style=style) + + def insert_table_of_contents_before( + self, levels: tuple[int, int] = (1, 3) + ) -> Paragraph: + """Insert a TOC paragraph directly before this paragraph and return it. + + `levels` is a ``(min_level, max_level)`` tuple (default ``(1, 3)``) + controlling which ``"Heading N"`` paragraphs contribute to the cached + preview text. See + :meth:`docx.document.Document.add_table_of_contents` for the full + contract — this method uses the same helper and the same semantics, + just placed before this paragraph rather than appended. + + The preview scans the document body for headings; the headings that + appear *before* or *after* this paragraph are both included, since + Word will rebuild the real TOC on open. + + .. 
versionadded:: 2026.05.0 + """ + from docx.toc import populate_toc_paragraph + + body = self._get_body() + source_paragraphs = [Paragraph(p, self._parent) for p in body.xpath(".//w:p")] + paragraph = self.insert_paragraph_before() + return populate_toc_paragraph(paragraph, source_paragraphs, levels) + + def insert_table_of_contents_after( + self, levels: tuple[int, int] = (1, 3) + ) -> Paragraph: + """Insert a TOC paragraph directly after this paragraph and return it. + + See :meth:`insert_table_of_contents_before` for the full contract; + this variant places the TOC paragraph immediately after this one. + + .. versionadded:: 2026.05.0 + """ + from docx.toc import populate_toc_paragraph + + body = self._get_body() + source_paragraphs = [Paragraph(p, self._parent) for p in body.xpath(".//w:p")] + paragraph = self.insert_paragraph_after() + return populate_toc_paragraph(paragraph, source_paragraphs, levels) + + def insert_table_before( + self, + rows: int, + cols: int, + style: str | _TableStyle | None = None, + width: Length | None = None, + ) -> _Table: + """Return a new table with `rows` rows and `cols` cols, inserted directly + before this paragraph. + + If `style` is supplied, that style is assigned to the new table. The new + table is inserted as a sibling of this paragraph in its parent element. + `width` is an optional total table width; if not provided it defaults to 6 + inches (a reasonable default for a US-Letter page with 1" margins). + + .. 
versionadded:: 2026.05.0 + """ + from docx.table import Table + + table_width = width if width is not None else Inches(6) + tbl = CT_Tbl.new_tbl(rows, cols, table_width) + self._p.addprevious(tbl) + table = Table(tbl, self._parent) + if style is not None: + table.style = style + return table + + def insert_table_after( + self, + rows: int, + cols: int, + style: str | _TableStyle | None = None, + width: Length | None = None, + ) -> _Table: + """Return a new table with `rows` rows and `cols` cols, inserted directly + after this paragraph. + + If `style` is supplied, that style is assigned to the new table. The new + table is inserted as a sibling of this paragraph in its parent element. + `width` is an optional total table width; if not provided it defaults to 6 + inches. + + .. versionadded:: 2026.05.0 + """ + from docx.table import Table + + table_width = width if width is not None else Inches(6) + tbl = CT_Tbl.new_tbl(rows, cols, table_width) + self._p.addnext(tbl) + table = Table(tbl, self._parent) + if style is not None: + table.style = style + return table + + @property + def next_block(self) -> Paragraph | _Table | None: + """The next sibling ``w:p`` or ``w:tbl`` block, or |None| at end-of-container. + + Walks forward through siblings using ``getnext()``, skipping + non-block elements such as ``w:sectPr``, ``w:bookmarkStart``, + ``w:bookmarkEnd``, ``w:permStart``, and ``w:permEnd``. Returns a + |Paragraph| for ``w:p`` and a |Table| for ``w:tbl``. + + .. versionadded:: 2026.05.0 + """ + return _next_block_sibling(self._p, self._parent) + + @property + def previous_block(self) -> Paragraph | _Table | None: + """The previous sibling ``w:p`` or ``w:tbl`` block, or |None| at start-of-container. + + Mirror of :attr:`next_block`; walks backwards via ``getprevious()``. + + .. 
versionadded:: 2026.05.0 + """ + return _previous_block_sibling(self._p, self._parent) + def iter_inner_content(self) -> Iterator[Run | Hyperlink]: """Generate the runs and hyperlinks in this paragraph, in the order they appear. @@ -113,7 +1185,270 @@ def paragraph_format(self): return ParagraphFormat(self._element) @property - def rendered_page_breaks(self) -> List[RenderedPageBreak]: + def font(self): + """A |Font| object providing access to the paragraph-mark character formatting. + + This exposes the ``w:pPr/w:rPr`` element — the "paragraph mark" character + properties that control the font used to render the pilcrow (paragraph + mark) itself and, in some contexts, the default run formatting for the + paragraph. When no ``w:pPr`` or ``w:rPr`` element is present, reads + return |None| for inheritable properties; writes create the chain of + parent elements as needed. + + Note this is distinct from the per-run :attr:`Run.font`, which controls + the appearance of the text runs inside the paragraph. + + .. versionadded:: 2026.05.0 + """ + from docx.text.font import Font + + pPr = self._p.get_or_add_pPr() + return Font(pPr) # type: ignore[arg-type] + + @property + def list_level(self) -> int | None: + """The integer list-level of this paragraph (``w:numPr/w:ilvl/@w:val``). + + Returns |None| when the paragraph has no ``w:numPr`` or ``w:ilvl`` + child. Valid values are ``0`` through ``8``. + + Assigning |None| removes the ``w:ilvl`` child. Assigning an integer + outside the range 0..8 raises ``ValueError``. + + .. 
versionadded:: 2026.05.0 + """ + pPr = self._p.pPr + if pPr is None or pPr.numPr is None: + return None + return pPr.numPr.ilvl_val + + @list_level.setter + def list_level(self, value: int | None) -> None: + if value is not None: + if not isinstance(value, int) or not 0 <= value <= 8: + raise ValueError( + "list_level must be an int in 0..8 or None, got %r" % (value,) + ) + pPr = self._p.get_or_add_pPr() + numPr = pPr.get_or_add_numPr() + numPr.ilvl_val = value + + @property + def list_format(self): + """Named tuple ``(numbering_definition, level)`` describing this paragraph's + list settings. + + Both fields are |None| when the paragraph is not part of a list. The + ``numbering_definition`` is resolved by looking up the paragraph's + ``numId`` in the document's numbering part. + + To set a paragraph's list format, use + :meth:`NumberingDefinition.apply_to`. + + .. versionadded:: 2026.05.0 + """ + from docx.numbering import ListFormat, Numbering, NumberingDefinition + + pPr = self._p.pPr + if pPr is None or pPr.numPr is None: + return ListFormat(None, None) + numPr = pPr.numPr + num_id = numPr.numId_val + level = numPr.ilvl_val + if num_id is None: + return ListFormat(None, level) + + numbering_part = getattr(self.part, "numbering_part", None) + if numbering_part is None: + return ListFormat(None, level) + + numbering_elm = numbering_part.numbering_element + try: + num = numbering_elm.num_having_numId(num_id) + except KeyError: + return ListFormat(None, level) + + abstractNumId_elm = num.abstractNumId + abstract_num_id = abstractNumId_elm.val + try: + abstractNum = numbering_elm.abstractNum_having_abstractNumId( + abstract_num_id + ) + except KeyError: + return ListFormat(None, level) + + numbering_proxy = Numbering(numbering_elm, numbering_part) + return ListFormat( + NumberingDefinition(abstractNum, numbering_proxy), level + ) + + @property + def list_label(self) -> str | None: + """The rendered number/bullet string Word would display for this paragraph. 
+ + Resolves this paragraph's ``numId`` and ``ilvl`` (directly or + style-inherited), walks the document body from the start, and returns + the formatted label — for example ``"1."``, ``"a)"``, ``"I."``, + ``"1.1."``, or ``"•"`` — computed from the level's ``lvlText`` + pattern and ``numFmt`` (``decimal``, ``decimalZero``, ``upperRoman``, + ``lowerRoman``, ``upperLetter``, ``lowerLetter``, ``bullet``). + Returns |None| when this paragraph is not part of any numbered list + or the referenced numbering cannot be resolved. + + Note that the returned label reflects the paragraph's *current* + position in the document body. Counters propagate across siblings at + the same level and reset when a deeper level is entered. This + property walks the full body on each access — cache the result on + the caller's side, or use :meth:`Document.list_labels` for a single + bulk traversal, when label lookup is hot. + + .. versionadded:: 2026.05.0 + """ + from docx.numbering import ListLabelRenderer + + pPr = self._p.pPr + has_direct_numPr = pPr is not None and pPr.numPr is not None + # -- quickly bail out when neither direct numPr nor a pStyle pointing at + # -- a numbered style is present: avoids walking the body unnecessarily -- + has_pStyle = pPr is not None and pPr.style is not None + if not has_direct_numPr and not has_pStyle: + return None + + # -- locate the body element that contains this paragraph -- + try: + body = self._get_body() + except ValueError: + return None + + numbering_part = getattr(self.part, "numbering_part", None) + numbering_elm = ( + numbering_part.numbering_element if numbering_part is not None else None + ) + + styles_elm = None + try: + styles_part = self.part.part_related_by(RT.STYLES) + except (KeyError, AttributeError): + styles_part = None + if styles_part is not None: + styles_elm = getattr(styles_part, "element", None) + + renderer = ListLabelRenderer(numbering_elm, styles_elm) + + # -- walk body paragraphs in order until we hit self -- + 
target_id = id(self._p) + for p in body.xpath(".//w:p"): + label = renderer.label_for(cast("CT_P", p)) + if id(p) == target_id: + return label + return None + + @property + def numbering_format(self): + """Read-only |Level| describing this paragraph's current level in its list. + + Returns |None| if the paragraph is not part of a numbered list, or if the + list-level entry cannot be found in the document's numbering part. + + .. versionadded:: 2026.05.0 + """ + list_format = self.list_format + if list_format.numbering_definition is None: + return None + level = list_format.level if list_format.level is not None else 0 + return list_format.numbering_definition.level(level) + + def restart_numbering( + self, level: int | None = None, start: int = 1 + ) -> None: + """Create a new numbering instance that restarts the current list at `start`. + + The new ``w:num`` reuses the existing abstract definition but adds a + ``w:lvlOverride/w:startOverride`` for `level` (defaulting to this + paragraph's current level). The paragraph's ``w:numPr/w:numId`` is + rewritten to point at the new instance, so subsequent siblings at the + same level continue the fresh count. + + `level` may be provided to override an outer level instead of the + paragraph's own level (closes upstream#25). `start` is the starting + value for the override (default ``1``). Accepts kwargs-style + ``(level=..., start=...)`` calls as well as the positional-order + ``restart_numbering(level, start)`` shape described in the upstream + request. + + Raises ``ValueError`` when the paragraph is not currently part of a + numbered list. + + .. versionadded:: 2026.05.0 + .. versionchanged:: 2026.05.0 + Added the optional ``level`` parameter.
+ """ + pPr = self._p.pPr + if pPr is None or pPr.numPr is None or pPr.numPr.numId_val is None: + raise ValueError( + "paragraph is not part of a numbered list; apply a numbering " + "definition before calling restart_numbering()" + ) + numPr = pPr.numPr + num_id = numPr.numId_val + ilvl = level if level is not None else (numPr.ilvl_val or 0) + + try: + numbering_part = self.part.numbering_part # type: ignore[attr-defined] + except AttributeError as err: + raise ValueError( + "cannot locate numbering part for this paragraph" + ) from err + + numbering_elm = numbering_part.numbering_element + try: + existing_num = numbering_elm.num_having_numId(num_id) + except KeyError as err: + raise ValueError( + "paragraph's numId %d does not match any w:num" % num_id + ) from err + + abstract_num_id = existing_num.abstractNumId.val + new_num = numbering_elm.add_num(abstract_num_id) + override = new_num.add_lvlOverride(ilvl=ilvl) + override.add_startOverride(val=start) + + numPr.numId_val = new_num.numId + + @property + def rsid(self) -> str | None: + """The paragraph's revision-save ID (``w:p/@w:rsidR``) or |None|. + + Read-only. Returns the 8-character hex string Word assigns to mark the + editing session in which this paragraph was last modified, or |None| + when the ``@w:rsidR`` attribute is not present. + + .. versionadded:: 2026.05.0 + """ + return self._p.rsidR + + @property + def stable_id(self) -> str: + """A 16-character hex stable identifier for this paragraph. + + The ID is derived from the paragraph's ``w:rsidR`` (when present), its + position within its parent, and its text content. It is stable across + save/reload *when the paragraph keeps the same position with the same + text*; it changes if the paragraph is reordered or edited. The value + is recomputed on each access and never persisted on the element. + + This is intended for tools that need to correlate paragraphs across a + save/reload cycle in a single editing session. 
For more robust cross- + session tracking, compare :attr:`rsid` combined with :attr:`text`. + + .. versionadded:: 2026.05.0 + """ + from docx.ids import compute_stable_id + + return compute_stable_id(self._p, self._p.text, self._p.rsidR) + + @property + def rendered_page_breaks(self) -> list[RenderedPageBreak]: """All rendered page-breaks in this paragraph. Most often an empty list, sometimes contains one page-break, but can contain @@ -122,10 +1457,52 @@ def rendered_page_breaks(self) -> List[RenderedPageBreak]: return [RenderedPageBreak(lrpb, self) for lrpb in self._p.lastRenderedPageBreaks] @property - def runs(self) -> List[Run]: + def page_breaks_inside(self) -> list[RenderedPageBreak]: + """All ``w:lastRenderedPageBreak`` positions inside this paragraph. + + Same data as :attr:`rendered_page_breaks`, exposed under the name used + by upstream issue #744 so the pagination-detection use case is + discoverable. ``w:lastRenderedPageBreak`` is written by Word when it + renders a document; a programmatically-created document typically has + none until Word opens and re-saves it. The explicit page-break markers + used by :meth:`add_page_break` (``w:br`` with ``w:type="page"``) are a + different element and are reported via :attr:`has_page_break`. + + .. versionadded:: 2026.05.0 + """ + return self.rendered_page_breaks + + @property + def runs(self) -> list[Run]: """Sequence of |Run| instances corresponding to the elements in this - paragraph.""" - return [Run(r, self) for r in self._p.r_lst] + paragraph. + + Descends transparently through ``w:smartTag`` and ``w:customXml`` + wrappers so runs nested inside those elements are reported alongside + direct-child runs (upstream #932, #225). Runs nested inside + ``w:hyperlink``, ``w:fldSimple``, or ``w:sdt`` are not included here + (they surface via :attr:`hyperlinks`, :attr:`fields`, and + :attr:`content_controls` respectively); use :attr:`all_runs` when a + flat view over *every* visible run is needed.
+ """ + return [Run(r, self) for r in self._p.iter_r_elements()] + + @property + def all_runs(self) -> list[Run]: + """Every visible |Run| in this paragraph, including those nested inside + ``w:hyperlink``, ``w:fldSimple``, ``w:sdt/w:sdtContent``, complex-field + ``separate``..``end`` regions, tracked insertions (``w:ins``), move- + destinations (``w:moveTo``), and smartTag / customXml wrappers. + + Runs whose only content is ``w:instrText`` (the field *code*, not the + rendered result) are excluded. This is the iterator routed through by + the Find/Replace helpers in :mod:`docx.search` so that search and + replace work on the text the user actually sees (upstream #1370, + #1021). + + .. versionadded:: 2026.05.0 + """ + return [Run(r, self) for r in self._p.iter_all_r_elements()] @property def style(self) -> ParagraphStyle | None: @@ -146,6 +1523,73 @@ def style(self, style_or_name: str | ParagraphStyle | None): style_id = self.part.get_style_id(style_or_name, WD_STYLE_TYPE.PARAGRAPH) self._p.style = style_id + @property + def formatting_change(self): + """A |FormattingChange| for this paragraph's `w:pPrChange`, or |None|. + + Present when the paragraph's formatting (its `w:pPr`) has been edited while + track-changes is enabled. The returned object exposes the author, date, and + the prior `w:pPr` via ``old_properties``. + + .. versionadded:: 2026.05.0 + """ + from docx.tracked_changes import FormattingChange + + pPr = self._p.pPr + if pPr is None: + return None + pPrChange = pPr.pPrChange # pyright: ignore[reportAttributeAccessIssue] + if pPrChange is None: + return None + return FormattingChange(pPrChange) + + @property + def tracked_changes(self) -> list[TrackedChange]: + """A list of |TrackedChange| objects for each run-level track change. + + Yields proxies for `w:ins`, `w:del`, `w:moveFrom`, and `w:moveTo` children + of this paragraph in document order. 
Move-revision elements are wrapped + in |MoveRevision|, exposing the `@w:name` pairing attribute and + ``.peer`` lookup. + + .. versionadded:: 2026.05.0 + """ + from docx.oxml.tracked_changes import CT_MoveFrom, CT_MoveTo + + result: list[TrackedChange] = [] + for tc in self._p.tracked_change_elements: + if isinstance(tc, (CT_MoveFrom, CT_MoveTo)): + result.append(MoveRevision(tc)) + else: + result.append(TrackedChange(tc)) + return result + + def revision_marks_text( + self, + open_ins: str = "[+", + close_ins: str = "+]", + open_del: str = "[-", + close_del: str = "-]", + ) -> str: + """Return this paragraph's text with tracked-change markers applied. + + Inserted runs (inside ``w:ins``) are wrapped with `open_ins`/`close_ins` + and deleted runs (inside ``w:del``) with `open_del`/`close_del`. Runs + outside of any track-change wrapper are rendered as plain text. + + When the paragraph contains no tracked changes, the return value matches + :attr:`text`. The defaults are CLI-friendly square-bracket markers; callers + can pass ANSI escape sequences (e.g. ``"\\033[4m"`` / ``"\\033[0m"``) to + style terminal output instead. + + .. versionadded:: 2026.05.0 + """ + from docx.tracked_changes import _render_paragraph_marks + + return _render_paragraph_marks( + self._p, open_ins, close_ins, open_del, close_del + ) + @property def text(self) -> str: """The textual content of this paragraph.
def _shape_name_for(shape_type: WD_SHAPE) -> str:
    """Look up the human-readable name prefix for `shape_type`.

    The prefix is taken from `_SHAPE_NAME_PREFIX`; a shape type that has no
    entry there gets the generic ``"Shape"`` prefix.
    """
    try:
        return _SHAPE_NAME_PREFIX[shape_type]
    except KeyError:
        return "Shape"
def _adjacent_block_sibling(
    elm, parent: "t.ProvidesStoryPart", forward: bool
) -> "Paragraph | _Table | None":
    """Return the nearest ``w:p`` / ``w:tbl`` sibling of `elm` in one direction.

    Walks ``getnext()`` when `forward` is true, ``getprevious()`` otherwise,
    skipping every sibling that is neither a `CT_P` nor a `CT_Tbl`. Returns a
    |Paragraph| or table proxy bound to `parent`, or |None| when the siblings
    are exhausted.
    """
    # -- imported here rather than at module top, as the original code did;
    # -- resolved once per call instead of once per loop iteration
    from docx.oxml.text.paragraph import CT_P as _CT_P
    from docx.table import Table as _Table

    sibling = elm.getnext() if forward else elm.getprevious()
    while sibling is not None:
        if isinstance(sibling, CT_Tbl):
            return _Table(sibling, parent)
        if isinstance(sibling, _CT_P):
            return Paragraph(sibling, parent)
        sibling = sibling.getnext() if forward else sibling.getprevious()
    return None


def _next_block_sibling(
    elm, parent: "t.ProvidesStoryPart"
) -> "Paragraph | _Table | None":
    """Return the next sibling ``w:p`` / ``w:tbl``, skipping non-block elements."""
    return _adjacent_block_sibling(elm, parent, forward=True)


def _previous_block_sibling(
    elm, parent: "t.ProvidesStoryPart"
) -> "Paragraph | _Table | None":
    """Return the previous sibling ``w:p`` / ``w:tbl``, skipping non-block elements."""
    return _adjacent_block_sibling(elm, parent, forward=False)
ElementProxy, Emu, Length, Pt, Twips, lazyproperty +from __future__ import annotations + +from typing import TYPE_CHECKING + +from docx.enum.table import WD_SHADING_PATTERN +from docx.enum.text import WD_LINE_SPACING, WD_OUTLINELVL +from docx.shared import ElementProxy, Emu, Length, Pt, RGBColor, Twips, lazyproperty from docx.text.tabstops import TabStops +if TYPE_CHECKING: + from docx.enum.text import ( + WD_BORDER_STYLE, + WD_FRAME_DROP_CAP, + WD_FRAME_H_ALIGN, + WD_FRAME_H_ANCHOR, + WD_FRAME_V_ALIGN, + WD_FRAME_V_ANCHOR, + WD_FRAME_WRAP, + ) + from docx.oxml.text.parfmt import CT_Border, CT_FramePr + from docx.text.font import Font + class ParagraphFormat(ElementProxy): """Provides access to paragraph formatting such as justification, indentation, line spacing, space before and after, and widow/orphan control.""" + @property + def borders(self) -> ParagraphBorders: + """|ParagraphBorders| object providing access to the border settings for this + paragraph. + + .. versionadded:: 2026.05.0 + """ + return ParagraphBorders(self._element) + + @property + def frame(self) -> TextFrame | None: + """|TextFrame| proxy for this paragraph's ``w:framePr`` element, or |None|. + + Returns |None| when the paragraph has no ``w:pPr/w:framePr`` child. A text + frame is an absolutely-positioned text container, the legacy predecessor + to text boxes. + + .. 
def set_frame(
    self,
    *,
    width: Length | None = None,
    height: Length | None = None,
    horizontal_position: Length | None = None,
    vertical_position: Length | None = None,
    horizontal_anchor: WD_FRAME_H_ANCHOR | None = None,
    vertical_anchor: WD_FRAME_V_ANCHOR | None = None,
    wrap: WD_FRAME_WRAP | None = None,
    drop_cap: WD_FRAME_DROP_CAP | None = None,
    lines: int | None = None,
    horizontal_alignment: WD_FRAME_H_ALIGN | None = None,
    vertical_alignment: WD_FRAME_V_ALIGN | None = None,
) -> TextFrame:
    """Ensure this paragraph has a ``w:framePr`` and apply the given settings.

    The ``w:pPr`` and ``w:framePr`` elements are created when missing. Each
    keyword argument that is not |None| is assigned to the same-named property
    of the resulting |TextFrame|; arguments left as |None| are not assigned, so
    an existing frame keeps its current value for them. Use the setters on the
    returned |TextFrame| (e.g. ``frame.width = None``) to clear a single
    attribute, or :meth:`remove_frame` to drop the frame entirely.

    .. versionadded:: 2026.05.0
    """
    frame = TextFrame(self._element.get_or_add_pPr().get_or_add_framePr())
    # -- insertion order fixes the order in which the setters run --
    requested = {
        "width": width,
        "height": height,
        "horizontal_position": horizontal_position,
        "vertical_position": vertical_position,
        "horizontal_anchor": horizontal_anchor,
        "vertical_anchor": vertical_anchor,
        "wrap": wrap,
        "drop_cap": drop_cap,
        "lines": lines,
        "horizontal_alignment": horizontal_alignment,
        "vertical_alignment": vertical_alignment,
    }
    for prop_name, prop_value in requested.items():
        if prop_value is not None:
            setattr(frame, prop_name, prop_value)
    return frame
@property
def right_to_left(self) -> bool:
    """Whether this paragraph uses right-to-left (bidirectional) layout.

    Read/write. Reads ``pPr.bidi_val`` (the ``w:pPr/w:bidi`` setting) and
    returns |False| when the paragraph has no ``w:pPr`` at all. Assignment
    creates ``w:pPr`` if needed and stores the value in ``pPr.bidi_val``; how
    |True|, |False| and |None| map onto the ``w:bidi`` element is decided by
    that oxml-layer attribute.

    .. versionadded:: 2026.05.0
    """
    pPr = self._element.pPr
    return False if pPr is None else pPr.bidi_val


@right_to_left.setter
def right_to_left(self, value: bool | None):
    self._element.get_or_add_pPr().bidi_val = value
@property
def outline_level(self) -> WD_OUTLINELVL | None:
    """Outline level (``w:pPr/w:outlineLvl``), or |None| if not set.

    Read/write. The stored integer is converted to a member of
    :ref:`WdOutlineLvl`. Returns |None| when the paragraph has no ``w:pPr``
    or ``pPr.outlineLvl_val`` is |None|.

    Assigning a `WD_OUTLINELVL` member or an ``int`` in the range 0..10
    stores that integer; assigning |None| stores |None|. Any other integer
    raises |ValueError| and leaves the paragraph untouched.

    .. versionadded:: 2026.05.0
    """
    pPr = self._element.pPr
    if pPr is None:
        return None
    val = pPr.outlineLvl_val
    if val is None:
        return None
    return WD_OUTLINELVL(val)


@outline_level.setter
def outline_level(self, value: WD_OUTLINELVL | int | None) -> None:
    val: int | None = None
    if value is not None:
        val = int(value)
        # -- validate before touching the XML so a rejected value cannot
        # -- leave a freshly-created, empty `w:pPr` behind
        if not 0 <= val <= 10:
            raise ValueError(
                "outline_level must be 0..10 or a WD_OUTLINELVL member, got %r" % (value,)
            )
    self._element.get_or_add_pPr().outlineLvl_val = val
@property
def first_line_chars(self) -> int | None:
    """Value of ``w:pPr/w:ind/@w:firstLineChars``, or |None| if not set.

    Read/write. Returns |None| when the paragraph has no ``w:pPr``; otherwise
    returns ``pPr.first_line_chars``. Assigning |None| when there is no
    ``w:pPr`` or no ``w:ind`` is a no-op, so clearing an absent value never
    creates new elements. Any other assignment creates ``w:pPr`` if needed
    and stores the value.

    .. versionadded:: 2026.05.0
    """
    pPr = self._element.pPr
    return None if pPr is None else pPr.first_line_chars


@first_line_chars.setter
def first_line_chars(self, value: int | None) -> None:
    if value is None:
        existing_pPr = self._element.pPr
        # -- nothing to clear, so don't create w:pPr just to remove a value --
        if existing_pPr is None or existing_pPr.ind is None:
            return
    self._element.get_or_add_pPr().first_line_chars = value
@property
def shading_color(self) -> RGBColor | None:
    """Paragraph background (shading) color as an |RGBColor|, or |None|.

    Read/write. Reads the ``fill`` of ``w:pPr/w:shd``. Returns |None| when
    ``w:pPr`` or ``w:shd`` is absent, or when the fill is missing or is not
    an |RGBColor| (for example the ``"auto"`` value).

    Assigning an |RGBColor| creates ``w:pPr/w:shd`` as needed, sets its
    pattern to `WD_SHADING_PATTERN.CLEAR` and its fill to the color.
    Assigning |None| removes the ``w:shd`` child; it does nothing when the
    paragraph has no ``w:pPr``.

    .. versionadded:: 2026.05.0
    """
    pPr = self._element.pPr
    shd = None if pPr is None else pPr.shd
    if shd is None:
        return None
    fill = shd.fill
    if fill is not None and isinstance(fill, RGBColor):
        return fill
    return None


@shading_color.setter
def shading_color(self, value: RGBColor | None) -> None:
    if value is not None:
        shd = self._element.get_or_add_pPr().get_or_add_shd()
        shd.val = WD_SHADING_PATTERN.CLEAR
        shd.fill = value
        return
    pPr = self._element.pPr
    if pPr is not None:
        pPr._remove_shd()  # pyright: ignore[reportPrivateUsage]
class Border:
    """Proxy for a single border edge of a paragraph.

    Reached through the properties of |ParagraphBorders|, e.g.
    ``paragraph_format.borders.bottom``. `side` names the edge and is used to
    look up the matching child of ``w:pPr/w:pBdr``. Reading never creates
    elements; assigning a non-|None| value creates ``w:pPr``, ``w:pBdr`` and
    the edge element as needed.

    .. versionadded:: 2026.05.0
    """

    def __init__(self, element: object, side: str):
        self._element = element
        self._side = side

    @property
    def _border_elm(self) -> CT_Border | None:
        """The existing edge element, or |None| if any ancestor or the edge is absent."""
        pPr = self._element.pPr  # type: ignore[attr-defined]
        pBdr = None if pPr is None else pPr.pBdr
        return None if pBdr is None else getattr(pBdr, self._side)

    def _get_or_add_border_elm(self) -> CT_Border:
        """Return the edge element, creating it and its ancestors when missing."""
        pBdr = self._element.get_or_add_pPr().get_or_add_pBdr()  # type: ignore[attr-defined]
        add_edge = getattr(pBdr, f"get_or_add_{self._side}")
        return add_edge()

    def _read(self, attr_name: str):
        """Return attribute `attr_name` of the edge element, |None| if no edge exists."""
        edge = self._border_elm
        return None if edge is None else getattr(edge, attr_name)

    def _write(self, attr_name: str, value) -> None:
        """Store `value` on the edge element; |None| clears it on an existing edge only."""
        if value is not None:
            setattr(self._get_or_add_border_elm(), attr_name, value)
            return
        edge = self._border_elm
        if edge is not None:
            setattr(edge, attr_name, None)

    @property
    def style(self) -> WD_BORDER_STYLE | None:
        """The border style as a member of :ref:`WdBorderStyle`, or |None| if no
        border is defined.

        Assigning |None| removes the whole edge element (when a remover for this
        side exists on ``w:pBdr``), not just its style attribute.

        .. versionadded:: 2026.05.0
        """
        return self._read("val")

    @style.setter
    def style(self, value: WD_BORDER_STYLE | None) -> None:
        if value is not None:
            self._get_or_add_border_elm().val = value
            return
        pPr = self._element.pPr  # type: ignore[attr-defined]
        pBdr = None if pPr is None else pPr.pBdr
        if pBdr is None:
            return
        remove_edge = getattr(pBdr, f"_remove_{self._side}", None)
        if remove_edge is not None:
            remove_edge()

    @property
    def width(self) -> Length | None:
        """The border width as a |Length| value, or |None| if not defined.

        Backed by the ``w:sz`` attribute, which the XML stores in eighths of a
        point.

        .. versionadded:: 2026.05.0
        """
        return self._read("sz")

    @width.setter
    def width(self, value: Length | None) -> None:
        self._write("sz", value)

    @property
    def color(self) -> RGBColor | None:
        """|RGBColor| value of the border color, or |None| if not defined.

        A string value in the XML (such as ``"auto"``) is reported as |None|.

        .. versionadded:: 2026.05.0
        """
        color = self._read("color")
        return None if isinstance(color, str) else color

    @color.setter
    def color(self, value: RGBColor | None) -> None:
        self._write("color", value)

    @property
    def space(self) -> Length | None:
        """The spacing between the border and the paragraph text as a |Length|
        value, or |None| if not defined.

        .. versionadded:: 2026.05.0
        """
        return self._read("space")

    @space.setter
    def space(self, value: Length | None) -> None:
        self._write("space", value)
versionadded:: 2026.05.0 + """ + return self._framePr.x + + @horizontal_position.setter + def horizontal_position(self, value: Length | None) -> None: + self._framePr.x = value + + @property + def vertical_position(self) -> Length | None: + """Vertical position (``w:framePr/@w:y``) as a |Length|, or |None| if not set. + + .. versionadded:: 2026.05.0 + """ + return self._framePr.y + + @vertical_position.setter + def vertical_position(self, value: Length | None) -> None: + self._framePr.y = value + + @property + def horizontal_anchor(self) -> WD_FRAME_H_ANCHOR | None: + """Horizontal anchor (``w:framePr/@w:hAnchor``), or |None| if not set. + + .. versionadded:: 2026.05.0 + """ + return self._framePr.hAnchor + + @horizontal_anchor.setter + def horizontal_anchor(self, value: WD_FRAME_H_ANCHOR | None) -> None: + self._framePr.hAnchor = value + + @property + def vertical_anchor(self) -> WD_FRAME_V_ANCHOR | None: + """Vertical anchor (``w:framePr/@w:vAnchor``), or |None| if not set. + + .. versionadded:: 2026.05.0 + """ + return self._framePr.vAnchor + + @vertical_anchor.setter + def vertical_anchor(self, value: WD_FRAME_V_ANCHOR | None) -> None: + self._framePr.vAnchor = value + + @property + def wrap(self) -> WD_FRAME_WRAP | None: + """Text-wrap behaviour (``w:framePr/@w:wrap``), or |None| if not set. + + .. versionadded:: 2026.05.0 + """ + return self._framePr.wrap + + @wrap.setter + def wrap(self, value: WD_FRAME_WRAP | None) -> None: + self._framePr.wrap = value + + @property + def drop_cap(self) -> WD_FRAME_DROP_CAP | None: + """Drop-cap positioning (``w:framePr/@w:dropCap``), or |None| if not set. + + .. versionadded:: 2026.05.0 + """ + return self._framePr.dropCap + + @drop_cap.setter + def drop_cap(self, value: WD_FRAME_DROP_CAP | None) -> None: + self._framePr.dropCap = value + + @property + def lines(self) -> int | None: + """Number of lines for a drop-cap frame (``w:framePr/@w:lines``), or |None|. + + .. 
def _content_type_for_ole(prog_id: str, ext: str, blob: bytes) -> str:
    """Return the best content-type for an embedded OLE payload.

    Resolution order:

    1. `prog_id`, case-insensitively: anything starting with ``excel.sheet``
       maps to the spreadsheet content-type, anything starting with
       ``acroexch.document`` or ``pdf`` maps to PDF.
    2. The file-extension hint `ext` (``xlsx``/``xlsm``, ``pdf``, ``zip``).
       Case and a leading dot are ignored.
    3. Magic bytes of `blob`: a ``PK`` header is a zip container (reported as
       a spreadsheet when `prog_id` mentions Excel), ``%PDF`` is a PDF.
    4. Otherwise the generic oleObject content-type.
    """
    from docx.opc.constants import CONTENT_TYPE as _CT

    pid = (prog_id or "").lower()
    ext = (ext or "").lower().lstrip(".")

    # -- "excel.sheet" also covers "excel.sheetbinarymacro..." ProgIDs --
    if pid.startswith("excel.sheet"):
        return _CT.SML_SHEET
    if pid.startswith(("acroexch.document", "pdf")):
        return _CT.PDF

    if ext in ("xlsx", "xlsm"):
        return _CT.SML_SHEET
    if ext == "pdf":
        return _CT.PDF
    if ext == "zip":
        return _CT.ZIP

    # -- magic-byte sniffing; the spreadsheet extensions were handled above,
    # -- so only the ProgID can still hint at Excel here
    if blob[:2] == b"PK":
        return _CT.SML_SHEET if "excel" in pid else _CT.ZIP
    if blob[:4] == b"%PDF":
        return _CT.PDF
    return _CT.OFC_OLE_OBJECT
If only one is specified, it is used to compute a scaling @@ -75,10 +118,181 @@ def add_picture( ratio of the image. The native size of the picture is calculated using the dots- per-inch (dpi) value specified in the image file, defaulting to 72 dpi if no value is specified, as is often the case. + + When `link` is |True| and `save_with_document` is |False|, the + picture is added as a linked (external) image: no image part is + created in the package and the `a:blip` uses ``r:link`` referencing + an external relationship. `url` may be supplied to link a remote + image; when both `url` and `image_path_or_stream` are supplied, + `url` becomes the link target while the local path is used only to + probe the native dimensions. + + .. versionchanged:: 2026.05.0 + Accepts :class:`os.PathLike` path arguments. + + .. versionadded:: 2026.05.0 + ``link``, ``save_with_document``, and ``url`` parameters. """ - inline = self.part.new_pic_inline(image_path_or_stream, width, height) + if isinstance(image_path_or_stream, os.PathLike): + image_path_or_stream = os.fspath(image_path_or_stream) + inline = self.part.new_pic_inline( + image_path_or_stream, + width, + height, + link=link, + save_with_document=save_with_document, + url=url, + ) self._r.add_drawing(inline) - return InlineShape(inline) + return InlineShape(inline, self.part) + + def add_text_box( + self, + width: Length | None = None, + height: Length | None = None, + text: str | None = None, + ): + """Append a DrawingML text box (``wps:wsp`` + ``wps:txbx``) to this run. + + The text box is created with a rectangular preset geometry of `width` + by `height` (defaults 3" x 1.5") and may be seeded with `text` in a + single initial paragraph. Callers can add further paragraphs via + :meth:`~docx.drawing.WordprocessingShape.add_paragraph`. + + Returns the :class:`~docx.drawing.WordprocessingShape` proxy. + + .. 
versionadded:: 2026.05.0 + """ + from docx.drawing import WordprocessingShape + from docx.enum.shape import WD_SHAPE + from docx.oxml.drawing import new_inline_shape_drawing + from docx.shared import Inches + + cx = int(width) if width is not None else int(Inches(3)) + cy = int(height) if height is not None else int(Inches(1.5)) + + story_part = self.part + shape_id = story_part.next_id + name = "Text Box %d" % shape_id + + drawing = new_inline_shape_drawing( + WD_SHAPE.RECTANGLE.value, + cx, + cy, + shape_id, + name, + text=text if text is not None else "", + ) + self._r.append(drawing) + + wsp = drawing.xpath( + ".//wp:inline/a:graphic/a:graphicData/wps:wsp" + )[0] + return WordprocessingShape(wsp, self) + + def add_ole_object( + self, + ole_path_or_stream: str | IO[bytes], + prog_id: str, + icon_path_or_stream: str | IO[bytes] | None = None, + ) -> "EmbeddedObject": + """Embed the OLE file at `ole_path_or_stream` in this run. + + Creates a new ``EmbeddedObjectPart`` holding the file bytes and appends + a ``/`` element referencing it via an ``r:id`` + relationship. `prog_id` identifies the object's type and must match the + ProgID Word expects for that payload (e.g. ``"Excel.Sheet.12"`` for an + xlsx workbook, ``"AcroExch.Document.DC"`` for a PDF). + + The content-type of the generated embedding part is chosen based on + `prog_id` and the file extension where possible — xlsx payloads get + ``application/vnd.openxmlformats-officedocument.spreadsheetml.sheet``, + pdf payloads get ``application/pdf``, zip archives get + ``application/zip`` and anything else falls back to the generic OLE + ``oleObject`` content-type. + + `icon_path_or_stream` is optional; when provided it is added as a + separate image relationship and referenced by a minimal + ```` fallback so Word displays the + icon in place of the OLE payload. When omitted an empty ``v:shape`` + placeholder is emitted; Word will still open and read the embedded + payload either way. 
+ + Returns an |EmbeddedObject| wrapping the newly-created ``o:OLEObject`` + element. + + .. versionadded:: 2026.05.0 + """ + from docx.embedded_objects import EmbeddedObject + from docx.opc.constants import RELATIONSHIP_TYPE as _RT + from docx.oxml.ns import nsmap, qn + from docx.parts.embedded_object import EmbeddedObjectPart + from lxml import etree as _etree + + # -- read the payload bytes -- + if hasattr(ole_path_or_stream, "read"): + stream = cast("IO[bytes]", ole_path_or_stream) + try: + stream.seek(0) + except Exception: # pragma: no cover - best-effort rewind + pass + blob = stream.read() + ext_hint = "" + else: + ole_path = cast(str, ole_path_or_stream) + with open(ole_path, "rb") as f: + blob = f.read() + ext_hint = ole_path.rsplit(".", 1)[-1].lower() if "." in ole_path else "" + + content_type = _content_type_for_ole(prog_id, ext_hint, blob) + + part = self.part + package = part.package + assert package is not None + partname = package.next_partname("/word/embeddings/oleObject%d.bin") + ole_part = EmbeddedObjectPart(partname, content_type, blob) + rId = part.relate_to(ole_part, _RT.OLE_OBJECT) + + # -- optional icon image -- + image_rId: str | None = None + if icon_path_or_stream is not None: + image_rId, _ = part.get_or_add_image(icon_path_or_stream) + + # -- build the w:object/o:OLEObject fragment -- + obj_elm = _etree.SubElement(self._r, qn("w:object")) + if image_rId is not None: + v_shape = _etree.SubElement( + obj_elm, + "{%s}shape" % nsmap["v"], + ) + v_shape.set("style", "width:72pt;height:72pt") + imagedata = _etree.SubElement( + v_shape, "{%s}imagedata" % nsmap["v"] + ) + imagedata.set(qn("r:id"), image_rId) + imagedata.set("{%s}title" % nsmap["o"], "") + ole_elm = _etree.SubElement(obj_elm, qn("o:OLEObject")) + ole_elm.set("Type", "Embed") + ole_elm.set("ProgID", prog_id) + ole_elm.set("ShapeType", "75") # -- reasonable default -- + ole_elm.set(qn("r:id"), rId) + + # -- resolve the containing paragraph for the EmbeddedObject proxy -- + # The 
def add_symbol(self, char_code: int | str, font: str) -> Symbol:
    """Append a ``w:sym`` element to this run and return a |Symbol| for it.

    `char_code` identifies the glyph's code point within `font`. It may be
    an ``int`` (e.g. ``0xF0E0``) or a hex ``str`` (e.g. ``"F0E0"`` or
    ``"0xF0E0"``). The value is written to the XML as an uppercase hex
    string zero-padded to four characters, so integer and lowercase-hex
    inputs are normalized on write. `font` is the name of the font
    supplying the glyph, for example ``"Wingdings"``.

    Raises |ValueError| when `char_code` is a string that is not valid
    hexadecimal, or when the code point falls outside ``0x0000..0xFFFF``
    and therefore cannot be expressed as a four-digit hex value.

    .. versionadded:: 2026.05.0
    """
    if isinstance(char_code, str):
        code_int = int(char_code, 16)
    else:
        code_int = int(char_code)

    # -- a negative value would serialize as e.g. "-0F0" and a value above
    # -- 0xFFFF as five or more digits; neither is a four-digit hex code --
    if not 0 <= code_int <= 0xFFFF:
        raise ValueError(
            "char_code must be in range 0x0000-0xFFFF, got %r" % (char_code,)
        )

    char_hex = format(code_int, "04X")
    sym = self._r.add_sym(char_hex, font)
    return Symbol(sym)
@property
def equations(self):
    """|Equation| proxies for the OMML math elements nested in this run.

    Collects every descendant ``m:oMathPara`` plus every descendant
    ``m:oMath`` that is not itself inside an ``m:oMathPara``, and wraps each
    one in an |Equation|. Math is normally a sibling of ``w:r`` rather than
    a child, so the list is usually empty; the property exists so runs and
    paragraphs can be queried the same way (see
    :attr:`Paragraph.equations`).

    .. versionadded:: 2026.05.0
    """
    from docx.equations import Equation

    omml_nodes = self._r.xpath(
        ".//m:oMathPara | .//m:oMath[not(ancestor::m:oMathPara)]"
    )
    return [Equation(node) for node in omml_nodes]
def delete(self) -> None:
    """Detach this run's element from whatever element contains it.

    Does nothing when the run element has no parent. Once removed, this
    |Run| object is "defunct" and should not be used further.

    .. versionadded:: 2026.05.0
    """
    run_elm = self._r
    container = run_elm.getparent()
    if container is not None:
        container.remove(run_elm)
@property
def formatting_change(self):
    """The |FormattingChange| wrapping this run's `w:rPrChange`, or |None|.

    |None| is returned when the run has no `w:rPr`, or when its `w:rPr` has
    no `w:rPrChange` child. Otherwise the `w:rPrChange` element is wrapped
    in a |FormattingChange|, which exposes the author, date, and the prior
    `w:rPr` via ``old_properties``.

    .. versionadded:: 2026.05.0
    """
    from docx.tracked_changes import FormattingChange

    rPr = self._r.rPr
    rPrChange = None if rPr is None else rPr.rPrChange  # pyright: ignore[reportAttributeAccessIssue]
    return None if rPrChange is None else FormattingChange(rPrChange)
class Symbol:
    """Proxy for a `w:sym` element: a glyph taken from a named font.

    Word uses `w:sym` for characters drawn from a font such as "Wingdings",
    where the glyph at a code point is not the ordinary Unicode character
    for that code point.

    .. versionadded:: 2026.05.0
    """

    def __init__(self, sym: CT_Sym):
        # -- the wrapped element is reachable under both names --
        self._element = self._sym = sym

    @property
    def char_code(self) -> int:
        """The symbol's code point within ``font``, as an ``int``.

        Parsed from the hex string held in the ``w:char`` attribute
        (e.g. ``"F0E0"`` becomes ``0xF0E0``).

        .. versionadded:: 2026.05.0
        """
        hex_digits = self._sym.char
        return int(hex_digits, 16)

    @property
    def char_hex(self) -> str:
        """The code point as uppercase hex, zero-padded to four digits.

        Derived from :attr:`char_code` rather than read verbatim, so the
        result is normalized regardless of how the attribute was stored.

        .. versionadded:: 2026.05.0
        """
        return f"{self.char_code:04X}"

    @property
    def font(self) -> str:
        """Name of the font supplying the glyph, e.g. ``"Wingdings"``.

        .. versionadded:: 2026.05.0
        """
        return self._sym.font

    def delete(self) -> None:
        """Detach the wrapped `w:sym` element from its parent.

        Does nothing when the element has no parent. Afterwards this
        |Symbol| object is "defunct" and should not be used further.

        .. versionadded:: 2026.05.0
        """
        element = self._sym
        container = element.getparent()
        if container is not None:
            container.remove(element)
+""" + +from __future__ import annotations + +from typing import TYPE_CHECKING, cast + +from docx.shared import ElementProxy + +if TYPE_CHECKING: + import docx.types as t + from docx.oxml.theme import ( + CT_ClrScheme, + CT_FontCollection, + CT_FontScheme, + CT_Theme, + ) + from docx.oxml.xmlchemy import BaseOxmlElement + from docx.shared import RGBColor + + +# -- OOXML ``w:themeColor`` tokens we accept as ``ThemeColors[...]`` keys. -- +# The sequence mirrors ``CT_ClrScheme``'s child order; the ``*Reference`` +# aliases exposed by ``ST_ThemeColor`` are not implemented here — callers +# resolve theme-color references by looking up the matching slot. +_COLOR_SLOTS = ( + "dk1", + "lt1", + "dk2", + "lt2", + "accent1", + "accent2", + "accent3", + "accent4", + "accent5", + "accent6", + "hlink", + "folHlink", +) + + +class Theme(ElementProxy): + """Proxy for the ``a:theme`` root element of the theme part. + + Exposes the theme name and lazy accessors for the color scheme and + font scheme. Read-only — python-docx does not support authoring + themes. + + .. versionadded:: 2026.05.0 + """ + + def __init__( + self, + element: BaseOxmlElement, + parent: t.ProvidesXmlPart | None = None, + ): + super().__init__(element, parent) + self._theme = cast("CT_Theme", element) + + @property + def name(self) -> str | None: + """The value of ``a:theme/@name``, or |None| if the attribute is absent. + + .. versionadded:: 2026.05.0 + """ + return self._theme.name + + @property + def colors(self) -> ThemeColors: + """A |ThemeColors| proxy for the ``a:clrScheme`` of this theme. + + The returned object exposes a no-op view when the theme has no + color scheme (every slot returns |None|) — mirroring how Word + falls back to the default theme for missing references. + + .. versionadded:: 2026.05.0 + """ + return ThemeColors(self._theme.clrScheme) + + @property + def fonts(self) -> ThemeFonts: + """A |ThemeFonts| proxy for the ``a:fontScheme`` of this theme. 
class ThemeColors:
    """Read-only view over the twelve-slot theme color scheme.

    Each slot accessor returns the ``rgb`` value of whatever the wrapped
    ``a:clrScheme`` reports for that slot, or |None| when there is no
    scheme or the scheme has no entry for the slot. Lookup by OOXML token
    is available via ``colors[name]`` (e.g. ``colors["accent1"]`` or
    ``colors["hlink"]``).

    The object also supports ``name in colors``, ``iter(colors)`` and
    ``len(colors)`` over the known slot tokens. Without these, Python falls
    back to calling ``__getitem__`` with ``0, 1, 2, ...`` for membership
    tests and iteration, which raised ``KeyError(0)``.

    .. versionadded:: 2026.05.0
    """

    def __init__(self, clrScheme: CT_ClrScheme | None):
        self._clrScheme = clrScheme

    def __contains__(self, name: object) -> bool:
        """True when `name` is one of the known theme-color slot tokens.

        Membership reflects whether ``colors[name]`` is a valid lookup, not
        whether the slot resolves to a color; a known slot may still map to
        |None|.
        """
        return name in _COLOR_SLOTS

    def __iter__(self):
        """Iterate the known slot tokens in ``_COLOR_SLOTS`` order."""
        return iter(_COLOR_SLOTS)

    def __len__(self) -> int:
        """The number of known slot tokens."""
        return len(_COLOR_SLOTS)

    def __getitem__(self, name: str) -> RGBColor | None:
        """Return the |RGBColor| for theme-color token `name`, or |None|.

        `name` is one of the twelve scheme slots: ``"dk1"``, ``"lt1"``,
        ``"dk2"``, ``"lt2"``, ``"accent1"``..``"accent6"``, ``"hlink"``,
        ``"folHlink"``. Returns |None| when there is no color scheme, when
        the scheme has no entry for the slot, or when that entry's ``rgb``
        is |None|. Raises |KeyError| for unknown tokens so callers can
        distinguish "undefined slot" from "unknown name".
        """
        if name not in _COLOR_SLOTS:
            raise KeyError(name)
        return self._get(name)

    def _get(self, name: str) -> RGBColor | None:
        """Resolve slot `name` without validating the token."""
        if self._clrScheme is None:
            return None
        choice = self._clrScheme.color_for(name)
        if choice is None:
            return None
        return choice.rgb

    @property
    def name(self) -> str | None:
        """The value of ``a:clrScheme/@name``, or |None| when absent.

        .. versionadded:: 2026.05.0
        """
        if self._clrScheme is None:
            return None
        return self._clrScheme.name

    @property
    def dark_1(self) -> RGBColor | None:
        """The ``a:dk1`` color, or |None| when unresolved.

        .. versionadded:: 2026.05.0
        """
        return self._get("dk1")

    @property
    def dark_2(self) -> RGBColor | None:
        """The ``a:dk2`` color, or |None| when unresolved.

        .. versionadded:: 2026.05.0
        """
        return self._get("dk2")

    @property
    def light_1(self) -> RGBColor | None:
        """The ``a:lt1`` color, or |None| when unresolved.

        .. versionadded:: 2026.05.0
        """
        return self._get("lt1")

    @property
    def light_2(self) -> RGBColor | None:
        """The ``a:lt2`` color, or |None| when unresolved.

        .. versionadded:: 2026.05.0
        """
        return self._get("lt2")

    @property
    def accent_1(self) -> RGBColor | None:
        """The ``a:accent1`` color, or |None| when unresolved.

        .. versionadded:: 2026.05.0
        """
        return self._get("accent1")

    @property
    def accent_2(self) -> RGBColor | None:
        """The ``a:accent2`` color, or |None| when unresolved.

        .. versionadded:: 2026.05.0
        """
        return self._get("accent2")

    @property
    def accent_3(self) -> RGBColor | None:
        """The ``a:accent3`` color, or |None| when unresolved.

        .. versionadded:: 2026.05.0
        """
        return self._get("accent3")

    @property
    def accent_4(self) -> RGBColor | None:
        """The ``a:accent4`` color, or |None| when unresolved.

        .. versionadded:: 2026.05.0
        """
        return self._get("accent4")

    @property
    def accent_5(self) -> RGBColor | None:
        """The ``a:accent5`` color, or |None| when unresolved.

        .. versionadded:: 2026.05.0
        """
        return self._get("accent5")

    @property
    def accent_6(self) -> RGBColor | None:
        """The ``a:accent6`` color, or |None| when unresolved.

        .. versionadded:: 2026.05.0
        """
        return self._get("accent6")

    @property
    def hyperlink(self) -> RGBColor | None:
        """The ``a:hlink`` color, or |None| when unresolved.

        .. versionadded:: 2026.05.0
        """
        return self._get("hlink")

    @property
    def followed_hyperlink(self) -> RGBColor | None:
        """The ``a:folHlink`` color, or |None| when unresolved.

        .. versionadded:: 2026.05.0
        """
        return self._get("folHlink")
versionadded:: 2026.05.0 + """ + if self._fontScheme is None: + return None + return self._typeface(self._fontScheme.minorFont, "ea") + + @property + def major_cs(self) -> str | None: + """Typeface at ``a:majorFont/a:cs/@typeface``, or |None|. + + .. versionadded:: 2026.05.0 + """ + if self._fontScheme is None: + return None + return self._typeface(self._fontScheme.majorFont, "cs") + + @property + def minor_cs(self) -> str | None: + """Typeface at ``a:minorFont/a:cs/@typeface``, or |None|. + + .. versionadded:: 2026.05.0 + """ + if self._fontScheme is None: + return None + return self._typeface(self._fontScheme.minorFont, "cs") diff --git a/src/docx/toc.py b/src/docx/toc.py new file mode 100644 index 000000000..1ec8a6c06 --- /dev/null +++ b/src/docx/toc.py @@ -0,0 +1,181 @@ +"""Table-of-contents (TOC) building helpers. + +A Word table of contents is a field — a paragraph that contains a ``TOC`` +instruction which Word evaluates at display time to produce a list of +headings with page numbers. The field uses the "complex" XML shape +(``w:fldChar`` begin / separate / end markers around a ``w:instrText``) and +between the *separate* and *end* markers Word caches a preview of the +rendered TOC. Word rebuilds the result when the document is opened or when +the user asks to update fields, so the cached result is purely a preview +used when the document is viewed by a consumer that does not itself +evaluate fields (e.g. a raw-XML tool or Word in an "update fields?" +prompt-declined state). + +The XML shape produced by this module looks approximately like:: + + + + + TOC \\o "1-3" \\h \\z \\u + + + Heading one\\t1 + + Heading two\\t2 + ... + + + +The per-entry ``\\t`` and "page hint" are cosmetic: python-docx has no +layout engine, so the cached page number is a 1-based heading index rather +than a true page number. Word discards it and recomputes the real page +numbers on open. 
+ +This module exposes two helpers — :func:`build_toc_instruction` (builds +the ``TOC`` instruction string for a level range) and +:func:`populate_toc_paragraph` (populates a freshly-created empty +paragraph with the TOC field). The public API is surfaced via +:meth:`docx.document.Document.add_table_of_contents`, +:meth:`docx.text.paragraph.Paragraph.insert_table_of_contents_before`, +and :meth:`docx.text.paragraph.Paragraph.insert_table_of_contents_after`. +""" + +from __future__ import annotations + +import re +from typing import TYPE_CHECKING, Iterable + +if TYPE_CHECKING: + from docx.text.paragraph import Paragraph + + +# -- heading style names look like "Heading 1" .. "Heading 9" (case-insensitive). +# The regex is shared with `docx.accessibility` but duplicated here to avoid +# importing that module just for a regex. -- +_HEADING_RE = re.compile(r"^heading\s+([1-9])$", re.IGNORECASE) + + +def _paragraph_heading_level(paragraph: Paragraph) -> int | None: + """Return the integer heading level for `paragraph`, or |None| if not a heading. + + A paragraph is considered a heading when its style name matches ``"Heading N"`` + (case-insensitively) for ``N`` in 1..9. + """ + style = paragraph.style + if style is None: + return None + name = style.name + if name is None: + return None + match = _HEADING_RE.match(name.strip()) + if match is None: + return None + return int(match.group(1)) + + +def _validate_levels(levels: tuple[int, int]) -> tuple[int, int]: + """Return `levels` after validating shape and contents. + + `levels` must be a 2-tuple of integers ``(min_level, max_level)`` where + ``1 <= min_level <= max_level <= 9``. ``levels`` is also accepted as a + list of two integers for caller convenience. 
+ """ + try: + min_level, max_level = levels + except (TypeError, ValueError): + raise ValueError( + "levels must be a 2-tuple of ints (min_level, max_level), got %r" + % (levels,) + ) + if not (isinstance(min_level, int) and isinstance(max_level, int)): # pyright: ignore[reportUnnecessaryIsInstance] + raise ValueError( + "levels must be a 2-tuple of ints (min_level, max_level), got %r" + % (levels,) + ) + if not 1 <= min_level <= max_level <= 9: + raise ValueError( + "levels must satisfy 1 <= min_level <= max_level <= 9, got %r" + % (levels,) + ) + return (min_level, max_level) + + +def build_toc_instruction(levels: tuple[int, int] = (1, 3)) -> str: + """Return the ``TOC`` field instruction string for a heading-level range. + + `levels` is a ``(min_level, max_level)`` tuple. The produced instruction + uses Word's conventional switches: + + * ``\\o "min-max"`` — build from outline levels ``min..max`` + * ``\\h`` — render entries as hyperlinks + * ``\\z`` — hide tab-leader and page numbers in web view + * ``\\u`` — use applied paragraph outline levels (not just headings) + + The returned string is wrapped in single spaces, matching the form Word + writes when it inserts a TOC via the Ribbon. + + .. versionadded:: 2026.05.0 + """ + min_level, max_level = _validate_levels(levels) + return f' TOC \\o "{min_level}-{max_level}" \\h \\z \\u ' + + +def _collect_entries( + paragraphs: Iterable[Paragraph], levels: tuple[int, int] +) -> list[tuple[int, str]]: + """Return ``(level, text)`` pairs for each heading in `paragraphs` matching `levels`.""" + min_level, max_level = levels + entries: list[tuple[int, str]] = [] + for paragraph in paragraphs: + level = _paragraph_heading_level(paragraph) + if level is None: + continue + if level < min_level or level > max_level: + continue + entries.append((level, paragraph.text)) + return entries + + +def _render_result_text(entries: list[tuple[int, str]]) -> str: + """Return a newline-joined cached TOC preview built from `entries`. 
def populate_toc_paragraph(
    paragraph: Paragraph,
    source_paragraphs: Iterable[Paragraph],
    levels: tuple[int, int] = (1, 3),
) -> Paragraph:
    """Add a TOC complex field to `paragraph` and return that paragraph.

    `paragraph` is expected to be an empty, freshly-created |Paragraph|.
    `source_paragraphs` is the iterable scanned for headings (typically
    ``document.paragraphs``). `levels` is the ``(min_level, max_level)``
    heading range to include (default H1..H3); it is validated first, so an
    invalid range raises |ValueError| before anything is added.

    Setting the paragraph's style to ``"TOC Heading"`` would be
    conventional, but that style is not guaranteed to exist, so the style
    is left untouched; callers can assign one explicitly. The cached result
    written here is only a preview for consumers that do not evaluate
    fields themselves, since Word rebuilds the TOC on open or field-update.

    .. versionadded:: 2026.05.0
    """
    level_range = _validate_levels(levels)
    headings = _collect_entries(source_paragraphs, level_range)
    instruction = build_toc_instruction(level_range)
    preview = _render_result_text(headings)
    # -- an empty preview is handed over as None rather than "" (intended
    # -- to make add_complex_field omit the cached-result portion) --
    paragraph.add_complex_field(instruction, preview or None)
    return paragraph
class MoveRevision(TrackedChange):
    """Proxy for one half of a move revision: a `w:moveFrom` or `w:moveTo`.

    Adds to the author/date/text surface of :class:`TrackedChange` a
    :attr:`name`, which pairs the source (`w:moveFrom`) with the
    destination (`w:moveTo`), and a :attr:`peer` lookup that finds the
    counterpart element in the same XML tree by matching `@w:name`.

    The paragraph-level range markers `w:moveFromRangeStart/End` and
    `w:moveToRangeStart/End` bracket cross-paragraph moves rather than wrap
    run content, so no proxy type is exposed for them; callers that need
    them can iterate the underlying XML.

    .. versionadded:: 2026.05.0
    """

    @property
    def name(self) -> str | None:
        """Value of this element's `@w:name` attribute, or |None| if absent.

        The name is what ties a move source to its destination. Callers
        must be prepared for |None| because the attribute may be missing.

        .. versionadded:: 2026.05.0
        """
        return self._tc_element.get(qn("w:name"))

    @property
    def peer(self) -> MoveRevision | None:
        """The opposite half of this move, or |None| when it can't be found.

        Climbs to the topmost ancestor of this element, then searches its
        descendants for the opposite kind of element (`w:moveTo` for a
        `w:moveFrom`, otherwise `w:moveFrom`) and returns the first one
        whose `@w:name` equals this element's name. Returns |None| when
        this element has no (or an empty) name or no match exists.

        .. versionadded:: 2026.05.0
        """
        from docx.oxml.tracked_changes import CT_MoveFrom, CT_MoveTo

        name = self.name
        if not name:
            return None

        element = self._tc_element

        # -- climb to the topmost ancestor so the search covers the whole
        # -- tree, whether that is a document or a detached fragment --
        root = element
        parent = root.getparent()
        while parent is not None:
            root = cast("CT_RunTrackChange", parent)
            parent = root.getparent()

        if isinstance(element, CT_MoveFrom):
            peer_xpath, peer_cls = ".//w:moveTo", CT_MoveTo
        else:
            peer_xpath, peer_cls = ".//w:moveFrom", CT_MoveFrom

        name_attr = qn("w:name")
        for candidate in root.xpath(peer_xpath):
            if candidate is element or not isinstance(candidate, peer_cls):
                continue
            if candidate.get(name_attr) == name:
                return MoveRevision(candidate)
        return None
+ + |None| if the change element has no inner properties element (malformed or + "no prior formatting" case). + + .. versionadded:: 2026.05.0 + """ + from docx.oxml.tracked_changes import ( + CT_PPrChange, + CT_RPrChange, + CT_SectPrChange, + CT_TblPrChange, + CT_TcPrChange, + CT_TrPrChange, + ) + + if isinstance(self._fc_element, CT_RPrChange): + return self._fc_element.rPr + if isinstance(self._fc_element, CT_PPrChange): + return self._fc_element.pPr + if isinstance(self._fc_element, CT_SectPrChange): + return self._fc_element.sectPr + if isinstance(self._fc_element, CT_TcPrChange): + return self._fc_element.tcPr + if isinstance(self._fc_element, CT_TrPrChange): + return self._fc_element.trPr + if isinstance(self._fc_element, CT_TblPrChange): + return self._fc_element.tblPr + return None + + +def _render_paragraph_marks( + p_elm: CT_P, + open_ins: str = "[+", + close_ins: str = "+]", + open_del: str = "[-", + close_del: str = "-]", +) -> str: + """Render `p_elm` as text with insertion/deletion revision markers. + + Walks the paragraph's children in document order. Plain runs contribute their + text; `w:ins` and `w:del` wrappers contribute their inner text wrapped with the + corresponding open/close markers. `w:hyperlink` elements are recursed into so + track-change wrappers inside them are rendered in place. Other inner-content + elements (`w:fldSimple`, `w:sdt`) contribute their plain text. + + When the paragraph has no tracked changes the returned string matches + `paragraph.text`. 
+ """ + parts: list[str] = [] + _append_container_text( + p_elm, parts, open_ins, close_ins, open_del, close_del + ) + return "".join(parts) + + +def _append_container_text( + container: BaseOxmlElement, + parts: list[str], + open_ins: str, + close_ins: str, + open_del: str, + close_del: str, +) -> None: + """Walk direct children of `container` and append rendered text into `parts`.""" + ins_tag = qn("w:ins") + del_tag = qn("w:del") + r_tag = qn("w:r") + hyperlink_tag = qn("w:hyperlink") + fldSimple_tag = qn("w:fldSimple") + sdt_tag = qn("w:sdt") + + for child in container: + tag = child.tag + if tag == r_tag: + parts.append(child.text or "") # CT_R.text + elif tag == ins_tag: + parts.append(open_ins) + _append_container_text( + cast("BaseOxmlElement", child), + parts, open_ins, close_ins, open_del, close_del, + ) + parts.append(close_ins) + elif tag == del_tag: + parts.append(open_del) + # -- `w:del` contains `w:r` children whose text sits in `w:delText` -- + for delText in child.xpath(".//w:delText"): + parts.append(delText.text or "") + parts.append(close_del) + elif tag == hyperlink_tag: + _append_container_text( + cast("BaseOxmlElement", child), + parts, open_ins, close_ins, open_del, close_del, + ) + elif tag == fldSimple_tag or tag == sdt_tag: + # -- defer to the element's own `.text` for fields and SDTs -- + parts.append(child.text or "") + + +def _resolve_all_changes(root: BaseOxmlElement, *, accept: bool) -> int: + """Accept or reject every tracked change beneath `root`. + + Processes run-level track changes (`w:ins`, `w:del`, `w:moveFrom`, + `w:moveTo`), formatting track changes (`w:rPrChange`, `w:pPrChange`, + `w:sectPrChange`, `w:tcPrChange`, `w:trPrChange`, `w:tblPrChange`), and + cell-level revisions (`w:cellIns`, `w:cellDel`). Returns the count of + change elements resolved. + + Nested changes (e.g. a `w:ins` inside a `w:del`) are handled by processing + innermost elements first so outer wrappers see stable children. 
+ """ + from docx.oxml.tracked_changes import ( + CT_Del, + CT_Ins, + accept_formatting_change, + reject_formatting_change, + ) + + run_changes: list[BaseOxmlElement] = root.xpath( + ".//w:ins | .//w:del | .//w:moveFrom | .//w:moveTo" + ) + run_changes.sort(key=lambda e: len(list(e.iterancestors())), reverse=True) + count = 0 + for elm in run_changes: + if elm.getparent() is None: + continue + # -- CT_MoveFrom is a CT_Del and CT_MoveTo is a CT_Ins, so this check + # -- covers all four element types without listing the move classes -- + if isinstance(elm, (CT_Ins, CT_Del)): + elm.accept() if accept else elm.reject() + count += 1 + + # -- cell-level revisions. Resolve before formatting changes so a + # -- `w:tcPrChange` inside a cell being deleted is only processed once (the + # -- enclosing `w:tc` is removed here if needed). -- + cell_changes: list[BaseOxmlElement] = root.xpath(".//w:cellIns | .//w:cellDel") + for elm in cell_changes: + if elm.getparent() is None: + continue + count += _resolve_cell_change(elm, accept=accept) + + fmt_changes: list[BaseOxmlElement] = root.xpath( + ".//w:rPrChange | .//w:pPrChange | .//w:sectPrChange" + " | .//w:tcPrChange | .//w:trPrChange | .//w:tblPrChange" + ) + for elm in fmt_changes: + if elm.getparent() is None: + continue + if accept: + accept_formatting_change(elm) + else: + reject_formatting_change(elm) + count += 1 + + return count + + +def _resolve_cell_change(elm: BaseOxmlElement, *, accept: bool) -> int: + """Accept or reject a `w:cellIns` or `w:cellDel` revision marker. + + - Accept `w:cellIns` -> the insertion is accepted; the marker is removed but + the cell is kept. + - Reject `w:cellIns` -> the insertion is rejected; the whole enclosing cell + is removed. + - Accept `w:cellDel` -> the deletion is accepted; the whole enclosing cell + is removed. + - Reject `w:cellDel` -> the deletion is rejected; the marker is removed but + the cell is kept. 
+ + Returns 1 if the change was processed, 0 if the marker was orphaned or its + surrounding structure was unexpected. + """ + from docx.oxml.tracked_changes import CT_CellDel, CT_CellIns + + tcPr = elm.getparent() + if tcPr is None: + return 0 + tc = tcPr.getparent() # -- the enclosing `w:tc` + if tc is None: + # -- detached `w:tcPr`; just remove the marker -- + tcPr.remove(elm) + return 1 + + is_insertion = isinstance(elm, CT_CellIns) + is_deletion = isinstance(elm, CT_CellDel) + if not (is_insertion or is_deletion): + # -- unexpected element class; remove marker defensively -- + tcPr.remove(elm) + return 1 + + remove_cell = (is_deletion and accept) or (is_insertion and not accept) + if remove_cell: + row = tc.getparent() + if row is not None: + row.remove(tc) + return 1 + + # -- keep the cell, just remove the marker -- + tcPr.remove(elm) + return 1 + + +# -- Track-changes writer helpers ------------------------------------------- +# +# These helpers are used by `BlockItemContainer.add_paragraph` and +# `Paragraph.add_run` when the document-level `Document.tracked_changes(...)` +# context manager is active (or when the `track_author=` keyword argument is +# passed to either of those methods). + + +def _next_revision_id(root: BaseOxmlElement) -> int: + """Return the next unused integer revision id within `root`. + + Scans every `w:ins`, `w:del`, `w:moveFrom`, `w:moveTo`, `w:rPrChange`, + `w:pPrChange`, `w:sectPrChange`, `w:tcPrChange`, `w:trPrChange`, + `w:tblPrChange`, `w:cellIns`, and `w:cellDel` descendant for a `w:id` + attribute and returns ``max(existing) + 1``. Returns ``1`` when no + revision element is present. 
+ """ + ids: list[int] = [] + for el in root.xpath( + ".//w:ins | .//w:del | .//w:moveFrom | .//w:moveTo" + " | .//w:rPrChange | .//w:pPrChange | .//w:sectPrChange" + " | .//w:tcPrChange | .//w:trPrChange | .//w:tblPrChange" + " | .//w:cellIns | .//w:cellDel" + ): + raw = el.get(qn("w:id")) + if raw is None: + continue + try: + ids.append(int(raw)) + except ValueError: + continue + return (max(ids) + 1) if ids else 1 + + +def wrap_run_in_ins( + r: CT_R, + author: str, + date: dt.datetime | None = None, + change_id: int | None = None, +) -> CT_RunTrackChange: + """Replace `r` in its parent with a `w:ins` wrapper and return the wrapper. + + The newly-created `w:ins` element is positioned where `r` sat among its + siblings and `r` becomes its sole child. If `change_id` is omitted an id + is allocated by scanning `r`'s document root for the next unused id. If + `date` is omitted the current UTC time is used (normalised to whole + seconds for deterministic XML). + + Returns the new `w:ins` element. + + .. versionadded:: 2026.05.0 + """ + parent = r.getparent() + if parent is None: + raise ValueError("cannot wrap a detached run in a w:ins element") + if change_id is None: + # -- walk up to document root to scope id allocation -- + root = r + while root.getparent() is not None: + root = cast("CT_R", root.getparent()) + change_id = _next_revision_id(cast("BaseOxmlElement", root)) + if date is None: + date = dt.datetime.now(dt.timezone.utc).replace(microsecond=0) + + ins = cast( + "CT_RunTrackChange", + OxmlElement( + "w:ins", + attrs={ + qn("w:id"): str(change_id), + qn("w:author"): author, + qn("w:date"): date.strftime("%Y-%m-%dT%H:%M:%SZ"), + }, + ), + ) + index = parent.index(r) + parent.remove(r) + ins.append(r) + parent.insert(index, ins) + return ins + + +class _TrackedChangesCtx: + """Context-manager for tracked-change writes on a :class:`Document`. + + Created by :meth:`Document.tracked_changes`. 
While active, every call to + :meth:`Document.add_paragraph`, :meth:`BlockItemContainer.add_paragraph`, + or :meth:`Paragraph.add_run` wraps the freshly-inserted `w:r` in a + `w:ins` element whose `w:author` and `w:date` come from this context. + + Contexts can be nested; the innermost active context supplies the + author/date. Passing an explicit ``track_author=`` keyword argument to + `add_paragraph` / `add_run` overrides the context and works even when no + context is active. + + .. versionadded:: 2026.05.0 + """ + + def __init__( + self, document, author: str, date: dt.datetime | None = None + ): + self._document = document + self._author = author + self._date = date + + @property + def author(self) -> str: + """Author string applied to each tracked insertion.""" + return self._author + + @property + def date(self) -> dt.datetime | None: + """Timestamp applied to each tracked insertion, or |None| to use now().""" + return self._date + + def __enter__(self): + self._document._tracked_changes_stack.append(self) + return self + + def __exit__(self, exc_type, exc_val, exc_tb): + stack = self._document._tracked_changes_stack + # -- pop the top frame; be defensive if the caller has nested wrongly -- + if stack and stack[-1] is self: + stack.pop() + elif self in stack: + stack.remove(self) + + +def _active_track_author(part) -> tuple[str, dt.datetime | None] | None: + """Return ``(author, date)`` from the active tracked-changes context. + + Looks up the innermost `_TrackedChangesCtx` registered on the Document + proxy that owns `part`. Returns |None| when no context is active or the + part does not belong to a Document proxy (e.g. header/footer story + parts, which don't carry the stack). + """ + doc_proxy = getattr(part, "_track_changes_doc_proxy", None) + if doc_proxy is None: + return None + stack = getattr(doc_proxy, "_tracked_changes_stack", None) + # -- Require a real list; `Mock.getattr` returns another Mock which is + # -- truthy but not a valid stack. 
Reject anything that isn't a list. -- + if not isinstance(stack, list) or not stack: + return None + top = stack[-1] + if not isinstance(top, _TrackedChangesCtx): + return None + return top.author, top.date diff --git a/src/docx/watermark.py b/src/docx/watermark.py new file mode 100644 index 000000000..dcb75ec92 --- /dev/null +++ b/src/docx/watermark.py @@ -0,0 +1,48 @@ +"""The |Watermark| proxy class. + +Provides a small read-side API for a VML watermark stored in a section's +default page header. +""" + +from __future__ import annotations + +from typing import TYPE_CHECKING + +from docx.oxml.ns import qn + +if TYPE_CHECKING: + from docx.oxml.watermark import CT_VmlShape + + +class Watermark: + """Proxy for a VML watermark shape (``v:shape``) residing in a header. + + .. versionadded:: 2026.05.0 + """ + + def __init__(self, shape: "CT_VmlShape"): + self._shape = shape + + @property + def type(self) -> str: + """``"text"`` or ``"image"``. + + Returns ``"image"`` when a ``v:imagedata`` child is present, otherwise + ``"text"``. + + .. versionadded:: 2026.05.0 + """ + if self._shape.find(qn("v:imagedata")) is not None: + return "image" + return "text" + + @property + def text(self) -> str | None: + """Text string of a text watermark, or ``None`` for an image watermark. + + .. versionadded:: 2026.05.0 + """ + textpath = self._shape.find(qn("v:textpath")) + if textpath is None: + return None + return textpath.get("string") diff --git a/src/docx/web_settings.py b/src/docx/web_settings.py new file mode 100644 index 000000000..3ed76f6bb --- /dev/null +++ b/src/docx/web_settings.py @@ -0,0 +1,96 @@ +"""|WebSettings| proxy for ``word/webSettings.xml``. + +Provides read-only-ish access to a small subset of the OOXML web-settings +part: encoding, "optimize for browser", "allow PNG", and +"do not save as single file". 
The remaining schema children +(``w:frameset``, ``w:divs``, and several rarely-used flags) are not +exposed because they are unlikely to be useful from Python code and +add significant surface area. + +Access via :attr:`docx.document.Document.web_settings`, which returns a +:class:`WebSettings` instance when the document has a ``webSettings`` +relationship, or |None| otherwise. +""" + +from __future__ import annotations + +from typing import TYPE_CHECKING, cast + +from docx.shared import ElementProxy + +if TYPE_CHECKING: + import docx.types as t + from docx.oxml.web_settings import CT_WebSettings + from docx.oxml.xmlchemy import BaseOxmlElement + + +class WebSettings(ElementProxy): + """Proxy for the ``w:webSettings`` root element of the web-settings part. + + Exposes a small, read-oriented slice of the OOXML web-settings + schema. Boolean flag properties accept a setter that toggles the + corresponding ``w:val`` child. + + .. versionadded:: 2026.05.0 + """ + + def __init__( + self, + element: "BaseOxmlElement", + parent: "t.ProvidesXmlPart | None" = None, + ): + super().__init__(element, parent) + self._web_settings = cast("CT_WebSettings", element) + + @property + def encoding(self) -> str | None: + """Value of ``w:encoding/@w:val`` or |None| if the element is absent. + + Read-only. Records the text encoding Word should use when the + document is saved as a web page. + + .. versionadded:: 2026.05.0 + """ + return self._web_settings.encoding_val + + @property + def optimize_for_browser(self) -> bool: + """True when ``w:optimizeForBrowser`` is present and not disabled. + + Read/write. Assigning ``False`` (or |None|) removes the element. + + .. versionadded:: 2026.05.0 + """ + return self._web_settings.optimizeForBrowser_val + + @optimize_for_browser.setter + def optimize_for_browser(self, value: bool | None): + self._web_settings.optimizeForBrowser_val = value + + @property + def allow_png(self) -> bool: + """True when ``w:allowPNG`` is present and not disabled. 
+ + Read/write. Assigning ``False`` (or |None|) removes the element. + + .. versionadded:: 2026.05.0 + """ + return self._web_settings.allowPNG_val + + @allow_png.setter + def allow_png(self, value: bool | None): + self._web_settings.allowPNG_val = value + + @property + def do_not_save_as_single_file(self) -> bool: + """True when ``w:doNotSaveAsSingleFile`` is present and not disabled. + + Read/write. Assigning ``False`` (or |None|) removes the element. + + .. versionadded:: 2026.05.0 + """ + return self._web_settings.doNotSaveAsSingleFile_val + + @do_not_save_as_single_file.setter + def do_not_save_as_single_file(self, value: bool | None): + self._web_settings.doNotSaveAsSingleFile_val = value diff --git a/tests/conftest.py b/tests/conftest.py index 2abfcc969..82afc864f 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -2,10 +2,15 @@ from __future__ import annotations +import os +import tempfile from typing import TYPE_CHECKING import pytest +from docx import Document +from docx.document import Document as DocumentCls + if TYPE_CHECKING: from docx import types as t from docx.parts.story import StoryPart @@ -19,3 +24,19 @@ def part(self) -> StoryPart: raise NotImplementedError return ProvidesStoryPart() + + +@pytest.fixture +def tmp_docx_path(): + """Yield a temporary file path for .docx output; cleaned up after test.""" + fd, path = tempfile.mkstemp(suffix=".docx") + os.close(fd) + yield path + if os.path.exists(path): + os.unlink(path) + + +@pytest.fixture +def blank_document() -> DocumentCls: + """Return a new blank Document for use in tests.""" + return Document() diff --git a/tests/dml/test_color.py b/tests/dml/test_color.py index f9fcae0c6..95dcbe68f 100644 --- a/tests/dml/test_color.py +++ b/tests/dml/test_color.py @@ -130,3 +130,103 @@ def it_can_change_its_theme_color( color_format = ColorFormat(cast(CT_R, element(r_cxml))) color_format.theme_color = new_value assert color_format._element.xml == xml(expected_cxml) + + +class 
DescribeColorFormat_Brightness: + """Phase A-v2 #7: ColorFormat.brightness implementation. + + See upstream #665 — the documented property used to raise AttributeError. + """ + + def it_returns_zero_when_no_color_is_set(self): + cf = ColorFormat(cast(CT_R, element("w:r"))) + assert cf.brightness == 0.0 + + def it_returns_zero_when_no_tint_or_shade(self): + cf = ColorFormat( + cast(CT_R, element("w:r/w:rPr/w:color{w:themeColor=accent1}")) + ) + assert cf.brightness == 0.0 + + def it_reads_a_positive_brightness_from_themeTint(self): + # -- themeTint=7F (≈127/255) → brightness ≈ 1 - 127/255 ≈ 0.502 -- + cf = ColorFormat( + cast( + CT_R, + element( + "w:r/w:rPr/w:color{w:themeColor=accent1,w:themeTint=7F}" + ), + ) + ) + assert 0.49 < cf.brightness < 0.51 + + def it_reads_a_negative_brightness_from_themeShade(self): + # -- themeShade=BF (≈191/255) → brightness ≈ 191/255 - 1 ≈ -0.251 -- + cf = ColorFormat( + cast( + CT_R, + element( + "w:r/w:rPr/w:color{w:themeColor=accent1,w:themeShade=BF}" + ), + ) + ) + assert -0.26 < cf.brightness < -0.24 + + def it_writes_themeTint_for_positive_brightness(self): + from docx.oxml.ns import qn + + r = cast( + CT_R, element("w:r/w:rPr/w:color{w:val=000000,w:themeColor=accent1}") + ) + cf = ColorFormat(r) + cf.brightness = 0.5 + color_elm = r.find(qn("w:rPr")).find(qn("w:color")) + tint = color_elm.get(qn("w:themeTint")) + assert tint is not None + assert 0x7E <= int(tint, 16) <= 0x80 + assert color_elm.get(qn("w:themeShade")) is None + + def it_writes_themeShade_for_negative_brightness(self): + from docx.oxml.ns import qn + + r = cast( + CT_R, element("w:r/w:rPr/w:color{w:val=000000,w:themeColor=accent1}") + ) + cf = ColorFormat(r) + cf.brightness = -0.25 + color_elm = r.find(qn("w:rPr")).find(qn("w:color")) + shade = color_elm.get(qn("w:themeShade")) + assert shade is not None + assert 0xBE <= int(shade, 16) <= 0xC0 + assert color_elm.get(qn("w:themeTint")) is None + + def it_clears_tint_and_shade_when_brightness_zero(self): + from 
docx.oxml.ns import qn + + r = cast( + CT_R, + element( + "w:r/w:rPr/w:color{w:val=000000,w:themeColor=accent1," + "w:themeTint=7F}" + ), + ) + cf = ColorFormat(r) + cf.brightness = 0.0 + color_elm = r.find(qn("w:rPr")).find(qn("w:color")) + assert color_elm.get(qn("w:themeTint")) is None + assert color_elm.get(qn("w:themeShade")) is None + + def it_rejects_out_of_range_brightness(self): + cf = ColorFormat( + cast( + CT_R, + element("w:r/w:rPr/w:color{w:val=000000,w:themeColor=accent1}"), + ) + ) + with pytest.raises(ValueError, match="-1.0 .. \\+1.0"): + cf.brightness = 1.5 + + def it_rejects_brightness_assignment_without_theme_color(self): + cf = ColorFormat(cast(CT_R, element("w:r/w:rPr/w:color{w:val=000000}"))) + with pytest.raises(ValueError, match="theme color"): + cf.brightness = 0.5 diff --git a/tests/helpers/__init__.py b/tests/helpers/__init__.py new file mode 100644 index 000000000..228640d95 --- /dev/null +++ b/tests/helpers/__init__.py @@ -0,0 +1,16 @@ +"""Test helpers for validating python-docx output across multiple layers. + +Provides utilities for XML structure validation, OOXML schema validation, +round-trip testing, and reference file comparison. +""" + +from tests.helpers.roundtrip import assert_round_trip +from tests.helpers.validate import extract_xml_part, validate_ooxml_structure +from tests.helpers.xmlparse import parse_docx_xml + +__all__ = [ + "assert_round_trip", + "extract_xml_part", + "parse_docx_xml", + "validate_ooxml_structure", +] diff --git a/tests/helpers/libreoffice.py b/tests/helpers/libreoffice.py new file mode 100644 index 000000000..f3dcff54d --- /dev/null +++ b/tests/helpers/libreoffice.py @@ -0,0 +1,101 @@ +"""LibreOffice headless validation for .docx files. + +Converts .docx files to PDF using LibreOffice in headless mode. If the conversion +fails, it indicates the file is malformed or contains unsupported content. + +This validation layer is optional and requires LibreOffice to be installed. 
Tests +using this helper should be marked with `@pytest.mark.libreoffice`. +""" + +from __future__ import annotations + +import os +import shutil +import subprocess +import tempfile + + +class LibreOfficeNotAvailable(RuntimeError): + """Raised when LibreOffice is not installed or not on PATH.""" + + +class LibreOfficeConversionError(RuntimeError): + """Raised when LibreOffice fails to convert a .docx file.""" + + +def is_libreoffice_available() -> bool: + """Return True if LibreOffice is available on the system PATH.""" + return shutil.which("libreoffice") is not None + + +def validate_with_libreoffice( + docx_path: str, timeout: int = 60, outdir: str | None = None +) -> tuple[str, str]: + """Validate a .docx file by converting it to PDF with LibreOffice headless. + + Returns a (pdf_path, outdir) tuple on success. The caller is responsible for + cleaning up `outdir` (e.g. via `shutil.rmtree(outdir)`). + + Raises LibreOfficeConversionError if the conversion fails. + Raises LibreOfficeNotAvailable if LibreOffice is not installed. + + Args: + docx_path: Path to the .docx file to validate. + timeout: Maximum seconds to wait for conversion (default 60). + outdir: Optional output directory. A temporary directory is created if None. + """ + if not is_libreoffice_available(): + raise LibreOfficeNotAvailable( + "LibreOffice is not installed. 
Install with: " + "sudo apt-get install libreoffice-writer" + ) + + created_outdir = outdir is None + if outdir is None: + outdir = tempfile.mkdtemp(prefix="docx_lo_validate_") + + try: + result = subprocess.run( + [ + "libreoffice", + "--headless", + "--convert-to", + "pdf", + "--outdir", + outdir, + docx_path, + ], + capture_output=True, + text=True, + timeout=timeout, + ) + except subprocess.TimeoutExpired: + if created_outdir: + shutil.rmtree(outdir, ignore_errors=True) + raise LibreOfficeConversionError( + f"LibreOffice conversion timed out after {timeout}s for {docx_path}" + ) + + if result.returncode != 0: + if created_outdir: + shutil.rmtree(outdir, ignore_errors=True) + raise LibreOfficeConversionError( + f"LibreOffice conversion failed (exit code {result.returncode}):\n" + f"stdout: {result.stdout}\n" + f"stderr: {result.stderr}" + ) + + # Find the generated PDF + basename = os.path.splitext(os.path.basename(docx_path))[0] + pdf_path = os.path.join(outdir, f"{basename}.pdf") + + if not os.path.exists(pdf_path): + if created_outdir: + shutil.rmtree(outdir, ignore_errors=True) + raise LibreOfficeConversionError( + f"LibreOffice conversion produced no output PDF for {docx_path}.\n" + f"stdout: {result.stdout}\n" + f"stderr: {result.stderr}" + ) + + return pdf_path, outdir diff --git a/tests/helpers/refcmp.py b/tests/helpers/refcmp.py new file mode 100644 index 000000000..158ea8a80 --- /dev/null +++ b/tests/helpers/refcmp.py @@ -0,0 +1,112 @@ +"""Reference file comparison helpers. + +Provides utilities for comparing python-docx output against reference .docx files +created in Microsoft Word. This ensures python-docx can correctly read files produced +by Word and that its output is structurally compatible. 
+""" + +from __future__ import annotations + +import os + +from lxml import etree + +from tests.helpers.xmlparse import parse_docx_xml + +_REF_DOCS_DIR = os.path.join(os.path.dirname(__file__), "..", "ref-docs") + + +def ref_docx_path(name: str) -> str: + """Return the absolute path to a reference .docx file by name (without extension).""" + return os.path.join(_REF_DOCS_DIR, f"{name}.docx") + + +def ref_docx_exists(name: str) -> bool: + """Return True if a reference .docx file with the given name exists.""" + return os.path.exists(ref_docx_path(name)) + + +def compare_xml_structure( + actual_path: str, + reference_path: str, + part_name: str, + ignore_attrs: set[str] | None = None, +) -> list[str]: + """Compare the XML structure of a part between two .docx files. + + Returns a list of differences. An empty list means the structures match. + Element tags, attributes, element text, and child count and order are all + compared; leading and trailing whitespace in element text is ignored. + + `ignore_attrs` is a set of attribute names (in Clark notation) to exclude + from comparison. This is useful for attributes like `w:id` that may differ + between files but are not structurally significant. 
+ """ + actual_elem = parse_docx_xml(actual_path, part_name) + ref_elem = parse_docx_xml(reference_path, part_name) + + if actual_elem is None and ref_elem is None: + return [] + if actual_elem is None: + return [f"Part '{part_name}' missing in actual file"] + if ref_elem is None: + return [f"Part '{part_name}' missing in reference file"] + + ignore = ignore_attrs or set() + differences: list[str] = [] + _compare_elements(actual_elem, ref_elem, "", ignore, differences) + return differences + + +def _compare_elements( + actual: etree._Element, + reference: etree._Element, + path: str, + ignore_attrs: set[str], + differences: list[str], +) -> None: + """Recursively compare two XML elements for structural equivalence.""" + current_path = f"{path}/{_local_tag(actual)}" + + # -- Compare tags -- + if actual.tag != reference.tag: + differences.append(f"{current_path}: tag mismatch: '{actual.tag}' vs '{reference.tag}'") + return + + # -- Compare attributes (excluding ignored ones) -- + actual_attrs = {k: v for k, v in actual.attrib.items() if k not in ignore_attrs} + ref_attrs = {k: v for k, v in reference.attrib.items() if k not in ignore_attrs} + if actual_attrs != ref_attrs: + differences.append( + f"{current_path}: attribute mismatch: {actual_attrs} vs {ref_attrs}" + ) + + # -- Compare text content (stripped) -- + actual_text = (actual.text or "").strip() + ref_text = (reference.text or "").strip() + if actual_text != ref_text: + differences.append( + f"{current_path}: text mismatch: '{actual_text}' vs '{ref_text}'" + ) + + # -- Compare children -- + actual_children = list(actual) + ref_children = list(reference) + + if len(actual_children) != len(ref_children): + differences.append( + f"{current_path}: child count mismatch: " + f"{len(actual_children)} vs {len(ref_children)}" + ) + return + + for a_child, r_child in zip(actual_children, ref_children): + _compare_elements(a_child, r_child, current_path, ignore_attrs, differences) + + +def _local_tag(elem: 
etree._Element) -> str: + """Return just the local part of an element's tag (strips namespace).""" + tag = elem.tag + if isinstance(tag, str) and tag.startswith("{"): + return tag.split("}", 1)[1] + return str(tag) diff --git a/tests/helpers/roundtrip.py b/tests/helpers/roundtrip.py new file mode 100644 index 000000000..555cab23d --- /dev/null +++ b/tests/helpers/roundtrip.py @@ -0,0 +1,63 @@ +"""Round-trip testing helpers for python-docx. + +Provides utilities for the write-save-reopen-assert pattern used to verify that +python-docx can correctly round-trip document content. +""" + +from __future__ import annotations + +import os +import tempfile +from typing import Callable, TypeVar + +from docx import Document +from docx.document import Document as DocumentCls + +T = TypeVar("T") + + +def assert_round_trip( + create_fn: Callable[[DocumentCls], T], + assert_fn: Callable[[DocumentCls, T], None], +) -> None: + """Create a document, save it, re-open it, and run assertions. + + `create_fn` receives a blank Document and should populate it with the content + under test. It may return any value that will be passed to `assert_fn` as + context (e.g. expected values). + + `assert_fn` receives the re-opened Document and the context value returned by + `create_fn`, and should assert that the content survived the round trip. + + The temporary file is automatically cleaned up. + """ + fd, path = tempfile.mkstemp(suffix=".docx") + os.close(fd) + + try: + # -- create and save -- + doc = Document() + context = create_fn(doc) + doc.save(path) + + # -- re-open and assert -- + doc2 = Document(path) + assert_fn(doc2, context) + finally: + os.unlink(path) + + +def save_and_reopen(doc: DocumentCls) -> DocumentCls: + """Save a document to a temp file and re-open it, returning the new Document. + + This is a simpler alternative to `assert_round_trip` when you need more control + over the test flow. The temporary file is cleaned up automatically. 
+ """ + fd, path = tempfile.mkstemp(suffix=".docx") + os.close(fd) + + try: + doc.save(path) + return Document(path) + finally: + os.unlink(path) diff --git a/tests/helpers/schema.py b/tests/helpers/schema.py new file mode 100644 index 000000000..1c7093464 --- /dev/null +++ b/tests/helpers/schema.py @@ -0,0 +1,114 @@ +"""OOXML schema validation using lxml.etree.XMLSchema. + +Validates individual XML parts against XSD schemas derived from ECMA-376. +The schemas are simplified subsets focusing on the elements python-docx produces. + +For full schema validation, the complete ECMA-376 XSD files can be downloaded from: +https://www.ecma-international.org/publications-and-standards/standards/ecma-376/ + +This module provides a practical alternative that validates the most important +structural constraints without requiring the full (very large) schema set. +""" + +from __future__ import annotations + +import os +import zipfile +from typing import Optional + +from lxml import etree + +_SCHEMAS_DIR = os.path.join(os.path.dirname(__file__), "schemas") + +# -- OOXML namespace URIs -- +WML_NS = "http://schemas.openxmlformats.org/wordprocessingml/2006/main" +REL_NS = "http://schemas.openxmlformats.org/officeDocument/2006/relationships" +PKG_REL_NS = "http://schemas.openxmlformats.org/package/2006/relationships" +CT_NS = "http://schemas.openxmlformats.org/package/2006/content-types" + +# -- Namespace map for xpath queries -- +OOXML_NSMAP = { + "w": WML_NS, + "r": REL_NS, + "pr": PKG_REL_NS, + "ct": CT_NS, +} + + +class SchemaValidationResult: + """Result of validating an XML part against a schema.""" + + def __init__(self, is_valid: bool, errors: list[str]): + self.is_valid = is_valid + self.errors = errors + + def __bool__(self) -> bool: + return self.is_valid + + def __repr__(self) -> str: + if self.is_valid: + return "SchemaValidationResult(valid)" + return f"SchemaValidationResult(invalid, {len(self.errors)} errors)" + + +def validate_part_xml( + xml_bytes: bytes, + schema: 
def validate_docx_xml_parts(docx_path: str) -> dict[str, SchemaValidationResult]:
    """Check every XML part of a .docx archive for well-formedness.

    Only members whose name ends in ".xml" or ".rels" are examined. Each one is
    parsed with lxml; no schema is applied here (use `validate_part_xml` with a
    loaded schema for that).

    Returns a dict keyed by archive member name. The value is a valid result when
    the member parses, or an invalid result holding one "XML syntax error" message.
    """
    outcome: dict[str, SchemaValidationResult] = {}

    with zipfile.ZipFile(docx_path) as archive:
        for member in archive.namelist():
            if not member.endswith((".xml", ".rels")):
                continue
            payload = archive.read(member)
            try:
                etree.fromstring(payload)
            except etree.XMLSyntaxError as exc:
                outcome[member] = SchemaValidationResult(False, [f"XML syntax error: {exc}"])
            else:
                outcome[member] = SchemaValidationResult(True, [])

    return outcome
def validate_ooxml_structure(docx_path: str) -> list[str]:
    """Validate the structural integrity of a .docx file.

    Returns a list of validation error messages. An empty list means the file is
    structurally valid. Checks include:

    - The file is a valid ZIP archive.
    - `[Content_Types].xml` exists and is well-formed XML.
    - Every Override in `[Content_Types].xml` references a part that exists.
    - `_rels/.rels` exists and is well-formed XML.
    - `word/document.xml` exists and has a `w:document` root element.
    - All relationship targets in `word/_rels/document.xml.rels` exist in the archive.
    - Every other `.xml` / `.rels` member of the archive is well-formed XML.

    Messages are appended in a fixed order: the named parts above first, then the
    remaining members in sorted-name order, so the result is reproducible from run
    to run.
    """
    errors: list[str] = []

    # -- Check that it's a valid zip -------------------------------------------------
    if not zipfile.is_zipfile(docx_path):
        return [f"{docx_path} is not a valid ZIP file"]

    with zipfile.ZipFile(docx_path) as zf:
        names = set(zf.namelist())

        # -- [Content_Types].xml -----------------------------------------------------
        if "[Content_Types].xml" not in names:
            errors.append("Missing [Content_Types].xml")
        else:
            ct_elem = _parse_zip_xml(zf, "[Content_Types].xml", errors)
            if ct_elem is not None:
                _check_content_types_overrides(ct_elem, names, errors)

        # -- _rels/.rels -------------------------------------------------------------
        if "_rels/.rels" not in names:
            errors.append("Missing _rels/.rels")
        else:
            _parse_zip_xml(zf, "_rels/.rels", errors)

        # -- word/document.xml -------------------------------------------------------
        if "word/document.xml" not in names:
            errors.append("Missing word/document.xml")
        else:
            doc_elem = _parse_zip_xml(zf, "word/document.xml", errors)
            if doc_elem is not None:
                _check_root_tag(doc_elem, f"{{{_WML_NS}}}document", "word/document.xml", errors)

        # -- word/_rels/document.xml.rels --------------------------------------------
        doc_rels_path = "word/_rels/document.xml.rels"
        if doc_rels_path in names:
            rels_elem = _parse_zip_xml(zf, doc_rels_path, errors)
            if rels_elem is not None:
                _check_relationship_targets(rels_elem, names, errors)

        # -- Validate all remaining XML parts are well-formed -------------------------
        already_parsed = {"[Content_Types].xml", "_rels/.rels", "word/document.xml", doc_rels_path}
        # -- `names` is a set, whose iteration order for strings varies with hash
        # -- randomization; sort so the error list comes out in a stable order --
        for name in sorted(names - already_parsed):
            if name.endswith((".xml", ".rels")):
                _parse_zip_xml(zf, name, errors)

    return errors
def _check_relationship_targets(
    rels_elem: etree._Element,
    archive_names: set[str],
    errors: list[str],
    source_dir: str = "word",
) -> None:
    """Verify that each internal relationship target exists in the archive.

    Args:
        rels_elem: Root element of a parsed `.rels` part.
        archive_names: Member names present in the zip archive.
        errors: List that a message is appended to for every missing target.
        source_dir: Archive directory of the part that owns `rels_elem`; relative
            targets are resolved against it. Defaults to "word", which is the
            directory of `word/document.xml`.

    Relationships with `TargetMode="External"` are skipped, as are fragment-only
    targets such as "#bookmark1", which refer to a location inside the source part
    rather than to another archive member.
    """
    # -- zip member names always use "/" separators, whatever the host OS, so
    # -- normalize with posixpath rather than os.path --
    import posixpath

    for rel in rels_elem.findall(f"{{{_RELS_NS}}}Relationship"):
        if rel.get("TargetMode", "Internal") == "External":
            continue
        target = rel.get("Target", "")
        if target.startswith("#"):
            # -- internal-bookmark hyperlink; there is no part to look up --
            continue
        # -- a trailing "#fragment" is not part of the member name --
        part_ref = target.partition("#")[0]
        if part_ref.startswith("/"):
            # -- absolute targets are resolved from the package root --
            zip_path = part_ref.lstrip("/")
        else:
            # -- relative targets are resolved from the source part's directory --
            zip_path = posixpath.join(source_dir, part_ref)
        # -- collapse parent-directory references (e.g. "word/../customXml/item1.xml") --
        zip_path = posixpath.normpath(zip_path)
        if zip_path not in archive_names:
            errors.append(
                f"Relationship target '{target}' not found in archive (expected '{zip_path}')"
            )
def list_docx_parts(docx_path: str) -> list[str]:
    """Return the member names of a .docx archive, in the order zipfile reports them."""
    with zipfile.ZipFile(docx_path) as archive:
        part_names = [member.filename for member in archive.infolist()]
    return part_names
b"" + ) + stream = io.BytesIO(svg_bytes) + svg = Svg.from_stream(stream) + assert svg.px_width == 200 + assert svg.px_height == 100 + assert svg.content_type == MIME_TYPE.SVG + assert svg.default_ext == "svg" + + def it_parses_dimensions_from_width_and_height_attrs(self): + svg_bytes = ( + b'' + b"" + ) + stream = io.BytesIO(svg_bytes) + svg = Svg.from_stream(stream) + assert svg.px_width == 300 + assert svg.px_height == 200 + + def it_parses_dimensions_from_viewBox_when_no_width_height(self): + svg_bytes = ( + b'' + b"" + ) + stream = io.BytesIO(svg_bytes) + svg = Svg.from_stream(stream) + assert svg.px_width == 400 + assert svg.px_height == 300 + + def it_parses_width_and_height_with_units(self): + svg_bytes = ( + b'' + b"" + ) + stream = io.BytesIO(svg_bytes) + svg = Svg.from_stream(stream) + assert svg.px_width == 192 # 2 * 96 + assert svg.px_height == 96 # 1 * 96 + + def it_uses_default_dimensions_when_no_size_info(self): + svg_bytes = ( + b'' + b"" + ) + stream = io.BytesIO(svg_bytes) + svg = Svg.from_stream(stream) + assert svg.px_width == 300 + assert svg.px_height == 150 + + def it_uses_96_dpi(self): + svg_bytes = ( + b'' + b"" + ) + stream = io.BytesIO(svg_bytes) + svg = Svg.from_stream(stream) + assert svg.horz_dpi == 96 + assert svg.vert_dpi == 96 + + +class Describe_is_svg_stream: + def it_returns_True_for_an_svg_stream(self): + svg_bytes = b'' + stream = io.BytesIO(svg_bytes) + assert is_svg_stream(stream) is True + + def it_returns_True_for_svg_with_xml_declaration(self): + svg_bytes = ( + b'' + b'' + ) + stream = io.BytesIO(svg_bytes) + assert is_svg_stream(stream) is True + + def it_returns_True_for_svg_with_BOM(self): + svg_bytes = ( + b"\xef\xbb\xbf" + b'' + ) + stream = io.BytesIO(svg_bytes) + assert is_svg_stream(stream) is True + + def it_returns_False_for_a_non_svg_stream(self): + stream = io.BytesIO(b"not an svg file at all") + assert is_svg_stream(stream) is False + + def it_returns_False_for_non_svg_xml(self): + stream = io.BytesIO(b'') + 
@pytest.mark.parametrize(
    "bad_value",
    [
        "UNIMPLEMENTED FIELD TYPE",
        "Multi-value short integer NOT IMPLEMENTED",
        None,
        object(),
    ],
)
def it_falls_back_to_72_dpi_when_resolution_tag_is_non_numeric(self, bad_value):
    # -- the resolution unit is held at 2 in every case; only the X/Y resolution
    # -- entries carry the non-numeric value under test --
    ifd_entries = _IfdEntries(
        {
            TIFF_TAG.RESOLUTION_UNIT: 2,
            TIFF_TAG.X_RESOLUTION: bad_value,
            TIFF_TAG.Y_RESOLUTION: bad_value,
        }
    )
    parser = _TiffParser(ifd_entries)

    # -- prior to the guard, these would raise TypeError on round() --
    assert parser.horz_dpi == 72
    assert parser.vert_dpi == 72
@pytest.mark.parametrize("bad", [0, 9, -1, "2", None, True])
def it_returns_None_for_absent_or_out_of_range_orientation(self, bad):
    # -- a `None` parameter stands for "tag absent": no ORIENTATION entry at all --
    if bad is None:
        ifd_contents = {}
    else:
        ifd_contents = {TIFF_TAG.ORIENTATION: bad}
    parser = _TiffParser(_IfdEntries(ifd_contents))

    assert parser.orientation is None
def it_reads_two_packed_shorts_inline(self):
    # -- 8 bytes of padding stand in for the leading part of the IFD entry; the
    # -- 4 bytes after them hold two big-endian shorts (42 and 84). With
    # -- value_count=2 the value still fits in that 4-byte slot at offset+8 --
    entry_prefix = b"\x00" * 8
    inline_slot = b"\x00\x2a\x00\x54"
    reader = StreamReader(io.BytesIO(entry_prefix + inline_slot), BIG_ENDIAN)

    parsed = _ShortIfdEntry._parse_value(reader, 0, 2, 0)

    assert parsed == 42
annotations + +import io +import struct + +import pytest + +from docx.image.constants import MIME_TYPE +from docx.image.exceptions import InvalidImageStreamError +from docx.image.image import _ImageHeaderFactory +from docx.image.webp import WebP + + +def _riff_container(chunk_fourcc: bytes, chunk_payload: bytes) -> bytes: + """Build a minimal RIFF/WEBP container wrapping `chunk_fourcc`+payload.""" + chunk = chunk_fourcc + struct.pack(" bytes: + """Build a minimal VP8 lossy payload with the given 14-bit dims.""" + # 3-byte frame tag (keyframe, bit0=0) + 3-byte start code + 2-byte width + # with 2-bit scale + 2-byte height with 2-bit scale. Remaining bytes are + # bitstream which we never parse. + frame_tag = b"\x00\x00\x00" + start_code = b"\x9d\x01\x2a" + w = struct.pack(" bytes: + """Build a minimal VP8L lossless payload encoding (w-1, h-1).""" + sig = b"\x2f" + bits = ((width - 1) & 0x3FFF) | (((height - 1) & 0x3FFF) << 14) + return sig + struct.pack(" bytes: + """Build a minimal VP8X extended payload encoding (w-1, h-1) as 24-bit.""" + flags = b"\x00" * 4 # flags + reserved + w_minus_1 = width - 1 + h_minus_1 = height - 1 + w_bytes = bytes((w_minus_1 & 0xFF, (w_minus_1 >> 8) & 0xFF, (w_minus_1 >> 16) & 0xFF)) + h_bytes = bytes((h_minus_1 & 0xFF, (h_minus_1 >> 8) & 0xFF, (h_minus_1 >> 16) & 0xFF)) + return flags + w_bytes + h_bytes + + +class DescribeWebP: + def it_parses_a_vp8_lossy_stream(self): + blob = _riff_container(b"VP8 ", _vp8_payload(200, 100)) + stream = io.BytesIO(blob) + + webp = WebP.from_stream(stream) + + assert webp.px_width == 200 + assert webp.px_height == 100 + assert webp.horz_dpi == 72 + assert webp.vert_dpi == 72 + + def it_parses_a_vp8l_lossless_stream(self): + blob = _riff_container(b"VP8L", _vp8l_payload(640, 480)) + stream = io.BytesIO(blob) + + webp = WebP.from_stream(stream) + + assert webp.px_width == 640 + assert webp.px_height == 480 + + def it_parses_a_vp8x_extended_stream(self): + blob = _riff_container(b"VP8X", 
_vp8x_payload(1024, 768)) + stream = io.BytesIO(blob) + + webp = WebP.from_stream(stream) + + assert webp.px_width == 1024 + assert webp.px_height == 768 + + def it_knows_its_content_type(self): + assert WebP(0, 0, 0, 0).content_type == MIME_TYPE.WEBP + + def it_knows_its_default_ext(self): + assert WebP(0, 0, 0, 0).default_ext == "webp" + + def it_rejects_a_non_riff_stream(self): + with pytest.raises(InvalidImageStreamError): + WebP.from_stream(io.BytesIO(b"NOT A WEBP" + b"\x00" * 64)) + + def it_rejects_an_unknown_chunk_type(self): + blob = b"RIFF" + struct.pack(" str: diff --git a/tests/opc/test_crypto.py b/tests/opc/test_crypto.py new file mode 100644 index 000000000..f9fa1d3d3 --- /dev/null +++ b/tests/opc/test_crypto.py @@ -0,0 +1,149 @@ +# pyright: reportPrivateUsage=false + +"""Unit-test suite for `docx.opc._crypto` module.""" + +from __future__ import annotations + +import importlib.util +import io +import os + +import pytest + +from docx.exceptions import EncryptedDocumentError +from docx.opc import _crypto +from docx.opc._crypto import ( + _OLE_SIGNATURE, + decrypt_stream, + encrypt_bytes, + is_encrypted_stream, +) + +# -- gracefully skip tests that depend on the optional python-ooxml-crypto +# -- package when it is not installed (matches the pptx convention). 
def _default_docx_bytes() -> bytes:
    """Return the raw bytes of the `test_files/test.docx` fixture."""
    # -- two dirname() hops from this module reach the directory that holds
    # -- test_files/; test.docx there is a valid zip package --
    tests_dir = os.path.dirname(os.path.dirname(__file__))
    fixture_path = os.path.join(tests_dir, "test_files", "test.docx")
    with open(fixture_path, "rb") as fixture:
        payload = fixture.read()
    return payload
@requires_ooxml_crypto
def it_raises_on_malformed_encrypted_input(self):
    # -- bytes that pass the OLE sniff but fail further down the ooxml_crypto path.
    # -- Skipped when the optional package is absent: in that case decrypt_stream
    # -- raises EncryptedDocumentError for the missing dependency, so the test
    # -- would pass without ever reaching the malformed-input handling --
    garbage = _OLE_SIGNATURE + b"\x00" * 4096
    with pytest.raises(EncryptedDocumentError):
        decrypt_stream(io.BytesIO(garbage), "pw")
class Describe_missing_dep_message:
    """The message exposed when python-ooxml-crypto is absent."""

    def it_mentions_the_package_and_pip_install(self):
        message = _crypto._MISSING_DEP_MSG
        # -- both the package name and the install hint must appear in the text --
        for fragment in ("python-ooxml-crypto", "pip install"):
            assert fragment in message
+ rel_xml = ( + '' + ) + rel = parse_xml(rel_xml) + assert rel.target_mode == RTM.EXTERNAL + + def it_reports_external_for_empty_or_missing_targets(self): + rel_xml = ( + '' + ) + rel = parse_xml(rel_xml) + assert rel.target_mode == RTM.EXTERNAL + + def it_preserves_external_target_mode_for_real_external_targets(self): + rel_xml = ( + '' + ) + rel = parse_xml(rel_xml) + assert rel.target_mode == RTM.EXTERNAL + def it_can_construct_from_attribute_values(self): cases = ( ("rId9", "ReLtYpE", "foo/bar.xml", None), @@ -90,8 +123,11 @@ def it_can_build_rels_element_incrementally(self): assert serialize_for_reading(rels) == expected_rels_xml def it_can_generate_rels_file_xml(self): + # -- Shared runtime emits double-quoted XML declaration attributes + # -- to match Microsoft Office output; pre-0.2 docx relied on + # -- lxml's single-quoted default. -- expected_xml = ( - "\n" + '\n' ''.encode("utf-8") ) @@ -130,3 +166,23 @@ def it_can_build_types_element_incrementally(self): types.add_override("/docProps/thumbnail.jpeg", "image/jpeg") expected_types_xml = a_Types().xml assert types.xml == expected_types_xml + + +class Describe_parse_xml_recovery: + """Unit-test suite for `parse_xml(..., recover=True)`.""" + + def it_raises_on_malformed_xml_by_default(self): + with pytest.raises(etree.XMLSyntaxError): + parse_xml(b"unclosed") + + def it_returns_partial_tree_in_recover_mode(self): + element = parse_xml(b"unclosed", recover=True) + + assert element is not None + assert element.tag == "root" + + def it_recovers_mismatched_end_tags(self): + element = parse_xml(b"", recover=True) + + assert element is not None + assert element.tag == "a" diff --git a/tests/opc/test_package.py b/tests/opc/test_package.py index d8fcef453..fd44431ac 100644 --- a/tests/opc/test_package.py +++ b/tests/opc/test_package.py @@ -38,10 +38,37 @@ def it_can_open_a_pkg_file(self, PackageReader_, PartFactory_, Unmarshaller_): # exercise --------------------- pkg = OpcPackage.open(pkg_file) # verify 
def it_activates_huge_tree_mode_when_requested(
    self, PackageReader_, PartFactory_, Unmarshaller_
):
    # -- upstream#1086: `huge_tree=True` must be plumbed through Package.open
    # -- so that extremely large documents parse without raising on
    # -- AttValue>10MB or 256-deep nesting.
    from docx.oxml.parser import _huge_tree_state

    observed: list[bool] = []

    def _capture_flag(pkg_reader, pkg, part_factory):
        # -- runs in place of the real unmarshal step, i.e. while open() is active --
        observed.append(_huge_tree_state.active)

    Unmarshaller_.unmarshal.side_effect = _capture_flag
    pkg_file = Mock(name="pkg_file")

    OpcPackage.open(pkg_file, huge_tree=True)

    # -- the flag was on during unmarshalling ... --
    assert observed == [True]
    # -- and the state is reset after the call --
    assert _huge_tree_state.active is False
def it_raises_on_save_path_with_windows_invalid_chars(self):
    # -- upstream#1111: historically `Document.save("foo:bar.docx")` silently
    # -- produced no file / an empty file on Windows. Raise instead.
    pkg = OpcPackage()
    # -- one filename per character Windows forbids in a file name; every entry
    # -- must actually contain such a character, otherwise save() would proceed --
    invalid_names = [
        "foo:bar.docx",
        "foo|bar.docx",
        'foo"bar.docx',
        "foo?bar.docx",
        "foo*bar.docx",
        "foo<bar.docx",
        "foo>bar.docx",
    ]
    for path in invalid_names:
        with pytest.raises(OSError, match="Windows-invalid"):
            pkg.save(path)
`officeDocument/2006/.../core-properties` alternate + # -- reltype. We must discover it rather than creating a duplicate. + def _lookup(self, reltype): + if reltype == RT.CORE_PROPERTIES: + raise KeyError + if reltype == RT.CORE_PROPERTIES_ALT: + return core_properties_part_ + raise KeyError + + part_related_by_.side_effect = _lookup + opc_package = OpcPackage() + + core_properties_part = opc_package._core_properties_part + + assert core_properties_part is core_properties_part_ + CorePropertiesPart_.default.assert_not_called() + relate_to_.assert_not_called() + + def it_remaps_clashing_cp_prefix_on_load(self, part_related_by_): + # -- upstream#1037: LibreOffice-produced core.xml where ``cp:`` is + # -- bound to the custom-properties URI collides with python-docx's + # -- own use of ``cp:`` for core-properties. Writing to such a part + # -- produced a duplicate ``cp:lastModifiedBy`` at serialise time. + # -- The load-side remap rewrites the offending binding to + # -- ``custprops:`` so python-docx can safely emit its own ``cp:*`` + # -- elements afterwards. + from docx.opc.constants import CONTENT_TYPE as CT + from docx.opc.packuri import PackURI + from docx.opc.parts.coreprops import CorePropertiesPart + from docx.oxml.parser import parse_xml + + libre_xml = ( + b'\n' + b'' + b'original' + b'old_user' + b'' + ) + element = parse_xml(libre_xml) + part = CorePropertiesPart( + PackURI("/docProps/core.xml"), CT.OPC_CORE_PROPERTIES, element, None + ) + part_related_by_.return_value = part + opc_package = OpcPackage() + + core_properties_part = opc_package._core_properties_part + + # -- After remap, the root's ``cp:`` binding must point to the + # -- core-properties URI, not the custom-properties URI. -- + assert core_properties_part.element.nsmap.get("cp") == ( + "http://schemas.openxmlformats.org/package/2006/metadata/core-properties" + ) + # -- Writing ``last_modified_by`` afterwards must not create a + # -- duplicate ``cp:lastModifiedBy`` in the serialised output. 
-- + core_properties_part.core_properties.last_modified_by = "new_user" + blob = core_properties_part.blob + assert blob.count(b"cp:lastModifiedBy") <= 2 # one open, one close + + def it_leaves_well_formed_cp_binding_untouched(self, part_related_by_): + # -- No-op case: an ordinary core.xml where ``cp:`` is bound to the + # -- core-properties URI must be passed through unchanged. + from docx.opc.constants import CONTENT_TYPE as CT + from docx.opc.packuri import PackURI + from docx.opc.parts.coreprops import CorePropertiesPart + from docx.oxml.parser import parse_xml + + good_xml = ( + b'\n' + b'' + b'u' + b'' + ) + element = parse_xml(good_xml) + original_id = id(element) + part = CorePropertiesPart( + PackURI("/docProps/core.xml"), CT.OPC_CORE_PROPERTIES, element, None + ) + part_related_by_.return_value = part + opc_package = OpcPackage() + + core_properties_part = opc_package._core_properties_part + + # -- Same underlying element object: the remap early-outed. -- + assert id(core_properties_part.element) == original_id + # fixtures --------------------------------------------- @pytest.fixture diff --git a/tests/opc/test_phys_pkg.py b/tests/opc/test_phys_pkg.py index 6de0d868b..5b8c559a5 100644 --- a/tests/opc/test_phys_pkg.py +++ b/tests/opc/test_phys_pkg.py @@ -6,6 +6,7 @@ import pytest +from docx.exceptions import EncryptedDocumentError from docx.opc.exceptions import PackageNotFoundError from docx.opc.packuri import PACKAGE_URI, PackURI from docx.opc.phys_pkg import ( @@ -16,6 +17,8 @@ _ZipPkgWriter, ) +_OLE_SIGNATURE = b"\xd0\xcf\x11\xe0\xa1\xb1\x1a\xe1" + from ..unitutil.file import absjoin, test_file_dir from ..unitutil.mock import Mock, class_mock, loose_mock @@ -52,6 +55,14 @@ def it_returns_none_when_part_has_no_rels_xml(self, dir_reader): rels_xml = dir_reader.rels_xml_for(partname) assert rels_xml is None + def it_raises_on_path_traversal(self, dir_reader): + # As of python-ooxml-opc 0.1.0, PackURI construction itself refuses + # '..' 
path segments, so the guard fires earlier than the reader's + # filesystem check. Both behaviours are correct; this asserts the + # earliest, strongest guarantee. + with pytest.raises(ValueError, match=r"\.\."): + PackURI("/../../../etc/passwd") + # fixtures --------------------------------------------- @pytest.fixture @@ -68,6 +79,58 @@ def it_raises_when_pkg_path_is_not_a_package(self): with pytest.raises(PackageNotFoundError): PhysPkgReader("foobar") + def it_raises_FileNotFoundError_when_path_does_not_exist(self, tmp_path): + # -- upstream#1410: distinguish missing file from not-a-zip file -- + missing = str(tmp_path / "no-such-file.docx") + + with pytest.raises(FileNotFoundError): + PhysPkgReader(missing) + + def it_still_satisfies_PackageNotFoundError_for_missing_file(self, tmp_path): + # -- backward-compat: existing callers catching PackageNotFoundError + # -- still work for the missing-file case. -- + missing = str(tmp_path / "no-such-file.docx") + + with pytest.raises(PackageNotFoundError): + PhysPkgReader(missing) + + def it_raises_NotADocxError_when_file_exists_but_is_not_a_zip(self, tmp_path): + from docx.opc.exceptions import NotADocxError + + not_a_zip = tmp_path / "bogus.docx" + not_a_zip.write_bytes(b"this is plain text, not a zip") + + with pytest.raises(NotADocxError): + PhysPkgReader(str(not_a_zip)) + + def it_raises_EncryptedDocumentError_for_OLE_path(self, tmp_path): + encrypted_path = tmp_path / "encrypted.docx" + # -- OLE signature + some trailing bytes; enough to look like an OLE file -- + encrypted_path.write_bytes(_OLE_SIGNATURE + b"\x00" * 512) + + with pytest.raises(EncryptedDocumentError, match="python-ooxml-crypto"): + PhysPkgReader(str(encrypted_path)) + + def it_raises_EncryptedDocumentError_for_OLE_stream(self): + stream = io.BytesIO(_OLE_SIGNATURE + b"\x00" * 512) + + with pytest.raises(EncryptedDocumentError, match="password-protected"): + PhysPkgReader(stream) + + def it_restores_stream_position_when_detecting_encryption(self): + 
stream = io.BytesIO(_OLE_SIGNATURE + b"\x00" * 512) + stream.seek(0) + + with pytest.raises(EncryptedDocumentError): + PhysPkgReader(stream) + + assert stream.tell() == 0 + + def it_opens_a_normal_zip_stream_without_raising(self): + with open(zip_pkg_path, "rb") as stream: + phys_reader = PhysPkgReader(stream) + assert isinstance(phys_reader, _ZipPkgReader) + class DescribeZipPkgReader: def it_is_used_by_PhysPkgReader_when_pkg_is_a_zip(self): @@ -169,6 +232,48 @@ def pkg_file(self): pkg_file.close() +class DescribeReproducibleZipPkgWriter: + """Exercises the deterministic-save path (upstream#1042).""" + + def it_uses_fixed_timestamps_for_every_member(self, tmp_docx_path): + from docx.opc.phys_pkg import REPRODUCIBLE_TIMESTAMP + + pkg_writer = PhysPkgWriter(tmp_docx_path, reproducible=True) + pkg_writer.write(PackURI("/a.xml"), b"") + pkg_writer.write(PackURI("/b.xml"), b"") + pkg_writer.close() + + with ZipFile(tmp_docx_path, "r") as zipf: + for info in zipf.infolist(): + assert info.date_time == REPRODUCIBLE_TIMESTAMP + + def it_writes_members_in_sorted_order(self, tmp_docx_path): + pkg_writer = PhysPkgWriter(tmp_docx_path, reproducible=True) + # -- write in reverse order deliberately -- + pkg_writer.write(PackURI("/z.xml"), b"") + pkg_writer.write(PackURI("/a.xml"), b"") + pkg_writer.write(PackURI("/m.xml"), b"") + pkg_writer.close() + + with ZipFile(tmp_docx_path, "r") as zipf: + names = zipf.namelist() + assert names == sorted(names) + + def it_produces_byte_identical_output_across_runs(self, tmp_path): + import io + from docx import Document + + document = Document() + document.add_paragraph("hello world") + + out1 = io.BytesIO() + out2 = io.BytesIO() + document.save(out1, reproducible=True) + document.save(out2, reproducible=True) + + assert out1.getvalue() == out2.getvalue() + + # fixtures ------------------------------------------------- diff --git a/tests/opc/test_pkgreader.py b/tests/opc/test_pkgreader.py index 0aed52c8d..83dd53856 100644 --- 
a/tests/opc/test_pkgreader.py +++ b/tests/opc/test_pkgreader.py @@ -41,7 +41,7 @@ def it_can_construct_from_pkg_file( pkg_reader = PackageReader.from_file(pkg_file) - PhysPkgReader_.assert_called_once_with(pkg_file) + PhysPkgReader_.assert_called_once_with(pkg_file, password=None) from_xml.assert_called_once_with(phys_reader.content_types_xml) _srels_for.assert_called_once_with(phys_reader, "/") _load_serialized_parts.assert_called_once_with(phys_reader, pkg_srels, content_types) @@ -119,29 +119,33 @@ def it_can_walk_phys_pkg_parts(self, _srels_for): part_1_blob, part_2_blob, part_3_blob = ("", "", "") reltype1, reltype2, reltype3 = ("reltype1", "reltype2", "reltype3") srels = [ - Mock(name="rId1", is_external=True), + Mock(name="rId1", is_external=True, target_ref="http://external/"), Mock( name="rId2", is_external=False, reltype=reltype1, + target_ref="name1.xml", target_partname=partname_1, ), Mock( name="rId3", is_external=False, reltype=reltype2, + target_ref="name2.xml", target_partname=partname_2, ), Mock( name="rId4", is_external=False, reltype=reltype1, + target_ref="name1.xml", target_partname=partname_1, ), Mock( name="rId5", is_external=False, reltype=reltype3, + target_ref="name3.xml", target_partname=partname_3, ), ] @@ -163,6 +167,50 @@ def it_can_walk_phys_pkg_parts(self, _srels_for): ] assert generated_tuples == expected_tuples + def it_skips_rels_whose_target_is_a_pure_fragment(self, _srels_for): + # -- Internal-bookmark hyperlinks emit `Target="#bookmark1"` with + # -- TargetMode="Internal". Those rels must not cause a part lookup — + # -- they have no backing package part. upstream#902 / #1349 / #678. 
+ fragment_srel = Mock( + name="fragment_srel", + is_external=False, + target_ref="#bookmark1", + ) + phys_reader = Mock(name="phys_reader") + generated = list(PackageReader._walk_phys_parts(phys_reader, [fragment_srel])) + assert generated == [] + phys_reader.blob_for.assert_not_called() + + def it_skips_rels_whose_target_is_null_or_empty(self, _srels_for): + phys_reader = Mock(name="phys_reader") + cases = ("", None) + for target_ref in cases: + srel = Mock( + name="empty_srel", + is_external=False, + target_ref=target_ref, + ) + generated = list(PackageReader._walk_phys_parts(phys_reader, [srel])) + assert generated == [] + phys_reader.blob_for.assert_not_called() + + def it_tolerates_rels_pointing_at_missing_parts(self, _srels_for): + # -- Word-style loose docs sometimes declare a rel whose target zip + # -- entry has been dropped. Don't crash the entire package load on a + # -- single dangling rel. upstream-PR#1219. + missing_srel = Mock( + name="missing_srel", + is_external=False, + reltype="rt1", + target_ref="ghost.xml", + target_partname="/ghost.xml", + ) + _srels_for.return_value = [] + phys_reader = Mock(name="phys_reader") + phys_reader.blob_for.side_effect = KeyError("ghost.xml") + generated = list(PackageReader._walk_phys_parts(phys_reader, [missing_srel])) + assert generated == [] + def it_can_retrieve_srels_for_a_source_uri(self, _SerializedRelationships_): # mockery ---------------------- phys_reader = Mock(name="phys_reader") @@ -439,6 +487,21 @@ def it_can_calculate_its_target_partname(self): # verify ------------------- assert srel.target_partname == expected_partname + def it_normalises_windows_backslashes_in_target_ref(self): + # -- upstream-PR#1205: some DOCX producers emit `Target="media\image1.png"` + # -- which must be normalised to `media/image1.png` so PackURI.from_rel_ref + # -- produces the correct partname. 
+ rel_elm = Mock( + name="rel_elm", + rId="rId1", + reltype="ReLtYpE", + target_ref="media\\image1.png", + target_mode=RTM.INTERNAL, + ) + srel = _SerializedRelationship("/word", rel_elm) + assert srel.target_ref == "media/image1.png" + assert srel.target_partname == "/word/media/image1.png" + def it_raises_on_target_partname_when_external(self): rel_elm = Mock( name="rel_elm", diff --git a/tests/opc/test_pkgwriter.py b/tests/opc/test_pkgwriter.py index aff8b22d9..43e6da789 100644 --- a/tests/opc/test_pkgwriter.py +++ b/tests/opc/test_pkgwriter.py @@ -38,9 +38,9 @@ def it_can_write_a_package(self, PhysPkgWriter_, _write_methods): expected_calls = [ call._write_content_types_stream(phys_writer, parts), call._write_pkg_rels(phys_writer, pkg_rels), - call._write_parts(phys_writer, parts), + call._write_parts(phys_writer, parts, reproducible=False), ] - PhysPkgWriter_.assert_called_once_with(pkg_file) + PhysPkgWriter_.assert_called_once_with(pkg_file, reproducible=False) assert _write_methods.mock_calls == expected_calls phys_writer.close.assert_called_once_with() @@ -149,6 +149,23 @@ def it_can_compose_content_types_element(self, xml_for_fixture): types_elm = cti._element assert types_elm.xml == expected_xml + def it_emits_vbaProject_as_Default_not_Override(self, request: FixtureRequest): + # Word rejects .docm packages where the vbaProject content type is + # written as <Override>; it must be a <Default>. + part_ = self._mock_part( + request, "vba_part_", "/word/vbaProject.bin", CT.WML_VBA_PROJECT + ) + cti = _ContentTypesItem.from_parts([part_]) + + xml = cti._element.xml + + assert ( + '' + ) in xml + # ... 
and it must not appear as an Override + assert "vbaProject.bin" not in xml + # fixtures --------------------------------------------- def _mock_part(self, request: FixtureRequest, name, partname_str, content_type): diff --git a/tests/oxml/test__init__.py b/tests/oxml/test__init__.py index 9f19094b4..518e02c6f 100644 --- a/tests/oxml/test__init__.py +++ b/tests/oxml/test__init__.py @@ -80,6 +80,85 @@ def xml_bytes(self): ).encode("utf-8") +class DescribeParseXmlRecovery: + def it_raises_XMLSyntaxError_by_default_on_malformed_xml(self): + with pytest.raises(etree.XMLSyntaxError): + parse_xml(b"unclosed") + + def it_returns_partial_tree_when_recover_is_True(self): + element = parse_xml(b"unclosed", recover=True) + + assert element is not None + assert element.tag.endswith("}p") or element.tag == "p" or element.tag.endswith("p") + + def it_returns_None_when_input_is_unrecoverable(self): + from docx.oxml.parser import recovery_mode + + with recovery_mode() as warnings: + element = parse_xml(b"", recover=True) + + assert element is None + assert len(warnings) >= 1 + + def it_accumulates_warnings_under_recovery_mode(self): + from docx.oxml.parser import recovery_mode + + with recovery_mode() as warnings: + parse_xml(b"") + + assert len(warnings) >= 1 + assert all(isinstance(w, str) for w in warnings) + + def it_deactivates_recovery_after_context_exits(self): + from docx.oxml.parser import _recovery_state, recovery_mode + + with recovery_mode(): + assert _recovery_state.active is True + assert _recovery_state.active is False + # -- and subsequent calls without recover=True raise as usual -- + with pytest.raises(etree.XMLSyntaxError): + parse_xml(b"unclosed") + + +class DescribeParseXmlHugeTree: + def it_rejects_attvalue_over_10mb_by_default(self): + # -- libxml2's default AttValue cap is 10 MB. Building a 12 MB attvalue + # -- triggers the "AttValue length too long" XMLSyntaxError — that's + # -- the exact failure mode upstream#1086 describes. 
+ from lxml import etree as _etree + + big = "x" * (12 * 1024 * 1024) + xml = '' % big + with pytest.raises(_etree.XMLSyntaxError): + parse_xml(xml.encode("utf-8")) + + def it_parses_attvalue_over_10mb_when_huge_tree_active(self): + from docx.oxml.parser import huge_tree_mode + + big = "x" * (12 * 1024 * 1024) + xml = '' % big + with huge_tree_mode(): + element = parse_xml(xml.encode("utf-8")) + assert element is not None + assert element.get("a") == big + + def it_deactivates_huge_tree_after_context_exits(self): + from docx.oxml.parser import _huge_tree_state, huge_tree_mode + + with huge_tree_mode(): + assert _huge_tree_state.active is True + assert _huge_tree_state.active is False + + def it_composes_with_recovery_mode(self): + from docx.oxml.parser import huge_tree_mode, recovery_mode + + with huge_tree_mode(), recovery_mode() as warnings: + element = parse_xml(b"unclosed") + # -- malformed + huge-tree must still recover, not crash -- + assert element is not None + assert isinstance(warnings, list) + + class DescribeRegisterElementCls: def it_determines_class_used_for_elements_with_matching_tagname(self, xml_text): register_element_cls("a:foo", CustElmCls) diff --git a/tests/oxml/test_bibliography.py b/tests/oxml/test_bibliography.py new file mode 100644 index 000000000..f0499ec7b --- /dev/null +++ b/tests/oxml/test_bibliography.py @@ -0,0 +1,141 @@ +# pyright: reportPrivateUsage=false + +"""Unit-test suite for `docx.oxml.bibliography` module.""" + +from __future__ import annotations + +from typing import cast + +import pytest + +from docx.oxml.bibliography import CT_Source, CT_Sources, new_sources_root +from docx.oxml.ns import qn + + +def _empty_sources() -> CT_Sources: + return new_sources_root() + + +class DescribeCT_Sources: + """Unit-test suite for `docx.oxml.bibliography.CT_Sources`.""" + + def it_starts_with_an_empty_source_lst(self): + sources = _empty_sources() + + assert sources.source_lst == [] + + def 
it_carries_default_selected_style_and_style_name(self): + sources = _empty_sources() + + assert sources.selected_style == "/APA.XSL" + assert sources.style_name == "APA" + + def it_can_add_a_simple_source(self): + sources = _empty_sources() + + src = sources.add_source_from_kwargs( + "smith2020", title="Test Book", author="Smith, J.", year=2020 + ) + + assert isinstance(src, CT_Source) + assert src.tag_val == "smith2020" + assert src.title == "Test Book" + assert src.year == "2020" + assert src.source_type == "Book" + assert src.author == "Smith, J." + + def it_defaults_source_type_to_Book(self): + sources = _empty_sources() + + src = sources.add_source_from_kwargs("x") + + assert src.source_type == "Book" + + def it_respects_explicit_source_type(self): + sources = _empty_sources() + + src = sources.add_source_from_kwargs("x", source_type="JournalArticle") + + assert src.source_type == "JournalArticle" + + def it_exposes_extra_kwargs_as_text_children(self): + sources = _empty_sources() + + src = sources.add_source_from_kwargs("x", city="London", publisher="Acme") + + # -- two direct children with capitalized tag names -- + city = src.find(qn("b:City")) + publisher = src.find(qn("b:Publisher")) + assert city is not None and city.text == "London" + assert publisher is not None and publisher.text == "Acme" + + def it_can_look_up_a_source_by_tag(self): + sources = _empty_sources() + sources.add_source_from_kwargs("alpha") + target = sources.add_source_from_kwargs("beta") + sources.add_source_from_kwargs("gamma") + + assert sources.get_source_by_tag("beta") is target + assert sources.get_source_by_tag("missing") is None + + def it_allows_clearing_the_selected_style(self): + sources = _empty_sources() + + sources.selected_style = None + sources.style_name = None + + assert sources.selected_style is None + assert sources.style_name is None + + def it_appends_each_new_source_to_the_end(self): + sources = _empty_sources() + + sources.add_source_from_kwargs("a") + 
sources.add_source_from_kwargs("b") + sources.add_source_from_kwargs("c") + + tags = [s.tag_val for s in sources.source_lst] + assert tags == ["a", "b", "c"] + + +class DescribeCT_Source: + """Unit-test suite for `docx.oxml.bibliography.CT_Source`.""" + + def its_author_falls_back_to_the_person_NameList_form(self): + sources = _empty_sources() + # -- build a source with a Person-style Author block by hand -- + from docx.oxml.parser import OxmlElement + + src = sources.add_source_from_kwargs("k") + # -- remove the Corporate-style Author the helper generated -- + for author_root in src.findall(qn("b:Author")): + src.remove(author_root) + b_ns = "http://schemas.openxmlformats.org/officeDocument/2006/bibliography" + + def _e(tag: str, text: "str | None" = None): + e = OxmlElement(f"b:{tag}", nsdecls={"b": b_ns}) + if text is not None: + e.text = text + return e + + author_root = _e("Author") + inner = _e("Author") + name_list = _e("NameList") + person = _e("Person") + person.append(_e("First", "Jane")) + person.append(_e("Last", "Doe")) + name_list.append(person) + inner.append(name_list) + author_root.append(inner) + src.append(author_root) + + assert src.author == "Jane Doe" + + def its_author_is_None_when_no_author_is_set(self): + sources = _empty_sources() + src = sources.add_source_from_kwargs("k") + # -- drop the helper-added Author -- + for author_root in src.findall(qn("b:Author")): + src.remove(author_root) + + assert src.author is None diff --git a/tests/oxml/test_bookmarks.py b/tests/oxml/test_bookmarks.py new file mode 100644 index 000000000..6b55445b9 --- /dev/null +++ b/tests/oxml/test_bookmarks.py @@ -0,0 +1,31 @@ +# pyright: reportPrivateUsage=false + +"""Unit-test suite for `docx.oxml.bookmarks` module.""" + +from __future__ import annotations + +from typing import cast + +from docx.oxml.bookmarks import CT_BookmarkEnd, CT_BookmarkStart + +from ..unitutil.cxml import element + + +class DescribeCT_BookmarkStart: + """Unit-test suite for 
`docx.oxml.bookmarks.CT_BookmarkStart`.""" + + def it_knows_its_id(self): + bookmarkStart = cast(CT_BookmarkStart, element("w:bookmarkStart{w:id=7,w:name=bm1}")) + assert bookmarkStart.id == 7 + + def it_knows_its_name(self): + bookmarkStart = cast(CT_BookmarkStart, element("w:bookmarkStart{w:id=7,w:name=bm1}")) + assert bookmarkStart.name == "bm1" + + +class DescribeCT_BookmarkEnd: + """Unit-test suite for `docx.oxml.bookmarks.CT_BookmarkEnd`.""" + + def it_knows_its_id(self): + bookmarkEnd = cast(CT_BookmarkEnd, element("w:bookmarkEnd{w:id=7}")) + assert bookmarkEnd.id == 7 diff --git a/tests/oxml/test_chart.py b/tests/oxml/test_chart.py new file mode 100644 index 000000000..27e519430 --- /dev/null +++ b/tests/oxml/test_chart.py @@ -0,0 +1,155 @@ +# pyright: reportPrivateUsage=false + +"""Unit-test suite for `docx.oxml.chart` module.""" + +from __future__ import annotations + +from typing import cast + +import pytest + +from docx.oxml.chart import CT_BarChart, CT_Chart, CT_ChartSpace, CT_PlotArea, CT_Ser + +from ..unitutil.cxml import element + + +class DescribeCT_ChartSpace: + def it_provides_access_to_its_chart_child(self): + cs = cast(CT_ChartSpace, element("c:chartSpace/c:chart")) + assert cs.chart is not None + assert isinstance(cs.chart, CT_Chart) + + def and_returns_None_when_chart_is_absent(self): + cs = cast(CT_ChartSpace, element("c:chartSpace")) + assert cs.chart is None + + +class DescribeCT_Chart: + def it_provides_access_to_its_plotArea(self): + chart = cast(CT_Chart, element("c:chart/c:plotArea")) + assert chart.plotArea is not None + assert isinstance(chart.plotArea, CT_PlotArea) + + def it_extracts_the_title_text(self): + cxml = 'c:chart/c:title/c:tx/c:rich/a:p/(a:r/a:t"Foo",a:r/a:t" Bar")' + chart = cast(CT_Chart, element(cxml)) + assert chart.title_text == "Foo Bar" + + def it_returns_None_when_no_title_present(self): + chart = cast(CT_Chart, element("c:chart/c:plotArea")) + assert chart.title_text is None + + @pytest.mark.parametrize( + 
("cxml", "expected"), + [ + ("c:chart/c:plotArea", False), + ("c:chart/(c:plotArea,c:legend)", True), + ], + ) + def it_knows_whether_it_has_a_legend(self, cxml: str, expected: bool): + chart = cast(CT_Chart, element(cxml)) + assert chart.has_legend == expected + + +class DescribeCT_PlotArea: + @pytest.mark.parametrize( + ("child_tag",), + [ + ("c:barChart",), + ("c:lineChart",), + ("c:pieChart",), + ("c:doughnutChart",), + ("c:scatterChart",), + ("c:areaChart",), + ], + ) + def it_finds_its_chart_kind_element(self, child_tag: str): + plotArea = cast(CT_PlotArea, element(f"c:plotArea/(c:layout,{child_tag})")) + kind = plotArea.chart_kind_element + assert kind is not None + assert kind.tag.endswith(child_tag.split(":")[1]) + + def it_returns_None_when_no_kind_child_present(self): + plotArea = cast(CT_PlotArea, element("c:plotArea/c:layout")) + assert plotArea.chart_kind_element is None + + def it_lists_its_series(self): + cxml = "c:plotArea/c:barChart/(c:ser,c:ser,c:ser)" + plotArea = cast(CT_PlotArea, element(cxml)) + assert len(plotArea.ser_lst) == 3 + + +class DescribeCT_BarChart: + @pytest.mark.parametrize( + ("direction",), + [("bar",), ("col",)], + ) + def it_reads_its_bar_direction(self, direction: str): + bar = cast( + CT_BarChart, + element(f"c:barChart/c:barDir{{val={direction}}}"), + ) + assert bar.bar_dir == direction + + def it_reads_its_grouping(self): + bar = cast(CT_BarChart, element("c:barChart/c:grouping{val=stacked}")) + assert bar.grouping == "stacked" + + +class DescribeCT_Ser: + def it_reads_its_name_from_strCache(self): + cxml = ( + "c:ser/c:tx/c:strRef/c:strCache/c:pt{idx=0}" + '/c:v"Revenue"' + ) + ser = cast(CT_Ser, element(cxml)) + assert ser.tx_name == "Revenue" + + def it_reads_its_name_from_literal_v(self): + cxml = 'c:ser/c:tx/c:v"Inline Name"' + ser = cast(CT_Ser, element(cxml)) + assert ser.tx_name == "Inline Name" + + def its_name_is_None_when_no_tx(self): + ser = cast(CT_Ser, element("c:ser")) + assert ser.tx_name is None + + def 
it_reads_categories_from_strCache(self): + cxml = ( + "c:ser/c:cat/c:strRef/c:strCache/" + '(c:pt{idx=0}/c:v"Q1",c:pt{idx=1}/c:v"Q2",c:pt{idx=2}/c:v"Q3")' + ) + ser = cast(CT_Ser, element(cxml)) + assert ser.cat_values == ["Q1", "Q2", "Q3"] + + def it_returns_empty_categories_when_absent(self): + ser = cast(CT_Ser, element("c:ser")) + assert ser.cat_values == [] + + def it_reads_values_from_numCache(self): + cxml = ( + "c:ser/c:val/c:numRef/c:numCache/" + '(c:pt{idx=0}/c:v"1.5",c:pt{idx=1}/c:v"2.0",c:pt{idx=2}/c:v"3.25")' + ) + ser = cast(CT_Ser, element(cxml)) + assert ser.val_values == [1.5, 2.0, 3.25] + + def it_falls_back_to_numLit_for_values(self): + cxml = ( + "c:ser/c:val/c:numLit/" + '(c:pt{idx=0}/c:v"7",c:pt{idx=1}/c:v"8")' + ) + ser = cast(CT_Ser, element(cxml)) + assert ser.val_values == [7.0, 8.0] + + def it_returns_empty_values_when_absent(self): + ser = cast(CT_Ser, element("c:ser")) + assert ser.val_values == [] + + def it_skips_unparseable_value_points(self): + cxml = ( + "c:ser/c:val/c:numRef/c:numCache/" + '(c:pt{idx=0}/c:v"1.0",c:pt{idx=1}/c:v"not-a-number",c:pt{idx=2}/c:v"3.0")' + ) + ser = cast(CT_Ser, element(cxml)) + assert ser.val_values == [1.0, 3.0] diff --git a/tests/oxml/test_comments.py b/tests/oxml/test_comments.py index 8fc116144..19e2aab5a 100644 --- a/tests/oxml/test_comments.py +++ b/tests/oxml/test_comments.py @@ -8,7 +8,7 @@ import pytest -from docx.oxml.comments import CT_Comments +from docx.oxml.comments import CT_Comment, CT_Comments from ..unitutil.cxml import element @@ -29,3 +29,86 @@ class DescribeCT_Comments: def it_finds_the_next_available_comment_id_to_help(self, cxml: str, expected_value: int): comments_elm = cast(CT_Comments, element(cxml)) assert comments_elm._next_available_comment_id() == expected_value + + def it_can_add_a_comment_with_a_paraId(self): + comments_elm = cast(CT_Comments, element("w:comments")) + + comment = comments_elm.add_comment() + + assert comment.paraId is not None + assert len(comment.paraId) 
== 8 + # -- paraId should be a hex string -- + int(comment.paraId, 16) + + def it_generates_unique_paraIds(self): + comments_elm = cast(CT_Comments, element("w:comments")) + + comment1 = comments_elm.add_comment() + comment2 = comments_elm.add_comment() + + assert comment1.paraId != comment2.paraId + + def it_can_add_a_reply_comment(self): + comments_elm = cast(CT_Comments, element("w:comments")) + parent = comments_elm.add_comment() + parent_para_id = parent.paraId + assert parent_para_id is not None + + reply = comments_elm.add_reply(parent_para_id) + + assert reply.paraIdParent == parent_para_id + assert reply.paraId is not None + assert reply.paraId != parent_para_id + assert reply.id != parent.id + + def it_can_find_replies_for_a_comment(self): + comments_elm = cast(CT_Comments, element("w:comments")) + parent = comments_elm.add_comment() + parent_para_id = parent.paraId + assert parent_para_id is not None + reply1 = comments_elm.add_reply(parent_para_id) + reply2 = comments_elm.add_reply(parent_para_id) + # -- add an unrelated comment to make sure it's not included -- + comments_elm.add_comment() + + replies = comments_elm.get_replies_for(parent_para_id) + + assert len(replies) == 2 + assert replies[0] is reply1 + assert replies[1] is reply2 + + def but_it_returns_empty_list_when_no_replies(self): + comments_elm = cast(CT_Comments, element("w:comments")) + parent = comments_elm.add_comment() + parent_para_id = parent.paraId + assert parent_para_id is not None + + replies = comments_elm.get_replies_for(parent_para_id) + + assert replies == [] + + +class DescribeCT_Comment: + """Unit-test suite for `docx.oxml.comments.CT_Comment`.""" + + def it_can_get_and_set_paraId(self): + comment_elm = cast(CT_Comment, element("w:comment{w:id=1}")) + + assert comment_elm.paraId is None + + comment_elm.paraId = "AABB0011" + assert comment_elm.paraId == "AABB0011" + + comment_elm.paraId = None + assert comment_elm.paraId is None + + def it_can_get_and_set_paraIdParent(self): 
+ comment_elm = cast(CT_Comment, element("w:comment{w:id=1}")) + + assert comment_elm.paraIdParent is None + + comment_elm.paraIdParent = "CCDD2233" + assert comment_elm.paraIdParent == "CCDD2233" + + comment_elm.paraIdParent = None + assert comment_elm.paraIdParent is None diff --git a/tests/oxml/test_content_controls.py b/tests/oxml/test_content_controls.py new file mode 100644 index 000000000..006dcac68 --- /dev/null +++ b/tests/oxml/test_content_controls.py @@ -0,0 +1,141 @@ +# pyright: reportPrivateUsage=false + +"""Unit-test suite for `docx.oxml.content_controls` module.""" + +from __future__ import annotations + +from typing import cast + +from docx.oxml.content_controls import CT_Sdt, CT_SdtContent +from docx.oxml.ns import qn + +from ..unitutil.cxml import element + + +class DescribeCT_Sdt: + """Unit-test suite for `docx.oxml.content_controls.CT_Sdt`.""" + + def it_reads_its_tag_val_from_sdtPr(self): + sdt = cast( + CT_Sdt, + element("w:sdt/w:sdtPr/w:tag{w:val=MyTag}"), + ) + assert sdt.tag_val == "MyTag" + + def it_returns_None_for_tag_val_when_not_present(self): + sdt = cast(CT_Sdt, element("w:sdt/w:sdtPr")) + assert sdt.tag_val is None + + def it_can_set_tag_val(self): + sdt = cast(CT_Sdt, element("w:sdt")) + sdt.tag_val = "MyTag" + assert sdt.tag_val == "MyTag" + assert sdt.sdtPr is not None + tag_elm = sdt.sdtPr.find(qn("w:tag")) + assert tag_elm is not None + assert tag_elm.get(qn("w:val")) == "MyTag" + + def it_reads_its_alias_val_from_sdtPr(self): + sdt = cast( + CT_Sdt, + element("w:sdt/w:sdtPr/w:alias{w:val=MyTitle}"), + ) + assert sdt.alias_val == "MyTitle" + + def it_returns_None_for_alias_val_when_not_present(self): + sdt = cast(CT_Sdt, element("w:sdt/w:sdtPr")) + assert sdt.alias_val is None + + def it_can_set_alias_val(self): + sdt = cast(CT_Sdt, element("w:sdt")) + sdt.alias_val = "Hello" + assert sdt.alias_val == "Hello" + + def it_can_remove_tag_val_by_assigning_None(self): + sdt = cast( + CT_Sdt, + 
element("w:sdt/w:sdtPr/w:tag{w:val=x}"), + ) + sdt.tag_val = None + assert sdt.tag_val is None + + def it_reads_its_id(self): + sdt = cast( + CT_Sdt, + element("w:sdt/w:sdtPr/w:id{w:val=42}"), + ) + assert sdt.sdt_id == 42 + + def it_can_set_its_id(self): + sdt = cast(CT_Sdt, element("w:sdt")) + sdt.sdt_id = 99 + assert sdt.sdt_id == 99 + + def it_detects_no_type_marker_as_rich_text_default(self): + sdt = cast(CT_Sdt, element("w:sdt/w:sdtPr")) + assert sdt.type_marker_tag() is None + + def it_detects_w_text_type_marker(self): + sdt = cast(CT_Sdt, element("w:sdt/w:sdtPr/w:text")) + assert sdt.type_marker_tag() == "w:text" + + def it_detects_w_date_type_marker(self): + sdt = cast(CT_Sdt, element("w:sdt/w:sdtPr/w:date")) + assert sdt.type_marker_tag() == "w:date" + + def it_reads_checked_value_when_present(self): + xml = ( + '' + '' + "" + ) + from docx.oxml.parser import parse_xml + + sdt = cast(CT_Sdt, parse_xml(xml)) + assert sdt.checked is True + + def it_returns_None_for_checked_when_no_checkbox(self): + sdt = cast(CT_Sdt, element("w:sdt/w:sdtPr")) + assert sdt.checked is None + + def it_can_set_checked(self): + sdt = cast(CT_Sdt, element("w:sdt")) + sdt.checked = True + assert sdt.checked is True + sdt.checked = False + assert sdt.checked is False + + def it_can_set_a_type_marker(self): + sdt = cast(CT_Sdt, element("w:sdt")) + sdt.set_type_marker("w:text") + assert sdt.type_marker_tag() == "w:text" + + def it_replaces_existing_type_marker_on_set(self): + sdt = cast(CT_Sdt, element("w:sdt/w:sdtPr/w:text")) + sdt.set_type_marker("w:date") + assert sdt.type_marker_tag() == "w:date" + assert sdt.sdtPr is not None + assert sdt.sdtPr.find(qn("w:text")) is None + + +class DescribeCT_SdtContent: + """Unit-test suite for `docx.oxml.content_controls.CT_SdtContent`.""" + + def it_concatenates_paragraph_text(self): + sdtContent = cast( + CT_SdtContent, + element('w:sdtContent/(w:p/w:r/w:t"Hello")'), + ) + assert sdtContent.text == "Hello" + + def 
it_concatenates_run_text(self): + sdtContent = cast( + CT_SdtContent, + element('w:sdtContent/(w:r/w:t"Hi")'), + ) + assert sdtContent.text == "Hi" + + def it_returns_empty_string_when_no_children(self): + sdtContent = cast(CT_SdtContent, element("w:sdtContent")) + assert sdtContent.text == "" diff --git a/tests/oxml/test_custom_properties.py b/tests/oxml/test_custom_properties.py new file mode 100644 index 000000000..474c6f987 --- /dev/null +++ b/tests/oxml/test_custom_properties.py @@ -0,0 +1,225 @@ +# pyright: reportPrivateUsage=false + +"""Unit-test suite for `docx.oxml.custom_properties` module.""" + +from __future__ import annotations + +import datetime as dt +from typing import cast + +import pytest + +from docx.oxml.custom_properties import ( + CUSTOM_PROPERTIES_FMTID, + CT_CustomProperties, + CT_CustomProperty, +) +from docx.oxml.parser import parse_xml + + +_EMPTY_PROPERTIES_XML = ( + b'' +) + + +def _empty_properties() -> CT_CustomProperties: + return cast(CT_CustomProperties, parse_xml(_EMPTY_PROPERTIES_XML)) + + +class DescribeCT_CustomProperties: + """Unit-test suite for `docx.oxml.custom_properties.CT_CustomProperties`.""" + + def it_exposes_an_empty_property_lst_initially(self): + props = _empty_properties() + + assert props.property_lst == [] + + def it_can_add_a_property_with_a_unique_pid(self): + props = _empty_properties() + + p1 = props.add_property("Project") + p2 = props.add_property("Year") + + assert p1.pid == 2 + assert p2.pid == 3 + assert p1.name == "Project" + assert p2.name == "Year" + assert p1.fmtid == CUSTOM_PROPERTIES_FMTID + assert p2.fmtid == CUSTOM_PROPERTIES_FMTID + + def it_picks_the_lowest_unused_pid(self): + props = _empty_properties() + p2 = props.add_property("A") + p3 = props.add_property("B") + props.remove(p2) # pid 2 is now free + + p_new = props.add_property("C") + + # -- pid 3 is still in use, pid 2 is free -- + assert p_new.pid == 2 + del p3 + + def it_can_find_a_property_by_name(self): + props = _empty_properties() 
+ props.add_property("Alpha") + target = props.add_property("Beta") + props.add_property("Gamma") + + found = props.get_property_by_name("Beta") + + assert found is target + + def but_it_returns_None_for_an_unknown_name(self): + props = _empty_properties() + props.add_property("Alpha") + + assert props.get_property_by_name("Missing") is None + + +class DescribeCT_CustomProperty: + """Unit-test suite for `docx.oxml.custom_properties.CT_CustomProperty`.""" + + @pytest.mark.parametrize( + ("value", "expected_localname", "expected_text"), + [ + ("hello", "lpwstr", "hello"), + (42, "i4", "42"), + (3.14, "r8", "3.14"), + (True, "bool", "true"), + (False, "bool", "false"), + ], + ) + def it_writes_the_appropriate_vt_child_for_each_type( + self, value: object, expected_localname: str, expected_text: str + ): + props = _empty_properties() + prop = props.add_property("P") + + prop.value = value + + child = prop._vt_child + assert child is not None + assert child.tag.split("}", 1)[-1] == expected_localname + assert child.text == expected_text + + def it_writes_a_filetime_for_a_datetime(self): + props = _empty_properties() + prop = props.add_property("P") + + prop.value = dt.datetime(2024, 1, 15, 10, 30, 0) + + child = prop._vt_child + assert child is not None + assert child.tag.endswith("}filetime") + assert child.text == "2024-01-15T10:30:00Z" + + def it_converts_aware_datetimes_to_utc(self): + props = _empty_properties() + prop = props.add_property("P") + tz = dt.timezone(dt.timedelta(hours=5)) + + prop.value = dt.datetime(2024, 1, 15, 15, 30, 0, tzinfo=tz) + + child = prop._vt_child + assert child is not None + # -- 15:30 +05:00 == 10:30 UTC -- + assert child.text == "2024-01-15T10:30:00Z" + + @pytest.mark.parametrize( + ("assigned", "expected"), + [ + ("hello", "hello"), + (42, 42), + (3.14, 3.14), + (True, True), + (False, False), + ], + ) + def it_round_trips_each_supported_scalar_type(self, assigned: object, expected: object): + props = _empty_properties() + prop = 
props.add_property("P") + + prop.value = assigned + + assert prop.value == expected + assert type(prop.value) is type(expected) + + def it_round_trips_a_datetime(self): + props = _empty_properties() + prop = props.add_property("P") + original = dt.datetime(2024, 1, 15, 10, 30, 45) + + prop.value = original + + assert prop.value == original + + def it_writes_a_vt_date_for_a_date(self): + """`datetime.date` (not `datetime`) serialises as `vt:date`, `YYYY-MM-DD`.""" + props = _empty_properties() + prop = props.add_property("P") + + prop.value = dt.date(2024, 1, 15) + + child = prop._vt_child + assert child is not None + assert child.tag.endswith("}date") + assert child.text == "2024-01-15" + + def it_round_trips_a_date(self): + props = _empty_properties() + prop = props.add_property("P") + original = dt.date(2024, 1, 15) + + prop.value = original + + round_tripped = prop.value + # -- must be a plain `date`, not a `datetime`, to preserve type identity -- + assert isinstance(round_tripped, dt.date) + assert not isinstance(round_tripped, dt.datetime) + assert round_tripped == original + + def it_dispatches_datetime_to_filetime_not_date(self): + """A `datetime` must still route to `vt:filetime`, not `vt:date` (it is + also a `date` instance, so ordering of the isinstance checks matters). 
+ """ + props = _empty_properties() + prop = props.add_property("P") + + prop.value = dt.datetime(2024, 1, 15, 10, 30, 45) + + child = prop._vt_child + assert child is not None + assert child.tag.endswith("}filetime") + + def it_raises_on_unsupported_value_type(self): + props = _empty_properties() + prop = props.add_property("P") + + with pytest.raises(TypeError): + prop.value = object() + + def it_replaces_the_existing_value_on_reassignment(self): + props = _empty_properties() + prop = props.add_property("P") + prop.value = "first" + + prop.value = 99 + + assert prop.value == 99 + assert type(prop.value) is int + + def it_exposes_pid_as_int(self): + prop = cast( + CT_CustomProperty, + parse_xml( + b'' + ), + ) + + assert prop.pid == 7 + assert prop.name == "Foo" + assert prop.fmtid == "{X}" diff --git a/tests/oxml/test_document.py b/tests/oxml/test_document.py index f7f99a524..4bf84d0e9 100644 --- a/tests/oxml/test_document.py +++ b/tests/oxml/test_document.py @@ -4,11 +4,12 @@ from typing import cast -from docx.oxml.document import CT_Body +from docx.oxml.document import CT_Background, CT_Body, CT_Document from docx.oxml.table import CT_Tbl from docx.oxml.text.paragraph import CT_P +from docx.shared import RGBColor -from ..unitutil.cxml import element +from ..unitutil.cxml import element, xml class DescribeCT_Body: @@ -17,3 +18,37 @@ class DescribeCT_Body: def it_knows_its_inner_content_block_item_elements(self): body = cast(CT_Body, element("w:body/(w:tbl, w:p,w:p)")) assert [type(e) for e in body.inner_content_elements] == [CT_Tbl, CT_P, CT_P] + + +class DescribeCT_Background: + """Unit-test suite for `docx.oxml.document.CT_Background`.""" + + def it_parses_its_color_attribute_as_an_RGBColor(self): + background = cast(CT_Background, element("w:background{w:color=FF0000}")) + assert background.color == RGBColor(0xFF, 0x00, 0x00) + + def it_returns_None_for_color_when_attribute_is_absent(self): + background = cast(CT_Background, element("w:background")) + assert 
background.color is None + + +class DescribeCT_Document: + """Unit-test suite for `docx.oxml.document.CT_Document`.""" + + def it_has_no_background_element_by_default(self): + document = cast(CT_Document, element("w:document/w:body")) + assert document.background is None + + def it_exposes_its_background_child_when_present(self): + document = cast( + CT_Document, element("w:document/(w:background{w:color=112233},w:body)") + ) + assert document.background is not None + assert document.background.color == RGBColor(0x11, 0x22, 0x33) + + def it_inserts_background_before_body(self): + document = cast(CT_Document, element("w:document/w:body")) + + document.get_or_add_background() + + assert document.xml == xml("w:document/(w:background,w:body)") diff --git a/tests/oxml/test_drawing.py b/tests/oxml/test_drawing.py new file mode 100644 index 000000000..e5e3a114a --- /dev/null +++ b/tests/oxml/test_drawing.py @@ -0,0 +1,367 @@ +# pyright: reportPrivateUsage=false + +"""Unit test suite for the docx.oxml.drawing module.""" + +from __future__ import annotations + +from typing import cast + +import pytest + +from docx.oxml.drawing import ( + CT_Drawing, + CT_GroupShape, + CT_TxbxContent, + CT_WordprocessingCanvas, + CT_WordprocessingShape, + new_inline_canvas_drawing, + new_inline_shape_drawing, +) +from docx.oxml.ns import qn +from docx.oxml.shape import CT_Picture + +from ..unitutil.cxml import element + + +class DescribeCT_Drawing: + """Unit test suite for `docx.oxml.drawing.CT_Drawing` objects.""" + + def it_provides_access_to_txbxContent_descendants(self): + drawing = cast( + CT_Drawing, + element( + "w:drawing/wp:inline/a:graphic/a:graphicData" + "/wps:wsp/wps:txbx/w:txbxContent/w:p" + ), + ) + + txbx_contents = drawing.txbxContent_lst + + assert len(txbx_contents) == 1 + assert isinstance(txbx_contents[0], CT_TxbxContent) + + def it_returns_empty_list_when_no_txbxContent(self): + drawing = cast( + CT_Drawing, + 
element("w:drawing/wp:inline/a:graphic/a:graphicData/pic:pic"), + ) + + assert drawing.txbxContent_lst == [] + + +class DescribeCT_TxbxContent: + """Unit test suite for `docx.oxml.drawing.CT_TxbxContent` objects.""" + + def it_provides_access_to_its_paragraph_children(self): + txbxContent = cast( + CT_TxbxContent, + element("w:txbxContent/(w:p,w:p)"), + ) + + assert len(txbxContent.p_lst) == 2 + + def it_can_get_concatenated_text(self): + txbxContent = cast( + CT_TxbxContent, + element('w:txbxContent/(w:p/w:r/w:t"Hello",w:p/w:r/w:t"World")'), + ) + + assert txbxContent.text == "Hello\nWorld" + + def it_returns_empty_string_when_no_text(self): + txbxContent = cast( + CT_TxbxContent, + element("w:txbxContent/w:p"), + ) + + assert txbxContent.text == "" + + +class DescribeCT_Drawing_GroupShape: + """Unit-test suite for `CT_Drawing.grpSp_lst`.""" + + def it_finds_an_inline_group_shape(self): + drawing = cast( + CT_Drawing, + element("w:drawing/wp:inline/a:graphic/a:graphicData/wpg:grpSp"), + ) + + grpSp_lst = drawing.grpSp_lst + + assert len(grpSp_lst) == 1 + assert isinstance(grpSp_lst[0], CT_GroupShape) + + def it_finds_an_anchor_group_shape(self): + drawing = cast( + CT_Drawing, + element("w:drawing/wp:anchor/a:graphic/a:graphicData/wpg:grpSp"), + ) + + assert len(drawing.grpSp_lst) == 1 + + def it_recognizes_legacy_wgp_as_a_group_shape(self): + drawing = cast( + CT_Drawing, + element("w:drawing/wp:inline/a:graphic/a:graphicData/wpg:wgp"), + ) + + grpSp_lst = drawing.grpSp_lst + + assert len(grpSp_lst) == 1 + assert isinstance(grpSp_lst[0], CT_GroupShape) + + def it_returns_empty_when_drawing_is_not_a_group(self): + drawing = cast( + CT_Drawing, + element("w:drawing/wp:inline/a:graphic/a:graphicData/pic:pic"), + ) + + assert drawing.grpSp_lst == [] + + +class DescribeCT_GroupShape: + """Unit-test suite for `docx.oxml.drawing.CT_GroupShape`.""" + + def it_reads_name_from_cNvPr(self): + grpSp = cast( + CT_GroupShape, + 
element("wpg:grpSp/wpg:nvGrpSpPr/wpg:cNvPr{id=1,name=Group 1}"), + ) + + assert grpSp.name == "Group 1" + + def its_name_is_None_when_nvGrpSpPr_is_missing(self): + grpSp = cast(CT_GroupShape, element("wpg:grpSp")) + + assert grpSp.name is None + + def its_name_is_None_when_cNvPr_is_missing(self): + grpSp = cast(CT_GroupShape, element("wpg:grpSp/wpg:nvGrpSpPr")) + + assert grpSp.name is None + + def it_provides_access_to_direct_child_shapes(self): + grpSp = cast( + CT_GroupShape, + element("wpg:grpSp/(wpg:nvGrpSpPr,wps:wsp,wps:wsp)"), + ) + + wsp_lst = grpSp.wsp_lst + + assert len(wsp_lst) == 2 + assert all(isinstance(w, CT_WordprocessingShape) for w in wsp_lst) + + def it_provides_access_to_nested_groups(self): + grpSp = cast( + CT_GroupShape, + element("wpg:grpSp/(wpg:nvGrpSpPr,wpg:grpSp,wpg:grpSp)"), + ) + + assert len(grpSp.grpSp_lst) == 2 + assert all(isinstance(g, CT_GroupShape) for g in grpSp.grpSp_lst) + + def it_provides_access_to_child_pictures(self): + grpSp = cast( + CT_GroupShape, + element("wpg:grpSp/(wpg:nvGrpSpPr,pic:pic)"), + ) + + assert len(grpSp.pic_lst) == 1 + assert isinstance(grpSp.pic_lst[0], CT_Picture) + + def it_iterates_shape_children_in_document_order(self): + grpSp = cast( + CT_GroupShape, + element("wpg:grpSp/(wpg:nvGrpSpPr,wps:wsp,wpg:grpSp,pic:pic,wps:wsp)"), + ) + + children = grpSp.shape_children + + assert [type(c).__name__ for c in children] == [ + "CT_WordprocessingShape", + "CT_GroupShape", + "CT_Picture", + "CT_WordprocessingShape", + ] + + +class DescribeCT_WordprocessingShape: + """Unit-test suite for `docx.oxml.drawing.CT_WordprocessingShape`.""" + + def it_reads_name_from_wps_cNvPr(self): + wsp = cast( + CT_WordprocessingShape, + element("wps:wsp/wps:cNvPr{id=1,name=My Shape}"), + ) + + assert wsp.name == "My Shape" + + def its_name_is_None_when_wps_cNvPr_absent(self): + wsp = cast(CT_WordprocessingShape, element("wps:wsp")) + + assert wsp.name is None + + def it_reads_prst_from_prstGeom(self): + wsp = cast( + 
CT_WordprocessingShape, + element("wps:wsp/wps:spPr/a:prstGeom{prst=roundRect}"), + ) + + assert wsp.prst == "roundRect" + + def its_prst_is_None_when_absent(self): + wsp = cast(CT_WordprocessingShape, element("wps:wsp")) + + assert wsp.prst is None + + def it_can_set_text_on_an_empty_shape(self): + wsp = cast(CT_WordprocessingShape, element("wps:wsp")) + + wsp.set_text("Hello") + + assert wsp.txbx is not None + assert wsp.txbx.txbxContent is not None + assert wsp.txbx.txbxContent.text == "Hello" + + def it_replaces_existing_txbx_content_on_set_text(self): + wsp = cast( + CT_WordprocessingShape, + element('wps:wsp/wps:txbx/w:txbxContent/w:p/w:r/w:t"Old"'), + ) + + wsp.set_text("New") + + # -- only one txbx remains -- + assert len(wsp.findall(qn("wps:txbx"))) == 1 + assert wsp.txbx is not None + assert wsp.txbx.txbxContent is not None + assert wsp.txbx.txbxContent.text == "New" + + def it_preserves_leading_and_trailing_whitespace_with_xml_space(self): + wsp = cast(CT_WordprocessingShape, element("wps:wsp")) + + wsp.set_text(" leading and trailing ") + + assert wsp.txbx is not None + assert wsp.txbx.txbxContent is not None + t_elm = wsp.txbx.txbxContent.find( + f"{qn('w:p')}/{qn('w:r')}/{qn('w:t')}" + ) + assert t_elm is not None + assert t_elm.get(qn("xml:space")) == "preserve" + + +class DescribeNewInlineShapeDrawing: + """Unit-test suite for `docx.oxml.drawing.new_inline_shape_drawing`.""" + + def it_builds_a_drawing_with_the_expected_structure(self): + drawing = new_inline_shape_drawing( + prst="rect", + cx=1828800, + cy=914400, + shape_id=1, + name="Rectangle 1", + ) + + # -- extent populated -- + extent = drawing.find(f"{qn('wp:inline')}/{qn('wp:extent')}") + assert extent is not None + assert extent.get("cx") == "1828800" + assert extent.get("cy") == "914400" + + # -- docPr populated -- + docPr = drawing.find(f"{qn('wp:inline')}/{qn('wp:docPr')}") + assert docPr is not None + assert docPr.get("id") == "1" + assert docPr.get("name") == "Rectangle 1" + + # -- 
graphicData uri references the wps namespace -- + graphicData = drawing.find( + f"{qn('wp:inline')}/{qn('a:graphic')}/{qn('a:graphicData')}" + ) + assert graphicData is not None + assert ( + graphicData.get("uri") + == "http://schemas.microsoft.com/office/word/2010/wordprocessingShape" + ) + + # -- wps:wsp is present and carries expected metadata -- + wsp_list = drawing.xpath(".//wps:wsp") + assert len(wsp_list) == 1 + wsp = wsp_list[0] + assert isinstance(wsp, CT_WordprocessingShape) + assert wsp.name == "Rectangle 1" + assert wsp.prst == "rect" + + # -- xfrm extent populated -- + ext = wsp.find(f"{qn('wps:spPr')}/{qn('a:xfrm')}/{qn('a:ext')}") + assert ext is not None + assert ext.get("cx") == "1828800" + assert ext.get("cy") == "914400" + + @pytest.mark.parametrize( + "prst", + ["rect", "roundRect", "ellipse", "rightArrow", "wedgeRoundRectCallout"], + ) + def it_round_trips_each_supported_preset(self, prst: str): + drawing = new_inline_shape_drawing( + prst=prst, cx=100, cy=200, shape_id=5, name="X" + ) + + wsp = drawing.xpath(".//wps:wsp")[0] + assert wsp.prst == prst + + def it_includes_a_text_frame_when_text_is_provided(self): + drawing = new_inline_shape_drawing( + prst="rect", cx=100, cy=200, shape_id=1, name="R", text="Hi" + ) + + wsp = drawing.xpath(".//wps:wsp")[0] + assert wsp.txbx is not None + assert wsp.txbx.txbxContent is not None + assert wsp.txbx.txbxContent.text == "Hi" + + def it_omits_a_text_frame_when_text_is_None(self): + drawing = new_inline_shape_drawing( + prst="rect", cx=100, cy=200, shape_id=1, name="R", text=None + ) + + wsp = drawing.xpath(".//wps:wsp")[0] + assert wsp.txbx is None + + +class DescribeNewInlineCanvasDrawing: + """Unit-test suite for `docx.oxml.drawing.new_inline_canvas_drawing`.""" + + def it_builds_a_canvas_drawing_with_the_expected_structure(self): + drawing = new_inline_canvas_drawing( + cx=5486400, cy=2743200, shape_id=7, name="Canvas 7" + ) + + # -- extent populated -- + extent = 
drawing.find(f"{qn('wp:inline')}/{qn('wp:extent')}") + assert extent is not None + assert extent.get("cx") == "5486400" + assert extent.get("cy") == "2743200" + + # -- docPr populated -- + docPr = drawing.find(f"{qn('wp:inline')}/{qn('wp:docPr')}") + assert docPr is not None + assert docPr.get("id") == "7" + assert docPr.get("name") == "Canvas 7" + + # -- graphicData uri references the canvas namespace -- + graphicData = drawing.find( + f"{qn('wp:inline')}/{qn('a:graphic')}/{qn('a:graphicData')}" + ) + assert graphicData is not None + assert ( + graphicData.get("uri") + == "http://schemas.microsoft.com/office/word/2010/wordprocessingCanvas" + ) + + # -- wpc:wpc is present and empty of shapes -- + wpc_list = drawing.xpath(".//wpc:wpc") + assert len(wpc_list) == 1 + assert isinstance(wpc_list[0], CT_WordprocessingCanvas) + assert wpc_list[0].wsp_lst == [] diff --git a/tests/oxml/test_endnotes.py b/tests/oxml/test_endnotes.py new file mode 100644 index 000000000..fdb99b36f --- /dev/null +++ b/tests/oxml/test_endnotes.py @@ -0,0 +1,145 @@ +"""Unit test suite for the docx.oxml.endnotes module.""" + +from __future__ import annotations + +from typing import cast + +from docx.enum.text import WD_NUMBER_FORMAT +from docx.oxml.endnotes import CT_EdnDocProps, CT_Endnote, CT_Endnotes +from docx.oxml.ns import qn + +from ..unitutil.cxml import element + + +class DescribeCT_Endnotes: + """Unit test suite for `docx.oxml.endnotes.CT_Endnotes` objects.""" + + def it_provides_access_to_its_endnote_children(self): + endnotes = cast( + CT_Endnotes, + element("w:endnotes/(w:endnote{w:id=0},w:endnote{w:id=1})"), + ) + + assert len(endnotes.endnote_lst) == 2 + + def it_can_add_an_endnote(self): + endnotes = cast( + CT_Endnotes, + element( + "w:endnotes/(w:endnote{w:id=0,w:type=separator}" + ",w:endnote{w:id=1,w:type=continuationSeparator})" + ), + ) + + endnote = endnotes.add_endnote() + + assert endnote.id == 2 + # -- the endnote has a paragraph with EndnoteText style -- + assert 
len(endnote.p_lst) == 1 + p = endnote.p_lst[0] + assert p.style == "EndnoteText" + # -- the paragraph has a run with EndnoteReference style and endnoteRef -- + assert len(p.r_lst) == 1 + r = p.r_lst[0] + assert r.style == "EndnoteReference" + assert r[-1].tag == qn("w:endnoteRef") + + def it_assigns_sequential_ids_to_added_endnotes(self): + endnotes = cast( + CT_Endnotes, + element( + "w:endnotes/(w:endnote{w:id=0,w:type=separator}" + ",w:endnote{w:id=1,w:type=continuationSeparator})" + ), + ) + + en1 = endnotes.add_endnote() + en2 = endnotes.add_endnote() + + assert en1.id == 2 + assert en2.id == 3 + + def it_skips_used_ids_when_assigning(self): + endnotes = cast( + CT_Endnotes, + element( + "w:endnotes/(w:endnote{w:id=0,w:type=separator}" + ",w:endnote{w:id=1,w:type=continuationSeparator}" + ",w:endnote{w:id=2})" + ), + ) + + endnote = endnotes.add_endnote() + + assert endnote.id == 3 + + +class DescribeCT_Endnote: + """Unit test suite for `docx.oxml.endnotes.CT_Endnote` objects.""" + + def it_provides_access_to_its_id(self): + endnote = cast(CT_Endnote, element("w:endnote{w:id=42}")) + + assert endnote.id == 42 + + def it_provides_access_to_its_type(self): + endnote = cast(CT_Endnote, element("w:endnote{w:id=0,w:type=separator}")) + + assert endnote.type == "separator" + + def it_returns_None_for_type_when_not_present(self): + endnote = cast(CT_Endnote, element("w:endnote{w:id=2}")) + + assert endnote.type is None + + def it_can_clear_its_content(self): + endnote = cast( + CT_Endnote, + element('w:endnote{w:id=2}/(w:p/w:r/w:t"Para one",w:p/w:r/w:t"Para two")'), + ) + assert len(endnote.p_lst) == 2 + + endnote.clear_content() + + assert len(endnote.p_lst) == 1 + p = endnote.p_lst[0] + assert p.style == "EndnoteText" + # -- the paragraph has an endnoteRef run to preserve the auto-number mark -- + assert len(p.r_lst) == 1 + r = p.r_lst[0] + assert r.style == "EndnoteReference" + assert r[-1].tag == qn("w:endnoteRef") + + def 
it_provides_access_to_its_inner_content_elements(self): + endnote = cast( + CT_Endnote, + element("w:endnote{w:id=2}/(w:p,w:tbl,w:p)"), + ) + + content = endnote.inner_content_elements + assert len(content) == 3 + + +class DescribeCT_EdnDocProps: + """Unit-test suite for `docx.oxml.endnotes.CT_EdnDocProps`.""" + + def it_returns_None_when_child_elements_absent(self): + endnotePr = cast(CT_EdnDocProps, element("w:endnotePr")) + assert endnotePr.numFmt is None + assert endnotePr.numStart is None + assert endnotePr.numRestart is None + assert endnotePr.pos is None + + def it_provides_access_to_numFmt_val(self): + endnotePr = cast( + CT_EdnDocProps, element("w:endnotePr/w:numFmt{w:val=upperRoman}") + ) + assert endnotePr.numFmt is not None + assert endnotePr.numFmt.val == WD_NUMBER_FORMAT.UPPER_ROMAN + + def it_provides_access_to_pos_val(self): + endnotePr = cast( + CT_EdnDocProps, element("w:endnotePr/w:pos{w:val=sectEnd}") + ) + assert endnotePr.pos is not None + assert endnotePr.pos.val == "sectEnd" diff --git a/tests/oxml/test_fields.py b/tests/oxml/test_fields.py new file mode 100644 index 000000000..65256d6cd --- /dev/null +++ b/tests/oxml/test_fields.py @@ -0,0 +1,171 @@ +"""Unit-test suite for docx.oxml.fields.""" + +from __future__ import annotations + +from typing import cast + +import pytest + +from docx.oxml.fields import CT_FldChar, CT_FldSimple, CT_InstrText, ST_FldCharType +from docx.oxml.ns import qn +from docx.oxml.text.paragraph import CT_P + +from ..unitutil.cxml import element, xml + + +class DescribeST_FldCharType: + """Unit-test suite for the ``w:fldCharType`` simple type.""" + + @pytest.mark.parametrize("value", ["begin", "separate", "end"]) + def it_accepts_valid_values(self, value: str): + ST_FldCharType.validate(value) + + def it_rejects_invalid_values(self): + with pytest.raises(ValueError, match="w:fldCharType must be one of"): + ST_FldCharType.validate("nope") + + +class DescribeCT_FldSimple: + """Unit-test suite for 
`docx.oxml.fields.CT_FldSimple`.""" + + def it_exposes_its_instruction(self): + fldSimple = cast( + CT_FldSimple, + element('w:fldSimple{w:instr=PAGE}'), + ) + assert fldSimple.instr == "PAGE" + + def it_exposes_its_result_text(self): + fldSimple = cast( + CT_FldSimple, + element('w:fldSimple{w:instr=PAGE}/w:r/w:t"3"'), + ) + assert fldSimple.text == "3" + + def it_concatenates_multiple_run_text_children(self): + fldSimple = cast( + CT_FldSimple, + element( + 'w:fldSimple{w:instr=PAGE}/(w:r/w:t"Page ",w:r/w:t"3")' + ), + ) + assert fldSimple.text == "Page 3" + + +class DescribeCT_FldChar: + """Unit-test suite for `docx.oxml.fields.CT_FldChar`.""" + + @pytest.mark.parametrize("fld_type", ["begin", "separate", "end"]) + def it_exposes_its_fldCharType(self, fld_type: str): + fldChar = cast( + CT_FldChar, + element(f"w:fldChar{{w:fldCharType={fld_type}}}"), + ) + assert fldChar.fldCharType == fld_type + + +class DescribeCT_InstrText: + """Unit-test suite for `docx.oxml.fields.CT_InstrText`.""" + + def it_exposes_its_text_via_str(self): + instrText = cast(CT_InstrText, element('w:instrText"PAGE"')) + assert str(instrText) == "PAGE" + + def it_returns_empty_string_when_no_text(self): + instrText = cast(CT_InstrText, element("w:instrText")) + assert str(instrText) == "" + + +class DescribeCT_P_FieldHelpers: + """Unit-test suite for field-related helpers on CT_P.""" + + def it_can_add_a_simple_field(self): + p = cast(CT_P, element("w:p")) + + fldSimple = p.add_fldSimple("PAGE", "3") + + assert fldSimple.instr == "PAGE" + assert len(p.fldSimple_lst) == 1 + assert fldSimple.text == "3" + + def it_can_add_a_simple_field_without_result_text(self): + p = cast(CT_P, element("w:p")) + + fldSimple = p.add_fldSimple("DATE") + + assert fldSimple.instr == "DATE" + # -- no run children when no text provided -- + assert len(fldSimple.r_lst) == 0 + + def it_preserves_space_in_instruction(self): + p = cast(CT_P, element("w:p")) + + p.add_complex_field("REF bookmark1", "See here") + + # 
-- instrText should have xml:space="preserve" since instr has spaces -- + instrTexts = p.xpath(".//w:instrText") + assert instrTexts[0].get(qn("xml:space")) == "preserve" + + def it_emits_the_five_run_sequence_for_a_complex_field(self): + p = cast(CT_P, element("w:p")) + + p.add_complex_field("PAGE", "3") + + runs = p.r_lst + assert len(runs) == 5 + assert runs[0][0].tag == qn("w:fldChar") + assert runs[0][0].get(qn("w:fldCharType")) == "begin" + assert runs[1][0].tag == qn("w:instrText") + assert runs[2][0].tag == qn("w:fldChar") + assert runs[2][0].get(qn("w:fldCharType")) == "separate" + assert runs[3][0].tag == qn("w:t") + assert runs[4][0].tag == qn("w:fldChar") + assert runs[4][0].get(qn("w:fldCharType")) == "end" + + def it_emits_four_runs_when_result_text_is_omitted(self): + p = cast(CT_P, element("w:p")) + + p.add_complex_field("PAGE") + + runs = p.r_lst + assert len(runs) == 4 + # -- no result-text run between separate and end -- + assert runs[3][0].tag == qn("w:fldChar") + assert runs[3][0].get(qn("w:fldCharType")) == "end" + + def it_returns_the_begin_run_from_add_complex_field(self): + p = cast(CT_P, element("w:p")) + + begin_run = p.add_complex_field("PAGE", "3") + + assert begin_run is p.r_lst[0] + + def it_iterates_field_elements_in_document_order(self): + p = cast(CT_P, element("w:p")) + p.add_fldSimple("PAGE", "1") + p.add_complex_field("NUMPAGES", "10") + p.add_fldSimple("DATE", "2026-01-01") + + kinds = [kind for kind, _ in p.iter_field_elements()] + + assert kinds == ["simple", "complex", "simple"] + + +class DescribeCT_P_Text: + """Verify `paragraph.text` picks up fldSimple content.""" + + def it_includes_fldSimple_text(self): + p = cast( + CT_P, + element( + 'w:p/(w:r/w:t"Page ",w:fldSimple{w:instr=PAGE}/w:r/w:t"3")' + ), + ) + assert p.text == "Page 3" + + def it_includes_complex_field_result_text(self): + # -- complex fields use regular w:r children, so result text is already + # covered by the existing xpath; just confirm it here. 
+ p = cast(CT_P, element('w:p/w:r/w:t"Page "')) + p.add_complex_field("PAGE", "3") + assert p.text == "Page 3" diff --git a/tests/oxml/test_font_table.py b/tests/oxml/test_font_table.py new file mode 100644 index 000000000..63b5a00ae --- /dev/null +++ b/tests/oxml/test_font_table.py @@ -0,0 +1,116 @@ +# pyright: reportPrivateUsage=false + +"""Unit-test suite for `docx.oxml.font_table` module.""" + +from __future__ import annotations + +from typing import cast + +from docx.oxml.font_table import CT_Font, CT_Fonts + +from ..unitutil.cxml import element + + +class DescribeCT_Fonts: + """Unit-test suite for `docx.oxml.font_table.CT_Fonts`.""" + + def it_exposes_its_fonts_as_a_list(self): + fonts = cast(CT_Fonts, element("w:fonts")) + assert fonts.font_lst == [] + + def it_enumerates_font_children_in_xml_order(self): + fonts = cast( + CT_Fonts, + element( + "w:fonts/(" + "w:font{w:name=Arial}," + "w:font{w:name=Calibri}," + "w:font{w:name=Times New Roman}" + ")" + ), + ) + assert [f.name for f in fonts.font_lst] == [ + "Arial", + "Calibri", + "Times New Roman", + ] + + def it_can_find_a_font_by_name(self): + fonts = cast( + CT_Fonts, + element("w:fonts/(w:font{w:name=Arial},w:font{w:name=Calibri})"), + ) + + font = fonts.get_font_by_name("Calibri") + + assert font is not None + assert font.name == "Calibri" + + def but_it_returns_None_when_the_named_font_is_not_present(self): + fonts = cast(CT_Fonts, element("w:fonts/(w:font{w:name=Arial})")) + assert fonts.get_font_by_name("Helvetica") is None + + +class DescribeCT_Font: + """Unit-test suite for `docx.oxml.font_table.CT_Font`.""" + + def it_exposes_its_name_attribute(self): + font = cast(CT_Font, element("w:font{w:name=Arial}")) + assert font.name == "Arial" + + def it_exposes_altName_charset_family_pitch_and_panose_children(self): + font = cast( + CT_Font, + element( + "w:font{w:name=Arial}/(" + "w:altName{w:val=Helvetica}," + "w:panose1{w:val=020B0604020202020204}," + "w:charset{w:val=00}," + "w:family{w:val=swiss}," 
+ "w:pitch{w:val=variable}" + ")" + ), + ) + assert font.altName is not None + assert font.altName.val == "Helvetica" + assert font.panose1 is not None + assert font.panose1.val == "020B0604020202020204" + assert font.charset is not None + assert font.charset.val == "00" + assert font.family is not None + assert font.family.val == "swiss" + assert font.pitch is not None + assert font.pitch.val == "variable" + + def its_optional_child_elements_are_None_when_absent(self): + font = cast(CT_Font, element("w:font{w:name=Arial}")) + assert font.altName is None + assert font.panose1 is None + assert font.charset is None + assert font.family is None + assert font.pitch is None + assert font.embedRegular is None + assert font.embedBold is None + assert font.embedItalic is None + assert font.embedBoldItalic is None + + def it_exposes_embed_elements_for_embedded_fonts(self): + font = cast( + CT_Font, + element( + "w:font{w:name=Arial}/(" + "w:embedRegular{r:id=rId1}," + "w:embedBold{r:id=rId2}," + "w:embedItalic{r:id=rId3}," + "w:embedBoldItalic{r:id=rId4}" + ")" + ), + ) + assert font.embedRegular is not None + assert font.embedRegular.rId == "rId1" + assert font.embedBold is not None + assert font.embedBold.rId == "rId2" + assert font.embedItalic is not None + assert font.embedItalic.rId == "rId3" + assert font.embedBoldItalic is not None + assert font.embedBoldItalic.rId == "rId4" diff --git a/tests/oxml/test_footnotes.py b/tests/oxml/test_footnotes.py new file mode 100644 index 000000000..cd952cf9d --- /dev/null +++ b/tests/oxml/test_footnotes.py @@ -0,0 +1,159 @@ +"""Unit test suite for the docx.oxml.footnotes module.""" + +from __future__ import annotations + +from typing import cast + +from docx.enum.text import WD_FOOTNOTE_RESTART, WD_NUMBER_FORMAT +from docx.oxml.footnotes import CT_Footnote, CT_Footnotes, CT_FtnDocProps +from docx.oxml.ns import qn + +from ..unitutil.cxml import element + + +class DescribeCT_Footnotes: + """Unit test suite for 
`docx.oxml.footnotes.CT_Footnotes` objects.""" + + def it_provides_access_to_its_footnote_children(self): + footnotes = cast( + CT_Footnotes, + element("w:footnotes/(w:footnote{w:id=0},w:footnote{w:id=1})"), + ) + + assert len(footnotes.footnote_lst) == 2 + + def it_can_add_a_footnote(self): + footnotes = cast( + CT_Footnotes, + element( + "w:footnotes/(w:footnote{w:id=0,w:type=separator}" + ",w:footnote{w:id=1,w:type=continuationSeparator})" + ), + ) + + footnote = footnotes.add_footnote() + + assert footnote.id == 2 + # -- the footnote has a paragraph with FootnoteText style -- + assert len(footnote.p_lst) == 1 + p = footnote.p_lst[0] + assert p.style == "FootnoteText" + # -- the paragraph has a run with FootnoteReference style and footnoteRef -- + assert len(p.r_lst) == 1 + r = p.r_lst[0] + assert r.style == "FootnoteReference" + assert r[-1].tag == qn("w:footnoteRef") + + def it_assigns_sequential_ids_to_added_footnotes(self): + footnotes = cast( + CT_Footnotes, + element( + "w:footnotes/(w:footnote{w:id=0,w:type=separator}" + ",w:footnote{w:id=1,w:type=continuationSeparator})" + ), + ) + + fn1 = footnotes.add_footnote() + fn2 = footnotes.add_footnote() + + assert fn1.id == 2 + assert fn2.id == 3 + + def it_skips_used_ids_when_assigning(self): + footnotes = cast( + CT_Footnotes, + element( + "w:footnotes/(w:footnote{w:id=0,w:type=separator}" + ",w:footnote{w:id=1,w:type=continuationSeparator}" + ",w:footnote{w:id=2})" + ), + ) + + footnote = footnotes.add_footnote() + + assert footnote.id == 3 + + +class DescribeCT_Footnote: + """Unit test suite for `docx.oxml.footnotes.CT_Footnote` objects.""" + + def it_provides_access_to_its_id(self): + footnote = cast(CT_Footnote, element("w:footnote{w:id=42}")) + + assert footnote.id == 42 + + def it_provides_access_to_its_type(self): + footnote = cast(CT_Footnote, element("w:footnote{w:id=0,w:type=separator}")) + + assert footnote.type == "separator" + + def it_returns_None_for_type_when_not_present(self): + footnote = 
cast(CT_Footnote, element("w:footnote{w:id=2}")) + + assert footnote.type is None + + def it_can_clear_its_content(self): + footnote = cast( + CT_Footnote, + element('w:footnote{w:id=2}/(w:p/w:r/w:t"Para one",w:p/w:r/w:t"Para two")'), + ) + assert len(footnote.p_lst) == 2 + + footnote.clear_content() + + assert len(footnote.p_lst) == 1 + p = footnote.p_lst[0] + assert p.style == "FootnoteText" + # -- the paragraph has a footnoteRef run to preserve the auto-number mark -- + assert len(p.r_lst) == 1 + r = p.r_lst[0] + assert r.style == "FootnoteReference" + assert r[-1].tag == qn("w:footnoteRef") + + def it_provides_access_to_its_inner_content_elements(self): + footnote = cast( + CT_Footnote, + element("w:footnote{w:id=2}/(w:p,w:tbl,w:p)"), + ) + + content = footnote.inner_content_elements + assert len(content) == 3 + + +class DescribeCT_FtnDocProps: + """Unit-test suite for `docx.oxml.footnotes.CT_FtnDocProps`.""" + + def it_returns_None_when_child_elements_absent(self): + footnotePr = cast(CT_FtnDocProps, element("w:footnotePr")) + assert footnotePr.numFmt is None + assert footnotePr.numStart is None + assert footnotePr.numRestart is None + assert footnotePr.pos is None + + def it_provides_access_to_numFmt_val(self): + footnotePr = cast( + CT_FtnDocProps, element("w:footnotePr/w:numFmt{w:val=lowerRoman}") + ) + assert footnotePr.numFmt is not None + assert footnotePr.numFmt.val == WD_NUMBER_FORMAT.LOWER_ROMAN + + def it_provides_access_to_numStart_val(self): + footnotePr = cast( + CT_FtnDocProps, element("w:footnotePr/w:numStart{w:val=5}") + ) + assert footnotePr.numStart is not None + assert footnotePr.numStart.val == 5 + + def it_provides_access_to_numRestart_val(self): + footnotePr = cast( + CT_FtnDocProps, element("w:footnotePr/w:numRestart{w:val=eachPage}") + ) + assert footnotePr.numRestart is not None + assert footnotePr.numRestart.val == WD_FOOTNOTE_RESTART.EACH_PAGE + + def it_provides_access_to_pos_val(self): + footnotePr = cast( + CT_FtnDocProps, 
element("w:footnotePr/w:pos{w:val=pageBottom}") + ) + assert footnotePr.pos is not None + assert footnotePr.pos.val == "pageBottom" diff --git a/tests/oxml/test_form_fields.py b/tests/oxml/test_form_fields.py new file mode 100644 index 000000000..166585f3a --- /dev/null +++ b/tests/oxml/test_form_fields.py @@ -0,0 +1,148 @@ +"""Unit-test suite for docx.oxml.form_fields.""" + +from __future__ import annotations + +from typing import cast + +from docx.oxml.fields import CT_FldChar +from docx.oxml.form_fields import ( + CT_FFCheckBox, + CT_FFData, + CT_FFDDList, + CT_FFTextInput, +) +from docx.oxml.ns import qn + +from ..unitutil.cxml import element + + +class DescribeCT_FFData: + """Unit-test suite for `docx.oxml.form_fields.CT_FFData`.""" + + def it_exposes_a_text_input_child(self): + ffData = cast( + CT_FFData, + element("w:ffData/(w:name{w:val=T1},w:enabled,w:textInput)"), + ) + assert isinstance(ffData.textInput, CT_FFTextInput) + assert ffData.checkBox is None + assert ffData.ddList is None + + def it_exposes_a_check_box_child(self): + ffData = cast( + CT_FFData, + element("w:ffData/(w:name{w:val=C1},w:checkBox)"), + ) + assert isinstance(ffData.checkBox, CT_FFCheckBox) + assert ffData.textInput is None + assert ffData.ddList is None + + def it_exposes_a_dd_list_child(self): + ffData = cast( + CT_FFData, + element("w:ffData/(w:name{w:val=D1},w:ddList)"), + ) + assert isinstance(ffData.ddList, CT_FFDDList) + assert ffData.textInput is None + assert ffData.checkBox is None + + def it_exposes_the_name_help_and_status_children(self): + ffData = cast( + CT_FFData, + element( + "w:ffData/(" + "w:name{w:val=FF1}" + ",w:enabled" + ",w:calcOnExit" + ",w:helpText{w:val=Help}" + ",w:statusText{w:val=Status}" + ",w:textInput" + ")" + ), + ) + assert ffData.name.get(qn("w:val")) == "FF1" + assert ffData.enabled is not None + assert ffData.calcOnExit is not None + assert ffData.helpText.get(qn("w:val")) == "Help" + assert ffData.statusText.get(qn("w:val")) == "Status" + + 
+class DescribeCT_FFTextInput: + """Unit-test suite for `CT_FFTextInput`.""" + + def it_exposes_its_default_max_length_and_format(self): + ti = cast( + CT_FFTextInput, + element( + "w:textInput/(" + "w:default{w:val=hello}" + ",w:maxLength{w:val=10}" + ",w:format{w:val=UPPERCASE}" + ")" + ), + ) + assert ti.default.get(qn("w:val")) == "hello" + assert ti.maxLength.get(qn("w:val")) == "10" + assert ti.format.get(qn("w:val")) == "UPPERCASE" + + def it_returns_None_for_missing_children(self): + ti = cast(CT_FFTextInput, element("w:textInput")) + assert ti.default is None + assert ti.maxLength is None + assert ti.format is None + + +class DescribeCT_FFCheckBox: + """Unit-test suite for `CT_FFCheckBox`.""" + + def it_exposes_default_and_checked(self): + cb = cast( + CT_FFCheckBox, + element("w:checkBox/(w:default{w:val=1},w:checked{w:val=0})"), + ) + assert cb.default.get(qn("w:val")) == "1" + assert cb.checked.get(qn("w:val")) == "0" + + +class DescribeCT_FFDDList: + """Unit-test suite for `CT_FFDDList`.""" + + def it_exposes_result_default_and_entries(self): + dd = cast( + CT_FFDDList, + element( + "w:ddList/(" + "w:result{w:val=1}" + ",w:default{w:val=0}" + ",w:listEntry{w:val=US}" + ",w:listEntry{w:val=UK}" + ",w:listEntry{w:val=AU}" + ")" + ), + ) + assert dd.result.get(qn("w:val")) == "1" + assert dd.default.get(qn("w:val")) == "0" + entries = [le.get(qn("w:val")) for le in dd.xpath("./w:listEntry")] + assert entries == ["US", "UK", "AU"] + + +class DescribeCT_FldChar_ffData: + """Verify CT_FldChar exposes its ffData child.""" + + def it_exposes_its_ffData_child(self): + fldChar = cast( + CT_FldChar, + element( + "w:fldChar{w:fldCharType=begin}" + "/w:ffData/(w:name{w:val=T1},w:textInput)" + ), + ) + assert fldChar.ffData is not None + assert fldChar.ffData.textInput is not None + + def it_returns_None_when_ffData_is_absent(self): + fldChar = cast( + CT_FldChar, + element("w:fldChar{w:fldCharType=begin}"), + ) + assert fldChar.ffData is None diff --git 
a/tests/oxml/test_glossary.py b/tests/oxml/test_glossary.py new file mode 100644 index 000000000..305bec0c4 --- /dev/null +++ b/tests/oxml/test_glossary.py @@ -0,0 +1,163 @@ +# pyright: reportPrivateUsage=false + +"""Unit-test suite for `docx.oxml.glossary` module.""" + +from __future__ import annotations + +from typing import cast + +from docx.oxml.glossary import ( + CT_DocPart, + CT_DocPartBody, + CT_DocPartCategory, + CT_DocPartPr, + CT_DocParts, + CT_GlossaryDocument, +) + +from ..unitutil.cxml import element + + +class DescribeCT_GlossaryDocument: + """Unit-test suite for `docx.oxml.glossary.CT_GlossaryDocument`.""" + + def it_exposes_its_docParts_child(self): + glossary = cast(CT_GlossaryDocument, element("w:glossaryDocument/w:docParts")) + assert glossary.docParts is not None + assert isinstance(glossary.docParts, CT_DocParts) + + def it_returns_None_for_an_absent_docParts(self): + glossary = cast(CT_GlossaryDocument, element("w:glossaryDocument")) + assert glossary.docParts is None + + def it_yields_an_empty_docPart_lst_when_docParts_is_absent(self): + glossary = cast(CT_GlossaryDocument, element("w:glossaryDocument")) + assert glossary.docPart_lst == [] + + def it_yields_an_empty_docPart_lst_when_docParts_is_empty(self): + glossary = cast(CT_GlossaryDocument, element("w:glossaryDocument/w:docParts")) + assert glossary.docPart_lst == [] + + def it_yields_each_docPart_in_order(self): + glossary = cast( + CT_GlossaryDocument, + element("w:glossaryDocument/w:docParts/(w:docPart,w:docPart,w:docPart)"), + ) + assert len(glossary.docPart_lst) == 3 + assert all(isinstance(dp, CT_DocPart) for dp in glossary.docPart_lst) + + +class DescribeCT_DocParts: + """Unit-test suite for `docx.oxml.glossary.CT_DocParts`.""" + + def it_exposes_its_docPart_children_in_order(self): + docParts = cast(CT_DocParts, element("w:docParts/(w:docPart,w:docPart)")) + assert len(docParts.docPart_lst) == 2 + + +class DescribeCT_DocPart: + """Unit-test suite for 
`docx.oxml.glossary.CT_DocPart`.""" + + def it_exposes_its_docPartPr_child(self): + doc_part = cast(CT_DocPart, element("w:docPart/w:docPartPr")) + assert doc_part.docPartPr is not None + assert isinstance(doc_part.docPartPr, CT_DocPartPr) + + def it_returns_None_for_absent_docPartPr(self): + doc_part = cast(CT_DocPart, element("w:docPart")) + assert doc_part.docPartPr is None + + def it_exposes_its_docPartBody_child(self): + doc_part = cast(CT_DocPart, element("w:docPart/w:docPartBody")) + assert doc_part.docPartBody is not None + assert isinstance(doc_part.docPartBody, CT_DocPartBody) + + def it_returns_None_for_absent_docPartBody(self): + doc_part = cast(CT_DocPart, element("w:docPart")) + assert doc_part.docPartBody is None + + +class DescribeCT_DocPartPr: + """Unit-test suite for `docx.oxml.glossary.CT_DocPartPr`.""" + + def it_exposes_the_name_w_val(self): + pr = cast(CT_DocPartPr, element("w:docPartPr/w:name{w:val=MyBlock}")) + assert pr.name_val == "MyBlock" + + def it_returns_None_when_w_name_is_absent(self): + pr = cast(CT_DocPartPr, element("w:docPartPr")) + assert pr.name_val is None + + def it_returns_None_when_w_name_has_no_val_attribute(self): + pr = cast(CT_DocPartPr, element("w:docPartPr/w:name")) + assert pr.name_val is None + + def it_exposes_the_description_w_val(self): + pr = cast( + CT_DocPartPr, element("w:docPartPr/w:description{w:val=a description}") + ) + assert pr.description_val == "a description" + + def it_returns_None_when_description_is_absent(self): + pr = cast(CT_DocPartPr, element("w:docPartPr")) + assert pr.description_val is None + + def it_exposes_the_guid_w_val(self): + pr = cast( + CT_DocPartPr, + element("w:docPartPr/w:guid{w:val=abc-123-def}"), + ) + assert pr.guid_val == "abc-123-def" + + def it_returns_None_when_guid_is_absent(self): + pr = cast(CT_DocPartPr, element("w:docPartPr")) + assert pr.guid_val is None + + def it_exposes_its_category_child(self): + pr = cast(CT_DocPartPr, element("w:docPartPr/w:category")) + 
assert pr.category is not None + assert isinstance(pr.category, CT_DocPartCategory) + + def it_returns_None_when_category_is_absent(self): + pr = cast(CT_DocPartPr, element("w:docPartPr")) + assert pr.category is None + + +class DescribeCT_DocPartCategory: + """Unit-test suite for `docx.oxml.glossary.CT_DocPartCategory`.""" + + def it_exposes_the_name_w_val(self): + cat = cast(CT_DocPartCategory, element("w:category/w:name{w:val=General}")) + assert cat.name_val == "General" + + def it_returns_None_when_name_is_absent(self): + cat = cast(CT_DocPartCategory, element("w:category")) + assert cat.name_val is None + + def it_exposes_the_gallery_w_val(self): + cat = cast( + CT_DocPartCategory, element("w:category/w:gallery{w:val=quickParts}") + ) + assert cat.gallery is not None + assert cat.gallery.val == "quickParts" + + def it_returns_None_when_gallery_is_absent(self): + cat = cast(CT_DocPartCategory, element("w:category")) + assert cat.gallery is None + + +class DescribeCT_DocPartBody: + """Unit-test suite for `docx.oxml.glossary.CT_DocPartBody`.""" + + def it_exposes_its_paragraphs(self): + body = cast(CT_DocPartBody, element("w:docPartBody/(w:p,w:p)")) + assert len(body.p_lst) == 2 + + def it_exposes_its_tables(self): + body = cast(CT_DocPartBody, element("w:docPartBody/w:tbl")) + assert len(body.tbl_lst) == 1 + + def it_orders_inner_content_elements_in_document_order(self): + body = cast(CT_DocPartBody, element("w:docPartBody/(w:p,w:tbl,w:p)")) + tags = [el.tag.rsplit("}", 1)[-1] for el in body.inner_content_elements] + assert tags == ["p", "tbl", "p"] diff --git a/tests/oxml/test_math.py b/tests/oxml/test_math.py new file mode 100644 index 000000000..3f79cb278 --- /dev/null +++ b/tests/oxml/test_math.py @@ -0,0 +1,56 @@ +# pyright: reportPrivateUsage=false + +"""Unit-test suite for `docx.oxml.math` module.""" + +from __future__ import annotations + +from typing import cast + +from docx.oxml.math import CT_MathR, CT_MathT, CT_OMath, CT_OMathPara + +from 
..unitutil.cxml import element + + +class DescribeCT_OMath: + """Unit-test suite for `docx.oxml.math.CT_OMath`.""" + + def it_is_registered_for_m_oMath(self): + el = element('m:oMath/m:r/m:t"x"') + assert isinstance(el, CT_OMath) + + def it_concatenates_m_t_text(self): + el = cast(CT_OMath, element('m:oMath/(m:r/m:t"a",m:r/m:t"bc")')) + assert el.text == "abc" + + def it_returns_empty_text_when_no_m_t_children(self): + el = cast(CT_OMath, element("m:oMath")) + assert el.text == "" + + +class DescribeCT_OMathPara: + """Unit-test suite for `docx.oxml.math.CT_OMathPara`.""" + + def it_is_registered_for_m_oMathPara(self): + el = element("m:oMathPara/m:oMath") + assert isinstance(el, CT_OMathPara) + + def it_concatenates_descendant_m_t_text(self): + el = cast( + CT_OMathPara, + element('m:oMathPara/m:oMath/(m:r/m:t"a",m:r/m:t"b")'), + ) + assert el.text == "ab" + + +class DescribeCT_MathR: + def it_is_registered_for_m_r(self): + el = element('m:oMath/m:r/m:t"x"') + child = el[0] + assert isinstance(child, CT_MathR) + + +class DescribeCT_MathT: + def it_is_registered_for_m_t(self): + el = element('m:oMath/m:r/m:t"x"') + t = el[0][0] + assert isinstance(t, CT_MathT) diff --git a/tests/oxml/test_numbering.py b/tests/oxml/test_numbering.py new file mode 100644 index 000000000..2036b4f7f --- /dev/null +++ b/tests/oxml/test_numbering.py @@ -0,0 +1,248 @@ +# pyright: reportPrivateUsage=false + +"""Unit test suite for the `docx.oxml.numbering` module.""" + +from __future__ import annotations + +import time +from typing import cast + +from docx.oxml.numbering import ( + CT_AbstractNum, + CT_Lvl, + CT_Num, + CT_Numbering, + CT_NumPr, +) + +from ..unitutil.cxml import element + + +class DescribeCT_Numbering: + """Unit-test suite for `docx.oxml.numbering.CT_Numbering`.""" + + def it_can_add_an_abstractNum_to_an_empty_numbering(self): + numbering = cast(CT_Numbering, element("w:numbering")) + + abstractNum = numbering.add_abstractNum() + + assert isinstance(abstractNum, 
CT_AbstractNum) + assert abstractNum.abstractNumId == 0 + + def it_assigns_next_abstractNumId_for_consecutive_calls(self): + numbering = cast(CT_Numbering, element("w:numbering")) + + a = numbering.add_abstractNum() + b = numbering.add_abstractNum() + c = numbering.add_abstractNum() + + assert [a.abstractNumId, b.abstractNumId, c.abstractNumId] == [0, 1, 2] + + def it_can_add_an_abstractNum_with_an_explicit_id(self): + numbering = cast(CT_Numbering, element("w:numbering")) + numbering.add_abstractNum(7) + + abstractNum = numbering.add_abstractNum() + + assert abstractNum.abstractNumId == 0 + + def it_can_add_a_num_with_an_explicit_id(self): + numbering = cast(CT_Numbering, element("w:numbering")) + + num = numbering.add_num(abstractNum_id=0, num_id=5) + + assert isinstance(num, CT_Num) + assert num.numId == 5 + + def it_finds_an_abstractNum_by_id(self): + numbering = cast(CT_Numbering, element("w:numbering")) + a = numbering.add_abstractNum() + b = numbering.add_abstractNum() + + assert numbering.abstractNum_having_abstractNumId(a.abstractNumId) is a + assert numbering.abstractNum_having_abstractNumId(b.abstractNumId) is b + + def it_returns_1_for_next_numId_when_empty(self): + numbering = cast(CT_Numbering, element("w:numbering")) + + assert numbering._next_numId == 1 + + def it_fills_gaps_in_numId_sequence(self): + numbering = cast(CT_Numbering, element("w:numbering")) + numbering.add_num(abstractNum_id=0, num_id=1) + numbering.add_num(abstractNum_id=0, num_id=2) + numbering.add_num(abstractNum_id=0, num_id=4) + + # -- gap at 3 must be filled before appending past the max -- + assert numbering._next_numId == 3 + + def it_returns_max_plus_1_when_contiguous(self): + numbering = cast(CT_Numbering, element("w:numbering")) + numbering.add_num(abstractNum_id=0, num_id=1) + numbering.add_num(abstractNum_id=0, num_id=2) + numbering.add_num(abstractNum_id=0, num_id=3) + + assert numbering._next_numId == 4 + + def it_handles_a_single_non_contiguous_numId(self): + numbering 
= cast(CT_Numbering, element("w:numbering")) + numbering.add_num(abstractNum_id=0, num_id=5) + + # -- gap starts at 1 -- + assert numbering._next_numId == 1 + + def it_computes_next_numId_quickly_on_large_contiguous_set(self): + """Regression test for upstream#940 (O(n^2) _next_numId). + + With 10_000+ existing contiguous numIds the fast path is O(n) to build + the id list and O(1) to decide the answer; the whole call must finish + well under a second. A previously quadratic impl would take many + seconds here. + """ + numbering = cast(CT_Numbering, element("w:numbering")) + n = 10_000 + for i in range(1, n + 1): + numbering.add_num(abstractNum_id=0, num_id=i) + + start = time.perf_counter() + next_id = numbering._next_numId + elapsed = time.perf_counter() - start + + assert next_id == n + 1 + # -- generous upper bound; real-world runtime is ~a few ms -- + assert elapsed < 1.0, f"_next_numId took {elapsed:.3f}s for n={n}" + + def it_still_gap_fills_on_a_large_sparse_set(self): + """Sparse case still picks the lowest free id, even at scale.""" + numbering = cast(CT_Numbering, element("w:numbering")) + n = 10_000 + # -- skip id 42 to create a gap -- + for i in range(1, n + 1): + if i == 42: + continue + numbering.add_num(abstractNum_id=0, num_id=i) + + start = time.perf_counter() + next_id = numbering._next_numId + elapsed = time.perf_counter() - start + + assert next_id == 42 + assert elapsed < 1.0, f"_next_numId took {elapsed:.3f}s for n={n}" + + def it_returns_0_for_next_abstractNumId_when_empty(self): + numbering = cast(CT_Numbering, element("w:numbering")) + + assert numbering._next_abstractNumId == 0 + + def it_computes_next_abstractNumId_quickly_on_large_set(self): + numbering = cast(CT_Numbering, element("w:numbering")) + n = 5_000 + for _ in range(n): + numbering.add_abstractNum() + + start = time.perf_counter() + next_id = numbering._next_abstractNumId + elapsed = time.perf_counter() - start + + assert next_id == n + assert elapsed < 1.0, 
f"_next_abstractNumId took {elapsed:.3f}s for n={n}" + + def it_gap_fills_next_abstractNumId(self): + numbering = cast(CT_Numbering, element("w:numbering")) + numbering.add_abstractNum(0) + numbering.add_abstractNum(1) + numbering.add_abstractNum(3) + + assert numbering._next_abstractNumId == 2 + + +class DescribeCT_AbstractNum: + """Unit-test suite for `docx.oxml.numbering.CT_AbstractNum`.""" + + def it_can_add_a_level(self): + numbering = cast(CT_Numbering, element("w:numbering")) + abstractNum = numbering.add_abstractNum() + + lvl = abstractNum.add_lvl() + lvl.ilvl = 2 + + assert isinstance(lvl, CT_Lvl) + assert lvl.ilvl == 2 + + def it_can_retrieve_a_level_by_ilvl(self): + numbering = cast(CT_Numbering, element("w:numbering")) + abstractNum = numbering.add_abstractNum() + l0 = abstractNum.add_lvl() + l0.ilvl = 0 + l1 = abstractNum.add_lvl() + l1.ilvl = 1 + + assert abstractNum.get_lvl(0) is l0 + assert abstractNum.get_lvl(1) is l1 + assert abstractNum.get_lvl(5) is None + + +class DescribeCT_Lvl: + """Unit-test suite for `docx.oxml.numbering.CT_Lvl`.""" + + def it_round_trips_start_numFmt_and_lvlText_values(self): + from docx.enum.text import WD_NUMBER_FORMAT + + numbering = cast(CT_Numbering, element("w:numbering")) + abstractNum = numbering.add_abstractNum() + lvl = abstractNum.add_lvl() + lvl.ilvl = 0 + + lvl.start_val = 3 + lvl.numFmt_val = WD_NUMBER_FORMAT.UPPER_ROMAN + lvl.lvlText_val = "%1)" + + assert lvl.start_val == 3 + assert lvl.numFmt_val == WD_NUMBER_FORMAT.UPPER_ROMAN + assert lvl.lvlText_val == "%1)" + + def it_defaults_start_to_1_when_no_start_child(self): + numbering = cast(CT_Numbering, element("w:numbering")) + abstractNum = numbering.add_abstractNum() + lvl = abstractNum.add_lvl() + lvl.ilvl = 0 + + assert lvl.start_val == 1 + + +class DescribeCT_NumPr: + """Unit-test suite for `docx.oxml.numbering.CT_NumPr`.""" + + def it_exposes_ilvl_val_and_numId_val(self): + numPr = cast( + CT_NumPr, + element( + 
"w:numPr/(w:ilvl{w:val=2},w:numId{w:val=7})" + ), + ) + + assert numPr.ilvl_val == 2 + assert numPr.numId_val == 7 + + def it_accepts_writes_for_ilvl_and_numId(self): + numPr = cast(CT_NumPr, element("w:numPr")) + + numPr.ilvl_val = 3 + numPr.numId_val = 4 + + assert numPr.ilvl_val == 3 + assert numPr.numId_val == 4 + + def it_can_clear_ilvl_and_numId_by_assigning_None(self): + numPr = cast( + CT_NumPr, + element( + "w:numPr/(w:ilvl{w:val=1},w:numId{w:val=1})" + ), + ) + + numPr.ilvl_val = None + numPr.numId_val = None + + assert numPr.ilvl_val is None + assert numPr.numId_val is None diff --git a/tests/oxml/test_permissions.py b/tests/oxml/test_permissions.py new file mode 100644 index 000000000..a2c7fc0ca --- /dev/null +++ b/tests/oxml/test_permissions.py @@ -0,0 +1,54 @@ +# pyright: reportPrivateUsage=false + +"""Unit-test suite for `docx.oxml.permissions` module.""" + +from __future__ import annotations + +from typing import cast + +from docx.oxml.permissions import CT_PermEnd, CT_PermStart + +from ..unitutil.cxml import element + + +class DescribeCT_PermStart: + """Unit-test suite for `docx.oxml.permissions.CT_PermStart`.""" + + def it_knows_its_id(self): + permStart = cast(CT_PermStart, element("w:permStart{w:id=3}")) + assert permStart.id == 3 + + def it_knows_its_edit_group(self): + permStart = cast( + CT_PermStart, element("w:permStart{w:id=3,w:edGrp=everyone}") + ) + assert permStart.edit_group == "everyone" + + def it_knows_its_user(self): + permStart = cast( + CT_PermStart, element("w:permStart{w:id=3,w:ed=alice}") + ) + assert permStart.user == "alice" + + def it_reports_None_for_absent_optional_attributes(self): + permStart = cast(CT_PermStart, element("w:permStart{w:id=3}")) + assert permStart.edit_group is None + assert permStart.user is None + assert permStart.displaced_by_custom_xml is None + assert permStart.col_first is None + assert permStart.col_last is None + + def it_knows_its_displaced_by_custom_xml(self): + permStart = cast( + 
CT_PermStart, + element("w:permStart{w:id=3,w:displacedByCustomXml=next}"), + ) + assert permStart.displaced_by_custom_xml == "next" + + +class DescribeCT_PermEnd: + """Unit-test suite for `docx.oxml.permissions.CT_PermEnd`.""" + + def it_knows_its_id(self): + permEnd = cast(CT_PermEnd, element("w:permEnd{w:id=3}")) + assert permEnd.id == 3 diff --git a/tests/oxml/test_section.py b/tests/oxml/test_section.py index 8cf0bd9b7..d18f91381 100644 --- a/tests/oxml/test_section.py +++ b/tests/oxml/test_section.py @@ -4,11 +4,682 @@ from typing import cast -from docx.oxml.section import CT_HdrFtr +import pytest + +from docx.enum.section import ( + WD_BORDER_DISPLAY, + WD_BORDER_OFFSET_FROM, + WD_DOC_GRID_TYPE, + WD_LINE_NUMBERING_RESTART, + WD_ORIENTATION, +) +from docx.enum.text import WD_BORDER_STYLE +from docx.oxml.section import ( + CT_Col, + CT_Cols, + CT_DocGrid, + CT_HdrFtr, + CT_LineNumber, + CT_PaperSource, + CT_PgBorders, + CT_SectPr, +) from docx.oxml.table import CT_Tbl from docx.oxml.text.paragraph import CT_P +from docx.shared import Emu, Inches, Length, RGBColor, Twips + +from ..unitutil.cxml import element, xml + + +class DescribeCT_Col: + """Unit-test suite for `docx.oxml.section.CT_Col`.""" + + @pytest.mark.parametrize( + ("col_cxml", "expected_w", "expected_space"), + [ + ("w:col", None, None), + ("w:col{w:w=4320,w:space=720}", Twips(4320), Twips(720)), + ], + ) + def it_knows_its_width_and_space(self, col_cxml, expected_w, expected_space): + col = cast(CT_Col, element(col_cxml)) + assert col.w == expected_w + assert col.space == expected_space + + +class DescribeCT_Cols: + """Unit-test suite for `docx.oxml.section.CT_Cols`.""" + + @pytest.mark.parametrize( + ("cols_cxml", "expected_num", "expected_space", "expected_eq"), + [ + ("w:cols", None, None, None), + ("w:cols{w:num=2,w:space=720,w:equalWidth=1}", 2, Twips(720), True), + ("w:cols{w:num=3,w:equalWidth=0}", 3, None, False), + ], + ) + def it_knows_its_attributes(self, cols_cxml, expected_num, 
expected_space, expected_eq): + cols = cast(CT_Cols, element(cols_cxml)) + assert cols.num == expected_num + assert cols.space == expected_space + assert cols.equalWidth == expected_eq + + def it_provides_access_to_its_col_children(self): + cols = cast( + CT_Cols, + element("w:cols/(w:col{w:w=4320,w:space=720},w:col{w:w=4320})"), + ) + col_lst = cols.col_lst + assert len(col_lst) == 2 + assert col_lst[0].w == Twips(4320) + assert col_lst[0].space == Twips(720) + assert col_lst[1].w == Twips(4320) + assert col_lst[1].space is None + + +class DescribeCT_SectPr_cols: + """Unit-test suite for CT_SectPr column-related features.""" + + def it_can_access_its_cols_child(self): + sectPr = cast(CT_SectPr, element("w:sectPr/w:cols{w:num=2}")) + cols = sectPr.cols + assert cols is not None + assert cols.num == 2 + + def it_returns_None_when_no_cols_child(self): + sectPr = cast(CT_SectPr, element("w:sectPr")) + assert sectPr.cols is None + + def it_can_add_a_cols_child(self): + sectPr = cast(CT_SectPr, element("w:sectPr")) + cols = sectPr.get_or_add_cols() + assert cols is not None + assert sectPr.cols is cols + + def it_inserts_cols_in_the_right_position(self): + sectPr = cast(CT_SectPr, element("w:sectPr/w:pgMar")) + cols = sectPr.get_or_add_cols() + assert cols is not None + expected = xml("w:sectPr/(w:pgMar,w:cols)") + assert sectPr.xml == expected + + +class DescribeCT_SectPr_orientation_swap: + """Unit-test suite for CT_SectPr orientation setter w/h swap behavior.""" + + def it_swaps_w_and_h_when_changing_portrait_to_landscape(self): + sectPr = cast( + CT_SectPr, element("w:sectPr/w:pgSz{w:w=12240,w:h=15840}") + ) + + sectPr.orientation = WD_ORIENTATION.LANDSCAPE + + expected = xml( + "w:sectPr/w:pgSz{w:w=15840,w:h=12240,w:orient=landscape}" + ) + assert sectPr.xml == expected + + def it_swaps_w_and_h_when_changing_landscape_to_portrait(self): + sectPr = cast( + CT_SectPr, + element( + "w:sectPr/w:pgSz{w:w=15840,w:h=12240,w:orient=landscape}" + ), + ) + + 
sectPr.orientation = WD_ORIENTATION.PORTRAIT + + # -- orient is dropped (default is portrait), dims are swapped back -- + expected = xml("w:sectPr/w:pgSz{w:w=12240,w:h=15840}") + assert sectPr.xml == expected + + def it_treats_None_as_portrait_and_swaps_from_landscape(self): + sectPr = cast( + CT_SectPr, + element( + "w:sectPr/w:pgSz{w:w=15840,w:h=12240,w:orient=landscape}" + ), + ) + + sectPr.orientation = None + + expected = xml("w:sectPr/w:pgSz{w:w=12240,w:h=15840}") + assert sectPr.xml == expected + + def it_is_idempotent_when_setting_same_orientation_landscape(self): + sectPr = cast( + CT_SectPr, + element( + "w:sectPr/w:pgSz{w:w=15840,w:h=12240,w:orient=landscape}" + ), + ) + + sectPr.orientation = WD_ORIENTATION.LANDSCAPE + + expected = xml( + "w:sectPr/w:pgSz{w:w=15840,w:h=12240,w:orient=landscape}" + ) + assert sectPr.xml == expected + + def it_is_idempotent_when_setting_same_orientation_portrait(self): + sectPr = cast( + CT_SectPr, element("w:sectPr/w:pgSz{w:w=12240,w:h=15840}") + ) + + sectPr.orientation = WD_ORIENTATION.PORTRAIT + + expected = xml("w:sectPr/w:pgSz{w:w=12240,w:h=15840}") + assert sectPr.xml == expected + + def it_skips_swap_when_width_is_missing(self): + sectPr = cast( + CT_SectPr, element("w:sectPr/w:pgSz{w:h=15840}") + ) + + sectPr.orientation = WD_ORIENTATION.LANDSCAPE + + expected = xml("w:sectPr/w:pgSz{w:h=15840,w:orient=landscape}") + assert sectPr.xml == expected + + def it_skips_swap_when_height_is_missing(self): + sectPr = cast( + CT_SectPr, element("w:sectPr/w:pgSz{w:w=12240}") + ) + + sectPr.orientation = WD_ORIENTATION.LANDSCAPE + + expected = xml("w:sectPr/w:pgSz{w:w=12240,w:orient=landscape}") + assert sectPr.xml == expected + + def it_skips_swap_when_both_dims_missing(self): + sectPr = cast(CT_SectPr, element("w:sectPr/w:pgSz")) + + sectPr.orientation = WD_ORIENTATION.LANDSCAPE + + expected = xml("w:sectPr/w:pgSz{w:orient=landscape}") + assert sectPr.xml == expected + + def 
it_creates_pgSz_with_no_dims_when_none_present(self): + sectPr = cast(CT_SectPr, element("w:sectPr")) + + sectPr.orientation = WD_ORIENTATION.LANDSCAPE + + expected = xml("w:sectPr/w:pgSz{w:orient=landscape}") + assert sectPr.xml == expected + + +class DescribeCT_PgBorders: + """Unit-test suite for `docx.oxml.section.CT_PgBorders`.""" + + @pytest.mark.parametrize( + ("pgBorders_cxml", "expected_display", "expected_offset"), + [ + ("w:pgBorders", None, None), + ( + "w:pgBorders{w:display=allPages,w:offsetFrom=page}", + WD_BORDER_DISPLAY.ALL_PAGES, + WD_BORDER_OFFSET_FROM.PAGE, + ), + ( + "w:pgBorders{w:display=firstPage,w:offsetFrom=text}", + WD_BORDER_DISPLAY.FIRST_PAGE, + WD_BORDER_OFFSET_FROM.TEXT, + ), + ( + "w:pgBorders{w:display=notFirstPage}", + WD_BORDER_DISPLAY.NOT_FIRST_PAGE, + None, + ), + ], + ) + def it_knows_its_attributes( + self, pgBorders_cxml, expected_display, expected_offset + ): + pgBorders = cast(CT_PgBorders, element(pgBorders_cxml)) + assert pgBorders.display == expected_display + assert pgBorders.offsetFrom == expected_offset + + def it_can_access_each_edge_child(self): + from docx.shared import Pt + + pgBorders = cast( + CT_PgBorders, + element( + "w:pgBorders/(w:top{w:val=single,w:sz=24,w:space=24,w:color=FF0000}," + "w:left{w:val=dashed,w:sz=8,w:space=12,w:color=00FF00}," + "w:bottom{w:val=double,w:sz=4,w:space=6,w:color=0000FF}," + "w:right{w:val=dotted,w:sz=16,w:space=18,w:color=AABBCC})" + ), + ) + assert pgBorders.top is not None + assert pgBorders.top.val == WD_BORDER_STYLE.SINGLE + assert pgBorders.top.sz == Pt(24 / 8.0) + assert pgBorders.top.space == Pt(24) + assert pgBorders.top.color == RGBColor(0xFF, 0x00, 0x00) + assert pgBorders.left is not None + assert pgBorders.left.val == WD_BORDER_STYLE.DASHED + assert pgBorders.bottom is not None + assert pgBorders.bottom.val == WD_BORDER_STYLE.DOUBLE + assert pgBorders.right is not None + assert pgBorders.right.val == WD_BORDER_STYLE.DOTTED + + def 
it_returns_None_for_missing_edge_children(self): + pgBorders = cast(CT_PgBorders, element("w:pgBorders")) + assert pgBorders.top is None + assert pgBorders.bottom is None + assert pgBorders.left is None + assert pgBorders.right is None + + def it_can_add_each_edge(self): + pgBorders = cast(CT_PgBorders, element("w:pgBorders")) + top = pgBorders.get_or_add_top() + left = pgBorders.get_or_add_left() + bottom = pgBorders.get_or_add_bottom() + right = pgBorders.get_or_add_right() + assert pgBorders.top is top + assert pgBorders.left is left + assert pgBorders.bottom is bottom + assert pgBorders.right is right + expected = xml("w:pgBorders/(w:top,w:left,w:bottom,w:right)") + assert pgBorders.xml == expected + + +class DescribeCT_SectPr_pgBorders: + """Unit-test suite for CT_SectPr page-border features.""" + + def it_returns_None_when_no_pgBorders_child(self): + sectPr = cast(CT_SectPr, element("w:sectPr")) + assert sectPr.pgBorders is None + + def it_can_access_its_pgBorders_child(self): + sectPr = cast( + CT_SectPr, + element("w:sectPr/w:pgBorders{w:display=allPages}"), + ) + pgBorders = sectPr.pgBorders + assert pgBorders is not None + assert pgBorders.display == WD_BORDER_DISPLAY.ALL_PAGES + + def it_can_add_a_pgBorders_child(self): + sectPr = cast(CT_SectPr, element("w:sectPr")) + pgBorders = sectPr.get_or_add_pgBorders() + assert pgBorders is not None + assert sectPr.pgBorders is pgBorders + + def it_inserts_pgBorders_in_the_right_position(self): + sectPr = cast(CT_SectPr, element("w:sectPr/(w:pgSz,w:pgMar,w:cols)")) + sectPr.get_or_add_pgBorders() + expected = xml("w:sectPr/(w:pgSz,w:pgMar,w:pgBorders,w:cols)") + assert sectPr.xml == expected + + def it_can_remove_its_pgBorders_child(self): + sectPr = cast( + CT_SectPr, + element("w:sectPr/w:pgBorders/(w:top{w:val=single})"), + ) + sectPr._remove_pgBorders() # pyright: ignore[reportPrivateUsage] + assert sectPr.pgBorders is None + + +class DescribeCT_LineNumber: + """Unit-test suite for 
`docx.oxml.section.CT_LineNumber`.""" + + @pytest.mark.parametrize( + ("lnNumType_cxml", "count_by", "start", "distance", "restart"), + [ + ("w:lnNumType", None, None, None, None), + ( + "w:lnNumType{w:countBy=1,w:start=1,w:distance=360,w:restart=continuous}", + 1, + 1, + Twips(360), + WD_LINE_NUMBERING_RESTART.CONTINUOUS, + ), + ( + "w:lnNumType{w:countBy=5,w:start=10,w:distance=720,w:restart=newSection}", + 5, + 10, + Twips(720), + WD_LINE_NUMBERING_RESTART.NEW_SECTION, + ), + ( + "w:lnNumType{w:restart=newPage}", + None, + None, + None, + WD_LINE_NUMBERING_RESTART.NEW_PAGE, + ), + ], + ) + def it_knows_its_attributes( + self, lnNumType_cxml, count_by, start, distance, restart + ): + lnNumType = cast(CT_LineNumber, element(lnNumType_cxml)) + assert lnNumType.countBy == count_by + assert lnNumType.start == start + assert lnNumType.distance == distance + assert lnNumType.restart == restart + + def it_can_set_its_attributes(self): + lnNumType = cast(CT_LineNumber, element("w:lnNumType")) + lnNumType.countBy = 3 + lnNumType.start = 2 + lnNumType.distance = Twips(720) + lnNumType.restart = WD_LINE_NUMBERING_RESTART.NEW_PAGE + assert lnNumType.xml == xml( + "w:lnNumType{w:countBy=3,w:start=2,w:distance=720,w:restart=newPage}" + ) + + +class DescribeCT_SectPr_lnNumType: + """Unit-test suite for CT_SectPr line-numbering features.""" + + def it_returns_None_when_no_lnNumType_child(self): + sectPr = cast(CT_SectPr, element("w:sectPr")) + assert sectPr.lnNumType is None + + def it_can_access_its_lnNumType_child(self): + sectPr = cast( + CT_SectPr, + element("w:sectPr/w:lnNumType{w:countBy=1}"), + ) + lnNumType = sectPr.lnNumType + assert lnNumType is not None + assert lnNumType.countBy == 1 + + def it_can_add_a_lnNumType_child(self): + sectPr = cast(CT_SectPr, element("w:sectPr")) + lnNumType = sectPr.get_or_add_lnNumType() + assert lnNumType is not None + assert sectPr.lnNumType is lnNumType + + def it_inserts_lnNumType_in_the_right_position(self): + sectPr = 
cast(CT_SectPr, element("w:sectPr/(w:pgSz,w:pgMar,w:cols)")) + sectPr.get_or_add_lnNumType() + expected = xml("w:sectPr/(w:pgSz,w:pgMar,w:lnNumType,w:cols)") + assert sectPr.xml == expected + + def it_can_remove_its_lnNumType_child(self): + sectPr = cast( + CT_SectPr, + element("w:sectPr/w:lnNumType{w:countBy=1}"), + ) + sectPr._remove_lnNumType() # pyright: ignore[reportPrivateUsage] + assert sectPr.lnNumType is None + + +class DescribeCT_PaperSource: + """Unit-test suite for `docx.oxml.section.CT_PaperSource`.""" + + @pytest.mark.parametrize( + ("paperSrc_cxml", "expected_first", "expected_other"), + [ + ("w:paperSrc", None, None), + ("w:paperSrc{w:first=1}", 1, None), + ("w:paperSrc{w:other=2}", None, 2), + ("w:paperSrc{w:first=3,w:other=4}", 3, 4), + ], + ) + def it_knows_its_attributes(self, paperSrc_cxml, expected_first, expected_other): + paperSrc = cast(CT_PaperSource, element(paperSrc_cxml)) + assert paperSrc.first == expected_first + assert paperSrc.other == expected_other + + def it_can_set_its_attributes(self): + paperSrc = cast(CT_PaperSource, element("w:paperSrc")) + paperSrc.first = 5 + paperSrc.other = 6 + assert paperSrc.xml == xml("w:paperSrc{w:first=5,w:other=6}") + + +class DescribeCT_SectPr_paperSrc: + """Unit-test suite for CT_SectPr paper-source features.""" + + def it_returns_None_when_no_paperSrc_child(self): + sectPr = cast(CT_SectPr, element("w:sectPr")) + assert sectPr.paperSrc is None + + def it_can_access_its_paperSrc_child(self): + sectPr = cast( + CT_SectPr, + element("w:sectPr/w:paperSrc{w:first=1,w:other=2}"), + ) + paperSrc = sectPr.paperSrc + assert paperSrc is not None + assert paperSrc.first == 1 + assert paperSrc.other == 2 + + def it_can_add_a_paperSrc_child(self): + sectPr = cast(CT_SectPr, element("w:sectPr")) + paperSrc = sectPr.get_or_add_paperSrc() + assert paperSrc is not None + assert sectPr.paperSrc is paperSrc + + def it_inserts_paperSrc_in_the_right_position(self): + sectPr = cast(CT_SectPr, 
element("w:sectPr/(w:pgSz,w:pgMar,w:cols)")) + sectPr.get_or_add_paperSrc() + expected = xml("w:sectPr/(w:pgSz,w:pgMar,w:paperSrc,w:cols)") + assert sectPr.xml == expected + + def it_can_remove_its_paperSrc_child(self): + sectPr = cast( + CT_SectPr, + element("w:sectPr/w:paperSrc{w:first=1}"), + ) + sectPr._remove_paperSrc() # pyright: ignore[reportPrivateUsage] + assert sectPr.paperSrc is None + + +class DescribeCT_DocGrid: + """Unit-test suite for `docx.oxml.section.CT_DocGrid`.""" + + @pytest.mark.parametrize( + ("docGrid_cxml", "grid_type", "line_pitch", "char_space"), + [ + ("w:docGrid", None, None, None), + ( + "w:docGrid{w:type=default,w:linePitch=360,w:charSpace=0}", + WD_DOC_GRID_TYPE.DEFAULT, + 360, + 0, + ), + ( + "w:docGrid{w:type=lines,w:linePitch=312}", + WD_DOC_GRID_TYPE.LINES, + 312, + None, + ), + ( + "w:docGrid{w:type=linesAndChars,w:linePitch=400,w:charSpace=100}", + WD_DOC_GRID_TYPE.LINES_AND_CHARS, + 400, + 100, + ), + ( + "w:docGrid{w:type=snapToChars,w:charSpace=-50}", + WD_DOC_GRID_TYPE.SNAP_TO_CHARS, + None, + -50, + ), + ], + ) + def it_knows_its_attributes( + self, docGrid_cxml, grid_type, line_pitch, char_space + ): + docGrid = cast(CT_DocGrid, element(docGrid_cxml)) + assert docGrid.type == grid_type + assert docGrid.linePitch == line_pitch + assert docGrid.charSpace == char_space + + def it_can_set_its_attributes(self): + docGrid = cast(CT_DocGrid, element("w:docGrid")) + docGrid.type = WD_DOC_GRID_TYPE.LINES_AND_CHARS + docGrid.linePitch = 360 + docGrid.charSpace = 100 + assert docGrid.xml == xml( + "w:docGrid{w:type=linesAndChars,w:linePitch=360,w:charSpace=100}" + ) + + +class DescribeCT_SectPr_docGrid: + """Unit-test suite for CT_SectPr document-grid features.""" + + def it_returns_None_when_no_docGrid_child(self): + sectPr = cast(CT_SectPr, element("w:sectPr")) + assert sectPr.docGrid is None + + def it_can_access_its_docGrid_child(self): + sectPr = cast( + CT_SectPr, + element("w:sectPr/w:docGrid{w:linePitch=360}"), + ) + docGrid 
= sectPr.docGrid + assert docGrid is not None + assert docGrid.linePitch == 360 + + def it_can_add_a_docGrid_child(self): + sectPr = cast(CT_SectPr, element("w:sectPr")) + docGrid = sectPr.get_or_add_docGrid() + assert docGrid is not None + assert sectPr.docGrid is docGrid + + def it_inserts_docGrid_in_the_right_position(self): + sectPr = cast( + CT_SectPr, + element("w:sectPr/(w:pgSz,w:pgMar,w:cols,w:titlePg)"), + ) + sectPr.get_or_add_docGrid() + expected = xml( + "w:sectPr/(w:pgSz,w:pgMar,w:cols,w:titlePg,w:docGrid)" + ) + assert sectPr.xml == expected + + def it_can_remove_its_docGrid_child(self): + sectPr = cast( + CT_SectPr, + element("w:sectPr/w:docGrid{w:linePitch=360}"), + ) + sectPr._remove_docGrid() # pyright: ignore[reportPrivateUsage] + assert sectPr.docGrid is None + + +class DescribeCT_SectPr_text_direction: + """Unit-test suite for `CT_SectPr.text_direction`.""" + + def it_returns_None_when_no_textDirection_child(self): + sectPr = cast(CT_SectPr, element("w:sectPr")) + assert sectPr.text_direction is None + + @pytest.mark.parametrize( + ("sectPr_cxml", "expected_value"), + [ + ("w:sectPr/w:textDirection{w:val=lrTb}", "LR_TB"), + ("w:sectPr/w:textDirection{w:val=tbRl}", "TB_RL"), + ("w:sectPr/w:textDirection{w:val=btLr}", "BT_LR"), + ("w:sectPr/w:textDirection{w:val=lrTbV}", "LR_TB_V"), + ("w:sectPr/w:textDirection{w:val=tbRlV}", "TB_RL_V"), + ("w:sectPr/w:textDirection{w:val=tbLrV}", "TB_LR_V"), + ], + ) + def it_knows_its_text_direction(self, sectPr_cxml: str, expected_value: str): + from docx.enum.table import WD_TEXT_DIRECTION + + sectPr = cast(CT_SectPr, element(sectPr_cxml)) + assert sectPr.text_direction is getattr(WD_TEXT_DIRECTION, expected_value) + + @pytest.mark.parametrize( + ("enum_member", "xml_val"), + [ + ("LR_TB", "lrTb"), + ("TB_RL", "tbRl"), + ("BT_LR", "btLr"), + ("LR_TB_V", "lrTbV"), + ("TB_RL_V", "tbRlV"), + ("TB_LR_V", "tbLrV"), + ], + ) + def it_can_set_its_text_direction_round_trip( + self, enum_member: str, xml_val: str + ): 
+ from docx.enum.table import WD_TEXT_DIRECTION + + sectPr = cast(CT_SectPr, element("w:sectPr")) + sectPr.text_direction = getattr(WD_TEXT_DIRECTION, enum_member) + assert sectPr.xml == xml(f"w:sectPr/w:textDirection{{w:val={xml_val}}}") + + def it_can_clear_its_text_direction(self): + sectPr = cast( + CT_SectPr, element("w:sectPr/w:textDirection{w:val=tbRl}") + ) + sectPr.text_direction = None + assert sectPr.xml == xml("w:sectPr") + + def it_inserts_textDirection_in_the_right_position(self): + sectPr = cast( + CT_SectPr, + element("w:sectPr/(w:pgSz,w:pgMar,w:cols,w:titlePg,w:docGrid)"), + ) + sectPr.get_or_add_textDirection() + expected = xml( + "w:sectPr/(w:pgSz,w:pgMar,w:cols,w:titlePg,w:textDirection,w:docGrid)" + ) + assert sectPr.xml == expected + + +class DescribeCT_SectPr_bidi: + """Unit-test suite for `CT_SectPr.bidi_val`.""" + + def it_returns_False_when_no_bidi_child(self): + sectPr = cast(CT_SectPr, element("w:sectPr")) + assert sectPr.bidi_val is False + + @pytest.mark.parametrize( + ("sectPr_cxml", "expected_value"), + [ + ("w:sectPr/w:bidi", True), + ("w:sectPr/w:bidi{w:val=1}", True), + ("w:sectPr/w:bidi{w:val=true}", True), + ("w:sectPr/w:bidi{w:val=on}", True), + ("w:sectPr/w:bidi{w:val=0}", False), + ("w:sectPr/w:bidi{w:val=false}", False), + ("w:sectPr/w:bidi{w:val=off}", False), + ], + ) + def it_knows_its_bidi_val(self, sectPr_cxml: str, expected_value: bool): + sectPr = cast(CT_SectPr, element(sectPr_cxml)) + assert sectPr.bidi_val is expected_value + + @pytest.mark.parametrize( + ("sectPr_cxml", "value", "expected_cxml"), + [ + ("w:sectPr", True, "w:sectPr/w:bidi"), + ("w:sectPr/w:bidi", False, "w:sectPr"), + ("w:sectPr/w:bidi", None, "w:sectPr"), + ("w:sectPr/w:bidi{w:val=off}", True, "w:sectPr/w:bidi"), + ("w:sectPr", False, "w:sectPr"), + ], + ) + def it_can_change_its_bidi_val( + self, sectPr_cxml: str, value: bool | None, expected_cxml: str + ): + sectPr = cast(CT_SectPr, element(sectPr_cxml)) + sectPr.bidi_val = value + assert 
sectPr.xml == xml(expected_cxml) -from ..unitutil.cxml import element + def it_inserts_bidi_in_the_right_position(self): + sectPr = cast( + CT_SectPr, + element( + "w:sectPr/(w:pgSz,w:pgMar,w:cols,w:titlePg" + ",w:textDirection{w:val=tbRl},w:docGrid)" + ), + ) + sectPr.get_or_add_bidi() + expected = xml( + "w:sectPr/(w:pgSz,w:pgMar,w:cols,w:titlePg" + ",w:textDirection{w:val=tbRl},w:bidi,w:docGrid)" + ) + assert sectPr.xml == expected class DescribeCT_HdrFtr: diff --git a/tests/oxml/test_settings.py b/tests/oxml/test_settings.py new file mode 100644 index 000000000..4545c77f4 --- /dev/null +++ b/tests/oxml/test_settings.py @@ -0,0 +1,550 @@ +# pyright: reportPrivateUsage=false + +"""Unit-test suite for `docx.oxml.settings` module.""" + +from __future__ import annotations + +from typing import cast + +import pytest + +from docx.oxml.settings import CT_DocProtect, CT_Rsids, CT_Settings +from docx.shared import Twips + +from ..unitutil.cxml import element, xml + + +class DescribeCT_Settings: + """Unit-test suite for `docx.oxml.settings.CT_Settings`.""" + + @pytest.mark.parametrize( + ("cxml", "expected_value"), + [ + ("w:settings", None), + ("w:settings/w:zoom{w:percent=100}", 100), + ("w:settings/w:zoom{w:percent=75}", 75), + ("w:settings/w:zoom", None), + ], + ) + def it_can_get_the_zoom_percent(self, cxml: str, expected_value: int | None): + settings = cast(CT_Settings, element(cxml)) + assert settings.zoom_percent == expected_value + + @pytest.mark.parametrize( + ("cxml", "new_value", "expected_cxml"), + [ + ("w:settings", 100, "w:settings/w:zoom{w:percent=100}"), + ("w:settings/w:zoom{w:percent=75}", 150, "w:settings/w:zoom{w:percent=150}"), + ("w:settings/w:zoom{w:percent=100}", None, "w:settings"), + ], + ) + def it_can_set_the_zoom_percent( + self, cxml: str, new_value: int | None, expected_cxml: str + ): + settings = cast(CT_Settings, element(cxml)) + settings.zoom_percent = new_value + assert settings.xml == xml(expected_cxml) + + @pytest.mark.parametrize( + 
("cxml", "expected_value"), + [ + ("w:settings", False), + ("w:settings/w:trackRevisions", True), + ("w:settings/w:trackRevisions{w:val=0}", False), + ("w:settings/w:trackRevisions{w:val=true}", True), + ], + ) + def it_can_get_trackRevisions(self, cxml: str, expected_value: bool): + settings = cast(CT_Settings, element(cxml)) + assert settings.trackRevisions_val is expected_value + + @pytest.mark.parametrize( + ("cxml", "new_value", "expected_cxml"), + [ + ("w:settings", True, "w:settings/w:trackRevisions"), + ("w:settings/w:trackRevisions", False, "w:settings"), + ("w:settings/w:trackRevisions{w:val=0}", True, "w:settings/w:trackRevisions"), + ], + ) + def it_can_set_trackRevisions( + self, cxml: str, new_value: bool, expected_cxml: str + ): + settings = cast(CT_Settings, element(cxml)) + settings.trackRevisions_val = new_value + assert settings.xml == xml(expected_cxml) + + @pytest.mark.parametrize( + ("cxml", "expected_value"), + [ + ("w:settings", None), + ("w:settings/w:defaultTabStop{w:val=720}", Twips(720)), + ("w:settings/w:defaultTabStop{w:val=360}", Twips(360)), + ], + ) + def it_can_get_the_defaultTabStop(self, cxml: str, expected_value): + settings = cast(CT_Settings, element(cxml)) + assert settings.defaultTabStop_val == expected_value + + @pytest.mark.parametrize( + ("cxml", "new_value", "expected_cxml"), + [ + ("w:settings", Twips(720), "w:settings/w:defaultTabStop{w:val=720}"), + ( + "w:settings/w:defaultTabStop{w:val=720}", + Twips(360), + "w:settings/w:defaultTabStop{w:val=360}", + ), + ("w:settings/w:defaultTabStop{w:val=720}", None, "w:settings"), + ], + ) + def it_can_set_the_defaultTabStop( + self, cxml: str, new_value, expected_cxml: str + ): + settings = cast(CT_Settings, element(cxml)) + settings.defaultTabStop_val = new_value + assert settings.xml == xml(expected_cxml) + + @pytest.mark.parametrize( + ("cxml", "expected_edit", "expected_enforcement"), + [ + ("w:settings", None, False), + ( + 
"w:settings/w:documentProtection{w:edit=readOnly,w:enforcement=1}", + "readOnly", + True, + ), + ( + "w:settings/w:documentProtection{w:edit=comments,w:enforcement=0}", + "comments", + False, + ), + ("w:settings/w:documentProtection{w:edit=forms}", "forms", False), + ("w:settings/w:documentProtection", None, False), + ], + ) + def it_can_get_document_protection( + self, + cxml: str, + expected_edit: str | None, + expected_enforcement: bool, + ): + settings = cast(CT_Settings, element(cxml)) + assert settings.documentProtection_edit == expected_edit + assert settings.documentProtection_enforcement is expected_enforcement + + @pytest.mark.parametrize( + ("cxml", "new_value", "expected_cxml"), + [ + ("w:settings", "readOnly", "w:settings/w:documentProtection{w:edit=readOnly}"), + ( + "w:settings/w:documentProtection{w:edit=readOnly}", + "comments", + "w:settings/w:documentProtection{w:edit=comments}", + ), + ( + "w:settings/w:documentProtection{w:edit=readOnly}", + None, + "w:settings/w:documentProtection", + ), + ], + ) + def it_can_set_documentProtection_edit( + self, cxml: str, new_value: str | None, expected_cxml: str + ): + settings = cast(CT_Settings, element(cxml)) + settings.documentProtection_edit = new_value + assert settings.xml == xml(expected_cxml) + + @pytest.mark.parametrize( + ("cxml", "new_value", "expected_cxml"), + [ + ("w:settings", True, "w:settings/w:documentProtection{w:enforcement=1}"), + ( + "w:settings/w:documentProtection{w:enforcement=1}", + False, + "w:settings/w:documentProtection", + ), + ], + ) + def it_can_set_documentProtection_enforcement( + self, cxml: str, new_value: bool, expected_cxml: str + ): + settings = cast(CT_Settings, element(cxml)) + settings.documentProtection_enforcement = new_value + assert settings.xml == xml(expected_cxml) + + def it_can_get_the_compatibilityMode_when_absent(self): + settings = cast(CT_Settings, element("w:settings")) + assert settings.compatibilityMode is None + + def 
it_can_get_the_compatibilityMode_when_present(self): + settings = cast(CT_Settings, element("w:settings/w:compat")) + # -- no compatSetting children yet, so None -- + assert settings.compatibilityMode is None + + def it_can_set_the_compatibilityMode(self): + settings = cast(CT_Settings, element("w:settings")) + settings.compatibilityMode = 15 + assert settings.compatibilityMode == 15 + + def it_can_change_the_compatibilityMode(self): + settings = cast(CT_Settings, element("w:settings")) + settings.compatibilityMode = 14 + assert settings.compatibilityMode == 14 + settings.compatibilityMode = 15 + assert settings.compatibilityMode == 15 + + def it_can_remove_the_compatibilityMode(self): + settings = cast(CT_Settings, element("w:settings")) + settings.compatibilityMode = 15 + assert settings.compatibilityMode == 15 + settings.compatibilityMode = None + assert settings.compatibilityMode is None + assert settings.xml == xml("w:settings") + + @pytest.mark.parametrize( + ("cxml", "expected_value"), + [ + ("w:settings", False), + ("w:settings/w:evenAndOddHeaders", True), + ("w:settings/w:evenAndOddHeaders{w:val=0}", False), + ("w:settings/w:evenAndOddHeaders{w:val=1}", True), + ], + ) + def it_can_get_evenAndOddHeaders(self, cxml: str, expected_value: bool): + settings = cast(CT_Settings, element(cxml)) + assert settings.evenAndOddHeaders_val is expected_value + + @pytest.mark.parametrize( + ("cxml", "expected_value"), + [ + ("w:settings", None), + ("w:settings/w:view", None), + ("w:settings/w:view{w:val=normal}", "normal"), + ("w:settings/w:view{w:val=outline}", "outline"), + ("w:settings/w:view{w:val=print}", "print"), + ("w:settings/w:view{w:val=web}", "web"), + ("w:settings/w:view{w:val=reading}", "reading"), + ("w:settings/w:view{w:val=masterPages}", "masterPages"), + ("w:settings/w:view{w:val=none}", "none"), + ], + ) + def it_can_get_the_view_val(self, cxml: str, expected_value: str | None): + settings = cast(CT_Settings, element(cxml)) + assert settings.view_val 
== expected_value + + @pytest.mark.parametrize( + ("cxml", "new_value", "expected_cxml"), + [ + ("w:settings", "print", "w:settings/w:view{w:val=print}"), + ( + "w:settings/w:view{w:val=print}", + "outline", + "w:settings/w:view{w:val=outline}", + ), + ("w:settings/w:view{w:val=print}", None, "w:settings"), + ("w:settings/w:zoom{w:percent=100}", "web", + "w:settings/(w:view{w:val=web},w:zoom{w:percent=100})"), + ], + ) + def it_can_set_the_view_val( + self, cxml: str, new_value: str | None, expected_cxml: str + ): + settings = cast(CT_Settings, element(cxml)) + settings.view_val = new_value + assert settings.xml == xml(expected_cxml) + + +class DescribeCT_Rsids: + """Unit-test suite for `docx.oxml.settings.CT_Rsids`.""" + + @pytest.mark.parametrize( + ("cxml", "expected_value"), + [ + ("w:rsids", None), + ("w:rsids/w:rsidRoot{w:val=00FA1B42}", "00FA1B42"), + ("w:rsids/w:rsidRoot", None), + ( + "w:rsids/(w:rsidRoot{w:val=00ABCDEF},w:rsid{w:val=001234AB})", + "00ABCDEF", + ), + ], + ) + def it_reads_rsidRoot_val(self, cxml: str, expected_value: str | None): + rsids = cast(CT_Rsids, element(cxml)) + assert rsids.rsidRoot_val == expected_value + + @pytest.mark.parametrize( + ("cxml", "expected_value"), + [ + ("w:rsids", []), + ("w:rsids/w:rsidRoot{w:val=00FA1B42}", []), + ("w:rsids/w:rsid{w:val=001234AB}", ["001234AB"]), + ( + "w:rsids/(" + "w:rsidRoot{w:val=00FA1B42}," + "w:rsid{w:val=001234AB}," + "w:rsid{w:val=00567890}," + "w:rsid{w:val=00ABCDEF})", + ["001234AB", "00567890", "00ABCDEF"], + ), + ], + ) + def it_reads_rsid_vals_in_document_order( + self, cxml: str, expected_value: list[str] + ): + rsids = cast(CT_Rsids, element(cxml)) + assert rsids.rsid_vals == expected_value + + +class DescribeCT_Settings_Rsids: + """Unit-test suite for RSID access via `docx.oxml.settings.CT_Settings`.""" + + def it_returns_None_for_rsids_when_not_present(self): + settings = cast(CT_Settings, element("w:settings")) + assert settings.rsids is None + + def 
it_returns_the_rsids_element_when_present(self): + settings = cast( + CT_Settings, + element("w:settings/w:rsids/w:rsidRoot{w:val=00FA1B42}"), + ) + rsids = settings.rsids + assert rsids is not None + assert rsids.rsidRoot_val == "00FA1B42" + + +class DescribeCT_Compat: + """Unit-test suite for `docx.oxml.settings.CT_Compat`.""" + + # -- compatSetting dict-style helpers ----------------------------------- + + def it_returns_None_for_unknown_compat_setting_name(self): + compat = cast( + CT_Settings, element("w:settings/w:compat") + ).compat + assert compat is not None + assert compat.get_compat_setting("notThere") is None + + def it_can_get_a_compat_setting_by_name(self): + settings = cast( + CT_Settings, + element( + "w:settings/w:compat/w:compatSetting" + "{w:name=compatibilityMode,w:uri=http://x,w:val=15}" + ), + ) + assert settings.compat is not None + assert settings.compat.get_compat_setting("compatibilityMode") == "15" + + def it_can_add_a_new_compat_setting(self): + settings = cast(CT_Settings, element("w:settings/w:compat")) + assert settings.compat is not None + settings.compat.set_compat_setting("foo", "1", uri="http://bar") + assert settings.xml == xml( + "w:settings/w:compat/w:compatSetting" + "{w:name=foo,w:uri=http://bar,w:val=1}" + ) + + def it_can_update_an_existing_compat_setting_in_place(self): + settings = cast( + CT_Settings, + element( + "w:settings/w:compat/w:compatSetting" + "{w:name=foo,w:uri=http://keep,w:val=old}" + ), + ) + assert settings.compat is not None + settings.compat.set_compat_setting("foo", "new", uri="http://ignored") + # -- URI is left unchanged when the setting already exists -- + assert settings.xml == xml( + "w:settings/w:compat/w:compatSetting" + "{w:name=foo,w:uri=http://keep,w:val=new}" + ) + + def it_can_remove_a_compat_setting(self): + settings = cast( + CT_Settings, + element( + "w:settings/w:compat/w:compatSetting" + "{w:name=foo,w:uri=http://x,w:val=1}" + ), + ) + assert settings.compat is not None + assert 
settings.compat.remove_compat_setting("foo") is True + assert settings.xml == xml("w:settings/w:compat") + + def it_returns_False_when_removing_a_missing_compat_setting(self): + settings = cast(CT_Settings, element("w:settings/w:compat")) + assert settings.compat is not None + assert settings.compat.remove_compat_setting("absent") is False + + def it_iterates_compat_setting_names_in_document_order(self): + settings = cast( + CT_Settings, + element( + "w:settings/w:compat/(" + "w:compatSetting{w:name=a,w:uri=http://x,w:val=1}," + "w:compatSetting{w:name=b,w:uri=http://x,w:val=2}," + "w:compatSetting{w:name=c,w:uri=http://x,w:val=3})" + ), + ) + assert settings.compat is not None + assert list(settings.compat.iter_compat_setting_names()) == ["a", "b", "c"] + + # -- direct flag helpers ------------------------------------------------ + + def it_reports_has_flag_for_present_child(self): + settings = cast( + CT_Settings, element("w:settings/w:compat/w:growAutofit") + ) + assert settings.compat is not None + assert settings.compat.has_flag("growAutofit") is True + assert settings.compat.has_flag("useFELayout") is False + + def it_can_add_a_flag(self): + settings = cast(CT_Settings, element("w:settings/w:compat")) + assert settings.compat is not None + settings.compat.set_flag("growAutofit", True) + assert settings.xml == xml("w:settings/w:compat/w:growAutofit") + + def it_does_not_duplicate_existing_flag(self): + settings = cast( + CT_Settings, element("w:settings/w:compat/w:growAutofit") + ) + assert settings.compat is not None + settings.compat.set_flag("growAutofit", True) + assert settings.xml == xml("w:settings/w:compat/w:growAutofit") + + def it_can_remove_a_flag(self): + settings = cast( + CT_Settings, element("w:settings/w:compat/w:growAutofit") + ) + assert settings.compat is not None + settings.compat.set_flag("growAutofit", False) + assert settings.xml == xml("w:settings/w:compat") + + def it_iterates_flag_names_skipping_compatSetting(self): + settings = cast( 
+ CT_Settings, + element( + "w:settings/w:compat/(" + "w:growAutofit," + "w:compatSetting{w:name=n,w:uri=http://x,w:val=1}," + "w:useFELayout)" + ), + ) + assert settings.compat is not None + assert list(settings.compat.iter_flag_names()) == [ + "growAutofit", + "useFELayout", + ] + + def it_can_clear_all_flags_but_preserve_compat_settings(self): + settings = cast( + CT_Settings, + element( + "w:settings/w:compat/(" + "w:growAutofit," + "w:compatSetting{w:name=n,w:uri=http://x,w:val=1}," + "w:useFELayout)" + ), + ) + assert settings.compat is not None + settings.compat.clear_flags() + assert settings.xml == xml( + "w:settings/w:compat/w:compatSetting" + "{w:name=n,w:uri=http://x,w:val=1}" + ) + + +class DescribeCT_DocProtect: + """Unit-test suite for `docx.oxml.settings.CT_DocProtect`.""" + + @pytest.mark.parametrize( + ("cxml", "expected_edit", "expected_enforcement", "expected_formatting"), + [ + ("w:documentProtection", None, False, False), + ( + "w:documentProtection{w:edit=readOnly,w:enforcement=1}", + "readOnly", + True, + False, + ), + ( + "w:documentProtection{w:edit=comments,w:formatting=1}", + "comments", + False, + True, + ), + ], + ) + def it_can_get_core_attributes( + self, + cxml: str, + expected_edit: str | None, + expected_enforcement: bool, + expected_formatting: bool, + ): + dp = cast(CT_DocProtect, element(cxml)) + assert dp.edit == expected_edit + assert dp.enforcement is expected_enforcement + assert dp.formatting is expected_formatting + + @pytest.mark.parametrize( + ("attr_name", "xml_name", "value"), + [ + ("hash", "w:hash", "abc123"), + ("salt", "w:salt", "def456"), + ("cryptProviderType", "w:cryptProviderType", "rsaAES"), + ("cryptAlgorithmClass", "w:cryptAlgorithmClass", "hash"), + ("cryptAlgorithmType", "w:cryptAlgorithmType", "typeAny"), + ], + ) + def it_round_trips_each_string_attribute( + self, attr_name: str, xml_name: str, value: str + ): + dp = cast( + CT_DocProtect, + element("w:documentProtection{%s=%s}" % (xml_name, value)), + 
) + assert getattr(dp, attr_name) == value + + @pytest.mark.parametrize( + ("attr_name", "xml_name", "value"), + [ + ("cryptAlgorithmSid", "w:cryptAlgorithmSid", 4), + ("cryptSpinCount", "w:cryptSpinCount", 100000), + ], + ) + def it_round_trips_each_int_attribute( + self, attr_name: str, xml_name: str, value: int + ): + dp = cast( + CT_DocProtect, + element("w:documentProtection{%s=%d}" % (xml_name, value)), + ) + assert getattr(dp, attr_name) == value + + def it_can_set_hash_and_salt(self): + dp = cast(CT_DocProtect, element("w:documentProtection")) + dp.hash = "deadbeef==" + dp.salt = "cafebabe+/" + # -- hashes/salts are base64 strings in real use, which can include + # -- characters the cxml parser rejects; exercise the setter path + # -- using raw lxml access, not via cxml element syntax. + assert dp.hash == "deadbeef==" + assert dp.salt == "cafebabe+/" + + def it_can_set_algorithm_metadata(self): + dp = cast(CT_DocProtect, element("w:documentProtection")) + dp.cryptProviderType = "rsaAES" + dp.cryptAlgorithmClass = "hash" + dp.cryptAlgorithmType = "typeAny" + dp.cryptAlgorithmSid = 4 + dp.cryptSpinCount = 100000 + assert dp.cryptProviderType == "rsaAES" + assert dp.cryptAlgorithmClass == "hash" + assert dp.cryptAlgorithmType == "typeAny" + assert dp.cryptAlgorithmSid == 4 + assert dp.cryptSpinCount == 100000 diff --git a/tests/oxml/test_shape.py b/tests/oxml/test_shape.py new file mode 100644 index 000000000..bda664bf5 --- /dev/null +++ b/tests/oxml/test_shape.py @@ -0,0 +1,341 @@ +# pyright: reportPrivateUsage=false + +"""Unit-test suite for the `docx.oxml.shape` module.""" + +from __future__ import annotations + +from typing import cast + +import pytest + +from docx.oxml.ns import qn +from docx.oxml.shape import ( + CT_Anchor, + CT_EffectList, + CT_Inline, + CT_LineProperties, + CT_OuterShadow, + CT_Picture, + CT_RelativeRect, + CT_ShapeProperties, + _rot_for_exif_orientation, +) + +from ..unitutil.cxml import element + + +class DescribeCT_Anchor: + 
"""Unit-test suite for `docx.oxml.shape.CT_Anchor`.""" + + def it_can_construct_a_new_pic_anchor(self): + anchor = CT_Anchor.new_pic_anchor( + shape_id=42, rId="rId7", filename="foo.png", cx=1000, cy=2000 + ) + + # -- required attributes present -- + assert anchor.simplePos is False + assert anchor.locked is False + assert anchor.layoutInCell is True + assert anchor.allowOverlap is True + assert anchor.behindDoc is False + assert anchor.relativeHeight == 0 + + # -- default positioning -- + assert anchor.positionH is not None + assert anchor.positionH.relativeFrom == "column" + assert anchor.positionV is not None + assert anchor.positionV.relativeFrom == "paragraph" + + # -- default wrap -- + assert anchor.wrap_type == "square" + + # -- extent populated -- + assert anchor.extent.cx == 1000 + assert anchor.extent.cy == 2000 + + # -- docPr populated -- + assert anchor.docPr.id == 42 + assert anchor.docPr.name == "Picture 42" + + # -- graphic contains pic:pic with our rId -- + pic = anchor.graphic.graphicData.find(qn("pic:pic")) + assert pic is not None + blip = pic.find(".//" + qn("a:blip")) + assert blip is not None + assert blip.get(qn("r:embed")) == "rId7" + + def it_can_set_the_horizontal_position(self): + anchor = CT_Anchor.new_pic_anchor(1, "rId1", "f.png", 100, 100) + + anchor.set_horizontal_position("page", 914400) + + assert anchor.positionH.relativeFrom == "page" + assert anchor.positionH.posOffset is not None + assert anchor.positionH.posOffset.text == "914400" + + def it_can_set_the_vertical_position(self): + anchor = CT_Anchor.new_pic_anchor(1, "rId1", "f.png", 100, 100) + + anchor.set_vertical_position("margin", 457200) + + assert anchor.positionV.relativeFrom == "margin" + assert anchor.positionV.posOffset is not None + assert anchor.positionV.posOffset.text == "457200" + + @pytest.mark.parametrize( + ("wrap", "tag", "expected_behind_doc"), + [ + ("square", "wp:wrapSquare", False), + ("tight", "wp:wrapTight", False), + ("through", "wp:wrapThrough", 
False), + ("topAndBottom", "wp:wrapTopAndBottom", False), + ("behind", "wp:wrapNone", True), + ("inFront", "wp:wrapNone", False), + ], + ) + def it_can_set_the_wrap_type( + self, wrap: str, tag: str, expected_behind_doc: bool + ): + anchor = CT_Anchor.new_pic_anchor(1, "rId1", "f.png", 100, 100) + + anchor.set_wrap(wrap) + + assert anchor.find(qn(tag)) is not None + assert anchor.behindDoc is expected_behind_doc + assert anchor.wrap_type == wrap + + def it_replaces_existing_wrap_when_set_wrap_is_called(self): + anchor = CT_Anchor.new_pic_anchor(1, "rId1", "f.png", 100, 100) + + anchor.set_wrap("tight") + anchor.set_wrap("behind") + + # -- only one wrap element present at a time -- + wrap_elms = [ + tag + for tag in ( + "wp:wrapNone", + "wp:wrapSquare", + "wp:wrapTight", + "wp:wrapThrough", + "wp:wrapTopAndBottom", + ) + if anchor.find(qn(tag)) is not None + ] + assert wrap_elms == ["wp:wrapNone"] + assert anchor.behindDoc is True + + def it_reports_square_wrap_type_by_default(self): + anchor = CT_Anchor.new_pic_anchor(1, "rId1", "f.png", 100, 100) + + assert anchor.wrap_type == "square" + + def it_reads_horizontal_and_vertical_relativeFrom_from_existing_xml(self): + cxml = ( + "wp:anchor{distT=0,distB=0,distL=0,distR=0,simplePos=0,relativeHeight=1," + "behindDoc=0,locked=0,layoutInCell=1,allowOverlap=1}/" + "(wp:positionH{relativeFrom=page},wp:positionV{relativeFrom=margin})" + ) + anchor = cast(CT_Anchor, element(cxml)) + + assert anchor.positionH.relativeFrom == "page" + assert anchor.positionV.relativeFrom == "margin" + + def it_emits_a_rot_attribute_when_EXIF_orientation_implies_rotation(self): + """Regression for upstream#540. + + A portrait photo taken with the camera sideways carries an EXIF + ``Orientation`` of 6, meaning "rotate 90 degrees clockwise to + view upright". Word honours the value from `a:xfrm/@rot`. 
+ """ + anchor = CT_Anchor.new_pic_anchor( + 1, "rId1", "f.png", 100, 100, orientation=6 + ) + + xfrm = anchor.find(".//" + qn("a:xfrm")) + assert xfrm is not None + assert xfrm.get("rot") == str(90 * 60000) + + def it_omits_rot_for_orientation_1_or_None(self): + for orientation in (None, 1): + anchor = CT_Anchor.new_pic_anchor( + 2, "rId2", "f.png", 100, 100, orientation=orientation + ) + + xfrm = anchor.find(".//" + qn("a:xfrm")) + assert xfrm is not None + assert xfrm.get("rot") is None + + +class DescribeCT_Picture: + """Unit-test suite for `docx.oxml.shape.CT_Picture`.""" + + def it_always_emits_an_a_xfrm_with_a_ext_on_new(self): + """Regression for upstream#1164: the non-SVG `_pic_xml()` branch + used to emit no `a:xfrm`, so Word resized the image back to + default dimensions until the user nudged it.""" + pic = CT_Picture.new( + pic_id=0, filename="f.png", rId="rId1", cx=1_234_567, cy=7_654_321 + ) + + xfrm = pic.find(".//" + qn("a:xfrm")) + assert xfrm is not None, "expected a:xfrm in pic:spPr" + ext = xfrm.find(qn("a:ext")) + assert ext is not None, "expected a:ext as a:xfrm child" + assert int(ext.get("cx")) == 1_234_567 + assert int(ext.get("cy")) == 7_654_321 + # -- and the off origin child, so Word anchors the xfrm -- + off = xfrm.find(qn("a:off")) + assert off is not None + assert off.get("x") == "0" + assert off.get("y") == "0" + + def it_sets_rot_on_xfrm_for_rotated_EXIF_orientation(self): + pic = CT_Picture.new( + pic_id=0, + filename="f.jpg", + rId="rId1", + cx=100, + cy=100, + orientation=8, + ) + + xfrm = pic.find(".//" + qn("a:xfrm")) + assert xfrm is not None + assert xfrm.get("rot") == str(270 * 60000) + + def it_emits_xfrm_in_svg_branch_with_rot_when_rotated(self): + pic = CT_Picture.new_svg( + pic_id=0, + filename="f.svg", + fallback_rId="rId1", + svg_rId="rId2", + cx=100, + cy=100, + orientation=3, + ) + + xfrm = pic.find(".//" + qn("a:xfrm")) + assert xfrm is not None + assert xfrm.get("rot") == str(180 * 60000) + + +class 
Describe_rot_for_exif_orientation: + @pytest.mark.parametrize( + ("orientation", "expected"), + [ + (None, 0), + (0, 0), # out-of-range + (1, 0), + (2, 0), + (3, 180 * 60000), + (4, 180 * 60000), + (5, 90 * 60000), + (6, 90 * 60000), + (7, 270 * 60000), + (8, 270 * 60000), + (9, 0), # out-of-range + (42, 0), # out-of-range + ], + ) + def it_maps_EXIF_orientation_to_xfrm_rot_60000ths_of_degree( + self, orientation: int | None, expected: int + ): + assert _rot_for_exif_orientation(orientation) == expected + + +class DescribeCT_Inline: + """Unit-test suite for `docx.oxml.shape.CT_Inline`.""" + + def it_threads_orientation_into_pic_xfrm_rot(self): + """End-to-end: calling `new_pic_inline` with orientation=6 should + produce a drawing whose `a:xfrm/@rot` is 5_400_000 (90deg).""" + inline = CT_Inline.new_pic_inline( + 1, "rId1", "f.jpg", 100, 100, orientation=6 + ) + + xfrm = inline.find(".//" + qn("a:xfrm")) + assert xfrm is not None + assert xfrm.get("rot") == str(90 * 60000) + + def it_builds_a_linked_pic_blip_when_link_is_True(self): + inline = CT_Inline.new_pic_inline( + 1, "rIdX", "f.jpg", 100, 100, link=True + ) + + blip = inline.find(".//" + qn("a:blip")) + assert blip is not None + assert blip.get(qn("r:link")) == "rIdX" + assert blip.get(qn("r:embed")) is None + + def it_builds_a_linked_anchor_blip_when_link_is_True(self): + anchor = CT_Anchor.new_pic_anchor( + 1, "rIdX", "f.jpg", 100, 100, link=True + ) + + blip = anchor.find(".//" + qn("a:blip")) + assert blip is not None + assert blip.get(qn("r:link")) == "rIdX" + assert blip.get(qn("r:embed")) is None + + +class DescribeCT_ShapeProperties: + """Unit-test suite for outline/effect children on `pic:spPr`.""" + + def it_can_add_a_line_properties_child_in_schema_order(self): + pic = CT_Picture.new( + pic_id=0, filename="f.png", rId="rId1", cx=1000, cy=2000 + ) + spPr = cast(CT_ShapeProperties, pic.spPr) + + ln = spPr.get_or_add_ln() + ln.w = 12700 + + # -- `a:ln` inserted after a:prstGeom, before a:effectLst -- 
+ children = [c.tag for c in spPr] + assert children.index(qn("a:ln")) > children.index(qn("a:prstGeom")) + assert isinstance(ln, CT_LineProperties) + assert ln.w == 12700 + + def it_can_add_an_effect_list_child_in_schema_order(self): + pic = CT_Picture.new( + pic_id=0, filename="f.png", rId="rId1", cx=1000, cy=2000 + ) + spPr = cast(CT_ShapeProperties, pic.spPr) + + effectLst = spPr.get_or_add_effectLst() + + children = [c.tag for c in spPr] + assert children.index(qn("a:effectLst")) > children.index(qn("a:prstGeom")) + # -- and after a:ln when both present -- + spPr.get_or_add_ln() + children = [c.tag for c in spPr] + assert children.index(qn("a:effectLst")) > children.index(qn("a:ln")) + assert isinstance(effectLst, CT_EffectList) + + +class DescribeCT_RelativeRect: + """Unit-test suite for `a:srcRect` attributes.""" + + def it_accepts_and_exposes_l_t_r_b_attrs(self): + src = cast( + CT_RelativeRect, + element("a:srcRect{l=10000,t=20000,r=5000,b=15000}"), + ) + assert src.l == 10000 + assert src.t == 20000 + assert src.r == 5000 + assert src.b == 15000 + + +class DescribeCT_OuterShadow: + """Unit-test suite for `a:outerShdw`.""" + + def it_accepts_and_exposes_blurRad_dist_and_dir(self): + shdw = cast( + CT_OuterShadow, + element("a:outerShdw{blurRad=38100,dist=12700,dir=2700000}"), + ) + assert shdw.blurRad == 38100 + assert shdw.dist == 12700 + assert shdw.dir == 2700000 diff --git a/tests/oxml/test_simpletypes.py b/tests/oxml/test_simpletypes.py new file mode 100644 index 000000000..edb7b9007 --- /dev/null +++ b/tests/oxml/test_simpletypes.py @@ -0,0 +1,63 @@ +"""Unit-test suite for docx.oxml.simpletypes (tolerant numeric parsers).""" + +from __future__ import annotations + +import pytest + +from docx.oxml.simpletypes import ST_HpsMeasure, ST_TwipsMeasure +from docx.shared import Emu + + +class DescribeST_TwipsMeasure: + """Unit-test suite for `docx.oxml.simpletypes.ST_TwipsMeasure`. 
+ + Covers upstream issues #1475, #1539 and PR #1478 — fractional twips + written by some third-party tools must not crash the loader. + """ + + def it_parses_an_integer_twips_value(self): + length = ST_TwipsMeasure.convert_from_xml("283") + assert int(length.twips) == 283 + + def it_tolerates_a_decimal_twips_value(self): + # -- "283.5" should round to 284 twips instead of raising ValueError -- + length = ST_TwipsMeasure.convert_from_xml("283.5") + assert int(length.twips) == 284 + + def it_rounds_half_down_twips_to_nearest_integer(self): + length = ST_TwipsMeasure.convert_from_xml("283.49") + assert int(length.twips) == 283 + + def it_still_parses_a_universal_measure_with_units(self): + length = ST_TwipsMeasure.convert_from_xml("1in") + assert length.inches == pytest.approx(1.0) + + def it_serializes_an_emu_value_to_twips(self): + assert ST_TwipsMeasure.convert_to_xml(Emu(914400)) == "1440" + + +class DescribeST_HpsMeasure: + """Unit-test suite for `docx.oxml.simpletypes.ST_HpsMeasure`. + + Covers upstream issues #1475, #1539 and PR #1478 — fractional half-points + (e.g. ``"23.5"``) must not crash the loader. 
+ """ + + def it_parses_an_integer_half_point_value(self): + length = ST_HpsMeasure.convert_from_xml("24") + assert length.pt == pytest.approx(12.0) + + def it_tolerates_a_decimal_half_point_value(self): + # -- "23.5" means 11.75 points; prior behavior raised ValueError -- + length = ST_HpsMeasure.convert_from_xml("23.5") + assert length.pt == pytest.approx(11.75) + + def it_still_parses_a_universal_measure_with_units(self): + length = ST_HpsMeasure.convert_from_xml("12pt") + assert length.pt == pytest.approx(12.0) + + def it_serializes_an_emu_value_to_half_points(self): + # -- 12 pt == 24 half-points -- + from docx.shared import Pt + + assert ST_HpsMeasure.convert_to_xml(Pt(12)) == "24" diff --git a/tests/oxml/test_smart_art.py b/tests/oxml/test_smart_art.py new file mode 100644 index 000000000..8a254e91f --- /dev/null +++ b/tests/oxml/test_smart_art.py @@ -0,0 +1,167 @@ +# pyright: reportPrivateUsage=false + +"""Unit test suite for the `docx.oxml.smart_art` module.""" + +from __future__ import annotations + +from typing import cast + +from docx.oxml.drawing import CT_Drawing +from docx.oxml.smart_art import ( + CT_Cxn, + CT_DataModel, + CT_Pt, + CT_PtLst, + CT_RelIds, + dgm_relIds_from_drawing, +) + +from ..unitutil.cxml import element + + +class DescribeCT_RelIds: + """Unit test suite for `docx.oxml.smart_art.CT_RelIds`.""" + + def it_exposes_its_four_relationship_ids(self): + relIds = cast( + CT_RelIds, + element( + "dgm:relIds{r:dm=rId4,r:lo=rId5,r:qs=rId6,r:cs=rId7}" + ), + ) + + assert relIds.dm_rId == "rId4" + assert relIds.lo_rId == "rId5" + assert relIds.qs_rId == "rId6" + assert relIds.cs_rId == "rId7" + + def its_attributes_default_to_None_when_absent(self): + relIds = cast(CT_RelIds, element("dgm:relIds")) + + assert relIds.dm_rId is None + assert relIds.lo_rId is None + assert relIds.qs_rId is None + assert relIds.cs_rId is None + + +class DescribeCT_Pt: + """Unit test suite for `docx.oxml.smart_art.CT_Pt`.""" + + def it_knows_its_modelId(self): + pt 
= cast(CT_Pt, element("dgm:pt{modelId=abc}")) + + assert pt.modelId == "abc" + + def it_concatenates_run_text_across_a_paragraph(self): + pt = cast( + CT_Pt, + element('dgm:pt/dgm:t/a:p/(a:r/a:t"Hello ",a:r/a:t"World")'), + ) + + assert pt.text == "Hello World" + + def it_joins_multiple_paragraphs_with_newlines(self): + pt = cast( + CT_Pt, + element('dgm:pt/dgm:t/(a:p/a:r/a:t"Line1",a:p/a:r/a:t"Line2")'), + ) + + assert pt.text == "Line1\nLine2" + + def it_returns_empty_string_when_no_dgm_t_child(self): + pt = cast(CT_Pt, element("dgm:pt{modelId=x}")) + + assert pt.text == "" + + def it_returns_empty_string_when_dgm_t_is_empty(self): + pt = cast(CT_Pt, element("dgm:pt/dgm:t")) + + assert pt.text == "" + + +class DescribeCT_DataModel: + """Unit test suite for `docx.oxml.smart_art.CT_DataModel`.""" + + def it_lists_its_pt_children(self): + dm = cast( + CT_DataModel, + element( + "dgm:dataModel/dgm:ptLst/(" + "dgm:pt{modelId=a},dgm:pt{modelId=b},dgm:pt{modelId=c})" + ), + ) + + pts = dm.pt_lst + + assert [p.modelId for p in pts] == ["a", "b", "c"] + assert all(isinstance(p, CT_Pt) for p in pts) + + def it_returns_empty_list_when_no_ptLst(self): + dm = cast(CT_DataModel, element("dgm:dataModel")) + + assert dm.pt_lst == [] + + def it_lists_its_cxn_children(self): + dm = cast( + CT_DataModel, + element( + "dgm:dataModel/dgm:cxnLst/(" + "dgm:cxn{type=parOf,srcId=a,destId=b}," + "dgm:cxn{type=parOf,srcId=a,destId=c})" + ), + ) + + cxns = dm.cxn_lst + + assert len(cxns) == 2 + assert all(isinstance(c, CT_Cxn) for c in cxns) + assert cxns[0].srcId == "a" + assert cxns[0].destId == "b" + + +class DescribeDgmRelIdsFromDrawing: + """Unit test suite for `docx.oxml.smart_art.dgm_relIds_from_drawing`.""" + + def it_finds_an_inline_dgm_relIds(self): + drawing = cast( + CT_Drawing, + element( + "w:drawing/wp:inline/a:graphic/a:graphicData" + "/dgm:relIds{r:dm=rId4}" + ), + ) + + relIds = dgm_relIds_from_drawing(drawing) + + assert isinstance(relIds, CT_RelIds) + assert 
relIds.dm_rId == "rId4" + + def it_finds_an_anchor_dgm_relIds(self): + drawing = cast( + CT_Drawing, + element( + "w:drawing/wp:anchor/a:graphic/a:graphicData" + "/dgm:relIds{r:dm=rId9}" + ), + ) + + relIds = dgm_relIds_from_drawing(drawing) + + assert relIds is not None + assert relIds.dm_rId == "rId9" + + def it_returns_None_when_drawing_is_not_smart_art(self): + drawing = cast( + CT_Drawing, + element("w:drawing/wp:inline/a:graphic/a:graphicData/pic:pic"), + ) + + assert dgm_relIds_from_drawing(drawing) is None + + +class DescribeCT_PtLst: + """Sanity check registration.""" + + def it_is_registered(self): + pt_lst = element("dgm:ptLst") + assert isinstance(pt_lst, CT_PtLst) diff --git a/tests/oxml/test_table.py b/tests/oxml/test_table.py index 2c9e05344..4ace1a292 100644 --- a/tests/oxml/test_table.py +++ b/tests/oxml/test_table.py @@ -8,16 +8,626 @@ import pytest +from docx.enum.table import ( + WD_BORDER_STYLE, + WD_ROW_HEIGHT_RULE, + WD_SHADING_PATTERN, + WD_TEXT_DIRECTION, +) from docx.exceptions import InvalidSpanError +from docx.oxml.ns import qn from docx.oxml.parser import parse_xml -from docx.oxml.table import CT_Row, CT_Tbl, CT_Tc +from docx.oxml.table import ( + CT_Border, + CT_Row, + CT_Shd, + CT_Tbl, + CT_TblBorders, + CT_TblCellMar, + CT_TblLook, + CT_TblPr, + CT_TblWidth, + CT_Tc, + CT_TcBorders, + CT_TcMar, + CT_TcPr, +) from docx.oxml.text.paragraph import CT_P +from docx.shared import Emu, Inches, Length, Pt, RGBColor, Twips from ..unitutil.cxml import element, xml from ..unitutil.file import snippet_seq from ..unitutil.mock import FixtureRequest, Mock, call, instance_mock, method_mock, property_mock +class DescribeCT_Border: + """Unit-test suite for `docx.oxml.table.CT_Border` objects.""" + + @pytest.mark.parametrize( + ("border_cxml", "expected_val"), + [ + ("w:top", None), + ("w:top{w:val=single}", WD_BORDER_STYLE.SINGLE), + ("w:top{w:val=double}", WD_BORDER_STYLE.DOUBLE), + ("w:top{w:val=none}", WD_BORDER_STYLE.NONE), + ], + ) + def 
it_can_get_the_val_attribute( + self, border_cxml: str, expected_val: WD_BORDER_STYLE | None + ): + border = cast(CT_Border, element(border_cxml)) + assert border.val == expected_val + + @pytest.mark.parametrize( + ("border_cxml", "expected_sz"), + [ + ("w:top", None), + # `sz` is eighth-points, exposed as Length (EMU). 4 eighth-points = Pt(0.5). + ("w:top{w:sz=4}", Pt(0.5)), + ("w:top{w:sz=12}", Pt(1.5)), + ], + ) + def it_can_get_the_sz_attribute( + self, border_cxml: str, expected_sz: Length | None + ): + border = cast(CT_Border, element(border_cxml)) + assert border.sz == expected_sz + + @pytest.mark.parametrize( + ("border_cxml", "expected_color"), + [ + ("w:top", None), + ("w:top{w:color=FF0000}", RGBColor(0xFF, 0x00, 0x00)), + ("w:top{w:color=auto}", "auto"), + ], + ) + def it_can_get_the_color_attribute( + self, border_cxml: str, expected_color: RGBColor | str | None + ): + border = cast(CT_Border, element(border_cxml)) + assert border.color == expected_color + + @pytest.mark.parametrize( + ("border_cxml", "expected_space"), + [ + ("w:top", None), + # `space` is in whole points, exposed as Length (EMU). 
+ ("w:top{w:space=0}", Pt(0)), + ("w:top{w:space=4}", Pt(4)), + ], + ) + def it_can_get_the_space_attribute( + self, border_cxml: str, expected_space: Length | None + ): + border = cast(CT_Border, element(border_cxml)) + assert border.space == expected_space + + +class DescribeCT_TblBorders: + """Unit-test suite for `docx.oxml.table.CT_TblBorders` objects.""" + + def it_can_get_and_add_border_children(self): + tblBorders = cast(CT_TblBorders, element("w:tblBorders")) + assert tblBorders.top is None + top = tblBorders.get_or_add_top() + assert isinstance(top, CT_Border) + assert tblBorders.top is top + + def it_inserts_borders_in_the_right_order(self): + tblBorders = cast(CT_TblBorders, element("w:tblBorders")) + tblBorders.get_or_add_insideV() + tblBorders.get_or_add_top() + expected = xml("w:tblBorders/(w:top,w:insideV)") + assert tblBorders.xml == expected + + @pytest.mark.parametrize("attr", ["top", "left", "bottom", "right", "insideH", "insideV"]) + def it_can_remove_each_border(self, attr: str): + tblBorders = cast(CT_TblBorders, element("w:tblBorders")) + get_or_add = getattr(tblBorders, f"get_or_add_{attr}") + remove = getattr(tblBorders, f"_remove_{attr}") + get_or_add() + assert getattr(tblBorders, attr) is not None + remove() + assert getattr(tblBorders, attr) is None + + +class DescribeCT_TcBorders: + """Unit-test suite for `docx.oxml.table.CT_TcBorders` objects.""" + + def it_can_get_and_add_border_children(self): + tcBorders = cast(CT_TcBorders, element("w:tcBorders")) + assert tcBorders.top is None + top = tcBorders.get_or_add_top() + assert isinstance(top, CT_Border) + assert tcBorders.top is top + + def it_inserts_borders_in_the_right_order(self): + tcBorders = cast(CT_TcBorders, element("w:tcBorders")) + tcBorders.get_or_add_right() + tcBorders.get_or_add_top() + expected = xml("w:tcBorders/(w:top,w:right)") + assert tcBorders.xml == expected + + +class DescribeCT_TblPr_borders: + """Unit-test suite for border-related features of CT_TblPr.""" + + 
def it_can_get_the_tblBorders_child(self): + tblPr = cast(CT_TblPr, element("w:tblPr")) + assert tblPr.tblBorders is None + + def it_can_add_tblBorders(self): + tblPr = cast(CT_TblPr, element("w:tblPr")) + tblBorders = tblPr.get_or_add_tblBorders() + assert isinstance(tblBorders, CT_TblBorders) + assert tblPr.tblBorders is tblBorders + + def it_inserts_tblBorders_in_the_right_position(self): + tblPr = cast(CT_TblPr, element("w:tblPr/(w:tblStyle,w:tblLayout)")) + tblPr.get_or_add_tblBorders() + expected = xml("w:tblPr/(w:tblStyle,w:tblBorders,w:tblLayout)") + assert tblPr.xml == expected + + +class DescribeCT_TblPr_tblInd: + """Unit-test suite for `w:tblInd` access on CT_TblPr.""" + + def it_is_None_when_no_tblInd_child_is_present(self): + tblPr = cast(CT_TblPr, element("w:tblPr")) + assert tblPr.tblInd is None + + def it_can_add_tblInd(self): + tblPr = cast(CT_TblPr, element("w:tblPr")) + tblInd = tblPr.get_or_add_tblInd() + assert isinstance(tblInd, CT_TblWidth) + assert tblPr.tblInd is tblInd + + def it_inserts_tblInd_in_the_right_position(self): + tblPr = cast(CT_TblPr, element("w:tblPr/(w:tblStyle,w:tblBorders)")) + tblPr.get_or_add_tblInd() + expected = xml("w:tblPr/(w:tblStyle,w:tblInd,w:tblBorders)") + assert tblPr.xml == expected + + +class DescribeCT_TblPr_tblCellMar: + """Unit-test suite for `w:tblCellMar` access on CT_TblPr.""" + + def it_is_None_when_no_tblCellMar_child_is_present(self): + tblPr = cast(CT_TblPr, element("w:tblPr")) + assert tblPr.tblCellMar is None + + def it_can_add_tblCellMar(self): + tblPr = cast(CT_TblPr, element("w:tblPr")) + tblCellMar = tblPr.get_or_add_tblCellMar() + assert isinstance(tblCellMar, CT_TblCellMar) + assert tblPr.tblCellMar is tblCellMar + + def it_inserts_tblCellMar_in_the_right_position(self): + tblPr = cast(CT_TblPr, element("w:tblPr/(w:tblStyle,w:tblLook)")) + tblPr.get_or_add_tblCellMar() + expected = xml("w:tblPr/(w:tblStyle,w:tblCellMar,w:tblLook)") + assert tblPr.xml == expected + + +class 
DescribeCT_TblPr_tblCaption: + """Unit-test suite for `w:tblCaption`/`w:tblDescription` on CT_TblPr.""" + + def it_is_None_when_no_tblCaption_child_is_present(self): + tblPr = cast(CT_TblPr, element("w:tblPr")) + assert tblPr.tblCaption is None + assert tblPr.tblDescription is None + + def it_inserts_tblCaption_after_tblLook(self): + tblPr = cast(CT_TblPr, element("w:tblPr/(w:tblStyle,w:tblLook)")) + tblPr._add_tblCaption().val = "foo" + expected = xml("w:tblPr/(w:tblStyle,w:tblLook,w:tblCaption{w:val=foo})") + assert tblPr.xml == expected + + def it_inserts_tblDescription_after_tblCaption(self): + tblPr = cast( + CT_TblPr, element("w:tblPr/(w:tblStyle,w:tblCaption{w:val=c})") + ) + tblPr._add_tblDescription().val = "d" + expected = xml( + "w:tblPr/(w:tblStyle,w:tblCaption{w:val=c},w:tblDescription{w:val=d})" + ) + assert tblPr.xml == expected + + +class DescribeCT_TblPr_tblLook: + """Unit-test suite for `w:tblLook` access via CT_TblPr.""" + + def it_is_None_when_no_tblLook_child_is_present(self): + tblPr = cast(CT_TblPr, element("w:tblPr")) + assert tblPr.tblLook is None + + def it_can_add_tblLook(self): + tblPr = cast(CT_TblPr, element("w:tblPr")) + tblLook = tblPr.get_or_add_tblLook() + assert isinstance(tblLook, CT_TblLook) + assert tblPr.tblLook is tblLook + + def it_inserts_tblLook_after_tblCellMar(self): + tblPr = cast(CT_TblPr, element("w:tblPr/(w:tblStyle,w:tblLayout)")) + tblPr.get_or_add_tblLook() + expected = xml("w:tblPr/(w:tblStyle,w:tblLayout,w:tblLook)") + assert tblPr.xml == expected + + +class DescribeCT_TblLook: + """Unit-test suite for `docx.oxml.table.CT_TblLook` objects.""" + + @pytest.mark.parametrize( + ("name", "cxml", "expected"), + [ + ("firstRow", "w:tblLook", False), + ("firstRow", 'w:tblLook{w:firstRow=1}', True), + ("firstRow", 'w:tblLook{w:firstRow=0}', False), + ("firstRow", 'w:tblLook{w:firstRow=true}', True), + ("lastRow", 'w:tblLook{w:lastRow=1}', True), + ("firstColumn", 'w:tblLook{w:firstColumn=1}', True), + ("lastColumn", 
'w:tblLook{w:lastColumn=1}', True), + ("noHBand", 'w:tblLook{w:noHBand=1}', True), + ("noVBand", 'w:tblLook{w:noVBand=1}', True), + ], + ) + def it_reads_individual_flag_attrs(self, name: str, cxml: str, expected: bool): + tblLook = cast(CT_TblLook, element(cxml)) + assert tblLook.get_flag(name) is expected + + def it_falls_back_to_the_legacy_val_bitmask_when_flag_attr_absent(self): + # 0x04A0 = firstRow(0x0020) | firstColumn(0x0080) | noVBand(0x0400) + tblLook = cast(CT_TblLook, element("w:tblLook{w:val=04A0}")) + assert tblLook.get_flag("firstRow") is True + assert tblLook.get_flag("firstColumn") is True + assert tblLook.get_flag("noVBand") is True + assert tblLook.get_flag("lastRow") is False + assert tblLook.get_flag("lastColumn") is False + assert tblLook.get_flag("noHBand") is False + + def it_prefers_individual_flag_attr_over_legacy_val(self): + # val bitmask says firstRow=1, but explicit attr says 0 + tblLook = cast( + CT_TblLook, element("w:tblLook{w:val=04A0,w:firstRow=0}") + ) + assert tblLook.get_flag("firstRow") is False + + def it_ignores_malformed_val_bitmask(self): + tblLook = cast(CT_TblLook, element("w:tblLook{w:val=notahex}")) + assert tblLook.get_flag("firstRow") is False + + def it_writes_True_as_1(self): + tblLook = cast(CT_TblLook, element("w:tblLook")) + tblLook.set_flag("firstRow", True) + assert tblLook.xml == xml('w:tblLook{w:firstRow=1}') + + def it_writes_False_as_0(self): + tblLook = cast(CT_TblLook, element("w:tblLook")) + tblLook.set_flag("firstRow", False) + assert tblLook.xml == xml('w:tblLook{w:firstRow=0}') + + def it_overwrites_an_existing_flag(self): + tblLook = cast(CT_TblLook, element('w:tblLook{w:firstRow=1}')) + tblLook.set_flag("firstRow", False) + assert tblLook.get_flag("firstRow") is False + + def it_round_trips_each_flag(self): + tblLook = cast(CT_TblLook, element("w:tblLook")) + names = ("firstRow", "lastRow", "firstColumn", "lastColumn", "noHBand", "noVBand") + for name in names: + tblLook.set_flag(name, True) + 
assert tblLook.get_flag(name) is True + tblLook.set_flag(name, False) + assert tblLook.get_flag(name) is False + + +class DescribeCT_TblPr_width: + """Unit-test suite for `w:tblW` features of CT_TblPr.""" + + def it_is_None_when_no_tblW_child_is_present(self): + tblPr = cast(CT_TblPr, element("w:tblPr")) + assert tblPr.tblW is None + assert tblPr.preferred_width is None + + def it_returns_None_for_non_dxa_tblW(self): + tblPr = cast(CT_TblPr, element("w:tblPr/w:tblW{w:type=pct,w:w=5000}")) + assert tblPr.preferred_width is None + + def it_returns_EMU_for_dxa_tblW(self): + tblPr = cast(CT_TblPr, element("w:tblPr/w:tblW{w:type=dxa,w:w=1440}")) + assert tblPr.preferred_width == Inches(1) + + def it_can_set_a_preferred_width(self): + tblPr = cast(CT_TblPr, element("w:tblPr")) + tblPr.preferred_width = Inches(1) + assert tblPr.xml == xml("w:tblPr/w:tblW{w:type=dxa,w:w=1440}") + + def it_can_clear_a_preferred_width(self): + tblPr = cast(CT_TblPr, element("w:tblPr/w:tblW{w:type=dxa,w:w=1440}")) + tblPr.preferred_width = None + assert tblPr.xml == xml("w:tblPr") + + def it_inserts_tblW_in_the_right_position(self): + tblPr = cast(CT_TblPr, element("w:tblPr/(w:tblStyle,w:tblLayout)")) + tblPr.set_tblW(1440, "dxa") + expected = xml( + "w:tblPr/(w:tblStyle,w:tblW{w:type=dxa,w:w=1440},w:tblLayout)" + ) + assert tblPr.xml == expected + + def it_can_update_an_existing_tblW(self): + tblPr = cast(CT_TblPr, element("w:tblPr/w:tblW{w:type=auto,w:w=0}")) + tblPr.set_tblW(5000, "pct") + assert tblPr.xml == xml("w:tblPr/w:tblW{w:type=pct,w:w=5000}") + + +class DescribeCT_TblWidth: + """Unit-test suite for `docx.oxml.table.CT_TblWidth` objects.""" + + def it_returns_None_when_type_is_not_dxa(self): + tblW = cast(CT_TblWidth, element("w:tblW{w:type=auto,w:w=0}")) + assert tblW.width is None + + def it_returns_EMU_length_for_dxa(self): + tblW = cast(CT_TblWidth, element("w:tblW{w:type=dxa,w:w=1440}")) + assert tblW.width == Inches(1) + + def it_switches_to_dxa_when_width_is_set(self): + 
tblW = cast(CT_TblWidth, element("w:tblW{w:type=pct,w:w=5000}")) + tblW.width = Emu(914400) + assert tblW.type == "dxa" + assert tblW.w == 1440 + + +class DescribeCT_TcPr_borders: + """Unit-test suite for border-related features of CT_TcPr.""" + + def it_can_get_the_tcBorders_child(self): + tcPr = cast(CT_TcPr, element("w:tcPr")) + assert tcPr.tcBorders is None + + def it_can_add_tcBorders(self): + tcPr = cast(CT_TcPr, element("w:tcPr")) + tcBorders = tcPr.get_or_add_tcBorders() + assert isinstance(tcBorders, CT_TcBorders) + assert tcPr.tcBorders is tcBorders + + def it_inserts_tcBorders_in_the_right_position(self): + tcPr = cast(CT_TcPr, element("w:tcPr/(w:tcW,w:shd)")) + tcPr.get_or_add_tcBorders() + expected = xml("w:tcPr/(w:tcW,w:tcBorders,w:shd)") + assert tcPr.xml == expected + + +class DescribeCT_TcMar: + """Unit-test suite for `docx.oxml.table.CT_TcMar` objects.""" + + def it_returns_None_for_all_edges_when_empty(self): + tcMar = cast(CT_TcMar, element("w:tcMar")) + assert tcMar.get_margin("top") is None + assert tcMar.get_margin("bottom") is None + assert tcMar.get_margin("start") is None + assert tcMar.get_margin("end") is None + + @pytest.mark.parametrize( + ("edge", "tag"), + [("top", "w:top"), ("bottom", "w:bottom"), ("start", "w:start"), ("end", "w:end")], + ) + def it_can_read_an_edge_value(self, edge: str, tag: str): + tcMar = cast(CT_TcMar, element("w:tcMar/%s{w:w=144,w:type=dxa}" % tag)) + assert tcMar.get_margin(edge) == Twips(144) + + def it_reads_start_from_legacy_w_left(self): + tcMar = cast(CT_TcMar, element("w:tcMar/w:left{w:w=240,w:type=dxa}")) + assert tcMar.get_margin("start") == Twips(240) + + def it_reads_end_from_legacy_w_right(self): + tcMar = cast(CT_TcMar, element("w:tcMar/w:right{w:w=360,w:type=dxa}")) + assert tcMar.get_margin("end") == Twips(360) + + @pytest.mark.parametrize( + ("edge", "value"), + [ + ("top", Inches(0.1)), + ("bottom", Pt(6)), + ("start", Twips(100)), + ("end", Inches(0.25)), + ], + ) + def 
it_can_round_trip_a_margin_value(self, edge: str, value): + tcMar = cast(CT_TcMar, element("w:tcMar")) + tcMar.set_margin(edge, value) + assert tcMar.get_margin(edge) == value + + def it_writes_start_as_w_start_even_when_legacy_left_is_present(self): + tcMar = cast(CT_TcMar, element("w:tcMar/w:left{w:w=100,w:type=dxa}")) + tcMar.set_margin("start", Twips(200)) + # -- legacy w:left should be replaced by w:start -- + assert tcMar.get_margin("start") == Twips(200) + assert tcMar.find(qn("w:left")) is None + assert tcMar.find(qn("w:start")) is not None + + def it_can_remove_a_margin_edge(self): + tcMar = cast( + CT_TcMar, + element("w:tcMar/(w:top{w:w=100,w:type=dxa},w:bottom{w:w=200,w:type=dxa})"), + ) + tcMar.remove_margin("top") + assert tcMar.get_margin("top") is None + assert tcMar.get_margin("bottom") == Twips(200) + + def it_removes_the_legacy_tag_when_asked_to_remove_start_or_end(self): + tcMar = cast( + CT_TcMar, + element("w:tcMar/(w:left{w:w=100,w:type=dxa},w:right{w:w=200,w:type=dxa})"), + ) + tcMar.remove_margin("start") + tcMar.remove_margin("end") + assert tcMar.find(qn("w:left")) is None + assert tcMar.find(qn("w:right")) is None + + def it_keeps_children_in_schema_order(self): + tcMar = cast(CT_TcMar, element("w:tcMar")) + tcMar.set_margin("end", Twips(40)) + tcMar.set_margin("top", Twips(10)) + tcMar.set_margin("bottom", Twips(30)) + tcMar.set_margin("start", Twips(20)) + expected = xml( + "w:tcMar/(w:top{w:w=10,w:type=dxa},w:start{w:w=20,w:type=dxa}," + "w:bottom{w:w=30,w:type=dxa},w:end{w:w=40,w:type=dxa})" + ) + assert tcMar.xml == expected + + def it_raises_on_unknown_edge_name(self): + tcMar = cast(CT_TcMar, element("w:tcMar")) + with pytest.raises(ValueError): + tcMar.get_margin("middle") + + +class DescribeCT_TcPr_margins: + """Unit-test suite for `w:tcMar` features of CT_TcPr.""" + + def it_is_None_when_no_tcMar_child_is_present(self): + tcPr = cast(CT_TcPr, element("w:tcPr")) + assert tcPr.tcMar is None + + def it_can_add_a_tcMar_child(self): 
+ tcPr = cast(CT_TcPr, element("w:tcPr")) + tcMar = tcPr.get_or_add_tcMar() + assert isinstance(tcMar, CT_TcMar) + assert tcPr.tcMar is tcMar + + def it_inserts_tcMar_in_the_right_position(self): + tcPr = cast(CT_TcPr, element("w:tcPr/(w:tcW,w:vAlign{w:val=center})")) + tcPr.get_or_add_tcMar() + # -- tcMar should appear between tcW (earlier) and vAlign (later) -- + expected = xml("w:tcPr/(w:tcW,w:tcMar,w:vAlign{w:val=center})") + assert tcPr.xml == expected + + def it_can_remove_tcMar(self): + tcPr = cast( + CT_TcPr, element("w:tcPr/w:tcMar/w:top{w:w=100,w:type=dxa}") + ) + tcPr._remove_tcMar() + assert tcPr.tcMar is None + assert tcPr.xml == xml("w:tcPr") + + +class DescribeCT_Shd: + """Unit-test suite for `docx.oxml.table.CT_Shd` objects.""" + + @pytest.mark.parametrize( + ("shd_cxml", "expected_fill"), + [ + ("w:shd", None), + ("w:shd{w:fill=D9E2F3}", RGBColor(0xD9, 0xE2, 0xF3)), + ("w:shd{w:fill=auto}", "auto"), + ], + ) + def it_can_get_the_fill_attribute(self, shd_cxml: str, expected_fill: RGBColor | str | None): + shd = cast(CT_Shd, element(shd_cxml)) + assert shd.fill == expected_fill + + @pytest.mark.parametrize( + ("shd_cxml", "expected_val"), + [ + ("w:shd", None), + ("w:shd{w:val=clear}", WD_SHADING_PATTERN.CLEAR), + ("w:shd{w:val=solid}", WD_SHADING_PATTERN.SOLID), + ("w:shd{w:val=pct10}", WD_SHADING_PATTERN.PCT_10), + ], + ) + def it_can_get_the_val_attribute( + self, shd_cxml: str, expected_val: WD_SHADING_PATTERN | None + ): + shd = cast(CT_Shd, element(shd_cxml)) + assert shd.val == expected_val + + +class DescribeCT_TcPr: + """Unit-test suite for `docx.oxml.table.CT_TcPr` objects.""" + + @pytest.mark.parametrize( + ("tcPr_cxml", "expected_shd_present"), + [ + ("w:tcPr", False), + ("w:tcPr/w:shd{w:val=clear,w:fill=D9E2F3}", True), + ], + ) + def it_can_get_the_shd_child(self, tcPr_cxml: str, expected_shd_present: bool): + tcPr = cast(CT_TcPr, element(tcPr_cxml)) + if expected_shd_present: + assert tcPr.shd is not None + assert isinstance(tcPr.shd, 
CT_Shd) + else: + assert tcPr.shd is None + + def it_can_add_a_shd_child(self): + tcPr = cast(CT_TcPr, element("w:tcPr")) + shd = tcPr.get_or_add_shd() + assert isinstance(shd, CT_Shd) + assert tcPr.shd is shd + + def it_inserts_shd_in_the_right_position(self): + tcPr = cast(CT_TcPr, element("w:tcPr/(w:tcW,w:vAlign{w:val=center})")) + shd = tcPr.get_or_add_shd() + assert isinstance(shd, CT_Shd) + # shd should appear between tcW and vAlign + expected_xml = xml("w:tcPr/(w:tcW,w:shd,w:vAlign{w:val=center})") + assert tcPr.xml == expected_xml + + @pytest.mark.parametrize( + ("tcPr_cxml", "expected_value"), + [ + ("w:tcPr", None), + ("w:tcPr/w:textDirection{w:val=lrTb}", WD_TEXT_DIRECTION.LR_TB), + ("w:tcPr/w:textDirection{w:val=tbRl}", WD_TEXT_DIRECTION.TB_RL), + ("w:tcPr/w:textDirection{w:val=btLr}", WD_TEXT_DIRECTION.BT_LR), + ("w:tcPr/w:textDirection{w:val=lrTbV}", WD_TEXT_DIRECTION.LR_TB_V), + ("w:tcPr/w:textDirection{w:val=tbRlV}", WD_TEXT_DIRECTION.TB_RL_V), + ("w:tcPr/w:textDirection{w:val=tbLrV}", WD_TEXT_DIRECTION.TB_LR_V), + ], + ) + def it_knows_its_text_direction( + self, tcPr_cxml: str, expected_value: WD_TEXT_DIRECTION | None + ): + tcPr = cast(CT_TcPr, element(tcPr_cxml)) + assert tcPr.text_direction == expected_value + + @pytest.mark.parametrize( + ("tcPr_cxml", "new_value", "expected_cxml"), + [ + ( + "w:tcPr", + WD_TEXT_DIRECTION.TB_RL, + "w:tcPr/w:textDirection{w:val=tbRl}", + ), + ( + "w:tcPr/w:textDirection{w:val=tbRl}", + WD_TEXT_DIRECTION.BT_LR, + "w:tcPr/w:textDirection{w:val=btLr}", + ), + ("w:tcPr/w:textDirection{w:val=tbRl}", None, "w:tcPr"), + ("w:tcPr", None, "w:tcPr"), + ], + ) + def it_can_change_its_text_direction( + self, tcPr_cxml: str, new_value: WD_TEXT_DIRECTION | None, expected_cxml: str + ): + tcPr = cast(CT_TcPr, element(tcPr_cxml)) + tcPr.text_direction = new_value + assert tcPr.xml == xml(expected_cxml) + + def it_inserts_textDirection_in_the_right_position(self): + tcPr = cast(CT_TcPr, 
element("w:tcPr/(w:tcW,w:vAlign{w:val=center})")) + tcPr.text_direction = WD_TEXT_DIRECTION.BT_LR + # textDirection should appear between tcW and vAlign + expected_xml = xml( + "w:tcPr/(w:tcW,w:textDirection{w:val=btLr},w:vAlign{w:val=center})" + ) + assert tcPr.xml == expected_xml + + class DescribeCT_Row: @pytest.mark.parametrize( ("tr_cxml", "expected_cxml"), @@ -33,12 +643,239 @@ def it_can_add_a_trPr(self, tr_cxml: str, expected_cxml: str): tr._add_trPr() assert tr.xml == xml(expected_cxml) - @pytest.mark.parametrize(("snippet_idx", "row_idx", "col_idx"), [(0, 0, 3), (1, 0, 1)]) + @pytest.mark.parametrize( + ("tr_cxml", "expected_value"), + [ + ("w:tr", True), + ("w:tr/w:trPr", True), + ("w:tr/w:trPr/w:cantSplit", False), + ("w:tr/w:trPr/w:cantSplit{w:val=true}", False), + ("w:tr/w:trPr/w:cantSplit{w:val=false}", True), + ], + ) + def it_knows_whether_it_allows_break_across_pages( + self, tr_cxml: str, expected_value: bool + ): + tr = cast(CT_Row, element(tr_cxml)) + assert tr.allow_break_across_pages is expected_value + + @pytest.mark.parametrize( + ("tr_cxml", "new_value", "expected_cxml"), + [ + ("w:tr", False, "w:tr/w:trPr/w:cantSplit"), + ("w:tr/w:trPr", False, "w:tr/w:trPr/w:cantSplit"), + ("w:tr/w:trPr/w:cantSplit", True, "w:tr/w:trPr"), + ("w:tr/w:trPr/w:cantSplit", None, "w:tr/w:trPr"), + ("w:tr", True, "w:tr/w:trPr"), + ], + ) + def it_can_change_whether_it_allows_break_across_pages( + self, tr_cxml: str, new_value: bool | None, expected_cxml: str + ): + tr = cast(CT_Row, element(tr_cxml)) + tr.allow_break_across_pages = new_value + assert tr.xml == xml(expected_cxml) + + @pytest.mark.parametrize( + ("tr_cxml", "expected_value"), + [ + ("w:tr", False), + ("w:tr/w:trPr", False), + ("w:tr/w:trPr/w:tblHeader", True), + ("w:tr/w:trPr/w:tblHeader{w:val=true}", True), + ("w:tr/w:trPr/w:tblHeader{w:val=false}", False), + ], + ) + def it_knows_whether_it_is_a_header_row(self, tr_cxml: str, expected_value: bool): + tr = cast(CT_Row, element(tr_cxml)) + assert 
tr.is_header is expected_value + + @pytest.mark.parametrize( + ("tr_cxml", "new_value", "expected_cxml"), + [ + ("w:tr", True, "w:tr/w:trPr/w:tblHeader"), + ("w:tr/w:trPr", True, "w:tr/w:trPr/w:tblHeader"), + ("w:tr/w:trPr/w:tblHeader", False, "w:tr/w:trPr"), + ("w:tr/w:trPr/w:tblHeader", None, "w:tr/w:trPr"), + ("w:tr", False, "w:tr/w:trPr"), + ], + ) + def it_can_change_whether_it_is_a_header_row( + self, tr_cxml: str, new_value: bool | None, expected_cxml: str + ): + tr = cast(CT_Row, element(tr_cxml)) + tr.is_header = new_value + assert tr.xml == xml(expected_cxml) + + @pytest.mark.parametrize( + ("tr_cxml", "expected_value"), + [ + ("w:tr", None), + ("w:tr/w:trPr", None), + ("w:tr/w:trPr/w:trHeight", None), + ("w:tr/w:trPr/w:trHeight{w:val=0}", 0), + ("w:tr/w:trPr/w:trHeight{w:val=1440}", 914400), + ], + ) + def it_knows_its_trHeight_val(self, tr_cxml: str, expected_value: int | None): + tr = cast(CT_Row, element(tr_cxml)) + assert tr.trHeight_val == expected_value + + @pytest.mark.parametrize( + ("tr_cxml", "new_value", "expected_cxml"), + [ + ("w:tr", Inches(1), "w:tr/w:trPr/w:trHeight{w:val=1440}"), + ("w:tr/w:trPr", Inches(1), "w:tr/w:trPr/w:trHeight{w:val=1440}"), + ("w:tr/w:trPr/w:trHeight", Inches(1), "w:tr/w:trPr/w:trHeight{w:val=1440}"), + ( + "w:tr/w:trPr/w:trHeight{w:val=1440}", + Inches(2), + "w:tr/w:trPr/w:trHeight{w:val=2880}", + ), + ("w:tr/w:trPr/w:trHeight{w:val=2880}", None, "w:tr/w:trPr/w:trHeight"), + ("w:tr", None, "w:tr/w:trPr"), + ("w:tr/w:trPr", None, "w:tr/w:trPr"), + ], + ) + def it_can_change_its_trHeight_val( + self, tr_cxml: str, new_value: Length | None, expected_cxml: str + ): + tr = cast(CT_Row, element(tr_cxml)) + tr.trHeight_val = new_value + assert tr.xml == xml(expected_cxml) + + @pytest.mark.parametrize( + ("tr_cxml", "expected_value"), + [ + ("w:tr", None), + ("w:tr/w:trPr", None), + ("w:tr/w:trPr/w:trHeight", None), + ("w:tr/w:trPr/w:trHeight{w:hRule=auto}", WD_ROW_HEIGHT_RULE.AUTO), + ( + 
"w:tr/w:trPr/w:trHeight{w:val=1440, w:hRule=atLeast}", + WD_ROW_HEIGHT_RULE.AT_LEAST, + ), + ( + "w:tr/w:trPr/w:trHeight{w:val=2880, w:hRule=exact}", + WD_ROW_HEIGHT_RULE.EXACTLY, + ), + ], + ) + def it_knows_its_trHeight_hRule( + self, tr_cxml: str, expected_value: WD_ROW_HEIGHT_RULE | None + ): + tr = cast(CT_Row, element(tr_cxml)) + assert tr.trHeight_hRule == expected_value + + @pytest.mark.parametrize( + ("tr_cxml", "new_value", "expected_cxml"), + [ + ("w:tr", WD_ROW_HEIGHT_RULE.AUTO, "w:tr/w:trPr/w:trHeight{w:hRule=auto}"), + ( + "w:tr/w:trPr", + WD_ROW_HEIGHT_RULE.AT_LEAST, + "w:tr/w:trPr/w:trHeight{w:hRule=atLeast}", + ), + ( + "w:tr/w:trPr/w:trHeight", + WD_ROW_HEIGHT_RULE.EXACTLY, + "w:tr/w:trPr/w:trHeight{w:hRule=exact}", + ), + ( + "w:tr/w:trPr/w:trHeight{w:val=1440, w:hRule=exact}", + WD_ROW_HEIGHT_RULE.AUTO, + "w:tr/w:trPr/w:trHeight{w:val=1440, w:hRule=auto}", + ), + ( + "w:tr/w:trPr/w:trHeight{w:val=1440, w:hRule=auto}", + None, + "w:tr/w:trPr/w:trHeight{w:val=1440}", + ), + ("w:tr", None, "w:tr/w:trPr"), + ("w:tr/w:trPr", None, "w:tr/w:trPr"), + ], + ) + def it_can_change_its_trHeight_hRule( + self, + tr_cxml: str, + new_value: WD_ROW_HEIGHT_RULE | None, + expected_cxml: str, + ): + tr = cast(CT_Row, element(tr_cxml)) + tr.trHeight_hRule = new_value + assert tr.xml == xml(expected_cxml) + + @pytest.mark.parametrize( + ("snippet_idx", "row_idx", "col_idx"), + [ + # -- grid_offset beyond last tc (snippet 0 is 3x3 uniform) -- + (0, 0, 3), + # -- negative grid_offset is out of range regardless of spans -- + (0, 0, -1), + ], + ) def it_raises_on_tc_at_grid_col(self, snippet_idx: int, row_idx: int, col_idx: int): tr = cast(CT_Tbl, parse_xml(snippet_seq("tbl-cells")[snippet_idx])).tr_lst[row_idx] with pytest.raises(ValueError, match=f"no `tc` element at grid_offset={col_idx}"): tr.tc_at_grid_offset(col_idx) + @pytest.mark.parametrize( + # -- regression for upstream#1458: tc_at_grid_offset must match by + # -- range, not by exact starting offset. 
A horizontally-spanning + # -- w:tc (gridSpan > 1) "covers" every grid column within its span, + # -- and w:gridBefore pushes the first tc's starting offset rightward. + ("tr_cxml", "grid_offset", "expected_tc_idx"), + [ + # -- gridSpan=2 covers both grid_offset 0 and 1 -- + ("w:tr/(w:tc/w:tcPr/w:gridSpan{w:val=2},w:tc)", 0, 0), + ("w:tr/(w:tc/w:tcPr/w:gridSpan{w:val=2},w:tc)", 1, 0), + ("w:tr/(w:tc/w:tcPr/w:gridSpan{w:val=2},w:tc)", 2, 1), + # -- gridBefore=2 means first tc starts at grid_offset 2 -- + ("w:tr/(w:trPr/w:gridBefore{w:val=2},w:tc,w:tc)", 2, 0), + ("w:tr/(w:trPr/w:gridBefore{w:val=2},w:tc,w:tc)", 3, 1), + # -- gridBefore plus a spanned cell: tc_0 covers 2 and 3 -- + ( + "w:tr/(w:trPr/w:gridBefore{w:val=2},w:tc/w:tcPr/w:gridSpan{w:val=2},w:tc)", + 2, + 0, + ), + ( + "w:tr/(w:trPr/w:gridBefore{w:val=2},w:tc/w:tcPr/w:gridSpan{w:val=2},w:tc)", + 3, + 0, + ), + ( + "w:tr/(w:trPr/w:gridBefore{w:val=2},w:tc/w:tcPr/w:gridSpan{w:val=2},w:tc)", + 4, + 1, + ), + ], + ) + def it_matches_tc_at_grid_offset_by_range( + self, tr_cxml: str, grid_offset: int, expected_tc_idx: int + ): + tr = cast(CT_Row, element(tr_cxml)) + + tc = tr.tc_at_grid_offset(grid_offset) + + assert tc is tr.tc_lst[expected_tc_idx] + + @pytest.mark.parametrize( + # -- offsets inside the w:gridBefore run have no covering tc -- + ("tr_cxml", "grid_offset"), + [ + ("w:tr/(w:trPr/w:gridBefore{w:val=2},w:tc,w:tc)", 0), + ("w:tr/(w:trPr/w:gridBefore{w:val=2},w:tc,w:tc)", 1), + ("w:tr/(w:trPr/w:gridBefore{w:val=2},w:tc,w:tc)", 4), + ], + ) + def it_raises_on_tc_at_grid_offset_in_gridBefore_or_beyond( + self, tr_cxml: str, grid_offset: int + ): + tr = cast(CT_Row, element(tr_cxml)) + with pytest.raises(ValueError, match=f"no `tc` element at grid_offset={grid_offset}"): + tr.tc_at_grid_offset(grid_offset) + class DescribeCT_Tc: """Unit-test suite for `docx.oxml.table.CT_Tc` objects.""" @@ -345,6 +1182,173 @@ def it_raises_on_tr_above(self, snippet_idx: int, row_idx: int, col_idx: int): with 
pytest.raises(ValueError, match="no tr above topmost tr"): tc._tr_above + @pytest.mark.parametrize( + ("tc_cxml", "expected"), + [ + # -- absent gridSpan defaults to 1 -- + ("w:tc", 1), + ("w:tc/w:tcPr", 1), + # -- explicit value of 1 -- + ("w:tc/w:tcPr/w:gridSpan{w:val=1}", 1), + # -- normal span values -- + ("w:tc/w:tcPr/w:gridSpan{w:val=2}", 2), + ("w:tc/w:tcPr/w:gridSpan{w:val=5}", 5), + # -- malformed: gridSpan=0 is coerced to 1 (read robustness) -- + ("w:tc/w:tcPr/w:gridSpan{w:val=0}", 1), + ], + ) + def it_coerces_malformed_grid_span_to_one(self, tc_cxml: str, expected: int): + tc = cast(CT_Tc, element(tc_cxml)) + assert tc.grid_span == expected + + def it_traces_bottom_through_multiple_vMerge_continuations(self): + """Restart followed by 3 continuations should report bottom==4.""" + tbl = cast( + CT_Tbl, + parse_xml( + '' + "" + "" + "" + "" + "" + "" + ), + ) + root_tc = tbl.tr_lst[0].tc_lst[0] + last_tc = tbl.tr_lst[3].tc_lst[0] + + # -- root sees the full span: 4 rows -- + assert root_tc.top == 0 + assert root_tc.bottom == 4 + # -- last continuation points back up to row 0 as its top -- + assert last_tc.top == 0 + assert last_tc.bottom == 4 + + def it_handles_vMerge_chain_to_last_row(self): + """vMerge chain that ends at the final row (no row below). + + Regression guard: ``bottom`` walks forward while there is another + continuation; when the current row is the last, it should return + ``_tr_idx + 1`` without trying to access a nonexistent row below. 
+ """ + tbl = cast( + CT_Tbl, + parse_xml( + '' + "" + "" + "" + "" + ), + ) + root_tc = tbl.tr_lst[0].tc_lst[0] + last_tc = tbl.tr_lst[1].tc_lst[0] + # -- root's bottom is just past the last continuation row -- + assert root_tc.bottom == 2 + # -- last continuation has no row below; bottom is its own row +1 -- + assert last_tc.bottom == 2 + + def it_resolves_bottom_across_gridBefore_rows(self): + """Regression for upstream#1458: ``cell._tc.bottom`` must not crash + when the row directly below has ``w:gridBefore`` or gridSpan cells + that shift the grid column of the continuation tc. + """ + tbl = cast( + CT_Tbl, + parse_xml( + '' + "" + # -- row 0 starts a vertical merge in the second grid column -- + "" + "" + "" + "" + # -- row 1 starts with gridBefore=1; its single tc sits at -- + # -- grid offset 1 and continues the vMerge. -- + "" + "" + "" + "" + "" + ), + ) + top_tc = tbl.tr_lst[0].tc_lst[1] + # -- bottom is one past the last continuation row -- + assert top_tc.bottom == 2 + + def it_grows_iteratively_for_large_merges(self): + """Regression for upstream#1208: ``_grow_to`` must not recurse per + row, because very tall merges exceed Python's recursion limit. + """ + import sys + + # -- a merge height larger than the default recursion limit -- + row_count = sys.getrecursionlimit() + 50 + rows_xml = "".join( + '' for _ in range(row_count) + ) + tbl = cast( + CT_Tbl, + parse_xml( + '' + "" + + rows_xml + + "" + ), + ) + top_tc = tbl.tr_lst[0].tc_lst[0] + + # -- merging all rows into a single span must not raise RecursionError -- + top_tc._grow_to(1, row_count) + + # -- and the root tc's bottom now spans the full table -- + assert top_tc.bottom == row_count + + def it_merges_cells_in_a_nested_table_without_crossing_tables(self): + """Regression for upstream#169: merging cells in a table that is + itself nested inside another table must not leak grid-col lookups + into the outer table's rows. 
+ """ + xml_str = ( + '' + "" + "" + "" + "" + # -- nested table inside the second outer cell -- + "" + "" + "" + "" + "" + "" + "" + "" + "" + "" + "" + "" + "" + ) + outer_tbl = cast(CT_Tbl, parse_xml(xml_str)) + # -- locate the inner tbl and merge its top-left to bottom-right -- + inner_tbl = cast(CT_Tbl, outer_tbl.xpath(".//w:tc//w:tbl")[0]) + inner_top_left = inner_tbl.tr_lst[0].tc_lst[0] + inner_bottom_right = inner_tbl.tr_lst[1].tc_lst[1] + + merged = inner_top_left.merge(inner_bottom_right) + + # -- merge returned the inner top-left, not anything from outer tbl -- + assert merged is inner_tbl.tr_lst[0].tc_lst[0] + # -- outer table rows are untouched -- + assert len(outer_tbl.tr_lst) == 2 + assert len(outer_tbl.tr_lst[0].tc_lst) == 2 + assert len(outer_tbl.tr_lst[1].tc_lst) == 2 + # -- inner table has a 2x2 merged span on the (now-only) top cell -- + assert merged.grid_span == 2 + assert merged.bottom == 2 + # fixtures ------------------------------------------------------- @pytest.fixture diff --git a/tests/oxml/test_theme.py b/tests/oxml/test_theme.py new file mode 100644 index 000000000..6a371ea91 --- /dev/null +++ b/tests/oxml/test_theme.py @@ -0,0 +1,170 @@ +# pyright: reportPrivateUsage=false + +"""Unit-test suite for `docx.oxml.theme` module.""" + +from __future__ import annotations + +from typing import cast + +import pytest + +from docx.oxml.theme import ( + CT_ClrScheme, + CT_ColorChoice, + CT_FontScheme, + CT_Theme, +) +from docx.shared import RGBColor + +from ..unitutil.cxml import element + + +class DescribeCT_Theme: + """Unit-test suite for `docx.oxml.theme.CT_Theme`.""" + + def it_exposes_its_name_attribute(self): + theme = cast(CT_Theme, element("a:theme{name=Office Theme}")) + assert theme.name == "Office Theme" + + def it_returns_None_for_missing_name_attribute(self): + theme = cast(CT_Theme, element("a:theme")) + assert theme.name is None + + def it_exposes_the_nested_clrScheme(self): + theme = cast( + CT_Theme, 
element("a:theme/a:themeElements/a:clrScheme") + ) + assert theme.clrScheme is not None + assert isinstance(theme.clrScheme, CT_ClrScheme) + + def it_returns_None_when_themeElements_is_absent(self): + theme = cast(CT_Theme, element("a:theme")) + assert theme.clrScheme is None + assert theme.fontScheme is None + + def it_returns_None_when_clrScheme_is_absent(self): + theme = cast(CT_Theme, element("a:theme/a:themeElements")) + assert theme.clrScheme is None + assert theme.fontScheme is None + + def it_exposes_the_nested_fontScheme(self): + theme = cast( + CT_Theme, element("a:theme/a:themeElements/a:fontScheme") + ) + assert theme.fontScheme is not None + assert isinstance(theme.fontScheme, CT_FontScheme) + + +class DescribeCT_ClrScheme: + """Unit-test suite for `docx.oxml.theme.CT_ClrScheme`.""" + + @pytest.mark.parametrize( + ("slot", "cxml"), + [ + ("dk1", "a:clrScheme/a:dk1/a:srgbClr{val=010203}"), + ("lt1", "a:clrScheme/a:lt1/a:srgbClr{val=F0E0D0}"), + ("accent1", "a:clrScheme/a:accent1/a:srgbClr{val=5B9BD5}"), + ("hlink", "a:clrScheme/a:hlink/a:srgbClr{val=0563C1}"), + ("folHlink", "a:clrScheme/a:folHlink/a:srgbClr{val=954F72}"), + ], + ) + def it_exposes_each_slot_as_a_color_choice(self, slot: str, cxml: str): + scheme = cast(CT_ClrScheme, element(cxml)) + choice = getattr(scheme, slot) + assert isinstance(choice, CT_ColorChoice) + + def it_returns_None_for_absent_slots(self): + scheme = cast(CT_ClrScheme, element("a:clrScheme")) + for slot in ( + "dk1", + "lt1", + "dk2", + "lt2", + "accent1", + "accent2", + "accent3", + "accent4", + "accent5", + "accent6", + "hlink", + "folHlink", + ): + assert getattr(scheme, slot) is None + + def it_looks_up_slots_by_name(self): + scheme = cast( + CT_ClrScheme, + element("a:clrScheme/a:accent1/a:srgbClr{val=5B9BD5}"), + ) + choice = scheme.color_for("accent1") + assert choice is not None + assert choice.rgb == RGBColor.from_string("5B9BD5") + + def it_returns_None_for_unknown_color_names(self): + scheme = 
cast(CT_ClrScheme, element("a:clrScheme")) + assert scheme.color_for("bogus") is None + + +class DescribeCT_ColorChoice: + """Unit-test suite for `docx.oxml.theme.CT_ColorChoice`.""" + + def it_resolves_an_srgbClr_directly(self): + dk2 = cast(CT_ColorChoice, element("a:dk2/a:srgbClr{val=44546A}")) + assert dk2.rgb == RGBColor.from_string("44546A") + + def it_resolves_a_sysClr_via_its_lastClr_fallback(self): + dk1 = cast( + CT_ColorChoice, + element("a:dk1/a:sysClr{val=windowText,lastClr=000000}"), + ) + assert dk1.rgb == RGBColor.from_string("000000") + + def it_prefers_srgbClr_over_sysClr_when_both_are_present(self): + # Schema-wise this is unusual, but defensively ensure the fast path + # wins so we don't silently return a stale lastClr. + dk1 = cast( + CT_ColorChoice, + element( + "a:dk1/(" + "a:srgbClr{val=123456}," + "a:sysClr{val=windowText,lastClr=FFFFFF}" + ")" + ), + ) + assert dk1.rgb == RGBColor.from_string("123456") + + def it_returns_None_when_sysClr_has_no_lastClr(self): + dk1 = cast(CT_ColorChoice, element("a:dk1/a:sysClr{val=windowText}")) + assert dk1.rgb is None + + def it_returns_None_when_no_supported_color_child_is_present(self): + dk1 = cast(CT_ColorChoice, element("a:dk1")) + assert dk1.rgb is None + + +class DescribeCT_FontScheme: + """Unit-test suite for `docx.oxml.theme.CT_FontScheme`.""" + + def it_exposes_majorFont_and_minorFont_and_name(self): + scheme = cast( + CT_FontScheme, + element( + "a:fontScheme{name=Office}/(" + "a:majorFont/a:latin{typeface=Calibri Light}," + "a:minorFont/a:latin{typeface=Calibri}" + ")" + ), + ) + assert scheme.name == "Office" + assert scheme.majorFont is not None + assert scheme.minorFont is not None + assert scheme.majorFont.latin is not None + assert scheme.majorFont.latin.typeface == "Calibri Light" + assert scheme.minorFont.latin is not None + assert scheme.minorFont.latin.typeface == "Calibri" + + def it_returns_None_for_missing_children(self): + scheme = cast(CT_FontScheme, element("a:fontScheme")) + 
assert scheme.majorFont is None + assert scheme.minorFont is None + assert scheme.name is None diff --git a/tests/oxml/test_tracked_changes.py b/tests/oxml/test_tracked_changes.py new file mode 100644 index 000000000..7ad2cbde2 --- /dev/null +++ b/tests/oxml/test_tracked_changes.py @@ -0,0 +1,386 @@ +# pyright: reportPrivateUsage=false + +"""Unit-test suite for `docx.oxml.tracked_changes` module.""" + +from __future__ import annotations + +import datetime as dt +from typing import cast + +import pytest + +from docx.oxml.tracked_changes import ( + CT_CellDel, + CT_CellIns, + CT_Del, + CT_DelText, + CT_Ins, + CT_MoveFrom, + CT_MoveTo, + CT_TblPrChange, + CT_TcPrChange, + CT_TrPrChange, +) + +from ..unitutil.cxml import element + + +class DescribeCT_Ins: + """Unit-test suite for `docx.oxml.tracked_changes.CT_Ins`.""" + + def it_knows_its_id(self): + ins = cast(CT_Ins, element("w:ins{w:id=1,w:author=Alice}")) + assert ins.id == 1 + + def it_knows_its_author(self): + ins = cast(CT_Ins, element("w:ins{w:id=1,w:author=Alice}")) + assert ins.author == "Alice" + + def it_knows_its_date(self): + ins = cast(CT_Ins, element("w:ins{w:id=1,w:author=Alice,w:date=2023-10-01T12:00:00Z}")) + assert ins.date == dt.datetime(2023, 10, 1, 12, 0, 0, tzinfo=dt.timezone.utc) + + def it_returns_None_when_date_is_absent(self): + ins = cast(CT_Ins, element("w:ins{w:id=1,w:author=Alice}")) + assert ins.date is None + + @pytest.mark.parametrize( + ("cxml", "expected_text"), + [ + ("w:ins{w:id=1,w:author=A}", ""), + ('w:ins{w:id=1,w:author=A}/w:r/w:t"hello"', "hello"), + ( + 'w:ins{w:id=1,w:author=A}/(w:r/w:t"hello ",w:r/w:t"world")', + "hello world", + ), + ], + ) + def it_can_produce_its_text(self, cxml: str, expected_text: str): + ins = cast(CT_Ins, element(cxml)) + assert ins.text == expected_text + + def it_provides_access_to_its_runs(self): + ins = cast(CT_Ins, element('w:ins{w:id=1,w:author=A}/(w:r/w:t"a",w:r/w:t"b")')) + assert len(ins.r_lst) == 2 + + +class DescribeCT_Del: + 
"""Unit-test suite for `docx.oxml.tracked_changes.CT_Del`.""" + + def it_knows_its_id(self): + del_elm = cast(CT_Del, element("w:del{w:id=2,w:author=Bob}")) + assert del_elm.id == 2 + + def it_knows_its_author(self): + del_elm = cast(CT_Del, element("w:del{w:id=2,w:author=Bob}")) + assert del_elm.author == "Bob" + + def it_knows_its_date(self): + del_elm = cast( + CT_Del, element("w:del{w:id=2,w:author=Bob,w:date=2023-11-15T09:30:00Z}") + ) + assert del_elm.date == dt.datetime(2023, 11, 15, 9, 30, 0, tzinfo=dt.timezone.utc) + + def it_returns_None_when_date_is_absent(self): + del_elm = cast(CT_Del, element("w:del{w:id=2,w:author=Bob}")) + assert del_elm.date is None + + @pytest.mark.parametrize( + ("cxml", "expected_text"), + [ + ("w:del{w:id=2,w:author=B}", ""), + ('w:del{w:id=2,w:author=B}/w:r/w:delText"removed"', "removed"), + ( + 'w:del{w:id=2,w:author=B}/(w:r/w:delText"foo ",w:r/w:delText"bar")', + "foo bar", + ), + ], + ) + def it_can_produce_its_text(self, cxml: str, expected_text: str): + del_elm = cast(CT_Del, element(cxml)) + assert del_elm.text == expected_text + + +class DescribeCT_DelText: + """Unit-test suite for `docx.oxml.tracked_changes.CT_DelText`.""" + + def it_can_report_its_text(self): + dt_elm = cast(CT_DelText, element('w:delText"some deleted text"')) + assert str(dt_elm) == "some deleted text" + + def it_returns_empty_string_when_no_content(self): + dt_elm = cast(CT_DelText, element("w:delText")) + assert str(dt_elm) == "" + + +class DescribeCT_Ins_acceptReject: + """Accept/reject behavior for ``.""" + + def it_unwraps_itself_on_accept_keeping_inserted_runs(self): + p = element( + 'w:p/(w:r/w:t"before",w:ins{w:id=1,w:author=A}/w:r/w:t"inserted",w:r/w:t"after")' + ) + ins = p.xpath("./w:ins")[0] + ins.accept() + assert p.xpath("./w:ins") == [] + assert [r.text for r in p.xpath("./w:r/w:t")] == ["before", "inserted", "after"] + + def it_removes_itself_on_reject_discarding_inserted_runs(self): + p = element( + 
'w:p/(w:r/w:t"before",w:ins{w:id=1,w:author=A}/w:r/w:t"inserted",w:r/w:t"after")' + ) + ins = p.xpath("./w:ins")[0] + ins.reject() + assert p.xpath("./w:ins") == [] + assert [r.text for r in p.xpath("./w:r/w:t")] == ["before", "after"] + + +class DescribeCT_Del_acceptReject: + """Accept/reject behavior for ``.""" + + def it_removes_itself_on_accept_discarding_deleted_content(self): + p = element( + 'w:p/(w:r/w:t"keep",w:del{w:id=2,w:author=B}/w:r/w:delText"gone")' + ) + del_ = p.xpath("./w:del")[0] + del_.accept() + assert p.xpath("./w:del") == [] + assert [r.text for r in p.xpath("./w:r/w:t")] == ["keep"] + + def it_restores_content_on_reject_converting_delText_to_t(self): + p = element( + 'w:p/(w:r/w:t"keep ",w:del{w:id=2,w:author=B}/w:r/w:delText"restore")' + ) + del_ = p.xpath("./w:del")[0] + del_.reject() + assert p.xpath("./w:del") == [] + assert p.xpath("./w:r/w:delText") == [] + # Both runs survive; their text values are "keep " and "restore" + texts = [t.text for t in p.xpath("./w:r/w:t")] + assert texts == ["keep ", "restore"] + + +class DescribeCT_MoveFrom: + """Unit-test suite for `docx.oxml.tracked_changes.CT_MoveFrom`.""" + + def it_knows_its_id(self): + mf = cast( + CT_MoveFrom, + element("w:moveFrom{w:id=1,w:author=Alice,w:name=m1}"), + ) + assert mf.id == 1 + + def it_knows_its_author(self): + mf = cast( + CT_MoveFrom, + element("w:moveFrom{w:id=1,w:author=Alice,w:name=m1}"), + ) + assert mf.author == "Alice" + + def it_knows_its_name(self): + mf = cast( + CT_MoveFrom, + element("w:moveFrom{w:id=1,w:author=Alice,w:name=m1}"), + ) + assert mf.name == "m1" + + def it_returns_None_when_name_is_absent(self): + mf = cast(CT_MoveFrom, element("w:moveFrom{w:id=1,w:author=A}")) + assert mf.name is None + + def it_can_produce_its_text_from_delText_children(self): + mf = cast( + CT_MoveFrom, + element( + 'w:moveFrom{w:id=1,w:author=A,w:name=m1}/w:r/w:delText"moved away"' + ), + ) + assert mf.text == "moved away" + + def 
it_is_recognized_as_CT_Del_for_polymorphism(self): + # -- CT_MoveFrom inherits from CT_Del so _resolve_all_changes's type + # -- dispatch treats them uniformly -- + mf = cast(CT_MoveFrom, element("w:moveFrom{w:id=1,w:author=A,w:name=m1}")) + assert isinstance(mf, CT_Del) + + +class DescribeCT_MoveTo: + """Unit-test suite for `docx.oxml.tracked_changes.CT_MoveTo`.""" + + def it_knows_its_id(self): + mt = cast(CT_MoveTo, element("w:moveTo{w:id=2,w:author=Bob,w:name=m1}")) + assert mt.id == 2 + + def it_knows_its_author(self): + mt = cast(CT_MoveTo, element("w:moveTo{w:id=2,w:author=Bob,w:name=m1}")) + assert mt.author == "Bob" + + def it_knows_its_name(self): + mt = cast(CT_MoveTo, element("w:moveTo{w:id=2,w:author=Bob,w:name=m1}")) + assert mt.name == "m1" + + def it_returns_None_when_name_is_absent(self): + mt = cast(CT_MoveTo, element("w:moveTo{w:id=2,w:author=B}")) + assert mt.name is None + + def it_can_produce_its_text_from_t_children(self): + mt = cast( + CT_MoveTo, + element('w:moveTo{w:id=2,w:author=B,w:name=m1}/w:r/w:t"moved here"'), + ) + assert mt.text == "moved here" + + def it_is_recognized_as_CT_Ins_for_polymorphism(self): + mt = cast(CT_MoveTo, element("w:moveTo{w:id=2,w:author=B,w:name=m1}")) + assert isinstance(mt, CT_Ins) + + +class DescribeCT_MoveFrom_acceptReject: + """Accept/reject behavior for ``.""" + + def it_removes_itself_on_accept_completing_the_move(self): + p = element( + 'w:p/(w:r/w:t"keep",' + 'w:moveFrom{w:id=1,w:author=A,w:name=m1}/w:r/w:delText"gone")' + ) + mf = p.xpath("./w:moveFrom")[0] + mf.accept() + assert p.xpath("./w:moveFrom") == [] + assert [r.text for r in p.xpath("./w:r/w:t")] == ["keep"] + + def it_restores_content_on_reject_converting_delText_to_t(self): + p = element( + 'w:p/(w:r/w:t"keep ",' + 'w:moveFrom{w:id=1,w:author=A,w:name=m1}/w:r/w:delText"restored")' + ) + mf = p.xpath("./w:moveFrom")[0] + mf.reject() + assert p.xpath("./w:moveFrom") == [] + assert p.xpath("./w:r/w:delText") == [] + assert [t.text for t in 
p.xpath("./w:r/w:t")] == ["keep ", "restored"] + + +class DescribeCT_MoveTo_acceptReject: + """Accept/reject behavior for ``.""" + + def it_unwraps_itself_on_accept_keeping_content(self): + p = element( + 'w:p/(w:r/w:t"before ",' + 'w:moveTo{w:id=2,w:author=B,w:name=m1}/w:r/w:t"moved")' + ) + mt = p.xpath("./w:moveTo")[0] + mt.accept() + assert p.xpath("./w:moveTo") == [] + assert [r.text for r in p.xpath("./w:r/w:t")] == ["before ", "moved"] + + def it_removes_itself_on_reject_cancelling_the_move(self): + p = element( + 'w:p/(w:r/w:t"before ",' + 'w:moveTo{w:id=2,w:author=B,w:name=m1}/w:r/w:t"moved")' + ) + mt = p.xpath("./w:moveTo")[0] + mt.reject() + assert p.xpath("./w:moveTo") == [] + assert [r.text for r in p.xpath("./w:r/w:t")] == ["before "] + + +class DescribeCT_CellIns: + """Unit-test suite for `docx.oxml.tracked_changes.CT_CellIns`.""" + + def it_knows_its_id(self): + ci = cast(CT_CellIns, element("w:cellIns{w:id=3,w:author=Alice}")) + assert ci.id == 3 + + def it_knows_its_author(self): + ci = cast(CT_CellIns, element("w:cellIns{w:id=3,w:author=Alice}")) + assert ci.author == "Alice" + + def it_knows_its_date(self): + ci = cast( + CT_CellIns, + element("w:cellIns{w:id=3,w:author=Alice,w:date=2024-01-02T03:04:05Z}"), + ) + assert ci.date == dt.datetime(2024, 1, 2, 3, 4, 5, tzinfo=dt.timezone.utc) + + def it_returns_None_when_date_is_absent(self): + ci = cast(CT_CellIns, element("w:cellIns{w:id=3,w:author=Alice}")) + assert ci.date is None + + +class DescribeCT_CellDel: + """Unit-test suite for `docx.oxml.tracked_changes.CT_CellDel`.""" + + def it_knows_its_id(self): + cd = cast(CT_CellDel, element("w:cellDel{w:id=4,w:author=Bob}")) + assert cd.id == 4 + + def it_knows_its_author(self): + cd = cast(CT_CellDel, element("w:cellDel{w:id=4,w:author=Bob}")) + assert cd.author == "Bob" + + def it_knows_its_date(self): + cd = cast( + CT_CellDel, + element("w:cellDel{w:id=4,w:author=Bob,w:date=2024-06-07T08:09:10Z}"), + ) + assert cd.date == dt.datetime(2024, 6, 
7, 8, 9, 10, tzinfo=dt.timezone.utc) + + +class DescribeCT_TcPrChange: + """Unit-test suite for `docx.oxml.tracked_changes.CT_TcPrChange`.""" + + def it_knows_its_id(self): + tpc = cast(CT_TcPrChange, element("w:tcPrChange{w:id=5,w:author=Carol}")) + assert tpc.id == 5 + + def it_knows_its_author(self): + tpc = cast(CT_TcPrChange, element("w:tcPrChange{w:id=5,w:author=Carol}")) + assert tpc.author == "Carol" + + def it_exposes_its_inner_tcPr(self): + tpc = cast( + CT_TcPrChange, + element( + "w:tcPrChange{w:id=5,w:author=C}/w:tcPr/w:vAlign{w:val=center}" + ), + ) + assert tpc.tcPr is not None + assert tpc.tcPr.xpath("./w:vAlign") + + def it_returns_None_for_tcPr_when_absent(self): + tpc = cast(CT_TcPrChange, element("w:tcPrChange{w:id=5,w:author=C}")) + assert tpc.tcPr is None + + +class DescribeCT_TrPrChange: + """Unit-test suite for `docx.oxml.tracked_changes.CT_TrPrChange`.""" + + def it_knows_its_id(self): + rpc = cast(CT_TrPrChange, element("w:trPrChange{w:id=6,w:author=Dave}")) + assert rpc.id == 6 + + def it_exposes_its_inner_trPr(self): + rpc = cast( + CT_TrPrChange, + element("w:trPrChange{w:id=6,w:author=D}/w:trPr/w:cantSplit"), + ) + assert rpc.trPr is not None + assert rpc.trPr.xpath("./w:cantSplit") + + +class DescribeCT_TblPrChange: + """Unit-test suite for `docx.oxml.tracked_changes.CT_TblPrChange`.""" + + def it_knows_its_id(self): + tpc = cast(CT_TblPrChange, element("w:tblPrChange{w:id=7,w:author=Eve}")) + assert tpc.id == 7 + + def it_exposes_its_inner_tblPr(self): + tpc = cast( + CT_TblPrChange, + element( + "w:tblPrChange{w:id=7,w:author=E}/w:tblPr/w:tblW{w:w=5000,w:type=dxa}" + ), + ) + assert tpc.tblPr is not None + assert tpc.tblPr.xpath("./w:tblW") diff --git a/tests/oxml/test_web_settings.py b/tests/oxml/test_web_settings.py new file mode 100644 index 000000000..4044d6e82 --- /dev/null +++ b/tests/oxml/test_web_settings.py @@ -0,0 +1,133 @@ +# pyright: reportPrivateUsage=false + +"""Unit-test suite for `docx.oxml.web_settings` module.""" + 
+from __future__ import annotations + +from typing import cast + +import pytest + +from docx.oxml.web_settings import CT_WebSettings + +from ..unitutil.cxml import element, xml + + +class DescribeCT_WebSettings: + """Unit-test suite for `docx.oxml.web_settings.CT_WebSettings`.""" + + @pytest.mark.parametrize( + ("cxml", "expected_value"), + [ + ("w:webSettings", None), + ("w:webSettings/w:encoding", None), + ("w:webSettings/w:encoding{w:val=utf-8}", "utf-8"), + ("w:webSettings/w:encoding{w:val=windows-1252}", "windows-1252"), + ], + ) + def it_can_get_the_encoding_val(self, cxml: str, expected_value: str | None): + web_settings = cast(CT_WebSettings, element(cxml)) + assert web_settings.encoding_val == expected_value + + @pytest.mark.parametrize( + ("cxml", "expected_value"), + [ + ("w:webSettings", False), + ("w:webSettings/w:optimizeForBrowser", True), + ("w:webSettings/w:optimizeForBrowser{w:val=0}", False), + ("w:webSettings/w:optimizeForBrowser{w:val=1}", True), + ("w:webSettings/w:optimizeForBrowser{w:val=true}", True), + ], + ) + def it_can_get_the_optimizeForBrowser_val(self, cxml: str, expected_value: bool): + web_settings = cast(CT_WebSettings, element(cxml)) + assert web_settings.optimizeForBrowser_val is expected_value + + @pytest.mark.parametrize( + ("cxml", "new_value", "expected_cxml"), + [ + ("w:webSettings", True, "w:webSettings/w:optimizeForBrowser"), + ("w:webSettings/w:optimizeForBrowser", False, "w:webSettings"), + ("w:webSettings/w:optimizeForBrowser{w:val=0}", True, + "w:webSettings/w:optimizeForBrowser"), + ("w:webSettings/w:optimizeForBrowser", None, "w:webSettings"), + ], + ) + def it_can_set_the_optimizeForBrowser_val( + self, cxml: str, new_value: bool | None, expected_cxml: str + ): + web_settings = cast(CT_WebSettings, element(cxml)) + web_settings.optimizeForBrowser_val = new_value + assert web_settings.xml == xml(expected_cxml) + + @pytest.mark.parametrize( + ("cxml", "expected_value"), + [ + ("w:webSettings", False), + 
("w:webSettings/w:allowPNG", True), + ("w:webSettings/w:allowPNG{w:val=0}", False), + ("w:webSettings/w:allowPNG{w:val=true}", True), + ], + ) + def it_can_get_the_allowPNG_val(self, cxml: str, expected_value: bool): + web_settings = cast(CT_WebSettings, element(cxml)) + assert web_settings.allowPNG_val is expected_value + + @pytest.mark.parametrize( + ("cxml", "new_value", "expected_cxml"), + [ + ("w:webSettings", True, "w:webSettings/w:allowPNG"), + ("w:webSettings/w:allowPNG", False, "w:webSettings"), + ("w:webSettings/w:allowPNG{w:val=0}", True, "w:webSettings/w:allowPNG"), + ], + ) + def it_can_set_the_allowPNG_val( + self, cxml: str, new_value: bool, expected_cxml: str + ): + web_settings = cast(CT_WebSettings, element(cxml)) + web_settings.allowPNG_val = new_value + assert web_settings.xml == xml(expected_cxml) + + @pytest.mark.parametrize( + ("cxml", "expected_value"), + [ + ("w:webSettings", False), + ("w:webSettings/w:doNotSaveAsSingleFile", True), + ("w:webSettings/w:doNotSaveAsSingleFile{w:val=0}", False), + ], + ) + def it_can_get_the_doNotSaveAsSingleFile_val( + self, cxml: str, expected_value: bool + ): + web_settings = cast(CT_WebSettings, element(cxml)) + assert web_settings.doNotSaveAsSingleFile_val is expected_value + + @pytest.mark.parametrize( + ("cxml", "new_value", "expected_cxml"), + [ + ("w:webSettings", True, "w:webSettings/w:doNotSaveAsSingleFile"), + ("w:webSettings/w:doNotSaveAsSingleFile", False, "w:webSettings"), + ], + ) + def it_can_set_the_doNotSaveAsSingleFile_val( + self, cxml: str, new_value: bool, expected_cxml: str + ): + web_settings = cast(CT_WebSettings, element(cxml)) + web_settings.doNotSaveAsSingleFile_val = new_value + assert web_settings.xml == xml(expected_cxml) + + def it_preserves_child_order_when_adding_children(self): + """Setters must insert children in the schema-prescribed order.""" + web_settings = cast(CT_WebSettings, element("w:webSettings")) + # -- add in reverse-schema order to exercise the successors 
lists -- + web_settings.doNotSaveAsSingleFile_val = True + web_settings.allowPNG_val = True + web_settings.optimizeForBrowser_val = True + expected = xml( + "w:webSettings/(" + "w:optimizeForBrowser," + "w:allowPNG," + "w:doNotSaveAsSingleFile" + ")" + ) + assert web_settings.xml == expected diff --git a/tests/oxml/test_xmlchemy.py b/tests/oxml/test_xmlchemy.py index 76b53c957..945202d7b 100644 --- a/tests/oxml/test_xmlchemy.py +++ b/tests/oxml/test_xmlchemy.py @@ -25,6 +25,40 @@ class DescribeBaseOxmlElement: + def it_tolerates_dir_on_CT_subclass_and_instance(self): + # -- upstream-PR#1220 / upstream#1433: dir(CT_*) or dir(tc) must not raise + # -- TypeError from the instance-dict descriptor binding. + from docx.oxml.parser import OxmlElement + from docx.oxml.table import CT_Tc + + assert dir(CT_Tc) + assert dir(OxmlElement("w:tc")) + + def it_keeps_the_instance_dict_descriptor_off_BaseOxmlElement(self): + # -- upstream-PR#1220 / upstream#1433: ensure `BaseOxmlElement` itself + # -- does NOT own a `__dict__` getset-descriptor (it must instead + # -- inherit via `_OxmlElementBase`). Without this indirection, IDE + # -- debuggers walking `type(cls).__mro__` hit the auto-generated + # -- descriptor and raise:: + # -- + # -- TypeError: descriptor '__dict__' for 'BaseOxmlElement' objects + # -- doesn't apply to a 'MetaOxmlElement' object + from docx.oxml.xmlchemy import BaseOxmlElement, _OxmlElementBase + + assert "__dict__" not in BaseOxmlElement.__dict__ + assert "__dict__" in _OxmlElementBase.__dict__ + + def it_resolves_instance_dict_descriptor_on_a_CT_instance(self): + # -- upstream-PR#1220 / upstream#1433: invoking the `__dict__` + # -- getset-descriptor on a CT_* instance must still produce a dict. 
+ from docx.oxml.parser import OxmlElement + from docx.oxml.xmlchemy import _OxmlElementBase + + tc = OxmlElement("w:tc") + descr = _OxmlElementBase.__dict__["__dict__"] + + assert isinstance(descr.__get__(tc, type(tc)), dict) + def it_can_find_the_first_of_its_children_named_in_a_sequence(self, first_fixture): element, tagnames, matching_child = first_fixture assert element.first_child_found_in(*tagnames) is matching_child @@ -39,6 +73,50 @@ def it_can_remove_all_children_with_name_in_sequence(self, remove_fixture): element.remove_all(*tagnames) assert element.xml == expected_xml + def it_accepts_custom_namespaces_on_xpath(self): + # -- upstream-PR#622: `xpath(namespaces=...)` merges extra prefixes on + # -- top of the built-in OOXML nsmap so callers can resolve custom + # -- XML parts without re-declaring every standard prefix. + from docx.oxml.ns import nsdecls + + xml = ( + '' + '' + "" % nsdecls("w") + ).encode() + root = parse_xml(xml) + + hits = root.xpath( + ".//custom:tag", namespaces={"custom": "urn:example:custom"} + ) + + assert len(hits) == 1 + assert hits[0].get(qn("w:val")) == "hello" + + def it_can_compile_and_cache_xpath_expressions(self): + # -- upstream#342: repeated calls should reuse the compiled xpath. + from docx.oxml.xmlchemy import _XP + + expr_a = _XP(".//w:p", {"w": "http://example"}) + expr_b = _XP(".//w:p", {"w": "http://example"}) + assert expr_a is expr_b, "xpath cache must return identical compiled objects" + + def it_returns_empty_hits_when_namespace_is_unmapped(self): + # -- Without a custom namespace mapping, a prefix that isn't in the + # -- built-in nsmap raises or returns empty; with a custom mapping, it + # -- resolves correctly. 
+ from docx.oxml.ns import nsdecls + + xml = ( + '' + '' + "" % nsdecls("w") + ).encode() + root = parse_xml(xml) + + hits = root.xpath(".//zz:marker", namespaces={"zz": "urn:zzz"}) + assert len(hits) == 1 + # fixtures --------------------------------------------- @pytest.fixture( diff --git a/tests/oxml/text/test_hyperlink.py b/tests/oxml/text/test_hyperlink.py index f5cec4761..ec2a289ca 100644 --- a/tests/oxml/text/test_hyperlink.py +++ b/tests/oxml/text/test_hyperlink.py @@ -43,3 +43,11 @@ def it_has_zero_or_more_runs_containing_the_hyperlink_text(self): assert [type(r) for r in rs] == [CT_R, CT_R] assert rs[0].text == "blog" assert rs[1].text == " post" + + def it_can_add_a_run(self): + hyperlink = cast(CT_Hyperlink, element("w:hyperlink")) + + r = hyperlink.add_r() + + assert isinstance(r, CT_R) + assert len(hyperlink.r_lst) == 1 diff --git a/tests/oxml/text/test_paragraph.py b/tests/oxml/text/test_paragraph.py new file mode 100644 index 000000000..5b1712a71 --- /dev/null +++ b/tests/oxml/text/test_paragraph.py @@ -0,0 +1,309 @@ +"""Test suite for the docx.oxml.text.paragraph module.""" + +from typing import cast + +from docx.oxml.ns import qn +from docx.oxml.parser import OxmlElement +from docx.oxml.text.paragraph import CT_P + +from ...unitutil.cxml import element + + +class DescribeCT_P: + """Unit-test suite for the CT_P () element.""" + + def it_can_add_an_external_hyperlink(self): + p = cast(CT_P, element("w:p")) + + hyperlink = p.add_hyperlink(rId="rId7", anchor=None, text="Click", rPr=None) + + assert hyperlink.rId == "rId7" + assert hyperlink.anchor is None + assert hyperlink.history is True + rs = hyperlink.r_lst + assert len(rs) == 1 + assert rs[0].text == "Click" + assert rs[0].rPr is None + + def it_can_add_an_internal_hyperlink(self): + p = cast(CT_P, element("w:p")) + + hyperlink = p.add_hyperlink(rId=None, anchor="bookmark1", text="Go", rPr=None) + + assert hyperlink.rId is None + assert hyperlink.anchor == "bookmark1" + assert hyperlink.history 
is True + assert hyperlink.r_lst[0].text == "Go" + + def it_can_add_a_hyperlink_with_rPr(self): + p = cast(CT_P, element("w:p")) + rPr = OxmlElement("w:rPr") + rStyle = OxmlElement("w:rStyle") + rStyle.set(qn("w:val"), "Hyperlink") + rPr.append(rStyle) + + hyperlink = p.add_hyperlink(rId="rId1", anchor=None, text="Link", rPr=rPr) + + r = hyperlink.r_lst[0] + assert r.rPr is not None + rStyle_elem = r.rPr.find(qn("w:rStyle")) + assert rStyle_elem is not None + assert rStyle_elem.get(qn("w:val")) == "Hyperlink" + + def it_appends_the_hyperlink_as_the_last_child(self): + p = cast(CT_P, element('w:p/w:r/w:t"existing"')) + + p.add_hyperlink(rId="rId1", anchor=None, text="Link", rPr=None) + + children = list(p) + assert children[-1].tag == qn("w:hyperlink") + + +class DescribeCT_P_SmartTagTransparency: + """`CT_P` descends transparently through `w:smartTag` and `w:customXml`. + + See upstream issues #932 and #225 — runs wrapped in smart-tag markup used + to be silently dropped from `Paragraph.runs` and `CT_P.text`. + """ + + def it_includes_smartTag_wrapped_run_text_in_paragraph_text(self): + from docx.oxml.parser import parse_xml + + xml = ( + b'' + b'Hello ' + b'smart' + b'!' + b'' + ) + p = cast(CT_P, parse_xml(xml)) + assert p.text == "Hello smart!" + + def it_includes_customXml_wrapped_run_text_in_paragraph_text(self): + from docx.oxml.parser import parse_xml + + xml = ( + b'' + b'Hello ' + b'custom' + b'!' + b'' + ) + p = cast(CT_P, parse_xml(xml)) + assert p.text == "Hello custom!" 
+ + def it_yields_smartTag_wrapped_runs_from_iter_r_elements(self): + from docx.oxml.parser import parse_xml + + xml = ( + b'' + b'a' + b'bc' + b'd' + b'' + ) + p = cast(CT_P, parse_xml(xml)) + rs = list(p.iter_r_elements()) + assert [r.text for r in rs] == ["a", "b", "c", "d"] + + def it_descends_recursively_through_nested_smartTags(self): + from docx.oxml.parser import parse_xml + + xml = ( + b'' + b'' + b'nested' + b'' + b'' + ) + p = cast(CT_P, parse_xml(xml)) + assert p.text == "nested" + assert len(list(p.iter_r_elements())) == 1 + + +class DescribeCT_P_TransparentWrapperExpansion: + """Phase A-v2 #1: descend w:sdt/mc:AlternateContent/w:ins/w:moveTo. + + See upstream #1327, #1389, #335, PR#1538, PR#734. + """ + + def it_excludes_w_ins_content_from_paragraph_text(self): + """Tracked insertions are NOT visible in final-view paragraph text. + + Their content is reachable via revision_marks_text() / tracked_changes. + """ + from docx.oxml.parser import parse_xml + + xml = ( + b'' + b'before ' + b'' + b'inserted' + b'' + b' after' + b'' + ) + p = cast(CT_P, parse_xml(xml)) + assert p.text == "before after" + + def it_excludes_w_moveTo_content_from_paragraph_text(self): + """Tracked move-destinations are NOT visible in final-view paragraph text.""" + from docx.oxml.parser import parse_xml + + xml = ( + b'' + b'X ' + b'' + b'moved' + b'' + b'' + ) + p = cast(CT_P, parse_xml(xml)) + assert p.text == "X " + + def it_excludes_ins_and_moveTo_runs_from_iter_r_elements(self): + from docx.oxml.parser import parse_xml + + xml = ( + b'' + b'plain' + b'' + b'ins1' + b'' + b'' + b'mv' + b'' + b'' + ) + p = cast(CT_P, parse_xml(xml)) + rs = list(p.iter_r_elements()) + assert [r.text for r in rs] == ["plain"] + + def it_descends_mc_AlternateContent_preferring_Choice(self): + from docx.oxml.parser import parse_xml + + xml = ( + b'' + b'' + b'choice' + b'fallback' + b'' + b'' + ) + p = cast(CT_P, parse_xml(xml)) + assert p.text == "choice" + + def 
it_falls_back_when_Choice_has_no_run_like_content(self): + from docx.oxml.parser import parse_xml + + xml = ( + b'' + b'' + b'' + b'fallback' + b'' + b'' + ) + p = cast(CT_P, parse_xml(xml)) + assert p.text == "fallback" + + def it_descends_sdt_text_for_paragraph_text(self): + from docx.oxml.parser import parse_xml + + xml = ( + b'' + b'before ' + b'' + b'' + b'sdt-text' + b'' + b' after' + b'' + ) + p = cast(CT_P, parse_xml(xml)) + assert p.text == "before sdt-text after" + + +class DescribeCT_P_AllRunsIterator: + """Phase A-v2 #2: iter_all_r_elements surfaces nested runs. + + See upstream #1370, #1021. + """ + + def it_yields_runs_inside_hyperlink(self): + from docx.oxml.parser import parse_xml + + xml = ( + b'' + b'link' + b'' + ) + p = cast(CT_P, parse_xml(xml)) + rs = list(p.iter_all_r_elements()) + assert [r.text for r in rs] == ["link"] + + def it_yields_runs_inside_fldSimple(self): + from docx.oxml.parser import parse_xml + + xml = ( + b'' + b'7' + b'' + ) + p = cast(CT_P, parse_xml(xml)) + rs = list(p.iter_all_r_elements()) + assert [r.text for r in rs] == ["7"] + + def it_yields_runs_inside_sdt_sdtContent(self): + from docx.oxml.parser import parse_xml + + xml = ( + b'' + b'' + b'' + b'inside' + b'' + b'' + ) + p = cast(CT_P, parse_xml(xml)) + rs = list(p.iter_all_r_elements()) + assert [r.text for r in rs] == ["inside"] + + def it_skips_field_code_only_runs_in_complex_fields(self): + from docx.oxml.parser import parse_xml + + xml = ( + b'' + b'' + b' PAGE ' + b'' + b'42' + b'' + b'' + ) + p = cast(CT_P, parse_xml(xml)) + texts = [r.text for r in p.iter_all_r_elements()] + # -- fldChar runs carry no visible text (their .text is "") but the + # -- instrText run (the field *code*) must NOT appear, only "42" does. 
+ assert "42" in texts + assert " PAGE " not in texts diff --git a/tests/oxml/text/test_parfmt.py b/tests/oxml/text/test_parfmt.py new file mode 100644 index 000000000..008448048 --- /dev/null +++ b/tests/oxml/text/test_parfmt.py @@ -0,0 +1,176 @@ +"""Test suite for the docx.oxml.text.parfmt module (text-frame focus).""" + +from typing import cast + +import pytest + +from docx.enum.text import ( + WD_FRAME_DROP_CAP, + WD_FRAME_H_ALIGN, + WD_FRAME_H_ANCHOR, + WD_FRAME_V_ALIGN, + WD_FRAME_V_ANCHOR, + WD_FRAME_WRAP, +) +from docx.oxml.text.parfmt import CT_FramePr, CT_PPr +from docx.shared import Twips + +from ...unitutil.cxml import element, xml + + +class DescribeCT_PPr: + """Unit-test suite for CT_PPr's ``w:framePr`` child.""" + + def it_exposes_framePr_when_present(self): + pPr = cast(CT_PPr, element("w:pPr/w:framePr")) + assert pPr.framePr is not None + + def it_returns_None_for_framePr_when_absent(self): + pPr = cast(CT_PPr, element("w:pPr")) + assert pPr.framePr is None + + def it_can_add_framePr_when_absent(self): + pPr = cast(CT_PPr, element("w:pPr")) + framePr = pPr.get_or_add_framePr() + assert framePr is not None + assert pPr.framePr is framePr + + def it_can_remove_framePr(self): + pPr = cast(CT_PPr, element("w:pPr/w:framePr")) + pPr._remove_framePr() + assert pPr.framePr is None + + def it_inserts_framePr_in_correct_schema_position(self): + # framePr must come before widowControl per the CT_PPrBase schema sequence.
+ pPr = cast(CT_PPr, element("w:pPr/(w:pageBreakBefore,w:widowControl)")) + pPr.get_or_add_framePr() + tags = [child.tag for child in pPr.iterchildren()] + assert tags == [ + "{http://schemas.openxmlformats.org/wordprocessingml/2006/main}pageBreakBefore", + "{http://schemas.openxmlformats.org/wordprocessingml/2006/main}framePr", + "{http://schemas.openxmlformats.org/wordprocessingml/2006/main}widowControl", + ] + + +class DescribeCT_PPr_bidi: + """Unit-test suite for `CT_PPr.bidi_val`.""" + + def it_returns_False_when_no_bidi_child(self): + pPr = cast(CT_PPr, element("w:pPr")) + assert pPr.bidi_val is False + + @pytest.mark.parametrize( + ("pPr_cxml", "expected_value"), + [ + ("w:pPr/w:bidi", True), + ("w:pPr/w:bidi{w:val=1}", True), + ("w:pPr/w:bidi{w:val=true}", True), + ("w:pPr/w:bidi{w:val=on}", True), + ("w:pPr/w:bidi{w:val=0}", False), + ("w:pPr/w:bidi{w:val=false}", False), + ("w:pPr/w:bidi{w:val=off}", False), + ], + ) + def it_knows_its_bidi_val(self, pPr_cxml: str, expected_value: bool): + pPr = cast(CT_PPr, element(pPr_cxml)) + assert pPr.bidi_val is expected_value + + @pytest.mark.parametrize( + ("pPr_cxml", "value", "expected_cxml"), + [ + ("w:pPr", True, "w:pPr/w:bidi"), + ("w:pPr/w:bidi", False, "w:pPr"), + ("w:pPr/w:bidi", None, "w:pPr"), + ("w:pPr/w:bidi{w:val=off}", True, "w:pPr/w:bidi"), + ("w:pPr", False, "w:pPr"), + ], + ) + def it_can_change_its_bidi_val( + self, pPr_cxml: str, value: bool | None, expected_cxml: str + ): + pPr = cast(CT_PPr, element(pPr_cxml)) + pPr.bidi_val = value + assert pPr.xml == xml(expected_cxml) + + def it_inserts_bidi_in_the_right_position(self): + # w:bidi must come after w:autoSpaceDN and before w:spacing/w:ind/w:jc + pPr = cast( + CT_PPr, + element("w:pPr/(w:pStyle{w:val=Foo},w:spacing,w:ind,w:jc{w:val=center})"), + ) + pPr.get_or_add_bidi() + tags = [child.tag.rsplit("}", 1)[-1] for child in pPr.iterchildren()] + assert tags == ["pStyle", "bidi", "spacing", "ind", "jc"] + + +class DescribeCT_FramePr: + 
"""Unit-test suite for the CT_FramePr () element.""" + + def it_reads_twips_width_and_height(self): + framePr = cast( + CT_FramePr, element("w:framePr{w:w=1440,w:h=2880}") + ) + assert framePr.w == Twips(1440) + assert framePr.h == Twips(2880) + + def it_reads_signed_position_attributes(self): + framePr = cast( + CT_FramePr, element("w:framePr{w:x=720,w:y=-360}") + ) + assert framePr.x == Twips(720) + assert framePr.y == Twips(-360) + + @pytest.mark.parametrize( + ("attr", "xml_val", "enum_val"), + [ + ("hAnchor", "text", WD_FRAME_H_ANCHOR.TEXT), + ("hAnchor", "margin", WD_FRAME_H_ANCHOR.MARGIN), + ("hAnchor", "page", WD_FRAME_H_ANCHOR.PAGE), + ("vAnchor", "text", WD_FRAME_V_ANCHOR.TEXT), + ("vAnchor", "margin", WD_FRAME_V_ANCHOR.MARGIN), + ("vAnchor", "page", WD_FRAME_V_ANCHOR.PAGE), + ("wrap", "auto", WD_FRAME_WRAP.AUTO), + ("wrap", "notBeside", WD_FRAME_WRAP.NOT_BESIDE), + ("wrap", "around", WD_FRAME_WRAP.AROUND), + ("wrap", "none", WD_FRAME_WRAP.NONE), + ("wrap", "tight", WD_FRAME_WRAP.TIGHT), + ("wrap", "through", WD_FRAME_WRAP.THROUGH), + ("dropCap", "none", WD_FRAME_DROP_CAP.NONE), + ("dropCap", "drop", WD_FRAME_DROP_CAP.DROP), + ("dropCap", "margin", WD_FRAME_DROP_CAP.MARGIN), + ("xAlign", "left", WD_FRAME_H_ALIGN.LEFT), + ("xAlign", "center", WD_FRAME_H_ALIGN.CENTER), + ("xAlign", "right", WD_FRAME_H_ALIGN.RIGHT), + ("xAlign", "inside", WD_FRAME_H_ALIGN.INSIDE), + ("xAlign", "outside", WD_FRAME_H_ALIGN.OUTSIDE), + ("yAlign", "inline", WD_FRAME_V_ALIGN.INLINE), + ("yAlign", "top", WD_FRAME_V_ALIGN.TOP), + ("yAlign", "center", WD_FRAME_V_ALIGN.CENTER), + ("yAlign", "bottom", WD_FRAME_V_ALIGN.BOTTOM), + ("yAlign", "inside", WD_FRAME_V_ALIGN.INSIDE), + ("yAlign", "outside", WD_FRAME_V_ALIGN.OUTSIDE), + ], + ) + def it_reads_enum_attributes(self, attr, xml_val, enum_val): + framePr = cast( + CT_FramePr, element(f"w:framePr{{w:{attr}={xml_val}}}") + ) + assert getattr(framePr, attr) == enum_val + + def it_reads_lines_as_int(self): + framePr = cast(CT_FramePr, 
element("w:framePr{w:lines=3}")) + assert framePr.lines == 3 + + def it_returns_None_for_absent_attrs(self): + framePr = cast(CT_FramePr, element("w:framePr")) + assert framePr.w is None + assert framePr.h is None + assert framePr.x is None + assert framePr.y is None + assert framePr.hAnchor is None + assert framePr.vAnchor is None + assert framePr.wrap is None + assert framePr.dropCap is None + assert framePr.lines is None + assert framePr.xAlign is None + assert framePr.yAlign is None diff --git a/tests/oxml/text/test_run.py b/tests/oxml/text/test_run.py index 6aad7cd02..52de1ff6b 100644 --- a/tests/oxml/text/test_run.py +++ b/tests/oxml/text/test_run.py @@ -4,7 +4,8 @@ import pytest -from docx.oxml.text.run import CT_R +from docx.oxml.text.paragraph import CT_P +from docx.oxml.text.run import CT_R, CT_Sym from ...unitutil.cxml import element, xml @@ -39,3 +40,200 @@ def it_can_assemble_the_text_in_the_run(self): r = cast(CT_R, element(cxml)) assert r.text == "\n\n-\tfoobar\t" + + @pytest.mark.parametrize( + ("p_cxml", "offset", "expected_left_text", "expected_right_text"), + [ + # split in middle of text + ('w:p/w:r/w:t"foobar"', 3, "foo", "bar"), + # split at beginning — left run is empty + ('w:p/w:r/w:t"foobar"', 0, "", "foobar"), + # split at end — right run is empty + ('w:p/w:r/w:t"foobar"', 6, "foobar", ""), + # split run with formatting — both get rPr + ('w:p/w:r/(w:rPr/w:b,w:t"foobar")', 3, "foo", "bar"), + ], + ) + def it_can_split_at_a_character_offset( + self, + p_cxml: str, + offset: int, + expected_left_text: str, + expected_right_text: str, + ): + p = cast(CT_P, element(p_cxml)) + r = p.r_lst[0] + + new_r = r.split_run(offset) + + assert r.text == expected_left_text + assert new_r.text == expected_right_text + # -- new run is next sibling -- + assert r.getnext() is new_r + assert len(p.r_lst) == 2 + + def it_copies_rPr_to_the_new_run_on_split(self): + p = cast(CT_P, element('w:p/w:r/(w:rPr/(w:b,w:i),w:t"foobar")')) + r = p.r_lst[0] + + new_r = 
r.split_run(3) + + # -- both runs have bold+italic -- + assert r.rPr is not None + assert new_r.rPr is not None + assert r.rPr.xml == new_r.rPr.xml + # -- but they are distinct elements, not the same object -- + assert r.rPr is not new_r.rPr + + def it_splits_a_run_with_no_formatting(self): + p = cast(CT_P, element('w:p/w:r/w:t"hello"')) + r = p.r_lst[0] + + new_r = r.split_run(2) + + assert r.text == "he" + assert new_r.text == "llo" + assert r.rPr is None + assert new_r.rPr is None + + def it_raises_on_invalid_offset(self): + p = cast(CT_P, element('w:p/w:r/w:t"hello"')) + r = p.r_lst[0] + + with pytest.raises(ValueError, match="offset -1 out of range"): + r.split_run(-1) + with pytest.raises(ValueError, match="offset 6 out of range"): + r.split_run(6) + + def it_can_add_a_w_sym_child(self): + r = cast(CT_R, element("w:r")) + + sym = r.add_sym("F0E0", "Wingdings") + + assert r.xml == xml("w:r/w:sym{w:font=Wingdings,w:char=F0E0}") + assert isinstance(sym, CT_Sym) + assert sym.font == "Wingdings" + assert sym.char == "F0E0" + + def it_exposes_its_w_sym_children_via_sym_lst(self): + r = cast( + CT_R, + element( + "w:r/(w:sym{w:font=Wingdings,w:char=F0E0}," + "w:t\"x\"," + "w:sym{w:font=Symbol,w:char=0041})" + ), + ) + + syms = r.sym_lst + + assert [s.char for s in syms] == ["F0E0", "0041"] + assert [s.font for s in syms] == ["Wingdings", "Symbol"] + + def it_renders_w_sym_as_char_in_run_text(self): + """Closes upstream#1528 — ``w:sym`` contributes ``chr(@w:char)``.""" + r = cast( + CT_R, + element( + "w:r/(w:t\"a\",w:sym{w:font=Wingdings,w:char=F0E0},w:t\"b\")" + ), + ) + + assert r.text == "a" + chr(0xF0E0) + "b" + + +class DescribeCT_Sym: + """Unit-test suite for :class:`docx.oxml.text.run.CT_Sym`.""" + + def it_renders_as_the_derived_character(self): + sym = cast(CT_Sym, element("w:sym{w:font=Wingdings,w:char=F0E0}")) + assert str(sym) == chr(0xF0E0) + + def it_returns_empty_string_on_invalid_hex(self): + sym = cast(CT_Sym, 
element("w:sym{w:font=Wingdings,w:char=notHex}")) + assert str(sym) == "" + + +class DescribeCT_R_TextSetterPreservesReferences: + """Phase A-v2 #3: Run.text setter preserves reference-carrying children. + + See upstream #1519. Reassigning ``text`` must not silently delete + ``w:commentReference``, ``w:footnoteReference``, ``w:endnoteReference``, + ``w:fldChar`` marker, or ``w:instrText`` siblings. + """ + + def it_preserves_commentReference_on_text_reassignment(self): + from docx.oxml.ns import qn + from docx.oxml.parser import parse_xml + + xml_bytes = ( + b'' + b'old' + b'' + b'' + ) + r = cast(CT_R, parse_xml(xml_bytes)) + r.text = "new" + assert r.find(qn("w:commentReference")) is not None + assert r.text == "new" + + def it_preserves_footnoteReference_on_text_reassignment(self): + from docx.oxml.ns import qn + from docx.oxml.parser import parse_xml + + xml_bytes = ( + b'' + b'old' + b'' + b'' + ) + r = cast(CT_R, parse_xml(xml_bytes)) + r.text = "new" + assert r.find(qn("w:footnoteReference")) is not None + + def it_preserves_endnoteReference_on_text_reassignment(self): + from docx.oxml.ns import qn + from docx.oxml.parser import parse_xml + + xml_bytes = ( + b'' + b'old' + b'' + b'' + ) + r = cast(CT_R, parse_xml(xml_bytes)) + r.text = "new" + assert r.find(qn("w:endnoteReference")) is not None + + def it_preserves_fldChar_on_text_reassignment(self): + from docx.oxml.ns import qn + from docx.oxml.parser import parse_xml + + xml_bytes = ( + b'' + b'' + b'old' + b'' + ) + r = cast(CT_R, parse_xml(xml_bytes)) + r.text = "new" + assert r.find(qn("w:fldChar")) is not None + + def it_preserves_instrText_on_text_reassignment(self): + from docx.oxml.ns import qn + from docx.oxml.parser import parse_xml + + xml_bytes = ( + b'' + b' REF bookmark1 ' + b'old' + b'' + ) + r = cast(CT_R, parse_xml(xml_bytes)) + r.text = "new" + assert r.find(qn("w:instrText")) is not None diff --git a/tests/parts/test_chart.py b/tests/parts/test_chart.py new file mode 100644 index 
000000000..6888e9d4a --- /dev/null +++ b/tests/parts/test_chart.py @@ -0,0 +1,129 @@ +"""Unit test suite for the docx.parts.chart module.""" + +from __future__ import annotations + +import pytest + +from docx.chart import WD_CHART_TYPE +from docx.opc.constants import CONTENT_TYPE as CT +from docx.opc.constants import RELATIONSHIP_TYPE as RT +from docx.opc.packuri import PackURI +from docx.opc.part import PartFactory +from docx.oxml.chart import CT_ChartSpace +from docx.package import Package +from docx.parts.chart import ChartPart + +from ..unitutil.mock import FixtureRequest, Mock, instance_mock, method_mock + + +class DescribeChartPart: + """Unit test suite for `docx.parts.chart.ChartPart`.""" + + def it_is_used_by_the_part_loader_to_construct_a_chart_part( + self, package_: Mock, ChartPart_load_: Mock, chart_part_: Mock + ): + partname = PackURI("/word/charts/chart1.xml") + content_type = CT.DML_CHART + reltype = RT.CHART + blob = ( + b'' + ) + ChartPart_load_.return_value = chart_part_ + + part = PartFactory(partname, content_type, reltype, blob, package_) + + ChartPart_load_.assert_called_once_with( + partname, content_type, blob, package_ + ) + assert part is chart_part_ + + def it_constructs_a_new_bar_chart_part(self): + package = Package() + part = ChartPart.new( + package, + WD_CHART_TYPE.BAR, + ["a", "b", "c"], + {"Series 1": [1.0, 2.0, 3.0]}, + ) + assert isinstance(part, ChartPart) + assert part.content_type == CT.DML_CHART + assert part.partname.startswith("/word/charts/chart") + assert part.partname.endswith(".xml") + assert isinstance(part.chartSpace, CT_ChartSpace) + # -- round-trip the chart data via the element tree -- + chart = part.chartSpace.chart + assert chart is not None + plotArea = chart.plotArea + assert plotArea is not None + kind = plotArea.chart_kind_element + assert kind is not None + assert kind.tag.endswith("barChart") + + @pytest.mark.parametrize( + ("chart_type", "expected_tag"), + [ + (WD_CHART_TYPE.BAR, "barChart"), + 
(WD_CHART_TYPE.BAR_STACKED, "barChart"), + (WD_CHART_TYPE.COLUMN, "barChart"), + (WD_CHART_TYPE.COLUMN_STACKED, "barChart"), + (WD_CHART_TYPE.LINE, "lineChart"), + (WD_CHART_TYPE.PIE, "pieChart"), + ], + ) + def it_constructs_new_parts_for_each_supported_kind( + self, chart_type: WD_CHART_TYPE, expected_tag: str + ): + package = Package() + part = ChartPart.new( + package, chart_type, ["a", "b"], {"S": [1.0, 2.0]} + ) + kind = part.chartSpace.chart.plotArea.chart_kind_element # pyright: ignore + assert kind is not None + assert kind.tag.endswith(expected_tag) + + def it_rejects_mismatched_series_length(self): + package = Package() + with pytest.raises(ValueError, match="has 2 values but 3 categories"): + ChartPart.new( + package, + WD_CHART_TYPE.BAR, + ["a", "b", "c"], + {"S": [1.0, 2.0]}, + ) + + def it_rejects_unsupported_chart_type_for_create(self): + package = Package() + with pytest.raises(ValueError, match="unsupported chart_type"): + ChartPart.new( + package, + WD_CHART_TYPE.SCATTER, + ["a"], + {"S": [1.0]}, + ) + + def it_generates_sequential_partnames(self): + package = Package() + part1 = ChartPart.new( + package, WD_CHART_TYPE.BAR, ["a"], {"S": [1.0]} + ) + package.relate_to(part1, RT.CHART) + part2 = ChartPart.new( + package, WD_CHART_TYPE.BAR, ["a"], {"S": [1.0]} + ) + # -- partnames must be distinct so they don't collide in the package -- + assert part1.partname != part2.partname + + # -- fixtures --------------------------------------------------------- + + @pytest.fixture + def chart_part_(self, request: FixtureRequest) -> Mock: + return instance_mock(request, ChartPart) + + @pytest.fixture + def ChartPart_load_(self, request: FixtureRequest) -> Mock: + return method_mock(request, ChartPart, "load", autospec=False) + + @pytest.fixture + def package_(self, request: FixtureRequest) -> Mock: + return instance_mock(request, Package) diff --git a/tests/parts/test_custom_properties.py b/tests/parts/test_custom_properties.py new file mode 100644 index 
000000000..2f19a8e4c --- /dev/null +++ b/tests/parts/test_custom_properties.py @@ -0,0 +1,71 @@ +"""Unit test suite for the `docx.parts.custom_properties` module.""" + +from __future__ import annotations + +from typing import cast + +import pytest + +from docx.custom_properties import CustomProperties +from docx.opc.constants import CONTENT_TYPE as CT +from docx.opc.packuri import PackURI +from docx.oxml.custom_properties import CT_CustomProperties +from docx.oxml.parser import parse_xml +from docx.package import Package +from docx.parts.custom_properties import CustomPropertiesPart + +from ..unitutil.mock import FixtureRequest, Mock, class_mock, instance_mock + + +_EMPTY_PROPERTIES_XML = ( + b'' +) + + +class DescribeCustomPropertiesPart: + """Unit test suite for `docx.parts.custom_properties.CustomPropertiesPart`.""" + + def it_provides_access_to_its_custom_properties_collection( + self, CustomProperties_: Mock, custom_properties_: Mock, package_: Mock + ): + CustomProperties_.return_value = custom_properties_ + elm = cast(CT_CustomProperties, parse_xml(_EMPTY_PROPERTIES_XML)) + part = CustomPropertiesPart( + PackURI("/docProps/custom.xml"), CT.OFC_CUSTOM_PROPERTIES, elm, package_ + ) + + custom_properties = part.custom_properties + + CustomProperties_.assert_called_once_with(part.element, part) + assert custom_properties is custom_properties_ + + def it_constructs_a_default_custom_properties_part_to_help(self): + package = Package() + + part = CustomPropertiesPart.default(package) + + assert isinstance(part, CustomPropertiesPart) + assert part.partname == "/docProps/custom.xml" + assert part.content_type == CT.OFC_CUSTOM_PROPERTIES + assert part.package is package + assert part.element.tag == ( + "{http://schemas.openxmlformats.org/officeDocument/2006/custom-properties}" + "Properties" + ) + assert len(part.element) == 0 + + # -- fixtures -------------------------------------------------------------------------------- + + @pytest.fixture + def 
CustomProperties_(self, request: FixtureRequest) -> Mock: + return class_mock(request, "docx.parts.custom_properties.CustomProperties") + + @pytest.fixture + def custom_properties_(self, request: FixtureRequest) -> Mock: + return instance_mock(request, CustomProperties) + + @pytest.fixture + def package_(self, request: FixtureRequest) -> Mock: + return instance_mock(request, Package) diff --git a/tests/parts/test_document.py b/tests/parts/test_document.py index c27990baf..f83b2c916 100644 --- a/tests/parts/test_document.py +++ b/tests/parts/test_document.py @@ -5,6 +5,7 @@ import pytest from docx.comments import Comments +from docx.custom_properties import CustomProperties from docx.enum.style import WD_STYLE_TYPE from docx.opc.constants import CONTENT_TYPE as CT from docx.opc.constants import RELATIONSHIP_TYPE as RT @@ -12,11 +13,16 @@ from docx.opc.packuri import PackURI from docx.package import Package from docx.parts.comments import CommentsPart +from docx.parts.custom_properties import CustomPropertiesPart from docx.parts.document import DocumentPart +from docx.parts.font_table import FontTablePart +from docx.parts.footnotes import FootnotesPart +from docx.parts.glossary import GlossaryPart from docx.parts.hdrftr import FooterPart, HeaderPart from docx.parts.numbering import NumberingPart from docx.parts.settings import SettingsPart from docx.parts.styles import StylesPart +from docx.parts.theme import ThemePart from docx.settings import Settings from docx.styles.style import BaseStyle from docx.styles.styles import Styles @@ -109,7 +115,9 @@ def it_can_save_the_package_to_a_file(self, package_: Mock): document_part.save("foobar.docx") - package_.save.assert_called_once_with("foobar.docx") + package_.save.assert_called_once_with( + "foobar.docx", reproducible=False, password=None + ) def it_provides_access_to_the_comments_added_to_the_document( self, _comments_part_prop_: Mock, comments_part_: Mock, comments_: Mock, package_: Mock @@ -227,6 +235,171 @@ def 
it_can_get_the_id_of_a_style( styles_.get_style_id.assert_called_once_with(style_, WD_STYLE_TYPE.CHARACTER) assert style_id == "BodyCharacter" + def it_provides_access_to_its_font_table_when_a_part_is_related( + self, package_: Mock, part_related_by_: Mock, font_table_part_: Mock + ): + font_table_part_.font_table = "ft-sentinel" + part_related_by_.return_value = font_table_part_ + document_part = DocumentPart( + PackURI("/word/document.xml"), CT.WML_DOCUMENT, element("w:document"), package_ + ) + + assert document_part.font_table == "ft-sentinel" + part_related_by_.assert_called_once_with(document_part, RT.FONT_TABLE) + + def and_font_table_is_None_when_no_part_is_related( + self, package_: Mock, part_related_by_: Mock + ): + part_related_by_.side_effect = KeyError + document_part = DocumentPart( + PackURI("/word/document.xml"), CT.WML_DOCUMENT, element("w:document"), package_ + ) + + assert document_part.font_table is None + part_related_by_.assert_called_once_with(document_part, RT.FONT_TABLE) + + def it_provides_access_to_its_font_table_part_to_help( + self, package_: Mock, part_related_by_: Mock, font_table_part_: Mock + ): + part_related_by_.return_value = font_table_part_ + document_part = DocumentPart( + PackURI("/word/document.xml"), CT.WML_DOCUMENT, element("w:document"), package_ + ) + + assert document_part._font_table_part is font_table_part_ + part_related_by_.assert_called_once_with(document_part, RT.FONT_TABLE) + + def and_the_font_table_part_accessor_returns_None_when_not_present( + self, package_: Mock, part_related_by_: Mock + ): + part_related_by_.side_effect = KeyError + document_part = DocumentPart( + PackURI("/word/document.xml"), CT.WML_DOCUMENT, element("w:document"), package_ + ) + + assert document_part._font_table_part is None + + def it_provides_access_to_its_theme_when_a_part_is_related( + self, package_: Mock, part_related_by_: Mock, theme_part_: Mock + ): + theme_part_.theme = "theme-sentinel" + part_related_by_.return_value = 
theme_part_ + document_part = DocumentPart( + PackURI("/word/document.xml"), CT.WML_DOCUMENT, element("w:document"), package_ + ) + + assert document_part.theme == "theme-sentinel" + part_related_by_.assert_called_once_with(document_part, RT.THEME) + + def and_theme_is_None_when_no_theme_part_is_related( + self, package_: Mock, part_related_by_: Mock + ): + part_related_by_.side_effect = KeyError + document_part = DocumentPart( + PackURI("/word/document.xml"), CT.WML_DOCUMENT, element("w:document"), package_ + ) + + assert document_part.theme is None + part_related_by_.assert_called_once_with(document_part, RT.THEME) + + def it_provides_access_to_its_theme_part_to_help( + self, package_: Mock, part_related_by_: Mock, theme_part_: Mock + ): + part_related_by_.return_value = theme_part_ + document_part = DocumentPart( + PackURI("/word/document.xml"), CT.WML_DOCUMENT, element("w:document"), package_ + ) + + assert document_part._theme_part is theme_part_ + part_related_by_.assert_called_once_with(document_part, RT.THEME) + + def and_the_theme_part_accessor_returns_None_when_not_present( + self, package_: Mock, part_related_by_: Mock + ): + part_related_by_.side_effect = KeyError + document_part = DocumentPart( + PackURI("/word/document.xml"), CT.WML_DOCUMENT, element("w:document"), package_ + ) + + assert document_part._theme_part is None + + def it_provides_access_to_its_glossary_when_a_part_is_related( + self, package_: Mock, part_related_by_: Mock, glossary_part_: Mock + ): + glossary_part_.glossary = "glossary-sentinel" + part_related_by_.return_value = glossary_part_ + document_part = DocumentPart( + PackURI("/word/document.xml"), CT.WML_DOCUMENT, element("w:document"), package_ + ) + + assert document_part.glossary == "glossary-sentinel" + part_related_by_.assert_called_once_with(document_part, RT.GLOSSARY_DOCUMENT) + + def and_glossary_is_None_when_no_glossary_part_is_related( + self, package_: Mock, part_related_by_: Mock + ): + part_related_by_.side_effect = 
KeyError + document_part = DocumentPart( + PackURI("/word/document.xml"), CT.WML_DOCUMENT, element("w:document"), package_ + ) + + assert document_part.glossary is None + part_related_by_.assert_called_once_with(document_part, RT.GLOSSARY_DOCUMENT) + + def it_provides_access_to_its_glossary_part_to_help( + self, package_: Mock, part_related_by_: Mock, glossary_part_: Mock + ): + part_related_by_.return_value = glossary_part_ + document_part = DocumentPart( + PackURI("/word/document.xml"), CT.WML_DOCUMENT, element("w:document"), package_ + ) + + assert document_part._glossary_part is glossary_part_ + part_related_by_.assert_called_once_with(document_part, RT.GLOSSARY_DOCUMENT) + + def and_the_glossary_part_accessor_returns_None_when_not_present( + self, package_: Mock, part_related_by_: Mock + ): + part_related_by_.side_effect = KeyError + document_part = DocumentPart( + PackURI("/word/document.xml"), CT.WML_DOCUMENT, element("w:document"), package_ + ) + + assert document_part._glossary_part is None + + def it_provides_access_to_its_footnotes_part_to_help( + self, package_: Mock, part_related_by_: Mock, footnotes_part_: Mock + ): + part_related_by_.return_value = footnotes_part_ + document_part = DocumentPart( + PackURI("/word/document.xml"), CT.WML_DOCUMENT, element("w:document"), package_ + ) + + footnotes_part = document_part._footnotes_part + + part_related_by_.assert_called_once_with(document_part, RT.FOOTNOTES) + assert footnotes_part is footnotes_part_ + + def and_it_creates_a_default_footnotes_part_if_not_present( + self, + package_: Mock, + part_related_by_: Mock, + FootnotesPart_: Mock, + footnotes_part_: Mock, + relate_to_: Mock, + ): + part_related_by_.side_effect = KeyError + FootnotesPart_.default.return_value = footnotes_part_ + document_part = DocumentPart( + PackURI("/word/document.xml"), CT.WML_DOCUMENT, element("w:document"), package_ + ) + + footnotes_part = document_part._footnotes_part + + 
FootnotesPart_.default.assert_called_once_with(package_) + relate_to_.assert_called_once_with(document_part, footnotes_part_, RT.FOOTNOTES) + assert footnotes_part is footnotes_part_ + def it_provides_access_to_its_comments_part_to_help( self, package_: Mock, part_related_by_: Mock, comments_part_: Mock ): @@ -240,6 +413,56 @@ def it_provides_access_to_its_comments_part_to_help( part_related_by_.assert_called_once_with(document_part, RT.COMMENTS) assert comments_part is comments_part_ + def it_provides_access_to_its_custom_properties_part_to_help( + self, package_: Mock, part_related_by_: Mock, custom_properties_part_: Mock + ): + part_related_by_.return_value = custom_properties_part_ + document_part = DocumentPart( + PackURI("/word/document.xml"), CT.WML_DOCUMENT, element("w:document"), package_ + ) + + custom_properties_part = document_part._custom_properties_part + + part_related_by_.assert_called_once_with(document_part, RT.CUSTOM_PROPERTIES) + assert custom_properties_part is custom_properties_part_ + + def and_it_creates_a_default_custom_properties_part_if_not_present( + self, + package_: Mock, + part_related_by_: Mock, + CustomPropertiesPart_: Mock, + custom_properties_part_: Mock, + relate_to_: Mock, + ): + part_related_by_.side_effect = KeyError + CustomPropertiesPart_.default.return_value = custom_properties_part_ + document_part = DocumentPart( + PackURI("/word/document.xml"), CT.WML_DOCUMENT, element("w:document"), package_ + ) + + custom_properties_part = document_part._custom_properties_part + + CustomPropertiesPart_.default.assert_called_once_with(package_) + relate_to_.assert_called_once_with( + document_part, custom_properties_part_, RT.CUSTOM_PROPERTIES + ) + assert custom_properties_part is custom_properties_part_ + + def it_exposes_the_custom_properties_collection( + self, + _custom_properties_part_prop_: Mock, + custom_properties_part_: Mock, + custom_properties_: Mock, + package_: Mock, + ): + custom_properties_part_.custom_properties = 
custom_properties_ + _custom_properties_part_prop_.return_value = custom_properties_part_ + document_part = DocumentPart( + PackURI("/word/document.xml"), CT.WML_DOCUMENT, element("w:document"), package_ + ) + + assert document_part.custom_properties is custom_properties_ + def and_it_creates_a_default_comments_part_if_not_present( self, package_: Mock, @@ -348,10 +571,46 @@ def _comments_part_prop_(self, request: FixtureRequest) -> Mock: def core_properties_(self, request: FixtureRequest): return instance_mock(request, CoreProperties) + @pytest.fixture + def CustomPropertiesPart_(self, request: FixtureRequest) -> Mock: + return class_mock(request, "docx.parts.document.CustomPropertiesPart") + + @pytest.fixture + def custom_properties_(self, request: FixtureRequest) -> Mock: + return instance_mock(request, CustomProperties) + + @pytest.fixture + def custom_properties_part_(self, request: FixtureRequest) -> Mock: + return instance_mock(request, CustomPropertiesPart) + + @pytest.fixture + def _custom_properties_part_prop_(self, request: FixtureRequest) -> Mock: + return property_mock(request, DocumentPart, "_custom_properties_part") + @pytest.fixture def drop_rel_(self, request: FixtureRequest): return method_mock(request, DocumentPart, "drop_rel", autospec=True) + @pytest.fixture + def font_table_part_(self, request: FixtureRequest): + return instance_mock(request, FontTablePart) + + @pytest.fixture + def theme_part_(self, request: FixtureRequest): + return instance_mock(request, ThemePart) + + @pytest.fixture + def glossary_part_(self, request: FixtureRequest): + return instance_mock(request, GlossaryPart) + + @pytest.fixture + def FootnotesPart_(self, request: FixtureRequest): + return class_mock(request, "docx.parts.document.FootnotesPart") + + @pytest.fixture + def footnotes_part_(self, request: FixtureRequest): + return instance_mock(request, FootnotesPart) + @pytest.fixture def FooterPart_(self, request: FixtureRequest): return class_mock(request, 
"docx.parts.document.FooterPart") diff --git a/tests/parts/test_endnotes.py b/tests/parts/test_endnotes.py new file mode 100644 index 000000000..94b85894c --- /dev/null +++ b/tests/parts/test_endnotes.py @@ -0,0 +1,76 @@ +"""Unit test suite for the docx.parts.endnotes module.""" + +from __future__ import annotations + +from typing import cast + +import pytest + +from docx.opc.constants import CONTENT_TYPE as CT +from docx.opc.constants import RELATIONSHIP_TYPE as RT +from docx.opc.packuri import PackURI +from docx.opc.part import PartFactory +from docx.oxml.endnotes import CT_Endnotes +from docx.package import Package +from docx.parts.endnotes import EndnotesPart + +from ..unitutil.cxml import element +from ..unitutil.mock import FixtureRequest, Mock, instance_mock, method_mock + + +class DescribeEndnotesPart: + """Unit test suite for `docx.parts.endnotes.EndnotesPart` objects.""" + + def it_is_used_by_the_part_loader_to_construct_an_endnotes_part( + self, package_: Mock, EndnotesPart_load_: Mock, endnotes_part_: Mock + ): + partname = PackURI("/word/endnotes.xml") + content_type = CT.WML_ENDNOTES + reltype = RT.ENDNOTES + blob = b"" + EndnotesPart_load_.return_value = endnotes_part_ + + part = PartFactory(partname, content_type, reltype, blob, package_) + + EndnotesPart_load_.assert_called_once_with(partname, content_type, blob, package_) + assert part is endnotes_part_ + + def it_provides_access_to_its_endnotes_element(self, package_: Mock): + endnotes_elm = cast(CT_Endnotes, element("w:endnotes")) + endnotes_part = EndnotesPart( + PackURI("/word/endnotes.xml"), CT.WML_ENDNOTES, endnotes_elm, package_ + ) + + assert endnotes_part.endnotes_element is endnotes_elm + + def it_constructs_a_default_endnotes_part_to_help(self): + package = Package() + + endnotes_part = EndnotesPart.default(package) + + assert isinstance(endnotes_part, EndnotesPart) + assert endnotes_part.partname == "/word/endnotes.xml" + assert endnotes_part.content_type == CT.WML_ENDNOTES + assert 
endnotes_part.package is package + assert endnotes_part.element.tag == ( + "{http://schemas.openxmlformats.org/wordprocessingml/2006/main}endnotes" + ) + # default template has separator (id=0) and continuation separator (id=1) + endnote_elms = endnotes_part.element.xpath("./w:endnote") + assert len(endnote_elms) == 2 + assert endnote_elms[0].id == 0 + assert endnote_elms[1].id == 1 + + # -- fixtures -------------------------------------------------------------------------------- + + @pytest.fixture + def endnotes_part_(self, request: FixtureRequest) -> Mock: + return instance_mock(request, EndnotesPart) + + @pytest.fixture + def EndnotesPart_load_(self, request: FixtureRequest) -> Mock: + return method_mock(request, EndnotesPart, "load", autospec=False) + + @pytest.fixture + def package_(self, request: FixtureRequest) -> Mock: + return instance_mock(request, Package) diff --git a/tests/parts/test_font_table.py b/tests/parts/test_font_table.py new file mode 100644 index 000000000..d2637d014 --- /dev/null +++ b/tests/parts/test_font_table.py @@ -0,0 +1,84 @@ +"""Unit test suite for the `docx.parts.font_table` module.""" + +from __future__ import annotations + +from typing import cast + +import pytest + +from docx.font_table import FontTable +from docx.opc.constants import CONTENT_TYPE as CT +from docx.opc.constants import RELATIONSHIP_TYPE as RT +from docx.opc.packuri import PackURI +from docx.opc.part import PartFactory +from docx.oxml.font_table import CT_Fonts +from docx.package import Package +from docx.parts.font_table import FontTablePart + +from ..unitutil.cxml import element +from ..unitutil.mock import FixtureRequest, Mock, class_mock, instance_mock, method_mock + + +class DescribeFontTablePart: + """Unit test suite for `docx.parts.font_table.FontTablePart` objects.""" + + def it_is_used_by_the_part_loader_to_construct_a_font_table_part( + self, + package_: Mock, + FontTablePart_load_: Mock, + font_table_part_: Mock, + ): + partname = 
PackURI("/word/fontTable.xml") + content_type = CT.WML_FONT_TABLE + reltype = RT.FONT_TABLE + blob = b"" + FontTablePart_load_.return_value = font_table_part_ + + part = PartFactory(partname, content_type, reltype, blob, package_) + + FontTablePart_load_.assert_called_once_with(partname, content_type, blob, package_) + assert part is font_table_part_ + + def it_provides_access_to_its_font_table_collection( + self, FontTable_: Mock, font_table_: Mock, package_: Mock + ): + FontTable_.return_value = font_table_ + fonts_elm = cast(CT_Fonts, element("w:fonts")) + font_table_part = FontTablePart( + PackURI("/word/fontTable.xml"), CT.WML_FONT_TABLE, fonts_elm, package_ + ) + + font_table = font_table_part.font_table + + FontTable_.assert_called_once_with(fonts_elm, font_table_part) + assert font_table is font_table_ + + def it_exposes_its_font_table_element(self, package_: Mock): + fonts_elm = cast(CT_Fonts, element("w:fonts")) + font_table_part = FontTablePart( + PackURI("/word/fontTable.xml"), CT.WML_FONT_TABLE, fonts_elm, package_ + ) + + assert font_table_part.font_table_element is fonts_elm + + # -- fixtures -------------------------------------------------------------------------------- + + @pytest.fixture + def FontTable_(self, request: FixtureRequest) -> Mock: + return class_mock(request, "docx.parts.font_table.FontTable") + + @pytest.fixture + def font_table_(self, request: FixtureRequest) -> Mock: + return instance_mock(request, FontTable) + + @pytest.fixture + def font_table_part_(self, request: FixtureRequest) -> Mock: + return instance_mock(request, FontTablePart) + + @pytest.fixture + def FontTablePart_load_(self, request: FixtureRequest) -> Mock: + return method_mock(request, FontTablePart, "load", autospec=False) + + @pytest.fixture + def package_(self, request: FixtureRequest) -> Mock: + return instance_mock(request, Package) diff --git a/tests/parts/test_footnotes.py b/tests/parts/test_footnotes.py new file mode 100644 index 000000000..e7d7760a9 --- 
/dev/null +++ b/tests/parts/test_footnotes.py @@ -0,0 +1,76 @@ +"""Unit test suite for the docx.parts.footnotes module.""" + +from __future__ import annotations + +from typing import cast + +import pytest + +from docx.opc.constants import CONTENT_TYPE as CT +from docx.opc.constants import RELATIONSHIP_TYPE as RT +from docx.opc.packuri import PackURI +from docx.opc.part import PartFactory +from docx.oxml.footnotes import CT_Footnotes +from docx.package import Package +from docx.parts.footnotes import FootnotesPart + +from ..unitutil.cxml import element +from ..unitutil.mock import FixtureRequest, Mock, class_mock, instance_mock, method_mock + + +class DescribeFootnotesPart: + """Unit test suite for `docx.parts.footnotes.FootnotesPart` objects.""" + + def it_is_used_by_the_part_loader_to_construct_a_footnotes_part( + self, package_: Mock, FootnotesPart_load_: Mock, footnotes_part_: Mock + ): + partname = PackURI("/word/footnotes.xml") + content_type = CT.WML_FOOTNOTES + reltype = RT.FOOTNOTES + blob = b"" + FootnotesPart_load_.return_value = footnotes_part_ + + part = PartFactory(partname, content_type, reltype, blob, package_) + + FootnotesPart_load_.assert_called_once_with(partname, content_type, blob, package_) + assert part is footnotes_part_ + + def it_provides_access_to_its_footnotes_element(self, package_: Mock): + footnotes_elm = cast(CT_Footnotes, element("w:footnotes")) + footnotes_part = FootnotesPart( + PackURI("/word/footnotes.xml"), CT.WML_FOOTNOTES, footnotes_elm, package_ + ) + + assert footnotes_part.footnotes_element is footnotes_elm + + def it_constructs_a_default_footnotes_part_to_help(self): + package = Package() + + footnotes_part = FootnotesPart.default(package) + + assert isinstance(footnotes_part, FootnotesPart) + assert footnotes_part.partname == "/word/footnotes.xml" + assert footnotes_part.content_type == CT.WML_FOOTNOTES + assert footnotes_part.package is package + assert footnotes_part.element.tag == ( + 
"{http://schemas.openxmlformats.org/wordprocessingml/2006/main}footnotes" + ) + # default template has separator (id=0) and continuation separator (id=1) + footnote_elms = footnotes_part.element.xpath("./w:footnote") + assert len(footnote_elms) == 2 + assert footnote_elms[0].id == 0 + assert footnote_elms[1].id == 1 + + # -- fixtures -------------------------------------------------------------------------------- + + @pytest.fixture + def footnotes_part_(self, request: FixtureRequest) -> Mock: + return instance_mock(request, FootnotesPart) + + @pytest.fixture + def FootnotesPart_load_(self, request: FixtureRequest) -> Mock: + return method_mock(request, FootnotesPart, "load", autospec=False) + + @pytest.fixture + def package_(self, request: FixtureRequest) -> Mock: + return instance_mock(request, Package) diff --git a/tests/parts/test_glossary.py b/tests/parts/test_glossary.py new file mode 100644 index 000000000..8adcadb04 --- /dev/null +++ b/tests/parts/test_glossary.py @@ -0,0 +1,63 @@ +"""Unit test suite for the `docx.parts.glossary` module.""" + +from __future__ import annotations + +from typing import cast + +import pytest + +from docx.glossary import Glossary +from docx.opc.constants import CONTENT_TYPE as CT +from docx.opc.packuri import PackURI +from docx.oxml.glossary import CT_GlossaryDocument +from docx.package import Package +from docx.parts.glossary import GlossaryPart + +from ..unitutil.cxml import element +from ..unitutil.mock import FixtureRequest, Mock, class_mock, instance_mock + + +class DescribeGlossaryPart: + """Unit test suite for `docx.parts.glossary.GlossaryPart`.""" + + def it_provides_access_to_its_glossary_proxy( + self, Glossary_: Mock, glossary_: Mock, package_: Mock + ): + Glossary_.return_value = glossary_ + glossary_elm = cast(CT_GlossaryDocument, element("w:glossaryDocument")) + glossary_part = GlossaryPart( + PackURI("/word/glossary/document.xml"), + CT.WML_DOCUMENT_GLOSSARY, + glossary_elm, + package_, + ) + + glossary = 
glossary_part.glossary + + Glossary_.assert_called_once_with(glossary_elm, glossary_part) + assert glossary is glossary_ + + def it_exposes_its_glossary_element(self, package_: Mock): + glossary_elm = cast(CT_GlossaryDocument, element("w:glossaryDocument")) + glossary_part = GlossaryPart( + PackURI("/word/glossary/document.xml"), + CT.WML_DOCUMENT_GLOSSARY, + glossary_elm, + package_, + ) + + assert glossary_part.glossary_element is glossary_elm + + # -- fixtures ------------------------------------------------------------ + + @pytest.fixture + def Glossary_(self, request: FixtureRequest) -> Mock: + return class_mock(request, "docx.parts.glossary.Glossary") + + @pytest.fixture + def glossary_(self, request: FixtureRequest) -> Mock: + return instance_mock(request, Glossary) + + @pytest.fixture + def package_(self, request: FixtureRequest) -> Mock: + return instance_mock(request, Package) diff --git a/tests/parts/test_numbering.py b/tests/parts/test_numbering.py index 1ed0f2a05..025f2a6e7 100644 --- a/tests/parts/test_numbering.py +++ b/tests/parts/test_numbering.py @@ -2,7 +2,11 @@ import pytest +from docx.numbering import Numbering +from docx.opc.constants import CONTENT_TYPE as CT +from docx.opc.packuri import PackURI from docx.oxml.numbering import CT_Numbering +from docx.package import Package from docx.parts.numbering import NumberingPart, _NumberingDefinitions from ..oxml.unitdata.numbering import a_num, a_numbering @@ -10,6 +14,26 @@ class DescribeNumberingPart: + def it_can_create_a_default_numbering_part(self, request): + package_ = instance_mock(request, Package) + + part = NumberingPart.default(package_) + + assert part.partname == PackURI("/word/numbering.xml") + assert part.content_type == CT.WML_NUMBERING + # -- the part exposes a Numbering proxy -- + numbering = part.numbering + assert isinstance(numbering, Numbering) + # -- freshly-created part has no definitions -- + assert len(numbering) == 0 + + def 
it_can_still_build_a_numbering_part_via_new(self): + # -- legacy compatibility: `NumberingPart.new()` without a package -- + part = NumberingPart.new() + + assert isinstance(part.numbering_element, CT_Numbering) + assert len(part.numbering) == 0 + def it_provides_access_to_the_numbering_definitions(self, num_defs_fixture): ( numbering_part, diff --git a/tests/parts/test_smart_art.py b/tests/parts/test_smart_art.py new file mode 100644 index 000000000..f5d3a9e77 --- /dev/null +++ b/tests/parts/test_smart_art.py @@ -0,0 +1,90 @@ +"""Unit test suite for the `docx.parts.smart_art` module.""" + +from __future__ import annotations + +from typing import cast + +import pytest + +from docx.opc.constants import CONTENT_TYPE as CT +from docx.opc.constants import RELATIONSHIP_TYPE as RT +from docx.opc.packuri import PackURI +from docx.opc.part import PartFactory +from docx.oxml.parser import parse_xml +from docx.oxml.smart_art import CT_DataModel +from docx.package import Package +from docx.parts.smart_art import DiagramDataPart + +from ..unitutil.mock import FixtureRequest, Mock, instance_mock + + +DATA_MODEL_XML = ( + b'\n' + b'\n' + b" \n" + b' \n' + b' First\n' + b" \n" + b' \n' + b' Second\n' + b" \n" + b" \n" + b"
\n" +) + + +class DescribeDiagramDataPart: + """Unit test suite for `docx.parts.smart_art.DiagramDataPart`.""" + + def it_exposes_its_data_model(self, package_: Mock): + element = cast(CT_DataModel, parse_xml(DATA_MODEL_XML)) + part = DiagramDataPart( + PackURI("/word/diagrams/data1.xml"), + CT.DML_DIAGRAM_DATA, + element, + package_, + ) + + data_model = part.data_model + + assert isinstance(data_model, CT_DataModel) + assert len(data_model.pt_lst) == 2 + assert data_model.pt_lst[0].modelId == "n1" + + def it_is_loaded_by_the_part_factory(self, request: FixtureRequest): + package_ = instance_mock(request, Package) + partname = PackURI("/word/diagrams/data1.xml") + + part = PartFactory( + partname, + CT.DML_DIAGRAM_DATA, + RT.DIAGRAM_DATA, + DATA_MODEL_XML, + package_, + ) + + assert isinstance(part, DiagramDataPart) + assert part.partname == partname + assert part.content_type == CT.DML_DIAGRAM_DATA + assert isinstance(part.data_model, CT_DataModel) + + def it_round_trips_the_blob(self, package_: Mock): + element = cast(CT_DataModel, parse_xml(DATA_MODEL_XML)) + part = DiagramDataPart( + PackURI("/word/diagrams/data1.xml"), + CT.DML_DIAGRAM_DATA, + element, + package_, + ) + + # -- the blob is the serialized XML; check it at least contains the text -- + assert b"First" in part.blob + assert b"Second" in part.blob + + # -- fixtures ----------------------------------------------------------------- + + @pytest.fixture + def package_(self, request: FixtureRequest) -> Mock: + return instance_mock(request, Package) diff --git a/tests/parts/test_story.py b/tests/parts/test_story.py index 9a1dc7fab..b28d0238d 100644 --- a/tests/parts/test_story.py +++ b/tests/parts/test_story.py @@ -1,10 +1,14 @@ """Unit test suite for the docx.parts.story module.""" +import io + import pytest from docx.enum.style import WD_STYLE_TYPE +from docx.image.constants import MIME_TYPE from docx.image.image import Image from docx.opc.constants import RELATIONSHIP_TYPE as RT +from docx.oxml.shape 
import CT_Inline from docx.package import Package from docx.parts.document import DocumentPart from docx.parts.image import ImagePart @@ -13,7 +17,7 @@ from ..unitutil.cxml import element from ..unitutil.file import snippet_text -from ..unitutil.mock import instance_mock, method_mock, property_mock +from ..unitutil.mock import class_mock, instance_mock, method_mock, property_mock class DescribeStoryPart: @@ -69,6 +73,28 @@ def it_can_create_a_new_pic_inline(self, get_or_add_image_, image_, next_id_prop image_.scaled_dimensions.assert_called_once_with(100, 200) assert inline.xml == expected_xml + def it_can_create_a_new_svg_pic_inline( + self, get_or_add_image_, image_, next_id_prop_, _generate_svg_fallback_ + ): + # First call returns the SVG image rId, second returns the fallback PNG rId + get_or_add_image_.side_effect = [("rId7", image_), ("rId8", image_)] + image_.scaled_dimensions.return_value = 400, 300 + image_.filename = "drawing.svg" + image_.content_type = MIME_TYPE.SVG + next_id_prop_.return_value = 5 + _generate_svg_fallback_.return_value = b"fake-png-bytes" + story_part = StoryPart(None, None, None, None) + + inline = story_part.new_pic_inline("drawing.svg", width=400, height=300) + + assert get_or_add_image_.call_count == 2 + # Second call should be for the fallback PNG stream + fallback_call_args = get_or_add_image_.call_args_list[1] + fallback_stream = fallback_call_args[0][1] + assert isinstance(fallback_stream, io.BytesIO) + assert fallback_stream.getvalue() == b"fake-png-bytes" + assert isinstance(inline, CT_Inline) + def it_knows_the_next_available_xml_id(self, next_id_fixture): story_element, expected_value = next_id_fixture story_part = StoryPart(None, None, story_element, None) @@ -77,6 +103,107 @@ def it_knows_the_next_available_xml_id(self, next_id_fixture): assert next_id == expected_value + def it_allocates_next_id_spanning_body_headers_and_footers(self, request): + # -- the body uses id 7, a related header uses id 9, and a related + # 
footer uses id 12; the next id must be 13 even when we ask the + # header part (not the document part) for it -- + from unittest.mock import Mock + from docx.opc.constants import RELATIONSHIP_TYPE as RT + + body_element = element("w:document/w:body/w:p{id=7}") + header_element = element("w:hdr/w:p{id=9}") + footer_element = element("w:ftr/w:p{id=12}") + + def make_rel(reltype, target): + rel = Mock() + rel.reltype = reltype + rel.is_external = False + rel.target_part = target + return rel + + header_part = Mock() + header_part._element = header_element + footer_part = Mock() + footer_part._element = footer_element + + doc_rels = { + "rId1": make_rel(RT.HEADER, header_part), + "rId2": make_rel(RT.FOOTER, footer_part), + } + rels_mock = Mock() + rels_mock.values = doc_rels.values + + document_part = Mock() + document_part._element = body_element + document_part.rels = rels_mock + + header_story = StoryPart(None, None, header_element, None) + # -- replace the document-part lookup with one that returns the mock; + # method_mock auto-reverts when `request` tears down -- + method_mock( + request, StoryPart, "_safe_document_part", return_value=document_part + ) + + assert header_story.next_id == 13 + + def it_falls_back_to_the_current_story_when_no_document_part(self, request): + story_element = element("w:document/w:body/w:p{id=4}") + method_mock(request, StoryPart, "_safe_document_part", return_value=None) + story_part = StoryPart(None, None, story_element, None) + + assert story_part.next_id == 5 + + def it_can_create_a_linked_pic_inline(self, request): + from docx.oxml.ns import qn + + story_part = StoryPart(None, None, None, None) + # -- patch helpers to avoid needing a real package / image file -- + method_mock( + request, + StoryPart, + "_resolve_link_target", + return_value=("https://example.com/cat.png", "cat.png", 914400, 457200, None), + ) + method_mock(request, StoryPart, "relate_to", return_value="rId7") + property_mock(request, StoryPart, "next_id", 
return_value=3) + + inline = story_part.new_pic_inline( + None, link=True, save_with_document=False, url="https://example.com/cat.png" + ) + + blip = inline.find(".//" + qn("a:blip")) + assert blip is not None + assert blip.get(qn("r:link")) == "rId7" + assert blip.get(qn("r:embed")) is None + + def it_can_create_a_linked_pic_anchor(self, request): + from docx.oxml.ns import qn + + story_part = StoryPart(None, None, None, None) + method_mock( + request, + StoryPart, + "_resolve_link_target", + return_value=("c:\\foo.png", "foo.png", 914400, 457200, None), + ) + method_mock(request, StoryPart, "relate_to", return_value="rId9") + property_mock(request, StoryPart, "next_id", return_value=4) + + anchor = story_part.new_pic_anchor( + "c:\\foo.png", link=True, save_with_document=False + ) + + blip = anchor.find(".//" + qn("a:blip")) + assert blip is not None + assert blip.get(qn("r:link")) == "rId9" + assert blip.get(qn("r:embed")) is None + + def it_raises_when_link_requires_a_target(self, request): + story_part = StoryPart(None, None, None, None) + + with pytest.raises(ValueError, match="requires an image path/stream or a url"): + story_part._resolve_link_target(None, None, None, None) + def it_knows_the_main_document_part_to_help(self, package_, document_part_): package_.main_document_part = document_part_ story_part = StoryPart(None, None, None, package_) @@ -115,6 +242,10 @@ def document_part_(self, request): def _document_part_prop_(self, request): return property_mock(request, StoryPart, "_document_part") + @pytest.fixture + def _generate_svg_fallback_(self, request): + return method_mock(request, StoryPart, "_generate_svg_fallback") + @pytest.fixture def get_or_add_image_(self, request): return method_mock(request, StoryPart, "get_or_add_image") diff --git a/tests/parts/test_theme.py b/tests/parts/test_theme.py new file mode 100644 index 000000000..a0d8127ca --- /dev/null +++ b/tests/parts/test_theme.py @@ -0,0 +1,89 @@ +"""Unit test suite for the 
`docx.parts.theme` module.""" + +from __future__ import annotations + +from typing import cast + +import pytest + +from docx.opc.constants import CONTENT_TYPE as CT +from docx.opc.constants import RELATIONSHIP_TYPE as RT +from docx.opc.packuri import PackURI +from docx.opc.part import PartFactory +from docx.oxml.theme import CT_Theme +from docx.package import Package +from docx.parts.theme import ThemePart +from docx.theme import Theme + +from ..unitutil.cxml import element +from ..unitutil.mock import FixtureRequest, Mock, class_mock, instance_mock, method_mock + + +class DescribeThemePart: + """Unit test suite for `docx.parts.theme.ThemePart` objects.""" + + def it_is_used_by_the_part_loader_to_construct_a_theme_part( + self, + package_: Mock, + ThemePart_load_: Mock, + theme_part_: Mock, + ): + partname = PackURI("/word/theme/theme1.xml") + content_type = CT.OFC_THEME + reltype = RT.THEME + blob = ( + b"" + ) + ThemePart_load_.return_value = theme_part_ + + part = PartFactory(partname, content_type, reltype, blob, package_) + + ThemePart_load_.assert_called_once_with( + partname, content_type, blob, package_ + ) + assert part is theme_part_ + + def it_provides_access_to_its_theme_proxy( + self, Theme_: Mock, theme_: Mock, package_: Mock + ): + Theme_.return_value = theme_ + theme_elm = cast(CT_Theme, element("a:theme")) + theme_part = ThemePart( + PackURI("/word/theme/theme1.xml"), CT.OFC_THEME, theme_elm, package_ + ) + + theme = theme_part.theme + + Theme_.assert_called_once_with(theme_elm, theme_part) + assert theme is theme_ + + def it_exposes_its_theme_element(self, package_: Mock): + theme_elm = cast(CT_Theme, element("a:theme")) + theme_part = ThemePart( + PackURI("/word/theme/theme1.xml"), CT.OFC_THEME, theme_elm, package_ + ) + + assert theme_part.theme_element is theme_elm + + # -- fixtures -------------------------------------------------------------------------------- + + @pytest.fixture + def Theme_(self, request: FixtureRequest) -> Mock: + return 
class_mock(request, "docx.parts.theme.Theme") + + @pytest.fixture + def theme_(self, request: FixtureRequest) -> Mock: + return instance_mock(request, Theme) + + @pytest.fixture + def theme_part_(self, request: FixtureRequest) -> Mock: + return instance_mock(request, ThemePart) + + @pytest.fixture + def ThemePart_load_(self, request: FixtureRequest) -> Mock: + return method_mock(request, ThemePart, "load", autospec=False) + + @pytest.fixture + def package_(self, request: FixtureRequest) -> Mock: + return instance_mock(request, Package) diff --git a/tests/parts/test_web_settings.py b/tests/parts/test_web_settings.py new file mode 100644 index 000000000..125ebd4ab --- /dev/null +++ b/tests/parts/test_web_settings.py @@ -0,0 +1,89 @@ +"""Unit test suite for the `docx.parts.web_settings` module.""" + +from __future__ import annotations + +from typing import cast + +import pytest + +from docx.opc.constants import CONTENT_TYPE as CT +from docx.opc.constants import RELATIONSHIP_TYPE as RT +from docx.opc.packuri import PackURI +from docx.opc.part import PartFactory +from docx.oxml.web_settings import CT_WebSettings +from docx.package import Package +from docx.parts.web_settings import WebSettingsPart +from docx.web_settings import WebSettings + +from ..unitutil.cxml import element +from ..unitutil.mock import FixtureRequest, Mock, class_mock, instance_mock, method_mock + + +class DescribeWebSettingsPart: + """Unit test suite for `docx.parts.web_settings.WebSettingsPart` objects.""" + + def it_is_used_by_the_part_loader_to_construct_a_web_settings_part( + self, + package_: Mock, + WebSettingsPart_load_: Mock, + web_settings_part_: Mock, + ): + partname = PackURI("/word/webSettings.xml") + content_type = CT.WML_WEB_SETTINGS + reltype = RT.WEB_SETTINGS + blob = ( + b"" + ) + WebSettingsPart_load_.return_value = web_settings_part_ + + part = PartFactory(partname, content_type, reltype, blob, package_) + + WebSettingsPart_load_.assert_called_once_with( + partname, content_type, 
blob, package_ + ) + assert part is web_settings_part_ + + def it_provides_access_to_its_web_settings_proxy( + self, WebSettings_: Mock, web_settings_: Mock, package_: Mock + ): + WebSettings_.return_value = web_settings_ + ws_elm = cast(CT_WebSettings, element("w:webSettings")) + web_settings_part = WebSettingsPart( + PackURI("/word/webSettings.xml"), CT.WML_WEB_SETTINGS, ws_elm, package_ + ) + + web_settings = web_settings_part.web_settings + + WebSettings_.assert_called_once_with(ws_elm, web_settings_part) + assert web_settings is web_settings_ + + def it_exposes_its_web_settings_element(self, package_: Mock): + ws_elm = cast(CT_WebSettings, element("w:webSettings")) + web_settings_part = WebSettingsPart( + PackURI("/word/webSettings.xml"), CT.WML_WEB_SETTINGS, ws_elm, package_ + ) + + assert web_settings_part.web_settings_element is ws_elm + + # -- fixtures -------------------------------------------------------------------------------- + + @pytest.fixture + def WebSettings_(self, request: FixtureRequest) -> Mock: + return class_mock(request, "docx.parts.web_settings.WebSettings") + + @pytest.fixture + def web_settings_(self, request: FixtureRequest) -> Mock: + return instance_mock(request, WebSettings) + + @pytest.fixture + def web_settings_part_(self, request: FixtureRequest) -> Mock: + return instance_mock(request, WebSettingsPart) + + @pytest.fixture + def WebSettingsPart_load_(self, request: FixtureRequest) -> Mock: + return method_mock(request, WebSettingsPart, "load", autospec=False) + + @pytest.fixture + def package_(self, request: FixtureRequest) -> Mock: + return instance_mock(request, Package) diff --git a/tests/ref-docs/README.md b/tests/ref-docs/README.md new file mode 100644 index 000000000..9d4e92d43 --- /dev/null +++ b/tests/ref-docs/README.md @@ -0,0 +1,52 @@ +# Reference Documents + +This directory contains reference `.docx` files created in Microsoft Word for use in +testing. 
These files serve as ground truth for validating that python-docx can correctly +read documents produced by Word. + +## How to Use + +Reference files are used in Layer 4 (Reference File Comparison) tests. Test code reads +these files with python-docx and asserts the parsed content matches expectations. + +```python +from docx import Document +from tests.helpers.refcmp import ref_docx_path + +def it_reads_a_word_comments_file(): + doc = Document(ref_docx_path("comments-simple")) + comments = doc.comments + assert len(comments) == 1 + assert comments.get(0).author == "John Doe" +``` + +## Reference Files + +### comments-simple.docx (planned) +- One comment on a single word +- Author: "John Doe", Initials: "JD" +- Comment text: "This is a simple comment." + +### comments-threaded.docx (planned) +- Parent comment with 2 reply comments +- Multiple authors +- Demonstrates reply threading via `w15:paraIdParent` (in `commentsExtended.xml`) + +### comments-multi-author.docx (planned) +- Comments by 3 different authors +- Each with distinct initials + +### comments-formatted.docx (planned) +- Comment containing bold and italic text +- Comment containing multiple paragraphs + +## Creating Reference Files + +1. Open Microsoft Word (any recent version) +2. Create the document content described above +3. Save as `.docx` format +4. Place the file in this directory +5. Update this README with the actual content description + +These files are committed to the repository and should only be recreated when +the expected content changes. 
diff --git a/tests/styles/test_style.py b/tests/styles/test_style.py index 6201f9927..08f3a54ba 100644 --- a/tests/styles/test_style.py +++ b/tests/styles/test_style.py @@ -394,6 +394,196 @@ def unhide_set_fixture(self, request): return style, value, expected_xml + def it_knows_its_link_style(self, link_get_fixture): + style, expected_style_id = link_get_fixture + link_style = style.link_style + if expected_style_id is None: + assert link_style is None + else: + assert link_style is not None + assert link_style.style_id == expected_style_id + # -- referenced style is of type=character, so proxy is CharacterStyle -- + assert isinstance(link_style, CharacterStyle) + + def it_can_change_its_link_style_with_a_style(self, link_set_style_fixture): + style, value, expected_xml = link_set_style_fixture + style.link_style = value + assert style._element.xml == expected_xml + + def it_can_change_its_link_style_with_a_style_id(self, link_set_id_fixture): + style, style_id, expected_xml = link_set_id_fixture + style.link_style = style_id + assert style._element.xml == expected_xml + + def it_can_remove_its_link_style(self, link_remove_fixture): + style, expected_xml = link_remove_fixture + style.link_style = None + assert style._element.xml == expected_xml + + def it_knows_its_next_style(self, next_style_get_fixture): + style, expected_style_id = next_style_get_fixture + next_style = style.next_style + if expected_style_id is None: + assert next_style is None + else: + assert next_style is not None + assert next_style.style_id == expected_style_id + + def it_can_change_its_next_style_with_a_style(self, next_style_set_fixture): + style, value, expected_xml = next_style_set_fixture + style.next_style = value + assert style._element.xml == expected_xml + + def it_can_change_its_next_style_with_a_style_id(self, next_style_set_id_fixture): + style, style_id, expected_xml = next_style_set_id_fixture + style.next_style = style_id + assert style._element.xml == expected_xml + + def 
it_can_remove_its_next_style(self, next_style_remove_fixture): + style, expected_xml = next_style_remove_fixture + style.next_style = None + assert style._element.xml == expected_xml + + def it_knows_whether_its_redefined(self, redefined_get_fixture): + style, expected_value = redefined_get_fixture + assert style.is_redefined is expected_value + + # fixtures for link_style / next_style / is_redefined ----------- + + @pytest.fixture( + params=[ + # -- no w:link present -> None -- + (0, None), + # -- w:link points at valid sibling -> its styleId -- + (1, "Char"), + # -- w:link points at missing sibling -> None -- + (2, None), + ] + ) + def link_get_fixture(self, request): + style_idx, expected_style_id = request.param + styles = element( + "w:styles/(" + "w:style{w:type=paragraph,w:styleId=Body}," + "w:style{w:type=paragraph,w:styleId=H1}/w:link{w:val=Char}," + "w:style{w:type=paragraph,w:styleId=Broken}/w:link{w:val=DoesNotExist}," + "w:style{w:type=character,w:styleId=Char})" + ) + style = BaseStyle(styles[style_idx]) + return style, expected_style_id + + @pytest.fixture( + params=[ + ("w:style", "w:style/w:link{w:val=Char}"), + ("w:style/w:link{w:val=Old}", "w:style/w:link{w:val=Char}"), + ] + ) + def link_set_style_fixture(self, request, link_style_): + style_cxml, expected_cxml = request.param + style = BaseStyle(element(style_cxml)) + link_style_.style_id = "Char" + return style, link_style_, xml(expected_cxml) + + @pytest.fixture( + params=[ + ("w:style", "Char", "w:style/w:link{w:val=Char}"), + ("w:style/w:link{w:val=Old}", "Char", "w:style/w:link{w:val=Char}"), + ] + ) + def link_set_id_fixture(self, request): + style_cxml, style_id, expected_cxml = request.param + style = BaseStyle(element(style_cxml)) + return style, style_id, xml(expected_cxml) + + @pytest.fixture( + params=[ + ("w:style", "w:style"), + ("w:style/w:link{w:val=Old}", "w:style"), + ] + ) + def link_remove_fixture(self, request): + style_cxml, expected_cxml = request.param + style = 
BaseStyle(element(style_cxml)) + return style, xml(expected_cxml) + + @pytest.fixture + def link_style_(self, request): + return instance_mock(request, BaseStyle) + + @pytest.fixture( + params=[ + # -- no w:next present -> None -- + (0, None), + # -- w:next refers to valid sibling -> its styleId -- + (1, "Body"), + # -- w:next refers to missing sibling -> None -- + (2, None), + ] + ) + def next_style_get_fixture(self, request): + style_idx, expected_style_id = request.param + styles = element( + "w:styles/(" + "w:style{w:type=paragraph,w:styleId=Plain}," + "w:style{w:type=paragraph,w:styleId=H1}/w:next{w:val=Body}," + "w:style{w:type=paragraph,w:styleId=Broken}/w:next{w:val=DoesNotExist}," + "w:style{w:type=paragraph,w:styleId=Body})" + ) + style = BaseStyle(styles[style_idx]) + return style, expected_style_id + + @pytest.fixture( + params=[ + ("w:style", "w:style/w:next{w:val=Body}"), + ("w:style/w:next{w:val=Old}", "w:style/w:next{w:val=Body}"), + ] + ) + def next_style_set_fixture(self, request, next_style_): + style_cxml, expected_cxml = request.param + style = BaseStyle(element(style_cxml)) + next_style_.style_id = "Body" + return style, next_style_, xml(expected_cxml) + + @pytest.fixture( + params=[ + ("w:style", "Body", "w:style/w:next{w:val=Body}"), + ("w:style/w:next{w:val=Old}", "Body", "w:style/w:next{w:val=Body}"), + ] + ) + def next_style_set_id_fixture(self, request): + style_cxml, style_id, expected_cxml = request.param + style = BaseStyle(element(style_cxml)) + return style, style_id, xml(expected_cxml) + + @pytest.fixture( + params=[ + ("w:style", "w:style"), + ("w:style/w:next{w:val=Old}", "w:style"), + ] + ) + def next_style_remove_fixture(self, request): + style_cxml, expected_cxml = request.param + style = BaseStyle(element(style_cxml)) + return style, xml(expected_cxml) + + @pytest.fixture + def next_style_(self, request): + return instance_mock(request, BaseStyle) + + @pytest.fixture( + params=[ + ("w:style", False), + 
("w:style/w:autoRedefine", True), + ("w:style/w:autoRedefine{w:val=0}", False), + ("w:style/w:autoRedefine{w:val=1}", True), + ] + ) + def redefined_get_fixture(self, request): + style_cxml, expected_value = request.param + style = BaseStyle(element(style_cxml)) + return style, expected_value + + class DescribeCharacterStyle: def it_knows_which_style_it_is_based_on(self, base_get_fixture): style, StyleFactory_, StyleFactory_calls, base_style_ = base_get_fixture diff --git a/tests/styles/test_styles.py b/tests/styles/test_styles.py index 7493388d0..9caac638a 100644 --- a/tests/styles/test_styles.py +++ b/tests/styles/test_styles.py @@ -2,11 +2,11 @@ import pytest -from docx.enum.style import WD_STYLE_TYPE +from docx.enum.style import WD_STYLE, WD_STYLE_TYPE from docx.oxml.styles import CT_Style, CT_Styles from docx.styles.latent import LatentStyles from docx.styles.style import BaseStyle -from docx.styles.styles import Styles +from docx.styles.styles import Styles, _builtin_style_ui_name from ..unitutil.cxml import element from ..unitutil.mock import call, class_mock, function_mock, instance_mock, method_mock @@ -41,6 +41,64 @@ def it_can_get_a_style_by_name(self, getitem_name_fixture): style = styles[key] assert style._element is expected_element + def it_can_get_a_style_by_WD_BUILTIN_STYLE_enum_member(self): + # -- upstream#1439: `styles[WD_STYLE.BODY_TEXT]` used to raise KeyError -- + styles = Styles( + element( + "w:styles/w:style{w:type=paragraph}/w:name{w:val=Body Text}" + ) + ) + expected_element = styles._element[0] + + style = styles[WD_STYLE.BODY_TEXT] + + assert style._element is expected_element + + def it_can_get_a_heading_style_by_WD_BUILTIN_STYLE_enum_member(self): + styles = Styles( + element( + "w:styles/w:style{w:type=paragraph}/w:name{w:val=heading 1}" + ) + ) + expected_element = styles._element[0] + + style = styles[WD_STYLE.HEADING_1] + + assert style._element is expected_element + + def 
it_raises_KeyError_for_missing_WD_BUILTIN_STYLE_member(self): + styles = Styles(element("w:styles")) + with pytest.raises(KeyError): + styles[WD_STYLE.BODY_TEXT] + + def it_can_get_INDEX_HEADING_by_enum(self): + # -- upstream#542: ``styles[WD_STYLE.INDEX_HEADING]`` used to raise + # -- ``KeyError: "no style with name 'INDEX_HEADING (-34)'"`` because + # -- ``str(enum)`` was fed straight into the name lookup. + styles = Styles( + element( + "w:styles/w:style{w:type=paragraph}/w:name{w:val=Index Heading}" + ) + ) + expected_element = styles._element[0] + + style = styles[WD_STYLE.INDEX_HEADING] + + assert style._element is expected_element + + def it_can_get_TOC_HEADING_by_enum(self): + # -- upstream#542: ``WD_STYLE.TOC_HEADING`` was entirely missing. -- + styles = Styles( + element( + "w:styles/w:style{w:type=paragraph}/w:name{w:val=TOC Heading}" + ) + ) + expected_element = styles._element[0] + + style = styles[WD_STYLE.TOC_HEADING] + + assert style._element is expected_element + def it_raises_on_style_not_found(self, get_raises_fixture): styles, key = get_raises_fixture with pytest.raises(KeyError): @@ -399,3 +457,104 @@ def style_elm_(self, request): @pytest.fixture def styles_elm_(self, request): return instance_mock(request, CT_Styles) + + +class DescribeStyles_CaseInsensitiveLookup: + """Phase A-v2 #6: Styles.__getitem__ case-insensitive fallback. + + See upstream #494, #420, PR#239. LibreOffice saves built-in style names + in lower-case on disk; the UI name lookup must still resolve them. 
+ """ + + def it_looks_up_a_style_by_exact_ui_name(self): + styles = Styles( + element( + "w:styles/w:style{w:type=paragraph}/w:name{w:val=Heading 1}" + ) + ) + expected_element = styles._element[0] + + # -- exact match: on-disk name "Heading 1" matches "Heading 1" -- + style = styles["Heading 1"] + assert style._element is expected_element + + def it_finds_a_libreoffice_cased_style_by_its_ui_name(self): + # -- upstream#494: LibreOffice may save the "Heading 1" style name with + # -- different casing (e.g. "heading 1"); the case-insensitive fallback + # -- should also match oddly-cased on-disk names like "HEADING 1". -- + styles = Styles( + element( + "w:styles/w:style{w:type=paragraph}/w:name{w:val=HEADING 1}" + ) + ) + expected_element = styles._element[0] + + style = styles["Heading 1"] + assert style._element is expected_element + + def it_case_insensitively_matches_custom_names(self): + styles = Styles( + element( + "w:styles/w:style{w:type=paragraph}/w:name{w:val=My Style}" + ) + ) + expected_element = styles._element[0] + + style = styles["MY STYLE"] + assert style._element is expected_element + + def it_supports_in_operator_case_insensitively(self): + styles = Styles( + element( + "w:styles/w:style{w:type=paragraph}/w:name{w:val=HEADING 1}" + ) + ) + assert "Heading 1" in styles + + def it_can_lookup_various_WD_BUILTIN_STYLE_members(self): + # -- upstream#420: full WD_BUILTIN_STYLE → UI-name translation.
-- + styles = Styles( + element( + "w:styles/" + "(w:style{w:type=paragraph}/w:name{w:val=Caption}," + "w:style{w:type=paragraph}/w:name{w:val=Body Text 2}," + "w:style{w:type=character}/w:name{w:val=Emphasis})" + ) + ) + + assert styles[WD_STYLE.CAPTION]._element is styles._element[0] + assert styles[WD_STYLE.BODY_TEXT_2]._element is styles._element[1] + assert styles[WD_STYLE.EMPHASIS]._element is styles._element[2] + + +class DescribeBuiltinStyleUiName: + """Unit-test suite for `_builtin_style_ui_name` helper.""" + + def it_returns_Body_Text_for_BODY_TEXT(self): + assert _builtin_style_ui_name(WD_STYLE.BODY_TEXT) == "Body Text" + + def it_returns_Heading_1_for_HEADING_1(self): + assert _builtin_style_ui_name(WD_STYLE.HEADING_1) == "Heading 1" + + def it_returns_Normal_for_NORMAL(self): + assert _builtin_style_ui_name(WD_STYLE.NORMAL) == "Normal" + + def it_returns_Index_Heading_for_INDEX_HEADING(self): + # -- upstream#542 -- + assert _builtin_style_ui_name(WD_STYLE.INDEX_HEADING) == "Index Heading" + + def it_returns_TOC_Heading_for_TOC_HEADING(self): + # -- upstream#542: TOC_HEADING was previously missing from the enum -- + assert _builtin_style_ui_name(WD_STYLE.TOC_HEADING) == "TOC Heading" + + +class DescribeBabelFish: + """Unit-test suite for `docx.styles.BabelFish`.""" + + def it_translates_WD_BUILTIN_STYLE_members_to_UI_names(self): + from docx.styles import BabelFish + + assert BabelFish.enum2ui(WD_STYLE.BODY_TEXT) == "Body Text" + assert BabelFish.enum2ui(WD_STYLE.HEADING_1) == "Heading 1" + assert BabelFish.enum2ui(WD_STYLE.INDEX_HEADING) == "Index Heading" + assert BabelFish.enum2ui(WD_STYLE.TOC_HEADING) == "TOC Heading" diff --git a/tests/styles/test_styles_import.py b/tests/styles/test_styles_import.py new file mode 100644 index 000000000..212e487d8 --- /dev/null +++ b/tests/styles/test_styles_import.py @@ -0,0 +1,158 @@ +"""Unit-test suite for `Styles.import_from`, `import_style`, `import_builtin` +and `Styles.document_default_font` — upstream#1375, 
#1083, #508, #701, #197, +#486, #383. +""" + +from __future__ import annotations + +from typing import cast + +import pytest + +from docx.oxml.styles import CT_Styles +from docx.styles.styles import Styles +from docx.text.font import Font + +from ..unitutil.cxml import element + + +class DescribeStyles_ImportFrom: + """Unit-test suite for `Styles.import_from` — cross-document style import.""" + + def it_copies_a_named_style_into_the_target(self): + source = _styles( + "w:styles/(" + "w:style{w:type=paragraph,w:styleId=Fancy}/w:name{w:val=Fancy}," + "w:style{w:type=paragraph,w:styleId=Plain}/w:name{w:val=Plain}" + ")" + ) + target = _styles("w:styles") + + imported = target.import_from(source, names=["Fancy"]) + + assert len(imported) == 1 + assert imported[0].style_id == "Fancy" + assert "Fancy" in target + + def it_skips_styles_already_present_in_the_target(self): + source = _styles( + "w:styles/w:style{w:type=paragraph,w:styleId=Fancy}/w:name{w:val=Fancy}" + ) + target = _styles( + "w:styles/w:style{w:type=paragraph,w:styleId=Fancy}/w:name{w:val=Fancy}" + ) + + imported = target.import_from(source) + + assert imported == [] + # -- still exactly one Fancy in the target -- + assert len([s for s in target if s.style_id == "Fancy"]) == 1 + + def it_imports_basedOn_link_and_next_dependencies(self): + source = _styles( + "w:styles/(" + "w:style{w:type=paragraph,w:styleId=Body}/w:name{w:val=Body}," + "w:style{w:type=character,w:styleId=BodyChar}/w:name{w:val=BodyChar}," + "w:style{w:type=paragraph,w:styleId=NextStyle}/w:name{w:val=NextStyle}," + "w:style{w:type=paragraph,w:styleId=Fancy}/(" + "w:name{w:val=Fancy}," + "w:basedOn{w:val=Body}," + "w:next{w:val=NextStyle}," + "w:link{w:val=BodyChar}" + ")" + ")" + ) + target = _styles("w:styles") + + target.import_from(source, names=["Fancy"]) + + ids = {s.style_id for s in target} + assert {"Fancy", "Body", "BodyChar", "NextStyle"} <= ids + + def it_accepts_objects_with_a_styles_attribute(self): + source = _styles( + 
"w:styles/w:style{w:type=paragraph,w:styleId=Fancy}/w:name{w:val=Fancy}" + ) + + class _FakeDoc: + styles = source + + target = _styles("w:styles") + target.import_from(_FakeDoc()) + assert "Fancy" in target + + +class DescribeStyles_ImportStyle: + """Unit-test suite for `Styles.import_style` — single-style deep copy.""" + + def it_returns_an_existing_style_untouched(self): + target = _styles( + "w:styles/w:style{w:type=paragraph,w:styleId=Fancy}/w:name{w:val=Fancy}" + ) + source = _styles( + "w:styles/w:style{w:type=paragraph,w:styleId=Fancy}/w:name{w:val=Fancy}" + ) + source_elm = source._element.get_by_id("Fancy") + + result = target.import_style(source_elm) + + assert result.style_id == "Fancy" + assert len([s for s in target if s.style_id == "Fancy"]) == 1 + + def it_imports_the_style_when_not_present(self): + source = _styles( + "w:styles/w:style{w:type=paragraph,w:styleId=Fancy}/w:name{w:val=Fancy}" + ) + target = _styles("w:styles") + + target.import_style(source._element.get_by_id("Fancy")) + + assert "Fancy" in target + + +class DescribeStyles_ImportBuiltin: + """Unit-test suite for `Styles.import_builtin` — upstream#486.""" + + def it_materialises_List_Bullet_from_the_bundled_defaults(self): + target = _styles("w:styles") + + style = target.import_builtin("List Bullet") + + assert style.style_id == "ListBullet" + assert "List Bullet" in target + + def it_raises_KeyError_for_unknown_names(self): + target = _styles("w:styles") + + with pytest.raises(KeyError): + target.import_builtin("NoSuchBuiltinStyle") + + +class DescribeStyles_DocumentDefaultFont: + """Unit-test suite for `Styles.document_default_font` — upstream#383.""" + + def it_returns_a_Font_over_the_docDefaults_rPr(self): + target = _styles("w:styles") + + font = target.document_default_font + + assert isinstance(font, Font) + # -- Writing through the Font proxy persists on the underlying XML -- + font.bold = True + docDefaults = target._element.docDefaults + assert docDefaults is not None + 
rPrDefault = docDefaults.rPrDefault + assert rPrDefault is not None + # -- rPr auto-created on write -- + assert rPrDefault.rPr is not None + + def it_returns_a_live_view_so_repeated_access_sees_writes(self): + target = _styles("w:styles") + + target.document_default_font.italic = True + # -- Re-read through a fresh proxy and verify the value round-trips -- + assert target.document_default_font.italic is True + + +def _styles(cxml: str) -> Styles: + return Styles(cast(CT_Styles, element(cxml))) diff --git a/tests/test_accessibility.py b/tests/test_accessibility.py new file mode 100644 index 000000000..084976c85 --- /dev/null +++ b/tests/test_accessibility.py @@ -0,0 +1,251 @@ +# pyright: reportPrivateUsage=false + +"""Unit test suite for the `docx.accessibility` module.""" + +from __future__ import annotations + +import pytest + +from docx.accessibility import ( + EMPTY_HEADING, + MULTIPLE_H1, + NO_H1, + SKIPPED_LEVEL, + HeadingIssue, + _heading_level, + validate_heading_structure, +) + +from .unitutil.mock import Mock + + +def _fake_paragraph(style_name: str | None, text: str = "lorem"): + """Return a Mock paragraph whose ``.style.name`` and ``.text`` match the args. + + When `style_name` is |None|, the paragraph has no style attached. 
+ """ + paragraph = Mock(name="Paragraph") + paragraph.text = text + if style_name is None: + paragraph.style = None + else: + style = Mock(name="ParagraphStyle") + style.name = style_name + paragraph.style = style + return paragraph + + +class DescribeHeadingIssue: + """Unit-test suite for `docx.accessibility.HeadingIssue`.""" + + def it_exposes_its_paragraph_kind_and_message(self): + paragraph_ = Mock(name="Paragraph") + issue = HeadingIssue( + paragraph=paragraph_, kind=SKIPPED_LEVEL, message="Heading 3 follows Heading 1" + ) + assert issue.paragraph is paragraph_ + assert issue.kind == SKIPPED_LEVEL + assert issue.message == "Heading 3 follows Heading 1" + + def it_is_immutable(self): + paragraph_ = Mock(name="Paragraph") + issue = HeadingIssue(paragraph=paragraph_, kind=SKIPPED_LEVEL, message="msg") + with pytest.raises(Exception): + issue.kind = MULTIPLE_H1 # type: ignore[misc] + + +class Describe_heading_level: + """Unit-test suite for `docx.accessibility._heading_level`.""" + + @pytest.mark.parametrize( + ("style_name", "expected"), + [ + ("Heading 1", 1), + ("Heading 2", 2), + ("Heading 9", 9), + ("heading 3", 3), + ("HEADING 4", 4), + (" Heading 2 ", 2), + ("Normal", None), + ("Title", None), + ("Heading 10", None), + ("Heading", None), + ("Heading1", None), + ("", None), + (None, None), + ], + ) + def it_returns_the_level_for_a_heading_style(self, style_name, expected): + paragraph = _fake_paragraph(style_name) + assert _heading_level(paragraph) == expected + + def it_returns_None_when_paragraph_has_no_style(self): + paragraph = _fake_paragraph(None) + assert _heading_level(paragraph) is None + + +class DescribeValidateHeadingStructure: + """Unit-test suite for `docx.accessibility.validate_heading_structure`.""" + + def it_returns_empty_list_for_no_paragraphs(self): + assert validate_heading_structure([]) == [] + + def it_returns_empty_list_for_document_with_no_headings(self): + paragraphs = [ + _fake_paragraph("Normal"), + _fake_paragraph("Body Text"), 
+ _fake_paragraph(None), + ] + assert validate_heading_structure(paragraphs) == [] + + def it_returns_empty_list_for_well_formed_heading_structure(self): + paragraphs = [ + _fake_paragraph("Heading 1"), + _fake_paragraph("Normal"), + _fake_paragraph("Heading 2"), + _fake_paragraph("Normal"), + _fake_paragraph("Heading 3"), + _fake_paragraph("Heading 2"), + _fake_paragraph("Heading 3"), + ] + assert validate_heading_structure(paragraphs) == [] + + def it_reports_skipped_levels(self): + h1 = _fake_paragraph("Heading 1") + h3 = _fake_paragraph("Heading 3") + paragraphs = [h1, _fake_paragraph("Normal"), h3] + + issues = validate_heading_structure(paragraphs) + + assert len(issues) == 1 + assert issues[0].paragraph is h3 + assert issues[0].kind == SKIPPED_LEVEL + assert "Heading 3 follows Heading 1" in issues[0].message + assert "Heading 2 is missing" in issues[0].message + + def it_reports_skipped_levels_spanning_multiple_levels(self): + h1 = _fake_paragraph("Heading 1") + h4 = _fake_paragraph("Heading 4") + issues = validate_heading_structure([h1, h4]) + + assert len(issues) == 1 + assert issues[0].kind == SKIPPED_LEVEL + # -- when the jump spans multiple levels, we name the first missing level -- + assert "Heading 2 is missing" in issues[0].message + + def it_reports_multiple_h1_paragraphs(self): + h1a = _fake_paragraph("Heading 1") + h1b = _fake_paragraph("Heading 1") + h1c = _fake_paragraph("Heading 1") + + issues = validate_heading_structure([h1a, h1b, h1c]) + + # -- only the second and subsequent H1s are flagged -- + multi = [i for i in issues if i.kind == MULTIPLE_H1] + assert len(multi) == 2 + assert multi[0].paragraph is h1b + assert multi[1].paragraph is h1c + + def it_reports_empty_heading_paragraphs(self): + empty = _fake_paragraph("Heading 1", text="") + whitespace = _fake_paragraph("Heading 2", text=" \t\n ") + + issues = validate_heading_structure([empty, whitespace]) + + empties = [i for i in issues if i.kind == EMPTY_HEADING] + assert len(empties) == 2 
+ assert empties[0].paragraph is empty + assert empties[1].paragraph is whitespace + assert "empty" in empties[0].message.lower() + + def it_does_not_report_empty_heading_for_nonheading_paragraphs(self): + paragraphs = [ + _fake_paragraph("Normal", text=""), + _fake_paragraph("Heading 1"), + ] + issues = validate_heading_structure(paragraphs) + assert [i for i in issues if i.kind == EMPTY_HEADING] == [] + + def it_reports_no_h1_when_first_heading_is_below_H1(self): + h2 = _fake_paragraph("Heading 2") + h3 = _fake_paragraph("Heading 3") + + issues = validate_heading_structure([_fake_paragraph("Normal"), h2, h3]) + + no_h1 = [i for i in issues if i.kind == NO_H1] + assert len(no_h1) == 1 + assert no_h1[0].paragraph is h2 + assert "Heading 1" in no_h1[0].message + + def it_does_not_report_no_h1_when_first_heading_is_H1(self): + paragraphs = [_fake_paragraph("Heading 1"), _fake_paragraph("Heading 2")] + issues = validate_heading_structure(paragraphs) + assert [i for i in issues if i.kind == NO_H1] == [] + + def it_ignores_non_heading_paragraphs_between_headings(self): + h1 = _fake_paragraph("Heading 1") + h2 = _fake_paragraph("Heading 2") + paragraphs = [ + h1, + _fake_paragraph("Normal"), + _fake_paragraph("Body Text"), + _fake_paragraph(None), + h2, + ] + assert validate_heading_structure(paragraphs) == [] + + def it_reports_multiple_issues_in_document_order(self): + h2 = _fake_paragraph("Heading 2") # triggers NO_H1 + h4 = _fake_paragraph("Heading 4") # triggers SKIPPED_LEVEL + h1 = _fake_paragraph("Heading 1", text="") # triggers EMPTY_HEADING + + issues = validate_heading_structure([h2, h4, h1]) + + # -- h2 produces NO_H1; h4 produces SKIPPED_LEVEL; h1 produces EMPTY_HEADING -- + kinds = [i.kind for i in issues] + assert NO_H1 in kinds + assert SKIPPED_LEVEL in kinds + assert EMPTY_HEADING in kinds + + # -- issues appear in document order -- + paragraph_order = [h2, h4, h1] + assert [ + paragraph_order.index(i.paragraph) for i in issues + ] == 
sorted([paragraph_order.index(i.paragraph) for i in issues]) + + +class DescribeDocument_validate_heading_structure: + """Integration test: `Document.validate_heading_structure` delegates to the helper.""" + + def it_calls_the_module_function_with_document_paragraphs(self): + from typing import cast + from unittest.mock import patch + + from docx.document import Document + from docx.oxml.document import CT_Document + + from .unitutil.cxml import element + + document_elm = cast( + CT_Document, + element('w:document/w:body/w:p/w:r/w:t"just text"'), + ) + doc = Document(document_elm, Mock(name="DocumentPart")) + + with patch( + "docx.accessibility.validate_heading_structure", + return_value=[Mock(name="HeadingIssue")], + ) as validate_mock: + result = doc.validate_heading_structure() + + # -- the helper is called exactly once, with the Sequence of + # -- Paragraph objects exposed by Document.paragraphs (now a + # -- lightweight view rather than a plain list — the helper + # -- accepts Iterable[Paragraph], so either is fine). 
-- + validate_mock.assert_called_once() + (call_paragraphs,) = validate_mock.call_args.args + from docx.text.paragraph import Paragraph + + assert len(call_paragraphs) == 1 + assert isinstance(call_paragraphs[0], Paragraph) + assert result == validate_mock.return_value diff --git a/tests/test_alt_chunk.py b/tests/test_alt_chunk.py new file mode 100644 index 000000000..231a3cb25 --- /dev/null +++ b/tests/test_alt_chunk.py @@ -0,0 +1,123 @@ +# pyright: reportPrivateUsage=false + +"""Unit test suite for the docx.alt_chunk module.""" + +from __future__ import annotations + +from docx import Document +from docx.alt_chunk import AltChunk +from docx.document import Document as DocumentCls +from docx.opc.constants import RELATIONSHIP_TYPE as RT +from docx.parts.alt_chunk import AltChunkPart, _ext_for_content_type + + +class DescribeDocumentAddAltChunk: + """Unit-test suite for `Document.add_alt_chunk`.""" + + def it_returns_an_AltChunk_proxy(self): + document: DocumentCls = Document() + + alt_chunk = document.add_alt_chunk("

hello

") + + assert isinstance(alt_chunk, AltChunk) + + def it_appends_a_w_altChunk_element_to_the_body(self): + document: DocumentCls = Document() + + document.add_alt_chunk(b"

hi

") + + body = document._element.body + assert len(body.altChunk_lst) == 1 + + def it_creates_a_relationship_with_aFChunk_reltype(self): + document: DocumentCls = Document() + + alt_chunk = document.add_alt_chunk("

hi

") + + # -- the rId on the altChunk element resolves to an AltChunkPart -- + assert alt_chunk.rId is not None + assert isinstance(alt_chunk.part, AltChunkPart) + # -- and the relationship type is aFChunk -- + document_part = document._part + assert document_part.rels[alt_chunk.rId].reltype == RT.A_F_CHUNK + + def it_encodes_str_content_as_utf_8_bytes(self): + document: DocumentCls = Document() + + alt_chunk = document.add_alt_chunk("café", content_type="text/plain") + + assert alt_chunk.content == "café".encode("utf-8") + + def it_defaults_the_content_type_to_text_html(self): + document: DocumentCls = Document() + + alt_chunk = document.add_alt_chunk("

hi

") + + assert alt_chunk.content_type == "text/html" + + def it_accepts_a_custom_content_type(self): + document: DocumentCls = Document() + + alt_chunk = document.add_alt_chunk(b"{\\rtf1}", content_type="application/rtf") + + assert alt_chunk.content_type == "application/rtf" + + +class DescribeDocumentAltChunks: + """Unit-test suite for `Document.alt_chunks`.""" + + def it_returns_an_empty_list_when_there_are_no_altChunks(self): + document: DocumentCls = Document() + + assert document.alt_chunks == [] + + def it_lists_one_proxy_per_altChunk_in_document_order(self): + document: DocumentCls = Document() + document.add_alt_chunk("

first

") + document.add_alt_chunk("

second

", content_type="text/html") + + chunks = document.alt_chunks + + assert len(chunks) == 2 + assert all(isinstance(ch, AltChunk) for ch in chunks) + assert chunks[0].content == b"

first

" + assert chunks[1].content == b"

second

" + + def it_round_trips_through_save_and_open(self, tmp_path): + document: DocumentCls = Document() + document.add_alt_chunk("

hello

", content_type="text/html") + path = tmp_path / "roundtrip.docx" + document.save(str(path)) + + reopened: DocumentCls = Document(str(path)) + + chunks = reopened.alt_chunks + assert len(chunks) == 1 + assert chunks[0].content_type == "text/html" + assert chunks[0].content == b"

hello

" + + +class DescribeAltChunkPart: + """Unit-test suite for `docx.parts.alt_chunk.AltChunkPart`.""" + + def it_picks_a_partname_extension_from_the_content_type(self): + assert _ext_for_content_type("text/html") == ".html" + assert _ext_for_content_type("application/rtf") == ".rtf" + assert _ext_for_content_type("text/rtf") == ".rtf" + assert _ext_for_content_type("application/xhtml+xml") == ".xhtml" + assert _ext_for_content_type("text/plain") == ".txt" + assert _ext_for_content_type("application/msword") == ".doc" + assert _ext_for_content_type("weird/thing") == ".bin" + + def it_can_be_loaded_from_blob(self): + # -- simulate the PartFactory.load path -- + from docx.opc.packuri import PackURI + + part = AltChunkPart.load( + PackURI("/word/afchunk1.html"), + "text/html", + b"

x

", + None, # type: ignore[arg-type] + ) + assert part.blob == b"

x

" + assert part.content_type == "text/html" diff --git a/tests/test_api.py b/tests/test_api.py index 6b5d3ae07..64de24a52 100644 --- a/tests/test_api.py +++ b/tests/test_api.py @@ -1,13 +1,68 @@ """Test suite for the docx.api module.""" +import io +import zipfile +from pathlib import Path + import pytest +from lxml import etree from docx.api import Document as DocumentFactoryFn from docx.document import Document as DocumentCls +from docx.exceptions import EncryptedDocumentError from docx.opc.constants import CONTENT_TYPE as CT +from docx.opc.exceptions import PackageNotFoundError from .unitutil.mock import FixtureRequest, Mock, class_mock, function_mock, instance_mock +_OLE_SIGNATURE = b"\xd0\xcf\x11\xe0\xa1\xb1\x1a\xe1" + + +def _make_malformed_docx_bytes() -> bytes: + """Return a zip-packaged .docx whose `word/document.xml` is truncated mid-tag. + + The rest of the package is valid so recovery mode has something to graft + the degraded document part onto. + """ + tpl_path = Path(__file__).parent.parent / "src" / "docx" / "templates" / "default.docx" + with open(tpl_path, "rb") as f: + blob = f.read() + out = io.BytesIO() + with zipfile.ZipFile(io.BytesIO(blob), "r") as zin: + with zipfile.ZipFile(out, "w", zipfile.ZIP_DEFLATED) as zout: + for item in zin.infolist(): + data = zin.read(item.filename) + if item.filename == "word/document.xml": + data = data[: len(data) // 2] # -- truncate mid-element -- + zout.writestr(item, data) + return out.getvalue() + + +def _make_valid_docx_bytes() -> bytes: + tpl_path = Path(__file__).parent.parent / "src" / "docx" / "templates" / "default.docx" + with open(tpl_path, "rb") as f: + return f.read() + + +def _make_empty_document_xml_docx_bytes() -> bytes: + """Return a valid .docx whose `word/document.xml` is an empty byte string. + + Empty content is unrecoverable even with ``recover=True`` — forces the + stub-element fallback in ``XmlPart.load``. 
+ """ + tpl_path = Path(__file__).parent.parent / "src" / "docx" / "templates" / "default.docx" + with open(tpl_path, "rb") as f: + blob = f.read() + out = io.BytesIO() + with zipfile.ZipFile(io.BytesIO(blob), "r") as zin: + with zipfile.ZipFile(out, "w", zipfile.ZIP_DEFLATED) as zout: + for item in zin.infolist(): + data = zin.read(item.filename) + if item.filename == "word/document.xml": + data = b"" + zout.writestr(item, data) + return out.getvalue() + class DescribeDocument: """Unit-test suite for `docx.api.Document` factory function.""" @@ -19,20 +74,89 @@ def it_opens_a_docx_file(self, Package_: Mock, document_: Mock): document = DocumentFactoryFn("foobar.docx") - Package_.open.assert_called_once_with("foobar.docx") + Package_.open.assert_called_once_with( + "foobar.docx", recover=False, huge_tree=False, password=None + ) + assert document is document_ + + def it_accepts_a_PathLike_docx_path(self, Package_: Mock, document_: Mock): + # -- upstream-PR#1168: accept os.PathLike (e.g. pathlib.Path) -- + document_part = Package_.open.return_value.main_document_part + document_part.document = document_ + document_part.content_type = CT.WML_DOCUMENT_MAIN + + document = DocumentFactoryFn(Path("foobar.docx")) + + # -- os.fspath normalises the PathLike to str before delegating -- + Package_.open.assert_called_once_with( + "foobar.docx", recover=False, huge_tree=False, password=None + ) assert document is document_ def it_opens_the_default_docx_if_none_specified( - self, _default_docx_path_: Mock, Package_: Mock, document_: Mock + self, _default_docx_stream_: Mock, Package_: Mock, document_: Mock ): - _default_docx_path_.return_value = "default-document.docx" + default_stream = io.BytesIO(b"fake-default-bytes") + _default_docx_stream_.return_value = default_stream document_part = Package_.open.return_value.main_document_part document_part.document = document_ document_part.content_type = CT.WML_DOCUMENT_MAIN document = DocumentFactoryFn() - 
Package_.open.assert_called_once_with("default-document.docx") + Package_.open.assert_called_once_with( + default_stream, recover=False, huge_tree=False, password=None + ) + assert document is document_ + + def it_sources_the_default_docx_via_importlib_resources(self): + # -- PyInstaller / cx_freeze / zipimport safety: must not rely on __file__ path. + # -- Closes upstream#176, upstream-PR#1310, upstream-PR#177. + from docx.api import _default_docx_stream + + data_stream = _default_docx_stream() + + assert isinstance(data_stream, io.BytesIO) + # -- first four bytes of every .docx package are the PK\x03\x04 zip signature -- + assert data_stream.getvalue()[:4] == b"PK\x03\x04" + + def it_produces_a_usable_default_Document_instance(self): + # -- round-trip sanity check: Document() with no arg yields a real Document -- + document = DocumentFactoryFn() + + assert isinstance(document, DocumentCls) + + def it_strips_metadata_when_include_metadata_is_False(self): + # -- default template ships with Application, AppVersion, Template, etc. 
+ # -- baseline: with include_metadata=True (default), those survive -- + document = DocumentFactoryFn(include_metadata=False) + + # -- core properties cleared -- + assert document.core_properties.author == "" + assert document.core_properties.title == "" + assert document.core_properties.last_modified_by == "" + assert document.core_properties.modified is None + # -- extended properties cleared -- + assert document.extended_properties.application is None + assert document.extended_properties.app_version is None + assert document.extended_properties.template is None + + def it_keeps_metadata_by_default(self): + document = DocumentFactoryFn() + + # -- the bundled template writes a known Application name -- + assert document.extended_properties.application is not None + + def it_opens_a_docm_file(self, Package_: Mock, document_: Mock): + document_part = Package_.open.return_value.main_document_part + document_part.document = document_ + document_part.content_type = CT.WML_DOCUMENT_MACRO + + document = DocumentFactoryFn("foobar.docm") + + Package_.open.assert_called_once_with( + "foobar.docm", recover=False, huge_tree=False, password=None + ) assert document is document_ def it_raises_on_not_a_Word_file(self, Package_: Mock): @@ -41,11 +165,202 @@ def it_raises_on_not_a_Word_file(self, Package_: Mock): with pytest.raises(ValueError, match="file 'foobar.xlsx' is not a Word file,"): DocumentFactoryFn("foobar.xlsx") + def it_raises_EncryptedDocumentError_on_password_protected_path(self, tmp_path): + encrypted_path = tmp_path / "encrypted.docx" + encrypted_path.write_bytes(_OLE_SIGNATURE + b"\x00" * 512) + + with pytest.raises(EncryptedDocumentError, match="python-ooxml-crypto"): + DocumentFactoryFn(str(encrypted_path)) + + def it_raises_FileNotFoundError_on_missing_path(self, tmp_path): + # -- upstream#1410: missing file must raise FileNotFoundError so it + # -- behaves like a normal filesystem-missing error. 
-- + missing = str(tmp_path / "no-such-file.docx") + + with pytest.raises(FileNotFoundError): + DocumentFactoryFn(missing) + + def it_raises_NotADocxError_on_non_zip_file(self, tmp_path): + # -- upstream#1410: existing file that isn't a zip raises NotADocxError -- + from docx.opc.exceptions import NotADocxError + + plain = tmp_path / "plain.docx" + plain.write_bytes(b"this is just text, not a zip") + + with pytest.raises(NotADocxError): + DocumentFactoryFn(str(plain)) + + def it_raises_EncryptedDocumentError_on_password_protected_stream(self): + stream = io.BytesIO(_OLE_SIGNATURE + b"\x00" * 512) + + with pytest.raises(EncryptedDocumentError, match="password-protected"): + DocumentFactoryFn(stream) + + def it_raises_on_malformed_document_xml_by_default(self): + stream = io.BytesIO(_make_malformed_docx_bytes()) + + with pytest.raises(etree.XMLSyntaxError): + DocumentFactoryFn(stream) + + def it_opens_malformed_document_in_recover_mode(self): + stream = io.BytesIO(_make_malformed_docx_bytes()) + + document = DocumentFactoryFn(stream, recover=True) + + assert isinstance(document, DocumentCls) + assert len(document.recovery_warnings) > 0 + assert all(isinstance(w, str) for w in document.recovery_warnings) + + def it_reports_no_warnings_for_valid_document_in_recover_mode(self): + stream = io.BytesIO(_make_valid_docx_bytes()) + + document = DocumentFactoryFn(stream, recover=True) + + assert document.recovery_warnings == [] + + def it_recovery_mode_still_raises_for_invalid_zip(self, tmp_path): + not_a_zip = tmp_path / "bogus.docx" + not_a_zip.write_bytes(b"this is not a zip file") + + with pytest.raises(PackageNotFoundError): + DocumentFactoryFn(str(not_a_zip), recover=True) + + def it_raises_PackageNotFoundError_on_zip_missing_content_types(self, tmp_path): + # -- Regression for issue #172: a zip that happens to be a valid archive + # -- but lacks `[Content_Types].xml` used to surface a bare + # -- `KeyError("[Content_Types].xml")` from `zipfile.read`, which leaks + # -- 
the internal file name and is hard to match on. Now wrapped in a + # -- typed `PackageNotFoundError`. -- + bogus = tmp_path / "no-content-types.docx" + buf = io.BytesIO() + with zipfile.ZipFile(buf, "w", zipfile.ZIP_DEFLATED) as z: + z.writestr("word/document.xml", b"") + bogus.write_bytes(buf.getvalue()) + + with pytest.raises(PackageNotFoundError, match=r"\[Content_Types\]\.xml"): + DocumentFactoryFn(str(bogus)) + + def it_raises_PackageNotFoundError_on_zip_missing_content_types_in_recover_mode( + self, tmp_path + ): + # -- The wrapping happens at the OPC load boundary, before recovery mode + # -- gets a chance to kick in. `PackageNotFoundError` must surface even + # -- when the caller opts into `recover=True`. -- + bogus = tmp_path / "no-content-types.docx" + buf = io.BytesIO() + with zipfile.ZipFile(buf, "w", zipfile.ZIP_DEFLATED) as z: + z.writestr("word/document.xml", b"") + bogus.write_bytes(buf.getvalue()) + + with pytest.raises(PackageNotFoundError): + DocumentFactoryFn(str(bogus), recover=True) + + def it_recovery_mode_still_raises_for_encrypted_docx(self, tmp_path): + encrypted_path = tmp_path / "encrypted.docx" + encrypted_path.write_bytes(_OLE_SIGNATURE + b"\x00" * 512) + + with pytest.raises(EncryptedDocumentError): + DocumentFactoryFn(str(encrypted_path), recover=True) + + def it_defaults_recover_to_False_for_valid_document(self): + stream = io.BytesIO(_make_valid_docx_bytes()) + + document = DocumentFactoryFn(stream) + + assert document.recovery_warnings == [] + + def it_falls_back_to_stub_when_document_xml_is_empty(self): + stream = io.BytesIO(_make_empty_document_xml_docx_bytes()) + + document = DocumentFactoryFn(stream, recover=True) + + assert isinstance(document, DocumentCls) + assert document.paragraphs == [] + assert len(document.recovery_warnings) >= 1 + + def it_passes_recover_True_through_to_Package_open( + self, Package_: Mock, document_: Mock + ): + document_part = Package_.open.return_value.main_document_part + document_part.document = 
document_ + document_part.content_type = CT.WML_DOCUMENT_MAIN + + DocumentFactoryFn("foobar.docx", recover=True) + + Package_.open.assert_called_once_with( + "foobar.docx", recover=True, huge_tree=False, password=None + ) + + def it_passes_huge_tree_True_through_to_Package_open( + self, Package_: Mock, document_: Mock + ): + # -- upstream#1086: huge_tree=True must propagate to Package.open -- + document_part = Package_.open.return_value.main_document_part + document_part.document = document_ + document_part.content_type = CT.WML_DOCUMENT_MAIN + + DocumentFactoryFn("foobar.docx", huge_tree=True) + + Package_.open.assert_called_once_with( + "foobar.docx", recover=False, huge_tree=True, password=None + ) + + def it_defaults_huge_tree_to_False(self, Package_: Mock, document_: Mock): + document_part = Package_.open.return_value.main_document_part + document_part.document = document_ + document_part.content_type = CT.WML_DOCUMENT_MAIN + + DocumentFactoryFn("foobar.docx") + + Package_.open.assert_called_once_with( + "foobar.docx", recover=False, huge_tree=False, password=None + ) + + def it_passes_password_through_to_Package_open( + self, Package_: Mock, document_: Mock + ): + document_part = Package_.open.return_value.main_document_part + document_part.document = document_ + document_part.content_type = CT.WML_DOCUMENT_MAIN + + DocumentFactoryFn("protected.docx", password="hunter2") + + Package_.open.assert_called_once_with( + "protected.docx", recover=False, huge_tree=False, password="hunter2" + ) + + def it_ships_hanging_indents_on_List_Bullet_and_List_Number(self): + # -- upstream#1443: default.docx used to omit hanging indents on these + # -- list styles so Word-rendered bullets collided with paragraph text. 
-- + document = DocumentFactoryFn() + + for name in ("List Bullet", "List Number"): + pf = document.styles[name].paragraph_format + assert pf.left_indent is not None and pf.left_indent > 0, ( + f"style {name!r} has no left_indent" + ) + assert pf.first_line_indent is not None and pf.first_line_indent < 0, ( + f"style {name!r} has no hanging (negative first-line) indent" + ) + + def it_preserves_List_Bullet_indents_after_round_trip(self): + # -- upstream#1443: round-trip through save/open must preserve indents -- + document = DocumentFactoryFn() + buf = io.BytesIO() + document.save(buf) + buf.seek(0) + + reopened = DocumentFactoryFn(buf) + + pf = reopened.styles["List Bullet"].paragraph_format + assert pf.left_indent is not None and pf.left_indent > 0 + assert pf.first_line_indent is not None and pf.first_line_indent < 0 + # -- fixtures -------------------------------------------------------------------------------- @pytest.fixture - def _default_docx_path_(self, request: FixtureRequest): - return function_mock(request, "docx.api._default_docx_path") + def _default_docx_stream_(self, request: FixtureRequest): + return function_mock(request, "docx.api._default_docx_stream") @pytest.fixture def document_(self, request: FixtureRequest): @@ -54,3 +369,83 @@ def document_(self, request: FixtureRequest): @pytest.fixture def Package_(self, request: FixtureRequest): return class_mock(request, "docx.api.Package") + + +class DescribePasswordRoundTrip: + """Integration tests for encrypted Document open/save via ``python-ooxml-crypto``.""" + + def _requires_ooxml_crypto(self): + import importlib.util + + if importlib.util.find_spec("ooxml_crypto") is None: + pytest.skip( + "python-ooxml-crypto is not installed (optional dependency)" + ) + + def it_round_trips_through_a_stream(self): + self._requires_ooxml_crypto() + + document = DocumentFactoryFn() + document.add_paragraph("encrypted round-trip body") + + buf = io.BytesIO() + document.save(buf, password="hunter2") + + # -- the 
saved bytes are a CFBF (OLE2) container, not a plain zip -- + assert buf.getvalue()[:8] == _OLE_SIGNATURE + + buf.seek(0) + reopened = DocumentFactoryFn(buf, password="hunter2") + + texts = [p.text for p in reopened.paragraphs] + assert "encrypted round-trip body" in texts + + def it_round_trips_through_a_path(self, tmp_path): + self._requires_ooxml_crypto() + + document = DocumentFactoryFn() + document.add_paragraph("encrypted round-trip body via path") + + out_path = tmp_path / "protected.docx" + document.save(str(out_path), password="hunter2") + + # -- the saved bytes are a CFBF (OLE2) container, not a plain zip -- + with open(out_path, "rb") as f: + assert f.read(8) == _OLE_SIGNATURE + + reopened = DocumentFactoryFn(str(out_path), password="hunter2") + + texts = [p.text for p in reopened.paragraphs] + assert "encrypted round-trip body via path" in texts + + def it_raises_EncryptedDocumentError_with_wrong_password(self, tmp_path): + self._requires_ooxml_crypto() + + document = DocumentFactoryFn() + document.add_paragraph("wrong-password reject test") + out_path = tmp_path / "protected.docx" + document.save(str(out_path), password="correct") + + with pytest.raises(EncryptedDocumentError, match="password does not match"): + DocumentFactoryFn(str(out_path), password="incorrect") + + def it_raises_EncryptedDocumentError_when_password_is_missing(self, tmp_path): + self._requires_ooxml_crypto() + + document = DocumentFactoryFn() + document.add_paragraph("missing-password reject test") + out_path = tmp_path / "protected.docx" + document.save(str(out_path), password="correct") + + with pytest.raises(EncryptedDocumentError, match="password-protected"): + DocumentFactoryFn(str(out_path)) + + def it_rejects_flat_opc_with_password(self, tmp_path): + # -- flat_opc and password are mutually exclusive: Flat-OPC is not a zip. 
-- + document = DocumentFactoryFn() + + out_path = tmp_path / "protected.xml" + with pytest.raises(ValueError, match="mutually exclusive"): + document.save(str(out_path), flat_opc=True, password="hunter2") + + diff --git a/tests/test_append_document.py b/tests/test_append_document.py new file mode 100644 index 000000000..26f895eeb --- /dev/null +++ b/tests/test_append_document.py @@ -0,0 +1,117 @@ +"""Test suite for docx.append_document.""" + +from __future__ import annotations + +import io +from pathlib import Path + +import pytest + +from docx import Document +from docx.opc.constants import RELATIONSHIP_TYPE as RT + + +TEST_PNG = Path(__file__).parent / "test_files" / "python-icon.png" + + +class DescribeAppendDocument: + """Covers `Document.append_document / append_body / append_paragraph` (upstream#1457 et al.).""" + + def it_copies_every_paragraph_from_source_body(self): + src = Document() + src.add_paragraph("first") + src.add_paragraph("second") + + dest = Document() + dest._body.clear_content() + + copied = dest.append_document(src) + + assert copied >= 2 + texts = [p.text for p in dest.paragraphs] + assert "first" in texts + assert "second" in texts + + def it_copies_the_Heading_1_style_when_source_uses_it(self): + src = Document() + src.add_heading("Chapter One", level=1) + + dest = Document() + dest._body.clear_content() + # -- drop the default Heading 1 style so we can verify it's copied in -- + # -- (default.docx ships with it; that's fine, the test still verifies + # -- presence after append). 
-- + + dest.append_document(src) + + assert "Heading 1" in dest.styles + + def it_imports_image_parts_and_rewrites_rIds_for_referenced_images(self): + if not TEST_PNG.exists(): + pytest.skip("test PNG fixture unavailable") + + src = Document() + src.add_picture(str(TEST_PNG)) + + dest = Document() + dest._body.clear_content() + + dest.append_document(src) + + image_rels = [ + r for r in dest.part.rels.values() if r.reltype == RT.IMAGE + ] + assert len(image_rels) >= 1 + + def it_survives_a_save_and_reopen_roundtrip(self): + src = Document() + src.add_heading("Heading", level=1) + src.add_paragraph("hello") + + dest = Document() + dest._body.clear_content() + dest.append_document(src) + + buf = io.BytesIO() + dest.save(buf) + buf.seek(0) + reopened = Document(buf) + + texts = [p.text for p in reopened.paragraphs] + assert "Heading" in texts + assert "hello" in texts + + def it_exposes_append_body_as_an_alias(self): + src = Document() + src.add_paragraph("body-only") + dest = Document() + dest._body.clear_content() + + copied = dest.append_body(src) + + assert copied >= 1 + assert "body-only" in [p.text for p in dest.paragraphs] + + def it_can_append_a_single_paragraph(self): + src = Document() + para = src.add_paragraph("one paragraph") + + dest = Document() + dest._body.clear_content() + + new_para = dest.append_paragraph(para) + + assert new_para.text == "one paragraph" + assert "one paragraph" in [p.text for p in dest.paragraphs] + + def it_preserves_destination_section_settings(self): + src = Document() + src.add_paragraph("src para") + + dest = Document() + # -- destination keeps its existing sectPr; appended content goes before it -- + original_sections = len(dest.sections) + + dest.append_document(src) + + assert len(dest.sections) == original_sections diff --git a/tests/test_attachments.py b/tests/test_attachments.py new file mode 100644 index 000000000..398143322 --- /dev/null +++ b/tests/test_attachments.py @@ -0,0 +1,99 @@ +# pyright: 
reportPrivateUsage=false + +"""Unit-test suite for `docx.attachments` + `Document.attachments`.""" + +from __future__ import annotations + +from typing import cast + +from docx.attachments import Attachment +from docx.document import Document +from docx.opc.packuri import PackURI +from docx.opc.part import Part +from docx.oxml.document import CT_Document +from docx.parts.document import DocumentPart + +from .unitutil.cxml import element +from .unitutil.mock import FixtureRequest, instance_mock + + +def _html_part(blob: bytes = b"hi") -> Part: + part = Part(PackURI("/word/afchunk.html"), "text/html", blob) + return part + + +class DescribeAttachment: + def it_knows_its_r_id(self): + alt = element("w:altChunk{r:id=rId7}") + att = Attachment(alt, None) + assert att.r_id == "rId7" + + def it_returns_None_r_id_when_missing(self): + alt = element("w:altChunk") + att = Attachment(alt, None) + assert att.r_id is None + + def it_exposes_blob_and_content_type_when_resolved(self): + alt = element("w:altChunk{r:id=rId1}") + part = _html_part() + att = Attachment(alt, part) + assert att.blob == b"hi" + assert att.content_type == "text/html" + assert att.partname == "/word/afchunk.html" + + def it_returns_empty_blob_when_unresolved(self): + alt = element("w:altChunk{r:id=rId99}") + att = Attachment(alt, None) + assert att.blob == b"" + assert att.content_type is None + assert att.partname is None + + +class DescribeDocument_attachments: + def it_returns_empty_list_when_no_altChunks(self, request: FixtureRequest): + doc_elm = cast(CT_Document, element("w:document/w:body/w:p")) + document_part_ = instance_mock(request, DocumentPart) + document_part_.related_parts = {} + document = Document(doc_elm, document_part_) + + assert document.attachments == [] + + def it_enumerates_each_altChunk(self, request: FixtureRequest): + part_html = _html_part(b"A") + part_rtf = Part(PackURI("/word/afchunk.rtf"), "application/rtf", b"{\\rtf1}") + document_part_ = instance_mock(request, 
DocumentPart) + document_part_.related_parts = { + "rId10": part_html, + "rId11": part_rtf, + } + doc_elm = cast( + CT_Document, + element( + "w:document/w:body/(" + "w:altChunk{r:id=rId10}," + "w:p," + "w:altChunk{r:id=rId11}" + ")" + ), + ) + document = Document(doc_elm, document_part_) + + atts = document.attachments + assert len(atts) == 2 + assert [a.r_id for a in atts] == ["rId10", "rId11"] + assert [a.content_type for a in atts] == ["text/html", "application/rtf"] + assert [a.blob for a in atts] == [b"A", b"{\\rtf1}"] + + def it_handles_unresolved_altChunk(self, request: FixtureRequest): + document_part_ = instance_mock(request, DocumentPart) + document_part_.related_parts = {} + doc_elm = cast( + CT_Document, + element("w:document/w:body/w:altChunk{r:id=rIdMissing}"), + ) + document = Document(doc_elm, document_part_) + atts = document.attachments + assert len(atts) == 1 + assert atts[0].blob == b"" + assert atts[0].partname is None + assert atts[0].content_type is None diff --git a/tests/test_bibliography.py b/tests/test_bibliography.py new file mode 100644 index 000000000..598b9ee53 --- /dev/null +++ b/tests/test_bibliography.py @@ -0,0 +1,156 @@ +# pyright: reportPrivateUsage=false + +"""Unit-test suite for `docx.bibliography` proxy + write-side integration.""" + +from __future__ import annotations + +import io +import zipfile + +import pytest + +from docx import Document +from docx.bibliography import Bibliography, Source +from docx.oxml.bibliography import new_sources_root +from docx.oxml.ns import qn + + +class DescribeBibliography: + """Unit-test suite for the `Bibliography` proxy.""" + + def it_iterates_Source_proxies_for_each_b_Source_child(self): + sources = new_sources_root() + sources.add_source_from_kwargs("a", title="Book A") + sources.add_source_from_kwargs("b", title="Book B") + bib = Bibliography(sources) + + found = list(bib) + + assert [s.tag for s in found] == ["a", "b"] + assert all(isinstance(s, Source) for s in found) + + def 
its_len_matches_the_number_of_sources(self): + sources = new_sources_root() + sources.add_source_from_kwargs("a") + sources.add_source_from_kwargs("b") + bib = Bibliography(sources) + + assert len(bib) == 2 + + def it_can_look_up_a_source_by_tag(self): + sources = new_sources_root() + sources.add_source_from_kwargs("alpha", title="A") + sources.add_source_from_kwargs("beta", title="B") + bib = Bibliography(sources) + + hit = bib.get_by_tag("beta") + + assert hit is not None + assert hit.tag == "beta" + assert hit.title == "B" + + def but_it_returns_None_for_an_unknown_tag(self): + bib = Bibliography(new_sources_root()) + + assert bib.get_by_tag("nope") is None + + def it_proxies_selected_style_and_style_name(self): + bib = Bibliography(new_sources_root()) + + bib.selected_style = "/MLA7.XSL" + bib.style_name = "MLA7" + + assert bib.selected_style == "/MLA7.XSL" + assert bib.style_name == "MLA7" + + def it_rejects_duplicate_tags(self): + bib = Bibliography(new_sources_root()) + bib.add_source("dup") + + with pytest.raises(ValueError, match="dup"): + bib.add_source("dup") + + +class DescribeDocument_add_citation: + """Smoke-level integration suite for `Document.add_citation`.""" + + def it_creates_a_bibliography_source_reachable_via_bibliography(self): + doc = Document() + + src = doc.add_citation( + "smith2020", title="A Book", author="Smith, J.", year=2020 + ) + + assert isinstance(src, Source) + assert doc.bibliography.get_by_tag("smith2020") is not None + hit = doc.bibliography.get_by_tag("smith2020") + assert hit is not None + assert hit.title == "A Book" + assert hit.year == "2020" + + def it_survives_a_save_reload_roundtrip(self): + doc = Document() + doc.add_citation( + "einstein1905", title="Zur Elektrodynamik", author="Einstein, A.", year=1905 + ) + buf = io.BytesIO() + doc.save(buf) + buf.seek(0) + reloaded = Document(buf) + + sources = list(reloaded.bibliography) + + assert [s.tag for s in sources] == ["einstein1905"] + assert sources[0].year == "1905" + 
+ def it_emits_a_citation_sdt_for_add_citation_reference(self): + doc = Document() + doc.add_citation("smith2020", title="Book") + p = doc.add_paragraph("See ") + cc = p.add_citation_reference("smith2020") + p.add_run(".") + + sdt = cc.element + # -- must carry marker -- + sdtPr = sdt.find(qn("w:sdtPr")) + assert sdtPr is not None + assert sdtPr.find(qn("w:citation")) is not None + # -- and a CITATION fieldcode inside sdtContent -- + sdtContent = sdt.find(qn("w:sdtContent")) + assert sdtContent is not None + instrs = sdtContent.findall(f".//{qn('w:instrText')}") + assert len(instrs) == 1 + assert "CITATION" in instrs[0].text + assert "smith2020" in instrs[0].text + + def it_writes_bibliography_xml_on_save(self): + doc = Document() + doc.add_citation( + "keynes1936", + title="The General Theory", + author="Keynes, J.M.", + year=1936, + ) + p = doc.add_paragraph() + p.add_citation_reference("keynes1936") + + buf = io.BytesIO() + doc.save(buf) + buf.seek(0) + with zipfile.ZipFile(buf) as zf: + names = set(zf.namelist()) + assert "customXml/item1.xml" in names + assert "customXml/itemProps1.xml" in names + item1 = zf.read("customXml/item1.xml").decode("utf-8") + assert "keynes1936" in item1 + assert "The General Theory" in item1 + + def it_supports_multiple_sources_in_one_part(self): + doc = Document() + + doc.add_citation("a", title="A", year=2001) + doc.add_citation("b", title="B", year=2002) + doc.add_citation("c", title="C", year=2003) + + tags = [s.tag for s in doc.bibliography] + assert tags == ["a", "b", "c"] diff --git a/tests/test_blkcntnr.py b/tests/test_blkcntnr.py index ab463663f..9ad823f5a 100644 --- a/tests/test_blkcntnr.py +++ b/tests/test_blkcntnr.py @@ -95,8 +95,14 @@ def it_provides_access_to_the_paragraphs_it_contains( assert len(paragraphs) == expected_count # -- is iterable -- assert all(isinstance(p, Paragraph) for p in paragraphs) - # -- is indexable -- - assert all(p is paragraphs[idx] for idx, p in enumerate(paragraphs)) + # -- is indexable — 
iteration and ``[idx]`` agree on the + # -- underlying ```` element. (We compare the wrapped + # -- element rather than using ``is``: the view re-wraps each + # -- access so the Paragraph proxies have distinct identities, + # -- but they're equivalent wrappers over the same element.) -- + assert all( + p._p is paragraphs[idx]._p for idx, p in enumerate(paragraphs) + ) @pytest.mark.parametrize( ("blkcntnr_cxml", "expected_count"), diff --git a/tests/test_bookmark_integration.py b/tests/test_bookmark_integration.py new file mode 100644 index 000000000..a2a0e3640 --- /dev/null +++ b/tests/test_bookmark_integration.py @@ -0,0 +1,216 @@ +# pyright: reportPrivateUsage=false + +"""Integration tests for bookmark feature across paragraph and document.""" + +from __future__ import annotations + +from typing import cast + +from docx.bookmarks import Bookmark, Bookmarks +from docx.oxml.document import CT_Body, CT_Document +from docx.oxml.ns import qn +from docx.text.paragraph import Paragraph + +from .unitutil.cxml import element + + +class DescribeParagraph_add_bookmark: + """Unit-test suite for `Paragraph.add_bookmark()`.""" + + def it_can_add_a_bookmark_wrapping_whole_paragraph(self): + body = cast(CT_Body, element('w:body/w:p/w:r/w:t"hello"')) + p_elm = body.p_lst[0] + para = Paragraph(p_elm, None) # type: ignore[arg-type] + + bm = para.add_bookmark("test_bm") + + assert isinstance(bm, Bookmark) + assert bm.name == "test_bm" + assert bm.bookmark_id == 0 + # -- bookmarkStart is first child (no pPr), bookmarkEnd is last -- + children = list(p_elm) + assert children[0].tag == qn("w:bookmarkStart") + assert children[-1].tag == qn("w:bookmarkEnd") + + def it_can_add_a_bookmark_wrapping_whole_paragraph_with_pPr(self): + body = cast(CT_Body, element('w:body/w:p/(w:pPr,w:r/w:t"hello")')) + p_elm = body.p_lst[0] + para = Paragraph(p_elm, None) # type: ignore[arg-type] + + bm = para.add_bookmark("test_bm") + + assert bm.name == "test_bm" + children = list(p_elm) + # -- pPr is first, 
then bookmarkStart, then run, then bookmarkEnd -- + assert children[0].tag == qn("w:pPr") + assert children[1].tag == qn("w:bookmarkStart") + assert children[-1].tag == qn("w:bookmarkEnd") + + def it_can_add_a_bookmark_around_specific_runs(self): + body = cast( + CT_Body, + element('w:body/w:p/(w:r/w:t"aaa",w:r/w:t"bbb",w:r/w:t"ccc")'), + ) + p_elm = body.p_lst[0] + para = Paragraph(p_elm, None) # type: ignore[arg-type] + runs = para.runs + + bm = para.add_bookmark("mid", start_run=runs[1], end_run=runs[1]) + + assert bm.name == "mid" + # -- bookmarkStart is before the second run, bookmarkEnd is after it -- + children = list(p_elm) + tags = [c.tag for c in children] + bs_idx = tags.index(qn("w:bookmarkStart")) + be_idx = tags.index(qn("w:bookmarkEnd")) + # bookmarkStart should be right before second w:r + assert tags[bs_idx + 1] == qn("w:r") + # bookmarkEnd should be right after that same w:r + assert be_idx == bs_idx + 2 + + def it_allocates_unique_ids(self): + body = cast(CT_Body, element('w:body/w:p/w:r/w:t"hello"')) + p_elm = body.p_lst[0] + para = Paragraph(p_elm, None) # type: ignore[arg-type] + + bm1 = para.add_bookmark("bm1") + bm2 = para.add_bookmark("bm2") + + assert bm1.bookmark_id == 0 + assert bm2.bookmark_id == 1 + + def it_can_add_a_bookmark_with_only_start_run(self): + body = cast( + CT_Body, + element('w:body/w:p/(w:r/w:t"aaa",w:r/w:t"bbb")'), + ) + p_elm = body.p_lst[0] + para = Paragraph(p_elm, None) # type: ignore[arg-type] + runs = para.runs + + bm = para.add_bookmark("single", start_run=runs[0]) + + assert bm.name == "single" + assert bm.bookmark_id == 0 + + +class DescribeDocument_bookmarks: + """Unit-test suite for `Document.bookmarks`.""" + + def it_provides_access_to_document_bookmarks(self): + from docx.document import Document + + doc_elm = cast( + CT_Document, + element( + "w:document/w:body/w:p/" + "(w:bookmarkStart{w:id=0,w:name=bm1},w:bookmarkEnd{w:id=0})" + ), + ) + doc = Document(doc_elm, None) # type: ignore[arg-type] + + 
bookmarks = doc.bookmarks + + assert isinstance(bookmarks, Bookmarks) + assert len(bookmarks) == 1 + bm = next(iter(bookmarks)) + assert bm.name == "bm1" + + +class DescribeDocument_add_bookmark: + """Unit-test suite for `Document.add_bookmark(runs, name)`.""" + + def it_adds_a_bookmark_spanning_a_single_run(self): + from docx.document import Document + + doc_elm = cast( + CT_Document, + element('w:document/w:body/w:p/w:r/w:t"hello"'), + ) + doc = Document(doc_elm, None) # type: ignore[arg-type] + run = doc.paragraphs[0].runs[0] + + bm = doc.add_bookmark(run, "single") + + assert isinstance(bm, Bookmark) + assert bm.name == "single" + assert bm.bookmark_id == 0 + body = doc_elm.body + assert len(body.xpath(".//w:bookmarkStart")) == 1 + assert len(body.xpath(".//w:bookmarkEnd")) == 1 + + def it_adds_a_bookmark_spanning_runs_across_paragraphs(self): + from docx.document import Document + + doc_elm = cast( + CT_Document, + element( + 'w:document/w:body/(w:p/w:r/w:t"aaa",w:p/w:r/w:t"bbb")' + ), + ) + doc = Document(doc_elm, None) # type: ignore[arg-type] + first_run = doc.paragraphs[0].runs[0] + last_run = doc.paragraphs[1].runs[0] + + bm = doc.add_bookmark([first_run, last_run], "spanning") + + assert bm.name == "spanning" + body = doc_elm.body + # -- bookmarkStart is a sibling of first_run inside its paragraph, + # bookmarkEnd is a sibling of last_run inside the second paragraph -- + p1_children = list(body.p_lst[0]) + p2_children = list(body.p_lst[1]) + assert p1_children[0].tag == qn("w:bookmarkStart") + assert p2_children[-1].tag == qn("w:bookmarkEnd") + + def it_allocates_unique_ids_across_calls(self): + from docx.document import Document + + doc_elm = cast( + CT_Document, + element('w:document/w:body/w:p/w:r/w:t"hello"'), + ) + doc = Document(doc_elm, None) # type: ignore[arg-type] + run = doc.paragraphs[0].runs[0] + + bm1 = doc.add_bookmark(run, "bm1") + bm2 = doc.add_bookmark(run, "bm2") + + assert bm1.bookmark_id == 0 + assert bm2.bookmark_id == 1 + + def 
it_raises_on_empty_runs_sequence(self): + import pytest + + from docx.document import Document + + doc_elm = cast( + CT_Document, + element("w:document/w:body"), + ) + doc = Document(doc_elm, None) # type: ignore[arg-type] + + with pytest.raises(ValueError, match="non-empty"): + doc.add_bookmark([], "oops") + + +class DescribeBookmark_name_setter: + """Unit-test suite for `Bookmark.name` setter.""" + + def it_can_rename_a_bookmark(self): + body = cast( + CT_Body, + element( + "w:body/w:p/(w:bookmarkStart{w:id=0,w:name=old_name}" + ",w:bookmarkEnd{w:id=0})" + ), + ) + bookmarks = Bookmarks(body) + bm = next(iter(bookmarks)) + + bm.name = "new_name" + + assert bm.name == "new_name" + # -- underlying w:bookmarkStart/@w:name reflects the rename -- + bookmarkStart = body.xpath(".//w:bookmarkStart")[0] + assert bookmarkStart.get(qn("w:name")) == "new_name" diff --git a/tests/test_bookmarks.py b/tests/test_bookmarks.py new file mode 100644 index 000000000..5fa8eceaa --- /dev/null +++ b/tests/test_bookmarks.py @@ -0,0 +1,140 @@ +# pyright: reportPrivateUsage=false + +"""Unit test suite for the `docx.bookmarks` module.""" + +from __future__ import annotations + +from typing import cast + +from docx.bookmarks import Bookmark, Bookmarks +from docx.oxml.bookmarks import CT_BookmarkStart +from docx.oxml.document import CT_Body + +from .unitutil.cxml import element + + +class DescribeBookmarks: + """Unit-test suite for `docx.bookmarks.Bookmarks` objects.""" + + def it_knows_how_many_bookmarks_it_contains(self): + body = cast(CT_Body, element("w:body")) + assert len(Bookmarks(body)) == 0 + + body = cast( + CT_Body, + element( + "w:body/w:p/(w:bookmarkStart{w:id=0,w:name=bm1},w:bookmarkEnd{w:id=0})" + ), + ) + assert len(Bookmarks(body)) == 1 + + body = cast( + CT_Body, + element( + "w:body/(w:p/(w:bookmarkStart{w:id=0,w:name=bm1},w:bookmarkEnd{w:id=0})" + ",w:p/(w:bookmarkStart{w:id=1,w:name=bm2},w:bookmarkEnd{w:id=1}))" + ), + ) + assert len(Bookmarks(body)) == 2 + + def 
it_is_iterable_over_bookmarks(self): + body = cast( + CT_Body, + element( + "w:body/(w:p/(w:bookmarkStart{w:id=0,w:name=bm1},w:bookmarkEnd{w:id=0})" + ",w:p/(w:bookmarkStart{w:id=1,w:name=bm2},w:bookmarkEnd{w:id=1}))" + ), + ) + bookmarks = Bookmarks(body) + + bm_iter = iter(bookmarks) + bm1 = next(bm_iter) + assert isinstance(bm1, Bookmark) + assert bm1.name == "bm1" + bm2 = next(bm_iter) + assert isinstance(bm2, Bookmark) + assert bm2.name == "bm2" + + def it_supports_containment_check_by_name(self): + body = cast( + CT_Body, + element( + "w:body/w:p/(w:bookmarkStart{w:id=0,w:name=bm1},w:bookmarkEnd{w:id=0})" + ), + ) + bookmarks = Bookmarks(body) + assert "bm1" in bookmarks + assert "nonexistent" not in bookmarks + + def it_can_get_a_bookmark_by_name(self): + body = cast( + CT_Body, + element( + "w:body/w:p/(w:bookmarkStart{w:id=0,w:name=bm1},w:bookmarkEnd{w:id=0})" + ), + ) + bookmarks = Bookmarks(body) + + bm = bookmarks.get("bm1") + assert bm is not None + assert bm.name == "bm1" + + assert bookmarks.get("nonexistent") is None + + +class DescribeBookmark: + """Unit-test suite for `docx.bookmarks.Bookmark`.""" + + def it_knows_its_name(self): + body = cast(CT_Body, element("w:body")) + bookmarkStart = cast( + CT_BookmarkStart, + element("w:bookmarkStart{w:id=5,w:name=test_bookmark}"), + ) + bm = Bookmark(bookmarkStart, body) + assert bm.name == "test_bookmark" + + def it_knows_its_bookmark_id(self): + body = cast(CT_Body, element("w:body")) + bookmarkStart = cast( + CT_BookmarkStart, + element("w:bookmarkStart{w:id=42,w:name=bm1}"), + ) + bm = Bookmark(bookmarkStart, body) + assert bm.bookmark_id == 42 + + def it_can_delete_itself(self): + body = cast( + CT_Body, + element( + "w:body/w:p/(w:bookmarkStart{w:id=0,w:name=bm1}" + ",w:r/w:t\"hello\"" + ",w:bookmarkEnd{w:id=0})" + ), + ) + bookmarks = Bookmarks(body) + assert len(bookmarks) == 1 + + bm = next(iter(bookmarks)) + bm.delete() + + assert len(bookmarks) == 0 + # -- bookmarkEnd is also removed -- + assert 
len(body.xpath(".//w:bookmarkEnd")) == 0 + + def it_can_delete_a_cross_paragraph_bookmark(self): + body = cast( + CT_Body, + element( + "w:body/(w:p/(w:bookmarkStart{w:id=0,w:name=bm1},w:r/w:t\"hello\")" + ",w:p/(w:r/w:t\"world\",w:bookmarkEnd{w:id=0}))" + ), + ) + bookmarks = Bookmarks(body) + assert len(bookmarks) == 1 + + bm = next(iter(bookmarks)) + bm.delete() + + assert len(bookmarks) == 0 + assert len(body.xpath(".//w:bookmarkEnd")) == 0 diff --git a/tests/test_captions.py b/tests/test_captions.py new file mode 100644 index 000000000..164aff534 --- /dev/null +++ b/tests/test_captions.py @@ -0,0 +1,163 @@ +"""Unit-test suite for caption-building helpers.""" + +from __future__ import annotations + +from docx import Document +from docx.captions import new_caption_paragraph +from docx.document import Document as DocumentCls +from docx.fields import Field +from docx.text.paragraph import Paragraph + + +class DescribeDocument_AddCaption: + """Unit-test suite for `Document.add_caption`.""" + + def it_appends_a_caption_paragraph_to_the_body(self): + document: DocumentCls = Document() + start_count = len(document.paragraphs) + + paragraph = document.add_caption("A diagram of the system") + + assert isinstance(paragraph, Paragraph) + assert len(document.paragraphs) == start_count + 1 + assert document.paragraphs[-1]._p is paragraph._p + + def it_applies_the_caption_style_by_default(self): + document: DocumentCls = Document() + + paragraph = document.add_caption("A diagram") + + assert paragraph.style is not None + assert paragraph.style.name == "Caption" + + def it_produces_the_expected_run_and_field_sequence(self): + document: DocumentCls = Document() + + paragraph = document.add_caption("A diagram of the system") + + # -- expected order: "Figure ", SEQ field(result "1"), ": ", "A diagram..." 
+ assert paragraph.text == "Figure 1: A diagram of the system" + + def it_builds_a_SEQ_field_targeting_the_Figure_label(self): + document: DocumentCls = Document() + + paragraph = document.add_caption("A diagram") + + assert len(paragraph.fields) == 1 + field = paragraph.fields[0] + assert isinstance(field, Field) + assert field.is_complex is False + assert field.type == "SEQ" + assert field.instruction.strip() == "SEQ Figure \\* ARABIC" + assert field.result_text == "1" + + def it_accepts_a_custom_label(self): + document: DocumentCls = Document() + + paragraph = document.add_caption("A pricing table", label="Table") + + assert paragraph.text == "Table 1: A pricing table" + field = paragraph.fields[0] + assert field.instruction.strip() == "SEQ Table \\* ARABIC" + + def it_accepts_a_custom_style(self): + document: DocumentCls = Document() + # -- reuse an existing paragraph style rather than authoring a new one -- + style_name = "Heading 1" + + paragraph = document.add_caption("Custom style", style=style_name) + + assert paragraph.style is not None + assert paragraph.style.name == style_name + + def it_round_trips_through_document_paragraphs(self): + document: DocumentCls = Document() + + paragraph = document.add_caption("Round trip") + + retrieved_texts = [p.text for p in document.paragraphs] + assert "Figure 1: Round trip" in retrieved_texts + + +class DescribeParagraph_AddCaptionBeforeAfter: + """Unit-test suite for `Paragraph.add_caption_before` / `_after`.""" + + def it_inserts_a_caption_after_the_target_paragraph(self): + document: DocumentCls = Document() + anchor = document.add_paragraph("anchor paragraph") + + caption = anchor.add_caption_after("A diagram") + + paragraphs = document.paragraphs + anchor_idx = paragraphs.index( + next(p for p in paragraphs if p._p is anchor._p) + ) + assert paragraphs[anchor_idx + 1]._p is caption._p + assert caption.text == "Figure 1: A diagram" + assert caption.style is not None + assert caption.style.name == "Caption" + + 
def it_inserts_a_caption_before_the_target_paragraph(self): + document: DocumentCls = Document() + anchor = document.add_paragraph("anchor paragraph") + + caption = anchor.add_caption_before("A diagram") + + paragraphs = document.paragraphs + anchor_idx = paragraphs.index( + next(p for p in paragraphs if p._p is anchor._p) + ) + assert paragraphs[anchor_idx - 1]._p is caption._p + assert caption.text == "Figure 1: A diagram" + + def it_honors_custom_label_and_style_on_add_caption_after(self): + document: DocumentCls = Document() + anchor = document.add_paragraph("anchor") + + caption = anchor.add_caption_after( + "Pricing", label="Table", style="Heading 1" + ) + + assert caption.text == "Table 1: Pricing" + field = caption.fields[0] + assert field.instruction.strip() == "SEQ Table \\* ARABIC" + assert caption.style is not None + assert caption.style.name == "Heading 1" + + def it_honors_custom_label_and_style_on_add_caption_before(self): + document: DocumentCls = Document() + anchor = document.add_paragraph("anchor") + + caption = anchor.add_caption_before( + "Pricing", label="Table", style="Heading 1" + ) + + assert caption.text == "Table 1: Pricing" + field = caption.fields[0] + assert field.instruction.strip() == "SEQ Table \\* ARABIC" + assert caption.style is not None + assert caption.style.name == "Heading 1" + + +class DescribeNewCaptionParagraph: + """Unit-test suite for the low-level `new_caption_paragraph` helper.""" + + def it_populates_an_empty_paragraph_with_the_standard_caption_shape(self): + document: DocumentCls = Document() + paragraph = document.add_paragraph() + + result = new_caption_paragraph(paragraph, "A diagram") + + assert result is paragraph + assert paragraph.text == "Figure 1: A diagram" + assert paragraph.style is not None + assert paragraph.style.name == "Caption" + + def it_returns_the_populated_paragraph(self): + document: DocumentCls = Document() + paragraph = document.add_paragraph() + + result = new_caption_paragraph(paragraph, 
"text", label="Table") + + assert result is paragraph + assert paragraph.fields[0].instruction.strip() == "SEQ Table \\* ARABIC" diff --git a/tests/test_chart.py b/tests/test_chart.py new file mode 100644 index 000000000..2bcbd740f --- /dev/null +++ b/tests/test_chart.py @@ -0,0 +1,143 @@ +# pyright: reportPrivateUsage=false + +"""Unit-test suite for `docx.chart` module.""" + +from __future__ import annotations + +from typing import cast + +import pytest + +from docx.chart import Chart, ChartSeries, WD_CHART_TYPE +from docx.opc.constants import CONTENT_TYPE as CT +from docx.opc.packuri import PackURI +from docx.oxml.chart import CT_ChartSpace, CT_Ser +from docx.package import Package +from docx.parts.chart import ChartPart + +from .unitutil.cxml import element + + +def _chart_part(cxml: str) -> ChartPart: + chartSpace = cast(CT_ChartSpace, element(cxml)) + package = Package() + return ChartPart(PackURI("/word/charts/chart1.xml"), CT.DML_CHART, chartSpace, package) + + +class DescribeWD_CHART_TYPE: + def it_is_an_enum_with_expected_members(self): + assert WD_CHART_TYPE.BAR.value == "bar" + assert WD_CHART_TYPE.BAR_STACKED.value == "barStacked" + assert WD_CHART_TYPE.COLUMN.value == "column" + assert WD_CHART_TYPE.COLUMN_STACKED.value == "columnStacked" + assert WD_CHART_TYPE.LINE.value == "line" + assert WD_CHART_TYPE.PIE.value == "pie" + assert WD_CHART_TYPE.DOUGHNUT.value == "doughnut" + assert WD_CHART_TYPE.SCATTER.value == "scatter" + assert WD_CHART_TYPE.AREA.value == "area" + + +class DescribeChart: + @pytest.mark.parametrize( + ("kind_cxml", "expected"), + [ + ("c:barChart/c:barDir{val=bar}", WD_CHART_TYPE.BAR), + ( + "c:barChart/(c:barDir{val=bar},c:grouping{val=stacked})", + WD_CHART_TYPE.BAR_STACKED, + ), + ("c:barChart/c:barDir{val=col}", WD_CHART_TYPE.COLUMN), + ( + "c:barChart/(c:barDir{val=col},c:grouping{val=stacked})", + WD_CHART_TYPE.COLUMN_STACKED, + ), + ("c:lineChart", WD_CHART_TYPE.LINE), + ("c:pieChart", WD_CHART_TYPE.PIE), + 
("c:doughnutChart", WD_CHART_TYPE.DOUGHNUT), + ("c:scatterChart", WD_CHART_TYPE.SCATTER), + ("c:areaChart", WD_CHART_TYPE.AREA), + ], + ) + def it_identifies_its_chart_type(self, kind_cxml: str, expected: WD_CHART_TYPE): + cxml = f"c:chartSpace/c:chart/c:plotArea/{kind_cxml}" + part = _chart_part(cxml) + chart = Chart(part) + assert chart.chart_type is expected + + def its_chart_type_is_None_when_no_kind_element(self): + part = _chart_part("c:chartSpace/c:chart/c:plotArea") + assert Chart(part).chart_type is None + + def it_reads_its_title(self): + cxml = ( + 'c:chartSpace/c:chart/(c:title/c:tx/c:rich/a:p/a:r/a:t"Sales",' + "c:plotArea/c:barChart/c:barDir{val=bar})" + ) + part = _chart_part(cxml) + assert Chart(part).title == "Sales" + + def its_title_is_None_when_absent(self): + cxml = "c:chartSpace/c:chart/c:plotArea/c:barChart/c:barDir{val=bar}" + part = _chart_part(cxml) + assert Chart(part).title is None + + @pytest.mark.parametrize( + ("cxml", "expected"), + [ + ("c:chartSpace/c:chart/c:plotArea", False), + ("c:chartSpace/c:chart/(c:plotArea,c:legend)", True), + ], + ) + def it_knows_whether_it_has_a_legend(self, cxml: str, expected: bool): + part = _chart_part(cxml) + assert Chart(part).has_legend is expected + + def it_provides_access_to_its_series(self): + cxml = ( + "c:chartSpace/c:chart/c:plotArea/c:barChart" + '/(c:barDir{val=bar},c:ser/c:tx/c:v"S1",c:ser/c:tx/c:v"S2")' + ) + part = _chart_part(cxml) + chart = Chart(part) + names = [s.name for s in chart.series] + assert names == ["S1", "S2"] + + def it_returns_empty_series_when_no_plotArea(self): + part = _chart_part("c:chartSpace/c:chart") + assert Chart(part).series == [] + + def it_returns_categories_from_the_first_series(self): + cxml = ( + "c:chartSpace/c:chart/c:plotArea/c:barChart/c:ser/" + "c:cat/c:strRef/c:strCache/" + '(c:pt{idx=0}/c:v"A",c:pt{idx=1}/c:v"B")' + ) + part = _chart_part(cxml) + assert Chart(part).categories == ["A", "B"] + + def it_returns_empty_categories_when_no_series(self): 
+ cxml = "c:chartSpace/c:chart/c:plotArea/c:barChart" + part = _chart_part(cxml) + assert Chart(part).categories == [] + + +class DescribeChartSeries: + def it_exposes_name_values_and_categories(self): + cxml = ( + "c:ser/" + '(c:tx/c:v"Rev",' + "c:cat/c:strRef/c:strCache/" + '(c:pt{idx=0}/c:v"Q1",c:pt{idx=1}/c:v"Q2"),' + "c:val/c:numRef/c:numCache/" + '(c:pt{idx=0}/c:v"10",c:pt{idx=1}/c:v"20"))' + ) + ser = cast(CT_Ser, element(cxml)) + series = ChartSeries(ser) + + assert series.name == "Rev" + assert series.categories == ["Q1", "Q2"] + assert series.values == [10.0, 20.0] + + def its_name_is_empty_string_when_not_set(self): + ser = cast(CT_Ser, element("c:ser")) + assert ChartSeries(ser).name == "" diff --git a/tests/test_chart_replace_data.py b/tests/test_chart_replace_data.py new file mode 100644 index 000000000..babf41125 --- /dev/null +++ b/tests/test_chart_replace_data.py @@ -0,0 +1,176 @@ +# pyright: reportPrivateUsage=false + +"""Unit-test suite for `Chart.replace_data`.""" + +from __future__ import annotations + +from typing import cast + +import pytest + +from docx.chart import Chart +from docx.opc.constants import CONTENT_TYPE as CT +from docx.opc.packuri import PackURI +from docx.oxml.chart import CT_ChartSpace +from docx.oxml.ns import qn +from docx.package import Package +from docx.parts.chart import ChartPart, _rewrite_ser + +from .unitutil.cxml import element + + +def _make_chart(cxml: str) -> Chart: + chartSpace = cast(CT_ChartSpace, element(cxml)) + package = Package() + part = ChartPart( + PackURI("/word/charts/chart1.xml"), CT.DML_CHART, chartSpace, package + ) + return Chart(part) + + +class DescribeChart_replace_data: + """Unit-test suite for `docx.chart.Chart.replace_data`.""" + + def it_rewrites_categories_and_values_on_a_single_series(self): + cxml = ( + "c:chartSpace/c:chart/c:plotArea/c:barChart/" + "(c:barDir{val=col},c:ser/(" + "c:idx{val=0},c:order{val=0}," + 'c:tx/c:v"Old",' + 
'c:cat/c:strRef/c:strCache/(c:ptCount{val=1},c:pt{idx=0}/c:v"a"),' + "c:val/c:numRef/c:numCache/(c:ptCount{val=1},c:pt{idx=0}/c:v\"1\")" + "))" + ) + chart = _make_chart(cxml) + + chart.replace_data(["Q1", "Q2", "Q3"], {"Revenue": [10.0, 20.0, 30.0]}) + + assert chart.categories == ["Q1", "Q2", "Q3"] + assert chart.series[0].name == "Revenue" + assert chart.series[0].values == [10.0, 20.0, 30.0] + + def it_preserves_non_data_styling_children_on_series(self): + # -- include a c:spPr (styling) child that replace_data must preserve -- + cxml = ( + "c:chartSpace/c:chart/c:plotArea/c:barChart/" + "(c:barDir{val=col},c:ser/(" + "c:idx{val=0},c:order{val=0}," + 'c:tx/c:v"Old",' + "c:spPr," + 'c:cat/c:strRef/c:strCache/(c:ptCount{val=1},c:pt{idx=0}/c:v"a"),' + "c:val/c:numRef/c:numCache/(c:ptCount{val=1},c:pt{idx=0}/c:v\"1\")" + "))" + ) + chart = _make_chart(cxml) + chart.replace_data(["A"], {"New": [5.0]}) + ser = chart.part.chartSpace.xpath(".//c:ser")[0] + assert ser.find(qn("c:spPr")) is not None + + def it_clones_the_last_series_when_adding_more(self): + cxml = ( + "c:chartSpace/c:chart/c:plotArea/c:barChart/" + "(c:barDir{val=col},c:ser/(" + "c:idx{val=0},c:order{val=0}," + 'c:tx/c:v"S0",' + "c:spPr," + 'c:cat/c:strRef/c:strCache/(c:ptCount{val=1},c:pt{idx=0}/c:v"a"),' + "c:val/c:numRef/c:numCache/(c:ptCount{val=1},c:pt{idx=0}/c:v\"1\")" + "))" + ) + chart = _make_chart(cxml) + + chart.replace_data( + ["Q1", "Q2"], + {"A": [1.0, 2.0], "B": [3.0, 4.0]}, + ) + + assert [s.name for s in chart.series] == ["A", "B"] + assert [s.values for s in chart.series] == [[1.0, 2.0], [3.0, 4.0]] + # -- cloned series keeps c:spPr from the template -- + for ser in chart.part.chartSpace.xpath(".//c:ser"): + assert ser.find(qn("c:spPr")) is not None + + def it_removes_excess_series_when_shrinking(self): + cxml = ( + "c:chartSpace/c:chart/c:plotArea/c:barChart/" + "(c:barDir{val=col}," + "c:ser/(" + "c:idx{val=0},c:order{val=0}," + 'c:tx/c:v"S0",' + 
'c:cat/c:strRef/c:strCache/(c:ptCount{val=1},c:pt{idx=0}/c:v"a"),' + "c:val/c:numRef/c:numCache/(c:ptCount{val=1},c:pt{idx=0}/c:v\"1\"))," + "c:ser/(" + "c:idx{val=1},c:order{val=1}," + 'c:tx/c:v"S1",' + 'c:cat/c:strRef/c:strCache/(c:ptCount{val=1},c:pt{idx=0}/c:v"a"),' + "c:val/c:numRef/c:numCache/(c:ptCount{val=1},c:pt{idx=0}/c:v\"2\"))" + ")" + ) + chart = _make_chart(cxml) + + chart.replace_data(["X"], {"OnlyOne": [99.0]}) + + assert len(chart.series) == 1 + assert chart.series[0].name == "OnlyOne" + assert chart.series[0].values == [99.0] + + def it_raises_when_series_length_mismatches_categories(self): + cxml = ( + "c:chartSpace/c:chart/c:plotArea/c:barChart/" + "(c:barDir{val=col},c:ser/(" + "c:idx{val=0},c:order{val=0}," + 'c:tx/c:v"S0",' + 'c:cat/c:strRef/c:strCache/(c:ptCount{val=1},c:pt{idx=0}/c:v"a"),' + "c:val/c:numRef/c:numCache/(c:ptCount{val=1},c:pt{idx=0}/c:v\"1\")" + "))" + ) + chart = _make_chart(cxml) + with pytest.raises(ValueError, match="3 categories"): + chart.replace_data(["a", "b", "c"], {"X": [1.0, 2.0]}) + + def it_raises_when_no_existing_series(self): + cxml = ( + "c:chartSpace/c:chart/c:plotArea/c:barChart/c:barDir{val=col}" + ) + chart = _make_chart(cxml) + with pytest.raises(ValueError, match="at least one existing c:ser"): + chart.replace_data(["a"], {"X": [1.0]}) + + def it_preserves_chart_type_across_replacement(self): + cxml = ( + "c:chartSpace/c:chart/c:plotArea/c:lineChart/" + "c:ser/(" + "c:idx{val=0},c:order{val=0}," + 'c:tx/c:v"S0",' + 'c:cat/c:strRef/c:strCache/(c:ptCount{val=1},c:pt{idx=0}/c:v"a"),' + "c:val/c:numRef/c:numCache/(c:ptCount{val=1},c:pt{idx=0}/c:v\"1\")" + ")" + ) + chart = _make_chart(cxml) + before = chart.chart_type + chart.replace_data(["A", "B"], {"New": [1.0, 2.0]}) + assert chart.chart_type is before + + +class DescribeRewriteSer: + """Direct tests for the `_rewrite_ser` helper.""" + + def it_preserves_non_data_siblings_in_order(self): + cxml = ( + "c:ser/(" + "c:idx{val=0},c:order{val=0}," + 
'c:tx/c:v"Old",' + "c:spPr," + "c:smooth" + ")" + ) + ser = element(cxml) + _rewrite_ser(ser, idx=3, name="New", categories=["a"], values=[42.0]) + # -- spPr and smooth remain after the rewritten data elements -- + tags = [child.tag for child in list(ser)] + assert qn("c:idx") in tags + assert qn("c:order") in tags + assert qn("c:spPr") in tags + assert qn("c:smooth") in tags + # -- idx is updated -- + assert ser.find(qn("c:idx")).get("val") == "3" diff --git a/tests/test_comments.py b/tests/test_comments.py index 0f292ec8a..2152d80d2 100644 --- a/tests/test_comments.py +++ b/tests/test_comments.py @@ -161,6 +161,25 @@ def and_it_sets_the_author_and_their_initials_when_adding_a_comment_when_provide assert comment.author == "Steve Canny" assert comment.initials == "SJC" + def and_it_accepts_an_explicit_tz_aware_date( + self, comments: Comments, package_: Mock + ): + when = dt.datetime(2024, 1, 15, 10, 0, 0, tzinfo=dt.timezone.utc) + + comment = comments.add_comment(author="A", date=when) + + assert comment.timestamp == when + + def and_it_accepts_an_explicit_date_on_add_reply( + self, comments: Comments, package_: Mock + ): + parent = comments.add_comment(author="A") + when = dt.datetime(2024, 2, 1, 9, 0, 0, tzinfo=dt.timezone.utc) + + reply = parent.add_reply(author="B", date=when) + + assert reply.timestamp == when + # -- fixtures -------------------------------------------------------------------------------- @pytest.fixture @@ -268,8 +287,102 @@ def it_can_update_the_comment_initials(self, initials: str | None, comments_part assert comment.initials == initials + def it_can_add_a_reply_to_a_comment(self, package_: Mock): + comments_elm = cast(CT_Comments, element("w:comments")) + comments_part = CommentsPart( + PackURI("/word/comments.xml"), + CT.WML_COMMENTS, + comments_elm, + package_, + ) + comments = Comments(comments_elm, comments_part) + parent = comments.add_comment(text="Parent comment", author="Author A", initials="AA") + + reply = 
parent.add_reply(text="Reply text", author="Author B", initials="BB") + + assert isinstance(reply, Comment) + assert reply.text == "Reply text" + assert reply.author == "Author B" + assert reply.initials == "BB" + assert reply.timestamp is not None + assert reply.comment_id != parent.comment_id + # -- verify the reply is linked to the parent via paraIdParent -- + assert reply._comment_elm.paraIdParent == parent._comment_elm.paraId + + def it_can_list_replies_to_a_comment(self, package_: Mock): + comments_elm = cast(CT_Comments, element("w:comments")) + comments_part = CommentsPart( + PackURI("/word/comments.xml"), + CT.WML_COMMENTS, + comments_elm, + package_, + ) + comments = Comments(comments_elm, comments_part) + parent = comments.add_comment(text="Parent", author="A") + parent.add_reply(text="Reply 1", author="B") + parent.add_reply(text="Reply 2", author="C") + # -- add an unrelated comment -- + comments.add_comment(text="Other comment", author="D") + + replies = parent.replies + + assert len(replies) == 2 + assert replies[0].text == "Reply 1" + assert replies[0].author == "B" + assert replies[1].text == "Reply 2" + assert replies[1].author == "C" + + def and_it_returns_empty_list_when_no_replies(self, package_: Mock): + comments_elm = cast(CT_Comments, element("w:comments")) + comments_part = CommentsPart( + PackURI("/word/comments.xml"), + CT.WML_COMMENTS, + comments_elm, + package_, + ) + comments = Comments(comments_elm, comments_part) + parent = comments.add_comment(text="Parent", author="A") + + assert parent.replies == [] + + def it_can_add_a_reply_with_no_text(self, package_: Mock): + comments_elm = cast(CT_Comments, element("w:comments")) + comments_part = CommentsPart( + PackURI("/word/comments.xml"), + CT.WML_COMMENTS, + comments_elm, + package_, + ) + comments = Comments(comments_elm, comments_part) + parent = comments.add_comment(author="A") + + reply = parent.add_reply(author="B") + + assert isinstance(reply, Comment) + assert [p.text for p in 
reply.paragraphs] == [""] + + def it_can_add_a_multiline_reply(self, package_: Mock): + comments_elm = cast(CT_Comments, element("w:comments")) + comments_part = CommentsPart( + PackURI("/word/comments.xml"), + CT.WML_COMMENTS, + comments_elm, + package_, + ) + comments = Comments(comments_elm, comments_part) + parent = comments.add_comment(author="A") + + reply = parent.add_reply(text="line 1\nline 2", author="B") + + assert len(reply.paragraphs) == 2 + assert [p.text for p in reply.paragraphs] == ["line 1", "line 2"] + # -- fixtures -------------------------------------------------------------------------------- @pytest.fixture def comments_part_(self, request: FixtureRequest): return instance_mock(request, CommentsPart) + + @pytest.fixture + def package_(self, request: FixtureRequest): + return instance_mock(request, Package) diff --git a/tests/test_conformance_corpus.py b/tests/test_conformance_corpus.py new file mode 100644 index 000000000..c1a2d93e8 --- /dev/null +++ b/tests/test_conformance_corpus.py @@ -0,0 +1,178 @@ +"""Feature-manifest conformance tests against the corpus. + +Iterates every ``docx/*.json`` manifest in +``loadfix/ooxml-reference-corpus`` (expected as a sibling checkout at +``../ooxml-reference-corpus/``). For each manifest the suite: + +1. Expands it into one or more concrete test cases (``kind: literal`` + is a single case; ``kind: parameterised`` is the Cartesian product + of its `parameters` axes). +2. Invokes the committed generator to produce the case's fixture. +3. Asserts ``ooxml_validate.conformance.run_feature`` returns `pass`. + +These tests are the live guard against drift between python-docx and +the shared feature definitions. They auto-skip when either sibling +repo is absent. 
+""" + +from __future__ import annotations + +import importlib.util +import os +import runpy +import shlex +import subprocess +import sys +from pathlib import Path + +import pytest + +_REPO_ROOT = Path(__file__).resolve().parent.parent +_CORPUS_ROOT = _REPO_ROOT.parent / "ooxml-reference-corpus" +_FEATURES_DIR = _CORPUS_ROOT / "features" / "docx" +_SCRIPTS_DIR = _CORPUS_ROOT / "scripts" +_FIXTURES_DIR = _CORPUS_ROOT / "fixtures" / "docx" + + +def _ooxml_validate_available() -> bool: + return importlib.util.find_spec("ooxml_validate") is not None + + +def _collect_cases() -> list[tuple[str, str, int]]: + """Return a list of (manifest_id, case_id, case_index) tuples. + + For literal manifests the case_id equals the manifest_id. + Parameterised manifests expand into one tuple per concrete case. + ``case_index`` is the index into the expanded list so the test + body can re-expand without mismatch. + """ + if not _FEATURES_DIR.is_dir() or not _ooxml_validate_available(): + return [] + from ooxml_validate import expand_manifest, load_manifest + + cases: list[tuple[str, str, int]] = [] + for manifest_path in sorted(_FEATURES_DIR.glob("*.json")): + manifest = load_manifest(manifest_path) + for idx, case in enumerate(expand_manifest(manifest)): + cases.append((manifest_path.stem, case["id"], idx)) + return cases + + +pytestmark = [ + pytest.mark.skipif( + not _CORPUS_ROOT.is_dir(), + reason=f"Corpus sibling checkout not found at {_CORPUS_ROOT}", + ), + pytest.mark.skipif( + not _ooxml_validate_available(), + reason="ooxml-validate not installed in this env", + ), +] + + +_COLLECTED_CASES = _collect_cases() + + +class DescribeCorpusConformance: + @pytest.mark.parametrize( + ("manifest_stem", "case_id", "case_index"), + _COLLECTED_CASES or [("", "", 0)], + ids=[case_id for _, case_id, _ in _COLLECTED_CASES] or [""], + ) + def it_passes_every_docx_case( + self, + manifest_stem: str, + case_id: str, + case_index: int, + tmp_path: Path, + ): + if manifest_stem == "": + 
pytest.skip("No manifests found in corpus") + + from ooxml_validate import expand_manifest, load_manifest, run_feature + from ooxml_validate.conformance import _substitute + + manifest_path = _FEATURES_DIR / f"{manifest_stem}.json" + manifest = load_manifest(manifest_path) + case = expand_manifest(manifest)[case_index] + assert case["id"] == case_id, "case index drifted since collection" + + gen_script = _SCRIPTS_DIR / manifest["generator"]["python"].split("/")[-1] + if not gen_script.is_file(): + pytest.skip(f"Generator script missing: {gen_script}") + + if manifest.get("kind") == "parameterised": + _invoke_parameterised_generator(manifest, case, gen_script) + else: + _invoke_literal_generator(gen_script) + + fmt, logical_name = case["fixtures"]["machine"].split("/", 1) + fixture = _CORPUS_ROOT / "fixtures" / fmt / f"{logical_name}.docx" + assert fixture.is_file(), f"Generator did not produce {fixture}" + + result = run_feature( + case, + library="python-docx", + fixture_path=fixture, + tool_version=_current_version(), + ) + assert result.status == "pass", [a.to_dict() for a in result.assertions] + + +def _invoke_literal_generator(gen_script: Path) -> None: + """Run the generator as ``python