From a901aada60cf86d7e93549cd600b04157b955a21 Mon Sep 17 00:00:00 2001 From: Shengyu Zhang Date: Wed, 29 Oct 2025 16:37:45 +0800 Subject: [PATCH 1/4] Use ``urllib.request.DataHandler`` in ``parse_data_uri`` The previous code that manually parsed data URIs had a bug: an exception is raised when parsing non-base64 encoded data URIs. Using urllib completely avoids these problems and simplifies the project. --- sphinx/util/images.py | 29 ++++++++++------------------- tests/test_util/test_util_images.py | 13 +++++++++++++ 2 files changed, 23 insertions(+), 19 deletions(-) diff --git a/sphinx/util/images.py b/sphinx/util/images.py index ca6cb66764b..ed10517512f 100644 --- a/sphinx/util/images.py +++ b/sphinx/util/images.py @@ -5,6 +5,7 @@ import base64 from pathlib import Path from typing import TYPE_CHECKING, NamedTuple, overload +from urllib.request import DataHandler, build_opener import imagesize @@ -90,26 +91,16 @@ def get_image_extension(mimetype: str) -> str | None: def parse_data_uri(uri: str) -> DataURI | None: if not uri.startswith('data:'): return None - uri = uri[5:] - - if ',' not in uri: + try: + response = build_opener(DataHandler).open(uri) + except ValueError as e: msg = 'malformed data URI' - raise ValueError(msg) - - # data:[<MIME-type>][;charset=<encoding>][;base64],<data> - mimetype = 'text/plain' - charset = 'US-ASCII' - - properties, _, data = uri.partition(',') - for prop in properties.split(';'): - if prop == 'base64': - pass # skip - elif prop.startswith('charset='): - charset = prop[8:] - elif prop: - mimetype = prop - - image_data = base64.b64decode(data) + raise ValueError(msg) from e + info = response.info() + mimetype = info.get_content_type() or 'text/plain' + charset = info.get_content_charset() or 'US-ASCII' + image_data = response.read() + return DataURI(mimetype, charset, image_data) diff --git a/tests/test_util/test_util_images.py b/tests/test_util/test_util_images.py index d0b4f918afc..ce3a98a029d 100644 --- a/tests/test_util/test_util_images.py +++ 
b/tests/test_util/test_util_images.py @@ -86,3 +86,16 @@ def test_parse_data_uri() -> None: ) with pytest.raises(ValueError, match=r'malformed data URI'): parse_data_uri(uri) + + # not base64 + uri = ( + 'data:image/svg+xml,%3Csvg%20viewBox%3D%220%200%20100%20100%22%20xmlns' + '%3D%22http%3A//www.w3.org/2000/svg%22%3E%3Ccircle%20cx%3D%2250%22%20cy' + '%3D%2250%22%20r%3D%2250%22%20fill%3D%22%23eff2f5%22/%3E%3Ccircle%20cx' + '%3D%2250%22%20cy%3D%2250%22%20r%3D%2250%22%20fill%3D%22%23116329%22/' + '%3E%3Ccircle%20cx%3D%2250%22%20cy%3D%2250%22%20r%3D%2225.0%22%20fill' + '%3D%22%23ffffff%22/%3E%3C/svg%3E' + ) + image = parse_data_uri(uri) + assert image is not None + assert image.mimetype == 'image/svg+xml' From 5f58f8b9dad37b35a14c6dee5712ab25ffe49bf5 Mon Sep 17 00:00:00 2001 From: Shengyu Zhang Date: Wed, 29 Oct 2025 16:51:09 +0800 Subject: [PATCH 2/4] Remove unused base64 import --- sphinx/util/images.py | 1 - 1 file changed, 1 deletion(-) diff --git a/sphinx/util/images.py b/sphinx/util/images.py index ed10517512f..eca5bafa3e0 100644 --- a/sphinx/util/images.py +++ b/sphinx/util/images.py @@ -2,7 +2,6 @@ from __future__ import annotations -import base64 from pathlib import Path from typing import TYPE_CHECKING, NamedTuple, overload from urllib.request import DataHandler, build_opener From 0ee5f37f0591928d66839f333b740f2723042dfb Mon Sep 17 00:00:00 2001 From: Shengyu Zhang Date: Wed, 29 Oct 2025 16:53:16 +0800 Subject: [PATCH 3/4] Update CHANGES.rst --- CHANGES.rst | 3 +++ 1 file changed, 3 insertions(+) diff --git a/CHANGES.rst b/CHANGES.rst index 3a90d66f08d..173147b218d 100644 --- a/CHANGES.rst +++ b/CHANGES.rst @@ -134,6 +134,9 @@ Bugs fixed Patch by Jeremy Maitin-Shepard. * #13939: LaTeX: page break can separate admonition title from contents. Patch by Jean-François B. +* #14006: Use ``urllib.request.DataHandler`` in ``parse_data_uri`` to fix + parsing of non-base64 data URIs. + Patch by Shengyu Zhang. 
Testing From ef8d6260c87f5e67807eeedbfcd220e669fbe55b Mon Sep 17 00:00:00 2001 From: Shengyu Zhang Date: Wed, 29 Oct 2025 22:36:58 +0800 Subject: [PATCH 4/4] Simplify SVG test data --- tests/test_util/test_util_images.py | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/tests/test_util/test_util_images.py b/tests/test_util/test_util_images.py index ce3a98a029d..ecec591b87b 100644 --- a/tests/test_util/test_util_images.py +++ b/tests/test_util/test_util_images.py @@ -89,12 +89,10 @@ def test_parse_data_uri() -> None: # not base64 uri = ( - 'data:image/svg+xml,%3Csvg%20viewBox%3D%220%200%20100%20100%22%20xmlns' - '%3D%22http%3A//www.w3.org/2000/svg%22%3E%3Ccircle%20cx%3D%2250%22%20cy' - '%3D%2250%22%20r%3D%2250%22%20fill%3D%22%23eff2f5%22/%3E%3Ccircle%20cx' - '%3D%2250%22%20cy%3D%2250%22%20r%3D%2250%22%20fill%3D%22%23116329%22/' - '%3E%3Ccircle%20cx%3D%2250%22%20cy%3D%2250%22%20r%3D%2225.0%22%20fill' - '%3D%22%23ffffff%22/%3E%3C/svg%3E' + 'data:image/svg+xml,%3Csvg%20width%3D%22100%22%20height%3D%22100%22%20' + 'xmlns%3D%22http%3A%2F%2Fwww.w3.org%2F2000%2Fsvg%22%3E%3Ccircle%20cx' + '%3D%2250%22%20cy%3D%2250%22%20r%3D%2240%22%20fill%3D%22blue%22%2F%3E' + '%3C%2Fsvg%3E' ) image = parse_data_uri(uri) assert image is not None