From 6772f2d11677a09ab3ed65fecde164c8d0e56bb7 Mon Sep 17 00:00:00 2001 From: Serhiy Storchaka Date: Sat, 17 May 2025 19:46:29 +0300 Subject: [PATCH 1/3] gh-133890: Handle UnicodeEncodeError in tarfile UnicodeEncodeError is now handled the same way as OSError during TarFile member extraction. --- Lib/tarfile.py | 4 +- Lib/test/test_posixpath.py | 6 +++ Lib/test/test_tarfile.py | 45 +++++++++++++++++-- ...-05-17-18-08-35.gh-issue-133890.onn9_X.rst | 2 + 4 files changed, 51 insertions(+), 6 deletions(-) create mode 100644 Misc/NEWS.d/next/Library/2025-05-17-18-08-35.gh-issue-133890.onn9_X.rst diff --git a/Lib/tarfile.py b/Lib/tarfile.py index 13889d768021b1..212b71f6509740 100644 --- a/Lib/tarfile.py +++ b/Lib/tarfile.py @@ -2439,7 +2439,7 @@ def _get_extract_tarinfo(self, member, filter_function, path): unfiltered = tarinfo try: tarinfo = filter_function(tarinfo, path) - except (OSError, FilterError) as e: + except (OSError, UnicodeEncodeError, FilterError) as e: self._handle_fatal_error(e) except ExtractError as e: self._handle_nonfatal_error(e) @@ -2460,7 +2460,7 @@ def _extract_one(self, tarinfo, path, set_attrs, numeric_owner): self._extract_member(tarinfo, os.path.join(path, tarinfo.name), set_attrs=set_attrs, numeric_owner=numeric_owner) - except OSError as e: + except (OSError, UnicodeEncodeError) as e: self._handle_fatal_error(e) except ExtractError as e: self._handle_nonfatal_error(e) diff --git a/Lib/test/test_posixpath.py b/Lib/test/test_posixpath.py index fa19d549c26cfa..6294bc244d1bcd 100644 --- a/Lib/test/test_posixpath.py +++ b/Lib/test/test_posixpath.py @@ -489,6 +489,12 @@ def test_realpath_strict(self): finally: os_helper.unlink(ABSTFN) + def test_realpath_unencodable(self): + self.assertRaises(ValueError, realpath, 'test\0', strict=False) + self.assertRaises(ValueError, realpath, 'test\0', strict=True) + self.assertRaises(UnicodeEncodeError, realpath, '\ud800', strict=False) + self.assertRaises(UnicodeEncodeError, realpath, '\ud800', strict=True) + @os_helper.skip_unless_symlink @skip_if_ABSTFN_contains_backslash def test_realpath_relative(self): diff --git a/Lib/test/test_tarfile.py b/Lib/test/test_tarfile.py index 2d9649237a9382..2e9ece6c032f57 100644 --- a/Lib/test/test_tarfile.py +++ b/Lib/test/test_tarfile.py @@ -3490,11 +3490,12 @@ class ArchiveMaker: with t.open() as tar: ... # `tar` is now a TarFile with 'filename' in it! """ - def __init__(self): + def __init__(self, **kwargs): self.bio = io.BytesIO() + self.tar_kwargs = dict(kwargs) def __enter__(self): - self.tar_w = tarfile.TarFile(mode='w', fileobj=self.bio) + self.tar_w = tarfile.TarFile(mode='w', fileobj=self.bio, **self.tar_kwargs) return self def __exit__(self, *exc): @@ -4073,7 +4074,10 @@ def test_tar_filter(self): # that in the test archive.) with tarfile.TarFile.open(tarname) as tar: for tarinfo in tar.getmembers(): - filtered = tarfile.tar_filter(tarinfo, '') + try: + filtered = tarfile.tar_filter(tarinfo, '') + except UnicodeEncodeError: + continue self.assertIs(filtered.name, tarinfo.name) self.assertIs(filtered.type, tarinfo.type) @@ -4084,11 +4088,44 @@ def test_data_filter(self): for tarinfo in tar.getmembers(): try: filtered = tarfile.data_filter(tarinfo, '') - except tarfile.FilterError: + except (tarfile.FilterError, UnicodeEncodeError): continue self.assertIs(filtered.name, tarinfo.name) self.assertIs(filtered.type, tarinfo.type) + def test_filter_unencodable(self): + # Sanity check using a valid path. + tarinfo = tarfile.TarInfo(os_helper.TESTFN) + filtered = tarfile.tar_filter(tarinfo, '') + self.assertIs(filtered.name, tarinfo.name) + filtered = tarfile.data_filter(tarinfo, '') + self.assertIs(filtered.name, tarinfo.name) + + tarinfo = tarfile.TarInfo('test\0') + self.assertRaises(ValueError, tarfile.tar_filter, tarinfo, '') + self.assertRaises(ValueError, tarfile.data_filter, tarinfo, '') + tarinfo = tarfile.TarInfo('\ud800') + self.assertRaises(UnicodeEncodeError, tarfile.tar_filter, tarinfo, '') + self.assertRaises(UnicodeEncodeError, tarfile.data_filter, tarinfo, '') + + def test_extract_encode_error(self): + with ArchiveMaker(encoding='ascii', errors='surrogateescape') as arc: + arc.add('\udced\udca0\udc80') + with os_helper.temp_cwd() as tmp: + tar = arc.open(encoding='utf-8', errors='surrogatepass', + errorlevel=1) + self.assertEqual(tar.getnames(), ['\ud800']) + with self.assertRaises(UnicodeEncodeError): + tar.extractall() + self.assertEqual(os.listdir(), []) + + tar = arc.open(encoding='utf-8', errors='surrogatepass', + errorlevel=0, debug=1) + with support.captured_stderr() as stderr: + tar.extractall() + self.assertEqual(os.listdir(), []) + self.assertIn('tarfile: UnicodeEncodeError ', stderr.getvalue()) + def test_change_default_filter_on_instance(self): tar = tarfile.TarFile(tarname, 'r') def strict_filter(tarinfo, path): diff --git a/Misc/NEWS.d/next/Library/2025-05-17-18-08-35.gh-issue-133890.onn9_X.rst b/Misc/NEWS.d/next/Library/2025-05-17-18-08-35.gh-issue-133890.onn9_X.rst new file mode 100644 index 00000000000000..44565a5424e65b --- /dev/null +++ b/Misc/NEWS.d/next/Library/2025-05-17-18-08-35.gh-issue-133890.onn9_X.rst @@ -0,0 +1,2 @@ +The :mod:`tarfile` module now handles :exc:`UnicodeEncodeError` in the same +way as :exc:`OSError` when cannot extract a member. From de4fe01086396892691819e2fc5b3f5c373dc59b Mon Sep 17 00:00:00 2001 From: Serhiy Storchaka Date: Sun, 18 May 2025 16:03:00 +0300 Subject: [PATCH 2/3] Fix tests on Windows. --- Lib/test/test_ntpath.py | 28 ++++++++++++++++++++++++++++ Lib/test/test_posixpath.py | 34 ++++++++++++++++++++++++++++++---- Lib/test/test_tarfile.py | 8 ++++++-- 3 files changed, 64 insertions(+), 6 deletions(-) diff --git a/Lib/test/test_ntpath.py b/Lib/test/test_ntpath.py index c10387b58e3f9c..7cb638c5faecf6 100644 --- a/Lib/test/test_ntpath.py +++ b/Lib/test/test_ntpath.py @@ -437,6 +437,34 @@ def test_realpath_strict(self): # gh-106242: Embedded nulls should raise OSError (not ValueError) self.assertRaises(OSError, ntpath.realpath, ABSTFN + "\0spam", strict=True) + @unittest.skipUnless(HAVE_GETFINALPATHNAME, 'need _getfinalpathname') + def test_realpath_embedded_null(self): + realpath = ntpath.realpath + ABSTFN = ntpath.abspath(os_helper.TESTFN) + path = ABSTFN + '\x00' + self.assertEqual(realpath(path, strict=False), path) + self.assertRaises(OSError, realpath, path, strict=True) + path = os.fsencode(ABSTFN) + b'\x00' + self.assertEqual(realpath(path, strict=False), path) + self.assertRaises(OSError, realpath, path, strict=True) + path = ABSTFN + '\\nonexistent\\x\x00' + self.assertEqual(realpath(path, strict=False), path) + self.assertRaises(OSError, realpath, path, strict=True) + path = os.fsencode(ABSTFN) + b'\\nonexistent\\x\x00' + self.assertEqual(realpath(path, strict=False), path) + self.assertRaises(OSError, realpath, path, strict=True) + + @unittest.skipUnless(HAVE_GETFINALPATHNAME, 'need _getfinalpathname') + def test_realpath_undecodable(self): + realpath = ntpath.realpath + ABSTFN = ntpath.abspath(os_helper.TESTFN) + path = os.fsencode(ABSTFN) + b'\xff' + self.assertRaises(UnicodeDecodeError, realpath, path, strict=False) + self.assertRaises(UnicodeDecodeError, realpath, path, strict=True) + path = os.fsencode(ABSTFN) + b'\\nonexistent\\\xff' + self.assertRaises(UnicodeDecodeError, realpath, path, strict=False) + self.assertRaises(UnicodeDecodeError, realpath, path, strict=True) + @os_helper.skip_unless_symlink @unittest.skipUnless(HAVE_GETFINALPATHNAME, 'need _getfinalpathname') def test_realpath_relative(self): diff --git a/Lib/test/test_posixpath.py b/Lib/test/test_posixpath.py index 6294bc244d1bcd..eec6b09a6408c2 100644 --- a/Lib/test/test_posixpath.py +++ b/Lib/test/test_posixpath.py @@ -489,11 +489,37 @@ def test_realpath_strict(self): finally: os_helper.unlink(ABSTFN) + def test_realpath_embedded_null(self): + path = '/\x00' + self.assertRaises(ValueError, realpath, path, strict=False) + self.assertRaises(ValueError, realpath, path, strict=True) + path = b'/\x00' + self.assertRaises(ValueError, realpath, path, strict=False) + self.assertRaises(ValueError, realpath, path, strict=True) + path = '/nonexistent/x\x00' + self.assertRaises(ValueError, realpath, path, strict=False) + self.assertRaises(FileNotFoundError, realpath, path, strict=True) + path = b'/nonexistent/x\x00' + self.assertRaises(ValueError, realpath, path, strict=False) + self.assertRaises(FileNotFoundError, realpath, path, strict=True) + + @unittest.skipIf(sys.platform == 'win32', 'requires native bytes paths') def test_realpath_unencodable(self): - self.assertRaises(ValueError, realpath, 'test\0', strict=False) - self.assertRaises(ValueError, realpath, 'test\0', strict=True) - self.assertRaises(UnicodeEncodeError, realpath, '\ud800', strict=False) - self.assertRaises(UnicodeEncodeError, realpath, '\ud800', strict=True) + path = '/\ud800' + self.assertRaises(UnicodeEncodeError, realpath, path, strict=False) + self.assertRaises(UnicodeEncodeError, realpath, path, strict=True) + path = '/nonexistent/\ud800' + self.assertRaises(UnicodeEncodeError, realpath, path, strict=False) + self.assertRaises(FileNotFoundError, realpath, path, strict=True) + + @unittest.skipUnless(sys.platform == 'win32', 'requires native Unicode paths') + def test_realpath_undecodable(self): + path = b'/\xff' + self.assertRaises(UnicodeDecodeError, realpath, path, strict=False) + self.assertRaises(UnicodeDecodeError, realpath, path, strict=True) + path = b'/nonexistent/\xff' + self.assertRaises(UnicodeDecodeError, realpath, path, strict=False) + self.assertRaises(FileNotFoundError, realpath, path, strict=True) @os_helper.skip_unless_symlink @skip_if_ABSTFN_contains_backslash diff --git a/Lib/test/test_tarfile.py b/Lib/test/test_tarfile.py index 2e9ece6c032f57..2018a20afd1b18 100644 --- a/Lib/test/test_tarfile.py +++ b/Lib/test/test_tarfile.py @@ -4093,6 +4093,7 @@ def test_data_filter(self): self.assertIs(filtered.name, tarinfo.name) self.assertIs(filtered.type, tarinfo.type) + @unittest.skipIf(sys.platform == 'win32', 'requires native bytes paths') def test_filter_unencodable(self): # Sanity check using a valid path. tarinfo = tarfile.TarInfo(os_helper.TESTFN) @@ -4101,14 +4102,17 @@ def test_filter_unencodable(self): filtered = tarfile.data_filter(tarinfo, '') self.assertIs(filtered.name, tarinfo.name) - tarinfo = tarfile.TarInfo('test\0') + tarinfo = tarfile.TarInfo('test\x00') self.assertRaises(ValueError, tarfile.tar_filter, tarinfo, '') self.assertRaises(ValueError, tarfile.data_filter, tarinfo, '') tarinfo = tarfile.TarInfo('\ud800') self.assertRaises(UnicodeEncodeError, tarfile.tar_filter, tarinfo, '') self.assertRaises(UnicodeEncodeError, tarfile.data_filter, tarinfo, '') - def test_extract_encode_error(self): + @unittest.skipIf(sys.platform == 'win32', 'requires native bytes paths') + def test_extract_unencodable(self): + # Create a member with name \xed\xa0\x80 which is UTF-8 encoded + # lone surrogate \ud800. with ArchiveMaker(encoding='ascii', errors='surrogateescape') as arc: arc.add('\udced\udca0\udc80') with os_helper.temp_cwd() as tmp: From 730ff5b5579cc0d8b6b574901df8339f5a162636 Mon Sep 17 00:00:00 2001 From: Serhiy Storchaka Date: Sun, 18 May 2025 20:52:09 +0300 Subject: [PATCH 3/3] Extract realpath() tests to a separate PR. --- Lib/test/test_ntpath.py | 28 ---------------------------- Lib/test/test_posixpath.py | 32 -------------------------------- 2 files changed, 60 deletions(-) diff --git a/Lib/test/test_ntpath.py b/Lib/test/test_ntpath.py index 7cb638c5faecf6..c10387b58e3f9c 100644 --- a/Lib/test/test_ntpath.py +++ b/Lib/test/test_ntpath.py @@ -437,34 +437,6 @@ def test_realpath_strict(self): # gh-106242: Embedded nulls should raise OSError (not ValueError) self.assertRaises(OSError, ntpath.realpath, ABSTFN + "\0spam", strict=True) - @unittest.skipUnless(HAVE_GETFINALPATHNAME, 'need _getfinalpathname') - def test_realpath_embedded_null(self): - realpath = ntpath.realpath - ABSTFN = ntpath.abspath(os_helper.TESTFN) - path = ABSTFN + '\x00' - self.assertEqual(realpath(path, strict=False), path) - self.assertRaises(OSError, realpath, path, strict=True) - path = os.fsencode(ABSTFN) + b'\x00' - self.assertEqual(realpath(path, strict=False), path) - self.assertRaises(OSError, realpath, path, strict=True) - path = ABSTFN + '\\nonexistent\\x\x00' - self.assertEqual(realpath(path, strict=False), path) - self.assertRaises(OSError, realpath, path, strict=True) - path = os.fsencode(ABSTFN) + b'\\nonexistent\\x\x00' - self.assertEqual(realpath(path, strict=False), path) - self.assertRaises(OSError, realpath, path, strict=True) - - @unittest.skipUnless(HAVE_GETFINALPATHNAME, 'need _getfinalpathname') - def test_realpath_undecodable(self): - realpath = ntpath.realpath - ABSTFN = ntpath.abspath(os_helper.TESTFN) - path = os.fsencode(ABSTFN) + b'\xff' - self.assertRaises(UnicodeDecodeError, realpath, path, strict=False) - self.assertRaises(UnicodeDecodeError, realpath, path, strict=True) - path = os.fsencode(ABSTFN) + b'\\nonexistent\\\xff' - self.assertRaises(UnicodeDecodeError, realpath, path, strict=False) - self.assertRaises(UnicodeDecodeError, realpath, path, strict=True) - @os_helper.skip_unless_symlink @unittest.skipUnless(HAVE_GETFINALPATHNAME, 'need _getfinalpathname') def test_realpath_relative(self): diff --git a/Lib/test/test_posixpath.py b/Lib/test/test_posixpath.py index eec6b09a6408c2..fa19d549c26cfa 100644 --- a/Lib/test/test_posixpath.py +++ b/Lib/test/test_posixpath.py @@ -489,38 +489,6 @@ def test_realpath_strict(self): finally: os_helper.unlink(ABSTFN) - def test_realpath_embedded_null(self): - path = '/\x00' - self.assertRaises(ValueError, realpath, path, strict=False) - self.assertRaises(ValueError, realpath, path, strict=True) - path = b'/\x00' - self.assertRaises(ValueError, realpath, path, strict=False) - self.assertRaises(ValueError, realpath, path, strict=True) - path = '/nonexistent/x\x00' - self.assertRaises(ValueError, realpath, path, strict=False) - self.assertRaises(FileNotFoundError, realpath, path, strict=True) - path = b'/nonexistent/x\x00' - self.assertRaises(ValueError, realpath, path, strict=False) - self.assertRaises(FileNotFoundError, realpath, path, strict=True) - - @unittest.skipIf(sys.platform == 'win32', 'requires native bytes paths') - def test_realpath_unencodable(self): - path = '/\ud800' - self.assertRaises(UnicodeEncodeError, realpath, path, strict=False) - self.assertRaises(UnicodeEncodeError, realpath, path, strict=True) - path = '/nonexistent/\ud800' - self.assertRaises(UnicodeEncodeError, realpath, path, strict=False) - self.assertRaises(FileNotFoundError, realpath, path, strict=True) - - @unittest.skipUnless(sys.platform == 'win32', 'requires native Unicode paths') - def test_realpath_undecodable(self): - path = b'/\xff' - self.assertRaises(UnicodeDecodeError, realpath, path, strict=False) - self.assertRaises(UnicodeDecodeError, realpath, path, strict=True) - path = b'/nonexistent/\xff' - self.assertRaises(UnicodeDecodeError, realpath, path, strict=False) - self.assertRaises(FileNotFoundError, realpath, path, strict=True) - @os_helper.skip_unless_symlink @skip_if_ABSTFN_contains_backslash def test_realpath_relative(self):