diff --git a/CHANGELOG.md b/CHANGELOG.md index f658278131..014decc31e 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,32 @@ # CHANGELOG +## Version 3.3.0, 2023-01-22 + +### New Features (ENH) +- Add page label support to PdfWriter (#1558) +- Accept inline images with space before EI (#1552) +- Add circle annotation support (#1556) +- Add polygon annotation support (#1557) +- Make merging pages produce a deterministic PDF (#1542, #1543) + +### Bug Fixes (BUG) +- Fix error in cmap extraction (#1544) +- Remove erroneous assertion check (#1564) +- Fix dictionary access of optional page label keys (#1562) + +### Robustness (ROB) +- Set ignore_eof=True for read_until_regex (#1521) + +### Documentation (DOC) +- Paper size (#1550) + +### Developer Experience (DEV) +- Fix broken combination of dependencies of docs.txt +- Annotate tests appropriately (#1551) + +[Full Changelog](https://github.com/py-pdf/pypdf/compare/3.2.1...3.3.0) + + ## Version 3.2.1, 2023-01-08 ### Bug Fixes (BUG) diff --git a/CONTRIBUTORS.md b/CONTRIBUTORS.md index d17c9a1f4d..0af20cd2d8 100644 --- a/CONTRIBUTORS.md +++ b/CONTRIBUTORS.md @@ -20,6 +20,8 @@ history and [GitHubs 'Contributors' feature](https://github.com/py-pdf/pypdf/gra * [Karvonen, Harry](https://github.com/Hatell/) * [KourFrost](https://github.com/KourFrost) * [Lightup1](https://github.com/Lightup1) +* [Majumder, Jonah](https://github.com/jonahmajumder) +* [Manini, Lorenzo](https://github.com/lorenzomanini) * [maxbeer99](https://github.com/maxbeer99) * [Mérino, Antoine](https://github.com/Merinorus) * [Perrensen, Olsen](https://github.com/olsonperrensen) diff --git a/docs/modules/PaperSize.rst b/docs/modules/PaperSize.rst index 0487678522..0cbc36f402 100644 --- a/docs/modules/PaperSize.rst +++ b/docs/modules/PaperSize.rst @@ -1,7 +1,34 @@ The PaperSize Class ------------------------- +------------------- .. 
autoclass:: pypdf.PaperSize :members: :undoc-members: :show-inheritance: + +Add blank page with PaperSize +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code-block:: python + :linenos: + + from pypdf import PaperSize, PdfReader, PdfWriter + pdf_reader = PdfReader("sample.pdf") + pdf_writer = PdfWriter() + pdf_writer.append_pages_from_reader(pdf_reader) + pdf_writer.add_blank_page(PaperSize.A8.width, PaperSize.A8.height) + with open("output.pdf", "wb") as output_stream: + pdf_writer.write(output_stream) + +Insert blank page with PaperSize +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. code-block:: python + :linenos: + + from pypdf import PaperSize, PdfReader, PdfWriter + pdf_reader = PdfReader("sample.pdf") + pdf_writer = PdfWriter() + pdf_writer.append_pages_from_reader(pdf_reader) + pdf_writer.insert_blank_page(PaperSize.A8.width, PaperSize.A8.height, 1) + with open("output.pdf", "wb") as output_stream: + pdf_writer.write(output_stream) + \ No newline at end of file diff --git a/docs/user/adding-pdf-annotations.md b/docs/user/adding-pdf-annotations.md index 54c451547a..620ea8704a 100644 --- a/docs/user/adding-pdf-annotations.md +++ b/docs/user/adding-pdf-annotations.md @@ -104,7 +104,7 @@ page = reader.pages[0] writer = PdfWriter() writer.add_page(page) -# Add the line +# Add the rectangle annotation = AnnotationBuilder.rectangle( rect=(50, 550, 200, 650), ) @@ -119,6 +119,56 @@ If you want the rectangle to be filled, use the `interiour_color="ff0000"` param This method uses the "square" annotation type of the PDF format. 
+ +## Ellipse + +If you want to add a circle like this: + +![](annotation-circle.png) + +```python +pdf_path = os.path.join(RESOURCE_ROOT, "crazyones.pdf") +reader = PdfReader(pdf_path) +page = reader.pages[0] +writer = PdfWriter() +writer.add_page(page) + +# Add the ellipse +annotation = AnnotationBuilder.ellipse( + rect=(50, 550, 200, 650)) +writer.add_annotation(page_number=0, annotation=annotation) + +# Write the annotated file to disk +with open("annotated-pdf.pdf", "wb") as fp: + writer.write(fp) +``` + +## Polygon + +If you want to add a polygon like this: + +![](annotation-polygon.png) + +you can use the {py:class}`AnnotationBuilder <pypdf.generic.AnnotationBuilder>`: + +```python +pdf_path = os.path.join(RESOURCE_ROOT, "crazyones.pdf") +reader = PdfReader(pdf_path) +page = reader.pages[0] +writer = PdfWriter() +writer.add_page(page) + +# Add the polygon +annotation = AnnotationBuilder.polygon( + vertices=[(50, 550), (200, 650), (70, 750), (50, 700)], +) +writer.add_annotation(page_number=0, annotation=annotation) + +# Write the annotated file to disk +with open("annotated-pdf.pdf", "wb") as fp: + writer.write(fp) +``` + ## Link If you want to add a link, you can use diff --git a/docs/user/annotation-circle.png b/docs/user/annotation-circle.png new file mode 100644 index 0000000000..8bf8bdf9dd Binary files /dev/null and b/docs/user/annotation-circle.png differ diff --git a/docs/user/annotation-polygon.png b/docs/user/annotation-polygon.png new file mode 100644 index 0000000000..5b8e74f744 Binary files /dev/null and b/docs/user/annotation-polygon.png differ diff --git a/docs/user/extract-text.md b/docs/user/extract-text.md index dca9895694..0036de5500 100644 --- a/docs/user/extract-text.md +++ b/docs/user/extract-text.md @@ -151,6 +151,30 @@ the way PDF stores information just makes it hard to achieve that: And finally there are issues that pypdf will deal with. If you find such a text extraction bug, please share the PDF with us so we can work on it! 
+### Whitespaces + +The PDF format is meant for printing. It is not designed to be read by machines. +The text within a PDF document is absolutely positioned, meaning that every single +character could be positioned on the page. + +The text + +> This is a test document by Ethan Nelson. + +can be represented as + +> [(This is a )9(te)-3(st)9( do)-4(cu)13(m)-4(en)12(t )-3(b)3(y)-3( )9(Et)-2(h)3(an)4( Nels)13(o)-5(n)3(.)] TJ + +Where the numbers are adjustments of horizontal space. This representation used +within the PDF file makes it very hard to guarantee correct whitespaces. + + +More information: + +* [issue #1507](https://github.com/py-pdf/pypdf/issues/1507) +* [Negative numbers in PDF content stream text object](https://stackoverflow.com/a/28203655/562769) +* Mark Stephens: [Understanding PDF text objects](https://blog.idrsolutions.com/understanding-pdf-text-objects/), 2010. + ## OCR vs Text Extraction Optical Character Recognition (OCR) is the process of extracting text from diff --git a/make_changelog.py b/make_changelog.py index d5f0b93799..badd315d05 100644 --- a/make_changelog.py +++ b/make_changelog.py @@ -27,7 +27,8 @@ def main(changelog_path: str): today = datetime.now() header = f"Version {new_version}, {today:%Y-%m-%d}\n" header = header + "-" * (len(header) - 1) + "\n" - trailer = f"\n[Full Changelog](https://github.com/py-pdf/pypdf/compare/{git_tag}...{new_version})\n\n" + url = f"https://github.com/py-pdf/pypdf/compare/{git_tag}...{new_version}" + trailer = f"\n[Full Changelog]({url})\n\n" new_entry = header + changes + trailer print(new_entry) diff --git a/pypdf/_cmap.py b/pypdf/_cmap.py index 8c472f87dd..9890526ba0 100644 --- a/pypdf/_cmap.py +++ b/pypdf/_cmap.py @@ -27,8 +27,10 @@ def build_char_map( encoding, space_code = parse_encoding(ft, space_code) map_dict, space_code, int_entry = parse_to_unicode(ft, space_code) - # encoding can be either a string for decode (on 1,2 or a variable number of bytes) of a char table (for 1 byte only for me) - # 
if empty string, it means it is than encoding field is not present and we have to select the good encoding from cmap input data + # encoding can be either a string for decode + # (on 1,2 or a variable number of bytes) or a char table (for 1 byte only for me) + # if empty string, it means that the encoding field is not present and + # we have to select the right encoding from the cmap input data if encoding == "": if -1 not in map_dict or map_dict[-1] == 1: # I have not been able to find any rule for no /Encoding nor /ToUnicode @@ -36,7 +38,9 @@ def build_char_map( encoding = "charmap" else: encoding = "utf-16-be" - # apply rule from PDF ref 1.7 §5.9.1, 1st bullet : if cmap not empty encoding should be discarded (here transformed into identity for those characters) + # apply rule from PDF ref 1.7 §5.9.1, 1st bullet : + # if cmap not empty encoding should be discarded + # (here transformed into identity for those characters) # if encoding is an str it is expected to be a identity translation elif isinstance(encoding, dict): for x in int_entry: @@ -131,7 +135,9 @@ def parse_encoding( enc: Union(str, DictionaryObject) = ft["/Encoding"].get_object() # type: ignore if isinstance(enc, str): try: - # allready done : enc = NameObject.unnumber(enc.encode()).decode() # for #xx decoding + # already done: + # enc = NameObject.unnumber(enc.encode()).decode() + # for #xx decoding if enc in charset_encoding: encoding = charset_encoding[enc].copy() elif enc in _predefined_cmap: @@ -214,10 +220,12 @@ def prepare_cm(ft: DictionaryObject) -> bytes: if isinstance(tu, StreamObject): cm = cast(DecodedStreamObject, ft["/ToUnicode"]).get_data() elif isinstance(tu, str) and tu.startswith("/Identity"): - cm = b"beginbfrange\n<0000> <0001> <0000>\nendbfrange" # the full range 0000-FFFF will be processed + # the full range 0000-FFFF will be processed + cm = b"beginbfrange\n<0000> <0001> <0000>\nendbfrange" if isinstance(cm, str): cm = cm.encode() - # we need to prepare cm before due to missing 
return line in pdf printed to pdf from word + # we need to prepare cm before due to missing return line in pdf printed + # to pdf from word cm = ( cm.strip() .replace(b"beginbfchar", b"\nbeginbfchar\n") @@ -280,13 +288,11 @@ def parse_bfrange( ) -> Union[None, Tuple[int, int]]: lst = [x for x in line.split(b" ") if x] closure_found = False - nbi = max(len(lst[0]), len(lst[1])) - map_dict[-1] = ceil(nbi / 2) - fmt = b"%%0%dX" % (map_dict[-1] * 2) if multiline_rg is not None: + fmt = b"%%0%dX" % (map_dict[-1] * 2) a = multiline_rg[0] # a, b not in the current line b = multiline_rg[1] - for sq in lst[1:]: + for sq in lst[0:]: if sq == b"]": closure_found = True break @@ -301,6 +307,9 @@ def parse_bfrange( else: a = int(lst[0], 16) b = int(lst[1], 16) + nbi = max(len(lst[0]), len(lst[1])) + map_dict[-1] = ceil(nbi / 2) + fmt = b"%%0%dX" % (map_dict[-1] * 2) if lst[2] == b"[": for sq in lst[3:]: if sq == b"]": diff --git a/pypdf/_encryption.py b/pypdf/_encryption.py index 038067a4ba..3d331fdab0 100644 --- a/pypdf/_encryption.py +++ b/pypdf/_encryption.py @@ -257,8 +257,8 @@ def compute_key( 2E 2E 00 B6 D0 68 3E 80 2F 0C A9 FE 64 53 69 7A > That is, if the password string is n bytes long, append the first 32 - n bytes of the padding string to the end - of the password string. If the password string is empty (zero-length), - meaning there is no user password, + of the password string. If the password string is empty + (zero-length), meaning there is no user password, substitute the entire padding string in its place. b) Initialize the MD5 hash function and pass the result of step (a) @@ -295,10 +295,10 @@ def compute_key( key_size: The size of the key in bytes o_entry: The owner entry P: A set of flags specifying which operations shall be permitted - when the document is opened with user access. If bit 2 is set to 1, all other - bits are ignored and all operations are permitted. 
If bit 2 is set to 0, - permission for operations are based on the values of the remaining flags - defined in Table 24. + when the document is opened with user access. If bit 2 is set to 1, + all other bits are ignored and all operations are permitted. + If bit 2 is set to 0, permission for operations are based on the + values of the remaining flags defined in Table 24. id1_entry: metadata_encrypted: A boolean indicating if the metadata is encrypted. @@ -465,14 +465,20 @@ def verify_user_password( """ Algorithm 6: Authenticating the user password. - a) Perform all but the last step of "Algorithm 4: Computing the encryption dictionary’s U (user password) - value (Security handlers of revision 2)" or "Algorithm 5: Computing the encryption dictionary’s U (user - password) value (Security handlers of revision 3 or greater)" using the supplied password string. - b) If the result of step (a) is equal to the value of the encryption dictionary’s U entry (comparing on the first 16 - bytes in the case of security handlers of revision 3 or greater), the password supplied is the correct user - password. The key obtained in step (a) (that is, in the first step of "Algorithm 4: Computing the encryption - dictionary’s U (user password) value (Security handlers of revision 2)" or "Algorithm 5: Computing the - encryption dictionary’s U (user password) value (Security handlers of revision 3 or greater)") shall be used + a) Perform all but the last step of "Algorithm 4: Computing the + encryption dictionary’s U (user password) value (Security handlers of + revision 2)" or "Algorithm 5: Computing the encryption dictionary’s U + (user password) value (Security handlers of revision 3 or greater)" + using the supplied password string. + b) If the result of step (a) is equal to the value of the encryption + dictionary’s U entry (comparing on the first 16 bytes in the case of + security handlers of revision 3 or greater), the password supplied is + the correct user password. 
The key obtained in step (a) (that is, in + the first step of "Algorithm 4: Computing the encryption + dictionary’s U (user password) value + (Security handlers of revision 2)" or + "Algorithm 5: Computing the encryption dictionary’s U (user password) + value (Security handlers of revision 3 or greater)") shall be used to decrypt the document. Args: @@ -482,10 +488,10 @@ def verify_user_password( o_entry: The owner entry u_entry: The user entry P: A set of flags specifying which operations shall be permitted - when the document is opened with user access. If bit 2 is set to 1, all other - bits are ignored and all operations are permitted. If bit 2 is set to 0, - permission for operations are based on the values of the remaining flags - defined in Table 24. + when the document is opened with user access. If bit 2 is set to 1, + all other bits are ignored and all operations are permitted. + If bit 2 is set to 0, permission for operations are based on the + values of the remaining flags defined in Table 24. id1_entry: metadata_encrypted: A boolean indicating if the metadata is encrypted. @@ -517,17 +523,25 @@ def verify_owner_password( """ Algorithm 7: Authenticating the owner password. - a) Compute an encryption key from the supplied password string, as described in steps (a) to (d) of - "Algorithm 3: Computing the encryption dictionary’s O (owner password) value". - b) (Security handlers of revision 2 only) Decrypt the value of the encryption dictionary’s O entry, using an RC4 + a) Compute an encryption key from the supplied password string, as + described in steps (a) to (d) of + "Algorithm 3: Computing the encryption dictionary’s O (owner password) + value". + b) (Security handlers of revision 2 only) Decrypt the value of the + encryption dictionary’s O entry, using an RC4 encryption function with the encryption key computed in step (a). 
- (Security handlers of revision 3 or greater) Do the following 20 times: Decrypt the value of the encryption - dictionary’s O entry (first iteration) or the output from the previous iteration (all subsequent iterations), - using an RC4 encryption function with a different encryption key at each iteration. The key shall be - generated by taking the original key (obtained in step (a)) and performing an XOR (exclusive or) operation - between each byte of the key and the single-byte value of the iteration counter (from 19 to 0). - c) The result of step (b) purports to be the user password. Authenticate this user password using "Algorithm 6: - Authenticating the user password". If it is correct, the password supplied is the correct owner password. + (Security handlers of revision 3 or greater) Do the following 20 times: + Decrypt the value of the encryption dictionary’s O entry (first iteration) + or the output from the previous iteration (all subsequent iterations), + using an RC4 encryption function with a different encryption key at + each iteration. The key shall be generated by taking the original key + (obtained in step (a)) and performing an XOR (exclusive or) operation + between each byte of the key and the single-byte value of the + iteration counter (from 19 to 0). + c) The result of step (b) purports to be the user password. + Authenticate this user password using + "Algorithm 6: Authenticating the user password". + If it is correct, the password supplied is the correct owner password. Args: owner_password: @@ -536,10 +550,10 @@ def verify_owner_password( o_entry: The owner entry u_entry: The user entry P: A set of flags specifying which operations shall be permitted - when the document is opened with user access. If bit 2 is set to 1, all other - bits are ignored and all operations are permitted. If bit 2 is set to 0, - permission for operations are based on the values of the remaining flags - defined in Table 24. 
+ when the document is opened with user access. If bit 2 is set to 1, + all other bits are ignored and all operations are permitted. + If bit 2 is set to 0, permission for operations are based on the + values of the remaining flags defined in Table 24. id1_entry: metadata_encrypted: A boolean indicating if the metadata is encrypted. @@ -575,31 +589,44 @@ def verify_owner_password( """ Algorithm 3.2a Computing an encryption key. - To understand the algorithm below, it is necessary to treat the O and U strings in the Encrypt dictionary - as made up of three sections. The first 32 bytes are a hash value (explained below). The next 8 bytes are - called the Validation Salt. The final 8 bytes are called the Key Salt. - - 1. The password string is generated from Unicode input by processing the input string with the SASLprep - (IETF RFC 4013) profile of stringprep (IETF RFC 3454), and then converting to a UTF-8 representation. - 2. Truncate the UTF-8 representation to 127 bytes if it is longer than 127 bytes. - 3. Test the password against the owner key by computing the SHA-256 hash of the UTF-8 password - concatenated with the 8 bytes of owner Validation Salt, concatenated with the 48-byte U string. If the - 32-byte result matches the first 32 bytes of the O string, this is the owner password. - Compute an intermediate owner key by computing the SHA-256 hash of the UTF-8 password - concatenated with the 8 bytes of owner Key Salt, concatenated with the 48-byte U string. The 32-byte - result is the key used to decrypt the 32-byte OE string using AES-256 in CBC mode with no padding and - an initialization vector of zero. The 32-byte result is the file encryption key. - 4. Test the password against the user key by computing the SHA-256 hash of the UTF-8 password - concatenated with the 8 bytes of user Validation Salt. 
If the 32 byte result matches the first 32 bytes of + To understand the algorithm below, it is necessary to treat the O and U + strings in the Encrypt dictionary as made up of three sections. + The first 32 bytes are a hash value (explained below). The next 8 bytes + are called the Validation Salt. The final 8 bytes are called the Key Salt. + + 1. The password string is generated from Unicode input by processing the + input string with the SASLprep (IETF RFC 4013) profile of + stringprep (IETF RFC 3454), and then converting to a UTF-8 + representation. + 2. Truncate the UTF-8 representation to 127 bytes if it is longer than + 127 bytes. + 3. Test the password against the owner key by computing the SHA-256 hash + of the UTF-8 password concatenated with the 8 bytes of owner + Validation Salt, concatenated with the 48-byte U string. If the + 32-byte result matches the first 32 bytes of the O string, this is + the owner password. + Compute an intermediate owner key by computing the SHA-256 hash of + the UTF-8 password concatenated with the 8 bytes of owner Key Salt, + concatenated with the 48-byte U string. The 32-byte result is the + key used to decrypt the 32-byte OE string using AES-256 in CBC mode + with no padding and an initialization vector of zero. + The 32-byte result is the file encryption key. + 4. Test the password against the user key by computing the SHA-256 hash + of the UTF-8 password concatenated with the 8 bytes of user + Validation Salt. If the 32 byte result matches the first 32 bytes of the U string, this is the user password. - Compute an intermediate user key by computing the SHA-256 hash of the UTF-8 password - concatenated with the 8 bytes of user Key Salt. The 32-byte result is the key used to decrypt the 32-byte - UE string using AES-256 in CBC mode with no padding and an initialization vector of zero. The 32-byte - result is the file encryption key. - 5. 
Decrypt the 16-byte Perms string using AES-256 in ECB mode with an initialization vector of zero and - the file encryption key as the key. Verify that bytes 9-11 of the result are the characters ‘a’, ‘d’, ‘b’. Bytes - 0-3 of the decrypted Perms entry, treated as a little-endian integer, are the user permissions. They - should match the value in the P key. + Compute an intermediate user key by computing the SHA-256 hash of the + UTF-8 password concatenated with the 8 bytes of user Key Salt. + The 32-byte result is the key used to decrypt the 32-byte + UE string using AES-256 in CBC mode with no padding and an + initialization vector of zero. The 32-byte result is the file + encryption key. + 5. Decrypt the 16-byte Perms string using AES-256 in ECB mode with an + initialization vector of zero and the file encryption key as the key. + Verify that bytes 9-11 of the result are the characters ‘a’, ‘d’, ‘b’. + Bytes 0-3 of the decrypted Perms entry, treated as a little-endian + integer, are the user permissions. + They should match the value in the P key. Args: R: A number specifying which revision of the standard security @@ -721,15 +748,20 @@ def generate_values( @staticmethod def compute_U_value(password: bytes, key: bytes) -> Tuple[bytes, bytes]: """ - Algorithm 3.8 Computing the encryption dictionary’s U (user password) and UE (user encryption key) values - - 1. Generate 16 random bytes of data using a strong random number generator. The first 8 bytes are the - User Validation Salt. The second 8 bytes are the User Key Salt. Compute the 32-byte SHA-256 hash of - the password concatenated with the User Validation Salt. The 48-byte string consisting of the 32-byte - hash followed by the User Validation Salt followed by the User Key Salt is stored as the U key. - 2. Compute the 32-byte SHA-256 hash of the password concatenated with the User Key Salt. 
Using this - hash as the key, encrypt the file encryption key using AES-256 in CBC mode with no padding and an - initialization vector of zero. The resulting 32-byte string is stored as the UE key. + Algorithm 3.8 Computing the encryption dictionary’s U (user password) + and UE (user encryption key) values + + 1. Generate 16 random bytes of data using a strong random number generator. + The first 8 bytes are the User Validation Salt. The second 8 bytes + are the User Key Salt. Compute the 32-byte SHA-256 hash of the + password concatenated with the User Validation Salt. The 48-byte + string consisting of the 32-byte hash followed by the User + Validation Salt followed by the User Key Salt is stored as the U key. + 2. Compute the 32-byte SHA-256 hash of the password concatenated with + the User Key Salt. Using this hash as the key, encrypt the file + encryption key using AES-256 in CBC mode with no padding and an + initialization vector of zero. The resulting 32-byte string is stored + as the UE key. Args: password: @@ -753,24 +785,30 @@ def compute_O_value( password: bytes, key: bytes, u_value: bytes ) -> Tuple[bytes, bytes]: """ - Algorithm 3.9 Computing the encryption dictionary’s O (owner password) and OE (owner encryption key) values. - - 1. Generate 16 random bytes of data using a strong random number generator. The first 8 bytes are the - Owner Validation Salt. The second 8 bytes are the Owner Key Salt. Compute the 32-byte SHA-256 hash - of the password concatenated with the Owner Validation Salt and then concatenated with the 48-byte - U string as generated in Algorithm 3.8. The 48-byte string consisting of the 32-byte hash followed by - the Owner Validation Salt followed by the Owner Key Salt is stored as the O key. - 2. Compute the 32-byte SHA-256 hash of the password concatenated with the Owner Key Salt and then - concatenated with the 48-byte U string as generated in Algorithm 3.8. 
Using this hash as the key, - encrypt the file encryption key using AES-256 in CBC mode with no padding and an initialization vector - of zero. The resulting 32-byte string is stored as the OE key. + Algorithm 3.9 Computing the encryption dictionary’s O (owner password) + and OE (owner encryption key) values. + + 1. Generate 16 random bytes of data using a strong random number + generator. The first 8 bytes are the Owner Validation Salt. The + second 8 bytes are the Owner Key Salt. Compute the 32-byte SHA-256 + hash of the password concatenated with the Owner Validation Salt and + then concatenated with the 48-byte U string as generated in + Algorithm 3.8. The 48-byte string consisting of the 32-byte hash + followed by the Owner Validation Salt followed by the Owner Key Salt + is stored as the O key. + 2. Compute the 32-byte SHA-256 hash of the password concatenated with + the Owner Key Salt and then concatenated with the 48-byte U string as + generated in Algorithm 3.8. Using this hash as the key, + encrypt the file encryption key using AES-256 in CBC mode with + no padding and an initialization vector of zero. + The resulting 32-byte string is stored as the OE key. Args: password: key: u_value: A 32-byte string, based on the user password, that shall be - used in determining whether to prompt the user for a password and, if so, - whether a valid user or owner password was entered. + used in determining whether to prompt the user for a password and, + if so, whether a valid user or owner password was entered. Returns: A tuple (O value, OE value) @@ -792,23 +830,27 @@ def compute_Perms_value(key: bytes, p: int, metadata_encrypted: bool) -> bytes: """ Algorithm 3.10 Computing the encryption dictionary’s Perms (permissions) value - 1. Extend the permissions (contents of the P integer) to 64 bits by setting the upper 32 bits to all 1’s. (This - allows for future extension without changing the format.) - 2. 
Record the 8 bytes of permission in the bytes 0-7 of the block, low order byte first. - 3. Set byte 8 to the ASCII value ' T ' or ' F ' according to the EncryptMetadata Boolean. + 1. Extend the permissions (contents of the P integer) to 64 bits by + setting the upper 32 bits to all 1’s. + (This allows for future extension without changing the format.) + 2. Record the 8 bytes of permission in the bytes 0-7 of the block, + low order byte first. + 3. Set byte 8 to the ASCII value ' T ' or ' F ' according to the + EncryptMetadata Boolean. 4. Set bytes 9-11 to the ASCII characters ' a ', ' d ', ' b '. 5. Set bytes 12-15 to 4 bytes of random data, which will be ignored. - 6. Encrypt the 16-byte block using AES-256 in ECB mode with an initialization vector of zero, using the file - encryption key as the key. The result (16 bytes) is stored as the Perms string, and checked for validity - when the file is opened. + 6. Encrypt the 16-byte block using AES-256 in ECB mode with an + initialization vector of zero, using the file encryption key as the + key. The result (16 bytes) is stored as the Perms string, and checked + for validity when the file is opened. Args: key: p: A set of flags specifying which operations shall be permitted - when the document is opened with user access. If bit 2 is set to 1, all other - bits are ignored and all operations are permitted. If bit 2 is set to 0, - permission for operations are based on the values of the remaining flags - defined in Table 24. + when the document is opened with user access. If bit 2 is set to 1, + all other bits are ignored and all operations are permitted. + If bit 2 is set to 0, permission for operations are based on the + values of the remaining flags defined in Table 24. metadata_encrypted: A boolean indicating if the metadata is encrypted. Returns: @@ -860,30 +902,40 @@ def decrypt_object(self, obj: PdfObject, idnum: int, generation: int) -> PdfObje """ Algorithm 1: Encryption of data using the RC4 or AES algorithms. 
- a) Obtain the object number and generation number from the object identifier of the string or stream to be - encrypted (see 7.3.10, "Indirect Objects"). If the string is a direct object, use the identifier of the indirect - object containing it. - b) For all strings and streams without crypt filter specifier; treating the object number and generation number - as binary integers, extend the original n-byte encryption key to n + 5 bytes by appending the low-order 3 - bytes of the object number and the low-order 2 bytes of the generation number in that order, low-order byte - first. (n is 5 unless the value of V in the encryption dictionary is greater than 1, in which case n is the value - of Length divided by 8.) - If using the AES algorithm, extend the encryption key an additional 4 bytes by adding the value “sAlT”, - which corresponds to the hexadecimal values 0x73, 0x41, 0x6C, 0x54. (This addition is done for backward - compatibility and is not intended to provide additional security.) - c) Initialize the MD5 hash function and pass the result of step (b) as input to this function. - d) Use the first (n + 5) bytes, up to a maximum of 16, of the output from the MD5 hash as the key for the RC4 - or AES symmetric key algorithms, along with the string or stream data to be encrypted. - If using the AES algorithm, the Cipher Block Chaining (CBC) mode, which requires an initialization vector, - is used. The block size parameter is set to 16 bytes, and the initialization vector is a 16-byte random - number that is stored as the first 16 bytes of the encrypted stream or string. + a) Obtain the object number and generation number from the object + identifier of the string or stream to be encrypted + (see 7.3.10, "Indirect Objects"). If the string is a direct object, + use the identifier of the indirect object containing it. 
+ b) For all strings and streams without crypt filter specifier; treating + the object number and generation number as binary integers, extend + the original n-byte encryption key to n + 5 bytes by appending the + low-order 3 bytes of the object number and the low-order 2 bytes of + the generation number in that order, low-order byte first. + (n is 5 unless the value of V in the encryption dictionary is greater + than 1, in which case n is the value of Length divided by 8.) + If using the AES algorithm, extend the encryption key an additional + 4 bytes by adding the value “sAlT”, which corresponds to the + hexadecimal values 0x73, 0x41, 0x6C, 0x54. (This addition is done for + backward compatibility and is not intended to provide additional + security.) + c) Initialize the MD5 hash function and pass the result of step (b) as + input to this function. + d) Use the first (n + 5) bytes, up to a maximum of 16, of the output + from the MD5 hash as the key for the RC4 or AES symmetric key + algorithms, along with the string or stream data to be encrypted. + If using the AES algorithm, the Cipher Block Chaining (CBC) mode, + which requires an initialization vector, is used. The block size + parameter is set to 16 bytes, and the initialization vector is a + 16-byte random number that is stored as the first 16 bytes of the + encrypted stream or string. Algorithm 3.1a Encryption of data using the AES algorithm - 1. Use the 32-byte file encryption key for the AES-256 symmetric key algorithm, along with the string or - stream data to be encrypted. - Use the AES algorithm in Cipher Block Chaining (CBC) mode, which requires an initialization vector. The - block size parameter is set to 16 bytes, and the initialization vector is a 16-byte random number that is - stored as the first 16 bytes of the encrypted stream or string. + 1. Use the 32-byte file encryption key for the AES-256 symmetric key + algorithm, along with the string or stream data to be encrypted. 
+ Use the AES algorithm in Cipher Block Chaining (CBC) mode, which + requires an initialization vector. The block size parameter is set to + 16 bytes, and the initialization vector is a 16-byte random number + that is stored as the first 16 bytes of the encrypted stream or string. The output is the encrypted data to be stored in the PDF file. Args: diff --git a/pypdf/_merger.py b/pypdf/_merger.py index 7642e79031..cbfeb4a38e 100644 --- a/pypdf/_merger.py +++ b/pypdf/_merger.py @@ -178,7 +178,8 @@ def merge( ) else: raise ValueError( - "The argument position of merge is deprecated. Use page_number only." + "The argument position of merge is deprecated. " + "Use page_number only." ) if page_number is None: # deprecated @@ -335,7 +336,8 @@ def write(self, fileobj: Union[Path, StrByteType]) -> None: page.out_pagedata = self.output.get_reference( pages_obj[PA.KIDS][-1].get_object() ) - # idnum = self.output._objects.index(self.output._pages.get_object()[PA.KIDS][-1].get_object()) + 1 + # key_temp = self.output._pages.get_object()[PA.KIDS][-1].get_object() + # idnum = self.output._objects.index(key_temp) + 1 # page.out_pagedata = IndirectObject(idnum, 0, self.output) # Once all pages are added, create outline items to point at those pages @@ -703,7 +705,8 @@ def add_outline_item( """ if page_number is not None and pagenum is not None: raise ValueError( - "The argument pagenum of add_outline_item is deprecated. Use page_number only." + "The argument pagenum of add_outline_item is deprecated. " + "Use page_number only." ) if pagenum is not None: old_term = "pagenum" @@ -809,7 +812,8 @@ def add_named_destination( """ if page_number is not None and pagenum is not None: raise ValueError( - "The argument pagenum of add_named_destination is deprecated. Use page_number only." + "The argument pagenum of add_named_destination is deprecated. " + "Use page_number only." 
) if pagenum is not None: old_term = "pagenum" diff --git a/pypdf/_page.py b/pypdf/_page.py index 2a9656f223..0eb268da26 100644 --- a/pypdf/_page.py +++ b/pypdf/_page.py @@ -28,7 +28,6 @@ # POSSIBILITY OF SUCH DAMAGE. import math -import uuid import warnings from decimal import Decimal from typing import ( @@ -95,17 +94,20 @@ def set_custom_rtl( If set to `None`, the value will not be changed. If set to an integer or string, it will be converted to its ASCII code. The default value is -1, which sets no additional range to be converted. - _max: The new maximum value for the range of custom characters that will be written right to left. + _max: The new maximum value for the range of custom characters that will + be written right to left. If set to `None`, the value will not be changed. If set to an integer or string, it will be converted to its ASCII code. The default value is -1, which sets no additional range to be converted. - specials: The new list of special characters to be inserted in the current insertion order. + specials: The new list of special characters to be inserted in the + current insertion order. If set to `None`, the current value will not be changed. If set to a string, it will be converted to a list of ASCII codes. The default value is an empty list. Returns: - A tuple containing the new values for `CUSTOM_RTL_MIN`, `CUSTOM_RTL_MAX`, and `CUSTOM_RTL_SPECIAL_CHARS`. + A tuple containing the new values for `CUSTOM_RTL_MIN`, + `CUSTOM_RTL_MAX`, and `CUSTOM_RTL_SPECIAL_CHARS`. 
""" global CUSTOM_RTL_MIN, CUSTOM_RTL_MAX, CUSTOM_RTL_SPECIAL_CHARS if isinstance(_min, int): @@ -576,17 +578,41 @@ def _merge_resources( ) -> Tuple[Dict[str, Any], Dict[str, Any]]: new_res = DictionaryObject() new_res.update(res1.get(resource, DictionaryObject()).get_object()) + + def compute_unique_key(base_key: str) -> Tuple[str, bool]: + """Find a key that either doesn't already exist or has the same + value (indicated by the bool)""" + value = page2res.raw_get(base_key) + # try the current key first (e.g. "foo"), but otherwise iterate + # through "foo-0", "foo-1", etc. new_res can contain only finitely + # many keys, thus this'll eventually end, even if it's been crafted + # to be maximally annoying. + computed_key = base_key + idx = 0 + while computed_key in new_res: + if new_res.raw_get(computed_key) == value: + # there's already a resource of this name, with the exact + # same value + return computed_key, True + computed_key = f"{base_key}-{idx}" + idx += 1 + return computed_key, False + page2res = cast( DictionaryObject, res2.get(resource, DictionaryObject()).get_object() ) rename_res = {} - for key in list(page2res.keys()): - if key in new_res and new_res.raw_get(key) != page2res.raw_get(key): - newname = NameObject(key + str(uuid.uuid4())) + for key in sorted(page2res.keys()): + unique_key, same_value = compute_unique_key(key) + newname = NameObject(unique_key) + if key != unique_key: + # we have to use a different name for this rename_res[key] = newname + + if not same_value: + # the value wasn't already recorded new_res[newname] = page2res[key] - elif key not in new_res: - new_res[key] = page2res.raw_get(key) + return new_res, rename_res @staticmethod @@ -740,12 +766,14 @@ def _merge_page( new_resources[NameObject(res)] = new rename.update(newrename) - # Combine /ProcSet sets. 
+ # Combine /ProcSet sets, making sure there's a consistent order new_resources[NameObject(RES.PROC_SET)] = ArrayObject( - frozenset( - original_resources.get(RES.PROC_SET, ArrayObject()).get_object() - ).union( - frozenset(page2resources.get(RES.PROC_SET, ArrayObject()).get_object()) + sorted( + set( + original_resources.get(RES.PROC_SET, ArrayObject()).get_object() + ).union( + set(page2resources.get(RES.PROC_SET, ArrayObject()).get_object()) + ) ) ) @@ -894,7 +922,8 @@ def mergeScaledPage( """ deprecation_with_replacement( "page.mergeScaledPage(page2, scale, expand)", - "page2.add_transformation(Transformation().scale(scale)); page.merge_page(page2, expand)", + "page2.add_transformation(Transformation().scale(scale)); " + "page.merge_page(page2, expand)", "3.0.0", ) op = Transformation().scale(scale, scale) @@ -919,7 +948,8 @@ def mergeRotatedPage( """ deprecation_with_replacement( "page.mergeRotatedPage(page2, rotation, expand)", - "page2.add_transformation(Transformation().rotate(rotation)); page.merge_page(page2, expand)", + "page2.add_transformation(Transformation().rotate(rotation)); " + "page.merge_page(page2, expand)", "3.0.0", ) op = Transformation().rotate(rotation) @@ -945,7 +975,8 @@ def mergeTranslatedPage( """ deprecation_with_replacement( "page.mergeTranslatedPage(page2, tx, ty, expand)", - "page2.add_transformation(Transformation().translate(tx, ty)); page.merge_page(page2, expand)", + "page2.add_transformation(Transformation().translate(tx, ty)); " + "page.merge_page(page2, expand)", "3.0.0", ) op = Transformation().translate(tx, ty) @@ -977,7 +1008,8 @@ def mergeRotatedTranslatedPage( """ deprecation_with_replacement( "page.mergeRotatedTranslatedPage(page2, rotation, tx, ty, expand)", - "page2.add_transformation(Transformation().rotate(rotation).translate(tx, ty)); page.merge_page(page2, expand)", + "page2.add_transformation(Transformation().rotate(rotation).translate(tx, ty)); " + "page.merge_page(page2, expand)", "3.0.0", ) op = 
Transformation().translate(-tx, -ty).rotate(rotation).translate(tx, ty) @@ -1003,7 +1035,8 @@ def mergeRotatedScaledPage( """ deprecation_with_replacement( "page.mergeRotatedScaledPage(page2, rotation, scale, expand)", - "page2.add_transformation(Transformation().rotate(rotation).scale(scale)); page.merge_page(page2, expand)", + "page2.add_transformation(Transformation().rotate(rotation).scale(scale)); " + "page.merge_page(page2, expand)", "3.0.0", ) op = Transformation().rotate(rotation).scale(scale, scale) @@ -1035,7 +1068,8 @@ def mergeScaledTranslatedPage( """ deprecation_with_replacement( "page.mergeScaledTranslatedPage(page2, scale, tx, ty, expand)", - "page2.add_transformation(Transformation().scale(scale).translate(tx, ty)); page.merge_page(page2, expand)", + "page2.add_transformation(Transformation().scale(scale).translate(tx, ty)); " + "page.merge_page(page2, expand)", "3.0.0", ) op = Transformation().scale(scale, scale).translate(tx, ty) @@ -1070,7 +1104,8 @@ def mergeRotatedScaledTranslatedPage( """ deprecation_with_replacement( "page.mergeRotatedScaledTranslatedPage(page2, rotation, tx, ty, expand)", - "page2.add_transformation(Transformation().rotate(rotation).scale(scale)); page.merge_page(page2, expand)", + "page2.add_transformation(Transformation().rotate(rotation).scale(scale)); " + "page.merge_page(page2, expand)", "3.0.0", ) op = Transformation().rotate(rotation).scale(scale, scale).translate(tx, ty) @@ -1334,10 +1369,13 @@ def _extract_text( while NameObject(PG.RESOURCES) not in objr: # /Resources can be inherited sometimes so we look to parents objr = objr["/Parent"].get_object() - # if no parents we will have no /Resources will be available => an exception wil be raised + # if no parents we will have no /Resources will be available + # => an exception wil be raised resources_dict = cast(DictionaryObject, objr[PG.RESOURCES]) except Exception: - return "" # no resources means no text is possible (no font) we consider the file as not damaged, no 
need to check for TJ or Tj + # no resources means no text is possible (no font) we consider the + # file as not damaged, no need to check for TJ or Tj + return "" if "/Font" in resources_dict: for f in cast(DictionaryObject, resources_dict["/Font"]): cmaps[f] = build_char_map(f, space_width, obj) @@ -1403,7 +1441,9 @@ def current_spacewidth() -> float: return _space_width / 1000.0 def process_operation(operator: bytes, operands: List) -> None: - nonlocal cm_matrix, cm_stack, tm_matrix, tm_prev, output, text, char_scale, space_scale, _space_width, TL, font_size, cmap, orientations, rtl_dir, visitor_text + nonlocal cm_matrix, cm_stack, tm_matrix, tm_prev, output, text + nonlocal char_scale, space_scale, _space_width, TL, font_size, cmap + nonlocal orientations, rtl_dir, visitor_text global CUSTOM_RTL_MIN, CUSTOM_RTL_MAX, CUSTOM_RTL_SPECIAL_CHARS check_crlf_space: bool = False @@ -1484,10 +1524,12 @@ def process_operation(operator: bytes, operands: List) -> None: text = "" # rtl_dir = False try: - # charMapTuple: font_type, float(sp_width / 2), encoding, map_dict, font-dictionary + # charMapTuple: font_type, float(sp_width / 2), encoding, + # map_dict, font-dictionary charMapTuple = cmaps[operands[0]] _space_width = charMapTuple[1] - # current cmap: encoding, map_dict, font resource name (internal name, not the real font-name), + # current cmap: encoding, map_dict, font resource name + # (internal name, not the real font-name), # font-dictionary. The font-dictionary describes the font. 
cmap = ( charMapTuple[2], @@ -1550,7 +1592,10 @@ def process_operation(operator: bytes, operands: List) -> None: t = tt.decode( cmap[0], "surrogatepass" ) # apply str encoding - except Exception: # the data does not match the expectation, we use the alternative ; text extraction may not be good + except Exception: + # the data does not match the expectation, + # we use the alternative ; + # text extraction may not be good t = tt.decode( "utf-16-be" if cmap[0] == "charmap" else "charmap", "surrogatepass", @@ -1568,7 +1613,9 @@ def process_operation(operator: bytes, operands: List) -> None: ): xx = ord(x) # fmt: off - if ( # cases where the current inserting order is kept (punctuation,...) + if ( + # cases where the current inserting order is + # kept (punctuation,...) (xx <= 0x2F) # punctuations but... or (0x3A <= xx and xx <= 0x40) # numbers (x30-39) or (0x2000 <= xx and xx <= 0x206F) # upper punctuations.. @@ -1784,9 +1831,11 @@ def extract_text( will change if this function is made more sophisticated. Arabic, Hebrew,... are extracted in the good order. - If required an custom RTL range of characters can be defined; see function set_custom_rtl + If required an custom RTL range of characters can be defined; + see function set_custom_rtl - Additionally you can provide visitor-methods to get informed on all operands and all text-objects. + Additionally you can provide visitor-methods to get informed on all + operands and all text-objects. For example in some PDF files this can be useful to parse tables. Args: @@ -1913,9 +1962,9 @@ def _get_fonts(self) -> Tuple[Set[str], Set[str]]: mediabox = _create_rectangle_accessor(PG.MEDIABOX, ()) """ - A :class:`RectangleObject`, expressed in default user space units, - defining the boundaries of the physical medium on which the page is - intended to be displayed or printed. 
+ A :class:`RectangleObject`, expressed in + default user space units, defining the boundaries of the physical medium on + which the page is intended to be displayed or printed. """ @property @@ -1940,10 +1989,10 @@ def mediaBox(self, value: RectangleObject) -> None: # deprecated cropbox = _create_rectangle_accessor("/CropBox", (PG.MEDIABOX,)) """ - A :class:`RectangleObject`, expressed in default user space units, - defining the visible region of default user space. When the page is - displayed or printed, its contents are to be clipped (cropped) to this - rectangle and then imposed on the output medium in some + A :class:`RectangleObject`, expressed in + default user space units, defining the visible region of default user space. + When the page is displayed or printed, its contents are to be clipped + (cropped) to this rectangle and then imposed on the output medium in some implementation-defined manner. Default value: same as :attr:`mediabox`. """ @@ -1964,9 +2013,9 @@ def cropBox(self, value: RectangleObject) -> None: # deprecated bleedbox = _create_rectangle_accessor("/BleedBox", ("/CropBox", PG.MEDIABOX)) """ - A :class:`RectangleObject`, expressed in default user space units, - defining the region to which the contents of the page should be clipped - when output in a production environment. + A :class:`RectangleObject`, expressed in + default user space units, defining the region to which the contents of the + page should be clipped when output in a production environment. """ @property @@ -1986,8 +2035,9 @@ def bleedBox(self, value: RectangleObject) -> None: # deprecated trimbox = _create_rectangle_accessor("/TrimBox", ("/CropBox", PG.MEDIABOX)) """ - A :class:`RectangleObject`, expressed in default user space units, - defining the intended dimensions of the finished page after trimming. + A :class:`RectangleObject`, expressed in + default user space units, defining the intended dimensions of the finished + page after trimming. 
""" @property @@ -2007,9 +2057,9 @@ def trimBox(self, value: RectangleObject) -> None: # deprecated artbox = _create_rectangle_accessor("/ArtBox", ("/CropBox", PG.MEDIABOX)) """ - A :class:`RectangleObject`, expressed in default user space units, - defining the extent of the page's meaningful content as intended by the - page's creator. + A :class:`RectangleObject`, expressed in + default user space units, defining the extent of the page's meaningful + content as intended by the page's creator. """ @property diff --git a/pypdf/_page_labels.py b/pypdf/_page_labels.py index 7a83c7d11b..e2baa8aeda 100644 --- a/pypdf/_page_labels.py +++ b/pypdf/_page_labels.py @@ -57,11 +57,21 @@ aa to zz for the next 26, and so on) """ -from typing import Iterator +from typing import ( + Iterator, + Optional, + Tuple, +) from ._protocols import PdfReaderProtocol from ._utils import logger_warning +from .generic import ( + ArrayObject, + DictionaryObject, + NumberObject, +) + def number2uppercase_roman_numeral(num: int) -> str: roman = [ @@ -151,6 +161,7 @@ def index2label(reader: PdfReaderProtocol, index: int) -> str: break i += 2 m = { + None: lambda n: "", "/D": lambda n: str(n), "/R": number2uppercase_roman_numeral, "/r": number2lowercase_roman_numeral, @@ -161,7 +172,9 @@ def index2label(reader: PdfReaderProtocol, index: int) -> str: value = reader.get_object(value) if not isinstance(value, dict): return str(index + 1) # Fallback - return m[value["/S"]](index - start_index + 1) + start = value.get("/St", 1) + prefix = value.get("/P", "") + return prefix + m[value.get("/S")](index - start_index + start) if "/Kids" in number_tree or "/Limits" in number_tree: logger_warning( ( @@ -173,3 +186,81 @@ def index2label(reader: PdfReaderProtocol, index: int) -> str: ) # TODO: Implement /Kids and /Limits for number tree return str(index + 1) # Fallback + + +def nums_insert( + key: NumberObject, + value: DictionaryObject, + nums: ArrayObject, +) -> None: + """ + Insert a key, value pair in a 
Nums array. + + See 7.9.7 "Number Trees". + + Args: + key: number key of the entry + value: value of the entry + nums: Nums array to modify + """ + if len(nums) % 2 != 0: + raise ValueError("a nums like array must have an even number of elements") + + i = len(nums) + while i != 0 and key <= nums[i - 2]: + i = i - 2 + + if i < len(nums) and key == nums[i]: + nums[i + 1] = value + else: + nums.insert(i, key) + nums.insert(i + 1, value) + + +def nums_clear_range( + key: NumberObject, + page_index_to: int, + nums: ArrayObject, +) -> None: + """ + Remove all entries in a number tree in a range after an entry. + + See 7.9.7 "Number Trees". + + Args: + key: number key of the entry before the range + page_index_to: The page index of the upper limit of the range + nums: Nums array to modify + """ + if len(nums) % 2 != 0: + raise ValueError("a nums like array must have an even number of elements") + if page_index_to < key: + raise ValueError("page_index_to must be greater or equal than key") + + i = nums.index(key) + 2 + while i < len(nums) and nums[i] <= page_index_to: + nums.pop(i) + nums.pop(i) + + +def nums_next( + key: NumberObject, + nums: ArrayObject, +) -> Tuple[Optional[NumberObject], Optional[DictionaryObject]]: + """ + Return the (key, value) pair of the entry after the given one. + + See 7.9.7 "Number Trees". + + Args: + key: number key of the entry + nums: Nums array + """ + if len(nums) % 2 != 0: + raise ValueError("a nums like array must have an even number of elements") + + i = nums.index(key) + 2 + if i < len(nums): + return (nums[i], nums[i + 1]) + else: + return (None, None) diff --git a/pypdf/_reader.py b/pypdf/_reader.py index 21520ed8f3..ddca4b1a40 100644 --- a/pypdf/_reader.py +++ b/pypdf/_reader.py @@ -118,7 +118,8 @@ def convertToInt(d: bytes, size: int) -> Union[int, Tuple[Any, ...]]: # depreca class DocumentInformation(DictionaryObject): """ A class representing the basic document metadata provided in a PDF File. 
- This class is accessible through :py:class:`PdfReader.metadata`. + This class is accessible through + :py:class:`PdfReader.metadata`. All text properties of the document metadata have *two* properties, eg. author and author_raw. The non-raw property will @@ -1901,11 +1902,11 @@ def _read_xref_subsections( get_entry: Callable[[int], Union[int, Tuple[int, ...]]], used_before: Callable[[int, Union[int, Tuple[int, ...]]], bool], ) -> None: - last_end = 0 + # last_end = 0 for start, size in self._pairs(idx_pairs): # The subsections must increase - assert start >= last_end - last_end = start + size + # assert start >= last_end + # last_end = start + size for num in range(start, start + size): # The first entry is the type xref_type = get_entry(0) diff --git a/pypdf/_utils.py b/pypdf/_utils.py index 4da2663fcd..f68e0c359e 100644 --- a/pypdf/_utils.py +++ b/pypdf/_utils.py @@ -163,31 +163,22 @@ def skip_over_comment(stream: StreamType) -> None: tok = stream.read(1) -def read_until_regex( - stream: StreamType, regex: Pattern[bytes], ignore_eof: bool = False -) -> bytes: +def read_until_regex(stream: StreamType, regex: Pattern[bytes]) -> bytes: """ Read until the regular expression pattern matched (ignore the match). + Treats EOF on the underlying stream as the end of the token to be matched. Args: - ignore_eof: If true, ignore end-of-line and return immediately regex: re.Pattern - ignore_eof: (Default value = False) Returns: The read bytes. - - Raises: - PdfStreamError: on premature end-of-file - """ name = b"" while True: tok = stream.read(16) if not tok: - if ignore_eof: - return name - raise PdfStreamError(STREAM_TRUNCATED_PREMATURELY) + return name m = regex.search(tok) if m is not None: name += tok[: m.start()] @@ -479,8 +470,9 @@ def rename_kwargs( # type: ignore ) if new_term in kwargs: raise TypeError( - f"{func_name} received both {old_term} and {new_term} as an argument. " - f"{old_term} is deprecated. Use {new_term} instead." 
+ f"{func_name} received both {old_term} and {new_term} as " + f"an argument. {old_term} is deprecated. " + f"Use {new_term} instead." ) kwargs[new_term] = kwargs.pop(old_term) warnings.warn( diff --git a/pypdf/_version.py b/pypdf/_version.py index 1da6a55523..88c513ea36 100644 --- a/pypdf/_version.py +++ b/pypdf/_version.py @@ -1 +1 @@ -__version__ = "3.2.1" +__version__ = "3.3.0" diff --git a/pypdf/_writer.py b/pypdf/_writer.py index df318e5169..571df35894 100644 --- a/pypdf/_writer.py +++ b/pypdf/_writer.py @@ -59,6 +59,7 @@ from ._encryption import Encryption from ._page import PageObject, _VirtualList +from ._page_labels import nums_clear_range, nums_insert, nums_next from ._reader import PdfReader from ._security import _alg33, _alg34, _alg35 from ._utils import ( @@ -84,6 +85,7 @@ InteractiveFormDictEntries, ) from .constants import PageAttributes as PG +from .constants import PageLabelStyle from .constants import PagesAttributes as PA from .constants import StreamAttributes as SA from .constants import TrailerKeys as TK @@ -222,12 +224,14 @@ def get_object( if ido is not None: # deprecated if indirect_reference is not None: raise ValueError( - "Please only set 'indirect_reference'. The 'ido' argument is deprecated." + "Please only set 'indirect_reference'. The 'ido' argument " + "is deprecated." 
) else: indirect_reference = ido warnings.warn( - "The parameter 'ido' is depreciated and will be removed in pypdf 4.0.0.", + "The parameter 'ido' is depreciated and will be removed in " + "pypdf 4.0.0.", DeprecationWarning, ) assert ( @@ -258,9 +262,10 @@ def _add_page( page_org = page excluded_keys = list(excluded_keys) excluded_keys += [PA.PARENT, "/StructParents"] - # acrobat does not accept to have two indirect ref pointing on the same page; - # therefore in order to add easily multiple copies of the same page, we need to create a new - # dictionary for the page, however the objects below (including content) is not duplicated + # acrobat does not accept to have two indirect ref pointing on the same + # page; therefore in order to add easily multiple copies of the same + # page, we need to create a new dictionary for the page, however the + # objects below (including content) are not duplicated try: # delete an already existing page del self._id_translated[id(page_org.indirect_reference.pdf)][ # type: ignore page_org.indirect_reference.idnum # type: ignore @@ -581,7 +586,8 @@ def add_js(self, javascript: str) -> None: NameObject("/JS"): TextStringObject(f"{javascript}"), } ) - # We need a name for parameterized javascript in the pdf file, but it can be anything. + # We need a name for parameterized javascript in the pdf file, + # but it can be anything. js_list.append(create_string_object(str(uuid.uuid4()))) js_list.append(self._add_object(js)) @@ -650,7 +656,8 @@ def add_attachment(self, filename: str, data: Union[str, bytes]) -> None: } ) - # Then create the entry for the root, as it needs a reference to the Filespec + # Then create the entry for the root, as it needs + # a reference to the Filespec # Sample: # 1 0 obj # << @@ -692,8 +699,8 @@ def append_pages_from_reader( after_page_append: Optional[Callable[[PageObject], None]] = None, ) -> None: """ - Copy pages from reader to writer.
Includes an optional callback parameter - which is invoked after pages are appended to the writer. + Copy pages from reader to writer. Includes an optional callback + parameter which is invoked after pages are appended to the writer. `append` should be prefered. @@ -979,7 +986,8 @@ def encrypt( if owner_pwd is not None: # deprecated if owner_password is not None: raise ValueError( - "The argument owner_pwd of encrypt is deprecated. Use owner_password only." + "The argument owner_pwd of encrypt is deprecated. " + "Use owner_password only." ) else: old_term = "owner_pwd" @@ -1204,7 +1212,8 @@ def _sweep_indirect_references( discovered.append(str(data)) stack.append((data.get_object(), None, None, [])) - # Check if data has a parent and if it is a dict or an array update the value + # Check if data has a parent and if it is a dict or + # an array update the value if isinstance(parent, (DictionaryObject, ArrayObject)): if isinstance(data, StreamObject): # a dictionary value is a stream. streams must be indirect @@ -1313,7 +1322,8 @@ def get_threads_root(self) -> ArrayObject: See §8.3.2 from PDF 1.7 spec. Returns: - An array (possibly empty) of Dictionaries with ``/F`` and ``/I`` properties. + An array (possibly empty) of Dictionaries with ``/F`` and + ``/I`` properties. """ if CO.THREADS in self._root_object: # TABLE 3.25 Entries in the catalog dictionary @@ -1394,7 +1404,8 @@ def add_outline_item_destination( ) -> IndirectObject: if page_destination is not None and dest is not None: # deprecated raise ValueError( - "The argument dest of add_outline_item_destination is deprecated. Use page_destination only." + "The argument dest of add_outline_item_destination is " + "deprecated. Use page_destination only." ) if dest is not None: # deprecated old_term = "dest" @@ -1513,7 +1524,7 @@ def add_outline_item( pagenum: Optional[int] = None, # deprecated ) -> IndirectObject: """ - Add an outline item (commonly referred to as a "Bookmark") to this PDF file. 
+ Add an outline item (commonly referred to as a "Bookmark") to the PDF file. Args: title: Title to use for this outline item. @@ -1540,7 +1551,8 @@ def add_outline_item( ) if page_number is not None and pagenum is not None: raise ValueError( - "The argument pagenum of add_outline_item is deprecated. Use page_number only." + "The argument pagenum of add_outline_item is deprecated. " + "Use page_number only." ) if page_number is None: action_ref = None @@ -1662,7 +1674,8 @@ def add_named_destination_object( ) -> IndirectObject: if page_destination is not None and dest is not None: raise ValueError( - "The argument dest of add_named_destination_object is deprecated. Use page_destination only." + "The argument dest of add_named_destination_object is " + "deprecated. Use page_destination only." ) if dest is not None: # deprecated old_term = "dest" @@ -1706,7 +1719,8 @@ def add_named_destination( ) -> IndirectObject: if page_number is not None and pagenum is not None: raise ValueError( - "The argument pagenum of add_outline_item is deprecated. Use page_number only." + "The argument pagenum of add_outline_item is deprecated. " + "Use page_number only." ) if pagenum is not None: old_term = "pagenum" @@ -1862,7 +1876,7 @@ def remove_text(self, ignore_byte_string_object: bool = False) -> None: Remove text from this output. Args: - ignore_byte_string_object: optional parameter to ignore ByteString Objects. + ignore_byte_string_object: optional parameter """ pg_dict = cast(DictionaryObject, self.get_object(self._pages)) pages = cast(List[IndirectObject], pg_dict[PA.KIDS]) @@ -1925,8 +1939,8 @@ def add_uri( Args: page_number: index of the page on which to place the URI action. uri: URI of resource to link to. 
- rect: :class:`RectangleObject` or array of four - integers specifying the clickable rectangular area + rect: :class:`RectangleObject` or + array of four integers specifying the clickable rectangular area ``[xLL, yLL, xUR, yUR]``, or string in the form ``"[ xLL yLL xUR yUR ]"``. border: if provided, an array describing border-drawing properties. See the PDF spec for details. No border will be @@ -2323,7 +2337,8 @@ def add_annotation(self, page_number: int, annotation: Dict[str, Any]) -> None: def clean_page(self, page: Union[PageObject, IndirectObject]) -> PageObject: """ Perform some clean up in the page. - Currently: convert NameObject nameddestination to TextStringObject (required for names/dests list) + Currently: convert NameObject nameddestination to TextStringObject + (required for names/dests list) Args: page: @@ -2874,6 +2889,118 @@ def reset_translation( else: raise Exception("invalid parameter {reader}") + def set_page_label( + self, + page_index_from: int, + page_index_to: int, + style: Optional[PageLabelStyle] = None, + prefix: Optional[str] = None, + start: Optional[int] = 0, + ) -> None: + """ + Set a page label to a range of pages. + + Page indexes must be given starting from 0. + Labels must have a style, a prefix or both. + If a range is not assigned any page label, a decimal label starting from 1 is applied. + + Args: + page_index_from: page index of the beginning of the range starting from 0 + page_index_to: page index of the end of the range starting from 0 + style: The numbering style to be used for the numeric portion of each page label: + '/D' Decimal arabic numerals + '/R' Uppercase roman numerals + '/r' Lowercase roman numerals + '/A' Uppercase letters (A to Z for the first 26 pages, + AA to ZZ for the next 26, and so on) + '/a' Lowercase letters (a to z for the first 26 pages, + aa to zz for the next 26, and so on) + prefix: The label prefix for page labels in this range.
+ start: The value of the numeric portion for the first page label + in the range. + Subsequent pages are numbered sequentially from this value, + which must be greater than or equal to 1. + Default value: 1. + """ + if style is None and prefix is None: + raise ValueError("at least one between style and prefix must be given") + if page_index_from < 0: + raise ValueError("page_index_from must be equal or greater then 0") + if page_index_to < page_index_from: + raise ValueError( + "page_index_to must be equal or greater then page_index_from" + ) + if page_index_to >= len(self.pages): + raise ValueError("page_index_to exceeds number of pages") + if start is not None and start != 0 and start < 1: + raise ValueError("if given, start must be equal or greater than one") + + self._set_page_label(page_index_from, page_index_to, style, prefix, start) + + def _set_page_label( + self, + page_index_from: int, + page_index_to: int, + style: Optional[PageLabelStyle] = None, + prefix: Optional[str] = None, + start: Optional[int] = 0, + ) -> None: + """ + Set a page label to a range of pages. + Page indexes must be given starting from 0. + Labels must have a style, a prefix or both. + If a range is not assigned any page label, a decimal label starting + from 1 is applied. + + Args: + page_index_from: page index of the beginning of the range starting from 0 + page_index_to: page index of the end of the range starting from 0 + style: The numbering style to be used for the numeric portion of each page label: + /D Decimal arabic numerals + /R Uppercase roman numerals + /r Lowercase roman numerals + /A Uppercase letters (A to Z for the first 26 pages, + AA to ZZ for the next 26, and so on) + /a Lowercase letters (a to z for the first 26 pages, + aa to zz for the next 26, and so on) + prefix: The label prefix for page labels in this range. + start: The value of the numeric portion for the first page label + in the range.
+ Subsequent pages are numbered sequentially from this value, + which must be greater than or equal to 1. Default value: 1. + """ + default_page_label = DictionaryObject() + default_page_label[NameObject("/S")] = NameObject("/D") + + new_page_label = DictionaryObject() + if style is not None: + new_page_label[NameObject("/S")] = NameObject(style) + if prefix is not None: + new_page_label[NameObject("/P")] = TextStringObject(prefix) + if start != 0: + new_page_label[NameObject("/St")] = NumberObject(start) + + if NameObject(CatalogDictionary.PAGE_LABELS) not in self._root_object: + nums = ArrayObject() + nums_insert(NumberObject(0), default_page_label, nums) + page_labels = TreeObject() + page_labels[NameObject("/Nums")] = nums + self._root_object[NameObject(CatalogDictionary.PAGE_LABELS)] = page_labels + + page_labels = cast( + TreeObject, self._root_object[NameObject(CatalogDictionary.PAGE_LABELS)] + ) + nums = cast(ArrayObject, page_labels[NameObject("/Nums")]) + + nums_insert(NumberObject(page_index_from), new_page_label, nums) + nums_clear_range(NumberObject(page_index_from), page_index_to, nums) + next_label_pos, *_ = nums_next(NumberObject(page_index_from), nums) + if next_label_pos != page_index_to + 1 and page_index_to + 1 < len(self.pages): + nums_insert(NumberObject(page_index_to + 1), default_page_label, nums) + + page_labels[NameObject("/Nums")] = nums + self._root_object[NameObject(CatalogDictionary.PAGE_LABELS)] = page_labels + def _pdf_objectify(obj: Union[Dict[str, Any], str, int, List[Any]]) -> PdfObject: if isinstance(obj, PdfObject): diff --git a/pypdf/constants.py b/pypdf/constants.py index 10b5886fb5..cd8cc1ecf4 100644 --- a/pypdf/constants.py +++ b/pypdf/constants.py @@ -430,6 +430,16 @@ class OutlineFontFlag(IntFlag): bold = 2 +class PageLabelStyle: + """Table 8.10 in the 1.7 reference.""" + + DECIMAL = "/D" # Decimal arabics + LOWERCASE_ROMAN = "/r" # Lowercase roman numbers + UPPERCASE_ROMAN = "/R" # Uppercase roman numbers + 
LOWERCASE_LETTER = "/a" # Lowercase letters + UPPERCASE_LETTER = "/A" # Uppercase letters + + PDF_KEYS = ( AnnotationDictionaryAttributes, CatalogAttributes, diff --git a/pypdf/errors.py b/pypdf/errors.py index ad45946284..25fc1f7de4 100644 --- a/pypdf/errors.py +++ b/pypdf/errors.py @@ -8,67 +8,55 @@ class DeprecationError(Exception): """Raised when a deprecated feature is used.""" - pass - class DependencyError(Exception): - """Raised when a required dependency (a library or module that PyPDF depends on) is not available or cannot be imported.""" - - pass + """ + Raised when a required dependency (a library or module that PyPDF depends on) + is not available or cannot be imported. + """ class PyPdfError(Exception): """Base class for all exceptions raised by PyPDF.""" - pass - class PdfReadError(PyPdfError): """Raised when there is an issue reading a PDF file.""" - pass - class PageSizeNotDefinedError(PyPdfError): """Raised when the page size of a PDF document is not defined.""" - pass - class PdfReadWarning(UserWarning): """Issued when there is a potential issue reading a PDF file, but it can still be read.""" - pass - class PdfStreamError(PdfReadError): """Raised when there is an issue reading the stream of data in a PDF file.""" - pass - class ParseError(Exception): - """Raised when there is an issue parsing (analyzing and understanding the structure and meaning of) a PDF file.""" - - pass + """ + Raised when there is an issue parsing (analyzing and understanding the + structure and meaning of) a PDF file. + """ class FileNotDecryptedError(PdfReadError): - """Raised when a PDF file that has been encrypted (meaning it requires a password to be accessed) has not been successfully decrypted.""" - - pass + """ + Raised when a PDF file that has been encrypted + (meaning it requires a password to be accessed) has not been successfully + decrypted. 
+ """ class WrongPasswordError(FileNotDecryptedError): """Raised when the wrong password is used to try to decrypt an encrypted PDF file.""" - pass - class EmptyFileError(PdfReadError): """Raised when a PDF file is empty or has no content.""" - pass - STREAM_TRUNCATED_PREMATURELY = "Stream has ended unexpectedly" diff --git a/pypdf/generic/_annotations.py b/pypdf/generic/_annotations.py index c2f43af90e..26dfd63daa 100644 --- a/pypdf/generic/_annotations.py +++ b/pypdf/generic/_annotations.py @@ -1,4 +1,4 @@ -from typing import Optional, Tuple, Union +from typing import List, Optional, Tuple, Union from ._base import ( BooleanObject, @@ -37,9 +37,9 @@ def text( Add text annotation. Args: - rect: - or array of four integers specifying the clickable rectangular area - ``[xLL, yLL, xUR, yUR]`` + rect: array of four integers ``[xLL, yLL, xUR, yUR]`` + specifying the clickable rectangular area + text: The text that is added to the document open: flags: @@ -76,15 +76,15 @@ def free_text( Args: text: Text to be added - rect: or array of four integers - specifying the clickable rectangular area ``[xLL, yLL, xUR, yUR]`` + rect: array of four integers ``[xLL, yLL, xUR, yUR]`` + specifying the clickable rectangular area font: Name of the Font, e.g. 'Helvetica' bold: Print the text in bold italic: Print the text in italic font_size: How big the text will be, e.g. '14pt' - font_color: Hex-string for the color - border_color: Hex-string for the border color - background_color: Hex-string for the background of the annotation + font_color: Hex-string for the color, e.g. cdcdcd + border_color: Hex-string for the border color, e.g. cdcdcd + background_color: Hex-string for the background of the annotation, e.g. cdcdcd Returns: A dictionary object representing the annotation. 
@@ -135,9 +135,8 @@ def line( Args: p1: First point p2: Second point - rect: or array of four - integers specifying the clickable rectangular area - ``[xLL, yLL, xUR, yUR]`` + rect: array of four integers ``[xLL, yLL, xUR, yUR]`` + specifying the clickable rectangular area text: Text to be displayed as the line annotation title_bar: Text to be displayed in the title bar of the annotation; by convention this is the name of the author @@ -185,12 +184,13 @@ def rectangle( """ Draw a rectangle on the PDF. + This method uses the /Square annotation type of the PDF format. + Args: - rect: or array of four - integers specifying the clickable rectangular area - ``[xLL, yLL, xUR, yUR]`` - rect: - interiour_color: + rect: array of four integers ``[xLL, yLL, xUR, yUR]`` + specifying the clickable rectangular area + interiour_color: None or hex-string for the color, e.g. cdcdcd + If None is used, the interiour is transparent. Returns: A dictionary object representing the annotation. @@ -210,6 +210,67 @@ def rectangle( return square_obj + @staticmethod + def ellipse( + rect: Union[RectangleObject, Tuple[float, float, float, float]], + interiour_color: Optional[str] = None, + ) -> DictionaryObject: + """ + Draw a rectangle on the PDF. + + This method uses the /Circle annotation type of the PDF format. + + Args: + rect: array of four integers ``[xLL, yLL, xUR, yUR]`` specifying + the bounding box of the ellipse + interiour_color: None or hex-string for the color, e.g. cdcdcd + If None is used, the interiour is transparent. + + Returns: + A dictionary object representing the annotation. 
+ """ + ellipse_obj = DictionaryObject( + { + NameObject("/Type"): NameObject("/Annot"), + NameObject("/Subtype"): NameObject("/Circle"), + NameObject("/Rect"): RectangleObject(rect), + } + ) + + if interiour_color: + ellipse_obj[NameObject("/IC")] = ArrayObject( + [FloatObject(n) for n in hex_to_rgb(interiour_color)] + ) + + return ellipse_obj + + @staticmethod + def polygon(vertices: List[Tuple[float, float]]) -> DictionaryObject: + if len(vertices) == 0: + raise ValueError("A polygon needs at least 1 vertex with two coordinates") + x_min, y_min = vertices[0][0], vertices[0][1] + x_max, y_max = vertices[0][0], vertices[0][1] + for x, y in vertices: + x_min = min(x_min, x) + y_min = min(y_min, y) + x_max = min(x_max, x) + y_max = min(y_max, y) + rect = RectangleObject((x_min, y_min, x_max, y_max)) + coord_list = [] + for x, y in vertices: + coord_list.append(NumberObject(x)) + coord_list.append(NumberObject(y)) + obj = DictionaryObject( + { + NameObject("/Type"): NameObject("/Annot"), + NameObject("/Subtype"): NameObject("/Polygon"), + NameObject("/Vertices"): ArrayObject(coord_list), + NameObject("/IT"): NameObject("PolygonCloud"), + NameObject("/Rect"): RectangleObject(rect), + } + ) + return obj + @staticmethod def link( rect: Union[RectangleObject, Tuple[float, float, float, float]], @@ -227,9 +288,8 @@ def link( An internal link requires the target_page_index, fit, and fit args. Args: - rect: or array of four - integers specifying the clickable rectangular area - ``[xLL, yLL, xUR, yUR]`` + rect: array of four integers ``[xLL, yLL, xUR, yUR]`` + specifying the clickable rectangular area border: if provided, an array describing border-drawing properties. See the PDF spec for details. No border will be drawn if this argument is omitted. @@ -255,7 +315,8 @@ def link( ) if is_external and is_internal: raise ValueError( - f"Either 'url' or 'target_page_index' have to be provided. 
url={url}, target_page_index={target_page_index}" + "Either 'url' or 'target_page_index' have to be provided. " + f"url={url}, target_page_index={target_page_index}" ) border_arr: BorderArrayType diff --git a/pypdf/generic/_base.py b/pypdf/generic/_base.py index b1adcc557e..a84ce1841e 100644 --- a/pypdf/generic/_base.py +++ b/pypdf/generic/_base.py @@ -77,8 +77,10 @@ def clone( """ clone object into pdf_dest (PdfWriterProtocol which is an interface for PdfWriter) force_duplicate: in standard if the object has been already cloned and reference, - the copy is returned; when force_duplicate == True, a new copy is always performed - ignore_fields : list/tuple of Fields names (for dictionaries that will be ignored during cloning (apply also to childs duplication) + the copy is returned; when force_duplicate == True, + a new copy is always performed + ignore_fields : list/tuple of Fields names (for dictionaries that will + be ignored during cloning (apply also to childs duplication) in standard, clone function call _reference_clone (see _reference) Args: @@ -620,7 +622,7 @@ def read_from_stream(stream: StreamType, pdf: Any) -> "NameObject": # PdfReader name = stream.read(1) if name != NameObject.surfix: raise PdfReadError("name read error") - name += read_until_regex(stream, NameObject.delimiter_pattern, ignore_eof=True) + name += read_until_regex(stream, NameObject.delimiter_pattern) try: # Name objects should represent irregular characters # with a '#' followed by the symbol's hex number diff --git a/pypdf/generic/_data_structures.py b/pypdf/generic/_data_structures.py index 2e472f51c1..3df0ae14bd 100644 --- a/pypdf/generic/_data_structures.py +++ b/pypdf/generic/_data_structures.py @@ -346,10 +346,10 @@ def read_from_stream( def get_next_obj_pos( p: int, p1: int, rem_gens: List[int], pdf: Any ) -> int: # PdfReader - l = pdf.xref[rem_gens[0]] - for o in l: - if p1 > l[o] and p < l[o]: - p1 = l[o] + loc = pdf.xref[rem_gens[0]] + for o in loc: + if p1 > loc[o] and p < 
loc[o]: + p1 = loc[o] if len(rem_gens) == 1: return p1 else: @@ -969,7 +969,7 @@ def __parse_content_stream(self, stream: StreamType) -> None: break stream.seek(-1, 1) if peek.isalpha() or peek in (b"'", b'"'): - operator = read_until_regex(stream, NameObject.delimiter_pattern, True) + operator = read_until_regex(stream, NameObject.delimiter_pattern) if operator == b"BI": # begin inline image - a completely different parsing # mechanism is required, of course... thanks buddy... @@ -1016,7 +1016,9 @@ def _read_inline_image(self, stream: StreamType) -> Dict[str, Any]: # We have reached the end of the stream, but haven't found the EI operator. if not buf: raise PdfReadError("Unexpected end of stream") - loc = buf.find(b"E") + loc = buf.find( + b"E" + ) # we can not look straight for "EI" because it may not have been loaded in the buffer if loc == -1: data.write(buf) @@ -1026,28 +1028,44 @@ def _read_inline_image(self, stream: StreamType) -> Dict[str, Any]: # Seek back in the stream to read the E next. stream.seek(loc - len(buf), 1) - tok = stream.read(1) + tok = stream.read(1) # E of "EI" # Check for End Image - tok2 = stream.read(1) - if tok2 == b"I" and buf[loc - 1 : loc] in WHITESPACES: - # Data can contain [\s]EI, so check for the separator \s; 4 chars suffisent Q operator not required. - tok3 = stream.read(1) - info = tok + tok2 - # We need to find at least one whitespace after. - has_q_whitespace = False + tok2 = stream.read(1) # I of "EI" + if tok2 != b"I": + stream.seek(-1, 1) + data.write(tok) + continue + # for further debug : print("!!!!",buf[loc-1:loc+10]) + info = tok + tok2 + tok3 = stream.read( + 1 + ) # possible space after "EI" may not been loaded in buf + if tok3 not in WHITESPACES: + stream.seek(-2, 1) # to step back on I + data.write(tok) + elif buf[loc - 1 : loc] in WHITESPACES: # and tok3 in WHITESPACES: + # Data can contain [\s]EI[\s]: 4 chars sufficient, checking Q operator not required. + while tok3 in WHITESPACES: + # needed ???? 
: info += tok3 + tok3 = stream.read(1) + stream.seek(-1, 1) + # we do not insert EI + break + else: # buf[loc - 1 : loc] not in WHITESPACES and tok3 in WHITESPACES: + # Data can contain [!\s]EI[\s], so check for Q or EMC operator is required to have 4 chars. while tok3 in WHITESPACES: - has_q_whitespace = True info += tok3 tok3 = stream.read(1) - if has_q_whitespace: - stream.seek(-1, 1) + stream.seek(-1, 1) + if tok3 == b"Q": break + elif tok3 == b"E": + ope = stream.read(3) + stream.seek(-3, 1) + if ope == b"EMC": + break else: - stream.seek(-1, 1) data.write(info) - else: - stream.seek(-1, 1) - data.write(tok) return {"settings": settings, "data": data.getvalue()} @property diff --git a/pypdf/generic/_utils.py b/pypdf/generic/_utils.py index ed1fd1f5d5..1643422ab8 100644 --- a/pypdf/generic/_utils.py +++ b/pypdf/generic/_utils.py @@ -150,10 +150,11 @@ def create_string_object( retval.autodetect_utf16 = True return retval else: - # This is probably a big performance hit here, but we need to - # convert string objects into the text/unicode-aware version if - # possible... and the only way to check if that's possible is - # to try. Some strings are strings, some are just byte arrays. + # This is probably a big performance hit here, but we need + # to convert string objects into the text/unicode-aware + # version if possible... and the only way to check if that's + # possible is to try. + # Some strings are strings, some are just byte arrays. 
retval = TextStringObject(decode_pdfdocencoding(string)) retval.autodetect_pdfdocencoding = True return retval diff --git a/pyproject.toml b/pyproject.toml index b380e481ee..8a12f7e453 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -97,3 +97,6 @@ exclude_lines = [ "if 0:", "if __name__ == .__main__.:", ] + +[tool.ruff] +line-length = 120 diff --git a/requirements/docs.in b/requirements/docs.in index 641b64b65c..6fe145949f 100644 --- a/requirements/docs.in +++ b/requirements/docs.in @@ -2,3 +2,4 @@ sphinx sphinx_rtd_theme myst_parser==0.16.1 -e . +attrs # required for myst, but not automatically installed by myst diff --git a/requirements/docs.txt b/requirements/docs.txt index 4a44968633..cb44bcd2bb 100644 --- a/requirements/docs.txt +++ b/requirements/docs.txt @@ -6,13 +6,15 @@ # -e . # via -r requirements/docs.in -alabaster==0.7.12 +alabaster==0.7.13 # via sphinx +attrs==22.2.0 + # via -r requirements/docs.in babel==2.11.0 # via sphinx certifi==2022.12.7 # via requests -charset-normalizer==2.1.1 +charset-normalizer==3.0.1 # via requests docutils==0.17.1 # via @@ -33,7 +35,7 @@ markdown-it-py==2.1.0 # via # mdit-py-plugins # myst-parser -markupsafe==2.1.1 +markupsafe==2.1.2 # via jinja2 mdit-py-plugins==0.3.3 # via myst-parser @@ -41,19 +43,19 @@ mdurl==0.1.2 # via markdown-it-py myst-parser==0.16.1 # via -r requirements/docs.in -packaging==22.0 +packaging==23.0 # via sphinx pygments==2.14.0 # via sphinx -pytz==2022.7 +pytz==2022.7.1 # via babel pyyaml==6.0 # via myst-parser -requests==2.28.1 +requests==2.28.2 # via sphinx snowballstemmer==2.2.0 # via sphinx -sphinx==5.3.0 +sphinx==4.5.0 # via # -r requirements/docs.in # myst-parser @@ -76,9 +78,8 @@ typing-extensions==4.4.0 # via # importlib-metadata # markdown-it-py - # myst-parser # pypdf -urllib3==1.26.13 +urllib3==1.26.14 # via requests zipp==3.11.0 # via importlib-metadata diff --git a/tests/test_cmap.py b/tests/test_cmap.py index 17740e167a..38f745413e 100644 --- a/tests/test_cmap.py +++ 
b/tests/test_cmap.py @@ -3,6 +3,7 @@ import pytest from pypdf import PdfReader +from pypdf._cmap import build_char_map from pypdf.errors import PdfReadWarning from . import get_pdf_from_url @@ -57,12 +58,18 @@ def test_get_font_width_from_default(): # L40 @pytest.mark.external def test_multiline_bfrange(): # non regression test for iss_1285 - url = "https://github.com/alexanderquispe/1REI05/raw/main/reports/report_1/The%20lean%20times%20in%20the%20Peruvian%20economy.pdf" + url = ( + "https://github.com/alexanderquispe/1REI05/raw/main/reports/report_1/" + "The%20lean%20times%20in%20the%20Peruvian%20economy.pdf" + ) name = "tika-908104.pdf" reader = PdfReader(BytesIO(get_pdf_from_url(url, name=name))) for page in reader.pages: page.extract_text() - url = "https://github.com/yxj-HGNwmb5kdp8ewr/yxj-HGNwmb5kdp8ewr.github.io/raw/master/files/Giacalone%20Llobell%20Jaeger%20(2022)%20Food%20Qual%20Prefer.pdf" + url = ( + "https://github.com/yxj-HGNwmb5kdp8ewr/yxj-HGNwmb5kdp8ewr.github.io/raw/master/files/" + "Giacalone%20Llobell%20Jaeger%20(2022)%20Food%20Qual%20Prefer.pdf" + ) name = "Giacalone.pdf" reader = PdfReader(BytesIO(get_pdf_from_url(url, name=name))) for page in reader.pages: @@ -72,7 +79,10 @@ def test_multiline_bfrange(): @pytest.mark.external def test_bfchar_on_2_chars(): # iss #1293 - url = "https://github.com/xyegithub/myBlog/raw/main/posts/c94b2364/paper_pdfs/ImageClassification/2007%2CASurveyofImageClassificationBasedTechniques.pdf" + url = ( + "https://github.com/xyegithub/myBlog/raw/main/posts/c94b2364/paper_pdfs/ImageClassification/" + "2007%2CASurveyofImageClassificationBasedTechniques.pdf" + ) name = "ASurveyofImageClassificationBasedTechniques.pdf" reader = PdfReader(BytesIO(get_pdf_from_url(url, name=name))) for page in reader.pages: @@ -102,3 +112,12 @@ def test_iss1379(): name = "02voc.pdf" reader = PdfReader(BytesIO(get_pdf_from_url(url, name=name))) reader.pages[2].extract_text() + + +@pytest.mark.external +def test_iss1533(): + url = 
"https://github.com/py-pdf/pypdf/files/10376149/iss1533.pdf" + name = "iss1533.pdf" + reader = PdfReader(BytesIO(get_pdf_from_url(url, name=name))) + reader.pages[0].extract_text() # no error + assert build_char_map("/F", 200, reader.pages[0])[3]["\x01"] == "Ü" diff --git a/tests/test_encryption.py b/tests/test_encryption.py index dfe14c760b..6400a7adc5 100644 --- a/tests/test_encryption.py +++ b/tests/test_encryption.py @@ -24,33 +24,47 @@ [ # unencrypted pdf ("unencrypted.pdf", False), - # created by `qpdf --encrypt "" "" 40 -- unencrypted.pdf r2-empty-password.pdf`: + # created by: + # qpdf --encrypt "" "" 40 -- unencrypted.pdf r2-empty-password.pdf ("r2-empty-password.pdf", False), - # created by `qpdf --encrypt "" "" 128 -- unencrypted.pdf r3-empty-password.pdf`: + # created by: + # qpdf --encrypt "" "" 128 -- unencrypted.pdf r3-empty-password.pdf ("r3-empty-password.pdf", False), - # created by `qpdf --encrypt "asdfzxcv" "" 40 -- unencrypted.pdf r2-user-password.pdf`: + # created by: + # qpdf --encrypt "asdfzxcv" "" 40 -- unencrypted.pdf r2-user-password.pdf ("r2-user-password.pdf", False), - # created by `qpdf --encrypt "" "asdfzxcv" 40 -- unencrypted.pdf r2-user-password.pdf`: + # created by: + # qpdf --encrypt "" "asdfzxcv" 40 -- unencrypted.pdf r2-user-password.pdf ("r2-owner-password.pdf", False), - # created by `qpdf --encrypt "asdfzxcv" "" 128 -- unencrypted.pdf r3-user-password.pdf`: + # created by: + # qpdf --encrypt "asdfzxcv" "" 128 -- unencrypted.pdf r3-user-password.pdf ("r3-user-password.pdf", False), - # created by `qpdf --encrypt "asdfzxcv" "" 128 --force-V4 -- unencrypted.pdf r4-user-password.pdf`: + # created by: + # qpdf --encrypt "asdfzxcv" "" 128 --force-V4 -- unencrypted.pdf r4-user-password.pdf ("r4-user-password.pdf", False), - # created by `qpdf --encrypt "" "asdfzxcv" 128 --force-V4 -- unencrypted.pdf r4-owner-password.pdf`: + # created by: + # qpdf --encrypt "" "asdfzxcv" 128 --force-V4 -- unencrypted.pdf r4-owner-password.pdf 
("r4-owner-password.pdf", False), - # created by `qpdf --encrypt "asdfzxcv" "" 128 --use-aes=y -- unencrypted.pdf r4-aes-user-password.pdf`: + # created by: + # qpdf --encrypt "asdfzxcv" "" 128 --use-aes=y -- unencrypted.pdf r4-aes-user-password.pdf ("r4-aes-user-password.pdf", True), - # # created by `qpdf --encrypt "" "" 256 --force-R5 -- unencrypted.pdf r5-empty-password.pdf`: + # created by: + # qpdf --encrypt "" "" 256 --force-R5 -- unencrypted.pdf r5-empty-password.pdf ("r5-empty-password.pdf", True), - # # created by `qpdf --encrypt "asdfzxcv" "" 256 --force-R5 -- unencrypted.pdf r5-user-password.pdf`: + # created by: + # qpdf --encrypt "asdfzxcv" "" 256 --force-R5 -- unencrypted.pdf r5-user-password.pdf ("r5-user-password.pdf", True), - # # created by `qpdf --encrypt "" "asdfzxcv" 256 --force-R5 -- unencrypted.pdf r5-owner-password.pdf`: + # created by: + # qpdf --encrypt "" "asdfzxcv" 256 --force-R5 -- unencrypted.pdf r5-owner-password.pdf ("r5-owner-password.pdf", True), - # created by `qpdf --encrypt "" "" 256 -- unencrypted.pdf r6-empty-password.pdf`: + # created by: + # qpdf --encrypt "" "" 256 -- unencrypted.pdf r6-empty-password.pdf ("r6-empty-password.pdf", True), - # created by `qpdf --encrypt "asdfzxcv" "" 256 -- unencrypted.pdf r6-user-password.pdf`: + # created by: + # qpdf --encrypt "asdfzxcv" "" 256 -- unencrypted.pdf r6-user-password.pdf ("r6-user-password.pdf", True), - # created by `qpdf --encrypt "" "asdfzxcv" 256 -- unencrypted.pdf r6-owner-password.pdf`: + # created by: + # qpdf --encrypt "" "asdfzxcv" 256 -- unencrypted.pdf r6-owner-password.pdf ("r6-owner-password.pdf", True), ], ) @@ -87,7 +101,8 @@ def test_encryption(name, requres_pycryptodome): @pytest.mark.parametrize( ("name", "user_passwd", "owner_passwd"), [ - # created by `qpdf --encrypt "foo" "bar" 256 -- unencrypted.pdf r6-both-passwords.pdf` + # created by + # qpdf --encrypt "foo" "bar" 256 -- unencrypted.pdf r6-both-passwords.pdf ("r6-both-passwords.pdf", "foo", "bar"), ], 
) diff --git a/tests/test_filters.py b/tests/test_filters.py index a514695521..c43b2aa4c3 100644 --- a/tests/test_filters.py +++ b/tests/test_filters.py @@ -244,7 +244,7 @@ def test_image_without_imagemagic(): for page in reader.pages: with pytest.raises(ImportError) as exc: page.images - assert ( - exc.value.args[0] - == "pillow is required to do image extraction. It can be installed via 'pip install pypdf[image]'" + assert exc.value.args[0] == ( + "pillow is required to do image extraction. " + "It can be installed via 'pip install pypdf[image]'" ) diff --git a/tests/test_generic.py b/tests/test_generic.py index a59cd3d3e3..acdb3408e8 100644 --- a/tests/test_generic.py +++ b/tests/test_generic.py @@ -675,7 +675,10 @@ def test_bool_repr(tmp_path): @pytest.mark.external @patch("pypdf._reader.logger_warning") def test_issue_997(mock_logger_warning): - url = "https://github.com/py-pdf/pypdf/files/8908874/Exhibit_A-2_930_Enterprise_Zone_Tax_Credits_final.pdf" + url = ( + "https://github.com/py-pdf/pypdf/files/8908874/" + "Exhibit_A-2_930_Enterprise_Zone_Tax_Credits_final.pdf" + ) name = "gh-issue-997.pdf" merger = PdfMerger() @@ -746,6 +749,34 @@ def test_annotation_builder_free_text(): os.remove(target) # comment this out for manual inspection +def test_annotation_builder_polygon(): + # Arrange + pdf_path = RESOURCE_ROOT / "crazyones.pdf" + reader = PdfReader(pdf_path) + page = reader.pages[0] + writer = PdfWriter() + writer.add_page(page) + + # Act + with pytest.raises(ValueError) as exc: + AnnotationBuilder.polygon( + vertices=[], + ) + assert exc.value.args[0] == "A polygon needs at least 1 vertex with two coordinates" + + annotation = AnnotationBuilder.polygon( + vertices=[(50, 550), (200, 650), (70, 750), (50, 700)], + ) + writer.add_annotation(0, annotation) + + # Assert: You need to inspect the file manually + target = "annotated-pdf.pdf" + with open(target, "wb") as fp: + writer.write(fp) + + os.remove(target) # comment this out for manual inspection + + def 
test_annotation_builder_line(): # Arrange pdf_path = RESOURCE_ROOT / "crazyones.pdf" @@ -798,6 +829,34 @@ def test_annotation_builder_square(): os.remove(target) # comment this out for manual inspection +def test_annotation_builder_circle(): + # Arrange + pdf_path = RESOURCE_ROOT / "crazyones.pdf" + reader = PdfReader(pdf_path) + page = reader.pages[0] + writer = PdfWriter() + writer.add_page(page) + + # Act + circle_annotation = AnnotationBuilder.ellipse( + rect=(50, 550, 200, 650), interiour_color="ff0000" + ) + writer.add_annotation(0, circle_annotation) + + diameter = 100 + circle_annotation = AnnotationBuilder.ellipse( + rect=(110, 500, 110 + diameter, 500 + diameter), + ) + writer.add_annotation(0, circle_annotation) + + # Assert: You need to inspect the file manually + target = "annotated-pdf-circle.pdf" + with open(target, "wb") as fp: + writer.write(fp) + + os.remove(target) # comment this out for manual inspection + + def test_annotation_builder_link(): # Arrange pdf_path = RESOURCE_ROOT / "outline-without-title.pdf" @@ -814,9 +873,9 @@ def test_annotation_builder_link(): url="https://martin-thoma.com/", target_page_index=3, ) - assert ( - exc.value.args[0] - == "Either 'url' or 'target_page_index' have to be provided. url=https://martin-thoma.com/, target_page_index=3" + assert exc.value.args[0] == ( + "Either 'url' or 'target_page_index' have to be provided. 
" + "url=https://martin-thoma.com/, target_page_index=3" ) # Part 2: Too few args diff --git a/tests/test_merger.py b/tests/test_merger.py index 2dd43a214c..ee0453c56e 100644 --- a/tests/test_merger.py +++ b/tests/test_merger.py @@ -56,9 +56,9 @@ def merger_operate(merger): with open(pdf_path, "rb") as fh: merger.append(fh) - merger.write( - BytesIO() - ) # to force to build outlines and ensur the add_outline_item is at end of the list + # to force to build outlines and ensure the add_outline_item is + # at end of the list + merger.write(BytesIO()) outline_item = merger.add_outline_item("An outline item", 0) oi2 = merger.add_outline_item( "deeper", 0, parent=outline_item, italic=True, bold=True @@ -670,7 +670,10 @@ def test_deprecation_bookmark_decorator_deprecationexcp(): merger = PdfMerger() with pytest.raises( DeprecationError, - match="import_bookmarks is deprecated as an argument. Use import_outline instead", + match=( + "import_bookmarks is deprecated as an argument. " + "Use import_outline instead" + ), ): merger.merge(0, reader, import_bookmarks=True) @@ -680,7 +683,10 @@ def test_deprecation_bookmark_decorator_deprecationexcp_with_writer(): merger = PdfWriter() with pytest.raises( DeprecationError, - match="import_bookmarks is deprecated as an argument. Use import_outline instead", + match=( + "import_bookmarks is deprecated as an argument. 
" + "Use import_outline instead" + ), ): merger.merge(0, reader, import_bookmarks=True) diff --git a/tests/test_page.py b/tests/test_page.py index b58d42ceb5..1914636a2e 100644 --- a/tests/test_page.py +++ b/tests/test_page.py @@ -1,5 +1,6 @@ import json import os +import random from copy import deepcopy from io import BytesIO from pathlib import Path @@ -13,6 +14,7 @@ from pypdf.errors import DeprecationError, PdfReadWarning from pypdf.generic import ( ArrayObject, + ContentStream, DictionaryObject, FloatObject, IndirectObject, @@ -570,7 +572,8 @@ def ignore_large_rectangles(r): page_lrs_model, rect_filter=ignore_large_rectangles ) - # We see ten rectangles (5 tabs, 5 boxes) but there are 64 rectangles (including some invisible ones). + # We see ten rectangles (5 tabs, 5 boxes) but there are 64 rectangles + # (including some invisible ones). assert len(rectangles) == 60 rectangle2texts = {} for t in texts: @@ -871,3 +874,170 @@ def test_no_resources(): page_one = reader.pages[0] page_two = reader.pages[0] page_one.merge_page(page_two) + + +def test_merge_page_reproducible_with_proc_set(): + page1 = PageObject.create_blank_page(width=100, height=100) + page2 = PageObject.create_blank_page(width=100, height=100) + + ordered = sorted(NameObject(f"/{x}") for x in range(20)) + + shuffled = list(ordered) + random.shuffle(shuffled) + + # each page has some overlap in their /ProcSet, and they're in a weird order + page1[NameObject("/Resources")][NameObject("/ProcSet")] = ArrayObject(shuffled[:15]) + page2[NameObject("/Resources")][NameObject("/ProcSet")] = ArrayObject(shuffled[5:]) + page1.merge_page(page2) + + assert page1[NameObject("/Resources")][NameObject("/ProcSet")] == ordered + + +@pytest.mark.parametrize( + ("page1", "page2", "expected_result", "expected_renames"), + [ + # simple cases: + pytest.param({}, {}, {}, {}, id="no resources"), + pytest.param( + {"/1": "/v1"}, + {"/2": "/v2"}, + {"/1": "/v1", "/2": "/v2"}, + {}, + id="no overlap", + ), + pytest.param( + 
{"/x": "/v"}, {"/x": "/v"}, {"/x": "/v"}, {}, id="overlap, matching values" + ), + pytest.param( + {"/x": "/v1"}, + {"/x": "/v2"}, + {"/x": "/v1", "/x-0": "/v2"}, + {"/x": "/x-0"}, + id="overlap, different values", + ), + # carefully crafted names that match the renaming pattern: + pytest.param( + {"/x": "/v1", "/x-0": "/v1", "/x-1": "/v1"}, + {"/x": "/v2"}, + { + "/x": "/v1", + "/x-0": "/v1", + "/x-1": "/v1", + "/x-2": "/v2", + }, + {"/x": "/x-2"}, + id="crafted, different values", + ), + pytest.param( + {"/x": "/v1", "/x-0": "/v1", "/x-1": "/v"}, + {"/x": "/v"}, + {"/x": "/v1", "/x-0": "/v1", "/x-1": "/v"}, + {"/x": "/x-1"}, + id="crafted, matching value in chain", + ), + pytest.param( + {"/x": "/v1"}, + {"/x": "/v2.1", "/x-0": "/v2.2"}, + {"/x": "/v1", "/x-0": "/v2.1", "/x-0-0": "/v2.2"}, + {"/x": "/x-0", "/x-0": "/x-0-0"}, + id="crafted, overlaps with previous rename, different value", + ), + pytest.param( + {"/x": "/v1"}, + {"/x": "/v2", "/x-0": "/v2"}, + {"/x": "/v1", "/x-0": "/v2"}, + {"/x": "/x-0"}, + id="crafted, overlaps with previous rename, matching value", + ), + ], +) +def test_merge_resources(page1, page2, expected_result, expected_renames): + # Arrange + page1 = DictionaryObject( + { + PG.RESOURCES: DictionaryObject( + {NameObject(k): NameObject(v) for k, v in page1.items()} + ) + } + ) + page2 = DictionaryObject( + { + PG.RESOURCES: DictionaryObject( + {NameObject(k): NameObject(v) for k, v in page2.items()} + ) + } + ) + + # Act + result, renames = PageObject._merge_resources(page1, page2, PG.RESOURCES) + + # Assert + assert result == expected_result + assert renames == expected_renames + + +def test_merge_page_resources_smoke_test(): + # Arrange + page1 = PageObject.create_blank_page(width=100, height=100) + page2 = PageObject.create_blank_page(width=100, height=100) + + NO = NameObject + + # set up some dummy resources that overlap (or not) between the two pages + # (note, all the edge cases are tested in test_merge_resources) + props1 = 
page1[NO("/Resources")][NO("/Properties")] = DictionaryObject( + { + NO("/just1"): NO("/just1-value"), + NO("/overlap-matching"): NO("/overlap-matching-value"), + NO("/overlap-different"): NO("/overlap-different-value1"), + } + ) + props2 = page2[NO("/Resources")][NO("/Properties")] = DictionaryObject( + { + NO("/just2"): NO("/just2-value"), + NO("/overlap-matching"): NO("/overlap-matching-value"), + NO("/overlap-different"): NO("/overlap-different-value2"), + } + ) + # use these keys for some "operations", to validate renaming + # (the operand name doesn't matter) + contents1 = page1[NO("/Contents")] = ContentStream(None, None) + contents1.operations = [(ArrayObject(props1.keys()), "page1-contents")] + contents2 = page2[NO("/Contents")] = ContentStream(None, None) + contents2.operations = [(ArrayObject(props2.keys()), "page2-contents")] + + expected_properties = { + "/just1": "/just1-value", + "/just2": "/just2-value", + "/overlap-matching": "/overlap-matching-value", + "/overlap-different": "/overlap-different-value1", + "/overlap-different-0": "/overlap-different-value2", + } + expected_operations = [ + # no renaming + (ArrayObject(props1.keys()), b"page1-contents"), + # some renaming + ( + ArrayObject( + [ + NO("/just2"), + NO("/overlap-matching"), + NO("/overlap-different-0"), + ] + ), + b"page2-contents", + ), + ] + + # Act + page1.merge_page(page2) + + # Assert + assert page1[NO("/Resources")][NO("/Properties")] == expected_properties + + relevant_operations = [ + (op, name) + for op, name in page1.get_contents().operations + if name in (b"page1-contents", b"page2-contents") + ] + assert relevant_operations == expected_operations diff --git a/tests/test_reader.py b/tests/test_reader.py index 710e6c59ca..b8a115877a 100644 --- a/tests/test_reader.py +++ b/tests/test_reader.py @@ -119,7 +119,10 @@ def test_broken_meta_data(pdf_path): reader = PdfReader(f) with pytest.raises( PdfReadError, - match=r"trailer not found or does not point to document information 
directory", + match=( + "trailer not found or does not point to document " + "information directory" + ), ): reader.metadata @@ -176,6 +179,7 @@ def test_get_outline(src, outline_elements): assert len(outline) == outline_elements +@pytest.mark.samples @pytest.mark.parametrize( ("src", "expected_images"), [ @@ -615,23 +619,25 @@ def test_get_destination_page_number(): def test_do_not_get_stuck_on_large_files_without_start_xref(): - """Tests for the absence of a DoS bug, where a large file without an startxref mark - would cause the library to hang for minutes to hours""" + """ + Tests for the absence of a DoS bug, where a large file without an startxref + mark would cause the library to hang for minutes to hours + """ start_time = time.time() broken_stream = BytesIO(b"\0" * 5 * 1000 * 1000) with pytest.raises(PdfReadError): PdfReader(broken_stream) parse_duration = time.time() - start_time - # parsing is expected take less than a second on a modern cpu, but include a large - # tolerance to account for busy or slow systems + # parsing is expected take less than a second on a modern cpu, but include + # a large tolerance to account for busy or slow systems assert parse_duration < 60 @pytest.mark.external def test_decrypt_when_no_id(): """ - Decrypt an encrypted file that's missing the 'ID' value in its - trailer. + Decrypt an encrypted file that's missing the 'ID' value in its trailer. + https://github.com/py-pdf/pypdf/issues/608 """ @@ -797,7 +803,10 @@ def test_read_path(): def test_read_not_binary_mode(caplog): with open(RESOURCE_ROOT / "crazyones.pdf") as f: - msg = "PdfReader stream/file object is not in binary mode. It may not be read correctly." + msg = ( + "PdfReader stream/file object is not in binary mode. " + "It may not be read correctly." 
+ ) with pytest.raises(io.UnsupportedOperation): PdfReader(f) assert normalize_warnings(caplog.text) == [msg] @@ -866,6 +875,7 @@ def test_get_fields(): assert dict(fields["c1-1"]) == ({"/FT": "/Btn", "/T": "c1-1"}) +@pytest.mark.external def test_get_full_qualified_fields(): url = "https://github.com/py-pdf/PyPDF2/files/10142389/fields_with_dots.pdf" name = "fields_with_dots.pdf" @@ -1115,7 +1125,10 @@ def test_named_destination(): reader = PdfReader(BytesIO(get_pdf_from_url(url, name=name))) assert len(reader.named_destinations) > 0 # 2nd case : Dest below names and with Kids... - url = "https://opensource.adobe.com/dc-acrobat-sdk-docs/standards/pdfstandards/pdf/PDF32000_2008.pdf" + url = ( + "https://opensource.adobe.com/dc-acrobat-sdk-docs/standards/" + "pdfstandards/pdf/PDF32000_2008.pdf" + ) name = "PDF32000_2008.pdf" reader = PdfReader(BytesIO(get_pdf_from_url(url, name=name))) assert len(reader.named_destinations) > 0 @@ -1144,7 +1157,8 @@ def test_outline_with_empty_action(): def test_outline_with_invalid_destinations(): reader = PdfReader(RESOURCE_ROOT / "outlines-with-invalid-destinations.pdf") - # contains 9 outline items, 6 with invalid destinations caused by different malformations + # contains 9 outline items, 6 with invalid destinations + # caused by different malformations assert len(reader.outline) == 9 @@ -1208,14 +1222,21 @@ def test_reader(caplog): @pytest.mark.external def test_zeroing_xref(): # iss #328 - url = "https://github.com/py-pdf/pypdf/files/9066120/UTA_OSHA_3115_Fall_Protection_Training_09162021_.pdf" + url = ( + "https://github.com/py-pdf/pypdf/files/9066120/" + "UTA_OSHA_3115_Fall_Protection_Training_09162021_.pdf" + ) name = "UTA_OSHA.pdf" reader = PdfReader(BytesIO(get_pdf_from_url(url, name=name))) len(reader.pages) +@pytest.mark.external def test_thread(): - url = "https://github.com/py-pdf/pypdf/files/9066120/UTA_OSHA_3115_Fall_Protection_Training_09162021_.pdf" + url = ( + "https://github.com/py-pdf/pypdf/files/9066120/" + 
"UTA_OSHA_3115_Fall_Protection_Training_09162021_.pdf" + ) name = "UTA_OSHA.pdf" reader = PdfReader(BytesIO(get_pdf_from_url(url, name=name))) assert reader.threads is None @@ -1226,6 +1247,7 @@ def test_thread(): assert len(reader.threads) >= 1 +@pytest.mark.external def test_build_outline_item(caplog): url = "https://github.com/py-pdf/pypdf/files/9464742/shiv_resume.pdf" name = "shiv_resume.pdf" @@ -1253,6 +1275,7 @@ def test_build_outline_item(caplog): assert "Unexpected destination 2" in exc.value.args[0] +@pytest.mark.samples @pytest.mark.parametrize( ("src", "page_labels"), [ @@ -1274,3 +1297,11 @@ def test_build_outline_item(caplog): def test_page_labels(src, page_labels): max_indices = 6 assert PdfReader(src).page_labels[:max_indices] == page_labels[:max_indices] + + +def test_iss1559(): + url = "https://github.com/py-pdf/pypdf/files/10441992/default.pdf" + name = "iss1559.pdf" + reader = PdfReader(BytesIO(get_pdf_from_url(url, name=name))) + for p in reader.pages: + p.extract_text() diff --git a/tests/test_utils.py b/tests/test_utils.py index 841c8d7122..e447d5d674 100644 --- a/tests/test_utils.py +++ b/tests/test_utils.py @@ -62,20 +62,11 @@ def test_skip_over_comment(stream, remainder): assert stream.read() == remainder -def test_read_until_regex_premature_ending_raise(): - import re - - stream = io.BytesIO(b"") - with pytest.raises(PdfStreamError) as exc: - read_until_regex(stream, re.compile(b".")) - assert exc.value.args[0] == "Stream has ended unexpectedly" - - def test_read_until_regex_premature_ending_name(): import re stream = io.BytesIO(b"") - assert read_until_regex(stream, re.compile(b"."), ignore_eof=True) == b"" + assert read_until_regex(stream, re.compile(b".")) == b"" @pytest.mark.parametrize( @@ -250,7 +241,10 @@ def foo(old_param=1, baz=2): @pytest.mark.external def test_escapedcode_followed_by_int(): # iss #1294 - url = 
"https://github.com/timedegree/playground_files/raw/main/%E8%AE%BA%E6%96%87/AN%20EXACT%20ANALYTICAL%20SOLUTION%20OF%20KEPLER'S%20EQUATION.pdf" + url = ( + "https://github.com/timedegree/playground_files/raw/main/" + "%E8%AE%BA%E6%96%87/AN%20EXACT%20ANALYTICAL%20SOLUTION%20OF%20KEPLER'S%20EQUATION.pdf" + ) name = "keppler.pdf" reader = PdfReader(io.BytesIO(get_pdf_from_url(url, name=name))) diff --git a/tests/test_workflows.py b/tests/test_workflows.py index fb8187e70e..0e7e8c381b 100644 --- a/tests/test_workflows.py +++ b/tests/test_workflows.py @@ -15,7 +15,8 @@ from pypdf import PdfMerger, PdfReader, PdfWriter from pypdf.constants import PageAttributes as PG -from pypdf.errors import PdfReadWarning +from pypdf.errors import PdfReadError, PdfReadWarning +from pypdf.generic import ContentStream, read_object from . import get_pdf_from_url, normalize_warnings @@ -74,7 +75,7 @@ def test_dropdown_items(): inputfile = RESOURCE_ROOT / "libreoffice-form.pdf" reader = PdfReader(inputfile) fields = reader.get_fields() - assert "/Opt" in fields["Nationality"].keys() + assert "/Opt" in fields["Nationality"] def test_PdfReaderFileLoad(): @@ -99,15 +100,17 @@ def test_PdfReaderFileLoad(): assert expected_line == actual_line assert text == pdftext, ( - "PDF extracted text differs from expected value.\n\nExpected:\n\n%r\n\nExtracted:\n\n%r\n\n" - % (pdftext, text) + "PDF extracted text differs from expected value.\n\n" + "Expected:\n\n%r\n\nExtracted:\n\n%r\n\n" % (pdftext, text) ) def test_PdfReaderJpegImage(): """ - Test loading and parsing of a file. Extract the image of the file and compare to expected - textual output. Expected outcome: file loads, image matches expected. + Test loading and parsing of a file. Extract the image of the file and + compare to expected textual output. + + Expected outcome: file loads, image matches expected. 
""" with open(RESOURCE_ROOT / "jpeg.pdf", "rb") as inputfile: @@ -124,7 +127,8 @@ def test_PdfReaderJpegImage(): # Compare the text of the PDF to a known source assert binascii.hexlify(data).decode() == imagetext, ( - "PDF extracted image differs from expected value.\n\nExpected:\n\n%r\n\nExtracted:\n\n%r\n\n" + "PDF extracted image differs from expected value.\n\n" + "Expected:\n\n%r\n\nExtracted:\n\n%r\n\n" % (imagetext, binascii.hexlify(data).decode()) ) @@ -880,3 +884,36 @@ def test_tounicode_is_identity(): data = BytesIO(get_pdf_from_url(url, name=name)) reader = PdfReader(data, strict=False) reader.pages[0].extract_text() + + +@pytest.mark.external +def test_extra_test_iss1541(): + url = "https://github.com/py-pdf/pypdf/files/10418158/tst_iss1541.pdf" + name = "tst_iss1541.pdf" + data = BytesIO(get_pdf_from_url(url, name=name)) + reader = PdfReader(data, strict=False) + reader.pages[0].extract_text() + + cs = ContentStream(reader.pages[0]["/Contents"], None, None) + cs.operations.insert(-1, ([], b"EMC")) + bu = BytesIO() + cs.write_to_stream(bu, None) + bu.seek(0) + ContentStream(read_object(bu, None, None), None, None).operations + + cs = ContentStream(reader.pages[0]["/Contents"], None, None) + cs.operations.insert(-1, ([], b"E!C")) + bu = BytesIO() + cs.write_to_stream(bu, None) + bu.seek(0) + with pytest.raises(PdfReadError) as exc: + ContentStream(read_object(bu, None, None), None, None).operations + assert exc.value.args[0] == "Unexpected end of stream" + + buf2 = BytesIO(data.getbuffer()) + reader = PdfReader( + BytesIO(bytes(buf2.getbuffer()).replace(b"EI \n", b"E! 
\n")), strict=False + ) + with pytest.raises(PdfReadError) as exc: + reader.pages[0].extract_text() + assert exc.value.args[0] == "Unexpected end of stream" diff --git a/tests/test_writer.py b/tests/test_writer.py index 60b4a17de2..df94af6e19 100644 --- a/tests/test_writer.py +++ b/tests/test_writer.py @@ -33,9 +33,8 @@ def test_writer_exception_non_binary(tmp_path, caplog): writer = PdfWriter() writer.add_page(reader.pages[0]) - with open(tmp_path / "out.txt", "w") as fp: - with pytest.raises(TypeError): - writer.write_stream(fp) + with open(tmp_path / "out.txt", "w") as fp, pytest.raises(TypeError): + writer.write_stream(fp) ending = "to write to is not in binary mode. It may not be written to correctly.\n" assert caplog.text.endswith(ending) @@ -380,7 +379,8 @@ def test_remove_text_all_operators(ignore_byte_string_object): pdf_data.find(b"4 0 obj") + startx_correction, pdf_data.find(b"5 0 obj") + startx_correction, pdf_data.find(b"6 0 obj") + startx_correction, - # startx_correction should be -1 due to double % at the beginning inducing an error on startxref computation + # startx_correction should be -1 due to double % at the beginning + # inducing an error on startxref computation pdf_data.find(b"xref"), ) print(pdf_data.decode()) @@ -930,6 +930,7 @@ def test_startup_dest(): pdf_file_writer.open_destination = None +@pytest.mark.external def test_iss471(): url = "https://github.com/py-pdf/pypdf/files/9139245/book.pdf" name = "book_471.pdf" @@ -942,6 +943,7 @@ def test_iss471(): ) +@pytest.mark.external def test_reset_translation(): url = "https://corpora.tika.apache.org/base/docs/govdocs1/924/924666.pdf" name = "tika-924666.pdf" @@ -977,6 +979,7 @@ def test_threads_empty(): assert thr == thr2 +@pytest.mark.external def test_append_without_annots_and_articles(): url = "https://corpora.tika.apache.org/base/docs/govdocs1/924/924666.pdf" name = "tika-924666.pdf" @@ -993,6 +996,7 @@ def test_append_without_annots_and_articles(): assert len(writer.threads) >= 1 
+@pytest.mark.external def test_append_multiple(): url = "https://corpora.tika.apache.org/base/docs/govdocs1/924/924666.pdf" name = "tika-924666.pdf" @@ -1005,3 +1009,144 @@ def test_append_multiple(): pages = writer._root_object["/Pages"]["/Kids"] assert pages[0] not in pages[1:] # page not repeated assert pages[-1] not in pages[0:-1] # page not repeated + + +@pytest.mark.samples +def test_set_page_label(): + src = RESOURCE_ROOT / "GeoBase_NHNC1_Data_Model_UML_EN.pdf" # File without labels + target = "pypdf-output.pdf" + reader = PdfReader(src) + + expected = [ + "i", + "ii", + "1", + "2", + "A", + "B", + "1", + "2", + "3", + "4", + "A", + "i", + "I", + "II", + "1", + "2", + "3", + "I", + "II", + ] + + # Tests full length with labels assigned at first and last elements + # Tests different labels assigned to consecutive ranges + writer = PdfWriter() + writer.clone_document_from_reader(reader) + writer.set_page_label(0, 1, "/r") + writer.set_page_label(4, 5, "/A") + writer.set_page_label(10, 10, "/A") + writer.set_page_label(11, 11, "/r") + writer.set_page_label(12, 13, "/R") + writer.set_page_label(17, 18, "/R") + writer.write(target) + assert PdfReader(target).page_labels == expected + + writer = PdfWriter() # Same labels, different set order + writer.clone_document_from_reader(reader) + writer.set_page_label(17, 18, "/R") + writer.set_page_label(4, 5, "/A") + writer.set_page_label(10, 10, "/A") + writer.set_page_label(0, 1, "/r") + writer.set_page_label(12, 13, "/R") + writer.set_page_label(11, 11, "/r") + writer.write(target) + assert PdfReader(target).page_labels == expected + + # Tests labels assigned only in the middle + # Tests label assigned to a range already containing labeled ranges + expected = ["1", "2", "i", "ii", "iii", "iv", "v", "1"] + writer = PdfWriter() + writer.clone_document_from_reader(reader) + writer.set_page_label(3, 4, "/a") + writer.set_page_label(5, 5, "/A") + writer.set_page_label(2, 6, "/r") + writer.write(target) + assert
PdfReader(target).page_labels[: len(expected)] == expected + + # Tests labels assigned inside a previously existing range + expected = ["1", "2", "i", "a", "b", "A", "1", "1", "2"] + # Ones repeat because user didn't cover the entire original range + writer = PdfWriter() + writer.clone_document_from_reader(reader) + writer.set_page_label(2, 6, "/r") + writer.set_page_label(3, 4, "/a") + writer.set_page_label(5, 5, "/A") + writer.write(target) + assert PdfReader(target).page_labels[: len(expected)] == expected + + # Tests invalid user input + writer = PdfWriter() + writer.clone_document_from_reader(reader) + with pytest.raises( + ValueError, match="at least one between style and prefix must be given" + ): + writer.set_page_label(0, 5, start=2) + with pytest.raises( + ValueError, match="page_index_from must be equal or greater then 0" + ): + writer.set_page_label(-1, 5, "/r") + with pytest.raises( + ValueError, match="page_index_to must be equal or greater then page_index_from" + ): + writer.set_page_label(5, 0, "/r") + with pytest.raises(ValueError, match="page_index_to exceeds number of pages"): + writer.set_page_label(0, 19, "/r") + with pytest.raises( + ValueError, match="if given, start must be equal or greater than one" + ): + writer.set_page_label(0, 5, "/r", start=-1) + + os.remove(target) + + src = ( + SAMPLE_ROOT / "009-pdflatex-geotopo/GeoTopo.pdf" + ) # File with pre-existing labels + target = "pypdf-output.pdf" + reader = PdfReader(src) + + # Tests adding labels to existing ones + expected = ["i", "ii", "A", "B", "1"] + writer = PdfWriter() + writer.clone_document_from_reader(reader) + writer.set_page_label(2, 3, "/A") + writer.write(target) + assert PdfReader(target).page_labels[: len(expected)] == expected + + # Tests replacing existing labels + expected = ["A", "B", "1", "1", "2"] + writer = PdfWriter() + writer.clone_document_from_reader(reader) + writer.set_page_label(0, 1, "/A") + writer.write(target) + assert PdfReader(target).page_labels[:
len(expected)] == expected + + os.remove(target) + + # Tests prefix and start. + src = RESOURCE_ROOT / "issue-604.pdf" # File without page labels + target = "page_labels_test.pdf" + reader = PdfReader(src) + writer = PdfWriter() + writer.clone_document_from_reader(reader) + + writer.set_page_label(0, 0, prefix="FRONT") + writer.set_page_label(1, 2, "/D", start=2) + writer.set_page_label(3, 6, prefix="UPDATES") + writer.set_page_label(7, 10, "/D", prefix="THYR-") + writer.set_page_label(11, 21, "/D", prefix="PAP-") + writer.set_page_label(22, 30, "/D", prefix="FOLL-") + writer.set_page_label(31, 39, "/D", prefix="HURT-") + writer.write(target) + + os.remove(target) # comment to see result