|
46 | 46 |
|
47 | 47 | from ._cmap import build_char_map, unknown_char_map |
48 | 48 | from ._protocols import PdfReaderProtocol, PdfWriterProtocol |
| 49 | +from ._text_extraction import ( |
| 50 | + OrientationNotFoundError, |
| 51 | + crlf_space_check, |
| 52 | + handle_tj, |
| 53 | + mult, |
| 54 | +) |
49 | 55 | from ._utils import ( |
50 | 56 | CompressedTransformationMatrix, |
51 | 57 | File, |
|
72 | 78 | NullObject, |
73 | 79 | NumberObject, |
74 | 80 | RectangleObject, |
75 | | - encode_pdfdocencoding, |
76 | 81 | ) |
77 | 82 |
|
78 | | -CUSTOM_RTL_MIN: int = -1 |
79 | | -CUSTOM_RTL_MAX: int = -1 |
80 | | -CUSTOM_RTL_SPECIAL_CHARS: List[int] = [] |
81 | 83 | MERGE_CROP_BOX = "cropbox" # pypdf<=3.4.0 used 'trimbox' |
82 | 84 |
|
83 | 85 |
|
def set_custom_rtl(
    _min: Union[str, int, None] = None,
    _max: Union[str, int, None] = None,
    specials: Union[str, List[int], None] = None,
) -> Tuple[int, int, List[int]]:
    """
    Change the Right-To-Left and special characters custom parameters.

    Args:
        _min: The new minimum value for the range of custom characters that
            will be written right to left.
            If set to ``None``, the value will not be changed.
            If set to an integer, it is stored as-is. If set to a
            one-character string, it is converted to its Unicode code point
            with ``ord``.
            The default value is -1, which sets no additional range to be converted.
        _max: The new maximum value for the range of custom characters that will
            be written right to left.
            If set to ``None``, the value will not be changed.
            If set to an integer, it is stored as-is. If set to a
            one-character string, it is converted to its Unicode code point
            with ``ord``.
            The default value is -1, which sets no additional range to be converted.
        specials: The new list of special characters to be inserted in the
            current insertion order.
            If set to ``None``, the current value will not be changed.
            If set to a string, it will be converted to the list of the
            Unicode code points of its characters.
            If set to a list, a copy of it is stored.
            The default value is an empty list.

    Returns:
        A tuple containing the new values for ``CUSTOM_RTL_MIN``,
        ``CUSTOM_RTL_MAX``, and ``CUSTOM_RTL_SPECIAL_CHARS``.

    Raises:
        TypeError: If ``_min`` or ``_max`` is a string that is not exactly
            one character long (raised by ``ord``).
    """
    global CUSTOM_RTL_MIN, CUSTOM_RTL_MAX, CUSTOM_RTL_SPECIAL_CHARS
    if isinstance(_min, int):
        CUSTOM_RTL_MIN = _min
    elif isinstance(_min, str):
        CUSTOM_RTL_MIN = ord(_min)
    if isinstance(_max, int):
        CUSTOM_RTL_MAX = _max
    elif isinstance(_max, str):
        CUSTOM_RTL_MAX = ord(_max)
    if isinstance(specials, str):
        CUSTOM_RTL_SPECIAL_CHARS = [ord(x) for x in specials]
    elif isinstance(specials, list):
        # Store a copy: keeping the caller's list object would let any later
        # mutation of that list silently alter the module-level setting.
        CUSTOM_RTL_SPECIAL_CHARS = list(specials)
    return CUSTOM_RTL_MIN, CUSTOM_RTL_MAX, CUSTOM_RTL_SPECIAL_CHARS
127 | | - |
128 | | - |
129 | 86 | def _get_rectangle(self: Any, name: str, defaults: Iterable[str]) -> RectangleObject: |
130 | 87 | retval: Union[None, RectangleObject, IndirectObject] = self.get(name) |
131 | 88 | if isinstance(retval, RectangleObject): |
@@ -1664,26 +1621,6 @@ def _extract_text( |
1664 | 1621 | TL = 0.0 |
1665 | 1622 | font_size = 12.0 # init just in case of |
1666 | 1623 |
|
def mult(m: List[float], n: List[float]) -> List[float]:
    """
    Multiply two 2D transformation matrices given in six-number form.

    Each argument ``[a, b, c, d, e, f]`` stands for the 3x3 matrix
    ``[[a, b, 0], [c, d, 0], [e, f, 1]]``; the result is ``m x n`` in the
    same six-number form.

    Args:
        m: Left-hand matrix (indices 0 to 5 are read).
        n: Right-hand matrix (indices 0 to 5 are read).

    Returns:
        A new list of six numbers holding the product.
    """
    product: List[float] = []
    # The first two rows of m have an implicit 0 in the third column, so
    # they only combine with the 2x2 part of n.
    for row_start in (0, 2):
        left, right = m[row_start], m[row_start + 1]
        product.append(left * n[0] + right * n[2])
        product.append(left * n[1] + right * n[3])
    # The last row of m has an implicit 1 in the third column, which adds
    # n's own translation terms.
    shift_x, shift_y = m[4], m[5]
    product.append(shift_x * n[0] + shift_y * n[2] + n[4])
    product.append(shift_x * n[1] + shift_y * n[3] + n[5])
    return product
1676 | | - |
def orient(m: List[float]) -> int:
    """
    Classify a six-number matrix into one of four orientations.

    Args:
        m: Matrix in six-number form; only ``m[3]`` and ``m[1]`` are read.

    Returns:
        0 if ``m[3]`` is above ``1e-6``, 180 if it is below ``-1e-6``;
        otherwise 90 if ``m[1]`` is positive, else 270.
    """
    threshold = 1e-6
    diagonal = m[3]
    if diagonal > threshold:
        return 0
    if diagonal < -threshold:
        return 180
    # m[3] is (close to) zero: the sign of m[1] decides between 90 and 270.
    return 90 if m[1] > 0 else 270
1686 | | - |
def current_spacewidth() -> float:
    """Return the enclosing scope's ``_space_width`` divided by 1000."""
    # NOTE(review): the division by 1000 presumably converts from
    # thousandths of a text-space unit - confirm against where
    # ``_space_width`` is assigned.
    scaled_width = _space_width / 1000.0
    return scaled_width
1689 | 1626 |
|
@@ -1815,169 +1752,36 @@ def process_operation(operator: bytes, operands: List) -> None: |
1815 | 1752 |
|
1816 | 1753 | elif operator == b"Tj": |
1817 | 1754 | check_crlf_space = True |
1818 | | - m = mult(tm_matrix, cm_matrix) |
1819 | | - orientation = orient(m) |
1820 | | - if orientation in orientations: |
1821 | | - if isinstance(operands[0], str): |
1822 | | - text += operands[0] |
1823 | | - else: |
1824 | | - t: str = "" |
1825 | | - tt: bytes = ( |
1826 | | - encode_pdfdocencoding(operands[0]) |
1827 | | - if isinstance(operands[0], str) |
1828 | | - else operands[0] |
1829 | | - ) |
1830 | | - if isinstance(cmap[0], str): |
1831 | | - try: |
1832 | | - t = tt.decode( |
1833 | | - cmap[0], "surrogatepass" |
1834 | | - ) # apply str encoding |
1835 | | - except Exception: |
1836 | | - # the data does not match the expectation, |
1837 | | - # we use the alternative ; |
1838 | | - # text extraction may not be good |
1839 | | - t = tt.decode( |
1840 | | - "utf-16-be" if cmap[0] == "charmap" else "charmap", |
1841 | | - "surrogatepass", |
1842 | | - ) # apply str encoding |
1843 | | - else: # apply dict encoding |
1844 | | - t = "".join( |
1845 | | - [ |
1846 | | - cmap[0][x] if x in cmap[0] else bytes((x,)).decode() |
1847 | | - for x in tt |
1848 | | - ] |
1849 | | - ) |
1850 | | - # "\u0590 - \u08FF \uFB50 - \uFDFF" |
1851 | | - for x in [cmap[1][x] if x in cmap[1] else x for x in t]: |
1852 | | - # x can be a sequence of bytes ; ex: habibi.pdf |
1853 | | - if len(x) == 1: |
1854 | | - xx = ord(x) |
1855 | | - else: |
1856 | | - xx = 1 |
1857 | | - # fmt: off |
1858 | | - if ( |
1859 | | - # cases where the current inserting order is kept |
1860 | | - (xx <= 0x2F) # punctuations but... |
1861 | | - or (0x3A <= xx and xx <= 0x40) # numbers (x30-39) |
1862 | | - or (0x2000 <= xx and xx <= 0x206F) # upper punctuations.. |
1863 | | - or (0x20A0 <= xx and xx <= 0x21FF) # but (numbers) indices/exponents |
1864 | | - or xx in CUSTOM_RTL_SPECIAL_CHARS # customized.... |
1865 | | - ): |
1866 | | - text = x + text if rtl_dir else text + x |
1867 | | - elif ( # right-to-left characters set |
1868 | | - (0x0590 <= xx and xx <= 0x08FF) |
1869 | | - or (0xFB1D <= xx and xx <= 0xFDFF) |
1870 | | - or (0xFE70 <= xx and xx <= 0xFEFF) |
1871 | | - or (CUSTOM_RTL_MIN <= xx and xx <= CUSTOM_RTL_MAX) |
1872 | | - ): |
1873 | | - if not rtl_dir: |
1874 | | - rtl_dir = True |
1875 | | - output += text |
1876 | | - if visitor_text is not None: |
1877 | | - visitor_text(text, cm_matrix, tm_matrix, cmap[3], font_size) |
1878 | | - text = "" |
1879 | | - text = x + text |
1880 | | - else: # left-to-right |
1881 | | - # print(">",xx,x,end="") |
1882 | | - if rtl_dir: |
1883 | | - rtl_dir = False |
1884 | | - output += text |
1885 | | - if visitor_text is not None: |
1886 | | - visitor_text(text, cm_matrix, tm_matrix, cmap[3], font_size) |
1887 | | - text = "" |
1888 | | - text = text + x |
1889 | | - # fmt: on |
| 1755 | + text, rtl_dir = handle_tj( |
| 1756 | + text, |
| 1757 | + operands, |
| 1758 | + cm_matrix, |
| 1759 | + tm_matrix, # text matrix |
| 1760 | + cmap, |
| 1761 | + orientations, |
| 1762 | + output, |
| 1763 | + font_size, |
| 1764 | + rtl_dir, |
| 1765 | + visitor_text, |
| 1766 | + ) |
1890 | 1767 | else: |
1891 | 1768 | return None |
1892 | 1769 | if check_crlf_space: |
1893 | | - m = mult(tm_matrix, cm_matrix) |
1894 | | - orientation = orient(m) |
1895 | | - delta_x = m[4] - tm_prev[4] |
1896 | | - delta_y = m[5] - tm_prev[5] |
1897 | | - k = math.sqrt(abs(m[0] * m[3]) + abs(m[1] * m[2])) |
1898 | | - f = font_size * k |
1899 | | - tm_prev = m |
1900 | | - if orientation not in orientations: |
1901 | | - return None |
1902 | 1770 | try: |
1903 | | - if orientation == 0: |
1904 | | - if delta_y < -0.8 * f: |
1905 | | - if (output + text)[-1] != "\n": |
1906 | | - output += text + "\n" |
1907 | | - if visitor_text is not None: |
1908 | | - visitor_text( |
1909 | | - text + "\n", |
1910 | | - cm_matrix, |
1911 | | - tm_matrix, |
1912 | | - cmap[3], |
1913 | | - font_size, |
1914 | | - ) |
1915 | | - text = "" |
1916 | | - elif ( |
1917 | | - abs(delta_y) < f * 0.3 |
1918 | | - and abs(delta_x) > current_spacewidth() * f * 15 |
1919 | | - and (output + text)[-1] != " " |
1920 | | - ): |
1921 | | - text += " " |
1922 | | - elif orientation == 180: |
1923 | | - if delta_y > 0.8 * f: |
1924 | | - if (output + text)[-1] != "\n": |
1925 | | - output += text + "\n" |
1926 | | - if visitor_text is not None: |
1927 | | - visitor_text( |
1928 | | - text + "\n", |
1929 | | - cm_matrix, |
1930 | | - tm_matrix, |
1931 | | - cmap[3], |
1932 | | - font_size, |
1933 | | - ) |
1934 | | - text = "" |
1935 | | - elif ( |
1936 | | - abs(delta_y) < f * 0.3 |
1937 | | - and abs(delta_x) > current_spacewidth() * f * 15 |
1938 | | - and (output + text)[-1] != " " |
1939 | | - ): |
1940 | | - text += " " |
1941 | | - elif orientation == 90: |
1942 | | - if delta_x > 0.8 * f: |
1943 | | - if (output + text)[-1] != "\n": |
1944 | | - output += text + "\n" |
1945 | | - if visitor_text is not None: |
1946 | | - visitor_text( |
1947 | | - text + "\n", |
1948 | | - cm_matrix, |
1949 | | - tm_matrix, |
1950 | | - cmap[3], |
1951 | | - font_size, |
1952 | | - ) |
1953 | | - text = "" |
1954 | | - elif ( |
1955 | | - abs(delta_x) < f * 0.3 |
1956 | | - and abs(delta_y) > current_spacewidth() * f * 15 |
1957 | | - and (output + text)[-1] != " " |
1958 | | - ): |
1959 | | - text += " " |
1960 | | - elif orientation == 270: |
1961 | | - if delta_x < -0.8 * f: |
1962 | | - if (output + text)[-1] != "\n": |
1963 | | - output += text + "\n" |
1964 | | - if visitor_text is not None: |
1965 | | - visitor_text( |
1966 | | - text + "\n", |
1967 | | - cm_matrix, |
1968 | | - tm_matrix, |
1969 | | - cmap[3], |
1970 | | - font_size, |
1971 | | - ) |
1972 | | - text = "" |
1973 | | - elif ( |
1974 | | - abs(delta_x) < f * 0.3 |
1975 | | - and abs(delta_y) > current_spacewidth() * f * 15 |
1976 | | - and (output + text)[-1] != " " |
1977 | | - ): |
1978 | | - text += " " |
1979 | | - except Exception: |
1980 | | - pass |
| 1771 | + text, output, tm_prev = crlf_space_check( |
| 1772 | + text, |
| 1773 | + tm_prev, |
| 1774 | + cm_matrix, |
| 1775 | + tm_matrix, |
| 1776 | + cmap, |
| 1777 | + orientations, |
| 1778 | + output, |
| 1779 | + font_size, |
| 1780 | + visitor_text, |
| 1781 | + current_spacewidth(), |
| 1782 | + ) |
| 1783 | + except OrientationNotFoundError: |
| 1784 | + return None |
1981 | 1785 |
|
1982 | 1786 | for operands, operator in content.operations: |
1983 | 1787 | if visitor_operand_before is not None: |
|
0 commit comments