Skip to content

Commit 85f9757

Browse files
authored
STY: Simplify _extract_text (#1683)
Created a new `_text_extraction` submodule. For now, most of the text extraction logic is still in `_page.py`, but that should change in the future.
1 parent 3a9d6f6 commit 85f9757

File tree

4 files changed

+348
-255
lines changed

4 files changed

+348
-255
lines changed

pypdf/_page.py

Lines changed: 32 additions & 228 deletions
Original file line numberDiff line numberDiff line change
@@ -46,6 +46,12 @@
4646

4747
from ._cmap import build_char_map, unknown_char_map
4848
from ._protocols import PdfReaderProtocol, PdfWriterProtocol
49+
from ._text_extraction import (
50+
OrientationNotFoundError,
51+
crlf_space_check,
52+
handle_tj,
53+
mult,
54+
)
4955
from ._utils import (
5056
CompressedTransformationMatrix,
5157
File,
@@ -72,60 +78,11 @@
7278
NullObject,
7379
NumberObject,
7480
RectangleObject,
75-
encode_pdfdocencoding,
7681
)
7782

78-
CUSTOM_RTL_MIN: int = -1
79-
CUSTOM_RTL_MAX: int = -1
80-
CUSTOM_RTL_SPECIAL_CHARS: List[int] = []
8183
MERGE_CROP_BOX = "cropbox" # pypdf<=3.4.0 used 'trimbox'
8284

8385

84-
def set_custom_rtl(
85-
_min: Union[str, int, None] = None,
86-
_max: Union[str, int, None] = None,
87-
specials: Union[str, List[int], None] = None,
88-
) -> Tuple[int, int, List[int]]:
89-
"""
90-
Change the Right-To-Left and special characters custom parameters.
91-
92-
Args:
93-
_min: The new minimum value for the range of custom characters that
94-
will be written right to left.
95-
If set to ``None``, the value will not be changed.
96-
If set to an integer or string, it will be converted to its ASCII code.
97-
The default value is -1, which sets no additional range to be converted.
98-
_max: The new maximum value for the range of custom characters that will
99-
be written right to left.
100-
If set to ``None``, the value will not be changed.
101-
If set to an integer or string, it will be converted to its ASCII code.
102-
The default value is -1, which sets no additional range to be converted.
103-
specials: The new list of special characters to be inserted in the
104-
current insertion order.
105-
If set to ``None``, the current value will not be changed.
106-
If set to a string, it will be converted to a list of ASCII codes.
107-
The default value is an empty list.
108-
109-
Returns:
110-
A tuple containing the new values for ``CUSTOM_RTL_MIN``,
111-
``CUSTOM_RTL_MAX``, and ``CUSTOM_RTL_SPECIAL_CHARS``.
112-
"""
113-
global CUSTOM_RTL_MIN, CUSTOM_RTL_MAX, CUSTOM_RTL_SPECIAL_CHARS
114-
if isinstance(_min, int):
115-
CUSTOM_RTL_MIN = _min
116-
elif isinstance(_min, str):
117-
CUSTOM_RTL_MIN = ord(_min)
118-
if isinstance(_max, int):
119-
CUSTOM_RTL_MAX = _max
120-
elif isinstance(_max, str):
121-
CUSTOM_RTL_MAX = ord(_max)
122-
if isinstance(specials, str):
123-
CUSTOM_RTL_SPECIAL_CHARS = [ord(x) for x in specials]
124-
elif isinstance(specials, list):
125-
CUSTOM_RTL_SPECIAL_CHARS = specials
126-
return CUSTOM_RTL_MIN, CUSTOM_RTL_MAX, CUSTOM_RTL_SPECIAL_CHARS
127-
128-
12986
def _get_rectangle(self: Any, name: str, defaults: Iterable[str]) -> RectangleObject:
13087
retval: Union[None, RectangleObject, IndirectObject] = self.get(name)
13188
if isinstance(retval, RectangleObject):
@@ -1664,26 +1621,6 @@ def _extract_text(
16641621
TL = 0.0
16651622
font_size = 12.0 # init just in case of
16661623

1667-
def mult(m: List[float], n: List[float]) -> List[float]:
1668-
return [
1669-
m[0] * n[0] + m[1] * n[2],
1670-
m[0] * n[1] + m[1] * n[3],
1671-
m[2] * n[0] + m[3] * n[2],
1672-
m[2] * n[1] + m[3] * n[3],
1673-
m[4] * n[0] + m[5] * n[2] + n[4],
1674-
m[4] * n[1] + m[5] * n[3] + n[5],
1675-
]
1676-
1677-
def orient(m: List[float]) -> int:
1678-
if m[3] > 1e-6:
1679-
return 0
1680-
elif m[3] < -1e-6:
1681-
return 180
1682-
elif m[1] > 0:
1683-
return 90
1684-
else:
1685-
return 270
1686-
16871624
def current_spacewidth() -> float:
16881625
return _space_width / 1000.0
16891626

@@ -1815,169 +1752,36 @@ def process_operation(operator: bytes, operands: List) -> None:
18151752

18161753
elif operator == b"Tj":
18171754
check_crlf_space = True
1818-
m = mult(tm_matrix, cm_matrix)
1819-
orientation = orient(m)
1820-
if orientation in orientations:
1821-
if isinstance(operands[0], str):
1822-
text += operands[0]
1823-
else:
1824-
t: str = ""
1825-
tt: bytes = (
1826-
encode_pdfdocencoding(operands[0])
1827-
if isinstance(operands[0], str)
1828-
else operands[0]
1829-
)
1830-
if isinstance(cmap[0], str):
1831-
try:
1832-
t = tt.decode(
1833-
cmap[0], "surrogatepass"
1834-
) # apply str encoding
1835-
except Exception:
1836-
# the data does not match the expectation,
1837-
# we use the alternative ;
1838-
# text extraction may not be good
1839-
t = tt.decode(
1840-
"utf-16-be" if cmap[0] == "charmap" else "charmap",
1841-
"surrogatepass",
1842-
) # apply str encoding
1843-
else: # apply dict encoding
1844-
t = "".join(
1845-
[
1846-
cmap[0][x] if x in cmap[0] else bytes((x,)).decode()
1847-
for x in tt
1848-
]
1849-
)
1850-
# "\u0590 - \u08FF \uFB50 - \uFDFF"
1851-
for x in [cmap[1][x] if x in cmap[1] else x for x in t]:
1852-
# x can be a sequence of bytes ; ex: habibi.pdf
1853-
if len(x) == 1:
1854-
xx = ord(x)
1855-
else:
1856-
xx = 1
1857-
# fmt: off
1858-
if (
1859-
# cases where the current inserting order is kept
1860-
(xx <= 0x2F) # punctuations but...
1861-
or (0x3A <= xx and xx <= 0x40) # numbers (x30-39)
1862-
or (0x2000 <= xx and xx <= 0x206F) # upper punctuations..
1863-
or (0x20A0 <= xx and xx <= 0x21FF) # but (numbers) indices/exponents
1864-
or xx in CUSTOM_RTL_SPECIAL_CHARS # customized....
1865-
):
1866-
text = x + text if rtl_dir else text + x
1867-
elif ( # right-to-left characters set
1868-
(0x0590 <= xx and xx <= 0x08FF)
1869-
or (0xFB1D <= xx and xx <= 0xFDFF)
1870-
or (0xFE70 <= xx and xx <= 0xFEFF)
1871-
or (CUSTOM_RTL_MIN <= xx and xx <= CUSTOM_RTL_MAX)
1872-
):
1873-
if not rtl_dir:
1874-
rtl_dir = True
1875-
output += text
1876-
if visitor_text is not None:
1877-
visitor_text(text, cm_matrix, tm_matrix, cmap[3], font_size)
1878-
text = ""
1879-
text = x + text
1880-
else: # left-to-right
1881-
# print(">",xx,x,end="")
1882-
if rtl_dir:
1883-
rtl_dir = False
1884-
output += text
1885-
if visitor_text is not None:
1886-
visitor_text(text, cm_matrix, tm_matrix, cmap[3], font_size)
1887-
text = ""
1888-
text = text + x
1889-
# fmt: on
1755+
text, rtl_dir = handle_tj(
1756+
text,
1757+
operands,
1758+
cm_matrix,
1759+
tm_matrix, # text matrix
1760+
cmap,
1761+
orientations,
1762+
output,
1763+
font_size,
1764+
rtl_dir,
1765+
visitor_text,
1766+
)
18901767
else:
18911768
return None
18921769
if check_crlf_space:
1893-
m = mult(tm_matrix, cm_matrix)
1894-
orientation = orient(m)
1895-
delta_x = m[4] - tm_prev[4]
1896-
delta_y = m[5] - tm_prev[5]
1897-
k = math.sqrt(abs(m[0] * m[3]) + abs(m[1] * m[2]))
1898-
f = font_size * k
1899-
tm_prev = m
1900-
if orientation not in orientations:
1901-
return None
19021770
try:
1903-
if orientation == 0:
1904-
if delta_y < -0.8 * f:
1905-
if (output + text)[-1] != "\n":
1906-
output += text + "\n"
1907-
if visitor_text is not None:
1908-
visitor_text(
1909-
text + "\n",
1910-
cm_matrix,
1911-
tm_matrix,
1912-
cmap[3],
1913-
font_size,
1914-
)
1915-
text = ""
1916-
elif (
1917-
abs(delta_y) < f * 0.3
1918-
and abs(delta_x) > current_spacewidth() * f * 15
1919-
and (output + text)[-1] != " "
1920-
):
1921-
text += " "
1922-
elif orientation == 180:
1923-
if delta_y > 0.8 * f:
1924-
if (output + text)[-1] != "\n":
1925-
output += text + "\n"
1926-
if visitor_text is not None:
1927-
visitor_text(
1928-
text + "\n",
1929-
cm_matrix,
1930-
tm_matrix,
1931-
cmap[3],
1932-
font_size,
1933-
)
1934-
text = ""
1935-
elif (
1936-
abs(delta_y) < f * 0.3
1937-
and abs(delta_x) > current_spacewidth() * f * 15
1938-
and (output + text)[-1] != " "
1939-
):
1940-
text += " "
1941-
elif orientation == 90:
1942-
if delta_x > 0.8 * f:
1943-
if (output + text)[-1] != "\n":
1944-
output += text + "\n"
1945-
if visitor_text is not None:
1946-
visitor_text(
1947-
text + "\n",
1948-
cm_matrix,
1949-
tm_matrix,
1950-
cmap[3],
1951-
font_size,
1952-
)
1953-
text = ""
1954-
elif (
1955-
abs(delta_x) < f * 0.3
1956-
and abs(delta_y) > current_spacewidth() * f * 15
1957-
and (output + text)[-1] != " "
1958-
):
1959-
text += " "
1960-
elif orientation == 270:
1961-
if delta_x < -0.8 * f:
1962-
if (output + text)[-1] != "\n":
1963-
output += text + "\n"
1964-
if visitor_text is not None:
1965-
visitor_text(
1966-
text + "\n",
1967-
cm_matrix,
1968-
tm_matrix,
1969-
cmap[3],
1970-
font_size,
1971-
)
1972-
text = ""
1973-
elif (
1974-
abs(delta_x) < f * 0.3
1975-
and abs(delta_y) > current_spacewidth() * f * 15
1976-
and (output + text)[-1] != " "
1977-
):
1978-
text += " "
1979-
except Exception:
1980-
pass
1771+
text, output, tm_prev = crlf_space_check(
1772+
text,
1773+
tm_prev,
1774+
cm_matrix,
1775+
tm_matrix,
1776+
cmap,
1777+
orientations,
1778+
output,
1779+
font_size,
1780+
visitor_text,
1781+
current_spacewidth(),
1782+
)
1783+
except OrientationNotFoundError:
1784+
return None
19811785

19821786
for operands, operator in content.operations:
19831787
if visitor_operand_before is not None:

0 commit comments

Comments
 (0)