Skip to content
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Prev Previous commit
Next Next commit
New text_extraction submodule
  • Loading branch information
MartinThoma committed Mar 5, 2023
commit 855db7e9d5486114bb2a95f0b6ff57072ca0873d
266 changes: 1 addition & 265 deletions pypdf/_page.py
Original file line number Diff line number Diff line change
Expand Up @@ -72,87 +72,12 @@
NullObject,
NumberObject,
RectangleObject,
TextStringObject,
encode_pdfdocencoding,
)
from .text_extraction import OrientationNotFoundError, crlf_space_check, handle_tj, mult

# Module-wide settings for right-to-left handling during text extraction;
# they are changed at runtime through set_custom_rtl().
# Lower and upper bound of an additional range of character codes treated as
# right-to-left. With both at -1 no character code can fall inside the range.
CUSTOM_RTL_MIN: int = -1
CUSTOM_RTL_MAX: int = -1
# Additional character codes that are inserted in the current insertion order.
CUSTOM_RTL_SPECIAL_CHARS: List[int] = []
MERGE_CROP_BOX = "cropbox" # pypdf<=3.4.0 used 'trimbox'


class OrientationNotFoundError(Exception):
    """Raised when a text orientation is not among the requested orientations."""


def mult(m: List[float], n: List[float]) -> List[float]:
    """
    Multiply two 2D affine transformation matrices.

    Both matrices use the six-element form ``[a, b, c, d, e, f]``, which
    stands for the 3x3 matrix ``[[a, b, 0], [c, d, 0], [e, f, 1]]``.

    Args:
        m: The left-hand matrix.
        n: The right-hand matrix.

    Returns:
        The product ``m x n`` as a new six-element list.
    """
    return [
        # 2x2 linear part
        m[0] * n[0] + m[1] * n[2],
        m[0] * n[1] + m[1] * n[3],
        m[2] * n[0] + m[3] * n[2],
        m[2] * n[1] + m[3] * n[3],
        # translation of m pushed through n, plus the translation of n
        m[4] * n[0] + m[5] * n[2] + n[4],
        m[4] * n[1] + m[5] * n[3] + n[5],
    ]


def orient(m: List[float]) -> int:
    """
    Classify a transformation matrix into one of four orientations.

    Args:
        m: A six-element matrix ``[a, b, c, d, e, f]``; only ``d`` (index 3)
            and ``b`` (index 1) are inspected.

    Returns:
        ``0`` if ``d`` is clearly positive, ``180`` if it is clearly negative
        (tolerance ``1e-6``); otherwise ``90`` if ``b`` is positive and ``270``
        if it is not. The values are presumably rotation angles in degrees.
    """
    if m[3] > 1e-6:
        return 0
    if m[3] < -1e-6:
        return 180
    # d is (close to) zero: the sign of b decides between the two remaining cases
    if m[1] > 0:
        return 90
    return 270


def set_custom_rtl(
    _min: Union[str, int, None] = None,
    _max: Union[str, int, None] = None,
    specials: Union[str, List[int], None] = None,
) -> Tuple[int, int, List[int]]:
    """
    Change the Right-To-Left and special characters custom parameters.

    Args:
        _min: The new minimum value for the range of custom characters that
            will be written right to left.
            If set to ``None``, the value will not be changed.
            An integer is used as is; a one-character string is converted
            to its character code with ``ord``.
            The default value is -1, which sets no additional range to be
            converted.
        _max: The new maximum value for the range of custom characters that
            will be written right to left.
            If set to ``None``, the value will not be changed.
            An integer is used as is; a one-character string is converted
            to its character code with ``ord``.
            The default value is -1, which sets no additional range to be
            converted.
        specials: The new list of special characters to be inserted in the
            current insertion order.
            If set to ``None``, the current value will not be changed.
            If set to a string, it will be converted to a list of character
            codes. If set to a list, a copy of it is stored.
            The default value is an empty list.

    Returns:
        A tuple containing the new values for ``CUSTOM_RTL_MIN``,
        ``CUSTOM_RTL_MAX``, and ``CUSTOM_RTL_SPECIAL_CHARS``.

    Raises:
        TypeError: If ``_min`` or ``_max`` is a string that is not exactly
            one character long (raised by ``ord``).
    """
    global CUSTOM_RTL_MIN, CUSTOM_RTL_MAX, CUSTOM_RTL_SPECIAL_CHARS
    if isinstance(_min, int):
        CUSTOM_RTL_MIN = _min
    elif isinstance(_min, str):
        CUSTOM_RTL_MIN = ord(_min)
    if isinstance(_max, int):
        CUSTOM_RTL_MAX = _max
    elif isinstance(_max, str):
        CUSTOM_RTL_MAX = ord(_max)
    if isinstance(specials, str):
        CUSTOM_RTL_SPECIAL_CHARS = [ord(x) for x in specials]
    elif isinstance(specials, list):
        # Store a copy: keeping the caller's list object would let later
        # mutations of that list silently change the module-wide setting.
        CUSTOM_RTL_SPECIAL_CHARS = list(specials)
    return CUSTOM_RTL_MIN, CUSTOM_RTL_MAX, CUSTOM_RTL_SPECIAL_CHARS


def _get_rectangle(self: Any, name: str, defaults: Iterable[str]) -> RectangleObject:
retval: Union[None, RectangleObject, IndirectObject] = self.get(name)
if isinstance(retval, RectangleObject):
Expand Down Expand Up @@ -2266,192 +2191,3 @@ def _get_fonts_walk(
_get_fonts_walk(cast(DictionaryObject, obj[key]), fnt, emb)

return fnt, emb # return the sets for each page


def handle_tj(
    text: str,
    operands: List[Union[str, TextStringObject]],
    cm_matrix: List[float],
    tm_matrix: List[float],
    cmap: Tuple[
        Union[str, Dict[int, str]], Dict[str, str], str, Optional[DictionaryObject]
    ],
    orientations: Tuple[int, ...],
    output: str,
    font_size: float,
    rtl_dir: bool,
    visitor_text: Optional[Callable[[Any, Any, Any, Any, Any], None]],
) -> Tuple[str, bool]:
    """
    Append the string operand of a text-showing operation to the current text.

    The operand is decoded, mapped through the character map and then merged
    into ``text`` character by character, honouring the writing direction:
    right-to-left characters are prepended, left-to-right ones are appended.

    Args:
        text: The text run collected so far.
        operands: Operands of the operation; only ``operands[0]`` is used.
        cm_matrix: The current transformation matrix.
        tm_matrix: The current text matrix.
        cmap: ``cmap[0]`` is either a codec name or a byte-value to string
            table used to decode a bytes operand, ``cmap[1]`` maps decoded
            characters to their final string, ``cmap[3]`` is handed to
            ``visitor_text`` unchanged. ``cmap[2]`` is not used here.
        orientations: Orientations (as returned by ``orient``) to process;
            for any other orientation the inputs are returned unchanged.
        output: Not used; kept for interface compatibility (see NOTE below).
        font_size: Passed through to ``visitor_text``.
        rtl_dir: ``True`` if the current run is written right to left.
        visitor_text: Optional callback receiving the run that is closed
            whenever the writing direction changes.

    Returns:
        The updated ``text`` and the updated ``rtl_dir`` flag.
    """
    m = mult(tm_matrix, cm_matrix)
    if orient(m) not in orientations:
        return text, rtl_dir

    operand = operands[0]
    if isinstance(operand, str):
        # Already a text string: appended as is, without direction handling.
        return text + operand, rtl_dir

    # From here on the operand is treated as raw bytes.
    encoding = cmap[0]
    if isinstance(encoding, str):
        try:
            decoded = operand.decode(encoding, "surrogatepass")  # apply str encoding
        except Exception:
            # the data does not match the expectation, we use the
            # alternative; text extraction may not be good
            decoded = operand.decode(
                "utf-16-be" if encoding == "charmap" else "charmap",
                "surrogatepass",
            )
    else:  # apply dict encoding
        # chr() keeps unmapped bytes >= 0x80 as the matching Latin-1 character;
        # decoding such a single byte as UTF-8 would raise UnicodeDecodeError.
        decoded = "".join(
            encoding[b] if b in encoding else chr(b) for b in operand
        )

    char_map = cmap[1]
    for ch in decoded:
        x = char_map[ch] if ch in char_map else ch
        # x can be a sequence of characters (ex: habibi.pdf); such sequences
        # get code 1 and therefore follow the current insertion order
        code = ord(x) if len(x) == 1 else 1
        if (
            # cases where the current inserting order is kept
            code <= 0x2F  # controls and punctuation below the digits
            or 0x3A <= code <= 0x40  # punctuation after the digits (0x30-0x39)
            or 0x2000 <= code <= 0x206F  # upper punctuations..
            or 0x20A0 <= code <= 0x21FF  # but (numbers) indices/exponents
            or code in CUSTOM_RTL_SPECIAL_CHARS  # customized....
        ):
            text = x + text if rtl_dir else text + x
        elif (  # right-to-left characters set
            0x0590 <= code <= 0x08FF
            or 0xFB1D <= code <= 0xFDFF
            or 0xFE70 <= code <= 0xFEFF
            or CUSTOM_RTL_MIN <= code <= CUSTOM_RTL_MAX
        ):
            if not rtl_dir:
                # Direction switches to right-to-left: close the current run.
                # NOTE(review): the closed run is only reported to
                # visitor_text. ``output`` is an immutable str that is not
                # part of the return value, so appending to it here has no
                # effect for the caller - confirm whether the run should be
                # returned as well.
                rtl_dir = True
                if visitor_text is not None:
                    visitor_text(text, cm_matrix, tm_matrix, cmap[3], font_size)
                text = ""
            text = x + text
        else:  # left-to-right
            if rtl_dir:
                # Direction switches back to left-to-right: close the run
                # (same caveat as above regarding ``output``).
                rtl_dir = False
                if visitor_text is not None:
                    visitor_text(text, cm_matrix, tm_matrix, cmap[3], font_size)
                text = ""
            text = text + x
    return text, rtl_dir


def crlf_space_check(
    text: str,
    tm_prev: List[float],
    cm_matrix: List[float],
    tm_matrix: List[float],
    cmap: Tuple[
        Union[str, Dict[int, str]], Dict[str, str], str, Optional[DictionaryObject]
    ],
    orientations: Tuple[int, ...],
    output: str,
    font_size: float,
    visitor_text: Optional[Callable[[Any, Any, Any, Any, Any], None]],
    spacewidth: float,
) -> Tuple[str, str, List[float]]:
    """
    Insert a line break or a space when the text position has jumped.

    The displacement between the previous and the current combined matrix is
    compared with the scaled font size ``f``: a move of more than ``0.8 * f``
    in the line-feed direction of the orientation closes the current line,
    while a move of more than ``spacewidth * f * 15`` along the line (with
    less than ``0.3 * f`` across it) adds a space.

    Args:
        text: The text run collected so far.
        tm_prev: The combined matrix returned by the previous call.
        cm_matrix: The current transformation matrix.
        tm_matrix: The current text matrix.
        cmap: Only ``cmap[3]`` is used; it is handed to ``visitor_text``.
        orientations: Orientations (as returned by ``orient``) to process.
        output: The text already emitted.
        font_size: The current font size.
        visitor_text: Optional callback receiving each closed line.
        spacewidth: Width factor used for the space threshold.

    Returns:
        The updated ``text``, the updated ``output`` and the combined matrix
        of this call, to be passed as ``tm_prev`` next time.

    Raises:
        OrientationNotFoundError: If the orientation of the combined matrix
            is not in ``orientations``.
    """
    m = mult(tm_matrix, cm_matrix)
    orientation = orient(m)
    delta_x = m[4] - tm_prev[4]
    delta_y = m[5] - tm_prev[5]
    k = math.sqrt(abs(m[0] * m[3]) + abs(m[1] * m[2]))
    f = font_size * k
    if orientation not in orientations:
        raise OrientationNotFoundError

    # Express the displacement relative to the orientation:
    #   line_feed - signed move in the direction in which the next line lies
    #   across    - move perpendicular to the line
    #   along     - move along the line
    if orientation == 0:
        line_feed, across, along = -delta_y, delta_y, delta_x
    elif orientation == 180:
        line_feed, across, along = delta_y, delta_y, delta_x
    elif orientation == 90:
        line_feed, across, along = delta_x, delta_x, delta_y
    elif orientation == 270:
        line_feed, across, along = -delta_x, delta_x, delta_y
    else:
        return text, output, m

    # Last character of ``output + text`` without concatenating the whole
    # accumulated output on every call.
    last_char = (text or output)[-1:]
    if not last_char:
        # Nothing collected yet: there is nothing to separate.
        return text, output, m

    try:
        if line_feed > 0.8 * f:
            if last_char != "\n":
                line = text + "\n"
                output += line
                # Clear the run before notifying the visitor, so that a
                # failing visitor cannot leave the line both in ``output``
                # and in ``text``.
                text = ""
                if visitor_text is not None:
                    visitor_text(line, cm_matrix, tm_matrix, cmap[3], font_size)
        elif (
            abs(across) < f * 0.3
            and abs(along) > spacewidth * f * 15
            and last_char != " "
        ):
            text += " "
    except Exception:
        # Best effort, as before: a failure in this layout heuristic
        # (including one raised by visitor_text) must not abort extraction.
        pass
    return text, output, m
Loading