Skip to content

Commit dde4c79

Browse files
authored
ENH: Extend cmap compatibility to GBK_EUC_H/V (#1812)
Fixes #1809
1 parent 8e343c1 commit dde4c79

File tree

2 files changed

+11
-1
lines changed

2 files changed

+11
-1
lines changed

pypdf/_cmap.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -93,6 +93,8 @@ def build_char_map(
9393
"/GB-EUC-V": "gbk", # TBC
9494
"/GBpc-EUC-H": "gb2312", # TBC
9595
"/GBpc-EUC-V": "gb2312", # TBC
96+
"/GBK-EUC-H": "gbk", # TBC
97+
"/GBK-EUC-V": "gbk", # TBC
9698
# UCS2 in code
9799
}
98100

tests/test_cmap.py

Lines changed: 9 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -135,9 +135,17 @@ def test_iss1533():
135135

136136

137137
@pytest.mark.enable_socket()
138-
def test_ucs2(caplog):
138+
def test_ucs2_gbk(caplog):
139139
url = "https://github.com/py-pdf/pypdf/files/11190189/pdf_font_garbled.pdf"
140140
name = "tstUCS2.pdf"
141141
reader = PdfReader(BytesIO(get_pdf_from_url(url, name=name)))
142142
reader.pages[1].extract_text() # no error
143143
assert caplog.text == ""
144+
# iss 1809
145+
url = "https://github.com/py-pdf/pypdf/files/11315397/3.pdf"
146+
name = "tst-GBK_EUC.pdf"
147+
reader = PdfReader(BytesIO(get_pdf_from_url(url, name=name)))
148+
t = reader.pages[0].extract_text()
149+
assert "NJA" in t
150+
assert "中华男科学杂志" in t
151+
# No `assert caplog.text == ""` here: a duplicate field is confirmed in page 0, so caplog is not checked

0 commit comments

Comments
 (0)