Skip to content

Commit ce9b5a5

Browse files
committed
Encode surrogates in UTF-8 even for a wide Py_UNICODE.
Implement sys.maxunicode. Explicitly wrap around upper/lower computations for wide Py_UNICODE. When decoding large characters with UTF-8, represent expected test results using the \U notation.
1 parent 236d8b7 commit ce9b5a5

5 files changed

Lines changed: 48 additions & 17 deletions

File tree

Include/unicodeobject.h

Lines changed: 3 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -274,6 +274,9 @@ extern DL_IMPORT(int) PyUnicode_GetSize(
274274
PyObject *unicode /* Unicode object */
275275
);
276276

277+
/* Get the maximum ordinal for a Unicode character. */
278+
extern DL_IMPORT(Py_UNICODE) PyUnicode_GetMax(void);
279+
277280
/* Resize an already allocated Unicode object to the new size length.
278281
279282
*unicode is modified to point to the new (resized) object and 0

Lib/test/test_unicode.py

Lines changed: 2 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -386,9 +386,9 @@ def test_fixup(s):
386386
''.join((chr(0xf0), chr(0xa3), chr(0x91), chr(0x96))) )
387387
# UTF-8 specific decoding tests
388388
verify(unicode(''.join((chr(0xf0), chr(0xa3), chr(0x91), chr(0x96))),
389-
'utf-8') == u'\ud84d\udc56' )
389+
'utf-8') == u'\U00023456' )
390390
verify(unicode(''.join((chr(0xf0), chr(0x90), chr(0x80), chr(0x82))),
391-
'utf-8') == u'\ud800\udc02' )
391+
'utf-8') == u'\U00010002' )
392392
verify(unicode(''.join((chr(0xe2), chr(0x82), chr(0xac))),
393393
'utf-8') == u'\u20ac' )
394394

Objects/unicodectype.c

Lines changed: 27 additions & 8 deletions
Original file line number | Diff line number | Diff line change
@@ -59,14 +59,21 @@ int _PyUnicode_IsLinebreak(register const Py_UNICODE ch)
5959
/* Returns the titlecase Unicode characters corresponding to ch or just
6060
ch if no titlecase mapping is known. */
6161

62-
Py_UNICODE _PyUnicode_ToTitlecase(register const Py_UNICODE ch)
62+
Py_UNICODE _PyUnicode_ToTitlecase(register Py_UNICODE ch)
6363
{
6464
const _PyUnicode_TypeRecord *ctype = gettyperecord(ch);
6565

6666
if (ctype->title)
67-
return ch + ctype->title;
68-
69-
return ch + ctype->upper;
67+
ch += ctype->title;
68+
else
69+
ch += ctype->upper;
70+
71+
#ifdef USE_UCS4_STORAGE
72+
/* The database assumes that the values wrap around at 0x10000. */
73+
if (ch > 0x10000)
74+
ch -= 0x10000;
75+
#endif
76+
return ch;
7077
}
7178

7279
/* Returns 1 for Unicode characters having the category 'Lt', 0
@@ -348,21 +355,33 @@ int _PyUnicode_IsUppercase(register const Py_UNICODE ch)
348355
/* Returns the uppercase Unicode characters corresponding to ch or just
349356
ch if no uppercase mapping is known. */
350357

351-
Py_UNICODE _PyUnicode_ToUppercase(register const Py_UNICODE ch)
358+
Py_UNICODE _PyUnicode_ToUppercase(register Py_UNICODE ch)
352359
{
353360
const _PyUnicode_TypeRecord *ctype = gettyperecord(ch);
354361

355-
return ch + ctype->upper;
362+
ch += ctype->upper;
363+
#ifdef USE_UCS4_STORAGE
364+
/* The database assumes that the values wrap around at 0x10000. */
365+
if (ch > 0x10000)
366+
ch -= 0x10000;
367+
#endif
368+
return ch;
356369
}
357370

358371
/* Returns the lowercase Unicode characters corresponding to ch or just
359372
ch if no lowercase mapping is known. */
360373

361-
Py_UNICODE _PyUnicode_ToLowercase(register const Py_UNICODE ch)
374+
Py_UNICODE _PyUnicode_ToLowercase(register Py_UNICODE ch)
362375
{
363376
const _PyUnicode_TypeRecord *ctype = gettyperecord(ch);
364377

365-
return ch + ctype->lower;
378+
ch += ctype->lower;
379+
#ifdef USE_UCS4_STORAGE
380+
/* The database assumes that the values wrap around at 0x10000. */
381+
if (ch > 0x10000)
382+
ch -= 0x10000;
383+
#endif
384+
return ch;
366385
}
367386

368387
/* Returns 1 for Unicode characters having the category 'Ll', 'Lu', 'Lt',

Objects/unicodeobject.c

Lines changed: 12 additions & 7 deletions
Original file line number | Diff line number | Diff line change
@@ -103,6 +103,18 @@ static PyUnicodeObject *unicode_latin1[256];
103103
*/
104104
static char unicode_default_encoding[100];
105105

106+
Py_UNICODE
107+
PyUnicode_GetMax()
108+
{
109+
#ifdef USE_UCS4_STORAGE
110+
return 0x10FFFF;
111+
#else
112+
/* This is actually an illegal character, so it should
113+
not be passed to unichr. */
114+
return 0xFFFF;
115+
#endif
116+
}
117+
106118
/* --- Unicode Object ----------------------------------------------------- */
107119

108120
static
@@ -884,12 +896,6 @@ PyObject *PyUnicode_EncodeUTF8(const Py_UNICODE *s,
884896
cbWritten += 2;
885897
}
886898
else if (ch < 0x10000) {
887-
#if Py_UNICODE_SIZE == 4
888-
*p++ = 0xe0 | (ch>>12);
889-
*p++ = 0x80 | ((ch>>6) & 0x3f);
890-
*p++ = 0x80 | (ch & 0x3f);
891-
cbWritten += 3;
892-
#else
893899
/* Check for high surrogate */
894900
if (0xD800 <= ch && ch <= 0xDBFF) {
895901
if (i != size) {
@@ -920,7 +926,6 @@ PyObject *PyUnicode_EncodeUTF8(const Py_UNICODE *s,
920926
}
921927
*p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
922928
*p++ = (char)(0x80 | (ch & 0x3f));
923-
#endif
924929
} else {
925930
*p++ = 0xf0 | (ch>>18);
926931
*p++ = 0x80 | ((ch>>12) & 0x3f);

Python/sysmodule.c

Lines changed: 4 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -533,6 +533,7 @@ exc_traceback -- traceback of exception currently being handled\n\
533533
Static objects:\n\
534534
\n\
535535
maxint -- the largest supported integer (the smallest is -maxint-1)\n\
536+
maxunicode -- the largest supported character\n\
536537
builtin_module_names -- tuple of module names built into this intepreter\n\
537538
version -- the version of this interpreter as a string\n\
538539
version_info -- version information as a tuple\n\
@@ -643,6 +644,9 @@ _PySys_Init(void)
643644
PyDict_SetItemString(sysdict, "maxint",
644645
v = PyInt_FromLong(PyInt_GetMax()));
645646
Py_XDECREF(v);
647+
PyDict_SetItemString(sysdict, "maxunicode",
648+
v = PyInt_FromLong(PyUnicode_GetMax()));
649+
Py_XDECREF(v);
646650
PyDict_SetItemString(sysdict, "builtin_module_names",
647651
v = list_builtin_module_names());
648652
Py_XDECREF(v);

0 commit comments

Comments
 (0)