Skip to content

Commit 974ed7c

Browse files
committed
- SF #962502: Add two more methods for unicode type; width() and
iswide() for east asian width manipulation. (Inspired by David Goodger, Reviewed by Martin v. Loewis) - Move _PyUnicode_TypeRecord.flags to the end of the struct so that no padding is added for UCS-4 builds. (Suggested by Martin v. Loewis)
1 parent b6568b9 commit 974ed7c

11 files changed

Lines changed: 655 additions & 431 deletions

File tree

Doc/api/concrete.tex

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -850,6 +850,11 @@ \subsection{Unicode Objects \label{unicodeObjects}}
850850
character.
851851
\end{cfuncdesc}
852852

853+
\begin{cfuncdesc}{int}{Py_UNICODE_ISWIDE}{Py_UNICODE ch}
854+
Returns 1/0 depending on whether \var{ch} is a wide or full-width
855+
character.
856+
\end{cfuncdesc}
857+
853858
These APIs can be used for fast direct character conversions:
854859

855860
\begin{cfuncdesc}{Py_UNICODE}{Py_UNICODE_TOLOWER}{Py_UNICODE ch}
@@ -908,6 +913,10 @@ \subsection{Unicode Objects \label{unicodeObjects}}
908913
Return the length of the Unicode object.
909914
\end{cfuncdesc}
910915

916+
\begin{cfuncdesc}{int}{PyUnicode_GetWidth}{PyObject *unicode}
917+
Return the fixed-width representation length of the Unicode object.
918+
\end{cfuncdesc}
919+
911920
\begin{cfuncdesc}{PyObject*}{PyUnicode_FromEncodedObject}{PyObject *obj,
912921
const char *encoding,
913922
const char *errors}

Doc/lib/libstdtypes.tex

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -642,6 +642,12 @@ \subsubsection{String Methods \label{string-methods}}
642642
there is at least one cased character, false otherwise.
643643
\end{methoddesc}
644644

645+
\begin{methoddesc}[string]{iswide}{}
646+
Return true if all characters in the string are wide or full width and
647+
there is at least one wide or full width character, false otherwise.
648+
This method is supported by unicode type only.
649+
\end{methoddesc}
650+
645651
\begin{methoddesc}[string]{join}{seq}
646652
Return a string which is the concatenation of the strings in the
647653
sequence \var{seq}. The separator between elements is the string
@@ -774,6 +780,11 @@ \subsubsection{String Methods \label{string-methods}}
774780
Return a copy of the string converted to uppercase.
775781
\end{methoddesc}
776782

783+
\begin{methoddesc}[string]{width}{}
784+
Return length of fixed-width representation of the string. This method
785+
is supported by unicode type only.
786+
\end{methoddesc}
787+
777788
\begin{methoddesc}[string]{zfill}{width}
778789
Return the numeric string left filled with zeros in a string
779790
of length \var{width}. The original string is returned if

Include/unicodeobject.h

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -180,6 +180,7 @@ typedef PY_UNICODE_TYPE Py_UNICODE;
180180
# define PyUnicode_GetDefaultEncoding PyUnicodeUCS2_GetDefaultEncoding
181181
# define PyUnicode_GetMax PyUnicodeUCS2_GetMax
182182
# define PyUnicode_GetSize PyUnicodeUCS2_GetSize
183+
# define PyUnicode_GetWidth PyUnicodeUCS2_GetWidth
183184
# define PyUnicode_Join PyUnicodeUCS2_Join
184185
# define PyUnicode_Replace PyUnicodeUCS2_Replace
185186
# define PyUnicode_Resize PyUnicodeUCS2_Resize
@@ -199,6 +200,7 @@ typedef PY_UNICODE_TYPE Py_UNICODE;
199200
# define _PyUnicode_IsLinebreak _PyUnicodeUCS2_IsLinebreak
200201
# define _PyUnicode_IsLowercase _PyUnicodeUCS2_IsLowercase
201202
# define _PyUnicode_IsNumeric _PyUnicodeUCS2_IsNumeric
203+
# define _PyUnicode_IsWide _PyUnicodeUCS2_IsWide
202204
# define _PyUnicode_IsTitlecase _PyUnicodeUCS2_IsTitlecase
203205
# define _PyUnicode_IsUppercase _PyUnicodeUCS2_IsUppercase
204206
# define _PyUnicode_IsWhitespace _PyUnicodeUCS2_IsWhitespace
@@ -252,6 +254,7 @@ typedef PY_UNICODE_TYPE Py_UNICODE;
252254
# define PyUnicode_GetDefaultEncoding PyUnicodeUCS4_GetDefaultEncoding
253255
# define PyUnicode_GetMax PyUnicodeUCS4_GetMax
254256
# define PyUnicode_GetSize PyUnicodeUCS4_GetSize
257+
# define PyUnicode_GetWidth PyUnicodeUCS4_GetWidth
255258
# define PyUnicode_Join PyUnicodeUCS4_Join
256259
# define PyUnicode_Replace PyUnicodeUCS4_Replace
257260
# define PyUnicode_Resize PyUnicodeUCS4_Resize
@@ -270,6 +273,7 @@ typedef PY_UNICODE_TYPE Py_UNICODE;
270273
# define _PyUnicode_IsLinebreak _PyUnicodeUCS4_IsLinebreak
271274
# define _PyUnicode_IsLowercase _PyUnicodeUCS4_IsLowercase
272275
# define _PyUnicode_IsNumeric _PyUnicodeUCS4_IsNumeric
276+
# define _PyUnicode_IsWide _PyUnicodeUCS4_IsWide
273277
# define _PyUnicode_IsTitlecase _PyUnicodeUCS4_IsTitlecase
274278
# define _PyUnicode_IsUppercase _PyUnicodeUCS4_IsUppercase
275279
# define _PyUnicode_IsWhitespace _PyUnicodeUCS4_IsWhitespace
@@ -315,6 +319,8 @@ typedef PY_UNICODE_TYPE Py_UNICODE;
315319

316320
#define Py_UNICODE_ISALPHA(ch) iswalpha(ch)
317321

322+
#define Py_UNICODE_ISWIDE(ch) _PyUnicode_IsWide(ch)
323+
318324
#else
319325

320326
#define Py_UNICODE_ISSPACE(ch) _PyUnicode_IsWhitespace(ch)
@@ -338,6 +344,8 @@ typedef PY_UNICODE_TYPE Py_UNICODE;
338344

339345
#define Py_UNICODE_ISALPHA(ch) _PyUnicode_IsAlpha(ch)
340346

347+
#define Py_UNICODE_ISWIDE(ch) _PyUnicode_IsWide(ch)
348+
341349
#endif
342350

343351
#define Py_UNICODE_ISALNUM(ch) \
@@ -430,6 +438,12 @@ PyAPI_FUNC(int) PyUnicode_GetSize(
430438
PyObject *unicode /* Unicode object */
431439
);
432440

441+
/* Get the fixed-width representation length of the Unicode object */
442+
443+
PyAPI_FUNC(int) PyUnicode_GetWidth(
444+
PyObject *unicode /* Unicode object */
445+
);
446+
433447
/* Get the maximum ordinal for a Unicode character. */
434448
PyAPI_FUNC(Py_UNICODE) PyUnicode_GetMax(void);
435449

@@ -1151,6 +1165,10 @@ PyAPI_FUNC(int) _PyUnicode_IsAlpha(
11511165
Py_UNICODE ch /* Unicode character */
11521166
);
11531167

1168+
PyAPI_FUNC(int) _PyUnicode_IsWide(
1169+
Py_UNICODE ch /* Unicode character */
1170+
);
1171+
11541172
#ifdef __cplusplus
11551173
}
11561174
#endif

Lib/test/test_unicode.py

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -291,6 +291,26 @@ def test_isnumeric(self):
291291

292292
self.assertRaises(TypeError, u"abc".isnumeric, 42)
293293

294+
def test_iswide(self):
    # Each entry is (expected iswide() result, input string); the trailing
    # comment names the East Asian Width class the sample is meant to cover.
    cases = [
        (False, u''),
        (False, u'\x1f'),                # Neutral
        (False, u'\x20'),                # Narrow
        (True, u'\u2329'),               # Wide
        (False, u'\uff64'),              # Half
        (True, u'\u3000'),               # Full
        (False, u'\u2460'),              # Ambiguous
        (True, u'\ud55c\uae00'),
        (False, u'\ud55c\u2606\uae00'),
    ]
    for expected, text in cases:
        self.checkequalnofix(expected, text, 'iswide')
304+
305+
def test_wide(self):
    # Exercises unicode.width(): each entry is (input string, expected width).
    expectations = [
        (u'', 0),
        (u'abcd', 4),
        (u'\u0187\u01c9', 2),
        (u'\u2460\u2329', 3),
        (u'\u2329\u2460', 3),
        (u'\ud55c\uae00', 4),
        (u'\ud55c\u2606\uae00', 5),
    ]
    for text, expected in expectations:
        self.assertEqual(text.width(), expected)
313+
294314
def test_contains(self):
295315
# Testing Unicode contains method
296316
self.assert_('a' in u'abdb')

Misc/NEWS

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,9 @@ What's New in Python 2.4 alpha 1?
1212
Core and builtins
1313
-----------------
1414

15+
- The unicode type gained two new methods: iswide() and width(). They
16+
report East Asian Width information as defined in Unicode TR11.
17+
1518
- Improved the tuple hashing algorithm to give fewer collisions in
1619
common cases. Fixes bug #942952.
1720

Modules/unicodedata_db.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
/* this file was generated by Tools/unicode/makeunicodedata.py 2.2 */
1+
/* this file was generated by Tools/unicode/makeunicodedata.py 2.3 */
22

33
#define UNIDATA_VERSION "3.2.0"
44
/* a list of unique database records */

Modules/unicodename_db.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
/* this file was generated by Tools/unicode/makeunicodedata.py 2.2 */
1+
/* this file was generated by Tools/unicode/makeunicodedata.py 2.3 */
22

33
#define NAME_MAXLEN 256
44

Objects/unicodectype.c

Lines changed: 11 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -19,14 +19,15 @@
1919
#define SPACE_MASK 0x20
2020
#define TITLE_MASK 0x40
2121
#define UPPER_MASK 0x80
22+
#define WIDE_MASK 0x100
2223

2324
typedef struct {
24-
const unsigned short flags;
2525
const Py_UNICODE upper;
2626
const Py_UNICODE lower;
2727
const Py_UNICODE title;
2828
const unsigned char decimal;
2929
const unsigned char digit;
30+
const unsigned short flags;
3031
} _PyUnicode_TypeRecord;
3132

3233
#include "unicodetype_db.h"
@@ -322,6 +323,15 @@ int _PyUnicode_IsNumeric(Py_UNICODE ch)
322323
return 1;
323324
}
324325

326+
/* Returns 1 for Unicode characters having Full or Wide width, 0 otherwise */
327+
328+
int _PyUnicode_IsWide(Py_UNICODE ch)
329+
{
330+
const _PyUnicode_TypeRecord *ctype = gettyperecord(ch);
331+
332+
return (ctype->flags & WIDE_MASK) != 0;
333+
}
334+
325335
#ifndef WANT_WCTYPE_FUNCTIONS
326336

327337
/* Returns 1 for Unicode characters having the bidirectional type

Objects/unicodeobject.c

Lines changed: 68 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -655,6 +655,27 @@ int PyUnicode_GetSize(PyObject *unicode)
655655
return -1;
656656
}
657657

658+
/* Return the fixed-width representation length of a Unicode object: every
   character for which Py_UNICODE_ISWIDE() is true counts as 2, every other
   character counts as 1.  If the argument is not a Unicode object,
   PyErr_BadArgument() is called and -1 is returned. */
int PyUnicode_GetWidth(PyObject *unicode)
{
    const Py_UNICODE *cur, *end;
    int total = 0;

    if (!PyUnicode_Check(unicode)) {
        PyErr_BadArgument();
        return -1;
    }

    cur = PyUnicode_AS_UNICODE(unicode);
    end = cur + PyUnicode_GET_SIZE(unicode);
    while (cur < end) {
        total += Py_UNICODE_ISWIDE(*cur) ? 2 : 1;
        cur++;
    }

    return total;
}
678+
658679
const char *PyUnicode_GetDefaultEncoding(void)
659680
{
660681
return unicode_default_encoding;
@@ -5316,6 +5337,35 @@ unicode_isnumeric(PyUnicodeObject *self)
53165337
return PyBool_FromLong(1);
53175338
}
53185339

5340+
PyDoc_STRVAR(iswide__doc__,
5341+
"S.iswide() -> bool\n\
5342+
\n\
5343+
Return True if all characters in S are wide width\n\
5344+
and there is at least one character in S, False otherwise.");
5345+
5346+
static PyObject*
5347+
unicode_iswide(PyUnicodeObject *self)
5348+
{
5349+
register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
5350+
register const Py_UNICODE *e;
5351+
5352+
/* Shortcut for single character strings */
5353+
if (PyUnicode_GET_SIZE(self) == 1 &&
5354+
Py_UNICODE_ISWIDE(*p))
5355+
Py_RETURN_TRUE;
5356+
5357+
/* Special case for empty strings */
5358+
if (PyString_GET_SIZE(self) == 0)
5359+
Py_RETURN_FALSE;
5360+
5361+
e = p + PyUnicode_GET_SIZE(self);
5362+
for (; p < e; p++) {
5363+
if (!Py_UNICODE_ISWIDE(*p))
5364+
Py_RETURN_FALSE;
5365+
}
5366+
Py_RETURN_TRUE;
5367+
}
5368+
53195369
PyDoc_STRVAR(join__doc__,
53205370
"S.join(sequence) -> unicode\n\
53215371
\n\
@@ -5335,7 +5385,7 @@ unicode_length(PyUnicodeObject *self)
53355385
}
53365386

53375387
PyDoc_STRVAR(ljust__doc__,
5338-
"S.ljust(width[, fillchar]) -> unicode\n\
5388+
"S.ljust(width[, fillchar]) -> int\n\
53395389
\n\
53405390
Return S left justified in a Unicode string of length width. Padding is\n\
53415391
done using the specified fill character (default is a space).");
@@ -5927,6 +5977,21 @@ unicode_upper(PyUnicodeObject *self)
59275977
return fixup(self, fixupper);
59285978
}
59295979

5980+
PyDoc_STRVAR(width__doc__,
5981+
"S.width() -> unicode\n\
5982+
\n\
5983+
Return a fixed-width representation length of S.");
5984+
5985+
/* Implementation of unicode.width(): calls PyUnicode_GetWidth() and wraps
   the result with PyInt_FromLong(); returns NULL when PyUnicode_GetWidth()
   reports -1. */
static PyObject*
unicode_width(PyObject *self)
{
    const int columns = PyUnicode_GetWidth(self);

    if (columns != -1)
        return PyInt_FromLong((long)columns);
    return NULL;
}
5994+
59305995
PyDoc_STRVAR(zfill__doc__,
59315996
"S.zfill(width) -> unicode\n\
59325997
\n\
@@ -6090,6 +6155,8 @@ static PyMethodDef unicode_methods[] = {
60906155
{"isnumeric", (PyCFunction) unicode_isnumeric, METH_NOARGS, isnumeric__doc__},
60916156
{"isalpha", (PyCFunction) unicode_isalpha, METH_NOARGS, isalpha__doc__},
60926157
{"isalnum", (PyCFunction) unicode_isalnum, METH_NOARGS, isalnum__doc__},
6158+
{"iswide", (PyCFunction) unicode_iswide, METH_NOARGS, iswide__doc__},
6159+
{"width", (PyCFunction) unicode_width, METH_NOARGS, width__doc__},
60936160
{"zfill", (PyCFunction) unicode_zfill, METH_VARARGS, zfill__doc__},
60946161
#if 0
60956162
{"capwords", (PyCFunction) unicode_capwords, METH_NOARGS, capwords__doc__},

0 commit comments

Comments
 (0)