Skip to content

Commit 76a31a6

Browse files
author
Victor Stinner
committed
Cleanup decode_code_page_stateful() and encode_code_page()
* Fix decode_code_page_errors() result
* Inline decode_code_page() and encode_code_page_chunk()
* Replace the PyUnicodeObject type by PyObject
1 parent a9e7364 commit 76a31a6

2 files changed

Lines changed: 78 additions & 111 deletions

File tree

Lib/test/test_codecs.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1980,6 +1980,9 @@ def test_multibyte_encoding(self):
19801980
))
19811981

19821982
def test_incremental(self):
1983+
decoded = codecs.code_page_decode(932, b'\x82', 'strict', False)
1984+
self.assertEqual(decoded, ('', 0))
1985+
19831986
decoded = codecs.code_page_decode(932,
19841987
b'\xe9\x80\xe9', 'strict',
19851988
False)

Objects/unicodeobject.c

Lines changed: 75 additions & 111 deletions
Original file line numberDiff line numberDiff line change
@@ -7006,7 +7006,7 @@ decode_code_page_flags(UINT code_page)
70067006
*/
70077007
static int
70087008
decode_code_page_strict(UINT code_page,
7009-
PyUnicodeObject **v,
7009+
PyObject **v,
70107010
const char *in,
70117011
int insize)
70127012
{
@@ -7022,15 +7022,15 @@ decode_code_page_strict(UINT code_page,
70227022

70237023
if (*v == NULL) {
70247024
/* Create unicode object */
7025-
*v = _PyUnicode_New(outsize);
7025+
*v = (PyObject*)_PyUnicode_New(outsize);
70267026
if (*v == NULL)
70277027
return -1;
70287028
out = PyUnicode_AS_UNICODE(*v);
70297029
}
70307030
else {
70317031
/* Extend unicode object */
70327032
Py_ssize_t n = PyUnicode_GET_SIZE(*v);
7033-
if (PyUnicode_Resize((PyObject**)v, n + outsize) < 0)
7033+
if (PyUnicode_Resize(v, n + outsize) < 0)
70347034
return -1;
70357035
out = PyUnicode_AS_UNICODE(*v) + n;
70367036
}
@@ -7057,9 +7057,8 @@ decode_code_page_strict(UINT code_page,
70577057
*/
70587058
static int
70597059
decode_code_page_errors(UINT code_page,
7060-
PyUnicodeObject **v,
7061-
const char *in,
7062-
int size,
7060+
PyObject **v,
7061+
const char *in, const int size,
70637062
const char *errors)
70647063
{
70657064
const char *startin = in;
@@ -7103,7 +7102,7 @@ decode_code_page_errors(UINT code_page,
71037102
PyErr_NoMemory();
71047103
goto error;
71057104
}
7106-
*v = _PyUnicode_New(size * Py_ARRAY_LENGTH(buffer));
7105+
*v = (PyObject*)_PyUnicode_New(size * Py_ARRAY_LENGTH(buffer));
71077106
if (*v == NULL)
71087107
goto error;
71097108
startout = PyUnicode_AS_UNICODE(*v);
@@ -7115,7 +7114,7 @@ decode_code_page_errors(UINT code_page,
71157114
PyErr_NoMemory();
71167115
goto error;
71177116
}
7118-
if (PyUnicode_Resize((PyObject**)v, n + size * Py_ARRAY_LENGTH(buffer)) < 0)
7117+
if (PyUnicode_Resize(v, n + size * Py_ARRAY_LENGTH(buffer)) < 0)
71197118
goto error;
71207119
startout = PyUnicode_AS_UNICODE(*v) + n;
71217120
}
@@ -7173,9 +7172,9 @@ decode_code_page_errors(UINT code_page,
71737172
/* Extend unicode object */
71747173
outsize = out - startout;
71757174
assert(outsize <= PyUnicode_WSTR_LENGTH(*v));
7176-
if (PyUnicode_Resize((PyObject**)v, outsize) < 0)
7175+
if (PyUnicode_Resize(v, outsize) < 0)
71777176
goto error;
7178-
ret = 0;
7177+
ret = size;
71797178

71807179
error:
71817180
Py_XDECREF(encoding_obj);
@@ -7184,50 +7183,13 @@ decode_code_page_errors(UINT code_page,
71847183
return ret;
71857184
}
71867185

7187-
/*
7188-
* Decode a byte string from a Windows code page into unicode object. If
7189-
* 'final' is set, converts trailing lead-byte too.
7190-
*
7191-
* Returns consumed size if succeed, or raise a WindowsError or
7192-
* UnicodeDecodeError exception and returns -1 on error.
7193-
*/
7194-
static int
7195-
decode_code_page(UINT code_page,
7196-
PyUnicodeObject **v,
7197-
const char *s, int size,
7198-
int final, const char *errors)
7199-
{
7200-
int done;
7201-
7202-
/* Skip trailing lead-byte unless 'final' is set */
7203-
if (size == 0) {
7204-
if (*v == NULL) {
7205-
Py_INCREF(unicode_empty);
7206-
*v = (PyUnicodeObject*)unicode_empty;
7207-
if (*v == NULL)
7208-
return -1;
7209-
}
7210-
return 0;
7211-
}
7212-
7213-
if (!final && is_dbcs_lead_byte(code_page, s, size - 1))
7214-
--size;
7215-
7216-
done = decode_code_page_strict(code_page, v, s, size);
7217-
if (done == -2)
7218-
done = decode_code_page_errors(code_page, v, s, size, errors);
7219-
return done;
7220-
}
7221-
72227186
static PyObject *
72237187
decode_code_page_stateful(int code_page,
7224-
const char *s,
7225-
Py_ssize_t size,
7226-
const char *errors,
7227-
Py_ssize_t *consumed)
7188+
const char *s, Py_ssize_t size,
7189+
const char *errors, Py_ssize_t *consumed)
72287190
{
7229-
PyUnicodeObject *v = NULL;
7230-
int done;
7191+
PyObject *v = NULL;
7192+
int chunk_size, final, converted, done;
72317193

72327194
if (code_page < 0) {
72337195
PyErr_SetString(PyExc_ValueError, "invalid code page number");
@@ -7237,29 +7199,53 @@ decode_code_page_stateful(int code_page,
72377199
if (consumed)
72387200
*consumed = 0;
72397201

7202+
do
7203+
{
72407204
#ifdef NEED_RETRY
7241-
retry:
7242-
if (size > INT_MAX)
7243-
done = decode_code_page(code_page, &v, s, INT_MAX, 0, errors);
7244-
else
7205+
if (size > INT_MAX) {
7206+
chunk_size = INT_MAX;
7207+
final = 0;
7208+
done = 0;
7209+
}
7210+
else
72457211
#endif
7246-
done = decode_code_page(code_page, &v, s, (int)size, !consumed, errors);
7212+
{
7213+
chunk_size = (int)size;
7214+
final = (consumed == NULL);
7215+
done = 1;
7216+
}
72477217

7248-
if (done < 0) {
7249-
Py_XDECREF(v);
7250-
return NULL;
7251-
}
7218+
/* Skip trailing lead-byte unless 'final' is set */
7219+
if (!final && is_dbcs_lead_byte(code_page, s, chunk_size - 1))
7220+
--chunk_size;
72527221

7253-
if (consumed)
7254-
*consumed += done;
7222+
if (chunk_size == 0 && done) {
7223+
if (v != NULL)
7224+
break;
7225+
Py_INCREF(unicode_empty);
7226+
return unicode_empty;
7227+
}
72557228

7256-
#ifdef NEED_RETRY
7257-
if (size > INT_MAX) {
7258-
s += done;
7259-
size -= done;
7260-
goto retry;
7261-
}
7262-
#endif
7229+
7230+
converted = decode_code_page_strict(code_page, &v,
7231+
s, chunk_size);
7232+
if (converted == -2)
7233+
converted = decode_code_page_errors(code_page, &v,
7234+
s, chunk_size,
7235+
errors);
7236+
assert(converted != 0);
7237+
7238+
if (converted < 0) {
7239+
Py_XDECREF(v);
7240+
return NULL;
7241+
}
7242+
7243+
if (consumed)
7244+
*consumed += converted;
7245+
7246+
s += converted;
7247+
size -= converted;
7248+
} while (!done);
72637249

72647250
#ifndef DONT_MAKE_RESULT_READY
72657251
if (_PyUnicode_READY_REPLACE(&v)) {
@@ -7268,7 +7254,7 @@ decode_code_page_stateful(int code_page,
72687254
}
72697255
#endif
72707256
assert(_PyUnicode_CheckConsistency(v, 1));
7271-
return (PyObject *)v;
7257+
return v;
72727258
}
72737259

72747260
PyObject *
@@ -7583,40 +7569,6 @@ encode_code_page_errors(UINT code_page, PyObject **outbytes,
75837569
return ret;
75847570
}
75857571

7586-
/*
7587-
* Encode a Unicode string to a Windows code page into a byte string.
7588-
*
7589-
* Returns consumed characters if succeed, or raise a WindowsError and returns
7590-
* -1 on other error.
7591-
*/
7592-
static int
7593-
encode_code_page_chunk(UINT code_page, PyObject **outbytes,
7594-
PyObject *unicode, Py_ssize_t unicode_offset,
7595-
const Py_UNICODE *p, int size,
7596-
const char* errors)
7597-
{
7598-
int done;
7599-
7600-
if (size == 0) {
7601-
if (*outbytes == NULL) {
7602-
*outbytes = PyBytes_FromStringAndSize(NULL, 0);
7603-
if (*outbytes == NULL)
7604-
return -1;
7605-
}
7606-
return 0;
7607-
}
7608-
7609-
done = encode_code_page_strict(code_page, outbytes,
7610-
p, size,
7611-
errors);
7612-
if (done == -2)
7613-
done = encode_code_page_errors(code_page, outbytes,
7614-
unicode, unicode_offset,
7615-
p, size,
7616-
errors);
7617-
return done;
7618-
}
7619-
76207572
static PyObject *
76217573
encode_code_page(int code_page,
76227574
PyObject *unicode,
@@ -7626,7 +7578,7 @@ encode_code_page(int code_page,
76267578
Py_ssize_t size;
76277579
PyObject *outbytes = NULL;
76287580
Py_ssize_t offset;
7629-
int chunk_len, ret;
7581+
int chunk_len, ret, done;
76307582

76317583
p = PyUnicode_AsUnicodeAndSize(unicode, &size);
76327584
if (p == NULL)
@@ -7637,20 +7589,32 @@ encode_code_page(int code_page,
76377589
return NULL;
76387590
}
76397591

7592+
if (size == 0)
7593+
return PyBytes_FromStringAndSize(NULL, 0);
7594+
76407595
offset = 0;
76417596
do
76427597
{
76437598
#ifdef NEED_RETRY
7644-
if (size > INT_MAX)
7599+
if (size > INT_MAX) {
76457600
chunk_len = INT_MAX;
7601+
done = 0;
7602+
}
76467603
else
76477604
#endif
7605+
{
76487606
chunk_len = (int)size;
7649-
ret = encode_code_page_chunk(code_page, &outbytes,
7650-
unicode, offset,
7651-
p, chunk_len,
7652-
errors);
7607+
done = 1;
7608+
}
76537609

7610+
ret = encode_code_page_strict(code_page, &outbytes,
7611+
p, chunk_len,
7612+
errors);
7613+
if (ret == -2)
7614+
ret = encode_code_page_errors(code_page, &outbytes,
7615+
unicode, offset,
7616+
p, chunk_len,
7617+
errors);
76547618
if (ret < 0) {
76557619
Py_XDECREF(outbytes);
76567620
return NULL;
@@ -7659,7 +7623,7 @@ encode_code_page(int code_page,
76597623
p += chunk_len;
76607624
offset += chunk_len;
76617625
size -= chunk_len;
7662-
} while (size != 0);
7626+
} while (!done);
76637627

76647628
return outbytes;
76657629
}

0 commit comments

Comments
 (0)