bpo-45467: Fix IncrementalDecoder and StreamReader in the "raw-unicode-escape" codec (GH-28944)

They now support splitting escape sequences between input chunks.

Add a third parameter, "final", to codecs.raw_unicode_escape_decode().
It is True by default to match the former behavior.
This commit is contained in:
Serhiy Storchaka 2021-10-14 20:04:19 +03:00 committed by GitHub
parent d413c50363
commit 39aa98346d
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
7 changed files with 116 additions and 35 deletions

View file

@ -796,6 +796,16 @@ PyAPI_FUNC(PyObject*) _PyUnicode_DecodeUnicodeEscapeInternal(
string. */ string. */
); );
/* --- Raw-Unicode-Escape Codecs ---------------------------------------------- */
/* Variant of PyUnicode_DecodeRawUnicodeEscape that supports partial decoding. */
PyAPI_FUNC(PyObject*) _PyUnicode_DecodeRawUnicodeEscapeStateful(
const char *string, /* Unicode-Escape encoded string */
Py_ssize_t length, /* size of string */
const char *errors, /* error handling */
Py_ssize_t *consumed /* bytes consumed */
);
/* --- Latin-1 Codecs ----------------------------------------------------- */ /* --- Latin-1 Codecs ----------------------------------------------------- */
PyAPI_FUNC(PyObject*) _PyUnicode_AsLatin1String( PyAPI_FUNC(PyObject*) _PyUnicode_AsLatin1String(

View file

@ -21,15 +21,16 @@ class IncrementalEncoder(codecs.IncrementalEncoder):
def encode(self, input, final=False): def encode(self, input, final=False):
return codecs.raw_unicode_escape_encode(input, self.errors)[0] return codecs.raw_unicode_escape_encode(input, self.errors)[0]
class IncrementalDecoder(codecs.IncrementalDecoder): class IncrementalDecoder(codecs.BufferedIncrementalDecoder):
def decode(self, input, final=False): def _buffer_decode(self, input, errors, final):
return codecs.raw_unicode_escape_decode(input, self.errors)[0] return codecs.raw_unicode_escape_decode(input, errors, final)
class StreamWriter(Codec,codecs.StreamWriter): class StreamWriter(Codec,codecs.StreamWriter):
pass pass
class StreamReader(Codec,codecs.StreamReader): class StreamReader(Codec,codecs.StreamReader):
pass def decode(self, input, errors='strict'):
return codecs.raw_unicode_escape_decode(input, errors, False)
### encodings module API ### encodings module API

View file

@ -2483,7 +2483,11 @@ def test_partial(self):
] ]
) )
class RawUnicodeEscapeTest(unittest.TestCase): class RawUnicodeEscapeTest(ReadTest, unittest.TestCase):
encoding = "raw-unicode-escape"
test_lone_surrogates = None
def test_empty(self): def test_empty(self):
self.assertEqual(codecs.raw_unicode_escape_encode(""), (b"", 0)) self.assertEqual(codecs.raw_unicode_escape_encode(""), (b"", 0))
self.assertEqual(codecs.raw_unicode_escape_decode(b""), ("", 0)) self.assertEqual(codecs.raw_unicode_escape_decode(b""), ("", 0))
@ -2532,6 +2536,35 @@ def test_decode_errors(self):
self.assertEqual(decode(br"\U00110000", "ignore"), ("", 10)) self.assertEqual(decode(br"\U00110000", "ignore"), ("", 10))
self.assertEqual(decode(br"\U00110000", "replace"), ("\ufffd", 10)) self.assertEqual(decode(br"\U00110000", "replace"), ("\ufffd", 10))
def test_partial(self):
self.check_partial(
"\x00\t\n\r\\\xff\uffff\U00010000",
[
'\x00',
'\x00\t',
'\x00\t\n',
'\x00\t\n\r',
'\x00\t\n\r',
'\x00\t\n\r\\\xff',
'\x00\t\n\r\\\xff',
'\x00\t\n\r\\\xff',
'\x00\t\n\r\\\xff',
'\x00\t\n\r\\\xff',
'\x00\t\n\r\\\xff',
'\x00\t\n\r\\\xff\uffff',
'\x00\t\n\r\\\xff\uffff',
'\x00\t\n\r\\\xff\uffff',
'\x00\t\n\r\\\xff\uffff',
'\x00\t\n\r\\\xff\uffff',
'\x00\t\n\r\\\xff\uffff',
'\x00\t\n\r\\\xff\uffff',
'\x00\t\n\r\\\xff\uffff',
'\x00\t\n\r\\\xff\uffff',
'\x00\t\n\r\\\xff\uffff',
'\x00\t\n\r\\\xff\uffff\U00010000',
]
)
class EscapeEncodeTest(unittest.TestCase): class EscapeEncodeTest(unittest.TestCase):

View file

@ -0,0 +1,2 @@
Fix the incremental decoder and stream reader in the "raw-unicode-escape" codec.
Previously they failed if an escape sequence was split between input chunks.

View file

@ -509,17 +509,20 @@ _codecs_unicode_escape_decode_impl(PyObject *module, Py_buffer *data,
_codecs.raw_unicode_escape_decode _codecs.raw_unicode_escape_decode
data: Py_buffer(accept={str, buffer}) data: Py_buffer(accept={str, buffer})
errors: str(accept={str, NoneType}) = None errors: str(accept={str, NoneType}) = None
final: bool(accept={int}) = True
/ /
[clinic start generated code]*/ [clinic start generated code]*/
static PyObject * static PyObject *
_codecs_raw_unicode_escape_decode_impl(PyObject *module, Py_buffer *data, _codecs_raw_unicode_escape_decode_impl(PyObject *module, Py_buffer *data,
const char *errors) const char *errors, int final)
/*[clinic end generated code: output=c98eeb56028070a6 input=d2f5159ce3b3392f]*/ /*[clinic end generated code: output=11dbd96301e2879e input=2d166191beb3235a]*/
{ {
PyObject *decoded = PyUnicode_DecodeRawUnicodeEscape(data->buf, data->len, Py_ssize_t consumed = data->len;
errors); PyObject *decoded = _PyUnicode_DecodeRawUnicodeEscapeStateful(data->buf, data->len,
return codec_tuple(decoded, data->len); errors,
final ? NULL : &consumed);
return codec_tuple(decoded, consumed);
} }
/*[clinic input] /*[clinic input]

View file

@ -1143,7 +1143,7 @@ exit:
} }
PyDoc_STRVAR(_codecs_raw_unicode_escape_decode__doc__, PyDoc_STRVAR(_codecs_raw_unicode_escape_decode__doc__,
"raw_unicode_escape_decode($module, data, errors=None, /)\n" "raw_unicode_escape_decode($module, data, errors=None, final=True, /)\n"
"--\n" "--\n"
"\n"); "\n");
@ -1152,7 +1152,7 @@ PyDoc_STRVAR(_codecs_raw_unicode_escape_decode__doc__,
static PyObject * static PyObject *
_codecs_raw_unicode_escape_decode_impl(PyObject *module, Py_buffer *data, _codecs_raw_unicode_escape_decode_impl(PyObject *module, Py_buffer *data,
const char *errors); const char *errors, int final);
static PyObject * static PyObject *
_codecs_raw_unicode_escape_decode(PyObject *module, PyObject *const *args, Py_ssize_t nargs) _codecs_raw_unicode_escape_decode(PyObject *module, PyObject *const *args, Py_ssize_t nargs)
@ -1160,8 +1160,9 @@ _codecs_raw_unicode_escape_decode(PyObject *module, PyObject *const *args, Py_ss
PyObject *return_value = NULL; PyObject *return_value = NULL;
Py_buffer data = {NULL, NULL}; Py_buffer data = {NULL, NULL};
const char *errors = NULL; const char *errors = NULL;
int final = 1;
if (!_PyArg_CheckPositional("raw_unicode_escape_decode", nargs, 1, 2)) { if (!_PyArg_CheckPositional("raw_unicode_escape_decode", nargs, 1, 3)) {
goto exit; goto exit;
} }
if (PyUnicode_Check(args[0])) { if (PyUnicode_Check(args[0])) {
@ -1202,8 +1203,15 @@ _codecs_raw_unicode_escape_decode(PyObject *module, PyObject *const *args, Py_ss
_PyArg_BadArgument("raw_unicode_escape_decode", "argument 2", "str or None", args[1]); _PyArg_BadArgument("raw_unicode_escape_decode", "argument 2", "str or None", args[1]);
goto exit; goto exit;
} }
if (nargs < 3) {
goto skip_optional;
}
final = _PyLong_AsInt(args[2]);
if (final == -1 && PyErr_Occurred()) {
goto exit;
}
skip_optional: skip_optional:
return_value = _codecs_raw_unicode_escape_decode_impl(module, &data, errors); return_value = _codecs_raw_unicode_escape_decode_impl(module, &data, errors, final);
exit: exit:
/* Cleanup for data */ /* Cleanup for data */
@ -2809,4 +2817,4 @@ exit:
#ifndef _CODECS_CODE_PAGE_ENCODE_METHODDEF #ifndef _CODECS_CODE_PAGE_ENCODE_METHODDEF
#define _CODECS_CODE_PAGE_ENCODE_METHODDEF #define _CODECS_CODE_PAGE_ENCODE_METHODDEF
#endif /* !defined(_CODECS_CODE_PAGE_ENCODE_METHODDEF) */ #endif /* !defined(_CODECS_CODE_PAGE_ENCODE_METHODDEF) */
/*[clinic end generated code: output=9e9fb1d5d81577e0 input=a9049054013a1b77]*/ /*[clinic end generated code: output=814dae36b6f885cb input=a9049054013a1b77]*/

View file

@ -6379,8 +6379,6 @@ _PyUnicode_DecodeUnicodeEscapeInternal(const char *s,
unsigned char c = (unsigned char) *s++; unsigned char c = (unsigned char) *s++;
Py_UCS4 ch; Py_UCS4 ch;
int count; int count;
Py_ssize_t startinpos;
Py_ssize_t endinpos;
const char *message; const char *message;
#define WRITE_ASCII_CHAR(ch) \ #define WRITE_ASCII_CHAR(ch) \
@ -6407,7 +6405,7 @@ _PyUnicode_DecodeUnicodeEscapeInternal(const char *s,
continue; continue;
} }
startinpos = s - starts - 1; Py_ssize_t startinpos = s - starts - 1;
/* \ - Escapes */ /* \ - Escapes */
if (s >= end) { if (s >= end) {
message = "\\ at end of string"; message = "\\ at end of string";
@ -6554,8 +6552,8 @@ _PyUnicode_DecodeUnicodeEscapeInternal(const char *s,
*consumed = startinpos; *consumed = startinpos;
break; break;
} }
error: error:;
endinpos = s-starts; Py_ssize_t endinpos = s-starts;
writer.min_length = end - s + writer.pos; writer.min_length = end - s + writer.pos;
if (unicode_decode_call_errorhandler_writer( if (unicode_decode_call_errorhandler_writer(
errors, &errorHandler, errors, &errorHandler,
@ -6735,9 +6733,10 @@ PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
/* --- Raw Unicode Escape Codec ------------------------------------------- */ /* --- Raw Unicode Escape Codec ------------------------------------------- */
PyObject * PyObject *
PyUnicode_DecodeRawUnicodeEscape(const char *s, _PyUnicode_DecodeRawUnicodeEscapeStateful(const char *s,
Py_ssize_t size, Py_ssize_t size,
const char *errors) const char *errors,
Py_ssize_t *consumed)
{ {
const char *starts = s; const char *starts = s;
_PyUnicodeWriter writer; _PyUnicodeWriter writer;
@ -6746,6 +6745,9 @@ PyUnicode_DecodeRawUnicodeEscape(const char *s,
PyObject *exc = NULL; PyObject *exc = NULL;
if (size == 0) { if (size == 0) {
if (consumed) {
*consumed = 0;
}
_Py_RETURN_UNICODE_EMPTY(); _Py_RETURN_UNICODE_EMPTY();
} }
@ -6764,8 +6766,6 @@ PyUnicode_DecodeRawUnicodeEscape(const char *s,
unsigned char c = (unsigned char) *s++; unsigned char c = (unsigned char) *s++;
Py_UCS4 ch; Py_UCS4 ch;
int count; int count;
Py_ssize_t startinpos;
Py_ssize_t endinpos;
const char *message; const char *message;
#define WRITE_CHAR(ch) \ #define WRITE_CHAR(ch) \
@ -6780,11 +6780,21 @@ PyUnicode_DecodeRawUnicodeEscape(const char *s,
} while(0) } while(0)
/* Non-escape characters are interpreted as Unicode ordinals */ /* Non-escape characters are interpreted as Unicode ordinals */
if (c != '\\' || s >= end) { if (c != '\\' || (s >= end && !consumed)) {
WRITE_CHAR(c); WRITE_CHAR(c);
continue; continue;
} }
Py_ssize_t startinpos = s - starts - 1;
/* \ - Escapes */
if (s >= end) {
assert(consumed);
// Set message to silence compiler warning.
// Actually it is never used.
message = "\\ at end of string";
goto incomplete;
}
c = (unsigned char) *s++; c = (unsigned char) *s++;
if (c == 'u') { if (c == 'u') {
count = 4; count = 4;
@ -6800,10 +6810,12 @@ PyUnicode_DecodeRawUnicodeEscape(const char *s,
WRITE_CHAR(c); WRITE_CHAR(c);
continue; continue;
} }
startinpos = s - starts - 2;
/* \uHHHH with 4 hex digits, \U00HHHHHH with 8 */ /* \uHHHH with 4 hex digits, \U00HHHHHH with 8 */
for (ch = 0; count && s < end; ++s, --count) { for (ch = 0; count; ++s, --count) {
if (s >= end) {
goto incomplete;
}
c = (unsigned char)*s; c = (unsigned char)*s;
ch <<= 4; ch <<= 4;
if (c >= '0' && c <= '9') { if (c >= '0' && c <= '9') {
@ -6816,18 +6828,23 @@ PyUnicode_DecodeRawUnicodeEscape(const char *s,
ch += c - ('A' - 10); ch += c - ('A' - 10);
} }
else { else {
break; goto error;
} }
} }
if (!count) { if (ch > MAX_UNICODE) {
if (ch <= MAX_UNICODE) {
WRITE_CHAR(ch);
continue;
}
message = "\\Uxxxxxxxx out of range"; message = "\\Uxxxxxxxx out of range";
goto error;
} }
WRITE_CHAR(ch);
continue;
endinpos = s-starts; incomplete:
if (consumed) {
*consumed = startinpos;
break;
}
error:;
Py_ssize_t endinpos = s-starts;
writer.min_length = end - s + writer.pos; writer.min_length = end - s + writer.pos;
if (unicode_decode_call_errorhandler_writer( if (unicode_decode_call_errorhandler_writer(
errors, &errorHandler, errors, &errorHandler,
@ -6849,7 +6866,14 @@ PyUnicode_DecodeRawUnicodeEscape(const char *s,
Py_XDECREF(errorHandler); Py_XDECREF(errorHandler);
Py_XDECREF(exc); Py_XDECREF(exc);
return NULL; return NULL;
}
PyObject *
PyUnicode_DecodeRawUnicodeEscape(const char *s,
Py_ssize_t size,
const char *errors)
{
return _PyUnicode_DecodeRawUnicodeEscapeStateful(s, size, errors, NULL);
} }