bpo-45467: Fix IncrementalDecoder and StreamReader in the "raw-unicode-escape" codec (GH-28944)

They now support escape sequences that are split between input chunks. Add a third parameter, "final", to codecs.raw_unicode_escape_decode(). It is True by default to match the former behavior.
2024-09-05 00:05:39 +00:00 · 2021-10-14 20:04:19 +03:00 · 2021-10-14 20:04:19 +03:00 · 39aa98346d
parent d413c50363
commit 39aa98346d
7 changed files with 116 additions and 35 deletions
--- a/Include/cpython/unicodeobject.h
+++ b/Include/cpython/unicodeobject.h
@ -796,6 +796,16 @@ PyAPI_FUNC(PyObject*) _PyUnicode_DecodeUnicodeEscapeInternal(
                                              string. */
 );
 /* --- Raw-Unicode-Escape Codecs ---------------------------------------------- */
 /* Variant of PyUnicode_DecodeRawUnicodeEscape that supports partial decoding.
    When `consumed` is non-NULL, an escape sequence that is cut off at the end
    of the input is not decoded; `*consumed` is set to the offset at which
    that sequence starts.  When `consumed` is NULL the input is treated as
    complete. */
 PyAPI_FUNC(PyObject*) _PyUnicode_DecodeRawUnicodeEscapeStateful(
        const char *string,     /* Raw-Unicode-Escape encoded string */
        Py_ssize_t length,      /* size of string */
        const char *errors,     /* error handling */
        Py_ssize_t *consumed    /* bytes consumed; may be NULL */
 );
 /* --- Latin-1 Codecs ----------------------------------------------------- */
 PyAPI_FUNC(PyObject*) _PyUnicode_AsLatin1String(
--- a/Lib/encodings/raw_unicode_escape.py
+++ b/Lib/encodings/raw_unicode_escape.py
@ -21,15 +21,16 @@ class IncrementalEncoder(codecs.IncrementalEncoder):
    def encode(self, input, final=False):
        return codecs.raw_unicode_escape_encode(input, self.errors)[0]
-class IncrementalDecoder(codecs.IncrementalDecoder):
+class IncrementalDecoder(codecs.BufferedIncrementalDecoder):
-    def decode(self, input, final=False):
+    def _buffer_decode(self, input, errors, final):
-        return codecs.raw_unicode_escape_decode(input, self.errors)[0]
+        return codecs.raw_unicode_escape_decode(input, errors, final)
 class StreamWriter(Codec,codecs.StreamWriter):
    pass
 class StreamReader(Codec,codecs.StreamReader):
-    pass
+    def decode(self, input, errors='strict'):
        return codecs.raw_unicode_escape_decode(input, errors, False)
 ### encodings module API
--- a/Lib/test/test_codecs.py
+++ b/Lib/test/test_codecs.py
@ -2483,7 +2483,11 @@ def test_partial(self):
            ]
        )
-class RawUnicodeEscapeTest(unittest.TestCase):
+class RawUnicodeEscapeTest(ReadTest, unittest.TestCase):
    encoding = "raw-unicode-escape"
    test_lone_surrogates = None
    def test_empty(self):
        self.assertEqual(codecs.raw_unicode_escape_encode(""), (b"", 0))
        self.assertEqual(codecs.raw_unicode_escape_decode(b""), ("", 0))
@ -2532,6 +2536,35 @@ def test_decode_errors(self):
        self.assertEqual(decode(br"\U00110000", "ignore"), ("", 10))
        self.assertEqual(decode(br"\U00110000", "replace"), ("\ufffd", 10))
    def test_partial(self):
        # Each list entry is the text expected after one more step of
        # incremental input; an entry repeats the previous one while the
        # decoder is still holding back an unfinished escape sequence.
        # NOTE(review): check_partial is inherited from ReadTest, which is not
        # visible here -- presumably it feeds the encoded input one byte at a
        # time; confirm against that helper.
        self.check_partial(
            "\x00\t\n\r\\\xff\uffff\U00010000",
            [
                '\x00',
                '\x00\t',
                '\x00\t\n',
                '\x00\t\n\r',
                # Backslash seen: nothing new until the following byte shows
                # whether it starts an escape.
                '\x00\t\n\r',
                '\x00\t\n\r\\\xff',
                # Five unchanged entries, then U+FFFF appears.
                '\x00\t\n\r\\\xff',
                '\x00\t\n\r\\\xff',
                '\x00\t\n\r\\\xff',
                '\x00\t\n\r\\\xff',
                '\x00\t\n\r\\\xff',
                '\x00\t\n\r\\\xff\uffff',
                # Nine unchanged entries, then U+10000 appears.
                '\x00\t\n\r\\\xff\uffff',
                '\x00\t\n\r\\\xff\uffff',
                '\x00\t\n\r\\\xff\uffff',
                '\x00\t\n\r\\\xff\uffff',
                '\x00\t\n\r\\\xff\uffff',
                '\x00\t\n\r\\\xff\uffff',
                '\x00\t\n\r\\\xff\uffff',
                '\x00\t\n\r\\\xff\uffff',
                '\x00\t\n\r\\\xff\uffff',
                '\x00\t\n\r\\\xff\uffff\U00010000',
            ]
        )
 class EscapeEncodeTest(unittest.TestCase):
--- a/Misc/NEWS.d/next/Library/2021-10-14-13-31-19.bpo-45467.Q7Ma6A.rst
+++ b/Misc/NEWS.d/next/Library/2021-10-14-13-31-19.bpo-45467.Q7Ma6A.rst
@ -0,0 +1,2 @@
 Fix incremental decoder and stream reader in the "raw-unicode-escape" codec.
 Previously they failed if an escape sequence was split between input chunks.
--- a/Modules/_codecsmodule.c
+++ b/Modules/_codecsmodule.c
@ -509,17 +509,20 @@ _codecs_unicode_escape_decode_impl(PyObject *module, Py_buffer *data,
 _codecs.raw_unicode_escape_decode
    data: Py_buffer(accept={str, buffer})
    errors: str(accept={str, NoneType}) = None
    final: bool(accept={int}) = True
    /
 [clinic start generated code]*/
 static PyObject *
 _codecs_raw_unicode_escape_decode_impl(PyObject *module, Py_buffer *data,
-                                       const char *errors)
+                                       const char *errors, int final)
-/*[clinic end generated code: output=c98eeb56028070a6 input=d2f5159ce3b3392f]*/
+/*[clinic end generated code: output=11dbd96301e2879e input=2d166191beb3235a]*/
 {
-    PyObject *decoded = PyUnicode_DecodeRawUnicodeEscape(data->buf, data->len,
+    Py_ssize_t consumed = data->len;
-                                                         errors);
+    PyObject *decoded = _PyUnicode_DecodeRawUnicodeEscapeStateful(data->buf, data->len,
-    return codec_tuple(decoded, data->len);
+                                                                  errors,
                                                                  final ? NULL : &consumed);
    return codec_tuple(decoded, consumed);
 }
 /*[clinic input]
--- a/Modules/clinic/_codecsmodule.c.h
+++ b/Modules/clinic/_codecsmodule.c.h
@ -1143,7 +1143,7 @@ exit:
 }
 PyDoc_STRVAR(_codecs_raw_unicode_escape_decode__doc__,
-"raw_unicode_escape_decode($module, data, errors=None, /)\n"
+"raw_unicode_escape_decode($module, data, errors=None, final=True, /)\n"
 "--\n"
 "\n");
@ -1152,7 +1152,7 @@ PyDoc_STRVAR(_codecs_raw_unicode_escape_decode__doc__,
 static PyObject *
 _codecs_raw_unicode_escape_decode_impl(PyObject *module, Py_buffer *data,
-                                       const char *errors);
+                                       const char *errors, int final);
 static PyObject *
 _codecs_raw_unicode_escape_decode(PyObject *module, PyObject *const *args, Py_ssize_t nargs)
@ -1160,8 +1160,9 @@ _codecs_raw_unicode_escape_decode(PyObject *module, PyObject *const *args, Py_ss
    PyObject *return_value = NULL;
    Py_buffer data = {NULL, NULL};
    const char *errors = NULL;
    int final = 1;
-    if (!_PyArg_CheckPositional("raw_unicode_escape_decode", nargs, 1, 2)) {
+    if (!_PyArg_CheckPositional("raw_unicode_escape_decode", nargs, 1, 3)) {
        goto exit;
    }
    if (PyUnicode_Check(args[0])) {
@ -1202,8 +1203,15 @@ _codecs_raw_unicode_escape_decode(PyObject *module, PyObject *const *args, Py_ss
        _PyArg_BadArgument("raw_unicode_escape_decode", "argument 2", "str or None", args[1]);
        goto exit;
    }
    if (nargs < 3) {
        goto skip_optional;
    }
    final = _PyLong_AsInt(args[2]);
    if (final == -1 && PyErr_Occurred()) {
        goto exit;
    }
 skip_optional:
-    return_value = _codecs_raw_unicode_escape_decode_impl(module, &data, errors);
+    return_value = _codecs_raw_unicode_escape_decode_impl(module, &data, errors, final);
 exit:
    /* Cleanup for data */
@ -2809,4 +2817,4 @@ exit:
 #ifndef _CODECS_CODE_PAGE_ENCODE_METHODDEF
    #define _CODECS_CODE_PAGE_ENCODE_METHODDEF
 #endif /* !defined(_CODECS_CODE_PAGE_ENCODE_METHODDEF) */
-/*[clinic end generated code: output=9e9fb1d5d81577e0 input=a9049054013a1b77]*/
+/*[clinic end generated code: output=814dae36b6f885cb input=a9049054013a1b77]*/
--- a/Objects/unicodeobject.c
+++ b/Objects/unicodeobject.c
@ -6379,8 +6379,6 @@ _PyUnicode_DecodeUnicodeEscapeInternal(const char *s,
        unsigned char c = (unsigned char) *s++;
        Py_UCS4 ch;
        int count;
        Py_ssize_t startinpos;
        Py_ssize_t endinpos;
        const char *message;
 #define WRITE_ASCII_CHAR(ch)                                                  \
@ -6407,7 +6405,7 @@ _PyUnicode_DecodeUnicodeEscapeInternal(const char *s,
            continue;
        }
-        startinpos = s - starts - 1;
+        Py_ssize_t startinpos = s - starts - 1;
        /* \ - Escapes */
        if (s >= end) {
            message = "\\ at end of string";
@ -6554,8 +6552,8 @@ _PyUnicode_DecodeUnicodeEscapeInternal(const char *s,
            *consumed = startinpos;
            break;
        }
-      error:
+      error:;
-        endinpos = s-starts;
+        Py_ssize_t endinpos = s-starts;
        writer.min_length = end - s + writer.pos;
        if (unicode_decode_call_errorhandler_writer(
                errors, &errorHandler,
@ -6735,9 +6733,10 @@ PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
 /* --- Raw Unicode Escape Codec ------------------------------------------- */
 PyObject *
-PyUnicode_DecodeRawUnicodeEscape(const char *s,
+_PyUnicode_DecodeRawUnicodeEscapeStateful(const char *s,
-                                 Py_ssize_t size,
+                                          Py_ssize_t size,
-                                 const char *errors)
+                                          const char *errors,
                                          Py_ssize_t *consumed)
 {
    const char *starts = s;
    _PyUnicodeWriter writer;
@ -6746,6 +6745,9 @@ PyUnicode_DecodeRawUnicodeEscape(const char *s,
    PyObject *exc = NULL;
    if (size == 0) {
        if (consumed) {
            *consumed = 0;
        }
        _Py_RETURN_UNICODE_EMPTY();
    }
@ -6764,8 +6766,6 @@ PyUnicode_DecodeRawUnicodeEscape(const char *s,
        unsigned char c = (unsigned char) *s++;
        Py_UCS4 ch;
        int count;
        Py_ssize_t startinpos;
        Py_ssize_t endinpos;
        const char *message;
 #define WRITE_CHAR(ch)                                                        \
@ -6780,11 +6780,21 @@ PyUnicode_DecodeRawUnicodeEscape(const char *s,
            } while(0)
        /* Non-escape characters are interpreted as Unicode ordinals */
-        if (c != '\\' || s >= end) {
+        if (c != '\\' || (s >= end && !consumed)) {
            WRITE_CHAR(c);
            continue;
        }
        Py_ssize_t startinpos = s - starts - 1;
        /* \ - Escapes */
        if (s >= end) {
            assert(consumed);
            // Set message to silence the compiler warning.
            // It is never actually used.
            message = "\\ at end of string";
            goto incomplete;
        }
        c = (unsigned char) *s++;
        if (c == 'u') {
            count = 4;
@ -6800,10 +6810,12 @@ PyUnicode_DecodeRawUnicodeEscape(const char *s,
            WRITE_CHAR(c);
            continue;
        }
        startinpos = s - starts - 2;
        /* \uHHHH with 4 hex digits, \U00HHHHHH with 8 */
-        for (ch = 0; count && s < end; ++s, --count) {
+        for (ch = 0; count; ++s, --count) {
            if (s >= end) {
                goto incomplete;
            }
            c = (unsigned char)*s;
            ch <<= 4;
            if (c >= '0' && c <= '9') {
@ -6816,18 +6828,23 @@ PyUnicode_DecodeRawUnicodeEscape(const char *s,
                ch += c - ('A' - 10);
            }
            else {
-                break;
+                goto error;
            }
        }
-        if (!count) {
+        if (ch > MAX_UNICODE) {
            if (ch <= MAX_UNICODE) {
                WRITE_CHAR(ch);
                continue;
            }
            message = "\\Uxxxxxxxx out of range";
            goto error;
        }
        WRITE_CHAR(ch);
        continue;
-        endinpos = s-starts;
+      incomplete:
        if (consumed) {
            *consumed = startinpos;
            break;
        }
      error:;
        Py_ssize_t endinpos = s-starts;
        writer.min_length = end - s + writer.pos;
        if (unicode_decode_call_errorhandler_writer(
                errors, &errorHandler,
@ -6849,7 +6866,14 @@ PyUnicode_DecodeRawUnicodeEscape(const char *s,
    Py_XDECREF(errorHandler);
    Py_XDECREF(exc);
    return NULL;
 }
 /* Decode a complete Raw-Unicode-Escape buffer of `size` bytes.
    Delegates to the stateful variant with consumed == NULL, so an escape
    sequence cut off at the end of the input is never deferred to a later
    call. */
 PyObject *
 PyUnicode_DecodeRawUnicodeEscape(const char *s,
                                 Py_ssize_t size,
                                 const char *errors)
 {
    return _PyUnicode_DecodeRawUnicodeEscapeStateful(s, size, errors, NULL);
 }
		`@ -0,0 +1,2 @@`
							`Fix incremental decoder and stream reader in the "raw-unicode-escape" codec.`
							`Previously they failed if the escape sequence was split.`