bpo-35883: Py_DecodeLocale() escapes invalid Unicode characters (GH-24843)

Python no longer fails at startup with a fatal error if a command line argument contains an invalid Unicode character. The Py_DecodeLocale() function now escapes byte sequences which would be decoded as Unicode characters outside the [U+0000; U+10ffff] range. Use MAX_UNICODE constant in unicodeobject.c.
2024-10-06 14:39:37 +00:00 · 2021-03-17 21:46:53 +01:00 · 2021-03-17 21:46:53 +01:00 · 9976834f80
parent 6086ae7fd4
commit 9976834f80
4 changed files with 149 additions and 70 deletions
--- a/Lib/test/test_cmd_line.py
+++ b/Lib/test/test_cmd_line.py
@ -200,38 +200,72 @@ def test_undecodable_code(self):
        if not stdout.startswith(pattern):
            raise AssertionError("%a doesn't start with %a" % (stdout, pattern))

+    @unittest.skipIf(sys.platform == 'win32',
+                     'Windows has a native unicode API')
+    def test_invalid_utf8_arg(self):
+        # bpo-35883: Py_DecodeLocale() must escape b'\xfd\xbf\xbf\xbb\xba\xba'
+        # byte sequence with surrogateescape rather than decoding it as the
+        # U+7fffbeba character which is outside the [U+0000; U+10ffff] range of
+        # Python Unicode characters.
+        #
+        # Test with default config, in the C locale, in the Python UTF-8 Mode.
+        code = 'import sys, os; s=os.fsencode(sys.argv[1]); print(ascii(s))'
+        base_cmd = [sys.executable, '-c', code]
+
+        def run_default(arg):
+            cmd = [sys.executable, '-c', code, arg]
+            return subprocess.run(cmd, stdout=subprocess.PIPE, text=True)
+
+        def run_c_locale(arg):
+            cmd = [sys.executable, '-c', code, arg]
+            env = dict(os.environ)
+            env['LC_ALL'] = 'C'
+            return subprocess.run(cmd, stdout=subprocess.PIPE,
+                                  text=True, env=env)
+
+        def run_utf8_mode(arg):
+            cmd = [sys.executable, '-X', 'utf8', '-c', code, arg]
+            return subprocess.run(cmd, stdout=subprocess.PIPE, text=True)
+
+        valid_utf8 = 'e:\xe9, euro:\u20ac, non-bmp:\U0010ffff'.encode('utf-8')
+        # invalid UTF-8 byte sequences with a valid UTF-8 sequence
+        # in the middle.
+        invalid_utf8 = (
+            b'\xff'                      # invalid byte
+            b'\xc3\xff'                  # invalid byte sequence
+            b'\xc3\xa9'                  # valid utf-8: U+00E9 character
+            b'\xed\xa0\x80'              # lone surrogate character (invalid)
+            b'\xfd\xbf\xbf\xbb\xba\xba'  # character outside [U+0000; U+10ffff]
+        )
+        test_args = [valid_utf8, invalid_utf8]
+
+        for run_cmd in (run_default, run_c_locale, run_utf8_mode):
+            with self.subTest(run_cmd=run_cmd):
+                for arg in test_args:
+                    proc = run_cmd(arg)
+                    self.assertEqual(proc.stdout.rstrip(), ascii(arg))
+
    @unittest.skipUnless((sys.platform == 'darwin' or
                support.is_android), 'test specific to Mac OS X and Android')
    def test_osx_android_utf8(self):
-        def check_output(text):
-            decoded = text.decode('utf-8', 'surrogateescape')
-            expected = ascii(decoded).encode('ascii') + b'\n'
-
-            env = os.environ.copy()
-            # C locale gives ASCII locale encoding, but Python uses UTF-8
-            # to parse the command line arguments on Mac OS X and Android.
-            env['LC_ALL'] = 'C'
-
-            p = subprocess.Popen(
-                (sys.executable, "-c", "import sys; print(ascii(sys.argv[1]))", text),
-                stdout=subprocess.PIPE,
-                env=env)
-            stdout, stderr = p.communicate()
-            self.assertEqual(stdout, expected)
-            self.assertEqual(p.returncode, 0)
-
-        # test valid utf-8
        text = 'e:\xe9, euro:\u20ac, non-bmp:\U0010ffff'.encode('utf-8')
-        check_output(text)
+        code = "import sys; print(ascii(sys.argv[1]))"

-        # test invalid utf-8
-        text = (
-            b'\xff'         # invalid byte
-            b'\xc3\xa9'     # valid utf-8 character
-            b'\xc3\xff'     # invalid byte sequence
-            b'\xed\xa0\x80' # lone surrogate character (invalid)
-        )
-        check_output(text)
+        decoded = text.decode('utf-8', 'surrogateescape')
+        expected = ascii(decoded).encode('ascii') + b'\n'
+
+        env = os.environ.copy()
+        # C locale gives ASCII locale encoding, but Python uses UTF-8
+        # to parse the command line arguments on Mac OS X and Android.
+        env['LC_ALL'] = 'C'
+
+        p = subprocess.Popen(
+            (sys.executable, "-c", code, text),
+            stdout=subprocess.PIPE,
+            env=env)
+        stdout, stderr = p.communicate()
+        self.assertEqual(stdout, expected)
+        self.assertEqual(p.returncode, 0)

    def test_non_interactive_output_buffering(self):
        code = textwrap.dedent("""
--- a/Builtins/2021-03-13-13-57-21.bpo-35883.UyGpdG.rst
+++ b/Builtins/2021-03-13-13-57-21.bpo-35883.UyGpdG.rst
@ -0,0 +1,4 @@
+Python no longer fails at startup with a fatal error if a command line
+argument contains an invalid Unicode character. The
+:c:func:`Py_DecodeLocale` function now escapes byte sequences which would be
+decoded as Unicode characters outside the [U+0000; U+10ffff] range.
--- a/Objects/unicodeobject.c
+++ b/Objects/unicodeobject.c
@ -94,7 +94,8 @@ NOTE: In the interpreter's initialization phase, some globals are currently
 extern "C" {
 #endif

-/* Maximum code point of Unicode 6.0: 0x10ffff (1,114,111) */
+// Maximum code point of Unicode 6.0: 0x10ffff (1,114,111).
+// The value must be the same in fileutils.c.
 #define MAX_UNICODE 0x10ffff

 #ifdef Py_DEBUG
@ -1784,8 +1785,8 @@ find_maxchar_surrogates(const wchar_t *begin, const wchar_t *end,
            *maxchar = ch;
            if (*maxchar > MAX_UNICODE) {
                PyErr_Format(PyExc_ValueError,
-                             "character U+%x is not in range [U+0000; U+10ffff]",
-                             ch);
+                             "character U+%x is not in range [U+0000; U+%x]",
+                             ch, MAX_UNICODE);
                return -1;
            }
        }
@ -14089,7 +14090,7 @@ _PyUnicodeWriter_PrepareKindInternal(_PyUnicodeWriter *writer,
    {
    case PyUnicode_1BYTE_KIND: maxchar = 0xff; break;
    case PyUnicode_2BYTE_KIND: maxchar = 0xffff; break;
-    case PyUnicode_4BYTE_KIND: maxchar = 0x10ffff; break;
+    case PyUnicode_4BYTE_KIND: maxchar = MAX_UNICODE; break;
    default:
        Py_UNREACHABLE();
    }
--- a/Python/fileutils.c
+++ b/Python/fileutils.c
@ -34,6 +34,13 @@ extern int winerror_to_errno(int);
 int _Py_open_cloexec_works = -1;
 #endif

+// The value must be the same in unicodeobject.c.
+#define MAX_UNICODE 0x10ffff
+
+// mbstowcs() and mbrtowc() errors
+static const size_t DECODE_ERROR = ((size_t)-1);
+static const size_t INCOMPLETE_CHARACTER = (size_t)-2;
+

 static int
 get_surrogateescape(_Py_error_handler errors, int *surrogateescape)
@ -82,6 +89,57 @@ _Py_device_encoding(int fd)
 #endif
 }

+
+static size_t
+is_valid_wide_char(wchar_t ch)
+{
+    if (Py_UNICODE_IS_SURROGATE(ch)) {
+        // Reject lone surrogate characters
+        return 0;
+    }
+    if (ch > MAX_UNICODE) {
+        // bpo-35883: Reject characters outside [U+0000; U+10ffff] range.
+        // The glibc mbstowcs() UTF-8 decoder does not respect the RFC 3629,
+        // it creates characters outside the [U+0000; U+10ffff] range:
+        // https://sourceware.org/bugzilla/show_bug.cgi?id=2373
+        return 0;
+    }
+    return 1;
+}
+
+
+static size_t
+_Py_mbstowcs(wchar_t *dest, const char *src, size_t n)
+{
+    size_t count = mbstowcs(dest, src, n);
+    if (dest != NULL && count != DECODE_ERROR) {
+        for (size_t i=0; i < count; i++) {
+            wchar_t ch = dest[i];
+            if (!is_valid_wide_char(ch)) {
+                return DECODE_ERROR;
+            }
+        }
+    }
+    return count;
+}
+
+
+#ifdef HAVE_MBRTOWC
+static size_t
+_Py_mbrtowc(wchar_t *pwc, const char *str, size_t len, mbstate_t *pmbs)
+{
+    assert(pwc != NULL);
+    size_t count = mbrtowc(pwc, str, len, pmbs);
+    if (count != 0 && count != DECODE_ERROR && count != INCOMPLETE_CHARACTER) {
+        if (!is_valid_wide_char(*pwc)) {
+            return DECODE_ERROR;
+        }
+    }
+    return count;
+}
+#endif
+
+
 #if !defined(_Py_FORCE_UTF8_FS_ENCODING) && !defined(MS_WINDOWS)

 #define USE_FORCE_ASCII
@ -148,8 +206,8 @@ check_force_ascii(void)
        size_t res;

        ch = (unsigned char)0xA7;
-        res = mbstowcs(&wch, (char*)&ch, 1);
-        if (res != (size_t)-1 && wch == L'\xA7') {
+        res = _Py_mbstowcs(&wch, (char*)&ch, 1);
+        if (res != DECODE_ERROR && wch == L'\xA7') {
            /* On HP-UX withe C locale or the POSIX locale,
               nl_langinfo(CODESET) announces "roman8", whereas mbstowcs() uses
               Latin1 encoding in practice. Force ASCII in this case.
@ -196,8 +254,8 @@ check_force_ascii(void)

        unsigned uch = (unsigned char)i;
        ch[0] = (char)uch;
-        res = mbstowcs(wch, ch, 1);
-        if (res != (size_t)-1) {
+        res = _Py_mbstowcs(wch, ch, 1);
+        if (res != DECODE_ERROR) {
            /* decoding a non-ASCII character from the locale encoding succeed:
               the locale encoding is not ASCII, force ASCII */
            return 1;
@ -387,9 +445,9 @@ decode_current_locale(const char* arg, wchar_t **wstr, size_t *wlen,
     */
    argsize = strlen(arg);
 #else
-    argsize = mbstowcs(NULL, arg, 0);
+    argsize = _Py_mbstowcs(NULL, arg, 0);
 #endif
-    if (argsize != (size_t)-1) {
+    if (argsize != DECODE_ERROR) {
        if (argsize > PY_SSIZE_T_MAX / sizeof(wchar_t) - 1) {
            return -1;
        }
@ -398,21 +456,13 @@ decode_current_locale(const char* arg, wchar_t **wstr, size_t *wlen,
            return -1;
        }

-        count = mbstowcs(res, arg, argsize + 1);
-        if (count != (size_t)-1) {
-            wchar_t *tmp;
-            /* Only use the result if it contains no
-               surrogate characters. */
-            for (tmp = res; *tmp != 0 &&
-                         !Py_UNICODE_IS_SURROGATE(*tmp); tmp++)
-                ;
-            if (*tmp == 0) {
-                if (wlen != NULL) {
-                    *wlen = count;
-                }
-                *wstr = res;
-                return 0;
+        count = _Py_mbstowcs(res, arg, argsize + 1);
+        if (count != DECODE_ERROR) {
+            *wstr = res;
+            if (wlen != NULL) {
+                *wlen = count;
            }
+            return 0;
        }
        PyMem_RawFree(res);
    }
@ -436,13 +486,13 @@ decode_current_locale(const char* arg, wchar_t **wstr, size_t *wlen,
    out = res;
    memset(&mbs, 0, sizeof mbs);
    while (argsize) {
-        size_t converted = mbrtowc(out, (char*)in, argsize, &mbs);
+        size_t converted = _Py_mbrtowc(out, (char*)in, argsize, &mbs);
        if (converted == 0) {
            /* Reached end of string; null char stored. */
            break;
        }

-        if (converted == (size_t)-2) {
+        if (converted == INCOMPLETE_CHARACTER) {
            /* Incomplete character. This should never happen,
               since we provide everything that we have -
               unless there is a bug in the C library, or I
@ -450,32 +500,22 @@ decode_current_locale(const char* arg, wchar_t **wstr, size_t *wlen,
            goto decode_error;
        }

-        if (converted == (size_t)-1) {
+        if (converted == DECODE_ERROR) {
            if (!surrogateescape) {
                goto decode_error;
            }

-            /* Conversion error. Escape as UTF-8b, and start over
-               in the initial shift state. */
+            /* Decoding error. Escape as UTF-8b, and start over in the initial
+               shift state. */
            *out++ = 0xdc00 + *in++;
            argsize--;
            memset(&mbs, 0, sizeof mbs);
            continue;
        }

-        if (Py_UNICODE_IS_SURROGATE(*out)) {
-            if (!surrogateescape) {
-                goto decode_error;
-            }
+        // _Py_mbrtowc() reject lone surrogate characters
+        assert(!Py_UNICODE_IS_SURROGATE(*out));

-            /* Surrogate character.  Escape the original
-               byte sequence with surrogateescape. */
-            argsize -= converted;
-            while (converted--) {
-                *out++ = 0xdc00 + *in++;
-            }
-            continue;
-        }
        /* successfully converted some bytes */
        in += converted;
        argsize -= converted;
@ -652,7 +692,7 @@ encode_current_locale(const wchar_t *text, char **str,
                else {
                    converted = wcstombs(NULL, buf, 0);
                }
-                if (converted == (size_t)-1) {
+                if (converted == DECODE_ERROR) {
                    goto encode_error;
                }
                if (bytes != NULL) {
@ -1440,7 +1480,7 @@ _Py_wfopen(const wchar_t *path, const wchar_t *mode)
    char cmode[10];
    size_t r;
    r = wcstombs(cmode, mode, 10);
-    if (r == (size_t)-1 || r >= 10) {
+    if (r == DECODE_ERROR || r >= 10) {
        errno = EINVAL;
        return NULL;
    }