#include #include #include "tokenizer.h" #include "pegen.h" #include "string_parser.h" //// STRING HANDLING FUNCTIONS //// static int warn_invalid_escape_sequence(Parser *p, const char *first_invalid_escape, Token *t) { unsigned char c = *first_invalid_escape; int octal = ('4' <= c && c <= '7'); PyObject *msg = octal ? PyUnicode_FromFormat("invalid octal escape sequence '\\%.3s'", first_invalid_escape) : PyUnicode_FromFormat("invalid escape sequence '\\%c'", c); if (msg == NULL) { return -1; } PyObject *category; if (p->feature_version >= 12) { category = PyExc_SyntaxWarning; } else { category = PyExc_DeprecationWarning; } if (PyErr_WarnExplicitObject(category, msg, p->tok->filename, t->lineno, NULL, NULL) < 0) { if (PyErr_ExceptionMatches(category)) { /* Replace the DeprecationWarning exception with a SyntaxError to get a more accurate error report */ PyErr_Clear(); /* This is needed, in order for the SyntaxError to point to the token t, since _PyPegen_raise_error uses p->tokens[p->fill - 1] for the error location, if p->known_err_token is not set. */ p->known_err_token = t; if (octal) { RAISE_SYNTAX_ERROR("invalid octal escape sequence '\\%.3s'", first_invalid_escape); } else { RAISE_SYNTAX_ERROR("invalid escape sequence '\\%c'", c); } } Py_DECREF(msg); return -1; } Py_DECREF(msg); return 0; } static PyObject * decode_utf8(const char **sPtr, const char *end) { const char *s; const char *t; t = s = *sPtr; while (s < end && (*s & 0x80)) { s++; } *sPtr = s; return PyUnicode_DecodeUTF8(t, s - t, NULL); } static PyObject * decode_unicode_with_escapes(Parser *parser, const char *s, size_t len, Token *t) { PyObject *v; PyObject *u; char *buf; char *p; const char *end; /* check for integer overflow */ if (len > SIZE_MAX / 6) { return NULL; } /* "ä" (2 bytes) may become "\U000000E4" (10 bytes), or 1:5 "\ä" (3 bytes) may become "\u005c\U000000E4" (16 bytes), or ~1:6 */ u = PyBytes_FromStringAndSize((char *)NULL, len * 6); if (u == NULL) { return NULL; } p = buf = PyBytes_AsString(u); if (p == NULL) { return NULL; } end = s + len; while (s < end) { if (*s == '\\') { *p++ = *s++; if (s >= end || *s & 0x80) { strcpy(p, "u005c"); p += 5; if (s >= end) { break; } } } if (*s & 0x80) { PyObject *w; int kind; const void *data; Py_ssize_t w_len; Py_ssize_t i; w = decode_utf8(&s, end); if (w == NULL) { Py_DECREF(u); return NULL; } kind = PyUnicode_KIND(w); data = PyUnicode_DATA(w); w_len = PyUnicode_GET_LENGTH(w); for (i = 0; i < w_len; i++) { Py_UCS4 chr = PyUnicode_READ(kind, data, i); sprintf(p, "\\U%08x", chr); p += 10; } /* Should be impossible to overflow */ assert(p - buf <= PyBytes_GET_SIZE(u)); Py_DECREF(w); } else { *p++ = *s++; } } len = p - buf; s = buf; const char *first_invalid_escape; v = _PyUnicode_DecodeUnicodeEscapeInternal(s, len, NULL, NULL, &first_invalid_escape); // HACK: later we can simply pass the line no, since we don't preserve the tokens // when we are decoding the string but we preserve the line numbers. if (v != NULL && first_invalid_escape != NULL && t != NULL) { if (warn_invalid_escape_sequence(parser, first_invalid_escape, t) < 0) { /* We have not decref u before because first_invalid_escape points inside u. */ Py_XDECREF(u); Py_DECREF(v); return NULL; } } Py_XDECREF(u); return v; } static PyObject * decode_bytes_with_escapes(Parser *p, const char *s, Py_ssize_t len, Token *t) { const char *first_invalid_escape; PyObject *result = _PyBytes_DecodeEscape(s, len, NULL, &first_invalid_escape); if (result == NULL) { return NULL; } if (first_invalid_escape != NULL) { if (warn_invalid_escape_sequence(p, first_invalid_escape, t) < 0) { Py_DECREF(result); return NULL; } } return result; } PyObject * _PyPegen_decode_string(Parser *p, int raw, const char *s, size_t len, Token *t) { if (raw) { return PyUnicode_DecodeUTF8Stateful(s, len, NULL, NULL); } return decode_unicode_with_escapes(p, s, len, t); } /* s must include the bracketing quote characters, and r, b &/or f prefixes (if any), and embedded escape sequences (if any). (f-strings are handled by the parser) _PyPegen_parse_string parses it, and returns the decoded Python string object. */ PyObject * _PyPegen_parse_string(Parser *p, Token *t) { const char *s = PyBytes_AsString(t->bytes); if (s == NULL) { return NULL; } size_t len; int quote = Py_CHARMASK(*s); int bytesmode = 0; int rawmode = 0; if (Py_ISALPHA(quote)) { while (!bytesmode || !rawmode) { if (quote == 'b' || quote == 'B') { quote =(unsigned char)*++s; bytesmode = 1; } else if (quote == 'u' || quote == 'U') { quote = (unsigned char)*++s; } else if (quote == 'r' || quote == 'R') { quote = (unsigned char)*++s; rawmode = 1; } else { break; } } } if (quote != '\'' && quote != '\"') { PyErr_BadInternalCall(); return NULL; } /* Skip the leading quote char. */ s++; len = strlen(s); if (len > INT_MAX) { PyErr_SetString(PyExc_OverflowError, "string to parse is too long"); return NULL; } if (s[--len] != quote) { /* Last quote char must match the first. */ PyErr_BadInternalCall(); return NULL; } if (len >= 4 && s[0] == quote && s[1] == quote) { /* A triple quoted string. We've already skipped one quote at the start and one at the end of the string. Now skip the two at the start. */ s += 2; len -= 2; /* And check that the last two match. */ if (s[--len] != quote || s[--len] != quote) { PyErr_BadInternalCall(); return NULL; } } /* Avoid invoking escape decoding routines if possible. */ rawmode = rawmode || strchr(s, '\\') == NULL; if (bytesmode) { /* Disallow non-ASCII characters. */ const char *ch; for (ch = s; *ch; ch++) { if (Py_CHARMASK(*ch) >= 0x80) { RAISE_SYNTAX_ERROR_KNOWN_LOCATION( t, "bytes can only contain ASCII " "literal characters"); return NULL; } } if (rawmode) { return PyBytes_FromStringAndSize(s, len); } return decode_bytes_with_escapes(p, s, len, t); } return _PyPegen_decode_string(p, rawmode, s, len, t); }