GH-96458: Statically initialize utf8 representation of static strings (#96481)

This commit is contained in:
Kumar Aditya 2022-09-03 12:13:08 +05:30 committed by GitHub
parent 16c6759b37
commit 6dab8c95bd
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
6 changed files with 139 additions and 165 deletions

View file

@ -113,10 +113,12 @@ extern "C" {
._ ## NAME = _PyASCIIObject_INIT(LITERAL)
#define INIT_ID(NAME) \
._ ## NAME = _PyASCIIObject_INIT(#NAME)
#define _PyUnicode_LATIN1_INIT(LITERAL) \
#define _PyUnicode_LATIN1_INIT(LITERAL, UTF8) \
{ \
._latin1 = { \
._base = _PyUnicode_ASCII_BASE_INIT((LITERAL), 0), \
.utf8 = (UTF8), \
.utf8_length = sizeof(UTF8) - 1, \
}, \
._data = (LITERAL), \
}

View file

@ -1287,134 +1287,134 @@ extern "C" {
_PyASCIIObject_INIT("\x7f"), \
}, \
.latin1 = { \
_PyUnicode_LATIN1_INIT("\x80"), \
_PyUnicode_LATIN1_INIT("\x81"), \
_PyUnicode_LATIN1_INIT("\x82"), \
_PyUnicode_LATIN1_INIT("\x83"), \
_PyUnicode_LATIN1_INIT("\x84"), \
_PyUnicode_LATIN1_INIT("\x85"), \
_PyUnicode_LATIN1_INIT("\x86"), \
_PyUnicode_LATIN1_INIT("\x87"), \
_PyUnicode_LATIN1_INIT("\x88"), \
_PyUnicode_LATIN1_INIT("\x89"), \
_PyUnicode_LATIN1_INIT("\x8a"), \
_PyUnicode_LATIN1_INIT("\x8b"), \
_PyUnicode_LATIN1_INIT("\x8c"), \
_PyUnicode_LATIN1_INIT("\x8d"), \
_PyUnicode_LATIN1_INIT("\x8e"), \
_PyUnicode_LATIN1_INIT("\x8f"), \
_PyUnicode_LATIN1_INIT("\x90"), \
_PyUnicode_LATIN1_INIT("\x91"), \
_PyUnicode_LATIN1_INIT("\x92"), \
_PyUnicode_LATIN1_INIT("\x93"), \
_PyUnicode_LATIN1_INIT("\x94"), \
_PyUnicode_LATIN1_INIT("\x95"), \
_PyUnicode_LATIN1_INIT("\x96"), \
_PyUnicode_LATIN1_INIT("\x97"), \
_PyUnicode_LATIN1_INIT("\x98"), \
_PyUnicode_LATIN1_INIT("\x99"), \
_PyUnicode_LATIN1_INIT("\x9a"), \
_PyUnicode_LATIN1_INIT("\x9b"), \
_PyUnicode_LATIN1_INIT("\x9c"), \
_PyUnicode_LATIN1_INIT("\x9d"), \
_PyUnicode_LATIN1_INIT("\x9e"), \
_PyUnicode_LATIN1_INIT("\x9f"), \
_PyUnicode_LATIN1_INIT("\xa0"), \
_PyUnicode_LATIN1_INIT("\xa1"), \
_PyUnicode_LATIN1_INIT("\xa2"), \
_PyUnicode_LATIN1_INIT("\xa3"), \
_PyUnicode_LATIN1_INIT("\xa4"), \
_PyUnicode_LATIN1_INIT("\xa5"), \
_PyUnicode_LATIN1_INIT("\xa6"), \
_PyUnicode_LATIN1_INIT("\xa7"), \
_PyUnicode_LATIN1_INIT("\xa8"), \
_PyUnicode_LATIN1_INIT("\xa9"), \
_PyUnicode_LATIN1_INIT("\xaa"), \
_PyUnicode_LATIN1_INIT("\xab"), \
_PyUnicode_LATIN1_INIT("\xac"), \
_PyUnicode_LATIN1_INIT("\xad"), \
_PyUnicode_LATIN1_INIT("\xae"), \
_PyUnicode_LATIN1_INIT("\xaf"), \
_PyUnicode_LATIN1_INIT("\xb0"), \
_PyUnicode_LATIN1_INIT("\xb1"), \
_PyUnicode_LATIN1_INIT("\xb2"), \
_PyUnicode_LATIN1_INIT("\xb3"), \
_PyUnicode_LATIN1_INIT("\xb4"), \
_PyUnicode_LATIN1_INIT("\xb5"), \
_PyUnicode_LATIN1_INIT("\xb6"), \
_PyUnicode_LATIN1_INIT("\xb7"), \
_PyUnicode_LATIN1_INIT("\xb8"), \
_PyUnicode_LATIN1_INIT("\xb9"), \
_PyUnicode_LATIN1_INIT("\xba"), \
_PyUnicode_LATIN1_INIT("\xbb"), \
_PyUnicode_LATIN1_INIT("\xbc"), \
_PyUnicode_LATIN1_INIT("\xbd"), \
_PyUnicode_LATIN1_INIT("\xbe"), \
_PyUnicode_LATIN1_INIT("\xbf"), \
_PyUnicode_LATIN1_INIT("\xc0"), \
_PyUnicode_LATIN1_INIT("\xc1"), \
_PyUnicode_LATIN1_INIT("\xc2"), \
_PyUnicode_LATIN1_INIT("\xc3"), \
_PyUnicode_LATIN1_INIT("\xc4"), \
_PyUnicode_LATIN1_INIT("\xc5"), \
_PyUnicode_LATIN1_INIT("\xc6"), \
_PyUnicode_LATIN1_INIT("\xc7"), \
_PyUnicode_LATIN1_INIT("\xc8"), \
_PyUnicode_LATIN1_INIT("\xc9"), \
_PyUnicode_LATIN1_INIT("\xca"), \
_PyUnicode_LATIN1_INIT("\xcb"), \
_PyUnicode_LATIN1_INIT("\xcc"), \
_PyUnicode_LATIN1_INIT("\xcd"), \
_PyUnicode_LATIN1_INIT("\xce"), \
_PyUnicode_LATIN1_INIT("\xcf"), \
_PyUnicode_LATIN1_INIT("\xd0"), \
_PyUnicode_LATIN1_INIT("\xd1"), \
_PyUnicode_LATIN1_INIT("\xd2"), \
_PyUnicode_LATIN1_INIT("\xd3"), \
_PyUnicode_LATIN1_INIT("\xd4"), \
_PyUnicode_LATIN1_INIT("\xd5"), \
_PyUnicode_LATIN1_INIT("\xd6"), \
_PyUnicode_LATIN1_INIT("\xd7"), \
_PyUnicode_LATIN1_INIT("\xd8"), \
_PyUnicode_LATIN1_INIT("\xd9"), \
_PyUnicode_LATIN1_INIT("\xda"), \
_PyUnicode_LATIN1_INIT("\xdb"), \
_PyUnicode_LATIN1_INIT("\xdc"), \
_PyUnicode_LATIN1_INIT("\xdd"), \
_PyUnicode_LATIN1_INIT("\xde"), \
_PyUnicode_LATIN1_INIT("\xdf"), \
_PyUnicode_LATIN1_INIT("\xe0"), \
_PyUnicode_LATIN1_INIT("\xe1"), \
_PyUnicode_LATIN1_INIT("\xe2"), \
_PyUnicode_LATIN1_INIT("\xe3"), \
_PyUnicode_LATIN1_INIT("\xe4"), \
_PyUnicode_LATIN1_INIT("\xe5"), \
_PyUnicode_LATIN1_INIT("\xe6"), \
_PyUnicode_LATIN1_INIT("\xe7"), \
_PyUnicode_LATIN1_INIT("\xe8"), \
_PyUnicode_LATIN1_INIT("\xe9"), \
_PyUnicode_LATIN1_INIT("\xea"), \
_PyUnicode_LATIN1_INIT("\xeb"), \
_PyUnicode_LATIN1_INIT("\xec"), \
_PyUnicode_LATIN1_INIT("\xed"), \
_PyUnicode_LATIN1_INIT("\xee"), \
_PyUnicode_LATIN1_INIT("\xef"), \
_PyUnicode_LATIN1_INIT("\xf0"), \
_PyUnicode_LATIN1_INIT("\xf1"), \
_PyUnicode_LATIN1_INIT("\xf2"), \
_PyUnicode_LATIN1_INIT("\xf3"), \
_PyUnicode_LATIN1_INIT("\xf4"), \
_PyUnicode_LATIN1_INIT("\xf5"), \
_PyUnicode_LATIN1_INIT("\xf6"), \
_PyUnicode_LATIN1_INIT("\xf7"), \
_PyUnicode_LATIN1_INIT("\xf8"), \
_PyUnicode_LATIN1_INIT("\xf9"), \
_PyUnicode_LATIN1_INIT("\xfa"), \
_PyUnicode_LATIN1_INIT("\xfb"), \
_PyUnicode_LATIN1_INIT("\xfc"), \
_PyUnicode_LATIN1_INIT("\xfd"), \
_PyUnicode_LATIN1_INIT("\xfe"), \
_PyUnicode_LATIN1_INIT("\xff"), \
_PyUnicode_LATIN1_INIT("\x80", "\xc2\x80"), \
_PyUnicode_LATIN1_INIT("\x81", "\xc2\x81"), \
_PyUnicode_LATIN1_INIT("\x82", "\xc2\x82"), \
_PyUnicode_LATIN1_INIT("\x83", "\xc2\x83"), \
_PyUnicode_LATIN1_INIT("\x84", "\xc2\x84"), \
_PyUnicode_LATIN1_INIT("\x85", "\xc2\x85"), \
_PyUnicode_LATIN1_INIT("\x86", "\xc2\x86"), \
_PyUnicode_LATIN1_INIT("\x87", "\xc2\x87"), \
_PyUnicode_LATIN1_INIT("\x88", "\xc2\x88"), \
_PyUnicode_LATIN1_INIT("\x89", "\xc2\x89"), \
_PyUnicode_LATIN1_INIT("\x8a", "\xc2\x8a"), \
_PyUnicode_LATIN1_INIT("\x8b", "\xc2\x8b"), \
_PyUnicode_LATIN1_INIT("\x8c", "\xc2\x8c"), \
_PyUnicode_LATIN1_INIT("\x8d", "\xc2\x8d"), \
_PyUnicode_LATIN1_INIT("\x8e", "\xc2\x8e"), \
_PyUnicode_LATIN1_INIT("\x8f", "\xc2\x8f"), \
_PyUnicode_LATIN1_INIT("\x90", "\xc2\x90"), \
_PyUnicode_LATIN1_INIT("\x91", "\xc2\x91"), \
_PyUnicode_LATIN1_INIT("\x92", "\xc2\x92"), \
_PyUnicode_LATIN1_INIT("\x93", "\xc2\x93"), \
_PyUnicode_LATIN1_INIT("\x94", "\xc2\x94"), \
_PyUnicode_LATIN1_INIT("\x95", "\xc2\x95"), \
_PyUnicode_LATIN1_INIT("\x96", "\xc2\x96"), \
_PyUnicode_LATIN1_INIT("\x97", "\xc2\x97"), \
_PyUnicode_LATIN1_INIT("\x98", "\xc2\x98"), \
_PyUnicode_LATIN1_INIT("\x99", "\xc2\x99"), \
_PyUnicode_LATIN1_INIT("\x9a", "\xc2\x9a"), \
_PyUnicode_LATIN1_INIT("\x9b", "\xc2\x9b"), \
_PyUnicode_LATIN1_INIT("\x9c", "\xc2\x9c"), \
_PyUnicode_LATIN1_INIT("\x9d", "\xc2\x9d"), \
_PyUnicode_LATIN1_INIT("\x9e", "\xc2\x9e"), \
_PyUnicode_LATIN1_INIT("\x9f", "\xc2\x9f"), \
_PyUnicode_LATIN1_INIT("\xa0", "\xc2\xa0"), \
_PyUnicode_LATIN1_INIT("\xa1", "\xc2\xa1"), \
_PyUnicode_LATIN1_INIT("\xa2", "\xc2\xa2"), \
_PyUnicode_LATIN1_INIT("\xa3", "\xc2\xa3"), \
_PyUnicode_LATIN1_INIT("\xa4", "\xc2\xa4"), \
_PyUnicode_LATIN1_INIT("\xa5", "\xc2\xa5"), \
_PyUnicode_LATIN1_INIT("\xa6", "\xc2\xa6"), \
_PyUnicode_LATIN1_INIT("\xa7", "\xc2\xa7"), \
_PyUnicode_LATIN1_INIT("\xa8", "\xc2\xa8"), \
_PyUnicode_LATIN1_INIT("\xa9", "\xc2\xa9"), \
_PyUnicode_LATIN1_INIT("\xaa", "\xc2\xaa"), \
_PyUnicode_LATIN1_INIT("\xab", "\xc2\xab"), \
_PyUnicode_LATIN1_INIT("\xac", "\xc2\xac"), \
_PyUnicode_LATIN1_INIT("\xad", "\xc2\xad"), \
_PyUnicode_LATIN1_INIT("\xae", "\xc2\xae"), \
_PyUnicode_LATIN1_INIT("\xaf", "\xc2\xaf"), \
_PyUnicode_LATIN1_INIT("\xb0", "\xc2\xb0"), \
_PyUnicode_LATIN1_INIT("\xb1", "\xc2\xb1"), \
_PyUnicode_LATIN1_INIT("\xb2", "\xc2\xb2"), \
_PyUnicode_LATIN1_INIT("\xb3", "\xc2\xb3"), \
_PyUnicode_LATIN1_INIT("\xb4", "\xc2\xb4"), \
_PyUnicode_LATIN1_INIT("\xb5", "\xc2\xb5"), \
_PyUnicode_LATIN1_INIT("\xb6", "\xc2\xb6"), \
_PyUnicode_LATIN1_INIT("\xb7", "\xc2\xb7"), \
_PyUnicode_LATIN1_INIT("\xb8", "\xc2\xb8"), \
_PyUnicode_LATIN1_INIT("\xb9", "\xc2\xb9"), \
_PyUnicode_LATIN1_INIT("\xba", "\xc2\xba"), \
_PyUnicode_LATIN1_INIT("\xbb", "\xc2\xbb"), \
_PyUnicode_LATIN1_INIT("\xbc", "\xc2\xbc"), \
_PyUnicode_LATIN1_INIT("\xbd", "\xc2\xbd"), \
_PyUnicode_LATIN1_INIT("\xbe", "\xc2\xbe"), \
_PyUnicode_LATIN1_INIT("\xbf", "\xc2\xbf"), \
_PyUnicode_LATIN1_INIT("\xc0", "\xc3\x80"), \
_PyUnicode_LATIN1_INIT("\xc1", "\xc3\x81"), \
_PyUnicode_LATIN1_INIT("\xc2", "\xc3\x82"), \
_PyUnicode_LATIN1_INIT("\xc3", "\xc3\x83"), \
_PyUnicode_LATIN1_INIT("\xc4", "\xc3\x84"), \
_PyUnicode_LATIN1_INIT("\xc5", "\xc3\x85"), \
_PyUnicode_LATIN1_INIT("\xc6", "\xc3\x86"), \
_PyUnicode_LATIN1_INIT("\xc7", "\xc3\x87"), \
_PyUnicode_LATIN1_INIT("\xc8", "\xc3\x88"), \
_PyUnicode_LATIN1_INIT("\xc9", "\xc3\x89"), \
_PyUnicode_LATIN1_INIT("\xca", "\xc3\x8a"), \
_PyUnicode_LATIN1_INIT("\xcb", "\xc3\x8b"), \
_PyUnicode_LATIN1_INIT("\xcc", "\xc3\x8c"), \
_PyUnicode_LATIN1_INIT("\xcd", "\xc3\x8d"), \
_PyUnicode_LATIN1_INIT("\xce", "\xc3\x8e"), \
_PyUnicode_LATIN1_INIT("\xcf", "\xc3\x8f"), \
_PyUnicode_LATIN1_INIT("\xd0", "\xc3\x90"), \
_PyUnicode_LATIN1_INIT("\xd1", "\xc3\x91"), \
_PyUnicode_LATIN1_INIT("\xd2", "\xc3\x92"), \
_PyUnicode_LATIN1_INIT("\xd3", "\xc3\x93"), \
_PyUnicode_LATIN1_INIT("\xd4", "\xc3\x94"), \
_PyUnicode_LATIN1_INIT("\xd5", "\xc3\x95"), \
_PyUnicode_LATIN1_INIT("\xd6", "\xc3\x96"), \
_PyUnicode_LATIN1_INIT("\xd7", "\xc3\x97"), \
_PyUnicode_LATIN1_INIT("\xd8", "\xc3\x98"), \
_PyUnicode_LATIN1_INIT("\xd9", "\xc3\x99"), \
_PyUnicode_LATIN1_INIT("\xda", "\xc3\x9a"), \
_PyUnicode_LATIN1_INIT("\xdb", "\xc3\x9b"), \
_PyUnicode_LATIN1_INIT("\xdc", "\xc3\x9c"), \
_PyUnicode_LATIN1_INIT("\xdd", "\xc3\x9d"), \
_PyUnicode_LATIN1_INIT("\xde", "\xc3\x9e"), \
_PyUnicode_LATIN1_INIT("\xdf", "\xc3\x9f"), \
_PyUnicode_LATIN1_INIT("\xe0", "\xc3\xa0"), \
_PyUnicode_LATIN1_INIT("\xe1", "\xc3\xa1"), \
_PyUnicode_LATIN1_INIT("\xe2", "\xc3\xa2"), \
_PyUnicode_LATIN1_INIT("\xe3", "\xc3\xa3"), \
_PyUnicode_LATIN1_INIT("\xe4", "\xc3\xa4"), \
_PyUnicode_LATIN1_INIT("\xe5", "\xc3\xa5"), \
_PyUnicode_LATIN1_INIT("\xe6", "\xc3\xa6"), \
_PyUnicode_LATIN1_INIT("\xe7", "\xc3\xa7"), \
_PyUnicode_LATIN1_INIT("\xe8", "\xc3\xa8"), \
_PyUnicode_LATIN1_INIT("\xe9", "\xc3\xa9"), \
_PyUnicode_LATIN1_INIT("\xea", "\xc3\xaa"), \
_PyUnicode_LATIN1_INIT("\xeb", "\xc3\xab"), \
_PyUnicode_LATIN1_INIT("\xec", "\xc3\xac"), \
_PyUnicode_LATIN1_INIT("\xed", "\xc3\xad"), \
_PyUnicode_LATIN1_INIT("\xee", "\xc3\xae"), \
_PyUnicode_LATIN1_INIT("\xef", "\xc3\xaf"), \
_PyUnicode_LATIN1_INIT("\xf0", "\xc3\xb0"), \
_PyUnicode_LATIN1_INIT("\xf1", "\xc3\xb1"), \
_PyUnicode_LATIN1_INIT("\xf2", "\xc3\xb2"), \
_PyUnicode_LATIN1_INIT("\xf3", "\xc3\xb3"), \
_PyUnicode_LATIN1_INIT("\xf4", "\xc3\xb4"), \
_PyUnicode_LATIN1_INIT("\xf5", "\xc3\xb5"), \
_PyUnicode_LATIN1_INIT("\xf6", "\xc3\xb6"), \
_PyUnicode_LATIN1_INIT("\xf7", "\xc3\xb7"), \
_PyUnicode_LATIN1_INIT("\xf8", "\xc3\xb8"), \
_PyUnicode_LATIN1_INIT("\xf9", "\xc3\xb9"), \
_PyUnicode_LATIN1_INIT("\xfa", "\xc3\xba"), \
_PyUnicode_LATIN1_INIT("\xfb", "\xc3\xbb"), \
_PyUnicode_LATIN1_INIT("\xfc", "\xc3\xbc"), \
_PyUnicode_LATIN1_INIT("\xfd", "\xc3\xbd"), \
_PyUnicode_LATIN1_INIT("\xfe", "\xc3\xbe"), \
_PyUnicode_LATIN1_INIT("\xff", "\xc3\xbf"), \
}, \
}, \
\

View file

@ -19,7 +19,6 @@ extern PyStatus _PyUnicode_InitGlobalObjects(PyInterpreterState *);
extern PyStatus _PyUnicode_InitTypes(PyInterpreterState *);
extern void _PyUnicode_Fini(PyInterpreterState *);
extern void _PyUnicode_FiniTypes(PyInterpreterState *);
extern void _PyStaticUnicode_Dealloc(PyObject *);
extern PyTypeObject _PyUnicodeASCIIIter_Type;

View file

@ -15184,23 +15184,6 @@ _PyUnicode_FiniTypes(PyInterpreterState *interp)
}
/* Drop the cached UTF-8 buffer of a compact unicode object.
 *
 * Objects flagged as ASCII are left untouched.  For any other compact
 * string, a non-NULL `utf8` buffer is released with PyObject_Free() and
 * the `utf8` / `utf8_length` fields are reset to NULL / 0.  The object
 * itself is never freed here.
 */
static void unicode_static_dealloc(PyObject *op)
{
    PyASCIIObject *base = _PyASCIIObject_CAST(op);
    assert(base->state.compact);

    /* ASCII objects: nothing to release. */
    if (base->state.ascii) {
        return;
    }

    PyCompactUnicodeObject *str = (PyCompactUnicodeObject *)op;
    if (str->utf8 == NULL) {
        return;
    }

    PyObject_Free(str->utf8);
    /* Reset both fields so the freed buffer is not referenced again. */
    str->utf8 = NULL;
    str->utf8_length = 0;
}
void
_PyUnicode_Fini(PyInterpreterState *interp)
{
@ -15217,24 +15200,8 @@ _PyUnicode_Fini(PyInterpreterState *interp)
_PyUnicode_FiniEncodings(&state->fs_codec);
unicode_clear_identifiers(state);
// Clear the single character singletons
for (int i = 0; i < 128; i++) {
unicode_static_dealloc((PyObject*)&_Py_SINGLETON(strings).ascii[i]);
}
for (int i = 0; i < 128; i++) {
unicode_static_dealloc((PyObject*)&_Py_SINGLETON(strings).latin1[i]);
}
}
/* Externally visible wrapper: forwards `op` unchanged to the file-local
 * unicode_static_dealloc() and returns nothing.
 */
void
_PyStaticUnicode_Dealloc(PyObject *op)
{
    unicode_static_dealloc(op);
}
/* A _string module, to export formatter_parser and formatter_field_name_split
to the string.Formatter class implemented in Python. */

View file

@ -195,7 +195,6 @@ def generate_unicode(self, name: str, s: str) -> str:
else:
self.write("PyCompactUnicodeObject _compact;")
self.write(f"{datatype} _data[{len(s)+1}];")
self.deallocs.append(f"_PyStaticUnicode_Dealloc((PyObject *)&{name});")
with self.block(f"{name} =", ";"):
if ascii:
with self.block("._ascii =", ","):
@ -218,6 +217,9 @@ def generate_unicode(self, name: str, s: str) -> str:
self.write(f".kind = {kind},")
self.write(".compact = 1,")
self.write(".ascii = 0,")
utf8 = s.encode('utf-8')
self.write(f'.utf8 = {make_string_literal(utf8)},')
self.write(f'.utf8_length = {len(utf8)},')
with self.block(f"._data =", ","):
for i in range(0, len(s), 16):
data = s[i:i+16]

View file

@ -287,7 +287,11 @@ def generate_runtime_init(identifiers, strings):
immortal_objects.append(f'(PyObject *)&_Py_SINGLETON(strings).ascii[{i}]')
with printer.block('.latin1 =', ','):
for i in range(128, 256):
printer.write(f'_PyUnicode_LATIN1_INIT("\\x{i:02x}"),')
utf8 = ['"']
for c in chr(i).encode('utf-8'):
utf8.append(f"\\x{c:02x}")
utf8.append('"')
printer.write(f'_PyUnicode_LATIN1_INIT("\\x{i:02x}", {"".join(utf8)}),')
immortal_objects.append(f'(PyObject *)&_Py_SINGLETON(strings).latin1[{i} - 128]')
printer.write('')
with printer.block('.tuple_empty =', ','):