gh-106290: Fix edge cases around uops (#106319)

- Tweak uops debugging output
- Fix the bug from gh-106290
- Rename `SET_IP` to `SAVE_IP` (per https://github.com/faster-cpython/ideas/issues/558)
- Add a `SAVE_IP` uop at the start of the trace (also per https://github.com/faster-cpython/ideas/issues/558)
- Allow `unbound_local_error`; this gives us uops for `LOAD_FAST_CHECK`, `LOAD_CLOSURE`, and `DELETE_FAST`
- Allow longer traces (raise `_Py_UOP_MAX_TRACE_LENGTH` from 16 to 32)
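- Support `STORE_FAST_LOAD_FAST` and `STORE_FAST_STORE_FAST` in the trace translator (each is expanded into two `STORE_FAST`/`LOAD_FAST` uops, like `LOAD_FAST_LOAD_FAST`)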
- Add `Include/internal/pycore_uops.h` to the `PYTHON_HEADERS` dependency list in `Makefile.pre.in`
This commit is contained in:
Guido van Rossum 2023-07-03 13:05:11 -07:00 committed by GitHub
parent 58906213cc
commit 2028a4f6d9
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
7 changed files with 274 additions and 234 deletions

View file

@ -8,7 +8,7 @@ extern "C" {
# error "this header requires Py_BUILD_CORE define"
#endif
#define _Py_UOP_MAX_TRACE_LENGTH 16
#define _Py_UOP_MAX_TRACE_LENGTH 32
typedef struct {
int opcode;

View file

@ -1800,6 +1800,7 @@ PYTHON_HEADERS= \
$(srcdir)/Include/internal/pycore_unionobject.h \
$(srcdir)/Include/internal/pycore_unicodeobject.h \
$(srcdir)/Include/internal/pycore_unicodeobject_generated.h \
$(srcdir)/Include/internal/pycore_uops.h \
$(srcdir)/Include/internal/pycore_warnings.h \
$(srcdir)/Include/internal/pycore_weakref.h \
$(DTRACE_HEADERS) \

View file

@ -2773,24 +2773,26 @@ void Py_LeaveRecursiveCall(void)
_PyInterpreterFrame *
_PyUopExecute(_PyExecutorObject *executor, _PyInterpreterFrame *frame, PyObject **stack_pointer)
{
#ifdef LLTRACE
#ifdef Py_DEBUG
char *uop_debug = Py_GETENV("PYTHONUOPSDEBUG");
int lltrace = 0;
if (uop_debug != NULL && *uop_debug >= '0') {
lltrace = *uop_debug - '0'; // TODO: Parse an int and all that
}
if (lltrace >= 2) {
PyCodeObject *code = _PyFrame_GetCode(frame);
_Py_CODEUNIT *instr = frame->prev_instr + 1;
fprintf(stderr,
"Entering _PyUopExecute for %s (%s:%d) at offset %ld\n",
PyUnicode_AsUTF8(code->co_qualname),
PyUnicode_AsUTF8(code->co_filename),
code->co_firstlineno,
(long)(instr - (_Py_CODEUNIT *)code->co_code_adaptive));
}
#define DPRINTF(level, ...) \
if (lltrace >= (level)) { fprintf(stderr, __VA_ARGS__); }
#else
#define DPRINTF(level, ...)
#endif
DPRINTF(3,
"Entering _PyUopExecute for %s (%s:%d) at offset %ld\n",
PyUnicode_AsUTF8(_PyFrame_GetCode(frame)->co_qualname),
PyUnicode_AsUTF8(_PyFrame_GetCode(frame)->co_filename),
_PyFrame_GetCode(frame)->co_firstlineno,
(long)(frame->prev_instr + 1 -
(_Py_CODEUNIT *)_PyFrame_GetCode(frame)->co_code_adaptive));
PyThreadState *tstate = _PyThreadState_GET();
_PyUOpExecutorObject *self = (_PyUOpExecutorObject *)executor;
@ -2803,7 +2805,7 @@ _PyUopExecute(_PyExecutorObject *executor, _PyInterpreterFrame *frame, PyObject
}
OBJECT_STAT_INC(optimization_traces_executed);
_Py_CODEUNIT *ip_offset = (_Py_CODEUNIT *)_PyFrame_GetCode(frame)->co_code_adaptive - 1;
_Py_CODEUNIT *ip_offset = (_Py_CODEUNIT *)_PyFrame_GetCode(frame)->co_code_adaptive;
int pc = 0;
int opcode;
uint64_t operand;
@ -2812,14 +2814,11 @@ _PyUopExecute(_PyExecutorObject *executor, _PyInterpreterFrame *frame, PyObject
opcode = self->trace[pc].opcode;
operand = self->trace[pc].operand;
oparg = (int)operand;
#ifdef LLTRACE
if (lltrace >= 3) {
const char *opname = opcode < 256 ? _PyOpcode_OpName[opcode] : _PyOpcode_uop_name[opcode];
int stack_level = (int)(stack_pointer - _PyFrame_Stackbase(frame));
fprintf(stderr, " uop %s, operand %" PRIu64 ", stack_level %d\n",
opname, operand, stack_level);
}
#endif
DPRINTF(3,
" uop %s, operand %" PRIu64 ", stack_level %d\n",
opcode < 256 ? _PyOpcode_OpName[opcode] : _PyOpcode_uop_name[opcode],
operand,
(int)(stack_pointer - _PyFrame_Stackbase(frame)));
pc++;
OBJECT_STAT_INC(optimization_uops_executed);
switch (opcode) {
@ -2828,7 +2827,7 @@ _PyUopExecute(_PyExecutorObject *executor, _PyInterpreterFrame *frame, PyObject
#define ENABLE_SPECIALIZATION 0
#include "executor_cases.c.h"
case SET_IP:
case SAVE_IP:
{
frame->prev_instr = ip_offset + oparg;
break;
@ -2836,6 +2835,7 @@ _PyUopExecute(_PyExecutorObject *executor, _PyInterpreterFrame *frame, PyObject
case EXIT_TRACE:
{
frame->prev_instr--; // Back up to just before destination
_PyFrame_SetStackPointer(frame, stack_pointer);
Py_DECREF(self);
return frame;
@ -2850,6 +2850,13 @@ _PyUopExecute(_PyExecutorObject *executor, _PyInterpreterFrame *frame, PyObject
}
}
unbound_local_error:
format_exc_check_arg(tstate, PyExc_UnboundLocalError,
UNBOUNDLOCAL_ERROR_MSG,
PyTuple_GetItem(_PyFrame_GetCode(frame)->co_localsplusnames, oparg)
);
goto error;
pop_4_error:
STACK_SHRINK(1);
pop_3_error:
@ -2861,11 +2868,7 @@ _PyUopExecute(_PyExecutorObject *executor, _PyInterpreterFrame *frame, PyObject
error:
// On ERROR_IF we return NULL as the frame.
// The caller recovers the frame from cframe.current_frame.
#ifdef LLTRACE
if (lltrace >= 2) {
fprintf(stderr, "Error: [Opcode %d, operand %" PRIu64 "]\n", opcode, operand);
}
#endif
DPRINTF(2, "Error: [Opcode %d, operand %" PRIu64 "]\n", opcode, operand);
_PyFrame_SetStackPointer(frame, stack_pointer);
Py_DECREF(self);
return NULL;
@ -2873,11 +2876,8 @@ _PyUopExecute(_PyExecutorObject *executor, _PyInterpreterFrame *frame, PyObject
deoptimize:
// On DEOPT_IF we just repeat the last instruction.
// This presumes nothing was popped from the stack (nor pushed).
#ifdef LLTRACE
if (lltrace >= 2) {
fprintf(stderr, "DEOPT: [Opcode %d, operand %" PRIu64 "]\n", opcode, operand);
}
#endif
DPRINTF(2, "DEOPT: [Opcode %d, operand %" PRIu64 "]\n", opcode, operand);
frame->prev_instr--; // Back up to just before destination
_PyFrame_SetStackPointer(frame, stack_pointer);
Py_DECREF(self);
return frame;

File diff suppressed because it is too large Load diff

View file

@ -20,7 +20,7 @@
0)
#define EXIT_TRACE 300
#define SET_IP 301
#define SAVE_IP 301
#define _GUARD_BOTH_INT 302
#define _BINARY_OP_MULTIPLY_INT 303
#define _BINARY_OP_ADD_INT 304
@ -1164,6 +1164,7 @@ const struct opcode_metadata _PyOpcode_opcode_metadata[512] = {
};
const struct opcode_macro_expansion _PyOpcode_macro_expansion[256] = {
[NOP] = { .nuops = 1, .uops = { { NOP, 0, 0 } } },
[LOAD_FAST_CHECK] = { .nuops = 1, .uops = { { LOAD_FAST_CHECK, 0, 0 } } },
[LOAD_FAST] = { .nuops = 1, .uops = { { LOAD_FAST, 0, 0 } } },
[LOAD_FAST_AND_CLEAR] = { .nuops = 1, .uops = { { LOAD_FAST_AND_CLEAR, 0, 0 } } },
[LOAD_CONST] = { .nuops = 1, .uops = { { LOAD_CONST, 0, 0 } } },
@ -1218,6 +1219,7 @@ const struct opcode_macro_expansion _PyOpcode_macro_expansion[256] = {
[LOAD_LOCALS] = { .nuops = 1, .uops = { { _LOAD_LOCALS, 0, 0 } } },
[LOAD_NAME] = { .nuops = 2, .uops = { { _LOAD_LOCALS, 0, 0 }, { _LOAD_FROM_DICT_OR_GLOBALS, 0, 0 } } },
[LOAD_FROM_DICT_OR_GLOBALS] = { .nuops = 1, .uops = { { _LOAD_FROM_DICT_OR_GLOBALS, 0, 0 } } },
[DELETE_FAST] = { .nuops = 1, .uops = { { DELETE_FAST, 0, 0 } } },
[DELETE_DEREF] = { .nuops = 1, .uops = { { DELETE_DEREF, 0, 0 } } },
[LOAD_FROM_DICT_OR_DEREF] = { .nuops = 1, .uops = { { LOAD_FROM_DICT_OR_DEREF, 0, 0 } } },
[LOAD_DEREF] = { .nuops = 1, .uops = { { LOAD_DEREF, 0, 0 } } },
@ -1266,7 +1268,7 @@ const struct opcode_macro_expansion _PyOpcode_macro_expansion[256] = {
#ifdef Py_DEBUG
const char * const _PyOpcode_uop_name[512] = {
[300] = "EXIT_TRACE",
[301] = "SET_IP",
[301] = "SAVE_IP",
[302] = "_GUARD_BOTH_INT",
[303] = "_BINARY_OP_MULTIPLY_INT",
[304] = "_BINARY_OP_ADD_INT",

View file

@ -282,11 +282,6 @@ PyUnstable_Optimizer_NewCounter(void)
///////////////////// Experimental UOp Optimizer /////////////////////
#ifdef Py_DEBUG
/* For debugging the interpreter: */
# define LLTRACE 1 /* Low-level trace feature */
#endif
static void
uop_dealloc(_PyUOpExecutorObject *self) {
PyObject_Free(self);
@ -308,60 +303,81 @@ translate_bytecode_to_trace(
_PyUOpInstruction *trace,
int max_length)
{
#ifdef LLTRACE
int trace_length = 0;
#ifdef Py_DEBUG
char *uop_debug = Py_GETENV("PYTHONUOPSDEBUG");
int lltrace = 0;
if (uop_debug != NULL && *uop_debug >= '0') {
lltrace = *uop_debug - '0'; // TODO: Parse an int and all that
}
if (lltrace >= 4) {
fprintf(stderr,
"Optimizing %s (%s:%d) at offset %ld\n",
PyUnicode_AsUTF8(code->co_qualname),
PyUnicode_AsUTF8(code->co_filename),
code->co_firstlineno,
(long)(instr - (_Py_CODEUNIT *)code->co_code_adaptive));
}
#define ADD_TO_TRACE(OPCODE, OPERAND) \
if (lltrace >= 2) { \
const char *opname = (OPCODE) < 256 ? _PyOpcode_OpName[(OPCODE)] : _PyOpcode_uop_name[(OPCODE)]; \
fprintf(stderr, " ADD_TO_TRACE(%s, %" PRIu64 ")\n", opname, (uint64_t)(OPERAND)); \
} \
trace[trace_length].opcode = (OPCODE); \
trace[trace_length].operand = (OPERAND); \
trace_length++;
#define DPRINTF(level, ...) \
if (lltrace >= (level)) { fprintf(stderr, __VA_ARGS__); }
#else
#define ADD_TO_TRACE(OPCODE, OPERAND) \
trace[trace_length].opcode = (OPCODE); \
trace[trace_length].operand = (OPERAND); \
trace_length++;
#define DPRINTF(level, ...)
#endif
int trace_length = 0;
// Always reserve space for one uop, plus SET_UP, plus EXIT_TRACE
while (trace_length + 3 <= max_length) {
#define ADD_TO_TRACE(OPCODE, OPERAND) \
DPRINTF(2, \
" ADD_TO_TRACE(%s, %" PRIu64 ")\n", \
(OPCODE) < 256 ? _PyOpcode_OpName[(OPCODE)] : _PyOpcode_uop_name[(OPCODE)], \
(uint64_t)(OPERAND)); \
assert(trace_length < max_length); \
trace[trace_length].opcode = (OPCODE); \
trace[trace_length].operand = (OPERAND); \
trace_length++;
DPRINTF(4,
"Optimizing %s (%s:%d) at offset %ld\n",
PyUnicode_AsUTF8(code->co_qualname),
PyUnicode_AsUTF8(code->co_filename),
code->co_firstlineno,
(long)(instr - (_Py_CODEUNIT *)code->co_code_adaptive));
for (;;) {
ADD_TO_TRACE(SAVE_IP, (int)(instr - (_Py_CODEUNIT *)code->co_code_adaptive));
int opcode = instr->op.code;
uint64_t operand = instr->op.arg;
switch (opcode) {
case LOAD_FAST_LOAD_FAST:
case STORE_FAST_LOAD_FAST:
case STORE_FAST_STORE_FAST:
{
// Reserve space for two uops (+ SETUP + EXIT_TRACE)
// Reserve space for two uops (+ SAVE_IP + EXIT_TRACE)
if (trace_length + 4 > max_length) {
DPRINTF(1, "Ran out of space for LOAD_FAST_LOAD_FAST\n");
goto done;
}
uint64_t oparg1 = operand >> 4;
uint64_t oparg2 = operand & 15;
ADD_TO_TRACE(LOAD_FAST, oparg1);
ADD_TO_TRACE(LOAD_FAST, oparg2);
switch (opcode) {
case LOAD_FAST_LOAD_FAST:
ADD_TO_TRACE(LOAD_FAST, oparg1);
ADD_TO_TRACE(LOAD_FAST, oparg2);
break;
case STORE_FAST_LOAD_FAST:
ADD_TO_TRACE(STORE_FAST, oparg1);
ADD_TO_TRACE(LOAD_FAST, oparg2);
break;
case STORE_FAST_STORE_FAST:
ADD_TO_TRACE(STORE_FAST, oparg1);
ADD_TO_TRACE(STORE_FAST, oparg2);
break;
default:
Py_FatalError("Missing case");
}
break;
}
default:
{
const struct opcode_macro_expansion *expansion = &_PyOpcode_macro_expansion[opcode];
if (expansion->nuops > 0) {
// Reserve space for nuops (+ SETUP + EXIT_TRACE)
// Reserve space for nuops (+ SAVE_IP + EXIT_TRACE)
int nuops = expansion->nuops;
if (trace_length + nuops + 2 > max_length) {
DPRINTF(1,
"Ran out of space for %s\n",
opcode < 256 ? _PyOpcode_OpName[opcode] : _PyOpcode_uop_name[opcode]);
goto done;
}
for (int i = 0; i < nuops; i++) {
@ -387,49 +403,45 @@ translate_bytecode_to_trace(
Py_FatalError("garbled expansion");
}
ADD_TO_TRACE(expansion->uops[i].uop, operand);
assert(expansion->uops[0].size == 0); // TODO
}
break;
}
// fprintf(stderr, "Unsupported opcode %d\n", opcode);
goto done; // Break out of while loop
DPRINTF(2,
"Unsupported opcode %s\n",
opcode < 256 ? _PyOpcode_OpName[opcode] : _PyOpcode_uop_name[opcode]);
goto done; // Break out of loop
}
}
instr++;
// Add cache size for opcode
instr += _PyOpcode_Caches[_PyOpcode_Deopt[opcode]];
ADD_TO_TRACE(SET_IP, (int)(instr - (_Py_CODEUNIT *)code->co_code_adaptive));
}
done:
if (trace_length > 0) {
// Skip short traces like SAVE_IP, LOAD_FAST, SAVE_IP, EXIT_TRACE
if (trace_length > 3) {
ADD_TO_TRACE(EXIT_TRACE, 0);
#ifdef LLTRACE
if (lltrace >= 1) {
fprintf(stderr,
"Created a trace for %s (%s:%d) at offset %ld -- length %d\n",
PyUnicode_AsUTF8(code->co_qualname),
PyUnicode_AsUTF8(code->co_filename),
code->co_firstlineno,
(long)(instr - (_Py_CODEUNIT *)code->co_code_adaptive),
trace_length);
}
#endif
DPRINTF(1,
"Created a trace for %s (%s:%d) at offset %ld -- length %d\n",
PyUnicode_AsUTF8(code->co_qualname),
PyUnicode_AsUTF8(code->co_filename),
code->co_firstlineno,
(long)(instr - (_Py_CODEUNIT *)code->co_code_adaptive),
trace_length);
return trace_length;
}
else {
#ifdef LLTRACE
if (lltrace >= 4) {
fprintf(stderr,
"No trace for %s (%s:%d) at offset %ld\n",
PyUnicode_AsUTF8(code->co_qualname),
PyUnicode_AsUTF8(code->co_filename),
code->co_firstlineno,
(long)(instr - (_Py_CODEUNIT *)code->co_code_adaptive));
}
#endif
DPRINTF(4,
"No trace for %s (%s:%d) at offset %ld\n",
PyUnicode_AsUTF8(code->co_qualname),
PyUnicode_AsUTF8(code->co_filename),
code->co_firstlineno,
(long)(instr - (_Py_CODEUNIT *)code->co_code_adaptive));
}
return trace_length;
return 0;
#undef ADD_TO_TRACE
#undef DPRINTF
}
static int

View file

@ -308,8 +308,7 @@ class ActiveCacheEffect:
FORBIDDEN_NAMES_IN_UOPS = (
"resume_with_error", # Proxy for "goto", which isn't an IDENTIFIER
"unbound_local_error",
"resume_with_error",
"kwnames",
"next_instr",
"oparg1", # Proxy for super-instructions like LOAD_FAST_LOAD_FAST
@ -401,20 +400,25 @@ def __init__(self, inst: parser.InstDef):
def is_viable_uop(self) -> bool:
"""Whether this instruction is viable as a uop."""
if self.always_exits:
# print(f"Skipping {self.name} because it always exits")
return False
if self.instr_flags.HAS_ARG_FLAG:
# If the instruction uses oparg, it cannot use any caches
if self.active_caches:
# print(f"Skipping {self.name} because it uses oparg and caches")
return False
else:
# If it doesn't use oparg, it can have one cache entry
if len(self.active_caches) > 1:
# print(f"Skipping {self.name} because it has >1 cache entries")
return False
res = True
for forbidden in FORBIDDEN_NAMES_IN_UOPS:
# TODO: Don't check in '#ifdef ENABLE_SPECIALIZATION' regions
if variable_used(self.inst, forbidden):
return False
return True
# print(f"Skipping {self.name} because it uses {forbidden}")
res = False
return res
def write(self, out: Formatter, tier: Tiers = TIER_ONE) -> None:
"""Write one instruction, sans prologue and epilogue."""
@ -1323,7 +1327,7 @@ def add(name: str) -> None:
self.out.emit(make_text(name, counter))
counter += 1
add("EXIT_TRACE")
add("SET_IP")
add("SAVE_IP")
for instr in self.instrs.values():
if instr.kind == "op" and instr.is_viable_uop():
add(instr.name)