From b45cbd3a177220f5b8a2e026ea1ffdde13cf4189 Mon Sep 17 00:00:00 2001 From: cocolato Date: Fri, 13 Mar 2026 00:07:36 +0800 Subject: [PATCH 1/2] replacing Bloom filter linked lists with contiguous arrays --- Include/internal/pycore_interp_structs.h | 6 +- Include/internal/pycore_optimizer.h | 5 +- InternalDocs/jit.md | 4 +- Modules/_testinternalcapi.c | 6 +- Objects/funcobject.c | 1 - Python/jit.c | 19 +++- Python/optimizer.c | 118 ++++++++++++----------- Python/pylifecycle.c | 6 ++ Python/pystate.c | 5 +- 9 files changed, 104 insertions(+), 66 deletions(-) diff --git a/Include/internal/pycore_interp_structs.h b/Include/internal/pycore_interp_structs.h index 776fb9575c2365..e2008f8303e21b 100644 --- a/Include/internal/pycore_interp_structs.h +++ b/Include/internal/pycore_interp_structs.h @@ -14,6 +14,7 @@ extern "C" { #include "pycore_structs.h" // PyHamtObject #include "pycore_tstate.h" // _PyThreadStateImpl #include "pycore_typedefs.h" // _PyRuntimeState +#include "pycore_uop.h" // _PyBloomFilter #define CODE_MAX_WATCHERS 8 #define CONTEXT_MAX_WATCHERS 8 @@ -972,7 +973,10 @@ struct _is { // Optimization configuration (thresholds and flags for JIT and interpreter) _PyOptimizationConfig opt_config; - struct _PyExecutorObject *executor_list_head; + _PyBloomFilter *executor_blooms; // Contiguous bloom filter array + struct _PyExecutorObject **executor_ptrs; // Corresponding executor pointer array + size_t executor_count; // Number of valid executors + size_t executor_capacity; // Array capacity struct _PyExecutorObject *executor_deletion_list_head; struct _PyExecutorObject *cold_executor; struct _PyExecutorObject *cold_dynamic_executor; diff --git a/Include/internal/pycore_optimizer.h b/Include/internal/pycore_optimizer.h index c63f0167a0f64a..22b9ce8b003491 100644 --- a/Include/internal/pycore_optimizer.h +++ b/Include/internal/pycore_optimizer.h @@ -129,7 +129,8 @@ typedef struct { uint8_t pending_deletion; int32_t index; // Index of ENTER_EXECUTOR (if code isn't NULL, below). _PyBloomFilter bloom; - _PyExecutorLinkListNode links; + int32_t bloom_array_idx; // Index in interp->executor_blooms/executor_ptrs. + _PyExecutorLinkListNode links; // Used by deletion list. PyCodeObject *code; // Weak (NULL if no corresponding ENTER_EXECUTOR). } _PyVMData; @@ -157,7 +158,7 @@ typedef struct _PyExecutorObject { // Export for '_opcode' shared extension (JIT compiler). PyAPI_FUNC(_PyExecutorObject*) _Py_GetExecutor(PyCodeObject *code, int offset); -void _Py_ExecutorInit(_PyExecutorObject *, const _PyBloomFilter *); +int _Py_ExecutorInit(_PyExecutorObject *, const _PyBloomFilter *); void _Py_ExecutorDetach(_PyExecutorObject *); void _Py_BloomFilter_Init(_PyBloomFilter *); void _Py_BloomFilter_Add(_PyBloomFilter *bloom, void *obj); diff --git a/InternalDocs/jit.md b/InternalDocs/jit.md index 1740b22b85f77b..decfccad2d8d37 100644 --- a/InternalDocs/jit.md +++ b/InternalDocs/jit.md @@ -78,8 +78,8 @@ and execution returns to the adaptive interpreter. ## Invalidating Executors In addition to being stored on the code object, each executor is also -inserted into a list of all executors, which is stored in the interpreter -state's `executor_list_head` field. This list is used when it is necessary +inserted into contiguous arrays (`executor_blooms` and `executor_ptrs`) +stored in the interpreter state. These arrays are used when it is necessary to invalidate executors because values they used in their construction may have changed. diff --git a/Modules/_testinternalcapi.c b/Modules/_testinternalcapi.c index aa5911ef2fb449..8c316b7c8ddda0 100644 --- a/Modules/_testinternalcapi.c +++ b/Modules/_testinternalcapi.c @@ -278,10 +278,8 @@ get_jit_code_ranges(PyObject *self, PyObject *Py_UNUSED(args)) if (interp == NULL) { return ranges; } - for (_PyExecutorObject *exec = interp->executor_list_head; - exec != NULL; - exec = exec->vm_data.links.next) - { + for (size_t i = 0; i < interp->executor_count; i++) { + _PyExecutorObject *exec = interp->executor_ptrs[i]; if (exec->jit_code == NULL || exec->jit_size == 0) { continue; } diff --git a/Objects/funcobject.c b/Objects/funcobject.c index fc32826fb3a861..a7a3d9c78ef3d0 100644 --- a/Objects/funcobject.c +++ b/Objects/funcobject.c @@ -12,7 +12,6 @@ #include "pycore_setobject.h" // _PySet_NextEntry() #include "pycore_stats.h" #include "pycore_weakref.h" // FT_CLEAR_WEAKREFS() -#include "pycore_optimizer.h" // _Py_Executors_InvalidateDependency static const char * func_event_name(PyFunction_WatchEvent event) { diff --git a/Python/jit.c b/Python/jit.c index 3e0a0aa8bfcc81..4990c743224d3c 100644 --- a/Python/jit.c +++ b/Python/jit.c @@ -62,6 +62,23 @@ jit_error(const char *message) static size_t _Py_jit_shim_size = 0; +static int +address_in_executor_array(_PyExecutorObject **ptrs, size_t count, uintptr_t addr) +{ + for (size_t i = 0; i < count; i++) { + _PyExecutorObject *exec = ptrs[i]; + if (exec->jit_code == NULL || exec->jit_size == 0) { + continue; + } + uintptr_t start = (uintptr_t)exec->jit_code; + uintptr_t end = start + exec->jit_size; + if (addr >= start && addr < end) { + return 1; + } + } + return 0; +} + static int address_in_executor_list(_PyExecutorObject *head, uintptr_t addr) { @@ -94,7 +111,7 @@ _PyJIT_AddressInJitCode(PyInterpreterState *interp, uintptr_t addr) return 1; } } - if (address_in_executor_list(interp->executor_list_head, addr)) { + if (address_in_executor_array(interp->executor_ptrs, interp->executor_count, addr)) { return 1; } if (address_in_executor_list(interp->executor_deletion_list_head, addr)) { diff --git a/Python/optimizer.c b/Python/optimizer.c index 7315bb6b9f603d..6c1caa7f01e42b 100644 --- a/Python/optimizer.c +++ b/Python/optimizer.c @@ -1379,7 +1379,10 @@ make_executor_from_uops(_PyThreadStateImpl *tstate, _PyUOpInstruction *buffer, i // linking of executor. Otherwise, the GC tries to untrack a // still untracked object during dealloc. _PyObject_GC_TRACK(executor); - _Py_ExecutorInit(executor, dependencies); + if (_Py_ExecutorInit(executor, dependencies) < 0) { + Py_DECREF(executor); + return NULL; + } #ifdef Py_DEBUG char *python_lltrace = Py_GETENV("PYTHON_LLTRACE"); int lltrace = 0; @@ -1646,50 +1649,54 @@ bloom_filter_may_contain(_PyBloomFilter *bloom, _PyBloomFilter *hashes) return true; } -static void +static int link_executor(_PyExecutorObject *executor) { PyInterpreterState *interp = _PyInterpreterState_GET(); - _PyExecutorLinkListNode *links = &executor->vm_data.links; - _PyExecutorObject *head = interp->executor_list_head; - if (head == NULL) { - interp->executor_list_head = executor; - links->previous = NULL; - links->next = NULL; - } - else { - assert(head->vm_data.links.previous == NULL); - links->previous = NULL; - links->next = head; - head->vm_data.links.previous = executor; - interp->executor_list_head = executor; - } - /* executor_list_head must be first in list */ - assert(interp->executor_list_head->vm_data.links.previous == NULL); + if (interp->executor_count == interp->executor_capacity) { + size_t new_cap = interp->executor_capacity ? interp->executor_capacity * 2 : 64; + _PyBloomFilter *new_blooms = PyMem_Realloc( + interp->executor_blooms, new_cap * sizeof(_PyBloomFilter)); + if (new_blooms == NULL) { + return -1; + } + _PyExecutorObject **new_ptrs = PyMem_Realloc( + interp->executor_ptrs, new_cap * sizeof(_PyExecutorObject *)); + if (new_ptrs == NULL) { + /* Revert blooms realloc — the old pointer may have been freed by + * a successful realloc, but new_blooms is the valid pointer. */ + interp->executor_blooms = new_blooms; + return -1; + } + interp->executor_blooms = new_blooms; + interp->executor_ptrs = new_ptrs; + interp->executor_capacity = new_cap; + } + size_t idx = interp->executor_count++; + interp->executor_blooms[idx] = executor->vm_data.bloom; + interp->executor_ptrs[idx] = executor; + executor->vm_data.bloom_array_idx = (int32_t)idx; + return 0; } static void unlink_executor(_PyExecutorObject *executor) { - _PyExecutorLinkListNode *links = &executor->vm_data.links; - _PyExecutorObject *next = links->next; - _PyExecutorObject *prev = links->previous; - if (next != NULL) { - next->vm_data.links.previous = prev; - } - if (prev != NULL) { - prev->vm_data.links.next = next; - } - else { - // prev == NULL implies that executor is the list head - PyInterpreterState *interp = PyInterpreterState_Get(); - assert(interp->executor_list_head == executor); - interp->executor_list_head = next; + PyInterpreterState *interp = PyInterpreterState_Get(); + int32_t idx = executor->vm_data.bloom_array_idx; + assert(idx >= 0 && (size_t)idx < interp->executor_count); + size_t last = --interp->executor_count; + if ((size_t)idx != last) { + /* Swap-remove: move the last element into the vacated slot */ + interp->executor_blooms[idx] = interp->executor_blooms[last]; + interp->executor_ptrs[idx] = interp->executor_ptrs[last]; + interp->executor_ptrs[idx]->vm_data.bloom_array_idx = idx; } + executor->vm_data.bloom_array_idx = -1; } /* This must be called by optimizers before using the executor */ -void +int _Py_ExecutorInit(_PyExecutorObject *executor, const _PyBloomFilter *dependency_set) { executor->vm_data.valid = true; @@ -1698,7 +1705,10 @@ _Py_ExecutorInit(_PyExecutorObject *executor, const _PyBloomFilter *dependency_s for (int i = 0; i < _Py_BLOOM_FILTER_WORDS; i++) { executor->vm_data.bloom.bits[i] = dependency_set->bits[i]; } - link_executor(executor); + if (link_executor(executor) < 0) { + return -1; + } + return 0; } static _PyExecutorObject * @@ -1810,10 +1820,16 @@ _Py_Executor_DependsOn(_PyExecutorObject *executor, void *obj) { assert(executor->vm_data.valid); _Py_BloomFilter_Add(&executor->vm_data.bloom, obj); + /* Keep the contiguous bloom array in sync */ + PyInterpreterState *interp = _PyInterpreterState_GET(); + int32_t idx = executor->vm_data.bloom_array_idx; + assert(idx >= 0 && (size_t)idx < interp->executor_count); + _Py_BloomFilter_Add(&interp->executor_blooms[idx], obj); } /* Invalidate all executors that depend on `obj` - * May cause other executors to be invalidated as well + * May cause other executors to be invalidated as well. + * Uses contiguous bloom filter array for cache-friendly scanning. */ void _Py_Executors_InvalidateDependency(PyInterpreterState *interp, void *obj, int is_invalidation) @@ -1821,23 +1837,20 @@ _Py_Executors_InvalidateDependency(PyInterpreterState *interp, void *obj, int is _PyBloomFilter obj_filter; _Py_BloomFilter_Init(&obj_filter); _Py_BloomFilter_Add(&obj_filter, obj); - /* Walk the list of executors */ - /* TO DO -- Use a tree to avoid traversing as many objects */ + /* Scan contiguous bloom filter array */ PyObject *invalidate = PyList_New(0); if (invalidate == NULL) { goto error; } /* Clearing an executor can clear others, so we need to make a list of * executors to invalidate first */ - for (_PyExecutorObject *exec = interp->executor_list_head; exec != NULL;) { - assert(exec->vm_data.valid); - _PyExecutorObject *next = exec->vm_data.links.next; - if (bloom_filter_may_contain(&exec->vm_data.bloom, &obj_filter) && - PyList_Append(invalidate, (PyObject *)exec)) + for (size_t i = 0; i < interp->executor_count; i++) { + assert(interp->executor_ptrs[i]->vm_data.valid); + if (bloom_filter_may_contain(&interp->executor_blooms[i], &obj_filter) && + PyList_Append(invalidate, (PyObject *)interp->executor_ptrs[i])) { goto error; } - exec = next; } for (Py_ssize_t i = 0; i < PyList_GET_SIZE(invalidate); i++) { PyObject *exec = PyList_GET_ITEM(invalidate, i); @@ -1859,8 +1872,9 @@ _Py_Executors_InvalidateDependency(PyInterpreterState *interp, void *obj, int is void _Py_Executors_InvalidateAll(PyInterpreterState *interp, int is_invalidation) { - while (interp->executor_list_head) { - _PyExecutorObject *executor = interp->executor_list_head; + while (interp->executor_count > 0) { + /* Invalidate from the end to avoid repeated swap-remove shifts */ + _PyExecutorObject *executor = interp->executor_ptrs[interp->executor_count - 1]; assert(executor->vm_data.valid); if (executor->vm_data.code) { // Clear the entire code object so its co_executors array be freed: @@ -1878,8 +1892,7 @@ _Py_Executors_InvalidateAll(PyInterpreterState *interp, int is_invalidation) void _Py_Executors_InvalidateCold(PyInterpreterState *interp) { - /* Walk the list of executors */ - /* TO DO -- Use a tree to avoid traversing as many objects */ + /* Scan contiguous executor array */ PyObject *invalidate = PyList_New(0); if (invalidate == NULL) { goto error; @@ -1887,9 +1900,9 @@ _Py_Executors_InvalidateCold(PyInterpreterState *interp) /* Clearing an executor can deallocate others, so we need to make a list of * executors to invalidate first */ - for (_PyExecutorObject *exec = interp->executor_list_head; exec != NULL;) { + for (size_t i = 0; i < interp->executor_count; i++) { + _PyExecutorObject *exec = interp->executor_ptrs[i]; assert(exec->vm_data.valid); - _PyExecutorObject *next = exec->vm_data.links.next; if (exec->vm_data.cold && PyList_Append(invalidate, (PyObject *)exec) < 0) { goto error; @@ -1897,8 +1910,6 @@ _Py_Executors_InvalidateCold(PyInterpreterState *interp) else { exec->vm_data.cold = true; } - - exec = next; } for (Py_ssize_t i = 0; i < PyList_GET_SIZE(invalidate); i++) { PyObject *exec = PyList_GET_ITEM(invalidate, i); @@ -2142,9 +2153,8 @@ _PyDumpExecutors(FILE *out) fprintf(out, " rankdir = \"LR\"\n\n"); fprintf(out, " node [colorscheme=greys9]\n"); PyInterpreterState *interp = PyInterpreterState_Get(); - for (_PyExecutorObject *exec = interp->executor_list_head; exec != NULL;) { - executor_to_gv(exec, out); - exec = exec->vm_data.links.next; + for (size_t i = 0; i < interp->executor_count; i++) { + executor_to_gv(interp->executor_ptrs[i], out); } fprintf(out, "}\n\n"); return 0; diff --git a/Python/pylifecycle.c b/Python/pylifecycle.c index 2b8e9a02cb6184..68052a91d6a1f1 100644 --- a/Python/pylifecycle.c +++ b/Python/pylifecycle.c @@ -1761,6 +1761,12 @@ finalize_modules(PyThreadState *tstate) interp->compiling = false; #ifdef _Py_TIER2 _Py_Executors_InvalidateAll(interp, 0); + PyMem_Free(interp->executor_blooms); + PyMem_Free(interp->executor_ptrs); + interp->executor_blooms = NULL; + interp->executor_ptrs = NULL; + interp->executor_count = 0; + interp->executor_capacity = 0; #endif // Stop watching __builtin__ modifications diff --git a/Python/pystate.c b/Python/pystate.c index 17b8430b19c188..fcb73b4dcefe1d 100644 --- a/Python/pystate.c +++ b/Python/pystate.c @@ -597,7 +597,10 @@ init_interpreter(PyInterpreterState *interp, interp->_code_object_generation = 0; interp->jit = false; interp->compiling = false; - interp->executor_list_head = NULL; + interp->executor_blooms = NULL; + interp->executor_ptrs = NULL; + interp->executor_count = 0; + interp->executor_capacity = 0; interp->executor_deletion_list_head = NULL; interp->executor_creation_counter = JIT_CLEANUP_THRESHOLD; From aad35f112a498470dee81ba9ae455079c0a43255 Mon Sep 17 00:00:00 2001 From: cocolato Date: Fri, 13 Mar 2026 09:39:32 +0800 Subject: [PATCH 2/2] remove unnecessary field --- Include/internal/pycore_optimizer.h | 1 - Python/optimizer.c | 11 +++-------- 2 files changed, 3 insertions(+), 9 deletions(-) diff --git a/Include/internal/pycore_optimizer.h b/Include/internal/pycore_optimizer.h index 22b9ce8b003491..cea9fcce237301 100644 --- a/Include/internal/pycore_optimizer.h +++ b/Include/internal/pycore_optimizer.h @@ -128,7 +128,6 @@ typedef struct { bool cold; uint8_t pending_deletion; int32_t index; // Index of ENTER_EXECUTOR (if code isn't NULL, below). - _PyBloomFilter bloom; int32_t bloom_array_idx; // Index in interp->executor_blooms/executor_ptrs. _PyExecutorLinkListNode links; // Used by deletion list. PyCodeObject *code; // Weak (NULL if no corresponding ENTER_EXECUTOR). diff --git a/Python/optimizer.c b/Python/optimizer.c index 6c1caa7f01e42b..a9e788d0dcb1d5 100644 --- a/Python/optimizer.c +++ b/Python/optimizer.c @@ -1650,7 +1650,7 @@ bloom_filter_may_contain(_PyBloomFilter *bloom, _PyBloomFilter *hashes) } static int -link_executor(_PyExecutorObject *executor) +link_executor(_PyExecutorObject *executor, const _PyBloomFilter *bloom) { PyInterpreterState *interp = _PyInterpreterState_GET(); if (interp->executor_count == interp->executor_capacity) { @@ -1673,7 +1673,7 @@ link_executor(_PyExecutorObject *executor) interp->executor_capacity = new_cap; } size_t idx = interp->executor_count++; - interp->executor_blooms[idx] = executor->vm_data.bloom; + interp->executor_blooms[idx] = *bloom; interp->executor_ptrs[idx] = executor; executor->vm_data.bloom_array_idx = (int32_t)idx; return 0; @@ -1702,10 +1702,7 @@ _Py_ExecutorInit(_PyExecutorObject *executor, const _PyBloomFilter *dependency_s executor->vm_data.valid = true; executor->vm_data.pending_deletion = 0; executor->vm_data.code = NULL; - for (int i = 0; i < _Py_BLOOM_FILTER_WORDS; i++) { - executor->vm_data.bloom.bits[i] = dependency_set->bits[i]; - } - if (link_executor(executor) < 0) { + if (link_executor(executor, dependency_set) < 0) { return -1; } return 0; @@ -1819,8 +1816,6 @@ void _Py_Executor_DependsOn(_PyExecutorObject *executor, void *obj) { assert(executor->vm_data.valid); - _Py_BloomFilter_Add(&executor->vm_data.bloom, obj); - /* Keep the contiguous bloom array in sync */ PyInterpreterState *interp = _PyInterpreterState_GET(); int32_t idx = executor->vm_data.bloom_array_idx; assert(idx >= 0 && (size_t)idx < interp->executor_count);