Skip to content

Commit 81ef1b7

Browse files
authored
gh-144888: Replace bloom filter linked lists with contiguous arrays to optimize executor invalidation performance (GH-145873)
1 parent e18abc6 commit 81ef1b7

File tree

9 files changed

+103
-71
lines changed

9 files changed

+103
-71
lines changed

Include/internal/pycore_interp_structs.h

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,7 @@ extern "C" {
1414
#include "pycore_structs.h" // PyHamtObject
1515
#include "pycore_tstate.h" // _PyThreadStateImpl
1616
#include "pycore_typedefs.h" // _PyRuntimeState
17+
#include "pycore_uop.h" // _PyBloomFilter
1718

1819
#define CODE_MAX_WATCHERS 8
1920
#define CONTEXT_MAX_WATCHERS 8
@@ -972,7 +973,10 @@ struct _is {
972973

973974
// Optimization configuration (thresholds and flags for JIT and interpreter)
974975
_PyOptimizationConfig opt_config;
975-
struct _PyExecutorObject *executor_list_head;
976+
_PyBloomFilter *executor_blooms; // Contiguous bloom filter array
977+
struct _PyExecutorObject **executor_ptrs; // Corresponding executor pointer array
978+
size_t executor_count; // Number of valid executors
979+
size_t executor_capacity; // Array capacity
976980
struct _PyExecutorObject *executor_deletion_list_head;
977981
struct _PyExecutorObject *cold_executor;
978982
struct _PyExecutorObject *cold_dynamic_executor;

Include/internal/pycore_optimizer.h

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -128,8 +128,8 @@ typedef struct {
128128
bool cold;
129129
uint8_t pending_deletion;
130130
int32_t index; // Index of ENTER_EXECUTOR (if code isn't NULL, below).
131-
_PyBloomFilter bloom;
132-
_PyExecutorLinkListNode links;
131+
int32_t bloom_array_idx; // Index in interp->executor_blooms/executor_ptrs.
132+
_PyExecutorLinkListNode links; // Used by deletion list.
133133
PyCodeObject *code; // Weak (NULL if no corresponding ENTER_EXECUTOR).
134134
} _PyVMData;
135135

@@ -157,7 +157,7 @@ typedef struct _PyExecutorObject {
157157
// Export for '_opcode' shared extension (JIT compiler).
158158
PyAPI_FUNC(_PyExecutorObject*) _Py_GetExecutor(PyCodeObject *code, int offset);
159159

160-
void _Py_ExecutorInit(_PyExecutorObject *, const _PyBloomFilter *);
160+
int _Py_ExecutorInit(_PyExecutorObject *, const _PyBloomFilter *);
161161
void _Py_ExecutorDetach(_PyExecutorObject *);
162162
void _Py_BloomFilter_Init(_PyBloomFilter *);
163163
void _Py_BloomFilter_Add(_PyBloomFilter *bloom, void *obj);

InternalDocs/jit.md

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -78,8 +78,8 @@ and execution returns to the adaptive interpreter.
7878
## Invalidating Executors
7979

8080
In addition to being stored on the code object, each executor is also
81-
inserted into a list of all executors, which is stored in the interpreter
82-
state's `executor_list_head` field. This list is used when it is necessary
81+
inserted into contiguous arrays (`executor_blooms` and `executor_ptrs`)
82+
stored in the interpreter state. These arrays are used when it is necessary
8383
to invalidate executors because values they used in their construction may
8484
have changed.
8585

Modules/_testinternalcapi.c

Lines changed: 2 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -278,10 +278,8 @@ get_jit_code_ranges(PyObject *self, PyObject *Py_UNUSED(args))
278278
if (interp == NULL) {
279279
return ranges;
280280
}
281-
for (_PyExecutorObject *exec = interp->executor_list_head;
282-
exec != NULL;
283-
exec = exec->vm_data.links.next)
284-
{
281+
for (size_t i = 0; i < interp->executor_count; i++) {
282+
_PyExecutorObject *exec = interp->executor_ptrs[i];
285283
if (exec->jit_code == NULL || exec->jit_size == 0) {
286284
continue;
287285
}

Objects/funcobject.c

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,6 @@
1212
#include "pycore_setobject.h" // _PySet_NextEntry()
1313
#include "pycore_stats.h"
1414
#include "pycore_weakref.h" // FT_CLEAR_WEAKREFS()
15-
#include "pycore_optimizer.h" // _Py_Executors_InvalidateDependency
1615

1716
static const char *
1817
func_event_name(PyFunction_WatchEvent event) {

Python/jit.c

Lines changed: 18 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -62,6 +62,23 @@ jit_error(const char *message)
6262

6363
static size_t _Py_jit_shim_size = 0;
6464

65+
static int
66+
address_in_executor_array(_PyExecutorObject **ptrs, size_t count, uintptr_t addr)
67+
{
68+
for (size_t i = 0; i < count; i++) {
69+
_PyExecutorObject *exec = ptrs[i];
70+
if (exec->jit_code == NULL || exec->jit_size == 0) {
71+
continue;
72+
}
73+
uintptr_t start = (uintptr_t)exec->jit_code;
74+
uintptr_t end = start + exec->jit_size;
75+
if (addr >= start && addr < end) {
76+
return 1;
77+
}
78+
}
79+
return 0;
80+
}
81+
6582
static int
6683
address_in_executor_list(_PyExecutorObject *head, uintptr_t addr)
6784
{
@@ -94,7 +111,7 @@ _PyJIT_AddressInJitCode(PyInterpreterState *interp, uintptr_t addr)
94111
return 1;
95112
}
96113
}
97-
if (address_in_executor_list(interp->executor_list_head, addr)) {
114+
if (address_in_executor_array(interp->executor_ptrs, interp->executor_count, addr)) {
98115
return 1;
99116
}
100117
if (address_in_executor_list(interp->executor_deletion_list_head, addr)) {

Python/optimizer.c

Lines changed: 63 additions & 58 deletions
Original file line numberDiff line numberDiff line change
@@ -1379,7 +1379,10 @@ make_executor_from_uops(_PyThreadStateImpl *tstate, _PyUOpInstruction *buffer, i
13791379
// linking of executor. Otherwise, the GC tries to untrack a
13801380
// still untracked object during dealloc.
13811381
_PyObject_GC_TRACK(executor);
1382-
_Py_ExecutorInit(executor, dependencies);
1382+
if (_Py_ExecutorInit(executor, dependencies) < 0) {
1383+
Py_DECREF(executor);
1384+
return NULL;
1385+
}
13831386
#ifdef Py_DEBUG
13841387
char *python_lltrace = Py_GETENV("PYTHON_LLTRACE");
13851388
int lltrace = 0;
@@ -1646,59 +1649,63 @@ bloom_filter_may_contain(_PyBloomFilter *bloom, _PyBloomFilter *hashes)
16461649
return true;
16471650
}
16481651

1649-
static void
1650-
link_executor(_PyExecutorObject *executor)
1652+
static int
1653+
link_executor(_PyExecutorObject *executor, const _PyBloomFilter *bloom)
16511654
{
16521655
PyInterpreterState *interp = _PyInterpreterState_GET();
1653-
_PyExecutorLinkListNode *links = &executor->vm_data.links;
1654-
_PyExecutorObject *head = interp->executor_list_head;
1655-
if (head == NULL) {
1656-
interp->executor_list_head = executor;
1657-
links->previous = NULL;
1658-
links->next = NULL;
1659-
}
1660-
else {
1661-
assert(head->vm_data.links.previous == NULL);
1662-
links->previous = NULL;
1663-
links->next = head;
1664-
head->vm_data.links.previous = executor;
1665-
interp->executor_list_head = executor;
1666-
}
1667-
/* executor_list_head must be first in list */
1668-
assert(interp->executor_list_head->vm_data.links.previous == NULL);
1656+
if (interp->executor_count == interp->executor_capacity) {
1657+
size_t new_cap = interp->executor_capacity ? interp->executor_capacity * 2 : 64;
1658+
_PyBloomFilter *new_blooms = PyMem_Realloc(
1659+
interp->executor_blooms, new_cap * sizeof(_PyBloomFilter));
1660+
if (new_blooms == NULL) {
1661+
return -1;
1662+
}
1663+
_PyExecutorObject **new_ptrs = PyMem_Realloc(
1664+
interp->executor_ptrs, new_cap * sizeof(_PyExecutorObject *));
1665+
if (new_ptrs == NULL) {
1666+
/* The blooms realloc already succeeded, so the old block may have been
1667+
* freed: keep new_blooms. executor_capacity is left unchanged, which is
* safe because the new block is at least as large as the old one. */
1668+
interp->executor_blooms = new_blooms;
1669+
return -1;
1670+
}
1671+
interp->executor_blooms = new_blooms;
1672+
interp->executor_ptrs = new_ptrs;
1673+
interp->executor_capacity = new_cap;
1674+
}
1675+
size_t idx = interp->executor_count++;
1676+
interp->executor_blooms[idx] = *bloom;
1677+
interp->executor_ptrs[idx] = executor;
1678+
executor->vm_data.bloom_array_idx = (int32_t)idx;
1679+
return 0;
16691680
}
16701681

16711682
static void
16721683
unlink_executor(_PyExecutorObject *executor)
16731684
{
1674-
_PyExecutorLinkListNode *links = &executor->vm_data.links;
1675-
_PyExecutorObject *next = links->next;
1676-
_PyExecutorObject *prev = links->previous;
1677-
if (next != NULL) {
1678-
next->vm_data.links.previous = prev;
1679-
}
1680-
if (prev != NULL) {
1681-
prev->vm_data.links.next = next;
1682-
}
1683-
else {
1684-
// prev == NULL implies that executor is the list head
1685-
PyInterpreterState *interp = PyInterpreterState_Get();
1686-
assert(interp->executor_list_head == executor);
1687-
interp->executor_list_head = next;
1685+
PyInterpreterState *interp = PyInterpreterState_Get();
1686+
int32_t idx = executor->vm_data.bloom_array_idx;
1687+
assert(idx >= 0 && (size_t)idx < interp->executor_count);
1688+
size_t last = --interp->executor_count;
1689+
if ((size_t)idx != last) {
1690+
/* Swap-remove: move the last element into the vacated slot */
1691+
interp->executor_blooms[idx] = interp->executor_blooms[last];
1692+
interp->executor_ptrs[idx] = interp->executor_ptrs[last];
1693+
interp->executor_ptrs[idx]->vm_data.bloom_array_idx = idx;
16881694
}
1695+
executor->vm_data.bloom_array_idx = -1;
16891696
}
16901697

16911698
/* This must be called by optimizers before using the executor */
1692-
void
1699+
int
16931700
_Py_ExecutorInit(_PyExecutorObject *executor, const _PyBloomFilter *dependency_set)
16941701
{
16951702
executor->vm_data.valid = true;
16961703
executor->vm_data.pending_deletion = 0;
16971704
executor->vm_data.code = NULL;
1698-
for (int i = 0; i < _Py_BLOOM_FILTER_WORDS; i++) {
1699-
executor->vm_data.bloom.bits[i] = dependency_set->bits[i];
1705+
if (link_executor(executor, dependency_set) < 0) {
1706+
return -1;
17001707
}
1701-
link_executor(executor);
1708+
return 0;
17021709
}
17031710

17041711
static _PyExecutorObject *
@@ -1809,35 +1816,36 @@ void
18091816
_Py_Executor_DependsOn(_PyExecutorObject *executor, void *obj)
18101817
{
18111818
assert(executor->vm_data.valid);
1812-
_Py_BloomFilter_Add(&executor->vm_data.bloom, obj);
1819+
PyInterpreterState *interp = _PyInterpreterState_GET();
1820+
int32_t idx = executor->vm_data.bloom_array_idx;
1821+
assert(idx >= 0 && (size_t)idx < interp->executor_count);
1822+
_Py_BloomFilter_Add(&interp->executor_blooms[idx], obj);
18131823
}
18141824

18151825
/* Invalidate all executors that depend on `obj`
1816-
* May cause other executors to be invalidated as well
1826+
* May cause other executors to be invalidated as well.
1827+
* Uses contiguous bloom filter array for cache-friendly scanning.
18171828
*/
18181829
void
18191830
_Py_Executors_InvalidateDependency(PyInterpreterState *interp, void *obj, int is_invalidation)
18201831
{
18211832
_PyBloomFilter obj_filter;
18221833
_Py_BloomFilter_Init(&obj_filter);
18231834
_Py_BloomFilter_Add(&obj_filter, obj);
1824-
/* Walk the list of executors */
1825-
/* TO DO -- Use a tree to avoid traversing as many objects */
1835+
/* Scan contiguous bloom filter array */
18261836
PyObject *invalidate = PyList_New(0);
18271837
if (invalidate == NULL) {
18281838
goto error;
18291839
}
18301840
/* Clearing an executor can clear others, so we need to make a list of
18311841
* executors to invalidate first */
1832-
for (_PyExecutorObject *exec = interp->executor_list_head; exec != NULL;) {
1833-
assert(exec->vm_data.valid);
1834-
_PyExecutorObject *next = exec->vm_data.links.next;
1835-
if (bloom_filter_may_contain(&exec->vm_data.bloom, &obj_filter) &&
1836-
PyList_Append(invalidate, (PyObject *)exec))
1842+
for (size_t i = 0; i < interp->executor_count; i++) {
1843+
assert(interp->executor_ptrs[i]->vm_data.valid);
1844+
if (bloom_filter_may_contain(&interp->executor_blooms[i], &obj_filter) &&
1845+
PyList_Append(invalidate, (PyObject *)interp->executor_ptrs[i]))
18371846
{
18381847
goto error;
18391848
}
1840-
exec = next;
18411849
}
18421850
for (Py_ssize_t i = 0; i < PyList_GET_SIZE(invalidate); i++) {
18431851
PyObject *exec = PyList_GET_ITEM(invalidate, i);
@@ -1859,8 +1867,9 @@ _Py_Executors_InvalidateDependency(PyInterpreterState *interp, void *obj, int is
18591867
void
18601868
_Py_Executors_InvalidateAll(PyInterpreterState *interp, int is_invalidation)
18611869
{
1862-
while (interp->executor_list_head) {
1863-
_PyExecutorObject *executor = interp->executor_list_head;
1870+
while (interp->executor_count > 0) {
1871+
/* Take the last entry: removing the final element needs no swap in unlink_executor() */
1872+
_PyExecutorObject *executor = interp->executor_ptrs[interp->executor_count - 1];
18641873
assert(executor->vm_data.valid);
18651874
if (executor->vm_data.code) {
18661875
// Clear the entire code object so its co_executors array can be freed:
@@ -1878,27 +1887,24 @@ _Py_Executors_InvalidateAll(PyInterpreterState *interp, int is_invalidation)
18781887
void
18791888
_Py_Executors_InvalidateCold(PyInterpreterState *interp)
18801889
{
1881-
/* Walk the list of executors */
1882-
/* TO DO -- Use a tree to avoid traversing as many objects */
1890+
/* Scan contiguous executor array */
18831891
PyObject *invalidate = PyList_New(0);
18841892
if (invalidate == NULL) {
18851893
goto error;
18861894
}
18871895

18881896
/* Clearing an executor can deallocate others, so we need to make a list of
18891897
* executors to invalidate first */
1890-
for (_PyExecutorObject *exec = interp->executor_list_head; exec != NULL;) {
1898+
for (size_t i = 0; i < interp->executor_count; i++) {
1899+
_PyExecutorObject *exec = interp->executor_ptrs[i];
18911900
assert(exec->vm_data.valid);
1892-
_PyExecutorObject *next = exec->vm_data.links.next;
18931901

18941902
if (exec->vm_data.cold && PyList_Append(invalidate, (PyObject *)exec) < 0) {
18951903
goto error;
18961904
}
18971905
else {
18981906
exec->vm_data.cold = true;
18991907
}
1900-
1901-
exec = next;
19021908
}
19031909
for (Py_ssize_t i = 0; i < PyList_GET_SIZE(invalidate); i++) {
19041910
PyObject *exec = PyList_GET_ITEM(invalidate, i);
@@ -2142,9 +2148,8 @@ _PyDumpExecutors(FILE *out)
21422148
fprintf(out, " rankdir = \"LR\"\n\n");
21432149
fprintf(out, " node [colorscheme=greys9]\n");
21442150
PyInterpreterState *interp = PyInterpreterState_Get();
2145-
for (_PyExecutorObject *exec = interp->executor_list_head; exec != NULL;) {
2146-
executor_to_gv(exec, out);
2147-
exec = exec->vm_data.links.next;
2151+
for (size_t i = 0; i < interp->executor_count; i++) {
2152+
executor_to_gv(interp->executor_ptrs[i], out);
21482153
}
21492154
fprintf(out, "}\n\n");
21502155
return 0;

Python/pylifecycle.c

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1761,6 +1761,12 @@ finalize_modules(PyThreadState *tstate)
17611761
interp->compiling = false;
17621762
#ifdef _Py_TIER2
17631763
_Py_Executors_InvalidateAll(interp, 0);
1764+
PyMem_Free(interp->executor_blooms);
1765+
PyMem_Free(interp->executor_ptrs);
1766+
interp->executor_blooms = NULL;
1767+
interp->executor_ptrs = NULL;
1768+
interp->executor_count = 0;
1769+
interp->executor_capacity = 0;
17641770
#endif
17651771

17661772
// Stop watching __builtin__ modifications

Python/pystate.c

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -597,7 +597,10 @@ init_interpreter(PyInterpreterState *interp,
597597
interp->_code_object_generation = 0;
598598
interp->jit = false;
599599
interp->compiling = false;
600-
interp->executor_list_head = NULL;
600+
interp->executor_blooms = NULL;
601+
interp->executor_ptrs = NULL;
602+
interp->executor_count = 0;
603+
interp->executor_capacity = 0;
601604
interp->executor_deletion_list_head = NULL;
602605
interp->executor_creation_counter = JIT_CLEANUP_THRESHOLD;
603606

0 commit comments

Comments
 (0)