Skip to content

Commit bb25f72

Browse files
authored
gh-132657: Add maybe_enable_deferred_ref_count() (gh-142843)
If we are specializing to `LOAD_GLOBAL_MODULE` or `LOAD_ATTR_MODULE`, try to enable deferred reference counting on the loaded value when that object is owned by a different thread. This applies to the free-threaded build only and should improve the scaling of multi-threaded programs.
1 parent 949b5ec commit bb25f72

File tree

5 files changed

+53
-5
lines changed

5 files changed

+53
-5
lines changed

Include/internal/pycore_dict.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -114,6 +114,7 @@ extern Py_ssize_t _Py_dict_lookup_threadsafe_stackref(PyDictObject *mp, PyObject
114114

115115
extern int _PyDict_GetMethodStackRef(PyDictObject *dict, PyObject *name, _PyStackRef *method);
116116

117+
extern Py_ssize_t _PyDict_LookupIndexAndValue(PyDictObject *, PyObject *, PyObject **);
117118
extern Py_ssize_t _PyDict_LookupIndex(PyDictObject *, PyObject *);
118119
extern Py_ssize_t _PyDictKeys_StringLookup(PyDictKeysObject* dictkeys, PyObject *key);
119120

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,6 @@
1+
If we are specializing to ``LOAD_GLOBAL_MODULE`` or ``LOAD_ATTR_MODULE``, try
2+
to enable deferred reference counting for the value, if the object is owned by
3+
a different thread. This applies to the free-threaded build only and should
4+
improve scaling of multi-threaded programs. Note that when deferred reference
5+
counting is enabled, the object will be deallocated by the GC, rather than by
6+
:c:func:`Py_DECREF`.

Objects/dictobject.c

Lines changed: 9 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -2349,10 +2349,9 @@ dict_unhashable_type(PyObject *key)
23492349
}
23502350

23512351
Py_ssize_t
2352-
_PyDict_LookupIndex(PyDictObject *mp, PyObject *key)
2352+
_PyDict_LookupIndexAndValue(PyDictObject *mp, PyObject *key, PyObject **value)
23532353
{
23542354
// TODO: Thread safety
2355-
PyObject *value;
23562355
assert(PyDict_CheckExact((PyObject*)mp));
23572356
assert(PyUnicode_CheckExact(key));
23582357

@@ -2362,7 +2361,14 @@ _PyDict_LookupIndex(PyDictObject *mp, PyObject *key)
23622361
return -1;
23632362
}
23642363

2365-
return _Py_dict_lookup(mp, key, hash, &value);
2364+
return _Py_dict_lookup(mp, key, hash, value);
2365+
}
2366+
2367+
Py_ssize_t
2368+
_PyDict_LookupIndex(PyDictObject *mp, PyObject *key)
2369+
{
2370+
PyObject *value; // discarded
2371+
return _PyDict_LookupIndexAndValue(mp, key, &value);
23662372
}
23672373

23682374
/* Same as PyDict_GetItemWithError() but with hash supplied by caller.

Python/specialize.c

Lines changed: 28 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -358,6 +358,21 @@ static int function_kind(PyCodeObject *code);
358358
static bool function_check_args(PyObject *o, int expected_argcount, int opcode);
359359
static uint32_t function_get_version(PyObject *o, int opcode);
360360

361+
#ifdef Py_GIL_DISABLED
362+
static void
363+
maybe_enable_deferred_ref_count(PyObject *op)
364+
{
365+
if (!_Py_IsOwnedByCurrentThread(op)) {
366+
// For module-level variables that are heavily used from multiple
367+
// threads, deferred reference counting provides good scaling
368+
// benefits. The downside is that the object will only be deallocated
369+
// by a GC run.
370+
PyUnstable_Object_EnableDeferredRefcount(op);
371+
}
372+
}
373+
#endif
374+
375+
361376
static int
362377
specialize_module_load_attr_lock_held(PyDictObject *dict, _Py_CODEUNIT *instr, PyObject *name)
363378
{
@@ -366,7 +381,8 @@ specialize_module_load_attr_lock_held(PyDictObject *dict, _Py_CODEUNIT *instr, P
366381
SPECIALIZATION_FAIL(LOAD_ATTR, SPEC_FAIL_ATTR_NON_STRING);
367382
return -1;
368383
}
369-
Py_ssize_t index = _PyDict_LookupIndex(dict, name);
384+
PyObject *value;
385+
Py_ssize_t index = _PyDict_LookupIndexAndValue(dict, name, &value);
370386
assert(index != DKIX_ERROR);
371387
if (index != (uint16_t)index) {
372388
SPECIALIZATION_FAIL(LOAD_ATTR,
@@ -381,6 +397,9 @@ specialize_module_load_attr_lock_held(PyDictObject *dict, _Py_CODEUNIT *instr, P
381397
SPECIALIZATION_FAIL(LOAD_ATTR, SPEC_FAIL_OUT_OF_VERSIONS);
382398
return -1;
383399
}
400+
#ifdef Py_GIL_DISABLED
401+
maybe_enable_deferred_ref_count(value);
402+
#endif
384403
write_u32(cache->version, keys_version);
385404
cache->index = (uint16_t)index;
386405
specialize(instr, LOAD_ATTR_MODULE);
@@ -1269,7 +1288,6 @@ specialize_attr_loadclassattr(PyObject *owner, _Py_CODEUNIT *instr,
12691288
return 1;
12701289
}
12711290

1272-
12731291
static void
12741292
specialize_load_global_lock_held(
12751293
PyObject *globals, PyObject *builtins,
@@ -1289,7 +1307,12 @@ specialize_load_global_lock_held(
12891307
SPECIALIZATION_FAIL(LOAD_GLOBAL, SPEC_FAIL_LOAD_GLOBAL_NON_STRING_OR_SPLIT);
12901308
goto fail;
12911309
}
1310+
#ifdef Py_GIL_DISABLED
1311+
PyObject *value;
1312+
Py_ssize_t index = _PyDict_LookupIndexAndValue((PyDictObject *)globals, name, &value);
1313+
#else
12921314
Py_ssize_t index = _PyDictKeys_StringLookup(globals_keys, name);
1315+
#endif
12931316
if (index == DKIX_ERROR) {
12941317
SPECIALIZATION_FAIL(LOAD_GLOBAL, SPEC_FAIL_EXPECTED_ERROR);
12951318
goto fail;
@@ -1310,6 +1333,9 @@ specialize_load_global_lock_held(
13101333
SPECIALIZATION_FAIL(LOAD_GLOBAL, SPEC_FAIL_OUT_OF_RANGE);
13111334
goto fail;
13121335
}
1336+
#ifdef Py_GIL_DISABLED
1337+
maybe_enable_deferred_ref_count(value);
1338+
#endif
13131339
cache->index = (uint16_t)index;
13141340
cache->module_keys_version = (uint16_t)keys_version;
13151341
specialize(instr, LOAD_GLOBAL_MODULE);

Tools/ftscalingbench/ftscalingbench.py

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,7 @@
2121
# > echo "0" | sudo tee /sys/devices/system/cpu/cpufreq/boost
2222
#
2323

24+
import copy
2425
import math
2526
import os
2627
import queue
@@ -214,6 +215,14 @@ def instantiate_dataclass():
214215
for _ in range(1000 * WORK_SCALE):
215216
obj = MyDataClass(x=1, y=2, z=3)
216217

218+
219+
@register_benchmark
220+
def deepcopy():
221+
x = {'list': [1, 2], 'tuple': (1, None)}
222+
for i in range(40 * WORK_SCALE):
223+
copy.deepcopy(x)
224+
225+
217226
def bench_one_thread(func):
218227
t0 = time.perf_counter_ns()
219228
func()

0 commit comments

Comments
 (0)