From 729654cf6f1b472f47acc98c3ab4890afbf3f08e Mon Sep 17 00:00:00 2001 From: Xuehai Pan Date: Sat, 13 Dec 2025 20:06:35 +0800 Subject: [PATCH 01/25] Add new argument to `gil_safe_call_once_and_store::call_once_and_store_result` --- include/pybind11/gil_safe_call_once.h | 29 +++++++++++++++++---------- 1 file changed, 18 insertions(+), 11 deletions(-) diff --git a/include/pybind11/gil_safe_call_once.h b/include/pybind11/gil_safe_call_once.h index 2abd8fc326..06fb9ef75e 100644 --- a/include/pybind11/gil_safe_call_once.h +++ b/include/pybind11/gil_safe_call_once.h @@ -8,8 +8,12 @@ #include #include -#ifdef Py_GIL_DISABLED +#if defined(Py_GIL_DISABLED) || defined(PYBIND11_HAS_SUBINTERPRETER_SUPPORT) # include + +using atomic_bool = std::atomic_bool; +#else +using atomic_bool = bool; #endif PYBIND11_NAMESPACE_BEGIN(PYBIND11_NAMESPACE) @@ -53,7 +57,8 @@ class gil_safe_call_once_and_store { public: // PRECONDITION: The GIL must be held when `call_once_and_store_result()` is called. template - gil_safe_call_once_and_store &call_once_and_store_result(Callable &&fn) { + gil_safe_call_once_and_store &call_once_and_store_result(Callable &&fn, + void (*finalize_fn)(T &) = nullptr) { if (!is_initialized_) { // This read is guarded by the GIL. // Multiple threads may enter here, because the GIL is released in the next line and // CPython API calls in the `fn()` call below may release and reacquire the GIL. @@ -61,8 +66,9 @@ class gil_safe_call_once_and_store { std::call_once(once_flag_, [&] { // Only one thread will ever enter here. gil_scoped_acquire gil_acq; - ::new (storage_) T(fn()); // fn may release, but will reacquire, the GIL. - is_initialized_ = true; // This write is guarded by the GIL. + ::new (storage_) T(fn()); // fn may release, but will reacquire, the GIL. + finalize_fn_ = finalize_fn; // Store the finalizer. + is_initialized_ = true; // This write is guarded by the GIL. }); // All threads will observe `is_initialized_` as true here. 
} @@ -83,20 +89,21 @@ class gil_safe_call_once_and_store { } constexpr gil_safe_call_once_and_store() = default; - PYBIND11_DTOR_CONSTEXPR ~gil_safe_call_once_and_store() = default; + PYBIND11_DTOR_CONSTEXPR ~gil_safe_call_once_and_store() { + if (is_initialized_ && finalize_fn_ != nullptr) { + finalize_fn_(*reinterpret_cast(storage_)); + } + } private: alignas(T) char storage_[sizeof(T)] = {}; std::once_flag once_flag_; -#ifdef Py_GIL_DISABLED - std::atomic_bool -#else - bool -#endif - is_initialized_{false}; + void (*finalize_fn_)(T &) = nullptr; + // The `is_initialized_`-`storage_` pair is very similar to `std::optional`, // but the latter does not have the triviality properties of former, // therefore `std::optional` is not a viable alternative here. + atomic_bool is_initialized_{false}; }; PYBIND11_NAMESPACE_END(PYBIND11_NAMESPACE) From d2b76050a11ed5284903c69f9d9e01054d5754f6 Mon Sep 17 00:00:00 2001 From: Xuehai Pan Date: Sun, 14 Dec 2025 01:03:04 +0800 Subject: [PATCH 02/25] Add per-interpreter storage for `gil_safe_call_once_and_store` --- include/pybind11/detail/internals.h | 39 +++++++++++- include/pybind11/gil_safe_call_once.h | 91 ++++++++++++++++++++++++++- 2 files changed, 126 insertions(+), 4 deletions(-) diff --git a/include/pybind11/detail/internals.h b/include/pybind11/detail/internals.h index 858de67525..d5c4da1acf 100644 --- a/include/pybind11/detail/internals.h +++ b/include/pybind11/detail/internals.h @@ -39,7 +39,7 @@ /// further ABI-incompatible changes may be made before the ABI is officially /// changed to the new version. 
#ifndef PYBIND11_INTERNALS_VERSION -# define PYBIND11_INTERNALS_VERSION 11 +# define PYBIND11_INTERNALS_VERSION 12 #endif #if PYBIND11_INTERNALS_VERSION < 11 @@ -234,6 +234,34 @@ inline uint64_t round_up_to_next_pow2(uint64_t x) { class loader_life_support; +struct call_once_storage_base { + call_once_storage_base() = default; + virtual ~call_once_storage_base() = default; + call_once_storage_base(const call_once_storage_base &) = delete; + call_once_storage_base(call_once_storage_base &&) = delete; + call_once_storage_base &operator=(const call_once_storage_base &) = delete; + call_once_storage_base &operator=(call_once_storage_base &&) = delete; +}; + +template +struct call_once_storage : call_once_storage_base { + void (*finalize)(T &) = nullptr; + alignas(T) char storage[sizeof(T)] = {0}; + + call_once_storage() = default; + ~call_once_storage() override { + if (finalize != nullptr) { + finalize(*reinterpret_cast(storage)); + } + memset(storage, 0, sizeof(T)); + finalize = nullptr; + }; + call_once_storage(const call_once_storage &) = delete; + call_once_storage(call_once_storage &&) = delete; + call_once_storage &operator=(const call_once_storage &) = delete; + call_once_storage &operator=(call_once_storage &&) = delete; +}; + /// Internal data structure used to track registered instances and types. /// Whenever binary incompatible changes are made to this structure, /// `PYBIND11_INTERNALS_VERSION` must be incremented. 
@@ -283,6 +311,8 @@ struct internals { type_map native_enum_type_map; + std::unordered_map call_once_storage_map; + internals() : static_property_type(make_static_property_type()), default_metaclass(make_default_metaclass()) { @@ -308,7 +338,12 @@ struct internals { internals(internals &&other) = delete; internals &operator=(const internals &other) = delete; internals &operator=(internals &&other) = delete; - ~internals() = default; + ~internals() { + for (auto &[_, storage_ptr] : call_once_storage_map) { + delete storage_ptr; + } + call_once_storage_map.clear(); + } }; // the internals struct (above) is shared between all the modules. local_internals are only diff --git a/include/pybind11/gil_safe_call_once.h b/include/pybind11/gil_safe_call_once.h index 06fb9ef75e..a848404eaf 100644 --- a/include/pybind11/gil_safe_call_once.h +++ b/include/pybind11/gil_safe_call_once.h @@ -3,6 +3,7 @@ #pragma once #include "detail/common.h" +#include "detail/internals.h" #include "gil.h" #include @@ -52,6 +53,7 @@ PYBIND11_NAMESPACE_BEGIN(PYBIND11_NAMESPACE) // functions, which is usually the case. // // For in-depth background, see docs/advanced/deadlock.md +#ifndef PYBIND11_HAS_SUBINTERPRETER_SUPPORT template class gil_safe_call_once_and_store { public: @@ -59,6 +61,7 @@ class gil_safe_call_once_and_store { template gil_safe_call_once_and_store &call_once_and_store_result(Callable &&fn, void (*finalize_fn)(T &) = nullptr) { + if (!is_initialized_) { // This read is guarded by the GIL. // Multiple threads may enter here, because the GIL is released in the next line and // CPython API calls in the `fn()` call below may release and reacquire the GIL. 
@@ -80,10 +83,10 @@ class gil_safe_call_once_and_store { T &get_stored() { assert(is_initialized_); PYBIND11_WARNING_PUSH -#if !defined(__clang__) && defined(__GNUC__) && __GNUC__ < 5 +# if !defined(__clang__) && defined(__GNUC__) && __GNUC__ < 5 // Needed for gcc 4.8.5 PYBIND11_WARNING_DISABLE_GCC("-Wstrict-aliasing") -#endif +# endif return *reinterpret_cast(storage_); PYBIND11_WARNING_POP } @@ -96,6 +99,7 @@ class gil_safe_call_once_and_store { } private: + // Global static storage (per process) when subinterpreter support is disabled. alignas(T) char storage_[sizeof(T)] = {}; std::once_flag once_flag_; void (*finalize_fn_)(T &) = nullptr; @@ -105,5 +109,88 @@ class gil_safe_call_once_and_store { // therefore `std::optional` is not a viable alternative here. atomic_bool is_initialized_{false}; }; +#else +// Subinterpreter support is enabled. +// In this case, we should store the result per-interpreter instead of globally, because +// each subinterpreter has its own separate state. The cached object may not shareable +// across interpreters (e.g., imported modules and their members). +template +class gil_safe_call_once_and_store { +public: + // PRECONDITION: The GIL must be held when `call_once_and_store_result()` is called. + template + gil_safe_call_once_and_store &call_once_and_store_result(Callable &&fn, + void (*finalize_fn)(T &) = nullptr) { + if (!is_initialized_by_atleast_one_interpreter_ + || detail::get_num_interpreters_seen() > 1) { + detail::with_internals([&](detail::internals &internals) { + const void *key = reinterpret_cast(this); + auto &storage_map = internals.call_once_storage_map; + auto it = storage_map.find(key); + if (it == storage_map.end()) { + gil_scoped_release gil_rel; // Needed to establish lock ordering. + { + // Only one thread will ever enter here. + gil_scoped_acquire gil_acq; + auto s = new detail::call_once_storage{}; + ::new (s->storage) T(fn()); // fn may release, but will reacquire, the GIL. 
+ s->finalize = finalize_fn; + last_storage_ = reinterpret_cast(s->storage); + storage_map.emplace(key, s); + }; + } + is_initialized_by_atleast_one_interpreter_ = true; + }); + // All threads will observe `is_initialized_by_atleast_one_interp_` as true here. + } + // Intentionally not returning `T &` to ensure the calling code is self-documenting. + return *this; + } + // This must only be called after `call_once_and_store_result()` was called. + T &get_stored() { + T *result = last_storage_; + if (!is_initialized_by_atleast_one_interpreter_ + || detail::get_num_interpreters_seen() > 1) { + detail::with_internals([&](detail::internals &internals) { + const void *key = reinterpret_cast(this); + auto &storage_map = internals.call_once_storage_map; + auto it = storage_map.find(key); + assert(it != storage_map.end()); + auto *s = static_cast *>(it->second); + result = last_storage_ = reinterpret_cast(s->storage); + }); + } + assert(result != nullptr); + return *result; + } + + constexpr gil_safe_call_once_and_store() = default; + PYBIND11_DTOR_CONSTEXPR ~gil_safe_call_once_and_store() { + if (is_initialized_by_atleast_one_interpreter_) { + detail::with_internals([&](detail::internals &internals) { + const void *key = reinterpret_cast(this); + auto &storage_map = internals.call_once_storage_map; + auto it = storage_map.find(key); + if (it != storage_map.end()) { + delete it->second; + storage_map.erase(it); + } + }); + } + } + +private: + // No storage needed when subinterpreter support is enabled. + // The actual storage is stored in the per-interpreter state dict in + // `internals.call_once_storage_map`. + + // Fast local cache to avoid repeated lookups when there are no multiple interpreters. + // This is only valid if there is a single interpreter. Otherwise, it is not used. + T *last_storage_ = nullptr; + // This flag is true if the value has been initialized by any interpreter (may not be the + // current one). 
+ atomic_bool is_initialized_by_atleast_one_interpreter_{false}; +}; +#endif PYBIND11_NAMESPACE_END(PYBIND11_NAMESPACE) From e7417606e8979f948be47d0512bfaf9a21d2953f Mon Sep 17 00:00:00 2001 From: Xuehai Pan Date: Sun, 14 Dec 2025 10:56:19 +0800 Subject: [PATCH 03/25] Make `~gil_safe_call_once_and_store` a no-op --- include/pybind11/gil_safe_call_once.h | 57 ++++++++++----------------- 1 file changed, 21 insertions(+), 36 deletions(-) diff --git a/include/pybind11/gil_safe_call_once.h b/include/pybind11/gil_safe_call_once.h index a848404eaf..10ba995dcc 100644 --- a/include/pybind11/gil_safe_call_once.h +++ b/include/pybind11/gil_safe_call_once.h @@ -60,8 +60,7 @@ class gil_safe_call_once_and_store { // PRECONDITION: The GIL must be held when `call_once_and_store_result()` is called. template gil_safe_call_once_and_store &call_once_and_store_result(Callable &&fn, - void (*finalize_fn)(T &) = nullptr) { - + void (*)(T &) /*unused*/ = nullptr) { if (!is_initialized_) { // This read is guarded by the GIL. // Multiple threads may enter here, because the GIL is released in the next line and // CPython API calls in the `fn()` call below may release and reacquire the GIL. @@ -69,9 +68,8 @@ class gil_safe_call_once_and_store { std::call_once(once_flag_, [&] { // Only one thread will ever enter here. gil_scoped_acquire gil_acq; - ::new (storage_) T(fn()); // fn may release, but will reacquire, the GIL. - finalize_fn_ = finalize_fn; // Store the finalizer. - is_initialized_ = true; // This write is guarded by the GIL. + ::new (storage_) T(fn()); // fn may release, but will reacquire, the GIL. + is_initialized_ = true; // This write is guarded by the GIL. }); // All threads will observe `is_initialized_` as true here. 
} @@ -92,17 +90,15 @@ class gil_safe_call_once_and_store { } constexpr gil_safe_call_once_and_store() = default; - PYBIND11_DTOR_CONSTEXPR ~gil_safe_call_once_and_store() { - if (is_initialized_ && finalize_fn_ != nullptr) { - finalize_fn_(*reinterpret_cast(storage_)); - } - } + // The instance is a global static, so its destructor runs when the process + // is terminating. Therefore, do nothing here because the Python interpreter + // may have been finalized already. + PYBIND11_DTOR_CONSTEXPR ~gil_safe_call_once_and_store() = default; private: // Global static storage (per process) when subinterpreter support is disabled. alignas(T) char storage_[sizeof(T)] = {}; std::once_flag once_flag_; - void (*finalize_fn_)(T &) = nullptr; // The `is_initialized_`-`storage_` pair is very similar to `std::optional`, // but the latter does not have the triviality properties of former, @@ -124,19 +120,19 @@ class gil_safe_call_once_and_store { if (!is_initialized_by_atleast_one_interpreter_ || detail::get_num_interpreters_seen() > 1) { detail::with_internals([&](detail::internals &internals) { - const void *key = reinterpret_cast(this); + const void *k = reinterpret_cast(this); auto &storage_map = internals.call_once_storage_map; - auto it = storage_map.find(key); + auto it = storage_map.find(k); if (it == storage_map.end()) { gil_scoped_release gil_rel; // Needed to establish lock ordering. { // Only one thread will ever enter here. gil_scoped_acquire gil_acq; - auto s = new detail::call_once_storage{}; - ::new (s->storage) T(fn()); // fn may release, but will reacquire, the GIL. - s->finalize = finalize_fn; - last_storage_ = reinterpret_cast(s->storage); - storage_map.emplace(key, s); + auto v = new detail::call_once_storage{}; + ::new (v->storage) T(fn()); // fn may release, but will reacquire, the GIL. 
+ v->finalize = finalize_fn; + last_storage_ = reinterpret_cast(v->storage); + storage_map.emplace(k, v); }; } is_initialized_by_atleast_one_interpreter_ = true; @@ -153,12 +149,10 @@ class gil_safe_call_once_and_store { if (!is_initialized_by_atleast_one_interpreter_ || detail::get_num_interpreters_seen() > 1) { detail::with_internals([&](detail::internals &internals) { - const void *key = reinterpret_cast(this); + const void *k = reinterpret_cast(this); auto &storage_map = internals.call_once_storage_map; - auto it = storage_map.find(key); - assert(it != storage_map.end()); - auto *s = static_cast *>(it->second); - result = last_storage_ = reinterpret_cast(s->storage); + auto *v = static_cast *>(storage_map.at(k)); + result = last_storage_ = reinterpret_cast(v->storage); }); } assert(result != nullptr); @@ -166,19 +160,10 @@ class gil_safe_call_once_and_store { } constexpr gil_safe_call_once_and_store() = default; - PYBIND11_DTOR_CONSTEXPR ~gil_safe_call_once_and_store() { - if (is_initialized_by_atleast_one_interpreter_) { - detail::with_internals([&](detail::internals &internals) { - const void *key = reinterpret_cast(this); - auto &storage_map = internals.call_once_storage_map; - auto it = storage_map.find(key); - if (it != storage_map.end()) { - delete it->second; - storage_map.erase(it); - } - }); - } - } + // The instance is a global static, so its destructor runs when the process + // is terminating. Therefore, do nothing here because the Python interpreter + // may have been finalized already. + PYBIND11_DTOR_CONSTEXPR ~gil_safe_call_once_and_store() = default; private: // No storage needed when subinterpreter support is enabled. 
From 5d1d6782b9fa7cc6f705b6adad1828ac2c66ec5a Mon Sep 17 00:00:00 2001 From: Xuehai Pan Date: Sun, 14 Dec 2025 12:15:30 +0800 Subject: [PATCH 04/25] Fix C++11 compatibility --- include/pybind11/detail/internals.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/include/pybind11/detail/internals.h b/include/pybind11/detail/internals.h index d5c4da1acf..046e47314f 100644 --- a/include/pybind11/detail/internals.h +++ b/include/pybind11/detail/internals.h @@ -339,8 +339,8 @@ struct internals { internals &operator=(const internals &other) = delete; internals &operator=(internals &&other) = delete; ~internals() { - for (auto &[_, storage_ptr] : call_once_storage_map) { - delete storage_ptr; + for (auto &entry : call_once_storage_map) { + delete entry.second; } call_once_storage_map.clear(); } From 0bac82df687e2bdd919c653b9ee0d1fecd155fa5 Mon Sep 17 00:00:00 2001 From: Xuehai Pan Date: Sun, 14 Dec 2025 13:06:15 +0800 Subject: [PATCH 05/25] Improve thread-safety and add default finalizer --- include/pybind11/detail/internals.h | 11 +++-- include/pybind11/gil_safe_call_once.h | 60 +++++++++++++++++---------- 2 files changed, 46 insertions(+), 25 deletions(-) diff --git a/include/pybind11/detail/internals.h b/include/pybind11/detail/internals.h index 046e47314f..dd0c2af957 100644 --- a/include/pybind11/detail/internals.h +++ b/include/pybind11/detail/internals.h @@ -247,14 +247,17 @@ template struct call_once_storage : call_once_storage_base { void (*finalize)(T &) = nullptr; alignas(T) char storage[sizeof(T)] = {0}; + std::atomic_bool is_initialized{false}; call_once_storage() = default; ~call_once_storage() override { - if (finalize != nullptr) { - finalize(*reinterpret_cast(storage)); + if (is_initialized) { + if (finalize != nullptr) { + finalize(*reinterpret_cast(storage)); + } else { + reinterpret_cast(storage)->~T(); + } } - memset(storage, 0, sizeof(T)); - finalize = nullptr; }; call_once_storage(const call_once_storage &) = delete; 
call_once_storage(call_once_storage &&) = delete; diff --git a/include/pybind11/gil_safe_call_once.h b/include/pybind11/gil_safe_call_once.h index 10ba995dcc..5904f97ba4 100644 --- a/include/pybind11/gil_safe_call_once.h +++ b/include/pybind11/gil_safe_call_once.h @@ -54,6 +54,11 @@ PYBIND11_NAMESPACE_BEGIN(PYBIND11_NAMESPACE) // // For in-depth background, see docs/advanced/deadlock.md #ifndef PYBIND11_HAS_SUBINTERPRETER_SUPPORT +// Subinterpreter support is disabled. +// In this case, we can store the result globally, because there is only a single interpreter. +// +// The life span of the stored result is the entire process lifetime. It is leaked on process +// termination to avoid destructor calls after the Python interpreter was finalized. template class gil_safe_call_once_and_store { public: @@ -107,9 +112,12 @@ class gil_safe_call_once_and_store { }; #else // Subinterpreter support is enabled. -// In this case, we should store the result per-interpreter instead of globally, because -// each subinterpreter has its own separate state. The cached object may not shareable -// across interpreters (e.g., imported modules and their members). +// In this case, we should store the result per-interpreter instead of globally, because each +// subinterpreter has its own separate state. The cached result may not shareable across +// interpreters (e.g., imported modules and their members). +// +// The life span of the stored result is the entire interpreter lifetime. An additional +// `finalize_fn` can be provided to clean up the stored result when the interpreter is destroyed. 
template class gil_safe_call_once_and_store { public: @@ -117,26 +125,32 @@ class gil_safe_call_once_and_store { template gil_safe_call_once_and_store &call_once_and_store_result(Callable &&fn, void (*finalize_fn)(T &) = nullptr) { - if (!is_initialized_by_atleast_one_interpreter_ - || detail::get_num_interpreters_seen() > 1) { - detail::with_internals([&](detail::internals &internals) { - const void *k = reinterpret_cast(this); - auto &storage_map = internals.call_once_storage_map; - auto it = storage_map.find(k); - if (it == storage_map.end()) { - gil_scoped_release gil_rel; // Needed to establish lock ordering. - { - // Only one thread will ever enter here. - gil_scoped_acquire gil_acq; + if (!is_last_storage_valid()) { + // Multiple threads may enter here, because the GIL is released in the next line and + // CPython API calls in the `fn()` call below may release and reacquire the GIL. + gil_scoped_release gil_rel; // Needed to establish lock ordering. + { + gil_scoped_acquire gil_acq; + detail::with_internals([&](detail::internals &internals) { + // The concurrency control is done inside `detail::with_internals`. + // At most one thread will enter here at a time. + const void *k = reinterpret_cast(this); + auto &storage_map = internals.call_once_storage_map; + // There can be multiple threads going through here, but only one each at a + // time. So only one thread will create the storage. Other threads will find it + // already created. + auto it = storage_map.find(k); + if (it == storage_map.end()) { auto v = new detail::call_once_storage{}; ::new (v->storage) T(fn()); // fn may release, but will reacquire, the GIL. v->finalize = finalize_fn; last_storage_ = reinterpret_cast(v->storage); + v->is_initialized = true; storage_map.emplace(k, v); - }; - } - is_initialized_by_atleast_one_interpreter_ = true; - }); + } + is_initialized_by_atleast_one_interpreter_ = true; + }); + } // All threads will observe `is_initialized_by_atleast_one_interp_` as true here. 
} // Intentionally not returning `T &` to ensure the calling code is self-documenting. @@ -146,8 +160,7 @@ class gil_safe_call_once_and_store { // This must only be called after `call_once_and_store_result()` was called. T &get_stored() { T *result = last_storage_; - if (!is_initialized_by_atleast_one_interpreter_ - || detail::get_num_interpreters_seen() > 1) { + if (!is_last_storage_valid()) { detail::with_internals([&](detail::internals &internals) { const void *k = reinterpret_cast(this); auto &storage_map = internals.call_once_storage_map; @@ -159,13 +172,18 @@ class gil_safe_call_once_and_store { return *result; } - constexpr gil_safe_call_once_and_store() = default; + gil_safe_call_once_and_store() = default; // The instance is a global static, so its destructor runs when the process // is terminating. Therefore, do nothing here because the Python interpreter // may have been finalized already. PYBIND11_DTOR_CONSTEXPR ~gil_safe_call_once_and_store() = default; private: + bool is_last_storage_valid() const { + return is_initialized_by_atleast_one_interpreter_ + && detail::get_num_interpreters_seen() <= 1 && last_storage_ != nullptr; + } + // No storage needed when subinterpreter support is enabled. // The actual storage is stored in the per-interpreter state dict in // `internals.call_once_storage_map`. From be971103aad809575d22db6bcc5aa56c8215b2c4 Mon Sep 17 00:00:00 2001 From: Xuehai Pan Date: Sun, 14 Dec 2025 16:24:06 +0800 Subject: [PATCH 06/25] Try fix thread-safety --- include/pybind11/gil_safe_call_once.h | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/include/pybind11/gil_safe_call_once.h b/include/pybind11/gil_safe_call_once.h index 5904f97ba4..2bedb6d665 100644 --- a/include/pybind11/gil_safe_call_once.h +++ b/include/pybind11/gil_safe_call_once.h @@ -130,7 +130,6 @@ class gil_safe_call_once_and_store { // CPython API calls in the `fn()` call below may release and reacquire the GIL. 
gil_scoped_release gil_rel; // Needed to establish lock ordering. { - gil_scoped_acquire gil_acq; detail::with_internals([&](detail::internals &internals) { // The concurrency control is done inside `detail::with_internals`. // At most one thread will enter here at a time. @@ -141,10 +140,11 @@ class gil_safe_call_once_and_store { // already created. auto it = storage_map.find(k); if (it == storage_map.end()) { + gil_scoped_acquire gil_acq; auto v = new detail::call_once_storage{}; ::new (v->storage) T(fn()); // fn may release, but will reacquire, the GIL. v->finalize = finalize_fn; - last_storage_ = reinterpret_cast(v->storage); + last_storage_ptr_ = reinterpret_cast(v->storage); v->is_initialized = true; storage_map.emplace(k, v); } @@ -159,13 +159,13 @@ class gil_safe_call_once_and_store { // This must only be called after `call_once_and_store_result()` was called. T &get_stored() { - T *result = last_storage_; + T *result = last_storage_ptr_; if (!is_last_storage_valid()) { detail::with_internals([&](detail::internals &internals) { const void *k = reinterpret_cast(this); auto &storage_map = internals.call_once_storage_map; auto *v = static_cast *>(storage_map.at(k)); - result = last_storage_ = reinterpret_cast(v->storage); + result = last_storage_ptr_ = reinterpret_cast(v->storage); }); } assert(result != nullptr); @@ -181,7 +181,7 @@ class gil_safe_call_once_and_store { private: bool is_last_storage_valid() const { return is_initialized_by_atleast_one_interpreter_ - && detail::get_num_interpreters_seen() <= 1 && last_storage_ != nullptr; + && detail::get_num_interpreters_seen() <= 1; } // No storage needed when subinterpreter support is enabled. @@ -190,7 +190,7 @@ class gil_safe_call_once_and_store { // Fast local cache to avoid repeated lookups when there are no multiple interpreters. // This is only valid if there is a single interpreter. Otherwise, it is not used. 
- T *last_storage_ = nullptr; + T *last_storage_ptr_ = nullptr; // This flag is true if the value has been initialized by any interpreter (may not be the // current one). atomic_bool is_initialized_by_atleast_one_interpreter_{false}; From 3e77ce953a740fe2182af686723901bde05cc2a5 Mon Sep 17 00:00:00 2001 From: Xuehai Pan Date: Mon, 15 Dec 2025 01:15:46 +0800 Subject: [PATCH 07/25] Try fix thread-safety --- include/pybind11/detail/internals.h | 27 +++++++++++++-------------- 1 file changed, 13 insertions(+), 14 deletions(-) diff --git a/include/pybind11/detail/internals.h b/include/pybind11/detail/internals.h index dd0c2af957..b5e9d6eb7c 100644 --- a/include/pybind11/detail/internals.h +++ b/include/pybind11/detail/internals.h @@ -602,27 +602,26 @@ class internals_pp_manager { /// acquire the GIL. Will never return nullptr. std::unique_ptr *get_pp() { #ifdef PYBIND11_HAS_SUBINTERPRETER_SUPPORT - if (get_num_interpreters_seen() > 1) { - // Whenever the interpreter changes on the current thread we need to invalidate the - // internals_pp so that it can be pulled from the interpreter's state dict. That is - // slow, so we use the current PyThreadState to check if it is necessary. - auto *tstate = get_thread_state_unchecked(); - if (!tstate || tstate->interp != last_istate_tls()) { - gil_scoped_acquire_simple gil; - if (!tstate) { - tstate = get_thread_state_unchecked(); - } - last_istate_tls() = tstate->interp; - internals_p_tls() = get_or_create_pp_in_state_dict(); + // Whenever the interpreter changes on the current thread we need to invalidate the + // internals_pp so that it can be pulled from the interpreter's state dict. That is + // slow, so we use the current PyThreadState to check if it is necessary. 
+ auto *tstate = get_thread_state_unchecked(); + if (!tstate || tstate->interp != last_istate_tls()) { + gil_scoped_acquire_simple gil; + if (!tstate) { + tstate = get_thread_state_unchecked(); } - return internals_p_tls(); + last_istate_tls() = tstate->interp; + internals_p_tls() = get_or_create_pp_in_state_dict(); } -#endif + return internals_p_tls(); +#else if (!internals_singleton_pp_) { gil_scoped_acquire_simple gil; internals_singleton_pp_ = get_or_create_pp_in_state_dict(); } return internals_singleton_pp_; +#endif } /// Drop all the references we're currently holding. From d5b8813a66f2b66dcc7419e87d16401356033159 Mon Sep 17 00:00:00 2001 From: Xuehai Pan Date: Mon, 15 Dec 2025 14:36:35 +0800 Subject: [PATCH 08/25] Add a warning comment --- include/pybind11/detail/internals.h | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/include/pybind11/detail/internals.h b/include/pybind11/detail/internals.h index b5e9d6eb7c..e22e94ffe5 100644 --- a/include/pybind11/detail/internals.h +++ b/include/pybind11/detail/internals.h @@ -602,6 +602,19 @@ class internals_pp_manager { /// acquire the GIL. Will never return nullptr. std::unique_ptr *get_pp() { #ifdef PYBIND11_HAS_SUBINTERPRETER_SUPPORT + // WARNING: We cannot use `get_num_interpreters_seen() > 1` here to create a fast path for + // the single-interpreter case. + // + // For multi-interpreter support, the subinterpreters can be initialized concurrently, and + // the first time this function may not be called in the main interpreter. + // For example, a clean main interpreter that does not import any pybind11 module and then + // spawns multiple subinterpreters using `InterpreterPoolExecutor` that each imports a + // pybind11 module concurrently. + // + // Multiple subinterpreters may observe `get_num_interpreters_seen() <= 1` at the same + // time, while `get_num_interpreters_seen() += 1` in `PYBIND11_MODULE(...)` is called + // later. 
+ // Whenever the interpreter changes on the current thread we need to invalidate the // internals_pp so that it can be pulled from the interpreter's state dict. That is // slow, so we use the current PyThreadState to check if it is necessary. From f6d0f88bd6a29858e92da1503362bfdab2a86c39 Mon Sep 17 00:00:00 2001 From: Xuehai Pan Date: Mon, 15 Dec 2025 18:16:16 +0800 Subject: [PATCH 09/25] Simplify `PYBIND11_INTERNALS_VERSION >= 12` --- include/pybind11/detail/class.h | 2 -- include/pybind11/detail/internals.h | 11 ++--------- include/pybind11/detail/type_caster_base.h | 10 ++-------- include/pybind11/gil_safe_call_once.h | 1 + include/pybind11/pybind11.h | 4 ---- 5 files changed, 5 insertions(+), 23 deletions(-) diff --git a/include/pybind11/detail/class.h b/include/pybind11/detail/class.h index 21e966cfea..1cd9af0bd1 100644 --- a/include/pybind11/detail/class.h +++ b/include/pybind11/detail/class.h @@ -226,14 +226,12 @@ extern "C" inline void pybind11_meta_dealloc(PyObject *obj) { local_internals.registered_types_cpp.erase(tinfo->cpptype); } else { internals.registered_types_cpp.erase(tindex); -#if PYBIND11_INTERNALS_VERSION >= 12 internals.registered_types_cpp_fast.erase(tinfo->cpptype); for (const std::type_info *alias : tinfo->alias_chain) { auto num_erased = internals.registered_types_cpp_fast.erase(alias); (void) num_erased; assert(num_erased > 0); } -#endif } internals.registered_types_py.erase(tinfo->type); diff --git a/include/pybind11/detail/internals.h b/include/pybind11/detail/internals.h index e22e94ffe5..b67b9ce6d4 100644 --- a/include/pybind11/detail/internals.h +++ b/include/pybind11/detail/internals.h @@ -42,8 +42,8 @@ # define PYBIND11_INTERNALS_VERSION 12 #endif -#if PYBIND11_INTERNALS_VERSION < 11 -# error "PYBIND11_INTERNALS_VERSION 11 is the minimum for all platforms for pybind11v3." +#if PYBIND11_INTERNALS_VERSION < 12 +# error "PYBIND11_INTERNALS_VERSION 12 is the minimum for all platforms for pybind11v3." 
#endif PYBIND11_NAMESPACE_BEGIN(PYBIND11_NAMESPACE) @@ -273,14 +273,12 @@ struct internals { pymutex mutex; pymutex exception_translator_mutex; #endif -#if PYBIND11_INTERNALS_VERSION >= 12 // non-normative but fast "hint" for registered_types_cpp. Meant // to be used as the first level of a two-level lookup: successful // lookups are correct, but unsuccessful lookups need to try // registered_types_cpp and then backfill this map if they find // anything. fast_type_map registered_types_cpp_fast; -#endif // std::type_index -> pybind11's type information type_map registered_types_cpp; @@ -306,9 +304,6 @@ struct internals { PyObject *instance_base = nullptr; // Unused if PYBIND11_SIMPLE_GIL_MANAGEMENT is defined: thread_specific_storage tstate; -#if PYBIND11_INTERNALS_VERSION <= 11 - thread_specific_storage loader_life_support_tls; // OBSOLETE (PR #5830) -#endif // Unused if PYBIND11_SIMPLE_GIL_MANAGEMENT is defined: PyInterpreterState *istate = nullptr; @@ -396,7 +391,6 @@ struct type_info { void *(*module_local_load)(PyObject *, const type_info *) = nullptr; holder_enum_t holder_enum_v = holder_enum_t::undefined; -#if PYBIND11_INTERNALS_VERSION >= 12 // When a type appears in multiple DSOs, // internals::registered_types_cpp_fast will have multiple distinct // keys (the std::type_info from each DSO) mapped to the same @@ -407,7 +401,6 @@ struct type_info { // nb_alias_chain` added in // https://github.com/wjakob/nanobind/commit/b515b1f7f2f4ecc0357818e6201c94a9f4cbfdc2 std::forward_list alias_chain; -#endif /* A simple type never occurs as a (direct or indirect) parent * of a class that makes use of multiple inheritance. diff --git a/include/pybind11/detail/type_caster_base.h b/include/pybind11/detail/type_caster_base.h index b0c59e1138..21b7f0950e 100644 --- a/include/pybind11/detail/type_caster_base.h +++ b/include/pybind11/detail/type_caster_base.h @@ -227,32 +227,26 @@ inline detail::type_info *get_global_type_info_lock_held(const std::type_info &t // next time. 
detail::type_info *type_info = nullptr; auto &internals = get_internals(); -#if PYBIND11_INTERNALS_VERSION >= 12 auto &fast_types = internals.registered_types_cpp_fast; -#endif auto &types = internals.registered_types_cpp; -#if PYBIND11_INTERNALS_VERSION >= 12 auto fast_it = fast_types.find(&tp); if (fast_it != fast_types.end()) { -# ifndef NDEBUG +#ifndef NDEBUG auto types_it = types.find(std::type_index(tp)); assert(types_it != types.end()); assert(types_it->second == fast_it->second); -# endif +#endif return fast_it->second; } -#endif // PYBIND11_INTERNALS_VERSION >= 12 auto it = types.find(std::type_index(tp)); if (it != types.end()) { -#if PYBIND11_INTERNALS_VERSION >= 12 // We found the type in the slow map but not the fast one, so // some other DSO added it (otherwise it would be in the fast // map under &tp) and therefore we must be an alias. Record // that. it->second->alias_chain.push_front(&tp); fast_types.emplace(&tp, it->second); -#endif type_info = it->second; } return type_info; diff --git a/include/pybind11/gil_safe_call_once.h b/include/pybind11/gil_safe_call_once.h index 2bedb6d665..a0d74bc6f3 100644 --- a/include/pybind11/gil_safe_call_once.h +++ b/include/pybind11/gil_safe_call_once.h @@ -196,4 +196,5 @@ class gil_safe_call_once_and_store { atomic_bool is_initialized_by_atleast_one_interpreter_{false}; }; #endif + PYBIND11_NAMESPACE_END(PYBIND11_NAMESPACE) diff --git a/include/pybind11/pybind11.h b/include/pybind11/pybind11.h index 91b38d91ed..8bd62c85c9 100644 --- a/include/pybind11/pybind11.h +++ b/include/pybind11/pybind11.h @@ -1692,9 +1692,7 @@ class generic_type : public object { local_internals.registered_types_cpp[rec.type] = tinfo; } else { internals.registered_types_cpp[tindex] = tinfo; -#if PYBIND11_INTERNALS_VERSION >= 12 internals.registered_types_cpp_fast[rec.type] = tinfo; -#endif } PYBIND11_WARNING_PUSH @@ -2201,9 +2199,7 @@ class class_ : public detail::generic_type { type_info *const val = 
internals.registered_types_cpp[std::type_index(typeid(type))]; internals.registered_types_cpp[std::type_index(typeid(type_alias))] = val; -#if PYBIND11_INTERNALS_VERSION >= 12 internals.registered_types_cpp_fast[&typeid(type_alias)] = val; -#endif } }); } From 7d8339eff5998b33c5455c3f6937756e3168d6fa Mon Sep 17 00:00:00 2001 From: Xuehai Pan Date: Mon, 15 Dec 2025 18:45:23 +0800 Subject: [PATCH 10/25] Try fix thread-safety --- include/pybind11/detail/internals.h | 4 ++-- include/pybind11/gil_safe_call_once.h | 28 +++++++++++++-------------- 2 files changed, 16 insertions(+), 16 deletions(-) diff --git a/include/pybind11/detail/internals.h b/include/pybind11/detail/internals.h index b67b9ce6d4..4abf7d41df 100644 --- a/include/pybind11/detail/internals.h +++ b/include/pybind11/detail/internals.h @@ -245,8 +245,8 @@ struct call_once_storage_base { template struct call_once_storage : call_once_storage_base { - void (*finalize)(T &) = nullptr; alignas(T) char storage[sizeof(T)] = {0}; + void (*finalize)(T &) = nullptr; std::atomic_bool is_initialized{false}; call_once_storage() = default; @@ -337,7 +337,7 @@ struct internals { internals &operator=(const internals &other) = delete; internals &operator=(internals &&other) = delete; ~internals() { - for (auto &entry : call_once_storage_map) { + for (const auto &entry : call_once_storage_map) { delete entry.second; } call_once_storage_map.clear(); diff --git a/include/pybind11/gil_safe_call_once.h b/include/pybind11/gil_safe_call_once.h index a0d74bc6f3..98e7149947 100644 --- a/include/pybind11/gil_safe_call_once.h +++ b/include/pybind11/gil_safe_call_once.h @@ -131,22 +131,22 @@ class gil_safe_call_once_and_store { gil_scoped_release gil_rel; // Needed to establish lock ordering. { detail::with_internals([&](detail::internals &internals) { - // The concurrency control is done inside `detail::with_internals`. - // At most one thread will enter here at a time. 
- const void *k = reinterpret_cast(this); + const void *key = reinterpret_cast(this); auto &storage_map = internals.call_once_storage_map; - // There can be multiple threads going through here, but only one each at a - // time. So only one thread will create the storage. Other threads will find it - // already created. - auto it = storage_map.find(k); - if (it == storage_map.end()) { + // There can be multiple threads going through here. + if (storage_map.find(key) == storage_map.end()) { gil_scoped_acquire gil_acq; - auto v = new detail::call_once_storage{}; - ::new (v->storage) T(fn()); // fn may release, but will reacquire, the GIL. - v->finalize = finalize_fn; - last_storage_ptr_ = reinterpret_cast(v->storage); - v->is_initialized = true; - storage_map.emplace(k, v); + // Only one thread will enter here at a time. + // Fast recheck to avoid double work. + if (storage_map.find(key) == storage_map.end()) { + auto value = new detail::call_once_storage{}; + // fn may release, but will reacquire, the GIL. 
+ ::new (value->storage) T(fn()); + value->finalize = finalize_fn; + value->is_initialized = true; + storage_map.emplace(key, value); + last_storage_ptr_ = reinterpret_cast(value->storage); + } } is_initialized_by_atleast_one_interpreter_ = true; }); From 1920f4345a61acbd444f0d3309a124ac7dee895d Mon Sep 17 00:00:00 2001 From: Xuehai Pan Date: Mon, 15 Dec 2025 19:48:42 +0800 Subject: [PATCH 11/25] Try fix thread-safety --- include/pybind11/detail/internals.h | 3 +- include/pybind11/gil_safe_call_once.h | 50 +++++++++++++++------------ 2 files changed, 30 insertions(+), 23 deletions(-) diff --git a/include/pybind11/detail/internals.h b/include/pybind11/detail/internals.h index 4abf7d41df..802a57e3e5 100644 --- a/include/pybind11/detail/internals.h +++ b/include/pybind11/detail/internals.h @@ -245,7 +245,8 @@ struct call_once_storage_base { template struct call_once_storage : call_once_storage_base { - alignas(T) char storage[sizeof(T)] = {0}; + alignas(T) char storage[sizeof(T)] = {}; + std::once_flag once_flag; void (*finalize)(T &) = nullptr; std::atomic_bool is_initialized{false}; diff --git a/include/pybind11/gil_safe_call_once.h b/include/pybind11/gil_safe_call_once.h index 98e7149947..e00bbb9f06 100644 --- a/include/pybind11/gil_safe_call_once.h +++ b/include/pybind11/gil_safe_call_once.h @@ -129,28 +129,34 @@ class gil_safe_call_once_and_store { // Multiple threads may enter here, because the GIL is released in the next line and // CPython API calls in the `fn()` call below may release and reacquire the GIL. gil_scoped_release gil_rel; // Needed to establish lock ordering. - { - detail::with_internals([&](detail::internals &internals) { - const void *key = reinterpret_cast(this); - auto &storage_map = internals.call_once_storage_map; - // There can be multiple threads going through here. - if (storage_map.find(key) == storage_map.end()) { - gil_scoped_acquire gil_acq; - // Only one thread will enter here at a time. - // Fast recheck to avoid double work. 
- if (storage_map.find(key) == storage_map.end()) { - auto value = new detail::call_once_storage{}; - // fn may release, but will reacquire, the GIL. - ::new (value->storage) T(fn()); - value->finalize = finalize_fn; - value->is_initialized = true; - storage_map.emplace(key, value); - last_storage_ptr_ = reinterpret_cast(value->storage); - } + detail::with_internals([&](detail::internals &internals) { + const void *key = reinterpret_cast(this); + auto &storage_map = internals.call_once_storage_map; + // There can be multiple threads going through here. + detail::call_once_storage *value = nullptr; + { + gil_scoped_acquire gil_acq; + // Only one thread will enter here at a time. + const auto it = storage_map.find(key); + if (it != storage_map.end()) { + value = static_cast *>(it->second); + } else { + value = new detail::call_once_storage{}; + storage_map.emplace(key, value); } + } + assert(value != nullptr); + std::call_once(value->once_flag, [&] { + // Only one thread will ever enter here. + gil_scoped_acquire gil_acq; + // fn may release, but will reacquire, the GIL. + ::new (value->storage) T(fn()); + value->finalize = finalize_fn; + value->is_initialized = true; + last_storage_ptr_ = reinterpret_cast(value->storage); is_initialized_by_atleast_one_interpreter_ = true; }); - } + }); // All threads will observe `is_initialized_by_atleast_one_interp_` as true here. } // Intentionally not returning `T &` to ensure the calling code is self-documenting. 
@@ -162,10 +168,10 @@ class gil_safe_call_once_and_store { T *result = last_storage_ptr_; if (!is_last_storage_valid()) { detail::with_internals([&](detail::internals &internals) { - const void *k = reinterpret_cast(this); + const void *key = reinterpret_cast(this); auto &storage_map = internals.call_once_storage_map; - auto *v = static_cast *>(storage_map.at(k)); - result = last_storage_ptr_ = reinterpret_cast(v->storage); + auto *value = static_cast *>(storage_map.at(key)); + result = last_storage_ptr_ = reinterpret_cast(value->storage); }); } assert(result != nullptr); From a6754ba40d2326c3680984b52a6f893cd89d57bd Mon Sep 17 00:00:00 2001 From: Xuehai Pan Date: Tue, 16 Dec 2025 15:57:09 +0800 Subject: [PATCH 12/25] Revert get_pp() --- include/pybind11/detail/internals.h | 56 +++++++++++++++++------------ 1 file changed, 33 insertions(+), 23 deletions(-) diff --git a/include/pybind11/detail/internals.h b/include/pybind11/detail/internals.h index 802a57e3e5..c157bf53cb 100644 --- a/include/pybind11/detail/internals.h +++ b/include/pybind11/detail/internals.h @@ -596,39 +596,46 @@ class internals_pp_manager { /// acquire the GIL. Will never return nullptr. std::unique_ptr *get_pp() { #ifdef PYBIND11_HAS_SUBINTERPRETER_SUPPORT - // WARNING: We cannot use `get_num_interpreters_seen() > 1` here to create a fast path for - // the single-interpreter case. + // FIXME: We cannot use `get_num_interpreters_seen() > 1` here to create a fast path for + // the multi-interpreter case. The singleton may be initialized by a subinterpreter not the + // main interpreter. // // For multi-interpreter support, the subinterpreters can be initialized concurrently, and // the first time this function may not be called in the main interpreter. // For example, a clean main interpreter that does not import any pybind11 module and then // spawns multiple subinterpreters using `InterpreterPoolExecutor` that each imports a // pybind11 module concurrently. 
- // - // Multiple subinterpreters may observe `get_num_interpreters_seen() <= 1` at the same - // time, while `get_num_interpreters_seen() += 1` in `PYBIND11_MODULE(...)` is called - // later. - - // Whenever the interpreter changes on the current thread we need to invalidate the - // internals_pp so that it can be pulled from the interpreter's state dict. That is - // slow, so we use the current PyThreadState to check if it is necessary. - auto *tstate = get_thread_state_unchecked(); - if (!tstate || tstate->interp != last_istate_tls()) { - gil_scoped_acquire_simple gil; - if (!tstate) { - tstate = get_thread_state_unchecked(); + if (get_num_interpreters_seen() > 1) { + // Whenever the interpreter changes on the current thread we need to invalidate the + // internals_pp so that it can be pulled from the interpreter's state dict. That is + // slow, so we use the current PyThreadState to check if it is necessary. + auto *tstate = get_thread_state_unchecked(); + if (!tstate || tstate->interp != last_istate_tls()) { + gil_scoped_acquire_simple gil; + if (!tstate) { + tstate = get_thread_state_unchecked(); + } + last_istate_tls() = tstate->interp; + internals_p_tls() = get_or_create_pp_in_state_dict(); } - last_istate_tls() = tstate->interp; - internals_p_tls() = get_or_create_pp_in_state_dict(); + return internals_p_tls(); } - return internals_p_tls(); -#else - if (!internals_singleton_pp_) { - gil_scoped_acquire_simple gil; - internals_singleton_pp_ = get_or_create_pp_in_state_dict(); +#endif + return get_pp_for_main_interpreter(); + } + + /// Get the pointer-to-pointer for the main interpreter, allocating it if it does not already + /// exist. May acquire the GIL. Will never return nullptr. + std::unique_ptr *get_pp_for_main_interpreter() { + // This function **assumes** that the current thread is running in the main interpreter. 
+ if (!seen_main_interpreter_) { + std::call_once(seen_main_interpreter_flag_, [&] { + gil_scoped_acquire_simple gil; + internals_singleton_pp_ = get_or_create_pp_in_state_dict(); + seen_main_interpreter_ = true; + }); } return internals_singleton_pp_; -#endif } /// Drop all the references we're currently holding. @@ -705,6 +712,9 @@ class internals_pp_manager { char const *holder_id_ = nullptr; on_fetch_function *on_fetch_ = nullptr; std::unique_ptr *internals_singleton_pp_; + + std::once_flag seen_main_interpreter_flag_; + std::atomic_bool seen_main_interpreter_{false}; }; // If We loaded the internals through `state_dict`, our `error_already_set` From 1aed3ab1b4682ab61cd41a00284d5a6f1b63e1d1 Mon Sep 17 00:00:00 2001 From: Xuehai Pan Date: Tue, 16 Dec 2025 16:33:14 +0800 Subject: [PATCH 13/25] Update comments --- include/pybind11/detail/internals.h | 9 +++++++-- include/pybind11/gil_safe_call_once.h | 4 ++-- 2 files changed, 9 insertions(+), 4 deletions(-) diff --git a/include/pybind11/detail/internals.h b/include/pybind11/detail/internals.h index c157bf53cb..4ff904607a 100644 --- a/include/pybind11/detail/internals.h +++ b/include/pybind11/detail/internals.h @@ -627,19 +627,23 @@ class internals_pp_manager { /// Get the pointer-to-pointer for the main interpreter, allocating it if it does not already /// exist. May acquire the GIL. Will never return nullptr. std::unique_ptr *get_pp_for_main_interpreter() { - // This function **assumes** that the current thread is running in the main interpreter. if (!seen_main_interpreter_) { + // The first call to this function **MUST** be from the main interpreter. + // Here we **ASSUME** that the current thread is running in the main interpreter. + // The caller is responsible for ensuring this. 
std::call_once(seen_main_interpreter_flag_, [&] { gil_scoped_acquire_simple gil; internals_singleton_pp_ = get_or_create_pp_in_state_dict(); seen_main_interpreter_ = true; }); } + // This is shared between all threads and all interpreters. return internals_singleton_pp_; } /// Drop all the references we're currently holding. void unref() { + // See comment in get_pp() above. #ifdef PYBIND11_HAS_SUBINTERPRETER_SUPPORT if (get_num_interpreters_seen() > 1) { last_istate_tls() = nullptr; @@ -651,6 +655,7 @@ class internals_pp_manager { } void destroy() { + // See comment in get_pp() above. #ifdef PYBIND11_HAS_SUBINTERPRETER_SUPPORT if (get_num_interpreters_seen() > 1) { auto *tstate = get_thread_state_unchecked(); @@ -711,8 +716,8 @@ class internals_pp_manager { char const *holder_id_ = nullptr; on_fetch_function *on_fetch_ = nullptr; + // Pointer to the singleton internals for the main interpreter std::unique_ptr *internals_singleton_pp_; - std::once_flag seen_main_interpreter_flag_; std::atomic_bool seen_main_interpreter_{false}; }; diff --git a/include/pybind11/gil_safe_call_once.h b/include/pybind11/gil_safe_call_once.h index e00bbb9f06..68314c6f8d 100644 --- a/include/pybind11/gil_safe_call_once.h +++ b/include/pybind11/gil_safe_call_once.h @@ -130,7 +130,7 @@ class gil_safe_call_once_and_store { // CPython API calls in the `fn()` call below may release and reacquire the GIL. gil_scoped_release gil_rel; // Needed to establish lock ordering. detail::with_internals([&](detail::internals &internals) { - const void *key = reinterpret_cast(this); + const void *const key = reinterpret_cast(this); auto &storage_map = internals.call_once_storage_map; // There can be multiple threads going through here. 
detail::call_once_storage *value = nullptr; @@ -168,7 +168,7 @@ class gil_safe_call_once_and_store { T *result = last_storage_ptr_; if (!is_last_storage_valid()) { detail::with_internals([&](detail::internals &internals) { - const void *key = reinterpret_cast(this); + const void *const key = reinterpret_cast(this); auto &storage_map = internals.call_once_storage_map; auto *value = static_cast *>(storage_map.at(key)); result = last_storage_ptr_ = reinterpret_cast(value->storage); From b61e902dce793e2b82b1a0f6e9ba8ffb5c875894 Mon Sep 17 00:00:00 2001 From: Xuehai Pan Date: Wed, 17 Dec 2025 12:02:29 +0800 Subject: [PATCH 14/25] Move call-once storage out of internals --- include/pybind11/detail/internals.h | 41 +------- include/pybind11/gil_safe_call_once.h | 140 +++++++++++++++++++------- 2 files changed, 104 insertions(+), 77 deletions(-) diff --git a/include/pybind11/detail/internals.h b/include/pybind11/detail/internals.h index 4ff904607a..11a2ee4c92 100644 --- a/include/pybind11/detail/internals.h +++ b/include/pybind11/detail/internals.h @@ -234,38 +234,6 @@ inline uint64_t round_up_to_next_pow2(uint64_t x) { class loader_life_support; -struct call_once_storage_base { - call_once_storage_base() = default; - virtual ~call_once_storage_base() = default; - call_once_storage_base(const call_once_storage_base &) = delete; - call_once_storage_base(call_once_storage_base &&) = delete; - call_once_storage_base &operator=(const call_once_storage_base &) = delete; - call_once_storage_base &operator=(call_once_storage_base &&) = delete; -}; - -template -struct call_once_storage : call_once_storage_base { - alignas(T) char storage[sizeof(T)] = {}; - std::once_flag once_flag; - void (*finalize)(T &) = nullptr; - std::atomic_bool is_initialized{false}; - - call_once_storage() = default; - ~call_once_storage() override { - if (is_initialized) { - if (finalize != nullptr) { - finalize(*reinterpret_cast(storage)); - } else { - reinterpret_cast(storage)->~T(); - } - } - }; - 
call_once_storage(const call_once_storage &) = delete; - call_once_storage(call_once_storage &&) = delete; - call_once_storage &operator=(const call_once_storage &) = delete; - call_once_storage &operator=(call_once_storage &&) = delete; -}; - /// Internal data structure used to track registered instances and types. /// Whenever binary incompatible changes are made to this structure, /// `PYBIND11_INTERNALS_VERSION` must be incremented. @@ -310,8 +278,6 @@ struct internals { type_map native_enum_type_map; - std::unordered_map call_once_storage_map; - internals() : static_property_type(make_static_property_type()), default_metaclass(make_default_metaclass()) { @@ -337,12 +303,7 @@ struct internals { internals(internals &&other) = delete; internals &operator=(const internals &other) = delete; internals &operator=(internals &&other) = delete; - ~internals() { - for (const auto &entry : call_once_storage_map) { - delete entry.second; - } - call_once_storage_map.clear(); - } + ~internals() = default; }; // the internals struct (above) is shared between all the modules. local_internals are only diff --git a/include/pybind11/gil_safe_call_once.h b/include/pybind11/gil_safe_call_once.h index 68314c6f8d..2268ca3ac7 100644 --- a/include/pybind11/gil_safe_call_once.h +++ b/include/pybind11/gil_safe_call_once.h @@ -115,7 +115,45 @@ class gil_safe_call_once_and_store { // In this case, we should store the result per-interpreter instead of globally, because each // subinterpreter has its own separate state. The cached result may not shareable across // interpreters (e.g., imported modules and their members). 
-// + +struct call_once_storage_base { + call_once_storage_base() = default; + virtual ~call_once_storage_base() = default; + call_once_storage_base(const call_once_storage_base &) = delete; + call_once_storage_base(call_once_storage_base &&) = delete; + call_once_storage_base &operator=(const call_once_storage_base &) = delete; + call_once_storage_base &operator=(call_once_storage_base &&) = delete; +}; + +template +struct call_once_storage : call_once_storage_base { + alignas(T) char storage[sizeof(T)] = {}; + std::once_flag once_flag; + void (*finalize)(T &) = nullptr; + std::atomic_bool is_initialized{false}; + + call_once_storage() = default; + ~call_once_storage() override { + if (is_initialized) { + if (finalize != nullptr) { + finalize(*reinterpret_cast(storage)); + } else { + reinterpret_cast(storage)->~T(); + } + } + }; + call_once_storage(const call_once_storage &) = delete; + call_once_storage(call_once_storage &&) = delete; + call_once_storage &operator=(const call_once_storage &) = delete; + call_once_storage &operator=(call_once_storage &&) = delete; +}; + +/// Storage map for `gil_safe_call_once_and_store`. Stored in a capsule in the interpreter's state +/// dict with proper destructor to ensure cleanup when the interpreter is destroyed. +using call_once_storage_map_type = std::unordered_map; + +# define PYBIND11_CALL_ONCE_STORAGE_MAP_ID PYBIND11_INTERNALS_ID "_call_once_storage_map__" + // The life span of the stored result is the entire interpreter lifetime. An additional // `finalize_fn` can be provided to clean up the stored result when the interpreter is destroyed. template @@ -129,35 +167,33 @@ class gil_safe_call_once_and_store { // Multiple threads may enter here, because the GIL is released in the next line and // CPython API calls in the `fn()` call below may release and reacquire the GIL. gil_scoped_release gil_rel; // Needed to establish lock ordering. 
- detail::with_internals([&](detail::internals &internals) { - const void *const key = reinterpret_cast(this); - auto &storage_map = internals.call_once_storage_map; - // There can be multiple threads going through here. - detail::call_once_storage *value = nullptr; - { - gil_scoped_acquire gil_acq; - // Only one thread will enter here at a time. - const auto it = storage_map.find(key); - if (it != storage_map.end()) { - value = static_cast *>(it->second); - } else { - value = new detail::call_once_storage{}; - storage_map.emplace(key, value); - } + const void *const key = reinterpret_cast(this); + // There can be multiple threads going through here. + call_once_storage *value = nullptr; + { + gil_scoped_acquire gil_acq; + // Only one thread will enter here at a time. + auto &storage_map = *get_or_create_call_once_storage_map(); + const auto it = storage_map.find(key); + if (it != storage_map.end()) { + value = static_cast *>(it->second); + } else { + value = new call_once_storage{}; + storage_map.emplace(key, value); } - assert(value != nullptr); - std::call_once(value->once_flag, [&] { - // Only one thread will ever enter here. - gil_scoped_acquire gil_acq; - // fn may release, but will reacquire, the GIL. - ::new (value->storage) T(fn()); - value->finalize = finalize_fn; - value->is_initialized = true; - last_storage_ptr_ = reinterpret_cast(value->storage); - is_initialized_by_atleast_one_interpreter_ = true; - }); + } + assert(value != nullptr); + std::call_once(value->once_flag, [&] { + // Only one thread will ever enter here. + gil_scoped_acquire gil_acq; + // fn may release, but will reacquire, the GIL. + ::new (value->storage) T(fn()); + value->finalize = finalize_fn; + value->is_initialized = true; + last_storage_ptr_ = reinterpret_cast(value->storage); + is_initialized_by_atleast_one_interpreter_ = true; }); - // All threads will observe `is_initialized_by_atleast_one_interp_` as true here. 
+ // All threads will observe `is_initialized_by_atleast_one_interpreter_` as true here. } // Intentionally not returning `T &` to ensure the calling code is self-documenting. return *this; @@ -167,12 +203,11 @@ class gil_safe_call_once_and_store { T &get_stored() { T *result = last_storage_ptr_; if (!is_last_storage_valid()) { - detail::with_internals([&](detail::internals &internals) { - const void *const key = reinterpret_cast(this); - auto &storage_map = internals.call_once_storage_map; - auto *value = static_cast *>(storage_map.at(key)); - result = last_storage_ptr_ = reinterpret_cast(value->storage); - }); + gil_scoped_acquire gil_acq; + const void *const key = reinterpret_cast(this); + auto &storage_map = *get_or_create_call_once_storage_map(); + auto *value = static_cast *>(storage_map.at(key)); + result = last_storage_ptr_ = reinterpret_cast(value->storage); } assert(result != nullptr); return *result; @@ -187,12 +222,43 @@ class gil_safe_call_once_and_store { private: bool is_last_storage_valid() const { return is_initialized_by_atleast_one_interpreter_ - && detail::get_num_interpreters_seen() <= 1; + && detail::get_num_interpreters_seen() == 1; + } + + static call_once_storage_map_type *get_or_create_call_once_storage_map() { + error_scope err_scope; + dict state_dict = detail::get_python_state_dict(); + auto storage_map_obj = reinterpret_steal( + detail::dict_getitemstringref(state_dict.ptr(), PYBIND11_CALL_ONCE_STORAGE_MAP_ID)); + call_once_storage_map_type *storage_map = nullptr; + if (storage_map_obj) { + void *raw_ptr = PyCapsule_GetPointer(storage_map_obj.ptr(), /*name=*/nullptr); + if (!raw_ptr) { + raise_from(PyExc_SystemError, + "pybind11::gil_safe_call_once_and_store::" + "get_or_create_call_once_storage_map() FAILED"); + throw error_already_set(); + } + storage_map = reinterpret_cast(raw_ptr); + } else { + storage_map = new call_once_storage_map_type(); + // Create capsule with destructor to clean up the storage map when the interpreter + // 
shuts down + state_dict[PYBIND11_CALL_ONCE_STORAGE_MAP_ID] + = capsule(storage_map, [](void *ptr) noexcept { + auto *map = reinterpret_cast(ptr); + for (const auto &entry : *map) { + delete entry.second; + } + delete map; + }); + } + return storage_map; } // No storage needed when subinterpreter support is enabled. - // The actual storage is stored in the per-interpreter state dict in - // `internals.call_once_storage_map`. + // The actual storage is stored in the per-interpreter state dict via + // `get_or_create_call_once_storage_map()`. // Fast local cache to avoid repeated lookups when there are no multiple interpreters. // This is only valid if there is a single interpreter. Otherwise, it is not used. From b72cd4162baf14472f37c66144aca55df7c9fa74 Mon Sep 17 00:00:00 2001 From: Xuehai Pan Date: Wed, 17 Dec 2025 11:22:54 +0800 Subject: [PATCH 15/25] Revert internal version bump --- include/pybind11/detail/class.h | 2 ++ include/pybind11/detail/internals.h | 13 ++++++++++--- include/pybind11/detail/type_caster_base.h | 10 ++++++++-- include/pybind11/pybind11.h | 4 ++++ 4 files changed, 24 insertions(+), 5 deletions(-) diff --git a/include/pybind11/detail/class.h b/include/pybind11/detail/class.h index 1cd9af0bd1..21e966cfea 100644 --- a/include/pybind11/detail/class.h +++ b/include/pybind11/detail/class.h @@ -226,12 +226,14 @@ extern "C" inline void pybind11_meta_dealloc(PyObject *obj) { local_internals.registered_types_cpp.erase(tinfo->cpptype); } else { internals.registered_types_cpp.erase(tindex); +#if PYBIND11_INTERNALS_VERSION >= 12 internals.registered_types_cpp_fast.erase(tinfo->cpptype); for (const std::type_info *alias : tinfo->alias_chain) { auto num_erased = internals.registered_types_cpp_fast.erase(alias); (void) num_erased; assert(num_erased > 0); } +#endif } internals.registered_types_py.erase(tinfo->type); diff --git a/include/pybind11/detail/internals.h b/include/pybind11/detail/internals.h index 11a2ee4c92..5347511538 100644 --- 
a/include/pybind11/detail/internals.h +++ b/include/pybind11/detail/internals.h @@ -39,11 +39,11 @@ /// further ABI-incompatible changes may be made before the ABI is officially /// changed to the new version. #ifndef PYBIND11_INTERNALS_VERSION -# define PYBIND11_INTERNALS_VERSION 12 +# define PYBIND11_INTERNALS_VERSION 11 #endif -#if PYBIND11_INTERNALS_VERSION < 12 -# error "PYBIND11_INTERNALS_VERSION 12 is the minimum for all platforms for pybind11v3." +#if PYBIND11_INTERNALS_VERSION < 11 +# error "PYBIND11_INTERNALS_VERSION 11 is the minimum for all platforms for pybind11v3." #endif PYBIND11_NAMESPACE_BEGIN(PYBIND11_NAMESPACE) @@ -242,12 +242,14 @@ struct internals { pymutex mutex; pymutex exception_translator_mutex; #endif +#if PYBIND11_INTERNALS_VERSION >= 12 // non-normative but fast "hint" for registered_types_cpp. Meant // to be used as the first level of a two-level lookup: successful // lookups are correct, but unsuccessful lookups need to try // registered_types_cpp and then backfill this map if they find // anything. 
fast_type_map registered_types_cpp_fast; +#endif // std::type_index -> pybind11's type information type_map registered_types_cpp; @@ -273,6 +275,9 @@ struct internals { PyObject *instance_base = nullptr; // Unused if PYBIND11_SIMPLE_GIL_MANAGEMENT is defined: thread_specific_storage tstate; +#if PYBIND11_INTERNALS_VERSION <= 11 + thread_specific_storage loader_life_support_tls; // OBSOLETE (PR #5830) +#endif // Unused if PYBIND11_SIMPLE_GIL_MANAGEMENT is defined: PyInterpreterState *istate = nullptr; @@ -353,6 +358,7 @@ struct type_info { void *(*module_local_load)(PyObject *, const type_info *) = nullptr; holder_enum_t holder_enum_v = holder_enum_t::undefined; +#if PYBIND11_INTERNALS_VERSION >= 12 // When a type appears in multiple DSOs, // internals::registered_types_cpp_fast will have multiple distinct // keys (the std::type_info from each DSO) mapped to the same @@ -363,6 +369,7 @@ struct type_info { // nb_alias_chain` added in // https://github.com/wjakob/nanobind/commit/b515b1f7f2f4ecc0357818e6201c94a9f4cbfdc2 std::forward_list alias_chain; +#endif /* A simple type never occurs as a (direct or indirect) parent * of a class that makes use of multiple inheritance. diff --git a/include/pybind11/detail/type_caster_base.h b/include/pybind11/detail/type_caster_base.h index 21b7f0950e..b0c59e1138 100644 --- a/include/pybind11/detail/type_caster_base.h +++ b/include/pybind11/detail/type_caster_base.h @@ -227,26 +227,32 @@ inline detail::type_info *get_global_type_info_lock_held(const std::type_info &t // next time. 
detail::type_info *type_info = nullptr; auto &internals = get_internals(); +#if PYBIND11_INTERNALS_VERSION >= 12 auto &fast_types = internals.registered_types_cpp_fast; +#endif auto &types = internals.registered_types_cpp; +#if PYBIND11_INTERNALS_VERSION >= 12 auto fast_it = fast_types.find(&tp); if (fast_it != fast_types.end()) { -#ifndef NDEBUG +# ifndef NDEBUG auto types_it = types.find(std::type_index(tp)); assert(types_it != types.end()); assert(types_it->second == fast_it->second); -#endif +# endif return fast_it->second; } +#endif // PYBIND11_INTERNALS_VERSION >= 12 auto it = types.find(std::type_index(tp)); if (it != types.end()) { +#if PYBIND11_INTERNALS_VERSION >= 12 // We found the type in the slow map but not the fast one, so // some other DSO added it (otherwise it would be in the fast // map under &tp) and therefore we must be an alias. Record // that. it->second->alias_chain.push_front(&tp); fast_types.emplace(&tp, it->second); +#endif type_info = it->second; } return type_info; diff --git a/include/pybind11/pybind11.h b/include/pybind11/pybind11.h index 8bd62c85c9..91b38d91ed 100644 --- a/include/pybind11/pybind11.h +++ b/include/pybind11/pybind11.h @@ -1692,7 +1692,9 @@ class generic_type : public object { local_internals.registered_types_cpp[rec.type] = tinfo; } else { internals.registered_types_cpp[tindex] = tinfo; +#if PYBIND11_INTERNALS_VERSION >= 12 internals.registered_types_cpp_fast[rec.type] = tinfo; +#endif } PYBIND11_WARNING_PUSH @@ -2199,7 +2201,9 @@ class class_ : public detail::generic_type { type_info *const val = internals.registered_types_cpp[std::type_index(typeid(type))]; internals.registered_types_cpp[std::type_index(typeid(type_alias))] = val; +#if PYBIND11_INTERNALS_VERSION >= 12 internals.registered_types_cpp_fast[&typeid(type_alias)] = val; +#endif } }); } From ac02a3208d4bd377059bb97bba4df5bb8f1b3923 Mon Sep 17 00:00:00 2001 From: Xuehai Pan Date: Wed, 17 Dec 2025 12:07:48 +0800 Subject: [PATCH 16/25] Cleanup outdated 
comments --- include/pybind11/detail/internals.h | 35 +++++------------------------ 1 file changed, 5 insertions(+), 30 deletions(-) diff --git a/include/pybind11/detail/internals.h b/include/pybind11/detail/internals.h index 5347511538..5ccd4d18e5 100644 --- a/include/pybind11/detail/internals.h +++ b/include/pybind11/detail/internals.h @@ -564,15 +564,6 @@ class internals_pp_manager { /// acquire the GIL. Will never return nullptr. std::unique_ptr *get_pp() { #ifdef PYBIND11_HAS_SUBINTERPRETER_SUPPORT - // FIXME: We cannot use `get_num_interpreters_seen() > 1` here to create a fast path for - // the multi-interpreter case. The singleton may be initialized by a subinterpreter not the - // main interpreter. - // - // For multi-interpreter support, the subinterpreters can be initialized concurrently, and - // the first time this function may not be called in the main interpreter. - // For example, a clean main interpreter that does not import any pybind11 module and then - // spawns multiple subinterpreters using `InterpreterPoolExecutor` that each imports a - // pybind11 module concurrently. if (get_num_interpreters_seen() > 1) { // Whenever the interpreter changes on the current thread we need to invalidate the // internals_pp so that it can be pulled from the interpreter's state dict. That is @@ -589,29 +580,15 @@ class internals_pp_manager { return internals_p_tls(); } #endif - return get_pp_for_main_interpreter(); - } - - /// Get the pointer-to-pointer for the main interpreter, allocating it if it does not already - /// exist. May acquire the GIL. Will never return nullptr. - std::unique_ptr *get_pp_for_main_interpreter() { - if (!seen_main_interpreter_) { - // The first call to this function **MUST** be from the main interpreter. - // Here we **ASSUME** that the current thread is running in the main interpreter. - // The caller is responsible for ensuring this. 
- std::call_once(seen_main_interpreter_flag_, [&] { - gil_scoped_acquire_simple gil; - internals_singleton_pp_ = get_or_create_pp_in_state_dict(); - seen_main_interpreter_ = true; - }); + if (!internals_singleton_pp_) { + gil_scoped_acquire_simple gil; + internals_singleton_pp_ = get_or_create_pp_in_state_dict(); } - // This is shared between all threads and all interpreters. return internals_singleton_pp_; } /// Drop all the references we're currently holding. void unref() { - // See comment in get_pp() above. #ifdef PYBIND11_HAS_SUBINTERPRETER_SUPPORT if (get_num_interpreters_seen() > 1) { last_istate_tls() = nullptr; @@ -623,7 +600,6 @@ class internals_pp_manager { } void destroy() { - // See comment in get_pp() above. #ifdef PYBIND11_HAS_SUBINTERPRETER_SUPPORT if (get_num_interpreters_seen() > 1) { auto *tstate = get_thread_state_unchecked(); @@ -684,10 +660,9 @@ class internals_pp_manager { char const *holder_id_ = nullptr; on_fetch_function *on_fetch_ = nullptr; - // Pointer to the singleton internals for the main interpreter + // Pointer-to-pointer to the singleton internals for the first seen interpreter (may not be the + // main interpreter) std::unique_ptr *internals_singleton_pp_; - std::once_flag seen_main_interpreter_flag_; - std::atomic_bool seen_main_interpreter_{false}; }; // If We loaded the internals through `state_dict`, our `error_already_set` From ddb6dd4c73cc84b82373ba6c2e851dbecd19ad62 Mon Sep 17 00:00:00 2001 From: "Ralf W. Grosse-Kunstleve" Date: Fri, 19 Dec 2025 21:56:14 -0800 Subject: [PATCH 17/25] Move atomic_bool alias into pybind11::detail namespace The `using atomic_bool = ...` declaration was at global scope, polluting the global namespace. Move it into pybind11::detail to avoid potential conflicts with user code. 
--- include/pybind11/gil_safe_call_once.h | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/include/pybind11/gil_safe_call_once.h b/include/pybind11/gil_safe_call_once.h index 2268ca3ac7..ffd147ad0b 100644 --- a/include/pybind11/gil_safe_call_once.h +++ b/include/pybind11/gil_safe_call_once.h @@ -11,13 +11,17 @@ #if defined(Py_GIL_DISABLED) || defined(PYBIND11_HAS_SUBINTERPRETER_SUPPORT) # include <atomic> +#endif + +PYBIND11_NAMESPACE_BEGIN(PYBIND11_NAMESPACE) +namespace detail { +#if defined(Py_GIL_DISABLED) || defined(PYBIND11_HAS_SUBINTERPRETER_SUPPORT) using atomic_bool = std::atomic_bool; #else using atomic_bool = bool; #endif - -PYBIND11_NAMESPACE_BEGIN(PYBIND11_NAMESPACE) +} // namespace detail // Use the `gil_safe_call_once_and_store` class below instead of the naive // @@ -108,7 +112,7 @@ class gil_safe_call_once_and_store { // The `is_initialized_`-`storage_` pair is very similar to `std::optional`, // but the latter does not have the triviality properties of former, // therefore `std::optional` is not a viable alternative here. - atomic_bool is_initialized_{false}; + detail::atomic_bool is_initialized_{false}; }; #else // Subinterpreter support is enabled. @@ -265,7 +269,7 @@ class gil_safe_call_once_and_store { T *last_storage_ptr_ = nullptr; // This flag is true if the value has been initialized by any interpreter (may not be the // current one). - atomic_bool is_initialized_by_atleast_one_interpreter_{false}; + detail::atomic_bool is_initialized_by_atleast_one_interpreter_{false}; }; #endif From 3fb52dff6a90385f278473487f8b56fd48d46598 Mon Sep 17 00:00:00 2001 From: "Ralf W. Grosse-Kunstleve" Date: Fri, 19 Dec 2025 21:56:30 -0800 Subject: [PATCH 18/25] Add explicit #include <unordered_map> for subinterpreter support The subinterpreter branch uses std::unordered_map but relied on transitive includes. Add an explicit include for robustness.
--- include/pybind11/gil_safe_call_once.h | 3 +++ 1 file changed, 3 insertions(+) diff --git a/include/pybind11/gil_safe_call_once.h b/include/pybind11/gil_safe_call_once.h index ffd147ad0b..20166ecf08 100644 --- a/include/pybind11/gil_safe_call_once.h +++ b/include/pybind11/gil_safe_call_once.h @@ -12,6 +12,9 @@ #if defined(Py_GIL_DISABLED) || defined(PYBIND11_HAS_SUBINTERPRETER_SUPPORT) # include <atomic> #endif +#ifdef PYBIND11_HAS_SUBINTERPRETER_SUPPORT +# include <unordered_map> +#endif PYBIND11_NAMESPACE_BEGIN(PYBIND11_NAMESPACE) From 32deca43804bde28bf863cf9e0f9d0f3fbab69f9 Mon Sep 17 00:00:00 2001 From: "Ralf W. Grosse-Kunstleve" Date: Fri, 19 Dec 2025 21:56:49 -0800 Subject: [PATCH 19/25] Remove extraneous semicolon after destructor definition Style fix: remove trailing semicolon after ~call_once_storage() destructor body. --- include/pybind11/gil_safe_call_once.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/pybind11/gil_safe_call_once.h b/include/pybind11/gil_safe_call_once.h index 20166ecf08..51e7f0c931 100644 --- a/include/pybind11/gil_safe_call_once.h +++ b/include/pybind11/gil_safe_call_once.h @@ -148,7 +148,7 @@ struct call_once_storage : call_once_storage_base { reinterpret_cast<T *>(storage)->~T(); } } - }; + } call_once_storage(const call_once_storage &) = delete; call_once_storage(call_once_storage &&) = delete; call_once_storage &operator=(const call_once_storage &) = delete; From a4d4d734863c4d413dc361893f72700e98703a94 Mon Sep 17 00:00:00 2001 From: "Ralf W. Grosse-Kunstleve" Date: Fri, 19 Dec 2025 21:57:11 -0800 Subject: [PATCH 20/25] Add comment explaining unused finalize parameter Clarify why the finalize callback parameter is intentionally ignored when subinterpreter support is disabled: the storage is process-global and leaked to avoid destructor calls after interpreter finalization.
--- include/pybind11/gil_safe_call_once.h | 3 +++ 1 file changed, 3 insertions(+) diff --git a/include/pybind11/gil_safe_call_once.h b/include/pybind11/gil_safe_call_once.h index 51e7f0c931..6ac6af4aff 100644 --- a/include/pybind11/gil_safe_call_once.h +++ b/include/pybind11/gil_safe_call_once.h @@ -70,6 +70,9 @@ template <typename T> class gil_safe_call_once_and_store { public: // PRECONDITION: The GIL must be held when `call_once_and_store_result()` is called. + // Note: The second parameter (finalize callback) is intentionally unused when subinterpreter + // support is disabled. In that case, storage is process-global and intentionally leaked to + // avoid calling destructors after the Python interpreter has been finalized. template <typename Callable> gil_safe_call_once_and_store &call_once_and_store_result(Callable &&fn, void (*)(T &) /*unused*/ = nullptr) { From 7cb30ceb070f8bc8682e245b4b9446f244664c20 Mon Sep 17 00:00:00 2001 From: "Ralf W. Grosse-Kunstleve" Date: Fri, 19 Dec 2025 21:57:36 -0800 Subject: [PATCH 21/25] Add comment explaining error_scope usage Clarify why error_scope is used: to preserve any existing Python error state that might be cleared or modified by dict_getitemstringref. --- include/pybind11/gil_safe_call_once.h | 3 +++ 1 file changed, 3 insertions(+) diff --git a/include/pybind11/gil_safe_call_once.h b/include/pybind11/gil_safe_call_once.h index 6ac6af4aff..083018f950 100644 --- a/include/pybind11/gil_safe_call_once.h +++ b/include/pybind11/gil_safe_call_once.h @@ -236,6 +236,9 @@ class gil_safe_call_once_and_store { } static call_once_storage_map_type *get_or_create_call_once_storage_map() { + // Preserve any existing Python error state. dict_getitemstringref may clear + // errors or set new ones when the key is not found; we restore the original + // error state when this scope exits.
error_scope err_scope; dict state_dict = detail::get_python_state_dict(); auto storage_map_obj = reinterpret_steal( From 7d3413944d5d2b2c8788c8bc3c5da99ea4711b5b Mon Sep 17 00:00:00 2001 From: "Ralf W. Grosse-Kunstleve" Date: Fri, 19 Dec 2025 21:58:18 -0800 Subject: [PATCH 22/25] Improve exception safety in get_or_create_call_once_storage_map() Use std::unique_ptr to hold the newly allocated storage map until the capsule is successfully created. This prevents a memory leak if capsule creation throws an exception. --- include/pybind11/gil_safe_call_once.h | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/include/pybind11/gil_safe_call_once.h b/include/pybind11/gil_safe_call_once.h index 083018f950..b68dd26e65 100644 --- a/include/pybind11/gil_safe_call_once.h +++ b/include/pybind11/gil_safe_call_once.h @@ -13,6 +13,7 @@ # include <atomic> #endif #ifdef PYBIND11_HAS_SUBINTERPRETER_SUPPORT +# include <memory> # include <unordered_map> #endif @@ -254,17 +255,22 @@ class gil_safe_call_once_and_store { } storage_map = reinterpret_cast<call_once_storage_map_type *>(raw_ptr); } else { - storage_map = new call_once_storage_map_type(); + // Use unique_ptr for exception safety: if capsule creation throws, + // the map is automatically deleted. + auto storage_map_ptr = std::unique_ptr<call_once_storage_map_type>( + new call_once_storage_map_type()); // Create capsule with destructor to clean up the storage map when the interpreter // shuts down state_dict[PYBIND11_CALL_ONCE_STORAGE_MAP_ID] - = capsule(storage_map, [](void *ptr) noexcept { + = capsule(storage_map_ptr.get(), [](void *ptr) noexcept { auto *map = reinterpret_cast<call_once_storage_map_type *>(ptr); for (const auto &entry : *map) { delete entry.second; } delete map; }); + // Capsule now owns the storage map, release from unique_ptr + storage_map = storage_map_ptr.release(); } return storage_map; } From 78e39452d13fa1106f60ff8d6d6be82616444f9f Mon Sep 17 00:00:00 2001 From: "Ralf W.
Grosse-Kunstleve" Date: Sat, 20 Dec 2025 03:32:42 -0800 Subject: [PATCH 23/25] Add timeout-minutes: 3 to cpptest workflow steps Add a 3-minute timeout to all C++ test (cpptest) steps across all platforms to detect hangs early. This uses GitHub Actions' built-in timeout-minutes property which works on Linux, macOS, and Windows. --- .github/workflows/ci.yml | 15 +++++++++++++++ .github/workflows/reusable-standard.yml | 1 + .github/workflows/upstream.yml | 2 ++ 3 files changed, 18 insertions(+) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index c5a200e32e..4800b9c25c 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -229,6 +229,7 @@ jobs: run: cmake --build . --target pytest - name: Compiled tests + timeout-minutes: 3 run: cmake --build . --target cpptest - name: Interface test @@ -334,6 +335,7 @@ jobs: run: cmake --build --preset default --target pytest - name: C++ tests + timeout-minutes: 3 run: cmake --build --preset default --target cpptest - name: Visibility test @@ -393,6 +395,7 @@ jobs: run: cmake --build build --target pytest - name: C++ tests + timeout-minutes: 3 run: cmake --build build --target cpptest - name: Interface test @@ -516,6 +519,7 @@ jobs: run: cmake --build build --target pytest - name: C++ tests + timeout-minutes: 3 run: cmake --build build --target cpptest - name: Interface test @@ -570,6 +574,7 @@ jobs: run: cmake --build build --target pytest - name: C++ tests + timeout-minutes: 3 run: cmake --build build --target cpptest - name: Interface test @@ -652,6 +657,7 @@ jobs: cmake --build build-11 --target check - name: C++ tests C++11 + timeout-minutes: 3 run: | set +e; source /opt/intel/oneapi/setvars.sh; set -e cmake --build build-11 --target cpptest @@ -689,6 +695,7 @@ jobs: cmake --build build-17 --target check - name: C++ tests C++17 + timeout-minutes: 3 run: | set +e; source /opt/intel/oneapi/setvars.sh; set -e cmake --build build-17 --target cpptest @@ -760,6 +767,7 @@ jobs: run: cmake --build 
build --target pytest - name: C++ tests + timeout-minutes: 3 run: cmake --build build --target cpptest - name: Interface test @@ -1000,6 +1008,7 @@ jobs: run: cmake --build build --target pytest - name: C++20 tests + timeout-minutes: 3 run: cmake --build build --target cpptest -j 2 - name: Interface test C++20 @@ -1076,6 +1085,7 @@ jobs: run: cmake --build build --target pytest -j 2 - name: C++11 tests + timeout-minutes: 3 run: PYTHONHOME=/${{matrix.sys}} PYTHONPATH=/${{matrix.sys}} cmake --build build --target cpptest -j 2 - name: Interface test C++11 @@ -1100,6 +1110,7 @@ jobs: run: cmake --build build2 --target pytest -j 2 - name: C++14 tests + timeout-minutes: 3 run: PYTHONHOME=/${{matrix.sys}} PYTHONPATH=/${{matrix.sys}} cmake --build build2 --target cpptest -j 2 - name: Interface test C++14 @@ -1124,6 +1135,7 @@ jobs: run: cmake --build build3 --target pytest -j 2 - name: C++17 tests + timeout-minutes: 3 run: PYTHONHOME=/${{matrix.sys}} PYTHONPATH=/${{matrix.sys}} cmake --build build3 --target cpptest -j 2 - name: Interface test C++17 @@ -1195,6 +1207,7 @@ jobs: run: cmake --build . --target pytest -j 2 - name: C++ tests + timeout-minutes: 3 run: cmake --build . --target cpptest -j 2 - name: Interface test @@ -1257,6 +1270,7 @@ jobs: run: cmake --build . --target pytest -j 2 - name: C++ tests + timeout-minutes: 3 run: cmake --build . 
--target cpptest -j 2 - name: Interface test @@ -1329,6 +1343,7 @@ jobs: run: cmake --build build --target pytest -j 2 - name: C++ tests + timeout-minutes: 3 run: PYTHONHOME=/clangarm64 PYTHONPATH=/clangarm64 cmake --build build --target cpptest -j 2 - name: Interface test diff --git a/.github/workflows/reusable-standard.yml b/.github/workflows/reusable-standard.yml index 96b14bdfba..56d92e2779 100644 --- a/.github/workflows/reusable-standard.yml +++ b/.github/workflows/reusable-standard.yml @@ -83,6 +83,7 @@ jobs: run: cmake --build build --target pytest - name: C++ tests + timeout-minutes: 3 run: cmake --build build --target cpptest - name: Interface test diff --git a/.github/workflows/upstream.yml b/.github/workflows/upstream.yml index 15ede7a856..890ae0b6fd 100644 --- a/.github/workflows/upstream.yml +++ b/.github/workflows/upstream.yml @@ -66,6 +66,7 @@ jobs: run: cmake --build build11 --target pytest -j 2 - name: C++11 tests + timeout-minutes: 3 run: cmake --build build11 --target cpptest -j 2 - name: Interface test C++11 @@ -87,6 +88,7 @@ jobs: run: cmake --build build17 --target pytest - name: C++17 tests + timeout-minutes: 3 run: cmake --build build17 --target cpptest # Third build - C++17 mode with unstable ABI From 1014ee403f6bd65f55ec5e62a5d969dd6e286d3c Mon Sep 17 00:00:00 2001 From: "Ralf W. Grosse-Kunstleve" Date: Sat, 20 Dec 2025 03:38:00 -0800 Subject: [PATCH 24/25] Add progress reporter for test_with_catch Catch2 runner Add a custom Catch2 streaming reporter that prints one line per test case as it starts and ends, with immediate flushing to keep CI logs current. This makes it easy to see where the embedded/interpreter tests are spending time and to pinpoint which test case is stuck when builds hang (e.g., free-threading issues). 
The reporter: - Prints "[ RUN ]" when each test starts - Prints "[ OK ]" or "[ FAILED ]" when each test ends - Prints the Python version once at the start via Py_GetVersion() - Uses StreamingReporterBase for immediate output (not buffered) - Is set as the default reporter via CATCH_CONFIG_DEFAULT_REPORTER This approach gives visibility into all tests without changing their behavior, turning otherwise opaque 90-minute CI timeouts into locatable issues in the Catch output. --- tests/test_with_catch/catch.cpp | 58 +++++++++++++++++++++++++++++++++ 1 file changed, 58 insertions(+) diff --git a/tests/test_with_catch/catch.cpp b/tests/test_with_catch/catch.cpp index 5bd8b3880e..5dbc01f677 100644 --- a/tests/test_with_catch/catch.cpp +++ b/tests/test_with_catch/catch.cpp @@ -13,10 +13,68 @@ PYBIND11_WARNING_DISABLE_MSVC(4996) #endif #define CATCH_CONFIG_RUNNER +#define CATCH_CONFIG_DEFAULT_REPORTER "progress" #include namespace py = pybind11; +// Simple progress reporter that prints a line per test case. +namespace { + +class ProgressReporter : public Catch::StreamingReporterBase { +public: + using StreamingReporterBase::StreamingReporterBase; + + static std::string getDescription() { return "Simple progress reporter (one line per test)"; } + + void testCaseStarting(Catch::TestCaseInfo const &testInfo) override { + print_python_version_once(); + auto &os = Catch::cout(); + os << "[ RUN ] " << testInfo.name << '\n'; + os.flush(); + } + + void testCaseEnded(Catch::TestCaseStats const &stats) override { + bool failed = stats.totals.assertions.failed > 0; + auto &os = Catch::cout(); + os << (failed ? 
"[ FAILED ] " : "[ OK ] ") << stats.testInfo.name << '\n'; + os.flush(); + } + + void noMatchingTestCases(std::string const &spec) override { + auto &os = Catch::cout(); + os << "[ NO TEST ] no matching test cases for spec: " << spec << '\n'; + os.flush(); + } + + void reportInvalidArguments(std::string const &arg) override { + auto &os = Catch::cout(); + os << "[ ERROR ] invalid Catch2 arguments: " << arg << '\n'; + os.flush(); + } + + void assertionStarting(Catch::AssertionInfo const &) override {} + + bool assertionEnded(Catch::AssertionStats const &) override { return false; } + +private: + void print_python_version_once() { + if (printed_) { + return; + } + printed_ = true; + auto &os = Catch::cout(); + os << "[ PYTHON ] " << Py_GetVersion() << '\n'; + os.flush(); + } + + bool printed_ = false; +}; + +} // namespace + +CATCH_REGISTER_REPORTER("progress", ProgressReporter) + int main(int argc, char *argv[]) { // Setup for TEST_CASE in test_interpreter.cpp, tagging on a large random number: std::string updated_pythonpath("pybind11_test_with_catch_PYTHONPATH_2099743835476552"); From 21d0dc5e5919449a57d30729079b27f87101e7c7 Mon Sep 17 00:00:00 2001 From: "Ralf W. Grosse-Kunstleve" Date: Sat, 20 Dec 2025 03:40:50 -0800 Subject: [PATCH 25/25] clang-format auto-fix (overlooked before) --- include/pybind11/gil_safe_call_once.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/include/pybind11/gil_safe_call_once.h b/include/pybind11/gil_safe_call_once.h index b68dd26e65..a6450cf7d6 100644 --- a/include/pybind11/gil_safe_call_once.h +++ b/include/pybind11/gil_safe_call_once.h @@ -257,8 +257,8 @@ class gil_safe_call_once_and_store { } else { // Use unique_ptr for exception safety: if capsule creation throws, // the map is automatically deleted. 
- auto storage_map_ptr = std::unique_ptr<call_once_storage_map_type>( - new call_once_storage_map_type()); + auto storage_map_ptr + = std::unique_ptr<call_once_storage_map_type>(new call_once_storage_map_type()); // Create capsule with destructor to clean up the storage map when the interpreter // shuts down state_dict[PYBIND11_CALL_ONCE_STORAGE_MAP_ID]