From 212e9428c981719d3cdb36e891ae935541f26e43 Mon Sep 17 00:00:00 2001 From: Adam Johnson Date: Thu, 18 Dec 2025 00:03:45 +0000 Subject: [PATCH] Optimize `ObjectId` MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit From profiles of a mongoengine-using project, I’ve noticed that `ObjectId` creation happens a lot, around ~0.5% of total request time. This PR optimizes the class in two ways: 1. It inlines the `__generate()` and `__validate()` methods, removing their call overhead. The methods are small and not called anywhere else, and Python imposes a constant overhead for method calls, so inlining them saves time. 2. Checking for `str` values before `ObjectId` ones. I believe that typical usage patterns involve creating `ObjectId`s from strings more often than from existing `ObjectId` instances, so checking for `str` first should reduce the average number of checks needed. I validated these changes with the below benchmark script, run with [richbench](https://github.com/tonybaloney/rich-bench).
bench_objectid.py ```python from __future__ import annotations import os import struct import threading import time from random import SystemRandom from typing import Any, NoReturn, Optional, Union _MAX_COUNTER_VALUE = 0xFFFFFF _PACK_INT = struct.Struct(">I").pack _PACK_INT_RANDOM = struct.Struct(">I5s").pack def _raise_invalid_id(oid: str) -> NoReturn: raise ValueError(f"{oid!r} is not a valid ObjectId") def _random_bytes() -> bytes: return os.urandom(5) class ObjectIdBefore: _pid = os.getpid() _inc = SystemRandom().randint(0, _MAX_COUNTER_VALUE) _inc_lock = threading.Lock() __random = _random_bytes() __slots__ = ("__id",) def __init__(self, oid: Optional[Union[str, ObjectIdBefore, bytes]] = None) -> None: if oid is None: self.__generate() elif isinstance(oid, bytes) and len(oid) == 12: self.__id = oid else: self.__validate(oid) @classmethod def _random(cls) -> bytes: pid = os.getpid() if pid != cls._pid: cls._pid = pid cls.__random = _random_bytes() return cls.__random def __generate(self) -> None: with ObjectIdBefore._inc_lock: inc = ObjectIdBefore._inc ObjectIdBefore._inc = (inc + 1) % (_MAX_COUNTER_VALUE + 1) self.__id = ( _PACK_INT_RANDOM(int(time.time()), ObjectIdBefore._random()) + _PACK_INT(inc)[1:4] ) def __validate(self, oid: Any) -> None: if isinstance(oid, ObjectIdBefore): self.__id = oid.binary elif isinstance(oid, str): if len(oid) == 24: try: self.__id = bytes.fromhex(oid) except (TypeError, ValueError): _raise_invalid_id(oid) else: _raise_invalid_id(oid) else: raise TypeError(f"id must be an instance of (bytes, str, ObjectId), not {type(oid)}") @property def binary(self) -> bytes: return self.__id class ObjectIdAfter: _pid = os.getpid() _inc = SystemRandom().randint(0, _MAX_COUNTER_VALUE) _inc_lock = threading.Lock() __random = _random_bytes() __slots__ = ("__id",) def __init__(self, oid: Optional[Union[str, ObjectIdAfter, bytes]] = None) -> None: if oid is None: with ObjectIdAfter._inc_lock: inc = ObjectIdAfter._inc ObjectIdAfter._inc = (inc + 1) % (_MAX_COUNTER_VALUE + 1) self.__id = ( _PACK_INT_RANDOM(int(time.time()), ObjectIdAfter._random()) + _PACK_INT(inc)[1:4] ) elif isinstance(oid, bytes) and len(oid) == 12: self.__id = oid elif isinstance(oid, str): if len(oid) == 24: try: self.__id = bytes.fromhex(oid) except (TypeError, ValueError): _raise_invalid_id(oid) else: _raise_invalid_id(oid) elif isinstance(oid, ObjectIdAfter): self.__id = oid.binary else: raise TypeError(f"id must be an instance of (bytes, str, ObjectId), not {type(oid)}") @classmethod def _random(cls) -> bytes: pid = os.getpid() if pid != cls._pid: cls._pid = pid cls.__random = _random_bytes() return cls.__random @property def binary(self) -> bytes: return self.__id test_oid_str = "507f1f77bcf86cd799439011" test_oid_bytes = bytes.fromhex(test_oid_str) test_oid_obj_before = ObjectIdBefore(test_oid_bytes) test_oid_obj_after = ObjectIdAfter(test_oid_bytes) invalid_str_short = "507f1f77" invalid_str_long = "507f1f77bcf86cd799439011aabbccdd" invalid_str_non_hex = "507f1f77bcf86cd799439ZZZ" invalid_bytes_short = b"short" invalid_bytes_long = b"toolongbytes1234" invalid_type = 12345 def bench_none_before(): for _ in range(1_000_000): ObjectIdBefore() def bench_none_after(): for _ in range(1_000_000): ObjectIdAfter() def bench_str_before(): for _ in range(1_000_000): ObjectIdBefore(test_oid_str) def bench_str_after(): for _ in range(1_000_000): ObjectIdAfter(test_oid_str) def bench_objectid_before(): for _ in range(1_000_000): ObjectIdBefore(test_oid_obj_before) def bench_objectid_after(): for _ in range(1_000_000): ObjectIdAfter(test_oid_obj_after) def bench_invalid_str_short_before(): for _ in range(1_000_000): try: ObjectIdBefore(invalid_str_short) except Exception: pass def bench_invalid_str_short_after(): for _ in range(10_000): try: ObjectIdAfter(invalid_str_short) except Exception: pass def bench_invalid_type_before(): for _ in range(10_000): try: ObjectIdBefore(invalid_type) except Exception: pass def bench_invalid_type_after(): for _ in range(10_000): try: ObjectIdAfter(invalid_type) except Exception: pass __benchmarks__ = [ (bench_none_before, bench_none_after, "ObjectId() - generate new"), (bench_str_before, bench_str_after, "ObjectId(str) - 24-char hex"), (bench_objectid_before, bench_objectid_after, "ObjectId(ObjectId) - copy"), ( bench_invalid_str_short_before, bench_invalid_str_short_after, "ObjectId(str) - invalid short", ), (bench_invalid_type_before, bench_invalid_type_after, "ObjectId(int) - invalid type"), ] ```
Results: ``` $ uvx -p 3.13 richbench . Benchmarks, repeat=5, number=5 ┏━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━┳━━━━━━━━━┳━━━━━━━━━┳━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━┓ ┃ Benchmark ┃ Min ┃ Max ┃ Mean ┃ Min (+) ┃ Max (+) ┃ Mean (+) ┃ ┡━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━╇━━━━━━━━━╇━━━━━━━━━╇━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━┩ │ ObjectId() - generate new │ 1.920 │ 2.021 │ 1.990 │ 1.865 (1.0x) │ 1.868 (1.1x) │ 1.866 (1.1x) │ │ ObjectId(str) - 24-char hex │ 0.938 │ 0.952 │ 0.946 │ 0.716 (1.3x) │ 0.725 (1.3x) │ 0.720 (1.3x) │ │ ObjectId(ObjectId) - copy │ 0.845 │ 0.857 │ 0.850 │ 0.818 (1.0x) │ 0.828 (1.0x) │ 0.822 (1.0x) │ │ ObjectId(str) - invalid short │ 1.839 │ 1.857 │ 1.845 │ 0.014 (127.9x) │ 0.015 (125.7x) │ 0.015 (126.4x) │ │ ObjectId(int) - invalid type │ 0.020 │ 0.021 │ 0.021 │ 0.016 (1.2x) │ 0.017 (1.3x) │ 0.017 (1.2x) │ └───────────────────────────────┴─────────┴─────────┴─────────┴─────────────────┴─────────────────┴─────────────────┘ ``` The speedups for normal usage are like 1.1x to 1.3x, which is pretty decent for a small change. The invalid string case is much faster, but that’s less relevant since it’s an error path. --- bson/objectid.py | 51 +++++++++++++++++------------------------------- 1 file changed, 18 insertions(+), 33 deletions(-) diff --git a/bson/objectid.py b/bson/objectid.py index 970c4e52e8..68d0ba7d95 100644 --- a/bson/objectid.py +++ b/bson/objectid.py @@ -98,11 +98,27 @@ def __init__(self, oid: Optional[Union[str, ObjectId, bytes]] = None) -> None: objectid.rst>`_. """ if oid is None: - self.__generate() + # Generate a new value for this ObjectId. + with ObjectId._inc_lock: + inc = ObjectId._inc + ObjectId._inc = (inc + 1) % (_MAX_COUNTER_VALUE + 1) + + # 4 bytes current time, 5 bytes random, 3 bytes inc. + self.__id = _PACK_INT_RANDOM(int(time.time()), ObjectId._random()) + _PACK_INT(inc)[1:4] elif isinstance(oid, bytes) and len(oid) == 12: self.__id = oid + elif isinstance(oid, str): + if len(oid) == 24: + try: + self.__id = bytes.fromhex(oid) + except (TypeError, ValueError): + _raise_invalid_id(oid) + else: + _raise_invalid_id(oid) + elif isinstance(oid, ObjectId): + self.__id = oid.binary else: - self.__validate(oid) + raise TypeError(f"id must be an instance of (bytes, str, ObjectId), not {type(oid)}") @classmethod def from_datetime(cls: Type[ObjectId], generation_time: datetime.datetime) -> ObjectId: @@ -163,37 +179,6 @@ def _random(cls) -> bytes: cls.__random = _random_bytes() return cls.__random - def __generate(self) -> None: - """Generate a new value for this ObjectId.""" - with ObjectId._inc_lock: - inc = ObjectId._inc - ObjectId._inc = (inc + 1) % (_MAX_COUNTER_VALUE + 1) - - # 4 bytes current time, 5 bytes random, 3 bytes inc. - self.__id = _PACK_INT_RANDOM(int(time.time()), ObjectId._random()) + _PACK_INT(inc)[1:4] - - def __validate(self, oid: Any) -> None: - """Validate and use the given id for this ObjectId. - - Raises TypeError if id is not an instance of :class:`str`, - :class:`bytes`, or ObjectId. Raises InvalidId if it is not a - valid ObjectId. - - :param oid: a valid ObjectId - """ - if isinstance(oid, ObjectId): - self.__id = oid.binary - elif isinstance(oid, str): - if len(oid) == 24: - try: - self.__id = bytes.fromhex(oid) - except (TypeError, ValueError): - _raise_invalid_id(oid) - else: - _raise_invalid_id(oid) - else: - raise TypeError(f"id must be an instance of (bytes, str, ObjectId), not {type(oid)}") - @property def binary(self) -> bytes: """12-byte binary representation of this ObjectId."""