diff --git a/src/hdmf/backends/hdf5/h5tools.py b/src/hdmf/backends/hdf5/h5tools.py index e1873dd5d..a9a1c43c4 100644 --- a/src/hdmf/backends/hdf5/h5tools.py +++ b/src/hdmf/backends/hdf5/h5tools.py @@ -69,15 +69,17 @@ def can_read(path): 'default': None }, {'name': 'herd_path', 'type': str, - 'doc': 'The path to read/write the HERD file', 'default': None},) + 'doc': 'The path to read/write the HERD file', 'default': None}, + {'name': 'deduplicate_objects', 'type': bool, + 'doc': 'whether to deduplicate identical container objects by creating soft links', 'default': True}) def __init__(self, **kwargs): """Open an HDF5 file for IO. """ self.logger = logging.getLogger('%s.%s' % (self.__class__.__module__, self.__class__.__qualname__)) - path, manager, mode, comm, file_obj, driver, aws_region, herd_path = popargs('path', 'manager', 'mode', - 'comm', 'file', 'driver', - 'aws_region', 'herd_path', - kwargs) + path, manager, mode, comm, file_obj, driver, aws_region, herd_path, deduplicate_objects = popargs( + 'path', 'manager', 'mode', 'comm', 'file', 'driver', 'aws_region', 'herd_path', + 'deduplicate_objects', kwargs + ) self.__open_links = [] # keep track of other files opened from links in this file self.__file = None # This will be set below, but set to None first in case an error occurs and we need to close @@ -93,9 +95,9 @@ def __init__(self, **kwargs): raise UnsupportedOperation(msg) if manager is None: - manager = BuildManager(TypeMap(NamespaceCatalog())) + manager = BuildManager(TypeMap(NamespaceCatalog()), deduplicate_objects=deduplicate_objects) elif isinstance(manager, TypeMap): - manager = BuildManager(manager) + manager = BuildManager(manager, deduplicate_objects=deduplicate_objects) self.__driver = driver self.__aws_region = aws_region self.__comm = comm diff --git a/src/hdmf/backends/io.py b/src/hdmf/backends/io.py index 86fd25b26..95a181e64 100644 --- a/src/hdmf/backends/io.py +++ b/src/hdmf/backends/io.py @@ -22,9 +22,13 @@ def can_read(path): {"name": "source", "type": (str, Path), "doc": "the source of container being built i.e. file path", 'default': None}, {'name': 'herd_path', 'type': str, - 'doc': 'The path to read/write the HERD file', 'default': None},) + 'doc': 'The path to read/write the HERD file', 'default': None}, + {'name': 'deduplicate_objects', 'type': bool, + 'doc': 'whether to deduplicate identical container objects by creating soft links', 'default': True}) def __init__(self, **kwargs): - manager, source, herd_path = getargs('manager', 'source', 'herd_path', kwargs) + manager, source, herd_path, deduplicate_objects = getargs( + 'manager', 'source', 'herd_path', 'deduplicate_objects', kwargs + ) if isinstance(source, Path): source = source.resolve() elif (isinstance(source, str) and @@ -38,6 +42,7 @@ def __init__(self, **kwargs): self.__source = source self.herd_path = herd_path self.herd = None + self.__deduplicate_objects = deduplicate_objects self.open() @property diff --git a/src/hdmf/build/manager.py b/src/hdmf/build/manager.py index aa66c0862..fc788e7bc 100644 --- a/src/hdmf/build/manager.py +++ b/src/hdmf/build/manager.py @@ -86,13 +86,19 @@ class BuildManager: A class for managing builds of AbstractContainers """ - def __init__(self, type_map): + def __init__(self, type_map, deduplicate_objects=True): + """ + Args: + type_map (TypeMap): the TypeMap to use for mapping container classes to specifications + deduplicate_objects (bool): whether to deduplicate identical container objects by creating soft links + """ self.logger = logging.getLogger('%s.%s' % (self.__class__.__module__, self.__class__.__qualname__)) self.__builders = dict() self.__containers = dict() self.__active_builders = set() self.__type_map = type_map self.__ref_queue = deque() # a queue of the ReferenceBuilders that need to be added + self.__deduplicate_objects = deduplicate_objects @property def namespace_catalog(self): @@ -102,6 +108,11 @@ def namespace_catalog(self): def type_map(self): return self.__type_map + @property + def deduplicate_objects(self): + """Whether to deduplicate identical container objects by creating soft links.""" + return self.__deduplicate_objects + @docval({"name": "object", "type": (BaseBuilder, AbstractContainer), "doc": "the container or builder to get a proxy for"}, {"name": "source", "type": str, @@ -260,8 +271,13 @@ def clear_cache(self): @docval({"name": "container", "type": AbstractContainer, "doc": "the container to get the builder for"}) def get_builder(self, **kwargs): - """Return the prebuilt builder for the given container or None if it does not exist.""" + """Return the prebuilt builder for the given container or None if it does not exist. + + If deduplicate_objects is False, this will always return None to force creation of new builders. + """ container = getargs('container', kwargs) + if not self.__deduplicate_objects: + return None container_id = self.__conthash__(container) result = self.__builders.get(container_id) return result diff --git a/tests/unit/build_tests/test_deduplicate_objects.py b/tests/unit/build_tests/test_deduplicate_objects.py new file mode 100644 index 000000000..bba7819fb --- /dev/null +++ b/tests/unit/build_tests/test_deduplicate_objects.py @@ -0,0 +1,148 @@ +"""Tests for the deduplicate_objects functionality in BuildManager""" + +from hdmf.build import DatasetBuilder, BuildManager, TypeMap, ObjectMapper +from hdmf.spec import GroupSpec, AttributeSpec, DatasetSpec, SpecCatalog, SpecNamespace, NamespaceCatalog +from hdmf.testing import TestCase +from hdmf.container import Data + +from tests.unit.helpers.utils import Foo, CORE_NAMESPACE + + +class FooMapper(ObjectMapper): + """Maps nested 'attr2' attribute on dataset 'my_data' to Foo.attr2 in constructor and attribute map""" + + def __init__(self, spec): + super().__init__(spec) + my_data_spec = spec.get_dataset('my_data') + self.map_spec('attr2', my_data_spec.get_attribute('attr2')) + + +class TestBuildManagerDeduplication(TestCase): + """Test BuildManager deduplication functionality""" + + def setUp(self): + self.foo_spec = GroupSpec( + doc='A test group specification with a data type', + data_type_def='Foo', + datasets=[ + DatasetSpec( + doc='an example dataset', + dtype='int', + name='my_data', + attributes=[ + AttributeSpec( + name='attr2', + doc='an example integer attribute', + dtype='int' + ) + ] + ) + ], + attributes=[AttributeSpec('attr1', 'an example string attribute', 'text')] + ) + + self.spec_catalog = SpecCatalog() + self.spec_catalog.register_spec(self.foo_spec, 'test.yaml') + self.namespace = SpecNamespace( + 'a test namespace', + CORE_NAMESPACE, + [{'source': 'test.yaml'}], + version='0.1.0', + catalog=self.spec_catalog) + self.namespace_catalog = NamespaceCatalog() + self.namespace_catalog.add_namespace(CORE_NAMESPACE, self.namespace) + self.type_map = TypeMap(self.namespace_catalog) + self.type_map.register_container_type(CORE_NAMESPACE, 'Foo', Foo) + self.type_map.register_map(Foo, FooMapper) + + def test_default_deduplicate_objects_true(self): + """Test that deduplicate_objects defaults to True""" + manager = BuildManager(self.type_map) + self.assertTrue(manager.deduplicate_objects) + + def test_deduplicate_objects_explicit_true(self): + """Test that deduplicate_objects can be explicitly set to True""" + manager = BuildManager(self.type_map, deduplicate_objects=True) + self.assertTrue(manager.deduplicate_objects) + + def test_deduplicate_objects_explicit_false(self): + """Test that deduplicate_objects can be explicitly set to False""" + manager = BuildManager(self.type_map, deduplicate_objects=False) + self.assertFalse(manager.deduplicate_objects) + + def test_get_builder_with_deduplication_enabled(self): + """Test that get_builder returns cached builder when deduplication is enabled""" + manager = BuildManager(self.type_map, deduplicate_objects=True) + + # Create a simple Data container + container = Data(name="test_data", data=[1, 2, 3]) + + # Create and cache a builder + builder = DatasetBuilder(name="test_data", data=[1, 2, 3]) + manager.prebuilt(container, builder) + + # get_builder should return the cached builder + cached_builder = manager.get_builder(container) + self.assertIs(cached_builder, builder) + + def test_get_builder_with_deduplication_disabled(self): + """Test that get_builder returns None when deduplication is disabled""" + manager = BuildManager(self.type_map, deduplicate_objects=False) + + # Create a simple Data container + container = Data(name="test_data", data=[1, 2, 3]) + + # Create and cache a builder + builder = DatasetBuilder(name="test_data", data=[1, 2, 3]) + manager.prebuilt(container, builder) + + # get_builder should return None when deduplication is disabled + cached_builder = manager.get_builder(container) + self.assertIsNone(cached_builder) + + def test_build_memoization_with_deduplication_enabled(self): + """Test that repeated builds return same builder when deduplication is enabled""" + manager = BuildManager(self.type_map, deduplicate_objects=True) + + container_inst = Foo('my_foo', list(range(10)), 'value1', 10) + + # Build twice - should get same builder + builder1 = manager.build(container_inst) + builder2 = manager.build(container_inst) + + self.assertIs(builder1, builder2) + + def test_build_no_memoization_with_deduplication_disabled(self): + """Test that repeated builds create new builders when deduplication is disabled""" + manager = BuildManager(self.type_map, deduplicate_objects=False) + + container_inst = Foo('my_foo', list(range(10)), 'value1', 10) + + # Build twice - should get different builders + builder1 = manager.build(container_inst) + builder2 = manager.build(container_inst) + + self.assertIsNot(builder1, builder2) + # But they should have the same content + self.assertDictEqual(builder1, builder2) + + def test_clear_cache_behavior(self): + """Test that clear_cache works regardless of deduplication setting""" + # Test with deduplication enabled + manager_true = BuildManager(self.type_map, deduplicate_objects=True) + container = Data(name="test_data", data=[1, 2, 3]) + builder = DatasetBuilder(name="test_data", data=[1, 2, 3]) + manager_true.prebuilt(container, builder) + + self.assertIs(manager_true.get_builder(container), builder) + manager_true.clear_cache() + self.assertIsNone(manager_true.get_builder(container)) + + # Test with deduplication disabled + manager_false = BuildManager(self.type_map, deduplicate_objects=False) + manager_false.prebuilt(container, builder) + + # Should return None even before clearing cache + self.assertIsNone(manager_false.get_builder(container)) + manager_false.clear_cache() + self.assertIsNone(manager_false.get_builder(container))