Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
16 changes: 9 additions & 7 deletions src/hdmf/backends/hdf5/h5tools.py
Original file line number Diff line number Diff line change
Expand Up @@ -69,15 +69,17 @@ def can_read(path):
'default': None
},
{'name': 'herd_path', 'type': str,
'doc': 'The path to read/write the HERD file', 'default': None},)
'doc': 'The path to read/write the HERD file', 'default': None},
{'name': 'deduplicate_objects', 'type': bool,
'doc': 'whether to deduplicate identical container objects by creating soft links', 'default': True})
def __init__(self, **kwargs):
"""Open an HDF5 file for IO.
"""
self.logger = logging.getLogger('%s.%s' % (self.__class__.__module__, self.__class__.__qualname__))
path, manager, mode, comm, file_obj, driver, aws_region, herd_path = popargs('path', 'manager', 'mode',
'comm', 'file', 'driver',
'aws_region', 'herd_path',
kwargs)
path, manager, mode, comm, file_obj, driver, aws_region, herd_path, deduplicate_objects = popargs(
'path', 'manager', 'mode', 'comm', 'file', 'driver', 'aws_region', 'herd_path',
'deduplicate_objects', kwargs
)

self.__open_links = [] # keep track of other files opened from links in this file
self.__file = None # This will be set below, but set to None first in case an error occurs and we need to close
Expand All @@ -93,9 +95,9 @@ def __init__(self, **kwargs):
raise UnsupportedOperation(msg)

if manager is None:
manager = BuildManager(TypeMap(NamespaceCatalog()))
manager = BuildManager(TypeMap(NamespaceCatalog()), deduplicate_objects=deduplicate_objects)
elif isinstance(manager, TypeMap):
manager = BuildManager(manager)
manager = BuildManager(manager, deduplicate_objects=deduplicate_objects)
self.__driver = driver
self.__aws_region = aws_region
self.__comm = comm
Expand Down
9 changes: 7 additions & 2 deletions src/hdmf/backends/io.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,9 +22,13 @@ def can_read(path):
{"name": "source", "type": (str, Path),
"doc": "the source of container being built i.e. file path", 'default': None},
{'name': 'herd_path', 'type': str,
'doc': 'The path to read/write the HERD file', 'default': None},)
'doc': 'The path to read/write the HERD file', 'default': None},
{'name': 'deduplicate_objects', 'type': bool,
'doc': 'whether to deduplicate identical container objects by creating soft links', 'default': True})
def __init__(self, **kwargs):
manager, source, herd_path = getargs('manager', 'source', 'herd_path', kwargs)
manager, source, herd_path, deduplicate_objects = getargs(
'manager', 'source', 'herd_path', 'deduplicate_objects', kwargs
)
if isinstance(source, Path):
source = source.resolve()
elif (isinstance(source, str) and
Expand All @@ -38,6 +42,7 @@ def __init__(self, **kwargs):
self.__source = source
self.herd_path = herd_path
self.herd = None
self.__deduplicate_objects = deduplicate_objects
self.open()

@property
Expand Down
20 changes: 18 additions & 2 deletions src/hdmf/build/manager.py
Original file line number Diff line number Diff line change
Expand Up @@ -86,13 +86,19 @@ class BuildManager:
A class for managing builds of AbstractContainers
"""

def __init__(self, type_map):
def __init__(self, type_map, deduplicate_objects=True):
"""
Args:
type_map (TypeMap): the TypeMap to use for mapping container classes to specifications
deduplicate_objects (bool): whether to deduplicate identical container objects by creating soft links
"""
self.logger = logging.getLogger('%s.%s' % (self.__class__.__module__, self.__class__.__qualname__))
self.__builders = dict()
self.__containers = dict()
self.__active_builders = set()
self.__type_map = type_map
self.__ref_queue = deque() # a queue of the ReferenceBuilders that need to be added
self.__deduplicate_objects = deduplicate_objects

@property
def namespace_catalog(self):
Expand All @@ -102,6 +108,11 @@ def namespace_catalog(self):
def type_map(self):
return self.__type_map

@property
def deduplicate_objects(self):
"""Whether to deduplicate identical container objects by creating soft links."""
return self.__deduplicate_objects

@docval({"name": "object", "type": (BaseBuilder, AbstractContainer),
"doc": "the container or builder to get a proxy for"},
{"name": "source", "type": str,
Expand Down Expand Up @@ -260,8 +271,13 @@ def clear_cache(self):

@docval({"name": "container", "type": AbstractContainer, "doc": "the container to get the builder for"})
def get_builder(self, **kwargs):
"""Return the prebuilt builder for the given container or None if it does not exist."""
"""Return the prebuilt builder for the given container or None if it does not exist.

If deduplicate_objects is False, this will always return None to force creation of new builders.
"""
container = getargs('container', kwargs)
if not self.__deduplicate_objects:
return None
container_id = self.__conthash__(container)
result = self.__builders.get(container_id)
return result
Expand Down
148 changes: 148 additions & 0 deletions tests/unit/build_tests/test_deduplicate_objects.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,148 @@
"""Tests for the deduplicate_objects functionality in BuildManager"""

from hdmf.build import DatasetBuilder, BuildManager, TypeMap, ObjectMapper
from hdmf.spec import GroupSpec, AttributeSpec, DatasetSpec, SpecCatalog, SpecNamespace, NamespaceCatalog
from hdmf.testing import TestCase
from hdmf.container import Data

from tests.unit.helpers.utils import Foo, CORE_NAMESPACE


class FooMapper(ObjectMapper):
"""Maps nested 'attr2' attribute on dataset 'my_data' to Foo.attr2 in constructor and attribute map"""

def __init__(self, spec):
super().__init__(spec)
my_data_spec = spec.get_dataset('my_data')
self.map_spec('attr2', my_data_spec.get_attribute('attr2'))


class TestBuildManagerDeduplication(TestCase):
"""Test BuildManager deduplication functionality"""

def setUp(self):
self.foo_spec = GroupSpec(
doc='A test group specification with a data type',
data_type_def='Foo',
datasets=[
DatasetSpec(
doc='an example dataset',
dtype='int',
name='my_data',
attributes=[
AttributeSpec(
name='attr2',
doc='an example integer attribute',
dtype='int'
)
]
)
],
attributes=[AttributeSpec('attr1', 'an example string attribute', 'text')]
)

self.spec_catalog = SpecCatalog()
self.spec_catalog.register_spec(self.foo_spec, 'test.yaml')
self.namespace = SpecNamespace(
'a test namespace',
CORE_NAMESPACE,
[{'source': 'test.yaml'}],
version='0.1.0',
catalog=self.spec_catalog)
self.namespace_catalog = NamespaceCatalog()
self.namespace_catalog.add_namespace(CORE_NAMESPACE, self.namespace)
self.type_map = TypeMap(self.namespace_catalog)
self.type_map.register_container_type(CORE_NAMESPACE, 'Foo', Foo)
self.type_map.register_map(Foo, FooMapper)

def test_default_deduplicate_objects_true(self):
"""Test that deduplicate_objects defaults to True"""
manager = BuildManager(self.type_map)
self.assertTrue(manager.deduplicate_objects)

def test_deduplicate_objects_explicit_true(self):
"""Test that deduplicate_objects can be explicitly set to True"""
manager = BuildManager(self.type_map, deduplicate_objects=True)
self.assertTrue(manager.deduplicate_objects)

def test_deduplicate_objects_explicit_false(self):
"""Test that deduplicate_objects can be explicitly set to False"""
manager = BuildManager(self.type_map, deduplicate_objects=False)
self.assertFalse(manager.deduplicate_objects)

def test_get_builder_with_deduplication_enabled(self):
"""Test that get_builder returns cached builder when deduplication is enabled"""
manager = BuildManager(self.type_map, deduplicate_objects=True)

# Create a simple Data container
container = Data(name="test_data", data=[1, 2, 3])

# Create and cache a builder
builder = DatasetBuilder(name="test_data", data=[1, 2, 3])
manager.prebuilt(container, builder)

# get_builder should return the cached builder
cached_builder = manager.get_builder(container)
self.assertIs(cached_builder, builder)

def test_get_builder_with_deduplication_disabled(self):
"""Test that get_builder returns None when deduplication is disabled"""
manager = BuildManager(self.type_map, deduplicate_objects=False)

# Create a simple Data container
container = Data(name="test_data", data=[1, 2, 3])

# Create and cache a builder
builder = DatasetBuilder(name="test_data", data=[1, 2, 3])
manager.prebuilt(container, builder)

# get_builder should return None when deduplication is disabled
cached_builder = manager.get_builder(container)
self.assertIsNone(cached_builder)

def test_build_memoization_with_deduplication_enabled(self):
"""Test that repeated builds return same builder when deduplication is enabled"""
manager = BuildManager(self.type_map, deduplicate_objects=True)

container_inst = Foo('my_foo', list(range(10)), 'value1', 10)

# Build twice - should get same builder
builder1 = manager.build(container_inst)
builder2 = manager.build(container_inst)

self.assertIs(builder1, builder2)

def test_build_no_memoization_with_deduplication_disabled(self):
"""Test that repeated builds create new builders when deduplication is disabled"""
manager = BuildManager(self.type_map, deduplicate_objects=False)

container_inst = Foo('my_foo', list(range(10)), 'value1', 10)

# Build twice - should get different builders
builder1 = manager.build(container_inst)
builder2 = manager.build(container_inst)

self.assertIsNot(builder1, builder2)
# But they should have the same content
self.assertDictEqual(builder1, builder2)

def test_clear_cache_behavior(self):
"""Test that clear_cache works regardless of deduplication setting"""
# Test with deduplication enabled
manager_true = BuildManager(self.type_map, deduplicate_objects=True)
container = Data(name="test_data", data=[1, 2, 3])
builder = DatasetBuilder(name="test_data", data=[1, 2, 3])
manager_true.prebuilt(container, builder)

self.assertIs(manager_true.get_builder(container), builder)
manager_true.clear_cache()
self.assertIsNone(manager_true.get_builder(container))

# Test with deduplication disabled
manager_false = BuildManager(self.type_map, deduplicate_objects=False)
manager_false.prebuilt(container, builder)

# Should return None even before clearing cache
self.assertIsNone(manager_false.get_builder(container))
manager_false.clear_cache()
self.assertIsNone(manager_false.get_builder(container))
Loading