hdmf-dev · mavaylon1 · Aug 27, 2024 · Aug 28, 2024 · Aug 28, 2024 · Aug 30, 2024
diff --git a/src/hdmf/backends/hdf5/h5tools.py b/src/hdmf/backends/hdf5/h5tools.py
@@ -341,7 +341,9 @@ def copy_file(self, **kwargs):
              'default': True},
             {'name': 'herd', 'type': 'hdmf.common.resources.HERD',
              'doc': 'A HERD object to populate with references.',
-             'default': None})
+             'default': None},
+            {'name': 'expandable', 'type': bool, 'default': True,
+             'doc': ('Bool to set whether datasets are expandable by setting the maxshape.')})
     def write(self, **kwargs):
         """Write the container to an HDF5 file."""
         if self.__mode == 'r':
@@ -782,10 +784,15 @@ def close_linked_files(self):
              'doc': 'exhaust DataChunkIterators one at a time. If False, exhaust them concurrently',
              'default': True},
             {'name': 'export_source', 'type': str,
-             'doc': 'The source of the builders when exporting', 'default': None})
+             'doc': 'The source of the builders when exporting', 'default': None},
+            {'name': 'expandable', 'type': bool, 'default': True,
+             'doc': ('Bool to set whether datasets are expandable by setting the maxshape.')})
     def write_builder(self, **kwargs):
         f_builder = popargs('builder', kwargs)
-        link_data, exhaust_dci, export_source = getargs('link_data', 'exhaust_dci', 'export_source', kwargs)
+        link_data, exhaust_dci, export_source = getargs('link_data',
+                                                        'exhaust_dci',
+                                                        'export_source',
+                                                        kwargs)
         self.logger.debug("Writing GroupBuilder '%s' to path '%s' with kwargs=%s"
                           % (f_builder.name, self.source, kwargs))
         for name, gbldr in f_builder.groups.items():
@@ -953,6 +960,8 @@ def _filler():
              'default': True},
             {'name': 'export_source', 'type': str,
              'doc': 'The source of the builders when exporting', 'default': None},
+            {'name': 'expandable', 'type': bool, 'default': True,
+             'doc': ('Bool to set whether datasets are expandable by setting the maxshape.')},
             returns='the Group that was created', rtype=Group)
     def write_group(self, **kwargs):
         parent, builder = popargs('parent', 'builder', kwargs)
@@ -1053,21 +1062,24 @@ def write_link(self, **kwargs):
              'default': True},
             {'name': 'export_source', 'type': str,
              'doc': 'The source of the builders when exporting', 'default': None},
+            {'name': 'expandable', 'type': bool, 'default': True,
+             'doc': ('Bool to set whether datasets are expandable by setting the maxshape.')},
             returns='the Dataset that was created', rtype=Dataset)
     def write_dataset(self, **kwargs):  # noqa: C901
         """ Write a dataset to HDF5
 
         The function uses other dataset-dependent write functions, e.g,
         ``__scalar_fill__``, ``__list_fill__``, and ``__setup_chunked_dset__`` to write the data.
         """
-        parent, builder = popargs('parent', 'builder', kwargs)
+        parent, builder, expandable = popargs('parent', 'builder', 'expandable', kwargs)
         link_data, exhaust_dci, export_source = getargs('link_data', 'exhaust_dci', 'export_source', kwargs)
         self.logger.debug("Writing DatasetBuilder '%s' to parent group '%s'" % (builder.name, parent.name))
         if self.get_written(builder):
             self.logger.debug("    DatasetBuilder '%s' is already written" % builder.name)
             return None
         name = builder.name
         data = builder.data
+        matched_spec_shape = builder.spec_shapes
         dataio = None
         options = dict()  # dict with additional
         if isinstance(data, H5DataIO):
@@ -1183,8 +1195,9 @@ def _filler():
             elif len(np.shape(data)) == 0:
                 dset = self.__scalar_fill__(parent, name, data, options)
             else:
-                dset = self.__list_fill__(parent, name, data, options)
+                dset = self.__list_fill__(parent, name, data, matched_spec_shape, expandable, options)
         # Write a dataset containing references, i.e., object reference.
+
         # NOTE: we can ignore options['io_settings'] for scalar data
         elif self.__is_ref(options['dtype']):
             _dtype = self.__dtypes.get(options['dtype'])
@@ -1241,7 +1254,7 @@ def _filler():
                 self.__dci_queue.append(dataset=dset, data=data)
             # Write a regular in memory array (e.g., numpy array, list etc.)
             elif hasattr(data, '__len__'):
-                dset = self.__list_fill__(parent, name, data, options)
+                dset = self.__list_fill__(parent, name, data, matched_spec_shape, expandable, options)
             # Write a regular scalar dataset
             else:
                 dset = self.__scalar_fill__(parent, name, data, options)
@@ -1369,7 +1382,7 @@ def __chunked_iter_fill__(cls, parent, name, data, options=None):
         return dset
 
     @classmethod
-    def __list_fill__(cls, parent, name, data, options=None):
+    def __list_fill__(cls, parent, name, data, matched_spec_shape, expandable, options=None):
         # define the io settings and data type if necessary
         io_settings = {}
         dtype = None
@@ -1391,6 +1404,11 @@ def __list_fill__(cls, parent, name, data, options=None):
             data_shape = (len(data),)
         else:
             data_shape = get_data_shape(data)
+        if expandable:
+            # Don't override existing settings
+            if 'maxshape' not in io_settings:
+                if matched_spec_shape is not None:
+                    io_settings['maxshape'] = matched_spec_shape
 
         # Create the dataset
         try:

diff --git a/src/hdmf/build/builders.py b/src/hdmf/build/builders.py
@@ -328,6 +328,9 @@ class DatasetBuilder(BaseBuilder):
              'doc': 'The datatype of this dataset.', 'default': None},
             {'name': 'attributes', 'type': dict,
              'doc': 'A dictionary of attributes to create in this dataset.', 'default': dict()},
+            {'name': 'spec_shapes', 'type': tuple,
+             'doc': ('The shape(s) defined in the spec.'),
+             'default': None},
             {'name': 'dimension_labels', 'type': tuple,
              'doc': ('A list of labels for each dimension of this dataset from the spec. Currently this is '
                      'supplied only on build.'),
@@ -339,22 +342,27 @@ class DatasetBuilder(BaseBuilder):
             {'name': 'source', 'type': str, 'doc': 'The source of the data in this builder.', 'default': None})
     def __init__(self, **kwargs):
         """ Create a Builder object for a dataset """
-        name, data, dtype, attributes, dimension_labels, maxshape, chunks, parent, source = getargs(
-            'name', 'data', 'dtype', 'attributes', 'dimension_labels', 'maxshape', 'chunks', 'parent', 'source',
-            kwargs
-        )
+        name, data, dtype, attributes, spec_shapes, dimension_labels, maxshape, chunks, parent, source = getargs(
+            'name', 'data', 'dtype', 'attributes', 'spec_shapes', 'dimension_labels', 'maxshape', 'chunks',
+            'parent', 'source', kwargs)
         super().__init__(name, attributes, parent, source)
         self['data'] = data
         self['attributes'] = _copy.copy(attributes)
         self.__dimension_labels = dimension_labels
         self.__chunks = chunks
+        self.__spec_shapes = spec_shapes
         self.__maxshape = maxshape
         if isinstance(data, BaseBuilder):
             if dtype is None:
                 dtype = self.OBJECT_REF_TYPE
         self.__dtype = dtype
         self.__name = name
 
+    @property
+    def spec_shapes(self):
+        """The shapes defined in the spec."""
+        return self.__spec_shapes
+
     @property
     def data(self):
         """The data stored in the dataset represented by this builder."""

diff --git a/src/hdmf/build/objectmapper.py b/src/hdmf/build/objectmapper.py
@@ -593,6 +593,7 @@ def get_attribute(self, **kwargs):
     def get_attr_value(self, **kwargs):
         ''' Get the value of the attribute corresponding to this spec from the given container '''
         spec, container, manager = getargs('spec', 'container', 'manager', kwargs)
+
         attr_name = self.get_attribute(spec)
         if attr_name is None:
             return None
@@ -770,7 +771,10 @@ def build(self, **kwargs):
                     msg = "'container' must be of type Data with DatasetSpec"
                     raise ValueError(msg)
                 spec_dtype, spec_shape, spec_dims, spec = self.__check_dset_spec(self.spec, spec_ext)
-                dimension_labels = self.__get_dimension_labels_from_spec(container.data, spec_shape, spec_dims)
+                dimension_labels, matched_shape = self.__get_spec_info(container.data,
+                                                                       spec_shape,
+                                                                       spec_dims,
+                                                                       spec_dtype)
                 if isinstance(spec_dtype, RefSpec):
                     self.logger.debug("Building %s '%s' as a dataset of references (source: %s)"
                                       % (container.__class__.__name__, container.name, repr(source)))
@@ -781,6 +785,7 @@ def build(self, **kwargs):
                         parent=parent,
                         source=source,
                         dtype=spec_dtype.reftype,
+                        spec_shapes=matched_shape,
                         dimension_labels=dimension_labels,
                     )
                     manager.queue_ref(self.__set_dataset_to_refs(builder, spec_dtype, spec_shape, container, manager))
@@ -795,6 +800,7 @@ def build(self, **kwargs):
                         parent=parent,
                         source=source,
                         dtype=spec_dtype,
+                        spec_shapes=matched_shape,
                         dimension_labels=dimension_labels,
                     )
                     manager.queue_ref(self.__set_compound_dataset_to_refs(builder, spec, spec_dtype, container,
@@ -813,6 +819,7 @@ def build(self, **kwargs):
                             parent=parent,
                             source=source,
                             dtype="object",
+                            spec_shapes=matched_shape,
                             dimension_labels=dimension_labels,
                         )
                         manager.queue_ref(self.__set_untyped_dataset_to_refs(builder, container, manager))
@@ -838,6 +845,7 @@ def build(self, **kwargs):
                             parent=parent,
                             source=source,
                             dtype=dtype,
+                            spec_shapes=matched_shape,
                             dimension_labels=dimension_labels,
                         )
 
@@ -869,10 +877,7 @@ def __check_dset_spec(self, orig, ext):
             spec = ext
         return dtype, shape, dims, spec
 
-    def __get_dimension_labels_from_spec(self, data, spec_shape, spec_dims) -> tuple:
-        if spec_shape is None or spec_dims is None:
-            return None
-        data_shape = get_data_shape(data)
+    def __get_matched_dimension(self, data_shape, spec_shape, spec_dtype=None):
         # if shape is a list of allowed shapes, find the index of the shape that matches the data
         if isinstance(spec_shape[0], list):
             match_shape_inds = list()
@@ -888,37 +893,58 @@ def __get_dimension_labels_from_spec(self, data, spec_shape, spec_dims) -> tuple
                         break
                 if match:
                     match_shape_inds.append(i)
-            # use the most specific match -- the one with the fewest Nones
-            if match_shape_inds:
-                if len(match_shape_inds) == 1:
-                    return tuple(spec_dims[match_shape_inds[0]])
+            return match_shape_inds
+
+    def __get_spec_info(self, data, spec_shape, spec_dims, spec_dtype=None):
+        """This will return the dimension labels and shape by matching the data shape to a permissible spec shape."""
+        if spec_shape is None and spec_dims is None:
+            return None, None
+        else:
+            if spec_dtype is not None and isinstance(spec_dtype, list):
+                data_shape = (len(data),)
+            else:
+                data_shape = get_data_shape(data)
+
+            if isinstance(spec_shape[0], list):
+                match_shape_inds = self.__get_matched_dimension(data_shape, spec_shape, spec_dtype)
+                if len(match_shape_inds) == 0:
+                    # no matches found
+                    msg = "Shape of data does not match any allowed shapes in spec '%s'" % self.spec.path
+                    warnings.warn(msg, IncorrectDatasetShapeBuildWarning)
+                    return None, None
+                elif len(match_shape_inds) == 1:
+                    if spec_dims is not None:
+                        return tuple(spec_dims[match_shape_inds[0]]), tuple(spec_shape[match_shape_inds[0]])
+                    else:
+                        return spec_dims, tuple(spec_shape[match_shape_inds[0]])
                 else:
                     count_nones = [len([x for x in spec_shape[k] if x is None]) for k in match_shape_inds]
                     index_min_count = count_nones.index(min(count_nones))
                     best_match_ind = match_shape_inds[index_min_count]
-                    return tuple(spec_dims[best_match_ind])
+                    if spec_dims is not None:
+                        return tuple(spec_dims[best_match_ind]), tuple(spec_shape[best_match_ind])
+                    else:
+                        return spec_dims, tuple(spec_shape[best_match_ind])
             else:
-                # no matches found
-                msg = "Shape of data does not match any allowed shapes in spec '%s'" % self.spec.path
-                warnings.warn(msg, IncorrectDatasetShapeBuildWarning)
-                return None
-        else:
-            if len(data_shape) != len(spec_shape):
-                msg = "Shape of data does not match shape in spec '%s'" % self.spec.path
-                warnings.warn(msg, IncorrectDatasetShapeBuildWarning)
-                return None
-            # check each dimension. None means any length is allowed
-            match = True
-            for j, d in enumerate(data_shape):
-                if spec_shape[j] is not None and spec_shape[j] != d:
-                    match = False
-                    break
-            if not match:
-                msg = "Shape of data does not match shape in spec '%s'" % self.spec.path
-                warnings.warn(msg, IncorrectDatasetShapeBuildWarning)
-                return None
-            # shape is a single list of allowed dimension lengths
-            return tuple(spec_dims)
+                if len(data_shape) != len(spec_shape):
+                    msg = "Shape of data does not match shape in spec '%s'" % self.spec.path
+                    warnings.warn(msg, IncorrectDatasetShapeBuildWarning)
+                    return None, None
+                # check each dimension. None means any length is allowed
+                match = True
+                for j, d in enumerate(data_shape):
+                    if spec_shape[j] is not None and spec_shape[j] != d:
+                        match = False
+                        break
+                if not match:
+                    msg = "Shape of data does not match shape in spec '%s'" % self.spec.path
+                    warnings.warn(msg, IncorrectDatasetShapeBuildWarning)
+                    return None, None
+                # shape is a single list of allowed dimension lengths
+                if spec_dims is not None:
+                    return tuple(spec_dims), tuple(spec_shape)
+                else:
+                    return None, tuple(spec_shape)
 
     def __is_reftype(self, data):
         if (isinstance(data, AbstractDataChunkIterator) or
@@ -1136,7 +1162,19 @@ def __add_datasets(self, builder, datasets, container, build_manager, source, ex
                         raise BuildError(builder, msg) from ex
                     self.logger.debug("        Adding untyped dataset for spec name %s and adding attributes"
                                       % repr(spec.name))
-                    sub_builder = DatasetBuilder(spec.name, data, parent=builder, source=source, dtype=dtype)
+
+                    dimension_labels, matched_shape = self.__get_spec_info(data,
+                                                                           spec.shape,
+                                                                           spec.dims,
+                                                                           dtype)
+
+                    sub_builder = DatasetBuilder(spec.name,
+                                                 data,
+                                                 parent=builder,
+                                                 source=source,
+                                                 dtype=dtype,
+                                                 spec_shapes=matched_shape,
+                                                 dimension_labels=dimension_labels)
                     builder.set_dataset(sub_builder)
                 self.__add_attributes(sub_builder, spec.attributes, container, build_manager, source, export)
             else:

diff --git a/tests/unit/helpers/utils.py b/tests/unit/helpers/utils.py
@@ -343,6 +343,70 @@ def __init__(self, spec):
     manager = BuildManager(type_map)
     return manager
 
+############################################
+# Qux: A test class with variable data shapes
+############################################
+class QuxData(Data):
+    pass
+
+
+class QuxBucket(Container):
+    "PseudoFile"
+    @docval(
+        {"name": "name", "type": str, "doc": "the name of this bucket"},
+        {"name": "qux_data", "type": QuxData, "doc": "Data with user defined shape."},
+    )
+    def __init__(self, **kwargs):
+        name, qux_data = getargs("name", "qux_data", kwargs)
+        super().__init__(name=name)
+        self.__qux_data = qux_data
+        self.__qux_data.parent = self
+
+    @property
+    def qux_data(self):
+        return self.__qux_data
+
+
+def get_qux_buildmanager(shape):
+    qux_data_spec = DatasetSpec(
+        doc="A test dataset of references specification with a data type",
+        name="qux_data",
+        data_type_def="QuxData",
+        shape=shape,
+        dtype='int'
+    )
+
+    qux_bucket_spec = GroupSpec(
+        doc="A test group specification for a data type containing data type",
+        data_type_def="QuxBucket",
+        datasets=[
+            DatasetSpec(doc="doc", data_type_inc="QuxData"),
+        ],
+    )
+
+    spec_catalog = SpecCatalog()
+    spec_catalog.register_spec(qux_data_spec, "test.yaml")
+    spec_catalog.register_spec(qux_bucket_spec, "test.yaml")
+
+    namespace = SpecNamespace(
+        "a test namespace",
+        CORE_NAMESPACE,
+        [{"source": "test.yaml"}],
+        version="0.1.0",
+        catalog=spec_catalog,
+    )
+
+    namespace_catalog = NamespaceCatalog()
+    namespace_catalog.add_namespace(CORE_NAMESPACE, namespace)
+
+    type_map = TypeMap(namespace_catalog)
+    type_map.register_container_type(CORE_NAMESPACE, "QuxData", QuxData)
+    type_map.register_container_type(CORE_NAMESPACE, "QuxBucket", QuxBucket)
+
+    manager = BuildManager(type_map)
+    return manager
+
+
 
 ############################################
 #  Baz example data containers and specs