Skip to content
Open
Show file tree
Hide file tree
Changes from 5 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
82 changes: 82 additions & 0 deletions src/zarr/core/group.py
Original file line number Diff line number Diff line change
Expand Up @@ -697,6 +697,73 @@ def from_dict(
store_path=store_path,
)

async def copy_store(
self,
store: StoreLike,
*,
overwrite: bool = False,
consolidate_metadata: bool | None = None,
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

why do we need this here? IMO it's simpler if copying is just copying. If people want to add consolidated metadata, they should do that after copying.

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

adjusted it. It now consolidates if the source store was consolidated, but this does not require an argument. One more thing, I only take into account when consolidation happened at the root level. It could be (although the user should not do this) that consolidation happened at the subgroup level. This could be accounted for by checking if member. metadata.consolidated_metadata when looping through members. WDYT should I add this?

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I would copy the original group and its children exactly, including their consolidated metadata.

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

What would be the purpose of copying over stale consolidated metadata?

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

it keeps the scope of this function simple -- it just copies the original group exactly. If this tool produces a different group, then it's not really copying.

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

regarding the from_store method, I would open a separate PR for this.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Consolidated metadata is already in the group metadata document for zarr v3 if "kind" is set to "inline".

👍 but doing something other (more or less) than just copying would require interpreting that which is too far IMO.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

FYI @joshmoore the method under discussion here is a method on the Group class, not on stores. maybe this is a point of confusion.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

IMO the only consolidated metadata-related parameter we need is whether iterating over the child groups uses their consolidated metadata (if present). See the relevant kwarg in the signature of members.

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I added both, only thing remaining is probably adding consolidated_metadata in the async group call.

) -> AsyncGroup:
target_zarr_format = self.metadata.zarr_format

new_group = await AsyncGroup.from_store(
store,
overwrite=overwrite,
attributes=self.metadata.attributes,
zarr_format=target_zarr_format,
)

async for _, member in self.members(max_depth=None):
child_path = member.store_path.path
target_path = StorePath(store=new_group.store, path=child_path)
if isinstance(member, AsyncGroup):
await async_api.group(
store=target_path,
overwrite=overwrite,
attributes=member.metadata.attributes,
zarr_format=target_zarr_format,
)
else:
kwargs = {}
if target_zarr_format == 3:
kwargs["chunk_key_encoding"] = member.metadata.chunk_key_encoding
kwargs["dimension_names"] = member.metadata.dimension_names
else:
kwargs["chunk_key_encoding"] = {
"name": "v2",
"separator": member.metadata.dimension_separator,
}
# Serializer done this way in case of having zarr_format 2, otherwise mypy complains.
new_array = await new_group.create_array(
name=child_path,
shape=member.shape,
dtype=member.dtype,
chunks=member.chunks,
shards=member.shards,
filters=member.filters,
compressors=member.compressors,
serializer=member.serializer if member.serializer is not None else "auto",
fill_value=member.metadata.fill_value,
attributes=member.attrs,
overwrite=overwrite,
config={"order": member.order},
**kwargs,
)

if target_zarr_format == 3:
for region in member._iter_shard_regions():
data = await member.getitem(selection=region)
await new_array.setitem(selection=region, value=data)
else:
for region in member._iter_chunk_regions():
data = await member.getitem(selection=region)
await new_array.setitem(selection=region, value=data)

if consolidate_metadata:
await async_api.consolidate_metadata(new_group.store)

return new_group

async def setitem(self, key: str, value: Any) -> None:
"""
Fastpath for creating a new array
Expand Down Expand Up @@ -1874,6 +1941,21 @@ def open(
obj = sync(AsyncGroup.open(store, zarr_format=zarr_format))
return cls(obj)

def copy_store(
self,
store: StoreLike,
*,
overwrite: bool = False,
consolidate_metadata: bool | None = None,
) -> Group:
return Group(
sync(
self._async_group.copy_store(
store=store, overwrite=overwrite, consolidate_metadata=consolidate_metadata
)
)
)

def __getitem__(self, path: str) -> AnyArray | Group:
"""Obtain a group member.

Expand Down
52 changes: 52 additions & 0 deletions tests/test_group.py
Original file line number Diff line number Diff line change
Expand Up @@ -250,6 +250,58 @@ def test_group_members(store: Store, zarr_format: ZarrFormat, consolidated_metad
members_observed = group.members(max_depth=-1)


@pytest.mark.parametrize(
("zarr_format", "shards", "consolidate_metadata"),
[
(2, None, False),
(2, None, True),
(3, (50,), False),
(3, (50,), True),
],
)
def test_copy_store(zarr_format: int, shards: tuple[int, ...], consolidate_metadata: bool) -> None:
src_store = MemoryStore()
src = Group.from_store(src_store, attributes={"root": True}, zarr_format=zarr_format)

src.create_group("subgroup")

arr_data = np.arange(100)
src.create_array(
"dataset",
shape=(100,),
chunks=(10,),
shards=shards,
dtype=arr_data.dtype,
)
src["dataset"] = arr_data

dst_store = MemoryStore()
if zarr_format == 3 and consolidate_metadata:
with pytest.warns(
ZarrUserWarning,
match="Consolidated metadata is currently not part in the Zarr format 3 specification.",
):
dst = src.copy_store(
dst_store, overwrite=True, consolidate_metadata=consolidate_metadata
)
else:
dst = src.copy_store(dst_store, overwrite=True, consolidate_metadata=consolidate_metadata)

assert dst.attrs.get("root") is True

subgroup = dst["subgroup"]
assert isinstance(subgroup, Group)

copied_arr = dst["dataset"]
copied_data = copied_arr[:]
assert np.array_equal(copied_data, arr_data)

if consolidate_metadata:
assert zarr.open_group(dst_store).metadata.consolidated_metadata
else:
assert not zarr.open_group(dst_store).metadata.consolidated_metadata


def test_group(store: Store, zarr_format: ZarrFormat) -> None:
"""
Test basic Group routines.
Expand Down