From 276b521730e5d0f995725da0a77e26046be6ad9c Mon Sep 17 00:00:00 2001 From: TonioF Date: Tue, 6 Jan 2026 15:19:04 +0100 Subject: [PATCH 1/7] add better support for multi-product archives (#2) --- CHANGES.md | 1 + xarray_enmap/cli.py | 62 ++++++++++++++++++++++++++++++------ xarray_enmap/xarray_enmap.py | 62 +++++++++++++++++++++++++++++++++++- 3 files changed, 114 insertions(+), 11 deletions(-) diff --git a/CHANGES.md b/CHANGES.md index 724df57..08ea917 100644 --- a/CHANGES.md +++ b/CHANGES.md @@ -1,6 +1,7 @@ ## Changes in 0.0.3 (under development) - Add the command-line conversion tool convert-enmap (#9) +- Added support for backend methods `open_datatree` and `open_groups_as_dict` (#2) ## Changes in 0.0.2 diff --git a/xarray_enmap/cli.py b/xarray_enmap/cli.py index 1302bbb..4af465a 100644 --- a/xarray_enmap/cli.py +++ b/xarray_enmap/cli.py @@ -10,6 +10,9 @@ import shutil import sys import tempfile + +import xarray + from . import xarray_enmap LOGGER = logging.getLogger(__name__) @@ -48,6 +51,13 @@ def main(): help="Higher Zarr output compression. ~25%% smaller than default compression. " "Compression process (but not decompression) is much slower.", ) + parser.add_argument( + "--open-as-datatree", + "-oad", + action="store_true", + help="Whether to write the data as datatree. This parameter is only considered when" + "the parameter zarr-output is given.", + ) parser.add_argument("--verbose", "-v", action="count", default=0) args = parser.parse_args() @@ -71,6 +81,7 @@ def loglevel(verbosity): args.tiff_output, temp_dir, args.compress, + args.open_as_datatree ) else: temp_dir = os.path.expanduser(args.tempdir) @@ -82,6 +93,7 @@ def loglevel(verbosity): temp_dir, args.compress, args.extract_only, + args.open_as_datatree ) @@ -91,6 +103,7 @@ def process( output_dir_tiff: str, temp_dir: str, compress: bool = False, + open_as_datatree: bool = False ): if output_dir_zarr is output_dir_tiff is None: LOGGER.warn("No output destinations specified.") @@ -117,24 +130,53 @@ def process( raise ValueError( f"{input_filename} is neither a file nor a directory." ) - for data_dir in data_dirs: - if output_dir_tiff is not None: - shutil.copytree( - data_dir, pathlib.Path(output_dir_tiff) / data_dir.name - ) - if output_dir_zarr is not None: - write_zarr(data_dir, output_dir_zarr, compress) + if output_dir_zarr is not None and open_as_datatree: + write_datatree_as_zarr(input_path, data_dirs, output_dir_zarr, compress) + else: + for data_dir in data_dirs: + if output_dir_tiff is not None: + shutil.copytree( + data_dir, pathlib.Path(output_dir_tiff) / data_dir.name + ) + if output_dir_zarr is not None: + write_zarr(data_dir, output_dir_zarr, compress) def write_zarr( - data_dir, output_dir: str, compress: bool = False + data_dir, output_dir: str, compress: bool = False, open_as_datatree: bool = False ): LOGGER.info(f"Writing {data_dir} to a Zarr archive...") ensure_module_importable("zarr") ds = xarray_enmap.read_dataset_from_inner_directory(data_dir) + store_path = pathlib.Path(output_dir) / (data_dir.name + ".zarr") + zarr_args = _get_zarr_args(compress, store_path) + ds.to_zarr(**zarr_args) + + +def write_datatree_as_zarr( + input_path, data_dirs, output_dir: str, compress: bool = False +): + name = input_path.name + LOGGER.info(f"Writing {name} to a Zarr archive...") + suffixes = input_path.suffixes + suffixes.reverse() + for suffix in suffixes: + name = name.removesuffix(suffix) + ensure_module_importable("zarr") + groups = {} + for data_dir in data_dirs: + group_name = data_dir if isinstance(data_dir, str) else data_dir.name + groups[group_name] = xarray_enmap.read_dataset_from_inner_directory(data_dir) + dt = xarray.DataTree.from_dict(groups) + store_path = pathlib.Path(output_dir) / (name + ".zarr") + zarr_args = _get_zarr_args(compress, store_path) + dt.to_zarr(**zarr_args) + + +def _get_zarr_args(compress: bool, store_path: str): zarr_args = { "zarr_format": 2, - "store": pathlib.Path(output_dir) / (data_dir.name + ".zarr") + "store": store_path } if compress: ensure_module_importable("numcodecs") @@ -146,7 +188,7 @@ def write_zarr( ) } } - ds.to_zarr(**zarr_args) + return zarr_args def ensure_module_importable(module_name: str): diff --git a/xarray_enmap/xarray_enmap.py b/xarray_enmap/xarray_enmap.py index 10d507b..e4d16e0 100644 --- a/xarray_enmap/xarray_enmap.py +++ b/xarray_enmap/xarray_enmap.py @@ -16,8 +16,10 @@ import zipfile import shapely +import xarray import xarray as xr - +from xarray import DataTree, Dataset +from xarray.backends import AbstractDataStore LOGGER = logging.getLogger(__name__) @@ -37,6 +39,7 @@ class EnmapEntrypoint(xr.backends.BackendEntrypoint): + supports_groups = True temp_dir = None def open_dataset( @@ -60,6 +63,34 @@ def open_dataset( ds.set_close(self.close) return ds + def open_datatree( + self, + filename_or_obj: str | os.PathLike[Any], + *, + drop_variables: str | Iterable[str] | None = None, + ) -> DataTree: + groups = self.open_groups_as_dict(filename_or_obj, drop_variables=drop_variables) + dt = xarray.DataTree.from_dict(data=groups) + return dt + + def open_groups_as_dict( + self, + filename_or_obj: str | os.PathLike[Any], + *, + drop_variables: str | Iterable[str] | None = None, + ) -> dict[str, Dataset]: + self.temp_dir = tempfile.mkdtemp(prefix="xarray-enmap-") + path = pathlib.Path(filename_or_obj) + if path.is_file(): + groups = read_groups_from_archive(filename_or_obj, self.temp_dir) + elif path.is_dir(): + groups = read_groups_from_unknown_directory(path, self.temp_dir) + else: + raise ValueError( + f"{filename_or_obj} is neither a path nor a directory." + ) + return groups + def close(self): if self.temp_dir: shutil.rmtree(self.temp_dir) @@ -102,6 +133,35 @@ def read_dataset_from_inner_directory(data_dir: str | os.PathLike[Any]) -> xr.Da return ds +def read_groups_from_archive( + input_filename: str | os.PathLike[Any], temp_dir: str +) -> dict[str, Dataset]: + data_dirs = list(extract_archives(input_filename, temp_dir)) + groups = {} + for data_dir in data_dirs: + group_name = data_dir if isinstance(data_dir, str) else data_dir.name + groups[group_name] = read_dataset_from_inner_directory(data_dir) + return groups + + +def read_groups_from_unknown_directory( + data_dir: str | os.PathLike[Any], temp_dir: str +) -> dict[str, Dataset]: + data_path = pathlib.Path(data_dir) + metadata_files = list(data_path.glob("*METADATA.XML")) + match len(metadata_files): + case 0: + # assume outer directory + return read_groups_from_archive(data_path, temp_dir) + case 1: + # assume inner directory + return dict( + data_path=read_dataset_from_inner_directory(data_path) + ) + case _: + raise RuntimeError("Too many METADATA.XML files") + + def find_datafiles(data_path: pathlib.Path) -> Mapping[str, pathlib.Path]: assert data_path.is_dir() tiffs = list(data_path.glob("*.TIF")) From 6e4aa18a9adab5eabd29e52f728b58fa21a51a9a Mon Sep 17 00:00:00 2001 From: Tonio Fincke Date: Fri, 9 Jan 2026 19:06:20 +0100 Subject: [PATCH 2/7] Update xarray_enmap/cli.py Co-authored-by: Pontus Lurcock --- xarray_enmap/cli.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/xarray_enmap/cli.py b/xarray_enmap/cli.py index 4af465a..a757ca9 100644 --- a/xarray_enmap/cli.py +++ b/xarray_enmap/cli.py @@ -52,7 +52,7 @@ def main(): "Compression process (but not decompression) is much slower.", ) parser.add_argument( - "--open-as-datatree", + "--datatree", "-oad", action="store_true", help="Whether to write the data as datatree. This parameter is only considered when" From db5f020d71a00f51e1de5bdc09c71f39b9adf5d0 Mon Sep 17 00:00:00 2001 From: Tonio Fincke Date: Fri, 9 Jan 2026 19:06:33 +0100 Subject: [PATCH 3/7] Update xarray_enmap/cli.py Co-authored-by: Pontus Lurcock --- xarray_enmap/cli.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/xarray_enmap/cli.py b/xarray_enmap/cli.py index a757ca9..1c5c36c 100644 --- a/xarray_enmap/cli.py +++ b/xarray_enmap/cli.py @@ -143,7 +143,7 @@ def process( def write_zarr( - data_dir, output_dir: str, compress: bool = False, open_as_datatree: bool = False + data_dir, output_dir: str, compress: bool = False ): LOGGER.info(f"Writing {data_dir} to a Zarr archive...") ensure_module_importable("zarr") From dabdd05a09400e707559c3da435b33e2747c8eab Mon Sep 17 00:00:00 2001 From: Tonio Fincke Date: Fri, 9 Jan 2026 19:07:31 +0100 Subject: [PATCH 4/7] Update xarray_enmap/cli.py Co-authored-by: Pontus Lurcock --- xarray_enmap/cli.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/xarray_enmap/cli.py b/xarray_enmap/cli.py index 1c5c36c..947c02d 100644 --- a/xarray_enmap/cli.py +++ b/xarray_enmap/cli.py @@ -154,7 +154,7 @@ def write_zarr( def write_datatree_as_zarr( - input_path, data_dirs, output_dir: str, compress: bool = False + input_path: pathlib.Path, data_dirs: Iterable[pathlib.Path | str], output_dir: str, compress: bool = False ): name = input_path.name LOGGER.info(f"Writing {name} to a Zarr archive...") From a297c1e8a1e4c75834ca4b358942a2adbac5df07 Mon Sep 17 00:00:00 2001 From: Tonio Fincke Date: Fri, 9 Jan 2026 19:08:11 +0100 Subject: [PATCH 5/7] Update xarray_enmap/cli.py Co-authored-by: Pontus Lurcock --- xarray_enmap/cli.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/xarray_enmap/cli.py b/xarray_enmap/cli.py index 947c02d..ccaf6f4 100644 --- a/xarray_enmap/cli.py +++ b/xarray_enmap/cli.py @@ -55,7 +55,7 @@ def main(): "--datatree", "-oad", action="store_true", - help="Whether to write the data as datatree. This parameter is only considered when" + help="Whether to write the data as datatree. This parameter is only considered when " "the parameter zarr-output is given.", ) parser.add_argument("--verbose", "-v", action="count", default=0) From e84ffaff42d42acf8efe3867ed00e5ce04add39f Mon Sep 17 00:00:00 2001 From: Tonio Fincke Date: Fri, 9 Jan 2026 19:08:38 +0100 Subject: [PATCH 6/7] Update xarray_enmap/cli.py Co-authored-by: Pontus Lurcock --- xarray_enmap/cli.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/xarray_enmap/cli.py b/xarray_enmap/cli.py index ccaf6f4..ab6bfef 100644 --- a/xarray_enmap/cli.py +++ b/xarray_enmap/cli.py @@ -53,7 +53,7 @@ def main(): ) parser.add_argument( "--datatree", - "-oad", + "-d", action="store_true", help="Whether to write the data as datatree. This parameter is only considered when " "the parameter zarr-output is given.", From 165b1eb2523da722dc42662191996c2fb3b0f477 Mon Sep 17 00:00:00 2001 From: TonioF Date: Fri, 9 Jan 2026 19:19:41 +0100 Subject: [PATCH 7/7] finalised merge --- xarray_enmap/cli.py | 21 ++++++++++++++++----- 1 file changed, 16 insertions(+), 5 deletions(-) diff --git a/xarray_enmap/cli.py b/xarray_enmap/cli.py index ca87202..af4ffeb 100644 --- a/xarray_enmap/cli.py +++ b/xarray_enmap/cli.py @@ -10,6 +10,7 @@ import shutil import sys import tempfile +from collections.abc import Iterable import xarray @@ -89,7 +90,7 @@ def loglevel(verbosity): args.tiff_output, temp_dir, args.compress, - args.open_as_datatree + args.datatree, scale_reflectance, ) else: @@ -102,7 +103,7 @@ def loglevel(verbosity): temp_dir, args.compress, args.extract_only, - args.open_as_datatree + args.datatree, scale_reflectance, ) @@ -113,7 +114,7 @@ def process( output_dir_tiff: str, temp_dir: str, compress: bool = False, - open_as_datatree: bool = False + open_as_datatree: bool = False, scale_reflectance: bool = True, ): if output_dir_zarr is output_dir_tiff is None: @@ -176,7 +177,11 @@ def write_zarr( def write_datatree_as_zarr( - input_path: pathlib.Path, data_dirs: Iterable[pathlib.Path | str], output_dir: str, compress: bool = False + input_path: pathlib.Path, + data_dirs: Iterable[pathlib.Path | str], + output_dir: str, + compress: bool = False, + scale_reflectance: bool = True ): name = input_path.name LOGGER.info(f"Writing {name} to a Zarr archive...") @@ -185,10 +190,16 @@ def write_datatree_as_zarr( for suffix in suffixes: name = name.removesuffix(suffix) ensure_module_importable("zarr") + LOGGER.info( + f"Using {'scaled' if scale_reflectance else 'unscaled'} " + f"reflectance." + ) groups = {} for data_dir in data_dirs: group_name = data_dir if isinstance(data_dir, str) else data_dir.name - groups[group_name] = xarray_enmap.read_dataset_from_inner_directory(data_dir) + groups[group_name] = xarray_enmap.read_dataset_from_inner_directory( + data_dir, scale_reflectance + ) dt = xarray.DataTree.from_dict(groups) store_path = pathlib.Path(output_dir) / (name + ".zarr") zarr_args = _get_zarr_args(compress, store_path)