From 324bf937707841ca3ced6a0cb400a8acbc112aac Mon Sep 17 00:00:00 2001
From: JP Hastings-Spital
Date: Fri, 12 Sep 2025 15:57:26 +0100
Subject: [PATCH] feat!: swap cdf_to_xarray to accept CDF instances rather than filenames

This change allows CDF files loaded from S3 buckets to be converted to xarray
Datasets without having to download the remote file to disk and read it back
in.

cdf_to_xarray now requires a CDF object rather than a filename:
`cdf_to_xarray("some.cdf")` becomes `cdf_to_xarray(CDF("some.cdf"))`.
---
 cdflib/xarray/cdf_to_xarray.py     | 10 ++--
 tests/test_cdfwrite.py             |  6 +--
 tests/test_xarray_reader_writer.py | 79 ++++++++++++++++++++----------
 3 files changed, 60 insertions(+), 35 deletions(-)

diff --git a/cdflib/xarray/cdf_to_xarray.py b/cdflib/xarray/cdf_to_xarray.py
index ab16c5f..9552bdb 100644
--- a/cdflib/xarray/cdf_to_xarray.py
+++ b/cdflib/xarray/cdf_to_xarray.py
@@ -99,12 +99,10 @@ def _convert_cdf_time_types(
 def _convert_cdf_to_dicts(
-    filename: Union[str, Path], to_datetime: bool = False, to_unixtime: bool = False
+    cdf_file: CDF, to_datetime: bool = False, to_unixtime: bool = False
 ) -> Tuple[Dict[str, List[Union[str, np.ndarray]]], Dict[str, Any], Dict[str, npt.NDArray], Dict[str, VDRInfo]]:
-    # Open the CDF file
     # Converts the entire CDF file into python dictionary objects
-    cdf_file = CDF(filename, string_encoding="latin-1")
     cdf_info = cdf_file.cdf_info()
     all_cdf_variables = cdf_info.rVariables + cdf_info.zVariables
@@ -685,12 +683,12 @@ def _verify_dimension_sizes(created_data_vars: Dict[str, xr.Variable], created_c
     )
-def cdf_to_xarray(filename: str, to_datetime: bool = True, to_unixtime: bool = False, fillval_to_nan: bool = False) -> xr.Dataset:
+def cdf_to_xarray(cdf_file: CDF, to_datetime: bool = True, to_unixtime: bool = False, fillval_to_nan: bool = False) -> xr.Dataset:
     """
     This function converts CDF files into XArray Dataset Objects.
     Parameters:
-        filename (str): The path to the CDF file to read
+        cdf_file (CDF): The CDF file to convert
         to_datetime (bool, optional): Whether or not to convert CDF_EPOCH/EPOCH_16/TT2000 to datetime64, or leave them as is
         to_unixtime (bool, optional): Whether or not to convert CDF_EPOCH/EPOCH_16/TT2000 to unixtime, or leave them as is
         fillval_to_nan (bool, optional): If True, any data values that match the FILLVAL attribute for a variable will be set to NaN
@@ -767,7 +765,7 @@ def cdf_to_xarray(filename: str, to_datetime: bool = True, to_unixtime: bool = F
     # Convert the CDF file into a series of dicts, so we don't need to keep reading the file
     global_attributes, all_variable_attributes, all_variable_data, all_variable_properties = _convert_cdf_to_dicts(
-        filename, to_datetime=to_datetime, to_unixtime=to_unixtime
+        cdf_file, to_datetime=to_datetime, to_unixtime=to_unixtime
     )
     created_vars, depend_dimensions = _generate_xarray_data_variables(
diff --git a/tests/test_cdfwrite.py b/tests/test_cdfwrite.py
index 9c69ad0..b1b5548 100755
--- a/tests/test_cdfwrite.py
+++ b/tests/test_cdfwrite.py
@@ -4,7 +4,7 @@
 import numpy as np
 import pytest
-from cdflib import cdfread, cdfwrite
+from cdflib import cdfread, cdfwrite, CDF
 from cdflib.xarray import cdf_to_xarray
 R = Path(__file__).parent
@@ -660,7 +660,7 @@ def test_string_input_but_number_type(tmp_path):
     cdf.close()
     # Reading it back in would cause an error
-    cdf_to_xarray(tmp_path / "test.cdf")
+    cdf_to_xarray(CDF(tmp_path / "test.cdf"))
 def test_array_string_input_but_number_type(tmp_path):
@@ -681,4 +681,4 @@
     cdf.close()
     # Reading it back in would cause an error
-    cdf_to_xarray(tmp_path / "test.cdf")
+    cdf_to_xarray(CDF(tmp_path / "test.cdf"))
diff --git a/tests/test_xarray_reader_writer.py b/tests/test_xarray_reader_writer.py
index 61c4f96..b7c5019 100644
--- a/tests/test_xarray_reader_writer.py
+++ b/tests/test_xarray_reader_writer.py
@@ -5,6 +5,7 @@
 import pytest
 import xarray as xr
+from cdflib import CDF
 from cdflib.xarray import cdf_to_xarray, xarray_to_cdf
 # To run these tests use `pytest --remote-data`
@@ -43,10 +44,12 @@ def test_xarray_read_write(tmp_path, cdf_fname, nc_fname):
     if not os.path.exists(cdf_fname):
         urllib.request.urlretrieve(url, cdf_fname)
-    a = cdf_to_xarray(cdf_fname, fillval_to_nan=True)
+    with CDF(cdf_fname) as cdf_file:
+        a = cdf_to_xarray(cdf_file, fillval_to_nan=True)
     xarray_to_cdf(a, tmp_path / cdf_fname)
-    b = cdf_to_xarray(tmp_path / cdf_fname, fillval_to_nan=True)
+    with CDF(tmp_path / cdf_fname) as cdf_file:
+        b = cdf_to_xarray(cdf_file, fillval_to_nan=True)
     url = f"https://lasp.colorado.edu/maven/sdc/public/data/sdc/web/cdflib_testing/{nc_fname}"
     if not os.path.exists(nc_fname):
@@ -54,7 +57,8 @@
         urllib.request.urlretrieve(url, nc_fname)
     c = xr.load_dataset(nc_fname)
     xarray_to_cdf(c, tmp_path / ("nc_" + cdf_fname))
-    d = cdf_to_xarray(tmp_path / ("nc_" + cdf_fname), fillval_to_nan=True)
+    with CDF(tmp_path / ("nc_" + cdf_fname)) as cdf_file:
+        d = cdf_to_xarray(cdf_file, fillval_to_nan=True)
 @pytest.mark.remote_data
@@ -73,7 +77,8 @@ def test_MGITM_model():
     c["altitude"].attrs["VAR_TYPE"] = "support_data"
     xarray_to_cdf(c, "MGITM_LS180_F130_150615-created-from-netcdf-input.cdf")
-    d = cdf_to_xarray("MGITM_LS180_F130_150615-created-from-netcdf-input.cdf", fillval_to_nan=True)
+    with CDF("MGITM_LS180_F130_150615-created-from-netcdf-input.cdf") as cdf_file:
+        d = cdf_to_xarray(cdf_file, fillval_to_nan=True)
os.remove("MGITM_LS180_F130_150615-created-from-netcdf-input.cdf") os.remove("MGITM_LS180_F130_150615.nc") @@ -92,7 +97,8 @@ def test_goes_mag(): c["time"].attrs["VAR_TYPE"] = "support_data" c["time_orbit"].attrs["VAR_TYPE"] = "support_data" xarray_to_cdf(c, "dn_magn-l2-hires_g17_d20211219_v1-0-1-created-from-netcdf-input.cdf") - d = cdf_to_xarray("dn_magn-l2-hires_g17_d20211219_v1-0-1-created-from-netcdf-input.cdf", fillval_to_nan=True) + with CDF("dn_magn-l2-hires_g17_d20211219_v1-0-1-created-from-netcdf-input.cdf") as cdf_file: + d = cdf_to_xarray(cdf_file, fillval_to_nan=True) os.remove("dn_magn-l2-hires_g17_d20211219_v1-0-1-created-from-netcdf-input.cdf") os.remove("dn_magn-l2-hires_g17_d20211219_v1-0-1.nc") @@ -112,7 +118,8 @@ def test_saber(): c["sclongitude"].attrs["VAR_TYPE"] = "support_data" c["scaltitude"].attrs["VAR_TYPE"] = "support_data" xarray_to_cdf(c, "SABER_L2B_2021020_103692_02.07-created-from-netcdf-input.cdf") - d = cdf_to_xarray("SABER_L2B_2021020_103692_02.07-created-from-netcdf-input.cdf", fillval_to_nan=True) + with CDF("SABER_L2B_2021020_103692_02.07-created-from-netcdf-input.cdf") as cdf_file: + d = cdf_to_xarray(cdf_file, fillval_to_nan=True) os.remove("SABER_L2B_2021020_103692_02.07-created-from-netcdf-input.cdf") os.remove("SABER_L2B_2021020_103692_02.07.nc") @@ -124,9 +131,11 @@ def test_euv(): if not os.path.exists(fname): urllib.request.urlretrieve(url, fname) - a = cdf_to_xarray("mvn_euv_l3_minute_20201130_v14_r02.cdf", fillval_to_nan=True) + with CDF("mvn_euv_l3_minute_20201130_v14_r02.cdf") as cdf_file: + a = cdf_to_xarray(cdf_file, fillval_to_nan=True) xarray_to_cdf(a, "mvn_euv_l3_minute_20201130_v14_r02-created-from-cdf-input.cdf") - b = cdf_to_xarray("mvn_euv_l3_minute_20201130_v14_r02-created-from-cdf-input.cdf", fillval_to_nan=True) + with CDF("mvn_euv_l3_minute_20201130_v14_r02-created-from-cdf-input.cdf") as cdf_file: + b = cdf_to_xarray(cdf_file, fillval_to_nan=True) os.remove("mvn_euv_l3_minute_20201130_v14_r02-created-from-cdf-input.cdf") os.remove("mvn_euv_l3_minute_20201130_v14_r02.cdf") @@ -138,9 +147,11 @@ def test_sep_anc(): if not os.path.exists(fname): urllib.request.urlretrieve(url, fname) - a = cdf_to_xarray("mvn_sep_l2_anc_20210501_v06_r00.cdf", fillval_to_nan=True) + with CDF("mvn_sep_l2_anc_20210501_v06_r00.cdf") as cdf_file: + a = cdf_to_xarray(cdf_file, fillval_to_nan=True) xarray_to_cdf(a, "mvn_sep_l2_anc_20210501_v06_r00-created-from-cdf-input.cdf") - a = cdf_to_xarray("mvn_sep_l2_anc_20210501_v06_r00-created-from-cdf-input.cdf", fillval_to_nan=True) + with CDF("mvn_sep_l2_anc_20210501_v06_r00-created-from-cdf-input.cdf") as cdf_file: + a = cdf_to_xarray(cdf_file, fillval_to_nan=True) os.remove("mvn_sep_l2_anc_20210501_v06_r00-created-from-cdf-input.cdf") os.remove("mvn_sep_l2_anc_20210501_v06_r00.cdf") @@ -152,9 +163,11 @@ def test_sep_svy(): if not os.path.exists(fname): urllib.request.urlretrieve(url, fname) - a = cdf_to_xarray("mvn_sep_l2_s2-raw-svy-full_20191231_v04_r05.cdf", fillval_to_nan=True) + with CDF("mvn_sep_l2_s2-raw-svy-full_20191231_v04_r05.cdf") as cdf_file: + a = cdf_to_xarray(cdf_file, fillval_to_nan=True) xarray_to_cdf(a, "mvn_sep_l2_s2-raw-svy-full_20191231_v04_r05-created-from-cdf-input.cdf") - b = cdf_to_xarray("mvn_sep_l2_s2-raw-svy-full_20191231_v04_r05-created-from-cdf-input.cdf", fillval_to_nan=True) + with CDF("mvn_sep_l2_s2-raw-svy-full_20191231_v04_r05-created-from-cdf-input.cdf") as cdf_file: + b = cdf_to_xarray(cdf_file, fillval_to_nan=True) 
os.remove("mvn_sep_l2_s2-raw-svy-full_20191231_v04_r05-created-from-cdf-input.cdf") os.remove("mvn_sep_l2_s2-raw-svy-full_20191231_v04_r05.cdf") @@ -187,9 +200,11 @@ def test_swe_arc3d(): if not os.path.exists(fname): urllib.request.urlretrieve(url, fname) - a = cdf_to_xarray("mvn_swe_l2_arc3d_20180717_v04_r02.cdf", fillval_to_nan=True) + with CDF("mvn_swe_l2_arc3d_20180717_v04_r02.cdf") as cdf_file: + a = cdf_to_xarray(cdf_file, fillval_to_nan=True) xarray_to_cdf(a, "mvn_swe_l2_arc3d_20180717_v04_r02-created-from-cdf-input.cdf") - b = cdf_to_xarray("mvn_swe_l2_arc3d_20180717_v04_r02-created-from-cdf-input.cdf", fillval_to_nan=True) + with CDF("mvn_swe_l2_arc3d_20180717_v04_r02-created-from-cdf-input.cdf") as cdf_file: + b = cdf_to_xarray(cdf_file, fillval_to_nan=True) os.remove("mvn_swe_l2_arc3d_20180717_v04_r02-created-from-cdf-input.cdf") os.remove("mvn_swe_l2_arc3d_20180717_v04_r02.cdf") @@ -200,7 +215,8 @@ def test_swe_arc3d(): c = xr.load_dataset("mvn_swe_l2_arc3d_20180717_v04_r02.nc") xarray_to_cdf(c, "mvn_swe_l2_arc3d_20180717_v04_r02-created-from-netcdf-input.cdf") - d = cdf_to_xarray("mvn_swe_l2_arc3d_20180717_v04_r02-created-from-netcdf-input.cdf", fillval_to_nan=True) + with CDF("mvn_swe_l2_arc3d_20180717_v04_r02-created-from-netcdf-input.cdf") as cdf_file: + d = cdf_to_xarray(cdf_file, fillval_to_nan=True) os.remove("mvn_swe_l2_arc3d_20180717_v04_r02-created-from-netcdf-input.cdf") os.remove("mvn_swe_l2_arc3d_20180717_v04_r02.nc") @@ -212,9 +228,11 @@ def test_swe_svyspec(): if not os.path.exists(fname): urllib.request.urlretrieve(url, fname) - a = cdf_to_xarray("mvn_swe_l2_svyspec_20180718_v04_r04.cdf", fillval_to_nan=True) + with CDF("mvn_swe_l2_svyspec_20180718_v04_r04.cdf") as cdf_file: + a = cdf_to_xarray(cdf_file, fillval_to_nan=True) xarray_to_cdf(a, "mvn_swe_l2_svyspec_20180718_v04_r04-created-from-cdf-input.cdf") - b = cdf_to_xarray("mvn_swe_l2_svyspec_20180718_v04_r04-created-from-cdf-input.cdf", fillval_to_nan=True) + with CDF("mvn_swe_l2_svyspec_20180718_v04_r04-created-from-cdf-input.cdf") as cdf_file: + b = cdf_to_xarray(cdf_file, fillval_to_nan=True) os.remove("mvn_swe_l2_svyspec_20180718_v04_r04-created-from-cdf-input.cdf") os.remove("mvn_swe_l2_svyspec_20180718_v04_r04.cdf") @@ -225,7 +243,8 @@ def test_swe_svyspec(): c = xr.load_dataset("mvn_swe_l2_svyspec_20180718_v04_r04.nc") xarray_to_cdf(c, "mvn_swe_l2_svyspec_20180718_v04_r04-created-from-netcdf-input.cdf") - d = cdf_to_xarray("mvn_swe_l2_svyspec_20180718_v04_r04-created-from-netcdf-input.cdf", fillval_to_nan=True) + with CDF("mvn_swe_l2_svyspec_20180718_v04_r04-created-from-netcdf-input.cdf") as cdf_file: + d = cdf_to_xarray(cdf_file, fillval_to_nan=True) os.remove("mvn_swe_l2_svyspec_20180718_v04_r04-created-from-netcdf-input.cdf") os.remove("mvn_swe_l2_svyspec_20180718_v04_r04.nc") @@ -239,7 +258,8 @@ def test_raids(): c = xr.load_dataset("raids_nirs_20100823_v1.1.nc") xarray_to_cdf(c, "raids_nirs_20100823_v1.1-created-from-netcdf-input.cdf") - d = cdf_to_xarray("raids_nirs_20100823_v1.1-created-from-netcdf-input.cdf", fillval_to_nan=True) + with CDF("raids_nirs_20100823_v1.1-created-from-netcdf-input.cdf") as cdf_file: + d = cdf_to_xarray(cdf_file, fillval_to_nan=True) os.remove("raids_nirs_20100823_v1.1-created-from-netcdf-input.cdf") os.remove("raids_nirs_20100823_v1.1.nc") @@ -274,7 +294,8 @@ def test_see_l3(): c = xr.load_dataset("see__L3_2021009_012_01.ncdf") xarray_to_cdf(c, "see__L3_2021009_012_01.ncdfhello2.cdf") - d = cdf_to_xarray("see__L3_2021009_012_01.ncdfhello2.cdf", 
+    with CDF("see__L3_2021009_012_01.ncdfhello2.cdf") as cdf_file:
+        d = cdf_to_xarray(cdf_file, fillval_to_nan=True)
     os.remove("see__L3_2021009_012_01.ncdfhello2.cdf")
     os.remove("see__L3_2021009_012_01.ncdf")
@@ -288,7 +309,8 @@ def test_see_l2a():
     c = xr.load_dataset("see__xps_L2A_2021006_012_02.ncdf")
     xarray_to_cdf(c, "see__xps_L2A_2021006_012_02.ncdfhello2.cdf")
-    d = cdf_to_xarray("see__xps_L2A_2021006_012_02.ncdfhello2.cdf", fillval_to_nan=True)
+    with CDF("see__xps_L2A_2021006_012_02.ncdfhello2.cdf") as cdf_file:
+        d = cdf_to_xarray(cdf_file, fillval_to_nan=True)
     os.remove("see__xps_L2A_2021006_012_02.ncdfhello2.cdf")
     os.remove("see__xps_L2A_2021006_012_02.ncdf")
@@ -302,7 +324,8 @@ def test_something():
     c = xr.load_dataset("sgpsondewnpnC1.nc")
     xarray_to_cdf(c, "sgpsondewnpnC1-created-from-netcdf-input.cdf")
-    d = cdf_to_xarray("sgpsondewnpnC1-created-from-netcdf-input.cdf", fillval_to_nan=True)
+    with CDF("sgpsondewnpnC1-created-from-netcdf-input.cdf") as cdf_file:
+        d = cdf_to_xarray(cdf_file, fillval_to_nan=True)
     os.remove("sgpsondewnpnC1-created-from-netcdf-input.cdf")
     os.remove("sgpsondewnpnC1.nc")
@@ -348,7 +371,8 @@ def test_build_from_scratch():
 def test_smoke(cdf_path, tmp_path):
-    a = cdf_to_xarray(cdf_path, fillval_to_nan=True)
+    with CDF(cdf_path) as cdf_file:
+        a = cdf_to_xarray(cdf_file, fillval_to_nan=True)
     xarray_to_cdf(a, tmp_path / cdf_path.name)
@@ -363,7 +387,8 @@ def test_datetime64_conversion():
     epoch = xr.Variable(epoch_dims, epoch_data)
     ds = xr.Dataset(data_vars={"data": data, "epoch": epoch})
     xarray_to_cdf(ds, "hello.cdf")
-    x = cdf_to_xarray("hello.cdf", to_datetime=True)
+    with CDF("hello.cdf") as cdf_file:
+        x = cdf_to_xarray(cdf_file, to_datetime=True)
     assert x["epoch"][0] == np.datetime64("1970-01-01T00:00:01")
     os.remove("hello.cdf")
@@ -380,7 +405,8 @@ def test_datetime64_conversion_odd_units():
     epoch = xr.Variable(epoch_dims, epoch_data)
     ds = xr.Dataset(data_vars={"data": data, "epoch": epoch})
     xarray_to_cdf(ds, "hello.cdf")
-    x = cdf_to_xarray("hello.cdf", to_datetime=True)
+    with CDF("hello.cdf") as cdf_file:
+        x = cdf_to_xarray(cdf_file, to_datetime=True)
     assert x["epoch"][1] == np.datetime64("2000-01-02")
     os.remove("hello.cdf")
@@ -397,6 +423,7 @@ def test_numpy_string_array():
     epoch = xr.Variable(epoch_dims, epoch_data)
     ds = xr.Dataset(data_vars={"data": data, "epoch": epoch})
     xarray_to_cdf(ds, "hello.cdf")
-    x = cdf_to_xarray("hello.cdf", to_datetime=True)
+    with CDF("hello.cdf") as cdf_file:
+        x = cdf_to_xarray(cdf_file, to_datetime=True)
     assert x["data"][2] == "c"
     os.remove("hello.cdf")
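
A minimal usage sketch of the calling convention this patch introduces (not part of the diff). The file name below is a placeholder, and `string_encoding="latin-1"` is shown only because `_convert_cdf_to_dicts` previously applied that encoding internally when it opened the file itself; callers who relied on it can pass it when constructing the CDF:

```python
from cdflib import CDF
from cdflib.xarray import cdf_to_xarray

# Before this patch: ds = cdf_to_xarray("some.cdf", fillval_to_nan=True)
# After this patch the caller opens the CDF themselves (CDF works as a
# context manager, as the updated tests show) and passes the instance in.
# "some.cdf" is an illustrative path; string_encoding="latin-1" restores
# the encoding cdf_to_xarray used to set when opening files internally.
with CDF("some.cdf", string_encoding="latin-1") as cdf_file:
    ds = cdf_to_xarray(cdf_file, fillval_to_nan=True)

print(ds)
```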