From d3931c76246118a2aab0ffc4833b2ed6e3691ada Mon Sep 17 00:00:00 2001 From: Sam Gardner Date: Thu, 28 Dec 2023 16:41:34 -0600 Subject: [PATCH 01/28] add lma writer --- pyxlma/lmalib/io/write.py | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) create mode 100644 pyxlma/lmalib/io/write.py diff --git a/pyxlma/lmalib/io/write.py b/pyxlma/lmalib/io/write.py new file mode 100644 index 0000000..aa0b8cd --- /dev/null +++ b/pyxlma/lmalib/io/write.py @@ -0,0 +1,16 @@ +import numpy as np + +def dataset(dataset, path): + "Write an LMA dataset to a netCDF file" + + if 'flash_event_count' in dataset.data_vars: + if np.all(dataset.flash_event_count == np.iinfo(np.uint32).max): + dataset = dataset.drop_vars(['flash_init_latitude', 'flash_init_longitude', 'flash_init_altitude', 'flash_area', 'flash_volume', 'flash_energy', 'flash_center_latitude', 'flash_center_longitude', 'flash_center_altitude', 'flash_power', 'flash_event_count', 'flash_duration_threshold', 'flash_time_start', 'flash_time_end', 'flash_duration']) + + for var in dataset.data_vars: + if np.all(dataset[var].data == np.nan): + dataset = dataset.drop_vars(var) + comp = dict(zlib=True, complevel=5) + encoding = {var: comp for var in dataset.data_vars} + dataset = dataset.chunk('auto') + dataset.to_netcdf(path, encoding=encoding) \ No newline at end of file From aacf8557c37bc0af3fc812b89bcd274471eed46f Mon Sep 17 00:00:00 2001 From: Sam Gardner Date: Sun, 30 Jun 2024 19:33:10 -0500 Subject: [PATCH 02/28] use xarray sizes instead of dims (warning silence) --- pyxlma/lmalib/io/read.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyxlma/lmalib/io/read.py b/pyxlma/lmalib/io/read.py index e1f3990..e2ca541 100644 --- a/pyxlma/lmalib/io/read.py +++ b/pyxlma/lmalib/io/read.py @@ -113,7 +113,7 @@ def dataset(filenames, sort_time=True): ds = to_dataset(lma_file, event_id_start=next_event_id).set_index( {'number_of_stations':'station_code', 'number_of_events':'event_id'}) lma_data.append(ds) - next_event_id += ds.dims['number_of_events'] + next_event_id += ds.sizes['number_of_events'] except: raise ds = combine_datasets(lma_data) From f5bfa96d52794a0141cf6a877beb24ca1bf7803b Mon Sep 17 00:00:00 2001 From: Sam Gardner Date: Mon, 1 Jul 2024 01:07:05 -0500 Subject: [PATCH 03/28] add original_filename, fix station network location --- pyxlma/lmalib/io/read.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/pyxlma/lmalib/io/read.py b/pyxlma/lmalib/io/read.py index e2ca541..022f570 100644 --- a/pyxlma/lmalib/io/read.py +++ b/pyxlma/lmalib/io/read.py @@ -3,6 +3,7 @@ import numpy as np import gzip import datetime as dt +from os import path class open_gzip_or_dat: def __init__(self, filename): @@ -201,6 +202,7 @@ def to_dataset(lma_file, event_id_start=0): ds.attrs['history'] = "LMA source file created "+lma_file.file_created ds.attrs['event_algorithm_name'] = lma_file.analysis_program ds.attrs['event_algorithm_version'] = lma_file.analysis_program_version + ds.attrs['original_filename'] = path.basename(lma_file.file) # -- Populate the station mask information -- # int, because NetCDF doesn't have booleans @@ -361,7 +363,7 @@ def __init__(self,filename): file_created = line.decode().split(':')[1:] self.file_created = ':'.join(file_created)[:-1] if line.startswith(b'Location:'): - self.network_location = ':'.join(line.decode().split(':')[1:])[:-1] + self.network_location = ':'.join(line.decode().split(':')[1:])[:-1].replace(' ','') if line.startswith(b'Data start time:'): timestring = line.decode().split()[-2:] 
self.startday = dt.datetime.strptime(timestring[0],'%m/%d/%y') From fe657e4f9ebbc34096768be1765235d8ed191e6e Mon Sep 17 00:00:00 2001 From: Sam Gardner Date: Mon, 1 Jul 2024 01:13:32 -0500 Subject: [PATCH 04/28] properly support stations appearing/disappearing/moving --- pyxlma/lmalib/io/read.py | 206 ++++++++++++++++++++++++++++++--------- 1 file changed, 161 insertions(+), 45 deletions(-) diff --git a/pyxlma/lmalib/io/read.py b/pyxlma/lmalib/io/read.py index 022f570..be1e479 100644 --- a/pyxlma/lmalib/io/read.py +++ b/pyxlma/lmalib/io/read.py @@ -37,6 +37,17 @@ def combine_datasets(lma_data): pyxlma.lmalib.io.cf_netcdf.new_dataset or pyxlma.lmalib.io.read.to_dataset """ + def reorder_stations(dataset, index_mapping): + for var in dataset.data_vars: + if 'number_of_stations' in dataset[var].dims: + if var == 'station_code': + new_station_codes = dataset[var].data.copy() + new_station_codes = new_station_codes[index_mapping] + dataset = dataset.assign(station_code=('number_of_stations', new_station_codes)) + else: + reordered_data = dataset[var].isel(number_of_stations=index_mapping) + dataset[var].data = reordered_data.values + return dataset # Get a list of all the global attributes from each dataset attrs = [d.attrs for d in lma_data] # Create a dict of {attr_name: [list of values from each dataset]} @@ -53,46 +64,153 @@ def combine_datasets(lma_data): final_attrs[k] = tuple(set_of_values)[0] else: final_attrs[k] = '; '.join(attr_vals) - # print(final_attrs) - - # Get just the pure-station variables - lma_station_data = xr.concat( - [d.drop_dims(['number_of_events']) for d in lma_data], - dim='number_of_files' - ) - # print(lma_station_data) - - # Get just the pure-event variables - lma_event_data = xr.concat( - [d.drop_dims(['number_of_stations']).drop( - ['network_center_latitude', - 'network_center_longitude', - 'network_center_altitude',] - ) for d in lma_data], - dim='number_of_events' - ) - # print(lma_event_data) - - # Get any varaibles with joint dimensions, - # i.e., ('number_of_events', 'number_of_stations') - event_contributing_stations = xr.concat( - [d['event_contributing_stations'] for d in lma_data], - dim='number_of_events' - ) - # print(event_contributing_stations) - - # Find the mean of the station data, and then add back the event data - ds = xr.merge([lma_station_data.mean(dim='number_of_files'), - lma_event_data], - compat='equals') - # ... and then restore the contributing stations. - # Note the coordinate variables are being used as labels to ensure data remain aligned. 
- ds['event_contributing_stations'] = event_contributing_stations - - # Restore the global attributes - ds.attrs.update(final_attrs) - - return ds + # Get the station data from the first dataset and assign each station a unique index + lma_data[0]['station_code'] = lma_data[0].number_of_stations + lma_data[0]['number_of_stations'] = np.arange(len(lma_data[0].number_of_stations)) + all_data = lma_data[0] + + # Set up a few arrays to keep track of all known stations + station_networks = all_data.station_network.data.copy() + station_codes = all_data.station_code.data.copy() + station_lats = all_data.station_latitude.data.copy() + station_lons = all_data.station_longitude.data.copy() + station_alts = all_data.station_altitude.data.copy() + + # Check each subsequent dataset for new stations + for new_file_num in range(1, len(lma_data)): + new_file = lma_data[new_file_num] + # Demote station code to a data variable and assign an index to each station in the new file + new_file['station_code'] = new_file.number_of_stations + new_file['number_of_stations'] = np.arange(len(new_file.number_of_stations)) + stations_in_file = [] + # Check each station in the new file against all known stations + for station_num in range(len(new_file.number_of_stations.data)): + station = new_file.isel(number_of_stations=station_num) + matching_networks = station_networks == station.station_network.data + matching_station_code = station_codes == station.station_code.data + matching_lat = station_lats == station.station_latitude.data + matching_lon = station_lons == station.station_longitude.data + matching_alt = station_alts == station.station_altitude.data + # If the lat/lon/alt are the same, then the station is in the same location + matching_loc = matching_lat * matching_lon * matching_alt + matching_id = matching_networks * matching_station_code + matching = matching_id * matching_loc + # check for a single match in the previously-known stations + if sum(matching) == 1: + # station completely matches previous + previous_index = np.argmax(matching) + stations_in_file.append(previous_index) + elif sum(matching_id) == 1: + # station has moved geographically since previous + previous_index = np.argmax(matching_id) + # Update the global list of lat/lon/alts to reflect the move + station_lats[previous_index] = station.station_latitude.data + station_lons[previous_index] = station.station_longitude.data + station_alts[previous_index] = station.station_altitude.data + stations_in_file.append(previous_index) + elif sum(matching_loc) == 1: + # station was renamed since previous + previous_index = np.argmax(matching_loc) + # Update the global list of station identifying information to reflect the rename + station_networks[previous_index] = station.station_network.data + station_codes[previous_index] = station.station_code.data + stations_in_file.append(previous_index) + else: + # station is new + previous_index = -1 + stations_in_file.append(previous_index) + # Find the indices of any newly-discovered stations + indices_of_new_stations = np.where(np.array(stations_in_file) == -1)[0] + # Add the new stations to the known stations + for idx_of_new_station in indices_of_new_stations: + new_station = new_file.isel(number_of_stations=idx_of_new_station) + # The new station is appended to the end of the known stations + new_station_index = len(all_data.number_of_stations) + # Expand all of the previous data to contain the new station. Fill with nan values for all previous times. 
+ all_data = all_data.reindex({'number_of_stations': np.arange(new_station_index+1)}, fill_value=np.nan) + # Update the dimension coordinate to reflect the new station's addition + all_data['number_of_stations'] = ('number_of_stations', np.arange(new_station_index+1)) + # Add the new station's information to the global list of known stations + station_networks = np.append(station_networks, new_station.station_network.data) + station_codes = np.append(station_codes, new_station.station_code.data) + station_lats = np.append(station_lats, new_station.station_latitude.data) + station_lons = np.append(station_lons, new_station.station_longitude.data) + station_alts = np.append(station_alts, new_station.station_altitude.data) + # Update the station index for the new station + stations_in_file[idx_of_new_station] = new_station_index + # Check for any previously-known stations that are no longer in the new file + for old_station_id in all_data.number_of_stations.data: + if old_station_id not in stations_in_file: + # a station has been removed, create a row of nan values for this file to indicate that the station is no longer present + # first create a temporary index for the dead station + temp_station_id = len(new_file.number_of_stations) + # fill the new row with nan values + new_file = new_file.reindex({'number_of_stations': np.arange(temp_station_id+1)}, fill_value=np.nan) + new_file['number_of_stations'] = ('number_of_stations', np.arange(temp_station_id+1)) + # add the dead station to the list of stations from this file so that it is the same length as the all_data dataset + stations_in_file.append(temp_station_id) + # update the mapping of the dead station to the temporary index so that the data can be re-ordered + stations_in_file[old_station_id] = temp_station_id + stations_in_file[temp_station_id] = old_station_id + # Re-order the station data to match the order of the stations in the new file + # This can happen if lma_analysis decides to change the order of the stations, or if new stations are added or removed + new_file = reorder_stations(new_file, stations_in_file) + # Concatenate the new file's station information with the previously-known station information + lma_station_data = xr.concat( + [d.drop_dims(['number_of_events']) for d in [all_data, new_file]], + dim='number_of_files' + ) + # Rebuild the event_contributing_stations array + event_contributing_stations = xr.concat( + [d['event_contributing_stations'] for d in [all_data, new_file]], + dim='number_of_events' + ) + # Attach the event_contributing_stations array to the station dataset + lma_station_data['event_contributing_stations'] = event_contributing_stations + + # Combining the events is a simple concatenation. If only the stations had been this easy... 
+ lma_event_data = xr.concat( + [d.drop_dims(['number_of_stations']).drop_vars( + ['network_center_latitude', + 'network_center_longitude', + 'network_center_altitude', + 'file_start_time'] + ) for d in [all_data, new_file]], + dim='number_of_events' + ) + # merge all the data from this file with the previously-known data + combined_event_data = xr.merge([lma_station_data, lma_event_data]) + all_data = combined_event_data + # Sort the data by file_start_time + all_data = all_data.sortby('file_start_time') + # Update the global attributes + all_data.attrs.update(final_attrs) + # To reduce complexity and resource usage, if the 'number_of_files' dimension is the same for all variables, then the dimension is unnecessary + all_the_same = True + # These variables are expected to change between files, so they are not checked for equality + expected_to_change = ['station_event_fraction', 'station_power_ratio', 'file_start_time'] + for var in all_data.data_vars: + if var in expected_to_change: + continue + if 'number_of_files' in all_data[var].dims: + if (all_data[var] == all_data[var].isel(number_of_files=0)).all(): + pass + else: + all_the_same = False + # If all of the variables are the same for all files, remove the 'number_of_files' dimension. + if all_the_same: + # Some variables need to be averaged + mean_data = all_data.mean(dim='number_of_files') + # ...but most can just be copied over (this is necessary because some are strings which can't be averaged) + all_data = all_data.isel(number_of_files=0) + # the file_start_time variable is not needed in the reduced dataset + all_data = all_data.drop_vars(['file_start_time']) + # replace the copied variables with the averaged variables where necessary + for var in expected_to_change: + if var == 'file_start_time': + continue + all_data[var] = mean_data[var] + return all_data def dataset(filenames, sort_time=True): """ Create an xarray dataset of the type returned by @@ -127,15 +245,13 @@ def dataset(filenames, sort_time=True): # the dimension name, too. resetted = ('number_of_events_', 'number_of_stations_') ds = ds.reset_coords(resetted) - ds = ds.rename({resetted[0]:'event_id', - resetted[1]:'station_code'}) + ds = ds.rename({resetted[0]:'event_id'}) else: # The approach for newer xarray versions requires we explicitly rename only the variables. # The generic "rename" in the block above renames vars, coords, and dims. 
resetted = ('number_of_events', 'number_of_stations') ds = ds.reset_coords(resetted) - ds = ds.rename_vars({resetted[0]:'event_id', - resetted[1]:'station_code'}) + ds = ds.rename_vars({resetted[0]:'event_id'}) if sort_time: ds = ds.sortby('event_time') return ds, starttime @@ -203,7 +319,7 @@ def to_dataset(lma_file, event_id_start=0): ds.attrs['event_algorithm_name'] = lma_file.analysis_program ds.attrs['event_algorithm_version'] = lma_file.analysis_program_version ds.attrs['original_filename'] = path.basename(lma_file.file) - + ds['file_start_time'] = starttime # -- Populate the station mask information -- # int, because NetCDF doesn't have booleans station_mask_bools = np.zeros((N_events, N_stations), dtype='int8') From 043acd80118d620ee9f3c896eb82ddf324c655dc Mon Sep 17 00:00:00 2001 From: Sam Gardner Date: Mon, 1 Jul 2024 01:18:14 -0500 Subject: [PATCH 05/28] preserve event_id == number_of_events (deeplycloudy/xlma-python#43) --- pyxlma/lmalib/io/read.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/pyxlma/lmalib/io/read.py b/pyxlma/lmalib/io/read.py index be1e479..cb5198c 100644 --- a/pyxlma/lmalib/io/read.py +++ b/pyxlma/lmalib/io/read.py @@ -181,8 +181,6 @@ def reorder_stations(dataset, index_mapping): # merge all the data from this file with the previously-known data combined_event_data = xr.merge([lma_station_data, lma_event_data]) all_data = combined_event_data - # Sort the data by file_start_time - all_data = all_data.sortby('file_start_time') # Update the global attributes all_data.attrs.update(final_attrs) # To reduce complexity and resource usage, if the 'number_of_files' dimension is the same for all variables, then the dimension is unnecessary @@ -236,6 +234,10 @@ def dataset(filenames, sort_time=True): except: raise ds = combine_datasets(lma_data) + if sort_time: + ds = ds.sortby('event_time') + if 'file_start_time' in ds.dims: + ds = ds.sortby('file_start_time') ds = ds.reset_index(('number_of_events', 'number_of_stations')) if 'number_of_events_' in ds.coords: # Older xarray versions appended a trailing underscore. reset_coords then dropped @@ -252,8 +254,6 @@ def dataset(filenames, sort_time=True): resetted = ('number_of_events', 'number_of_stations') ds = ds.reset_coords(resetted) ds = ds.rename_vars({resetted[0]:'event_id'}) - if sort_time: - ds = ds.sortby('event_time') return ds, starttime def to_dataset(lma_file, event_id_start=0): From 25385bb5c0a2ea2cd8a1d594ba7da225d16f6798 Mon Sep 17 00:00:00 2001 From: Sam Gardner Date: Mon, 1 Jul 2024 01:35:28 -0500 Subject: [PATCH 06/28] add station_active flag --- pyxlma/lmalib/io/read.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/pyxlma/lmalib/io/read.py b/pyxlma/lmalib/io/read.py index cb5198c..95cf941 100644 --- a/pyxlma/lmalib/io/read.py +++ b/pyxlma/lmalib/io/read.py @@ -279,6 +279,7 @@ def to_dataset(lma_file, event_id_start=0): 'station_altitude':'Alt', 'station_event_fraction':'sources', 'station_power_ratio':'
<P/P_m>
', + 'station_active':'active', } event_mapping = { 'event_latitude':'lat', @@ -335,6 +336,11 @@ def to_dataset(lma_file, event_id_start=0): station_mask_bools[:, i] = lma_data[col] ds['event_contributing_stations'][:] = station_mask_bools + # -- Convert the station_active flag to a psuedo-boolean -- + station_active_data = np.zeros(N_stations, dtype='int8') + station_active_data[ds.station_active.data == b'A'] = 1 + station_active_data[ds.station_active.data == b'NA'] = 0 + ds.station_active.data = station_active_data return ds From 1477a42666f3148dfcdbfc993f6e37a4e8c44a3b Mon Sep 17 00:00:00 2001 From: Sam Gardner Date: Mon, 1 Jul 2024 01:44:43 -0500 Subject: [PATCH 07/28] part 2 to 25385bb, actually commit the cf_netcdf file this time --- pyxlma/lmalib/io/cf_netcdf.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/pyxlma/lmalib/io/cf_netcdf.py b/pyxlma/lmalib/io/cf_netcdf.py index 6ce8d45..01749e4 100644 --- a/pyxlma/lmalib/io/cf_netcdf.py +++ b/pyxlma/lmalib/io/cf_netcdf.py @@ -335,6 +335,11 @@ def new_template_dataset(): 'long_name': "
<P/P_m>
"}, 'dtype': 'float32', }, + 'station_active': {'dims': ('number_of_stations',), + 'attrs': {'_FillValue': np.nan, + 'long_name': "Station active flag"}, + 'dtype': '|S2', + }, }} return __template_dataset From fb0b3abe6a078e6d03e5d4d974230587eaca6aa7 Mon Sep 17 00:00:00 2001 From: Sam Gardner Date: Mon, 1 Jul 2024 02:58:23 -0500 Subject: [PATCH 08/28] fill in known values when stations are offline --- pyxlma/lmalib/io/read.py | 26 ++++++++++++++++++++++++++ 1 file changed, 26 insertions(+) diff --git a/pyxlma/lmalib/io/read.py b/pyxlma/lmalib/io/read.py index 95cf941..5cc89d0 100644 --- a/pyxlma/lmalib/io/read.py +++ b/pyxlma/lmalib/io/read.py @@ -128,6 +128,20 @@ def reorder_stations(dataset, index_mapping): new_station_index = len(all_data.number_of_stations) # Expand all of the previous data to contain the new station. Fill with nan values for all previous times. all_data = all_data.reindex({'number_of_stations': np.arange(new_station_index+1)}, fill_value=np.nan) + for var in all_data.data_vars: + if 'number_of_stations' in all_data[var].dims: + if var in ['station_event_fraction', 'station_power_ratio', 'station_active', 'event_contributing_stations']: + val_to_fill = 0 + else: + val_to_fill = new_station[var].data.item() + if all_data[var].data.dtype != 'object': + filled_data = np.nan_to_num(all_data[var].data, copy=True, nan=val_to_fill) + else: + filled_data = all_data[var].data.copy().astype(new_station[var].data.dtype) + for i in np.ndindex(filled_data.shape): + if filled_data[i] == np.array([np.nan]).astype(new_station[var].data.dtype)[0]: + filled_data[i] = val_to_fill + all_data[var].data = filled_data # Update the dimension coordinate to reflect the new station's addition all_data['number_of_stations'] = ('number_of_stations', np.arange(new_station_index+1)) # Add the new station's information to the global list of known stations @@ -146,6 +160,18 @@ def reorder_stations(dataset, index_mapping): temp_station_id = len(new_file.number_of_stations) # fill the new row with nan values new_file = new_file.reindex({'number_of_stations': np.arange(temp_station_id+1)}, fill_value=np.nan) + for var in new_file.data_vars: + if var == 'number_of_stations': + continue + if 'number_of_stations' in new_file[var].dims: + if var in ['station_event_fraction', 'station_power_ratio', 'station_active', 'event_contributing_stations']: + val_to_fill = 0 + filled_data = np.nan_to_num(new_file[var].data, copy=True, nan=val_to_fill) + new_file[var].data = filled_data + else: + new_file[var].data[temp_station_id] = all_data[var].isel(number_of_stations=old_station_id, number_of_files=-1).data.item() + val_to_fill = all_data[var].isel(number_of_stations=old_station_id).data + val_to_fill = val_to_fill[0] new_file['number_of_stations'] = ('number_of_stations', np.arange(temp_station_id+1)) # add the dead station to the list of stations from this file so that it is the same length as the all_data dataset stations_in_file.append(temp_station_id) From b5555e97b206d09e8be86803876e119070245c5f Mon Sep 17 00:00:00 2001 From: Sam Gardner Date: Fri, 5 Jul 2024 01:53:41 -0500 Subject: [PATCH 09/28] only reduce number_of_files if it exists --- pyxlma/lmalib/io/read.py | 47 ++++++++++++++++++++-------------------- 1 file changed, 24 insertions(+), 23 deletions(-) diff --git a/pyxlma/lmalib/io/read.py b/pyxlma/lmalib/io/read.py index 5cc89d0..88e4194 100644 --- a/pyxlma/lmalib/io/read.py +++ b/pyxlma/lmalib/io/read.py @@ -210,30 +210,31 @@ def reorder_stations(dataset, index_mapping): # Update the global 
attributes all_data.attrs.update(final_attrs) # To reduce complexity and resource usage, if the 'number_of_files' dimension is the same for all variables, then the dimension is unnecessary - all_the_same = True - # These variables are expected to change between files, so they are not checked for equality - expected_to_change = ['station_event_fraction', 'station_power_ratio', 'file_start_time'] - for var in all_data.data_vars: - if var in expected_to_change: - continue - if 'number_of_files' in all_data[var].dims: - if (all_data[var] == all_data[var].isel(number_of_files=0)).all(): - pass - else: - all_the_same = False - # If all of the variables are the same for all files, remove the 'number_of_files' dimension. - if all_the_same: - # Some variables need to be averaged - mean_data = all_data.mean(dim='number_of_files') - # ...but most can just be copied over (this is necessary because some are strings which can't be averaged) - all_data = all_data.isel(number_of_files=0) - # the file_start_time variable is not needed in the reduced dataset - all_data = all_data.drop_vars(['file_start_time']) - # replace the copied variables with the averaged variables where necessary - for var in expected_to_change: - if var == 'file_start_time': + if 'number_of_files' in all_data.dims: + all_the_same = True + # These variables are expected to change between files, so they are not checked for equality + expected_to_change = ['station_event_fraction', 'station_power_ratio', 'file_start_time'] + for var in all_data.data_vars: + if var in expected_to_change: continue - all_data[var] = mean_data[var] + if 'number_of_files' in all_data[var].dims: + if (all_data[var] == all_data[var].isel(number_of_files=0)).all(): + pass + else: + all_the_same = False + # If all of the variables are the same for all files, remove the 'number_of_files' dimension. 
+ if all_the_same: + # Some variables need to be averaged + mean_data = all_data.mean(dim='number_of_files') + # ...but most can just be copied over (this is necessary because some are strings which can't be averaged) + all_data = all_data.isel(number_of_files=0) + # the file_start_time variable is not needed in the reduced dataset + all_data = all_data.drop_vars(['file_start_time']) + # replace the copied variables with the averaged variables where necessary + for var in expected_to_change: + if var == 'file_start_time': + continue + all_data[var] = mean_data[var] return all_data def dataset(filenames, sort_time=True): From d458c2291b7218c6eb60835a7d285b84a30742d8 Mon Sep 17 00:00:00 2001 From: Sam Gardner Date: Wed, 24 Jul 2024 00:33:15 -0500 Subject: [PATCH 10/28] identify unique network configurations instead of individual files --- pyxlma/lmalib/io/read.py | 40 ++++++++++++---------------------------- 1 file changed, 12 insertions(+), 28 deletions(-) diff --git a/pyxlma/lmalib/io/read.py b/pyxlma/lmalib/io/read.py index 7f02c61..45ab04a 100644 --- a/pyxlma/lmalib/io/read.py +++ b/pyxlma/lmalib/io/read.py @@ -169,7 +169,7 @@ def reorder_stations(dataset, index_mapping): filled_data = np.nan_to_num(new_file[var].data, copy=True, nan=val_to_fill) new_file[var].data = filled_data else: - new_file[var].data[temp_station_id] = all_data[var].isel(number_of_stations=old_station_id, number_of_files=-1).data.item() + new_file[var].data[temp_station_id] = all_data[var].isel(number_of_stations=old_station_id, network_configurations=-1).data.item() val_to_fill = all_data[var].isel(number_of_stations=old_station_id).data val_to_fill = val_to_fill[0] new_file['number_of_stations'] = ('number_of_stations', np.arange(temp_station_id+1)) @@ -184,7 +184,7 @@ def reorder_stations(dataset, index_mapping): # Concatenate the new file's station information with the previously-known station information lma_station_data = xr.concat( [d.drop_dims(['number_of_events']) for d in [all_data, new_file]], - dim='number_of_files' + dim='network_configurations' ) # Rebuild the event_contributing_stations array event_contributing_stations = xr.concat( @@ -209,32 +209,16 @@ def reorder_stations(dataset, index_mapping): all_data = combined_event_data # Update the global attributes all_data.attrs.update(final_attrs) - # To reduce complexity and resource usage, if the 'number_of_files' dimension is the same for all variables, then the dimension is unnecessary - if 'number_of_files' in all_data.dims: - all_the_same = True - # These variables are expected to change between files, so they are not checked for equality - expected_to_change = ['station_event_fraction', 'station_power_ratio', 'file_start_time'] - for var in all_data.data_vars: - if var in expected_to_change: - continue - if 'number_of_files' in all_data[var].dims: - if (all_data[var] == all_data[var].isel(number_of_files=0)).all(): - pass - else: - all_the_same = False - # If all of the variables are the same for all files, remove the 'number_of_files' dimension. 
- if all_the_same: - # Some variables need to be averaged - mean_data = all_data.mean(dim='number_of_files') - # ...but most can just be copied over (this is necessary because some are strings which can't be averaged) - all_data = all_data.isel(number_of_files=0) - # the file_start_time variable is not needed in the reduced dataset - all_data = all_data.drop_vars(['file_start_time']) - # replace the copied variables with the averaged variables where necessary - for var in expected_to_change: - if var == 'file_start_time': - continue - all_data[var] = mean_data[var] + # store start and end time and seconds analyzed as attributes, then drop file_start_time and seconds_analyzed dims + all_data.attrs['start_time'] = all_data['file_start_time'].data.min() + all_data = all_data.drop_vars(['file_start_time']) + # somehow this gets set to float64 (probably introduction of nan values above), that's unnecessary, use uint8 + all_data.station_active.data = all_data.station_active.data.astype(np.uint8) + # To reduce complexity and resource usage, if the 'network_configurations' dimension is the same for all variables, then the dimension is unnecessary + if 'network_configurations' in all_data.dims: + # Identify unique network configurations + unique_configs = np.unique(all_data.station_active.data, return_index=True, axis=0)[1] + all_data = all_data.isel(network_configurations=unique_configs) return all_data def dataset(filenames, sort_time=True): From fe6225c351a9d06932ee9ae884cba0968c3c4883 Mon Sep 17 00:00:00 2001 From: Sam Gardner Date: Wed, 24 Jul 2024 01:39:46 -0500 Subject: [PATCH 11/28] start time, end time, and analyzed sec of dataset, warn if mismatch --- pyxlma/lmalib/io/read.py | 32 +++++++++++++++++++++++--------- 1 file changed, 23 insertions(+), 9 deletions(-) diff --git a/pyxlma/lmalib/io/read.py b/pyxlma/lmalib/io/read.py index 45ab04a..1884eee 100644 --- a/pyxlma/lmalib/io/read.py +++ b/pyxlma/lmalib/io/read.py @@ -4,6 +4,7 @@ import gzip import datetime as dt from os import path +import warnings class open_gzip_or_dat: def __init__(self, filename): @@ -199,8 +200,7 @@ def reorder_stations(dataset, index_mapping): [d.drop_dims(['number_of_stations']).drop_vars( ['network_center_latitude', 'network_center_longitude', - 'network_center_altitude', - 'file_start_time'] + 'network_center_altitude'] ) for d in [all_data, new_file]], dim='number_of_events' ) @@ -209,15 +209,14 @@ def reorder_stations(dataset, index_mapping): all_data = combined_event_data # Update the global attributes all_data.attrs.update(final_attrs) - # store start and end time and seconds analyzed as attributes, then drop file_start_time and seconds_analyzed dims - all_data.attrs['start_time'] = all_data['file_start_time'].data.min() - all_data = all_data.drop_vars(['file_start_time']) # somehow this gets set to float64 (probably introduction of nan values above), that's unnecessary, use uint8 all_data.station_active.data = all_data.station_active.data.astype(np.uint8) # To reduce complexity and resource usage, if the 'network_configurations' dimension is the same for all variables, then the dimension is unnecessary if 'network_configurations' in all_data.dims: # Identify unique network configurations unique_configs = np.unique(all_data.station_active.data, return_index=True, axis=0)[1] + if len(unique_configs) == 1: + unique_configs = unique_configs[0] all_data = all_data.isel(network_configurations=unique_configs) return all_data @@ -229,6 +228,8 @@ def dataset(filenames, sort_time=True): filenames = [filenames] 
lma_data = [] starttime = None + analyzed_sec = 0 + endtime = None next_event_id = 0 for filename in filenames: lma_file = lmafile(filename) @@ -236,6 +237,11 @@ def dataset(filenames, sort_time=True): starttime = lma_file.starttime else: starttime = min(lma_file.starttime, starttime) + analyzed_sec += lma_file.analyzed_sec + if endtime is None: + endtime = lma_file.starttime + dt.timedelta(seconds=lma_file.analyzed_sec) + else: + endtime = max(lma_file.starttime + dt.timedelta(seconds=lma_file.analyzed_sec), endtime) # Accounting for empty files try: ds = to_dataset(lma_file, event_id_start=next_event_id).set_index( @@ -245,10 +251,17 @@ def dataset(filenames, sort_time=True): except: raise ds = combine_datasets(lma_data) + ds.attrs['analysis_start_time'] = starttime + ds.attrs['number_of_seconds_analyzed'] = analyzed_sec + ds.attrs['analysis_end_time'] = endtime + if starttime + dt.timedelta(seconds=analyzed_sec) < endtime: + warnings.warn('The number of seconds analyzed does not fully cover the time between the start and end times of the analysis. This may indicate ' + 'incomplete or non-continuous data.') + elif starttime + dt.timedelta(seconds=analyzed_sec) > endtime: + warnings.warn('The start time and end time of the analysis do not match the number of seconds analyzed. If you are combining multiple networks, ' + 'this is normal and expected, otherwise, this may indicate an error in the orignal analysis program.') if sort_time: ds = ds.sortby('event_time') - if 'file_start_time' in ds.dims: - ds = ds.sortby('file_start_time') ds = ds.reset_index(('number_of_events', 'number_of_stations')) if 'number_of_events_' in ds.coords: # Older xarray versions appended a trailing underscore. reset_coords then dropped @@ -331,7 +344,6 @@ def to_dataset(lma_file, event_id_start=0): ds.attrs['event_algorithm_name'] = lma_file.analysis_program ds.attrs['event_algorithm_version'] = lma_file.analysis_program_version ds.attrs['original_filename'] = path.basename(lma_file.file) - ds['file_start_time'] = starttime # -- Populate the station mask information -- # int, because NetCDF doesn't have booleans station_mask_bools = np.zeros((N_events, N_stations), dtype='int8') @@ -496,13 +508,15 @@ def __init__(self,filename): file_created = line.decode().split(':')[1:] self.file_created = ':'.join(file_created)[:-1] if line.startswith(b'Location:'): - self.network_location = ':'.join(line.decode().split(':')[1:])[:-1].replace(' ','') + self.network_location = ':'.join(line.decode().split(':')[1:])[:-1].replace(' ','', 1) if line.startswith(b'Data start time:'): timestring = line.decode().split()[-2:] self.startday = dt.datetime.strptime(timestring[0],'%m/%d/%y') # Full start time and second, likely unneeded self.starttime = dt.datetime.strptime(timestring[0]+timestring[1],'%m/%d/%y%H:%M:%S') # self.startsecond = (starttime-dt.datetime(starttime.year,starttime.month,starttime.day)).seconds + if line.startswith(b'Number of seconds analyzed:'): + self.analyzed_sec = int(line.decode().split(':')[-1]) # Find starting and ending rows for station information if line.startswith(b'Coordinate center'): self.center_lat = float(line.decode().split()[-3]) From 27b3c10c86243a768c63b5875f5bce5903c8e2f1 Mon Sep 17 00:00:00 2001 From: Sam Gardner Date: Wed, 24 Jul 2024 02:50:46 -0500 Subject: [PATCH 12/28] lma dat header (mostly) --- pyxlma/lmalib/io/write.py | 56 ++++++++++++++++++++++++++++++++++++++- 1 file changed, 55 insertions(+), 1 deletion(-) diff --git a/pyxlma/lmalib/io/write.py b/pyxlma/lmalib/io/write.py index 
aa0b8cd..ccaa458 100644 --- a/pyxlma/lmalib/io/write.py +++ b/pyxlma/lmalib/io/write.py @@ -1,4 +1,6 @@ import numpy as np +import datetime as dt +import warnings def dataset(dataset, path): "Write an LMA dataset to a netCDF file" @@ -13,4 +15,56 @@ def dataset(dataset, path): comp = dict(zlib=True, complevel=5) encoding = {var: comp for var in dataset.data_vars} dataset = dataset.chunk('auto') - dataset.to_netcdf(path, encoding=encoding) \ No newline at end of file + dataset.to_netcdf(path, encoding=encoding) + +def lma_dat_file(dataset, path, use_gzip=True): + """ + Write an LMA dataset to a legacy .dat(.gz) file for use in IDL XLMA. + + Parameters + ---------- + dataset : xarray.Dataset + The LMA dataset to write. + path : str + The target path to the file to write. + use_gzip : bool + Whether to compress the file using gzip. + """ + + if 'network_configurations' in dataset.dims: + warnings.warn('Attempting to write dataset with multiple network configurations to a single file. This is highly experimental.\n' + 'Caveats:\n-Location names will be appended and separated by semicolon\n- The center of the network is derived from the mean of the locations of the stations') + center_lon = dataset.station_longitude.data.mean() + center_lat = dataset.station_latitude.data.mean() + center_alt = dataset.station_altitude.data.mean() + num_active = np.sum(np.clip(np.sum(dataset.station_active.data, axis=0), 0, 1)) + else: + center_lon = dataset.network_center_longitude.data.item() + center_lat = dataset.network_center_latitude.data.item() + center_alt = dataset.network_center_altitude.data.item() + num_active = np.sum(dataset.station_active.data) + lma_header = (f'Lightning Mapping Array analyzed data\n' + f'Analysis program: {dataset.attrs["event_algorithm_name"]}\n' + f'Analysis program version: {dataset.attrs["event_algorithm_version"]}\n' + f'File created: {dt.datetime.utcnow().strftime("%a %b %d %H:%M:%S %Y")}\n' + f'Data start time: {dataset.attrs["analysis_start_time"].strftime("%M/%d/%y %H:%M:%S")}\n' + f'Number of seconds analyzed: {dataset.attrs["number_of_seconds_analyzed"]}\n' + f'Location: {"; ".join(np.unique(dataset.station_network.data).astype(str).tolist())}\n' + f'Coordinate center (lat,lon,alt): {center_lat:.7f}, {center_lon:.7f}, {center_alt:.2f}\n' + f'Coordinate frame: cartesian\n' + # f'Maximum diameter of LMA (km): ' + # f'Maximum light-time across LMA (ns): ' + f'Number of stations: {dataset.sizes['number_of_stations']}' + f'Number of active stations: {num_active}\n' + # f'Active stations:' + # f'Minimum number of stations per solution: {}\n' + # f'Maximum reduced chi-squared: {}\n' + # f'Maximum number of chi-squared iterations: {}\n' + ### STATION INFO TABLE + ### STATION DATA TABLE + f'Metric file version: 4\n' + f'Data: time (UT sec of day), lat, lon, alt(m), reduced chi^2, P(dBW), mask\n' + f'Data format: 15.9f 12.8f 13.8f 9.2f 6.2f 5.1f 7x\n' + f'Number of events: {dataset.sizes["number_of_events"]}\n' + f'*** data ***\n' + ) \ No newline at end of file From 3a4c1b33b08394184a325271a003338ea5aec1d0 Mon Sep 17 00:00:00 2001 From: Sam Gardner Date: Sun, 28 Jul 2024 00:45:57 -0500 Subject: [PATCH 13/28] make station event fraction a percentage as described by metadata --- pyxlma/lmalib/io/read.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/pyxlma/lmalib/io/read.py b/pyxlma/lmalib/io/read.py index 1884eee..e0a0a0c 100644 --- a/pyxlma/lmalib/io/read.py +++ b/pyxlma/lmalib/io/read.py @@ -364,6 +364,10 @@ def to_dataset(lma_file, event_id_start=0): 
station_active_data[ds.station_active.data == b'A'] = 1 station_active_data[ds.station_active.data == b'NA'] = 0 ds.station_active.data = station_active_data + + # Convert station event count to station event fraction + if N_events != 0: + ds.station_event_fraction.data = 100*ds.station_event_fraction.data/N_events return ds From 32e90d706942bc22931f11877c52e02770d010c1 Mon Sep 17 00:00:00 2001 From: Sam Gardner Date: Sun, 28 Jul 2024 00:47:10 -0500 Subject: [PATCH 14/28] track data coverage across network configs and gaps in coverage --- pyxlma/lmalib/io/read.py | 260 ++++++++++++++++++--------------------- 1 file changed, 122 insertions(+), 138 deletions(-) diff --git a/pyxlma/lmalib/io/read.py b/pyxlma/lmalib/io/read.py index e0a0a0c..09395ae 100644 --- a/pyxlma/lmalib/io/read.py +++ b/pyxlma/lmalib/io/read.py @@ -4,7 +4,8 @@ import gzip import datetime as dt from os import path -import warnings +import collections +from pyxlma.lmalib.io.cf_netcdf import new_dataset class open_gzip_or_dat: def __init__(self, filename): @@ -59,6 +60,8 @@ def reorder_stations(dataset, index_mapping): } final_attrs = {} for k in all_attrs: + if k.startswith('analysis_') and k.endswith('_time'): + continue attr_vals = all_attrs[k] set_of_values = set(attr_vals) if len(set_of_values) == 1: @@ -68,15 +71,17 @@ def reorder_stations(dataset, index_mapping): # Get the station data from the first dataset and assign each station a unique index lma_data[0]['station_code'] = lma_data[0].number_of_stations lma_data[0]['number_of_stations'] = np.arange(len(lma_data[0].number_of_stations)) + lma_data[0].attrs['config_times'] = [[[lma_data[0].attrs['analysis_start_time'], lma_data[0].attrs['analysis_end_time']]]] all_data = lma_data[0] - - # Set up a few arrays to keep track of all known stations - station_networks = all_data.station_network.data.copy() - station_codes = all_data.station_code.data.copy() - station_lats = all_data.station_latitude.data.copy() - station_lons = all_data.station_longitude.data.copy() - station_alts = all_data.station_altitude.data.copy() - + # Get the attributes attached to each variable in the dataset + dv_attrs = {} + new_ds = new_dataset(1, 1, 1) + for var in new_ds.data_vars: + dv_attrs[var] = new_ds[var].attrs + # Define list of 'properties', things which identify a station and are not expected to change + property_vars = ('station_latitude', 'station_longitude', 'station_altitude', 'station_code', 'station_network') + # Define list of variables to be recalculated for each station after the datasets are combined + recalc_vars = ('station_event_fraction', 'station_power_ratio', 'network_center_latitude', 'network_center_longitude', 'network_center_altitude') # Check each subsequent dataset for new stations for new_file_num in range(1, len(lma_data)): new_file = lma_data[new_file_num] @@ -85,41 +90,26 @@ def reorder_stations(dataset, index_mapping): new_file['number_of_stations'] = np.arange(len(new_file.number_of_stations)) stations_in_file = [] # Check each station in the new file against all known stations + station_is_new = True + old_ids_to_check_for_missing = all_data.number_of_stations.data.tolist() for station_num in range(len(new_file.number_of_stations.data)): station = new_file.isel(number_of_stations=station_num) - matching_networks = station_networks == station.station_network.data - matching_station_code = station_codes == station.station_code.data - matching_lat = station_lats == station.station_latitude.data - matching_lon = station_lons == station.station_longitude.data 
- matching_alt = station_alts == station.station_altitude.data - # If the lat/lon/alt are the same, then the station is in the same location - matching_loc = matching_lat * matching_lon * matching_alt - matching_id = matching_networks * matching_station_code - matching = matching_id * matching_loc - # check for a single match in the previously-known stations - if sum(matching) == 1: - # station completely matches previous - previous_index = np.argmax(matching) - stations_in_file.append(previous_index) - elif sum(matching_id) == 1: - # station has moved geographically since previous - previous_index = np.argmax(matching_id) - # Update the global list of lat/lon/alts to reflect the move - station_lats[previous_index] = station.station_latitude.data - station_lons[previous_index] = station.station_longitude.data - station_alts[previous_index] = station.station_altitude.data - stations_in_file.append(previous_index) - elif sum(matching_loc) == 1: - # station was renamed since previous - previous_index = np.argmax(matching_loc) - # Update the global list of station identifying information to reflect the rename - station_networks[previous_index] = station.station_network.data - station_codes[previous_index] = station.station_code.data - stations_in_file.append(previous_index) - else: - # station is new - previous_index = -1 - stations_in_file.append(previous_index) + old_ids_to_search = collections.deque(range(len(all_data.number_of_stations.data))) + old_ids_to_search.rotate(-1*station_num) # start with the same index because stations USUALLY come in the same order + for old_station_num in old_ids_to_search: + old_station = all_data.isel(number_of_stations=old_station_num) + all_props_match = True + for prop in property_vars: + if station.data_vars[prop].data.item() != old_station.data_vars[prop].data.item(): + all_props_match = False + break + if all_props_match: + station_is_new = False + stations_in_file.append(old_station_num) + old_ids_to_check_for_missing.remove(old_station_num) + break + if station_is_new: + stations_in_file.append(-1) # Find the indices of any newly-discovered stations indices_of_new_stations = np.where(np.array(stations_in_file) == -1)[0] # Add the new stations to the known stations @@ -127,66 +117,50 @@ def reorder_stations(dataset, index_mapping): new_station = new_file.isel(number_of_stations=idx_of_new_station) # The new station is appended to the end of the known stations new_station_index = len(all_data.number_of_stations) - # Expand all of the previous data to contain the new station. Fill with nan values for all previous times. - all_data = all_data.reindex({'number_of_stations': np.arange(new_station_index+1)}, fill_value=np.nan) - for var in all_data.data_vars: + # Expand all of the previous data to contain the new station. Fill with station properties. 
+ fill_vals_dict = {} + for var in new_station.data_vars: if 'number_of_stations' in all_data[var].dims: - if var in ['station_event_fraction', 'station_power_ratio', 'station_active', 'event_contributing_stations']: - val_to_fill = 0 - else: - val_to_fill = new_station[var].data.item() - if all_data[var].data.dtype != 'object': - filled_data = np.nan_to_num(all_data[var].data, copy=True, nan=val_to_fill) + if var in property_vars: + fill_vals_dict[var] = new_station[var].data.item() else: - filled_data = all_data[var].data.copy().astype(new_station[var].data.dtype) - for i in np.ndindex(filled_data.shape): - if filled_data[i] == np.array([np.nan]).astype(new_station[var].data.dtype)[0]: - filled_data[i] = val_to_fill - all_data[var].data = filled_data - # Update the dimension coordinate to reflect the new station's addition - all_data['number_of_stations'] = ('number_of_stations', np.arange(new_station_index+1)) - # Add the new station's information to the global list of known stations - station_networks = np.append(station_networks, new_station.station_network.data) - station_codes = np.append(station_codes, new_station.station_code.data) - station_lats = np.append(station_lats, new_station.station_latitude.data) - station_lons = np.append(station_lons, new_station.station_longitude.data) - station_alts = np.append(station_alts, new_station.station_altitude.data) + fill_vals_dict[var] = 0 + all_data = all_data.reindex({'number_of_stations': np.arange(new_station_index+1)}, fill_value=fill_vals_dict) # Update the station index for the new station stations_in_file[idx_of_new_station] = new_station_index # Check for any previously-known stations that are no longer in the new file - for old_station_id in all_data.number_of_stations.data: - if old_station_id not in stations_in_file: - # a station has been removed, create a row of nan values for this file to indicate that the station is no longer present - # first create a temporary index for the dead station - temp_station_id = len(new_file.number_of_stations) - # fill the new row with nan values - new_file = new_file.reindex({'number_of_stations': np.arange(temp_station_id+1)}, fill_value=np.nan) - for var in new_file.data_vars: - if var == 'number_of_stations': - continue - if 'number_of_stations' in new_file[var].dims: - if var in ['station_event_fraction', 'station_power_ratio', 'station_active', 'event_contributing_stations']: - val_to_fill = 0 - filled_data = np.nan_to_num(new_file[var].data, copy=True, nan=val_to_fill) - new_file[var].data = filled_data - else: - new_file[var].data[temp_station_id] = all_data[var].isel(number_of_stations=old_station_id, network_configurations=-1).data.item() - val_to_fill = all_data[var].isel(number_of_stations=old_station_id).data - val_to_fill = val_to_fill[0] - new_file['number_of_stations'] = ('number_of_stations', np.arange(temp_station_id+1)) - # add the dead station to the list of stations from this file so that it is the same length as the all_data dataset - stations_in_file.append(temp_station_id) - # update the mapping of the dead station to the temporary index so that the data can be re-ordered - stations_in_file[old_station_id] = temp_station_id - stations_in_file[temp_station_id] = old_station_id + for missing_station_id in old_ids_to_check_for_missing: + dead_station_data = all_data.isel(number_of_stations=missing_station_id) + # a station has been removed, create a row of nan values for this file to indicate that the station is no longer present + # first create a temporary index for the 
dead station + temp_station_id = len(new_file.number_of_stations) + fill_vals_dict = {} + for var in dead_station_data.data_vars: + if 'number_of_stations' in all_data[var].dims: + if var in property_vars: + fill_vals_dict[var] = dead_station_data[var].data.item() + else: + fill_vals_dict[var] = 0 + new_file = new_file.reindex({'number_of_stations': np.arange(temp_station_id+1)}, fill_value=fill_vals_dict) + # Update the station index for the dead station + stations_in_file.append(missing_station_id) # Re-order the station data to match the order of the stations in the new file # This can happen if lma_analysis decides to change the order of the stations, or if new stations are added or removed - new_file = reorder_stations(new_file, stations_in_file) + new_file = reorder_stations(new_file, np.argsort(stations_in_file)) # Concatenate the new file's station information with the previously-known station information - lma_station_data = xr.concat( - [d.drop_dims(['number_of_events']) for d in [all_data, new_file]], - dim='network_configurations' - ) + station_to_merge = [d.drop_dims(['number_of_events']) for d in [all_data, new_file]] + lma_station_data = xr.concat(station_to_merge, dim='network_configurations').drop_vars( + ['network_center_latitude', 'network_center_longitude', 'network_center_altitude']) + for var_name in lma_station_data.data_vars: + da = lma_station_data[var_name] + if 'network_configurations' in da.dims: + # black magic from openAI that somehow determines if a data array is identical across the 'network_configurations' variable + if np.all((da == da.isel({'network_configurations': 0})).all('network_configurations')) or var_name in recalc_vars: + # Remove the 'network_configurations' dimension if the data is identical across it + lma_station_data[var_name] = da.isel(network_configurations=0) + if 'network_configurations' in lma_station_data.dims: + unique_netw_configs = np.unique(lma_station_data.station_active.data, return_index=True, axis=0)[1] + lma_station_data = lma_station_data.isel(network_configurations=sorted(unique_netw_configs)) # Rebuild the event_contributing_stations array event_contributing_stations = xr.concat( [d['event_contributing_stations'] for d in [all_data, new_file]], @@ -197,27 +171,50 @@ def reorder_stations(dataset, index_mapping): # Combining the events is a simple concatenation. If only the stations had been this easy... 
lma_event_data = xr.concat( - [d.drop_dims(['number_of_stations']).drop_vars( - ['network_center_latitude', - 'network_center_longitude', - 'network_center_altitude'] - ) for d in [all_data, new_file]], + [d.drop_dims(['number_of_stations']) for d in [all_data, new_file]], dim='number_of_events' ) # merge all the data from this file with the previously-known data combined_event_data = xr.merge([lma_station_data, lma_event_data]) all_data = combined_event_data + # Log continuous intervals of data coverage + if 'network_configurations' in all_data.dims: + this_netw_conf = np.where((all_data.station_active.data == new_file.station_active.data).all(axis=1))[0][0] + else: + this_netw_conf = 0 + if len(all_data.attrs['config_times']) > this_netw_conf: + times_of_this_config = all_data.attrs['config_times'][this_netw_conf] + for gap in times_of_this_config: + if new_file.attrs['analysis_start_time'] == gap[1]: + gap[1] = new_file.attrs['analysis_end_time'] + break + elif new_file.attrs['analysis_end_time'] == gap[0]: + gap[0] = new_file.attrs['analysis_start_time'] + break + else: + times_of_this_config.append([new_file.attrs['analysis_start_time'], new_file.attrs['analysis_end_time']]) + else: + all_data.attrs['config_times'].append([[new_file.attrs['analysis_start_time'], new_file.attrs['analysis_end_time']]]) + all_data.attrs['analysis_start_time'] = min(all_data.attrs['analysis_start_time'], new_file.attrs['analysis_start_time']) + all_data.attrs['analysis_end_time'] = max(all_data.attrs['analysis_end_time'], new_file.attrs['analysis_end_time']) # Update the global attributes all_data.attrs.update(final_attrs) - # somehow this gets set to float64 (probably introduction of nan values above), that's unnecessary, use uint8 - all_data.station_active.data = all_data.station_active.data.astype(np.uint8) # To reduce complexity and resource usage, if the 'network_configurations' dimension is the same for all variables, then the dimension is unnecessary if 'network_configurations' in all_data.dims: # Identify unique network configurations unique_configs = np.unique(all_data.station_active.data, return_index=True, axis=0)[1] if len(unique_configs) == 1: unique_configs = unique_configs[0] - all_data = all_data.isel(network_configurations=unique_configs) + # recalculate variables that depend on the station data + all_data['network_center_longitude'] = all_data.station_longitude.mean(dim='number_of_stations') + all_data['network_center_latitude'] = all_data.station_latitude.mean(dim='number_of_stations') + all_data['network_center_altitude'] = all_data.station_altitude.mean(dim='number_of_stations') + all_data['station_event_fraction'] = 100*all_data.event_contributing_stations.sum(dim='number_of_events')/all_data.number_of_events.shape[0] + all_data['station_power_ratio'] = (all_data.event_contributing_stations * all_data.event_power).sum(dim='number_of_events')/all_data.event_power.sum(dim='number_of_events') + # restore previously cached data var attributes + for var_name in all_data.data_vars: + if var_name in dv_attrs: + all_data[var_name].attrs = dv_attrs[var_name] return all_data def dataset(filenames, sort_time=True): @@ -228,55 +225,42 @@ def dataset(filenames, sort_time=True): filenames = [filenames] lma_data = [] starttime = None - analyzed_sec = 0 - endtime = None + if sort_time: + all_starttimes = [] next_event_id = 0 - for filename in filenames: + for filename in sorted(filenames): lma_file = lmafile(filename) if starttime is None: starttime = lma_file.starttime else: starttime = 
min(lma_file.starttime, starttime) - analyzed_sec += lma_file.analyzed_sec - if endtime is None: - endtime = lma_file.starttime + dt.timedelta(seconds=lma_file.analyzed_sec) - else: - endtime = max(lma_file.starttime + dt.timedelta(seconds=lma_file.analyzed_sec), endtime) - # Accounting for empty files - try: - ds = to_dataset(lma_file, event_id_start=next_event_id).set_index( - {'number_of_stations':'station_code', 'number_of_events':'event_id'}) - lma_data.append(ds) - next_event_id += ds.sizes['number_of_events'] - except: - raise + ds = to_dataset(lma_file, event_id_start=next_event_id).set_index( + {'number_of_stations':'station_code', 'number_of_events':'event_id'}) + ds.attrs['analysis_start_time'] = lma_file.starttime + ds.attrs['analysis_end_time'] = lma_file.starttime + dt.timedelta(seconds=lma_file.analyzed_sec) + lma_data.append(ds) + next_event_id += ds.sizes['number_of_events'] + if sort_time: + all_starttimes.append(lma_file.starttime) + if sort_time: + sorting = np.argsort(all_starttimes) + lma_data = [lma_data[i] for i in sorting] ds = combine_datasets(lma_data) - ds.attrs['analysis_start_time'] = starttime - ds.attrs['number_of_seconds_analyzed'] = analyzed_sec - ds.attrs['analysis_end_time'] = endtime - if starttime + dt.timedelta(seconds=analyzed_sec) < endtime: - warnings.warn('The number of seconds analyzed does not fully cover the time between the start and end times of the analysis. This may indicate ' - 'incomplete or non-continuous data.') - elif starttime + dt.timedelta(seconds=analyzed_sec) > endtime: - warnings.warn('The start time and end time of the analysis do not match the number of seconds analyzed. If you are combining multiple networks, ' - 'this is normal and expected, otherwise, this may indicate an error in the orignal analysis program.') if sort_time: ds = ds.sortby('event_time') - ds = ds.reset_index(('number_of_events', 'number_of_stations')) + ds = ds.reset_index(('number_of_events')) if 'number_of_events_' in ds.coords: - # Older xarray versions appended a trailing underscore. reset_coords then dropped - # converted the coordinate variables into regular variables while not touching - # the original dimension name, allowing us to rename. In newer versions, the variable - # names are never changed at the reset_index step, so the renaming step modifies - # the dimension name, too. + # # Older xarray versions appended a trailing underscore. reset_coords then dropped + # # converted the coordinate variables into regular variables while not touching + # # the original dimension name, allowing us to rename. In newer versions, the variable + # # names are never changed at the reset_index step, so the renaming step modifies + # # the dimension name, too. resetted = ('number_of_events_', 'number_of_stations_') - ds = ds.reset_coords(resetted) ds = ds.rename({resetted[0]:'event_id'}) else: - # The approach for newer xarray versions requires we explicitly rename only the variables. - # The generic "rename" in the block above renames vars, coords, and dims. + # # The approach for newer xarray versions requires we explicitly rename only the variables. + # # The generic "rename" in the block above renames vars, coords, and dims. 
resetted = ('number_of_events', 'number_of_stations') - ds = ds.reset_coords(resetted) ds = ds.rename_vars({resetted[0]:'event_id'}) return ds, starttime @@ -286,7 +270,7 @@ def to_dataset(lma_file, event_id_start=0): returns an xarray dataset of the type returned by pyxlma.lmalib.io.cf_netcdf.new_dataset """ - from pyxlma.lmalib.io.cf_netcdf import new_dataset + lma_data = lma_file.readfile() starttime = lma_file.starttime stations = lma_file.stations From caaf6073bbb482ea9886880aaad050f0b7483063 Mon Sep 17 00:00:00 2001 From: Sam Gardner Date: Sun, 28 Jul 2024 01:11:11 -0500 Subject: [PATCH 15/28] use template cf dataset to restore metadata when combining #11 --- pyxlma/lmalib/io/read.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pyxlma/lmalib/io/read.py b/pyxlma/lmalib/io/read.py index 09395ae..aa4058c 100644 --- a/pyxlma/lmalib/io/read.py +++ b/pyxlma/lmalib/io/read.py @@ -5,7 +5,7 @@ import datetime as dt from os import path import collections -from pyxlma.lmalib.io.cf_netcdf import new_dataset +from pyxlma.lmalib.io.cf_netcdf import new_dataset, new_template_dataset class open_gzip_or_dat: def __init__(self, filename): @@ -75,7 +75,7 @@ def reorder_stations(dataset, index_mapping): all_data = lma_data[0] # Get the attributes attached to each variable in the dataset dv_attrs = {} - new_ds = new_dataset(1, 1, 1) + new_ds = new_template_dataset() for var in new_ds.data_vars: dv_attrs[var] = new_ds[var].attrs # Define list of 'properties', things which identify a station and are not expected to change From 81502fdb3ba60825242cb7656017e20e68c5c2ad Mon Sep 17 00:00:00 2001 From: Sam Gardner Date: Sun, 28 Jul 2024 01:12:03 -0500 Subject: [PATCH 16/28] fix cf comparison --- pyxlma/lmalib/io/cf_netcdf.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyxlma/lmalib/io/cf_netcdf.py b/pyxlma/lmalib/io/cf_netcdf.py index 01749e4..b192e71 100644 --- a/pyxlma/lmalib/io/cf_netcdf.py +++ b/pyxlma/lmalib/io/cf_netcdf.py @@ -466,7 +466,7 @@ def compare_attributes(ds): """ Compare the attributes of all data variables in ds to the CF spec and print a report of any differences. 
""" - dst = __template_dataset['data_vars'] + dst = new_template_dataset()['data_vars'] dsd = ds.to_dict()['data_vars'] for k in dsd.keys(): if dst[k]['attrs'] != dsd[k]['attrs']: From 9491cc2c029ed329e63f21b40312bc55458fe4cf Mon Sep 17 00:00:00 2001 From: Sam Gardner Date: Sun, 28 Jul 2024 01:16:13 -0500 Subject: [PATCH 17/28] that's not an xarray yet --- pyxlma/lmalib/io/read.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/pyxlma/lmalib/io/read.py b/pyxlma/lmalib/io/read.py index aa4058c..c3e22e9 100644 --- a/pyxlma/lmalib/io/read.py +++ b/pyxlma/lmalib/io/read.py @@ -76,8 +76,9 @@ def reorder_stations(dataset, index_mapping): # Get the attributes attached to each variable in the dataset dv_attrs = {} new_ds = new_template_dataset() - for var in new_ds.data_vars: - dv_attrs[var] = new_ds[var].attrs + print(new_ds) + for var in new_ds['data_vars']: + dv_attrs[var] = new_ds['data_vars'][var]['attrs'] # Define list of 'properties', things which identify a station and are not expected to change property_vars = ('station_latitude', 'station_longitude', 'station_altitude', 'station_code', 'station_network') # Define list of variables to be recalculated for each station after the datasets are combined From dffb0fb6c3fa4bc598776af58556643298c3798f Mon Sep 17 00:00:00 2001 From: Sam Gardner Date: Sun, 28 Jul 2024 01:44:30 -0500 Subject: [PATCH 18/28] recalculate station mask after combine --- pyxlma/lmalib/io/read.py | 18 ++++++++++-------- 1 file changed, 10 insertions(+), 8 deletions(-) diff --git a/pyxlma/lmalib/io/read.py b/pyxlma/lmalib/io/read.py index c3e22e9..ef6448b 100644 --- a/pyxlma/lmalib/io/read.py +++ b/pyxlma/lmalib/io/read.py @@ -76,7 +76,6 @@ def reorder_stations(dataset, index_mapping): # Get the attributes attached to each variable in the dataset dv_attrs = {} new_ds = new_template_dataset() - print(new_ds) for var in new_ds['data_vars']: dv_attrs[var] = new_ds['data_vars'][var]['attrs'] # Define list of 'properties', things which identify a station and are not expected to change @@ -212,6 +211,9 @@ def reorder_stations(dataset, index_mapping): all_data['network_center_altitude'] = all_data.station_altitude.mean(dim='number_of_stations') all_data['station_event_fraction'] = 100*all_data.event_contributing_stations.sum(dim='number_of_events')/all_data.number_of_events.shape[0] all_data['station_power_ratio'] = (all_data.event_contributing_stations * all_data.event_power).sum(dim='number_of_events')/all_data.event_power.sum(dim='number_of_events') + mask_strings = np.apply_along_axis(lambda x: ''.join(x), 1, all_data.event_contributing_stations.astype(str).data) # convert each event's contributing stations (T/F) to binary + all_data.event_mask.data = np.vectorize(lambda x: int(x, 2))(mask_strings) # convert bin to dec + all_data.attrs['station_mask_order'] = np.apply_along_axis(lambda x: ''.join(x), 0, all_data.station_code.astype(str).data).item() # restore previously cached data var attributes for var_name in all_data.data_vars: if var_name in dv_attrs: @@ -251,16 +253,16 @@ def dataset(filenames, sort_time=True): ds = ds.sortby('event_time') ds = ds.reset_index(('number_of_events')) if 'number_of_events_' in ds.coords: - # # Older xarray versions appended a trailing underscore. reset_coords then dropped - # # converted the coordinate variables into regular variables while not touching - # # the original dimension name, allowing us to rename. 
In newer versions, the variable - # # names are never changed at the reset_index step, so the renaming step modifies - # # the dimension name, too. + # Older xarray versions appended a trailing underscore. reset_coords then dropped + # converted the coordinate variables into regular variables while not touching + # the original dimension name, allowing us to rename. In newer versions, the variable + # names are never changed at the reset_index step, so the renaming step modifies + # the dimension name, too. resetted = ('number_of_events_', 'number_of_stations_') ds = ds.rename({resetted[0]:'event_id'}) else: - # # The approach for newer xarray versions requires we explicitly rename only the variables. - # # The generic "rename" in the block above renames vars, coords, and dims. + # The approach for newer xarray versions requires we explicitly rename only the variables. + # The generic "rename" in the block above renames vars, coords, and dims. resetted = ('number_of_events', 'number_of_stations') ds = ds.rename_vars({resetted[0]:'event_id'}) return ds, starttime From 8ad6f0976f2a44a09ff8be8855a63185662e288d Mon Sep 17 00:00:00 2001 From: Sam Gardner Date: Sun, 28 Jul 2024 03:42:22 -0500 Subject: [PATCH 19/28] try to make station IDs unique, fix writing, dont recalc center unless necessary --- pyxlma/lmalib/io/read.py | 64 +++++++++++++++++++++++++++++++++++---- pyxlma/lmalib/io/write.py | 4 +++ 2 files changed, 62 insertions(+), 6 deletions(-) diff --git a/pyxlma/lmalib/io/read.py b/pyxlma/lmalib/io/read.py index ef6448b..1e92531 100644 --- a/pyxlma/lmalib/io/read.py +++ b/pyxlma/lmalib/io/read.py @@ -5,6 +5,8 @@ import datetime as dt from os import path import collections +import string +import warnings from pyxlma.lmalib.io.cf_netcdf import new_dataset, new_template_dataset class open_gzip_or_dat: @@ -82,9 +84,15 @@ def reorder_stations(dataset, index_mapping): property_vars = ('station_latitude', 'station_longitude', 'station_altitude', 'station_code', 'station_network') # Define list of variables to be recalculated for each station after the datasets are combined recalc_vars = ('station_event_fraction', 'station_power_ratio', 'network_center_latitude', 'network_center_longitude', 'network_center_altitude') + # Will be set to True if network_center location needs to be recalculated + recalc_center = False # Check each subsequent dataset for new stations for new_file_num in range(1, len(lma_data)): new_file = lma_data[new_file_num] + if (np.all(new_file.network_center_latitude.data != all_data.network_center_latitude.data) + or np.all(new_file.network_center_longitude.data.item() != all_data.network_center_longitude.data) + or np.all(new_file.network_center_altitude.data != all_data.network_center_altitude.data)): + recalc_center = True # Demote station code to a data variable and assign an index to each station in the new file new_file['station_code'] = new_file.number_of_stations new_file['number_of_stations'] = np.arange(len(new_file.number_of_stations)) @@ -197,6 +205,9 @@ def reorder_stations(dataset, index_mapping): all_data.attrs['config_times'].append([[new_file.attrs['analysis_start_time'], new_file.attrs['analysis_end_time']]]) all_data.attrs['analysis_start_time'] = min(all_data.attrs['analysis_start_time'], new_file.attrs['analysis_start_time']) all_data.attrs['analysis_end_time'] = max(all_data.attrs['analysis_end_time'], new_file.attrs['analysis_end_time']) + all_data['network_center_longitude'] = all_data.network_center_longitude.isel(number_of_events=0) + 
all_data['network_center_latitude'] = all_data.network_center_latitude.isel(number_of_events=0) + all_data['network_center_altitude'] = all_data.network_center_altitude.isel(number_of_events=0) # Update the global attributes all_data.attrs.update(final_attrs) # To reduce complexity and resource usage, if the 'network_configurations' dimension is the same for all variables, then the dimension is unnecessary @@ -205,12 +216,52 @@ def reorder_stations(dataset, index_mapping): unique_configs = np.unique(all_data.station_active.data, return_index=True, axis=0)[1] if len(unique_configs) == 1: unique_configs = unique_configs[0] + else: + all_data['station_event_fraction'] = 100*all_data.event_contributing_stations.sum(dim='number_of_events')/all_data.number_of_events.shape[0] + all_data['station_power_ratio'] = (all_data.event_contributing_stations * all_data.event_power).sum(dim='number_of_events')/all_data.event_power.sum(dim='number_of_events') # recalculate variables that depend on the station data - all_data['network_center_longitude'] = all_data.station_longitude.mean(dim='number_of_stations') - all_data['network_center_latitude'] = all_data.station_latitude.mean(dim='number_of_stations') - all_data['network_center_altitude'] = all_data.station_altitude.mean(dim='number_of_stations') - all_data['station_event_fraction'] = 100*all_data.event_contributing_stations.sum(dim='number_of_events')/all_data.number_of_events.shape[0] - all_data['station_power_ratio'] = (all_data.event_contributing_stations * all_data.event_power).sum(dim='number_of_events')/all_data.event_power.sum(dim='number_of_events') + if recalc_center: + all_data['network_center_longitude'] = all_data.station_longitude.mean(dim='number_of_stations') + all_data['network_center_latitude'] = all_data.station_latitude.mean(dim='number_of_stations') + all_data['network_center_altitude'] = all_data.station_altitude.mean(dim='number_of_stations') + # Make sure all station codes are unique + if np.max(np.unique_counts(all_data.station_code.data).counts) > 1: + unique_stat = [] + the_alphabet = string.ascii_letters + for i in range(len(all_data.station_code.data)): + stat = all_data.station_code.data[i] + if stat in unique_stat: + warnings.warn(f'Conflicting station code discovered at index {i}') + stat_str = stat.decode('utf-8') + if stat_str.isupper(): + lowc = stat_str.lower() + if bytes(lowc, 'utf-8') not in unique_stat: + warnings.warn(f'Renaming station {stat_str} to {lowc}') + unique_stat.append(bytes(lowc, 'utf-8')) + else: + for new_letter in the_alphabet: + if bytes(new_letter, 'utf-8') not in unique_stat: + warnings.warn(f'Renaming station {stat_str} to {new_letter}') + unique_stat.append(bytes(new_letter, 'utf-8')) + break + else: + warnings.warn(f'Assigning station a blank ID. This station will not be included in files generated by pyxlma.lmalib.io.write.lma_dat_file') + elif stat_str.islower(): + upc = stat_str.upper() + if bytes(upc, 'utf-8') not in unique_stat: + warnings.warn(f'Renaming station {stat_str} to {upc}') + unique_stat.append(bytes(upc, 'utf-8')) + else: + for new_letter in the_alphabet: + if bytes(new_letter, 'utf-8') not in unique_stat: + warnings.warn(f'Renaming station {stat_str} to {new_letter}') + unique_stat.append(bytes(new_letter, 'utf-8')) + break + else: + warnings.warn(f'Assigning station a blank ID. 
This station will not be included in files generated by pyxlma.lmalib.io.write.lma_dat_file') + else: + unique_stat.append(stat) + all_data.station_code.data = np.array(unique_stat, dtype='S1') mask_strings = np.apply_along_axis(lambda x: ''.join(x), 1, all_data.event_contributing_stations.astype(str).data) # convert each event's contributing stations (T/F) to binary all_data.event_mask.data = np.vectorize(lambda x: int(x, 2))(mask_strings) # convert bin to dec all_data.attrs['station_mask_order'] = np.apply_along_axis(lambda x: ''.join(x), 0, all_data.station_code.astype(str).data).item() @@ -218,6 +269,7 @@ def reorder_stations(dataset, index_mapping): for var_name in all_data.data_vars: if var_name in dv_attrs: all_data[var_name].attrs = dv_attrs[var_name] + all_data.station_active.attrs['_FillValue'] = 255 return all_data def dataset(filenames, sort_time=True): @@ -347,7 +399,7 @@ def to_dataset(lma_file, event_id_start=0): ds['event_contributing_stations'][:] = station_mask_bools # -- Convert the station_active flag to a psuedo-boolean -- - station_active_data = np.zeros(N_stations, dtype='int8') + station_active_data = np.zeros(N_stations, dtype='uint8') station_active_data[ds.station_active.data == b'A'] = 1 station_active_data[ds.station_active.data == b'NA'] = 0 ds.station_active.data = station_active_data diff --git a/pyxlma/lmalib/io/write.py b/pyxlma/lmalib/io/write.py index ccaa458..4ba94dc 100644 --- a/pyxlma/lmalib/io/write.py +++ b/pyxlma/lmalib/io/write.py @@ -12,6 +12,10 @@ def dataset(dataset, path): for var in dataset.data_vars: if np.all(dataset[var].data == np.nan): dataset = dataset.drop_vars(var) + for attr in dataset.attrs: + if type(dataset.attrs[attr]) == dt.datetime: + dataset.attrs[attr] = dataset.attrs[attr].strftime('%Y-%m-%d %H:%M:%S.%f') + dataset.attrs.pop('config_times', None) comp = dict(zlib=True, complevel=5) encoding = {var: comp for var in dataset.data_vars} dataset = dataset.chunk('auto') From 1e739f6a94e2e08bd008b0d7ea2f59aacc4f0161 Mon Sep 17 00:00:00 2001 From: Sam Gardner Date: Sun, 28 Jul 2024 03:43:19 -0500 Subject: [PATCH 20/28] reading multifile test more comprehensive --- tests/test_io.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/tests/test_io.py b/tests/test_io.py index 85e8d74..9277cdb 100644 --- a/tests/test_io.py +++ b/tests/test_io.py @@ -15,7 +15,9 @@ def test_read_mf_dataset(): files_to_read = ['examples/data/'+file for file in files_to_read] dataset, start_date = lma_read.dataset(files_to_read) assert start_date == dt(2023, 12, 24, 0, 57, 1) - assert dataset == xr.open_dataset('tests/truth/lma_netcdf/lma.nc') + truth = xr.open_dataset('tests/truth/lma_netcdf/lma.nc') + for var in truth.data_vars: + compare_dataarrays(dataset, truth, var) def test_read_onefile_dataset(): From e72b3b4369dfd145e08f01a1372dc63d3b9baabb Mon Sep 17 00:00:00 2001 From: Sam Gardner Date: Sun, 28 Jul 2024 04:10:16 -0500 Subject: [PATCH 21/28] vectorize station mask calculation --- pyxlma/lmalib/io/read.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pyxlma/lmalib/io/read.py b/pyxlma/lmalib/io/read.py index 1e92531..787b644 100644 --- a/pyxlma/lmalib/io/read.py +++ b/pyxlma/lmalib/io/read.py @@ -262,8 +262,8 @@ def reorder_stations(dataset, index_mapping): else: unique_stat.append(stat) all_data.station_code.data = np.array(unique_stat, dtype='S1') - mask_strings = np.apply_along_axis(lambda x: ''.join(x), 1, all_data.event_contributing_stations.astype(str).data) # convert each event's contributing stations 
(T/F) to binary - all_data.event_mask.data = np.vectorize(lambda x: int(x, 2))(mask_strings) # convert bin to dec + bin_to_dec = 2**np.flip(np.arange(all_data.event_contributing_stations.data.shape[1])) + all_data.event_mask.data = np.sum((bin_to_dec * all_data.event_contributing_stations.data), axis=1) # convert bin to dec all_data.attrs['station_mask_order'] = np.apply_along_axis(lambda x: ''.join(x), 0, all_data.station_code.astype(str).data).item() # restore previously cached data var attributes for var_name in all_data.data_vars: From 3e762835a5d210e94455ac11fcc9c881f61fb464 Mon Sep 17 00:00:00 2001 From: Sam Gardner Date: Sun, 28 Jul 2024 20:48:54 -0500 Subject: [PATCH 22/28] read station name, board revision, delay, and rec_ch --- pyxlma/lmalib/io/cf_netcdf.py | 20 +++++++++++++++++++ pyxlma/lmalib/io/read.py | 36 +++++++++++++++++++---------------- 2 files changed, 40 insertions(+), 16 deletions(-) diff --git a/pyxlma/lmalib/io/cf_netcdf.py b/pyxlma/lmalib/io/cf_netcdf.py index b192e71..e606bfa 100644 --- a/pyxlma/lmalib/io/cf_netcdf.py +++ b/pyxlma/lmalib/io/cf_netcdf.py @@ -301,6 +301,10 @@ def new_template_dataset(): 'attrs': {'_FillValue': ' '*32, 'long_name': 'LMA network'}, 'dtype': '|S32', }, + 'station_name': {'dims': ('number_of_stations',),# 'string32'), + 'attrs': {'_FillValue': ' '*32, 'long_name': 'Name of receiving station'}, + 'dtype': '|S32', + }, 'station_latitude': {'dims': ('number_of_stations',), 'attrs': {'_FillValue': np.nan, 'valid_range': [-90.0, 90.0], @@ -323,6 +327,22 @@ def new_template_dataset(): 'positive': 'up'}, 'dtype': 'float32', }, + 'station_delay': {'dims': ('number_of_stations',), + 'attrs': {'_FillValue': np.nan, + 'long_name': "Configured delay time for a signal to travel from the antenna to the receiver", + 'units': 'nanoseconds'}, + 'dtype': 'float32', + }, + 'station_board_revision': {'dims': ('number_of_stations',), + 'attrs': {'_FillValue': 255, + 'long_name': "Station board revision number"}, + 'dtype': 'uint8', + }, + 'station_receive_channels': {'dims': ('number_of_stations',), + 'attrs': {'_FillValue': 255, + 'long_name': "Number of channels on the station's receiver"}, + 'dtype': 'uint8', + }, 'station_event_fraction': {'dims': ('number_of_stations',), 'attrs': {'_FillValue': np.nan, 'long_name': "Fraction of events to which this station contributed", diff --git a/pyxlma/lmalib/io/read.py b/pyxlma/lmalib/io/read.py index 787b644..1d1db46 100644 --- a/pyxlma/lmalib/io/read.py +++ b/pyxlma/lmalib/io/read.py @@ -325,7 +325,7 @@ def to_dataset(lma_file, event_id_start=0): returns an xarray dataset of the type returned by pyxlma.lmalib.io.cf_netcdf.new_dataset """ - + lma_data = lma_file.readfile() starttime = lma_file.starttime stations = lma_file.stations @@ -343,6 +343,10 @@ def to_dataset(lma_file, event_id_start=0): 'station_event_fraction':'sources', 'station_power_ratio':'
<P/P_m>
', 'station_active':'active', + 'station_name':'Name', + 'station_board_revision':'Board Revision', + 'station_delay':'Delay Time', + 'station_receive_channels':'Receiver Channels', } event_mapping = { 'event_latitude':'lat', @@ -383,6 +387,9 @@ def to_dataset(lma_file, event_id_start=0): ds.attrs['event_algorithm_name'] = lma_file.analysis_program ds.attrs['event_algorithm_version'] = lma_file.analysis_program_version ds.attrs['original_filename'] = path.basename(lma_file.file) + ds.attrs['max_chi2'] = lma_file.maximum_chi2 + ds.attrs['min_stations'] = lma_file.minimum_stations + ds.attrs['max_chi2_iterations'] = lma_file.maximum_chi2_iter # -- Populate the station mask information -- # int, because NetCDF doesn't have booleans station_mask_bools = np.zeros((N_events, N_stations), dtype='int8') @@ -542,20 +549,17 @@ def __init__(self,filename): with open_gzip_or_dat(self.file) as f: for line_no, line in enumerate(f): if line.startswith(b'Analysis program:'): - analysis_program = line.decode().split(':')[1:] - self.analysis_program = ':'.join(analysis_program)[:-1] + self.analysis_program = line.decode().replace('Analysis program: ','').replace('Analysis program:','') if line.startswith(b'Analysis program version:'): - analysis_program_version = line.decode().split(':')[1:] - self.analysis_program_version = ':'.join(analysis_program_version)[:-1] + self.analysis_program_version = line.decode().replace('Analysis program version: ', '').replace('Analysis program version:', '') if line.startswith(b'File created:') | line.startswith(b'Analysis finished:'): - file_created = line.decode().split(':')[1:] - self.file_created = ':'.join(file_created)[:-1] + self.file_created = line.decode().replace('File created: ', '').replace('File created:', '').replace('Analysis finished: ', '').replace('Analysis finished:', '') if line.startswith(b'Location:'): - self.network_location = ':'.join(line.decode().split(':')[1:])[:-1].replace(' ','', 1) + self.network_location = line.decode().replace('Location: ', '').replace('Location:', '') if line.startswith(b'Data start time:'): timestring = line.decode().split()[-2:] self.startday = dt.datetime.strptime(timestring[0],'%m/%d/%y') - # Full start time and second, likely unneeded + # Full start time and second self.starttime = dt.datetime.strptime(timestring[0]+timestring[1],'%m/%d/%y%H:%M:%S') # self.startsecond = (starttime-dt.datetime(starttime.year,starttime.month,starttime.day)).seconds if line.startswith(b'Number of seconds analyzed:'): @@ -568,7 +572,7 @@ def __init__(self,filename): # Number of active stations if line.startswith(b'Number of active stations:'): self.active_station_c_line = line_no - self.active_staion_c_count = line.decode().split()[-1] + self.active_staion_c_count = int(line.decode().split()[-1]) # Active stations if line.startswith(b'Active stations:'): self.active_station_s_line = line_no @@ -577,7 +581,7 @@ def __init__(self,filename): self.minimum_stations = int(line.decode().split(':')[1]) if line.startswith(b'Maximum reduced chi-squared:'): self.maximum_chi2 = float(line.decode().split(':')[1]) - if line.startswith(b'Maximum chi-squared iterations:'): + if line.startswith(b'Maximum chi-squared iterations:') or line.startswith(b'Maximum number of chi-squared iterations:'): self.maximum_chi2_iter = int(line.decode().split(':')[1]) if line.startswith(b'Station information:'): self.station_info_start = line_no @@ -606,13 +610,13 @@ def __init__(self,filename): # Station overview information stations = pd.DataFrame(self.gen_sta_info(), - 
columns=['ID','Name','Lat','Long','Alt','Delay Time']) + columns=['ID','Name','Lat','Long','Alt','Delay Time','Board Revision','Receiver Channels']) overview = pd.DataFrame(self.gen_sta_data(), columns=['ID','Name','win(us)', 'data_ver', 'rms_error(ns)', - 'sources','percent','
<P/P_m>
','active']) + 'sources','percent','
<P/P_m>
','active']).drop(columns=['Name']) # Drop the station name column that has a redundant station letter code # as part of the name and join on station letter code. - station_combo = stations.set_index('ID').drop(columns=['Name']).join( + station_combo = stations.set_index('ID').join( overview.set_index('ID')) self.stations = station_combo.reset_index(level=station_combo.index.names) @@ -631,7 +635,7 @@ def gen_sta_info(self): parts = line.decode("utf-8").split() name = ' '.join(parts[2:-6]) sta_info, code = parts[0:2] - yield (code, name) + tuple(parts[-6:-2]) + yield (code, name) + tuple(parts[-6:]) def gen_sta_data(self): """ Parse the station data table from the header. Some files do not @@ -680,7 +684,7 @@ def readfile(self): lmad.insert(1,'Datetime',pd.to_timedelta(lmad['time (UT sec of day)'], unit='s')+self.startday) # Parse out which stations contributed into new columns for each station - col_names = self.stations.Name.values + col_names = self.stations.Name.values.copy() self.mask_ints = mask_to_int(lmad["mask"]) for index,items in enumerate(self.maskorder[::-1]): col_names[index] = items+'_'+self.stations.Name.values[index] From 0cb0d9c16e4c190d6515856ad01aeea2585a511b Mon Sep 17 00:00:00 2001 From: Sam Gardner Date: Sun, 28 Jul 2024 20:52:50 -0500 Subject: [PATCH 23/28] this doesn't appear to be used for anything... --- pyxlma/lmalib/io/read.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/pyxlma/lmalib/io/read.py b/pyxlma/lmalib/io/read.py index 1d1db46..d3b21a5 100644 --- a/pyxlma/lmalib/io/read.py +++ b/pyxlma/lmalib/io/read.py @@ -587,8 +587,6 @@ def __init__(self,filename): self.station_info_start = line_no if line.startswith(b'Station data:'): self.station_data_start = line_no - if line.startswith(b'Metric file:'): - self.station_data_end = line_no # Find mask list order if line.startswith(b'Station mask order:'): self.maskorder = line.decode().split()[-1] From 17e89108e57d4a72b8c2f3c1a2f7781af1c61c42 Mon Sep 17 00:00:00 2001 From: Sam Gardner Date: Mon, 29 Jul 2024 00:21:32 -0500 Subject: [PATCH 24/28] make fill values more obvious --- pyxlma/lmalib/io/cf_netcdf.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pyxlma/lmalib/io/cf_netcdf.py b/pyxlma/lmalib/io/cf_netcdf.py index e606bfa..7a83347 100644 --- a/pyxlma/lmalib/io/cf_netcdf.py +++ b/pyxlma/lmalib/io/cf_netcdf.py @@ -334,12 +334,12 @@ def new_template_dataset(): 'dtype': 'float32', }, 'station_board_revision': {'dims': ('number_of_stations',), - 'attrs': {'_FillValue': 255, + 'attrs': {'_FillValue': 0, 'long_name': "Station board revision number"}, 'dtype': 'uint8', }, 'station_receive_channels': {'dims': ('number_of_stations',), - 'attrs': {'_FillValue': 255, + 'attrs': {'_FillValue': 0, 'long_name': "Number of channels on the station's receiver"}, 'dtype': 'uint8', }, From 55c29e12636742c8c6bf3c9790e2170feb9fe567 Mon Sep 17 00:00:00 2001 From: Sam Gardner Date: Mon, 29 Jul 2024 00:21:59 -0500 Subject: [PATCH 25/28] remove newlines from attrs --- pyxlma/lmalib/io/read.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/pyxlma/lmalib/io/read.py b/pyxlma/lmalib/io/read.py index d3b21a5..86af36b 100644 --- a/pyxlma/lmalib/io/read.py +++ b/pyxlma/lmalib/io/read.py @@ -549,13 +549,13 @@ def __init__(self,filename): with open_gzip_or_dat(self.file) as f: for line_no, line in enumerate(f): if line.startswith(b'Analysis program:'): - self.analysis_program = line.decode().replace('Analysis program: ','').replace('Analysis program:','') + self.analysis_program = 
line.decode().replace('Analysis program: ','').replace('Analysis program:','').replace('\n', '') if line.startswith(b'Analysis program version:'): - self.analysis_program_version = line.decode().replace('Analysis program version: ', '').replace('Analysis program version:', '') + self.analysis_program_version = line.decode().replace('Analysis program version: ', '').replace('Analysis program version:', '').replace('\n', '') if line.startswith(b'File created:') | line.startswith(b'Analysis finished:'): - self.file_created = line.decode().replace('File created: ', '').replace('File created:', '').replace('Analysis finished: ', '').replace('Analysis finished:', '') + self.file_created = line.decode().replace('File created: ', '').replace('File created:', '').replace('Analysis finished: ', '').replace('Analysis finished:', '').replace('\n', '') if line.startswith(b'Location:'): - self.network_location = line.decode().replace('Location: ', '').replace('Location:', '') + self.network_location = line.decode().replace('Location: ', '').replace('Location:', '').replace('\n', '') if line.startswith(b'Data start time:'): timestring = line.decode().split()[-2:] self.startday = dt.datetime.strptime(timestring[0],'%m/%d/%y') From 0cc3204b30bd6c4c5b4c69a26ef002ef5fa62c99 Mon Sep 17 00:00:00 2001 From: Sam Gardner Date: Mon, 29 Jul 2024 00:47:14 -0500 Subject: [PATCH 26/28] add new properties --- pyxlma/lmalib/io/read.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyxlma/lmalib/io/read.py b/pyxlma/lmalib/io/read.py index 86af36b..64df2c0 100644 --- a/pyxlma/lmalib/io/read.py +++ b/pyxlma/lmalib/io/read.py @@ -81,7 +81,7 @@ def reorder_stations(dataset, index_mapping): for var in new_ds['data_vars']: dv_attrs[var] = new_ds['data_vars'][var]['attrs'] # Define list of 'properties', things which identify a station and are not expected to change - property_vars = ('station_latitude', 'station_longitude', 'station_altitude', 'station_code', 'station_network') + property_vars = ('station_latitude', 'station_longitude', 'station_altitude', 'station_code', 'station_network', 'station_name', 'station_board_revision', 'station_delay', 'station_receive_channels') # Define list of variables to be recalculated for each station after the datasets are combined recalc_vars = ('station_event_fraction', 'station_power_ratio', 'network_center_latitude', 'network_center_longitude', 'network_center_altitude') # Will be set to True if network_center location needs to be recalculated From 0defdeac53bc82c58e8e18b64469af755d1be421 Mon Sep 17 00:00:00 2001 From: Sam Gardner Date: Mon, 29 Jul 2024 00:52:14 -0500 Subject: [PATCH 27/28] better handle combining new attrs --- pyxlma/lmalib/io/read.py | 15 +++++++++++++-- 1 file changed, 13 insertions(+), 2 deletions(-) diff --git a/pyxlma/lmalib/io/read.py b/pyxlma/lmalib/io/read.py index 64df2c0..a5f5e7a 100644 --- a/pyxlma/lmalib/io/read.py +++ b/pyxlma/lmalib/io/read.py @@ -66,10 +66,21 @@ def reorder_stations(dataset, index_mapping): continue attr_vals = all_attrs[k] set_of_values = set(attr_vals) - if len(set_of_values) == 1: + if len(set_of_values) == 0: + continue + elif len(set_of_values) == 1: final_attrs[k] = tuple(set_of_values)[0] else: - final_attrs[k] = '; '.join(attr_vals) + if type(attr_vals[0]) == str: + final_attrs[k] = '; '.join(attr_vals) + elif k == 'min_stations': + final_attrs[k] = min(attr_vals) + elif k == 'max_chi2': + final_attrs[k] = max(attr_vals) + elif k == 'max_chi2_iterations': + final_attrs[k] = min(attr_vals) + else: + 
warnings.warn(f'Conflicting values for global attribute {k}') # Get the station data from the first dataset and assign each station a unique index lma_data[0]['station_code'] = lma_data[0].number_of_stations lma_data[0]['number_of_stations'] = np.arange(len(lma_data[0].number_of_stations)) From d2b28e37d8f29e34e76f293ee743bd0b25f319a0 Mon Sep 17 00:00:00 2001 From: Sam Gardner Date: Mon, 29 Jul 2024 00:52:38 -0500 Subject: [PATCH 28/28] dat.gz write support! --- pyxlma/lmalib/io/write.py | 245 ++++++++++++++++++++++++++++++++++---- 1 file changed, 219 insertions(+), 26 deletions(-) diff --git a/pyxlma/lmalib/io/write.py b/pyxlma/lmalib/io/write.py index 4ba94dc..4599c0c 100644 --- a/pyxlma/lmalib/io/write.py +++ b/pyxlma/lmalib/io/write.py @@ -1,6 +1,10 @@ +import importlib.metadata import numpy as np import datetime as dt import warnings +import importlib +from functools import reduce +import gzip def dataset(dataset, path): "Write an LMA dataset to a netCDF file" @@ -21,6 +25,177 @@ def dataset(dataset, path): dataset = dataset.chunk('auto') dataset.to_netcdf(path, encoding=encoding) + +def count_decimals_in_array(a): + a = np.array(a).astype(str) + # Find the position of the decimal point in each string + decimal_positions = np.char.find(a, '.') + + # Handle the case where there are no decimal points + decimal_positions[decimal_positions == -1] = np.char.str_len(a[decimal_positions == -1]) + + # Calculate the number of characters after the decimal point + decimal_counts = np.char.str_len(a) - decimal_positions - 1 + + return np.max(decimal_counts) + + +def create_station_info_string(dataset): + """ + Create a string representation of the station information table for an LMA dataset. + + Parameters + ---------- + dataset : xarray.Dataset + The LMA dataset to create the station information table for. + + Returns + ------- + str + The string representation of the station information table. + """ + if 'number_of_stations' not in dataset.dims: + raise ValueError('Dataset does not contain station information.') + header_line = 'Station information: ' + info_to_include = [] + if 'station_code' in dataset.data_vars: + header_line += 'id, ' + info_to_include.append('station_code') + else: + raise ValueError('Stations do not have letter codes. This is incompatible with the XLMA dat format, please assign station codes to all stations.') + + if 'station_name' in dataset.data_vars: + header_line += 'name, ' + info_to_include.append('station_name') + else: + raise ValueError('Stations do not have names. This is incompatible with the XLMA dat format, please assign names to all stations.') + + if 'station_latitude' in dataset.data_vars: + header_line += 'lat(d), ' + info_to_include.append('station_latitude') + else: + raise ValueError('Stations do not have latitudes. This is incompatible with the XLMA dat format, please assign latitudes to all stations.') + + if 'station_longitude' in dataset.data_vars: + header_line += 'lon(d), ' + info_to_include.append('station_longitude') + else: + raise ValueError('Stations do not have longitudes. This is incompatible with the XLMA dat format, please assign longitudes to all stations.') + + if 'station_altitude' in dataset.data_vars: + header_line += 'alt(m), ' + info_to_include.append('station_altitude') + else: + raise ValueError('Stations do not have altitudes. 
This is incompatible with the XLMA dat format, please assign altitudes to all stations.') + + if 'station_delay' in dataset.data_vars: + header_line += 'delay(ns), ' + info_to_include.append('station_delay') + # This is optional + + if 'station_board_revision' in dataset.data_vars: + header_line += 'board_rev, ' + info_to_include.append('station_board_revision') + else: + raise ValueError('Stations do not have board revisions. This is incompatible with the XLMA dat format, please assign board revisions to all stations.') + + if 'station_receive_channels' in dataset.data_vars: + header_line += 'rec_ch, ' + info_to_include.append('station_receive_channels') + # This is optional + header_line = header_line[:-2] + stations_strings = [] + for station_num in range(dataset.sizes['number_of_stations']): + this_station = dataset.isel(number_of_stations=station_num) + station_string = 'Sta_info: ' + for var in info_to_include: + station_string += f'{this_station[var].data.astype(str).item()} ' + stations_strings.append(station_string) + return header_line+'\n'+'\n'.join(stations_strings) + + +def create_station_data_string(dataset): + pass + + +def create_event_data_string(dataset): + if 'analysis_start_time' in dataset.attrs: + start_date = dataset.attrs['analysis_start_time'] + start_date = np.array([start_date]).astype('datetime64[D]') + else: + start_date = dataset.event_time.data.min().astype('datetime64[D]') + columns_line = 'Data: ' + format_line = 'Data format: ' + data_to_combine = [] + + day_sec = (dataset.event_time - start_date).data.astype('timedelta64[ns]').astype(np.float64)/1e9 + day_sec_str = day_sec.astype(str) + day_sec_length = np.max(np.char.str_len(day_sec_str)) + day_sec_str = np.char.ljust(day_sec_str, day_sec_length, fillchar='0') + columns_line += 'time (UT sec of day), ' + format_line += f'{day_sec_length}.9f, ' #time is always in nanoseconds + data_to_combine.append(day_sec_str.T) + + lat = dataset.event_latitude.data.astype(str) + lat_length = np.max(np.char.str_len(lat)) + lat = np.char.ljust(lat, lat_length, fillchar='0') + columns_line += 'lat, ' + lat_dec_count = count_decimals_in_array(lat) + format_line += f'{lat_length}.{lat_dec_count}f, ' + data_to_combine.append(lat.T) + + lon = dataset.event_longitude.data.astype(str) + lon_length = np.max(np.char.str_len(lon)) + lon = np.char.ljust(lon, lon_length, fillchar='0') + columns_line += 'lon, ' + lon_dec_count = count_decimals_in_array(lon) + format_line += f'{lon_length}.{lon_dec_count}f, ' + data_to_combine.append(lon.T) + + alt = dataset.event_altitude.data.astype(str) + alt_length = np.max(np.char.str_len(alt)) + alt = np.char.ljust(alt, alt_length, fillchar='0') + columns_line += 'alt(m), ' + alt_dec_count = count_decimals_in_array(alt) + format_line += f'{alt_length}.{alt_dec_count}f, ' + data_to_combine.append(alt.T) + + chi2 = dataset.event_chi2.data.astype(str) + chi2_length = np.max(np.char.str_len(chi2)) + chi2 = np.char.ljust(chi2, chi2_length, fillchar='0') + columns_line += 'reduced chi^2, ' + chi2_dec_count = count_decimals_in_array(chi2) + format_line += f'{chi2_length}.{chi2_dec_count}f, ' + data_to_combine.append(chi2.T) + + power = dataset.event_power.data.astype(str) + power_length = np.max(np.char.str_len(power)) + power = np.char.ljust(power, power_length, fillchar='0') + columns_line += 'P(dBW), ' + power_dec_count = count_decimals_in_array(power) + format_line += f'{power_length}.{power_dec_count}f, ' + data_to_combine.append(power.T) + + masks = 
np.vectorize(np.base_repr)(dataset.event_mask.data, base=16) + mask_len = np.max(np.char.str_len(masks)) + masks = np.char.rjust(masks, mask_len, fillchar='0') + masks = '0x' + masks + mask_len += 2 + columns_line += 'mask, ' + format_line += f'{mask_len}x, ' + data_to_combine.append(masks.T) + + columns_line = columns_line[:-2] + format_line = format_line[:-2] + N_ev_line = f'Number of events: {dataset.sizes["number_of_events"]}' + data_line = '*** data ***' + data_str = '\n'.join(reduce(lambda x, y:np.char.add(np.char.add(x, ' '), y), data_to_combine)) + return columns_line+'\n'+format_line+'\n'+N_ev_line+'\n'+data_line+'\n'+data_str + + + + + def lma_dat_file(dataset, path, use_gzip=True): """ Write an LMA dataset to a legacy .dat(.gz) file for use in IDL XLMA. @@ -38,37 +213,55 @@ def lma_dat_file(dataset, path, use_gzip=True): if 'network_configurations' in dataset.dims: warnings.warn('Attempting to write dataset with multiple network configurations to a single file. This is highly experimental.\n' 'Caveats:\n-Location names will be appended and separated by semicolon\n- The center of the network is derived from the mean of the locations of the stations') - center_lon = dataset.station_longitude.data.mean() - center_lat = dataset.station_latitude.data.mean() - center_alt = dataset.station_altitude.data.mean() - num_active = np.sum(np.clip(np.sum(dataset.station_active.data, axis=0), 0, 1)) + stations_active = np.sum(dataset.station_active.data, axis=0) + num_active = np.sum(np.clip(stations_active, 0, 1)) + stations_active = ' '.join(np.extract(stations_active, dataset.station_code.data).astype(str).tolist()) else: - center_lon = dataset.network_center_longitude.data.item() - center_lat = dataset.network_center_latitude.data.item() - center_alt = dataset.network_center_altitude.data.item() num_active = np.sum(dataset.station_active.data) - lma_header = (f'Lightning Mapping Array analyzed data\n' - f'Analysis program: {dataset.attrs["event_algorithm_name"]}\n' - f'Analysis program version: {dataset.attrs["event_algorithm_version"]}\n' + stations_active = ' '.join(np.extract(dataset.station_active.data, dataset.station_code.data).astype(str).tolist()) + + center_lon = dataset.network_center_longitude.data.item() + center_lat = dataset.network_center_latitude.data.item() + center_alt = dataset.network_center_altitude.data.item() + + unique_codes, code_counts = np.unique(dataset.station_code.data, return_counts=True) + if np.max(code_counts) > 1: + raise ValueError('Duplicate station codes found in dataset. This is incompatible with the XLMA dat format, please rename conflicting station letters.') + if b'' in unique_codes: + raise ValueError('Empty station codes found in dataset. 
This is incompatible with the XLMA dat format, please assign station codes to all stations.') + + if dataset.attrs["event_algorithm_name"].startswith('pyxlma'): + analysis_program = dataset.attrs["event_algorithm_name"] + else: + analysis_program = 'pyxlma; '+dataset.attrs["event_algorithm_name"] + if dataset.attrs["event_algorithm_version"].startswith('pyxlma'): + analysis_program_version = dataset.attrs["event_algorithm_version"] + else: + analysis_program_version = f'pyxlma-{importlib.metadata.version("pyxlma")}; {dataset.attrs["event_algorithm_version"]}' + + lma_file = (f'pyxlma exported data -- https://github.com/deeplycloudy/xlma-python\n' + f'Analysis program: {analysis_program}\n' + f'Analysis program version: {analysis_program_version}\n' f'File created: {dt.datetime.utcnow().strftime("%a %b %d %H:%M:%S %Y")}\n' - f'Data start time: {dataset.attrs["analysis_start_time"].strftime("%M/%d/%y %H:%M:%S")}\n' - f'Number of seconds analyzed: {dataset.attrs["number_of_seconds_analyzed"]}\n' + f'Data start time: {dataset.attrs["analysis_start_time"].strftime("%m/%d/%y %H:%M:%S")}\n' + f'Number of seconds analyzed: {(dataset.attrs["analysis_end_time"] - dataset.attrs["analysis_start_time"]).total_seconds()}\n' f'Location: {"; ".join(np.unique(dataset.station_network.data).astype(str).tolist())}\n' f'Coordinate center (lat,lon,alt): {center_lat:.7f}, {center_lon:.7f}, {center_alt:.2f}\n' f'Coordinate frame: cartesian\n' - # f'Maximum diameter of LMA (km): ' - # f'Maximum light-time across LMA (ns): ' - f'Number of stations: {dataset.sizes['number_of_stations']}' + f'Number of stations: {dataset.sizes['number_of_stations']}\n' f'Number of active stations: {num_active}\n' - # f'Active stations:' - # f'Minimum number of stations per solution: {}\n' - # f'Maximum reduced chi-squared: {}\n' - # f'Maximum number of chi-squared iterations: {}\n' - ### STATION INFO TABLE + f'Active stations: {stations_active}\n' + f'Minimum number of stations per solution: {dataset.attrs["min_stations"]}\n' + f'Maximum reduced chi-squared: {dataset.attrs["max_chi2"]}\n' + f'Maximum number of chi-squared iterations: {dataset.attrs["max_chi2_iterations"]}\n' + f'{create_station_info_string(dataset)}\n' + f'Station mask order: {dataset.attrs["station_mask_order"]}\n' ### STATION DATA TABLE - f'Metric file version: 4\n' - f'Data: time (UT sec of day), lat, lon, alt(m), reduced chi^2, P(dBW), mask\n' - f'Data format: 15.9f 12.8f 13.8f 9.2f 6.2f 5.1f 7x\n' - f'Number of events: {dataset.sizes["number_of_events"]}\n' - f'*** data ***\n' - ) \ No newline at end of file + f'{create_event_data_string(dataset)}' + ) + if use_gzip: + with gzip.open(path, 'wb') as f: + f.write(lma_file.encode('utf-8')) + else: + with open(path, 'w') as f: + f.write(lma_file) \ No newline at end of file
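A minimal worked sketch of the station-mask arithmetic introduced in PATCH 21 and reused for the hexadecimal mask column by the dat writer in PATCH 28; the toy 4-station network and the contributing-stations array below are hypothetical and not data from these patches:

import numpy as np

# Toy example: 3 events seen by a 4-station network (column order matches station_mask_order).
contributing = np.array([[1, 0, 1, 1],
                         [0, 0, 0, 1],
                         [1, 1, 1, 1]], dtype=np.uint8)

# Same idea as PATCH 21: weight each station column by a power of two
# (leftmost station is the most significant bit) and sum along the station axis.
bin_to_dec = 2 ** np.flip(np.arange(contributing.shape[1]))   # [8, 4, 2, 1]
event_mask = np.sum(bin_to_dec * contributing, axis=1)        # [11, 1, 15]

# PATCH 28 then renders each mask in base 16 with a 0x prefix (0xB, 0x1, 0xF here).
hex_masks = ['0x' + np.base_repr(m, base=16) for m in event_mask]
print(event_mask, hex_masks)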
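And a rough usage sketch of the read/write round trip this patch series builds toward; the file names are hypothetical, while the functions (read.dataset, write.dataset, write.lma_dat_file) are the ones defined in the diffs above:

from pyxlma.lmalib.io import read as lma_read
from pyxlma.lmalib.io import write as lma_write

# Hypothetical LMA source files; plain .dat and gzipped .dat.gz are both handled by the reader.
filenames = ['LYLOUT_231224_005701_0600.dat.gz', 'LYLOUT_231224_010701_0600.dat.gz']

# Returns the combined xarray Dataset and the earliest data start time.
ds, start_time = lma_read.dataset(filenames, sort_time=True)

# Write a compressed netCDF copy ...
lma_write.dataset(ds, 'lma_combined.nc')

# ... or a legacy IDL XLMA-style dat.gz file using the writer added in PATCH 28.
lma_write.lma_dat_file(ds, 'lma_combined.dat.gz', use_gzip=True)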