From d3931c76246118a2aab0ffc4833b2ed6e3691ada Mon Sep 17 00:00:00 2001 From: Sam Gardner Date: Thu, 28 Dec 2023 16:41:34 -0600 Subject: [PATCH 01/28] add lma writer --- pyxlma/lmalib/io/write.py | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) create mode 100644 pyxlma/lmalib/io/write.py diff --git a/pyxlma/lmalib/io/write.py b/pyxlma/lmalib/io/write.py new file mode 100644 index 0000000..aa0b8cd --- /dev/null +++ b/pyxlma/lmalib/io/write.py @@ -0,0 +1,16 @@ +import numpy as np + +def dataset(dataset, path): + "Write an LMA dataset to a netCDF file" + + if 'flash_event_count' in dataset.data_vars: + if np.all(dataset.flash_event_count == np.iinfo(np.uint32).max): + dataset = dataset.drop_vars(['flash_init_latitude', 'flash_init_longitude', 'flash_init_altitude', 'flash_area', 'flash_volume', 'flash_energy', 'flash_center_latitude', 'flash_center_longitude', 'flash_center_altitude', 'flash_power', 'flash_event_count', 'flash_duration_threshold', 'flash_time_start', 'flash_time_end', 'flash_duration']) + + for var in dataset.data_vars: + if np.all(dataset[var].data == np.nan): + dataset = dataset.drop_vars(var) + comp = dict(zlib=True, complevel=5) + encoding = {var: comp for var in dataset.data_vars} + dataset = dataset.chunk('auto') + dataset.to_netcdf(path, encoding=encoding) \ No newline at end of file From aacf8557c37bc0af3fc812b89bcd274471eed46f Mon Sep 17 00:00:00 2001 From: Sam Gardner Date: Sun, 30 Jun 2024 19:33:10 -0500 Subject: [PATCH 02/28] use xarray sizes instead of dims (warning silence) --- pyxlma/lmalib/io/read.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyxlma/lmalib/io/read.py b/pyxlma/lmalib/io/read.py index e1f3990..e2ca541 100644 --- a/pyxlma/lmalib/io/read.py +++ b/pyxlma/lmalib/io/read.py @@ -113,7 +113,7 @@ def dataset(filenames, sort_time=True): ds = to_dataset(lma_file, event_id_start=next_event_id).set_index( {'number_of_stations':'station_code', 'number_of_events':'event_id'}) lma_data.append(ds) - next_event_id += ds.dims['number_of_events'] + next_event_id += ds.sizes['number_of_events'] except: raise ds = combine_datasets(lma_data) From f5bfa96d52794a0141cf6a877beb24ca1bf7803b Mon Sep 17 00:00:00 2001 From: Sam Gardner Date: Mon, 1 Jul 2024 01:07:05 -0500 Subject: [PATCH 03/28] add original_filename, fix station network location --- pyxlma/lmalib/io/read.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/pyxlma/lmalib/io/read.py b/pyxlma/lmalib/io/read.py index e2ca541..022f570 100644 --- a/pyxlma/lmalib/io/read.py +++ b/pyxlma/lmalib/io/read.py @@ -3,6 +3,7 @@ import numpy as np import gzip import datetime as dt +from os import path class open_gzip_or_dat: def __init__(self, filename): @@ -201,6 +202,7 @@ def to_dataset(lma_file, event_id_start=0): ds.attrs['history'] = "LMA source file created "+lma_file.file_created ds.attrs['event_algorithm_name'] = lma_file.analysis_program ds.attrs['event_algorithm_version'] = lma_file.analysis_program_version + ds.attrs['original_filename'] = path.basename(lma_file.file) # -- Populate the station mask information -- # int, because NetCDF doesn't have booleans @@ -361,7 +363,7 @@ def __init__(self,filename): file_created = line.decode().split(':')[1:] self.file_created = ':'.join(file_created)[:-1] if line.startswith(b'Location:'): - self.network_location = ':'.join(line.decode().split(':')[1:])[:-1] + self.network_location = ':'.join(line.decode().split(':')[1:])[:-1].replace(' ','') if line.startswith(b'Data start time:'): timestring = line.decode().split()[-2:] 
self.startday = dt.datetime.strptime(timestring[0],'%m/%d/%y') From fe657e4f9ebbc34096768be1765235d8ed191e6e Mon Sep 17 00:00:00 2001 From: Sam Gardner Date: Mon, 1 Jul 2024 01:13:32 -0500 Subject: [PATCH 04/28] properly support stations appearing/disappearing/moving --- pyxlma/lmalib/io/read.py | 206 ++++++++++++++++++++++++++++++--------- 1 file changed, 161 insertions(+), 45 deletions(-) diff --git a/pyxlma/lmalib/io/read.py b/pyxlma/lmalib/io/read.py index 022f570..be1e479 100644 --- a/pyxlma/lmalib/io/read.py +++ b/pyxlma/lmalib/io/read.py @@ -37,6 +37,17 @@ def combine_datasets(lma_data): pyxlma.lmalib.io.cf_netcdf.new_dataset or pyxlma.lmalib.io.read.to_dataset """ + def reorder_stations(dataset, index_mapping): + for var in dataset.data_vars: + if 'number_of_stations' in dataset[var].dims: + if var == 'station_code': + new_station_codes = dataset[var].data.copy() + new_station_codes = new_station_codes[index_mapping] + dataset = dataset.assign(station_code=('number_of_stations', new_station_codes)) + else: + reordered_data = dataset[var].isel(number_of_stations=index_mapping) + dataset[var].data = reordered_data.values + return dataset # Get a list of all the global attributes from each dataset attrs = [d.attrs for d in lma_data] # Create a dict of {attr_name: [list of values from each dataset]} @@ -53,46 +64,153 @@ def combine_datasets(lma_data): final_attrs[k] = tuple(set_of_values)[0] else: final_attrs[k] = '; '.join(attr_vals) - # print(final_attrs) - - # Get just the pure-station variables - lma_station_data = xr.concat( - [d.drop_dims(['number_of_events']) for d in lma_data], - dim='number_of_files' - ) - # print(lma_station_data) - - # Get just the pure-event variables - lma_event_data = xr.concat( - [d.drop_dims(['number_of_stations']).drop( - ['network_center_latitude', - 'network_center_longitude', - 'network_center_altitude',] - ) for d in lma_data], - dim='number_of_events' - ) - # print(lma_event_data) - - # Get any varaibles with joint dimensions, - # i.e., ('number_of_events', 'number_of_stations') - event_contributing_stations = xr.concat( - [d['event_contributing_stations'] for d in lma_data], - dim='number_of_events' - ) - # print(event_contributing_stations) - - # Find the mean of the station data, and then add back the event data - ds = xr.merge([lma_station_data.mean(dim='number_of_files'), - lma_event_data], - compat='equals') - # ... and then restore the contributing stations. - # Note the coordinate variables are being used as labels to ensure data remain aligned. 
- ds['event_contributing_stations'] = event_contributing_stations - - # Restore the global attributes - ds.attrs.update(final_attrs) - - return ds + # Get the station data from the first dataset and assign each station a unique index + lma_data[0]['station_code'] = lma_data[0].number_of_stations + lma_data[0]['number_of_stations'] = np.arange(len(lma_data[0].number_of_stations)) + all_data = lma_data[0] + + # Set up a few arrays to keep track of all known stations + station_networks = all_data.station_network.data.copy() + station_codes = all_data.station_code.data.copy() + station_lats = all_data.station_latitude.data.copy() + station_lons = all_data.station_longitude.data.copy() + station_alts = all_data.station_altitude.data.copy() + + # Check each subsequent dataset for new stations + for new_file_num in range(1, len(lma_data)): + new_file = lma_data[new_file_num] + # Demote station code to a data variable and assign an index to each station in the new file + new_file['station_code'] = new_file.number_of_stations + new_file['number_of_stations'] = np.arange(len(new_file.number_of_stations)) + stations_in_file = [] + # Check each station in the new file against all known stations + for station_num in range(len(new_file.number_of_stations.data)): + station = new_file.isel(number_of_stations=station_num) + matching_networks = station_networks == station.station_network.data + matching_station_code = station_codes == station.station_code.data + matching_lat = station_lats == station.station_latitude.data + matching_lon = station_lons == station.station_longitude.data + matching_alt = station_alts == station.station_altitude.data + # If the lat/lon/alt are the same, then the station is in the same location + matching_loc = matching_lat * matching_lon * matching_alt + matching_id = matching_networks * matching_station_code + matching = matching_id * matching_loc + # check for a single match in the previously-known stations + if sum(matching) == 1: + # station completely matches previous + previous_index = np.argmax(matching) + stations_in_file.append(previous_index) + elif sum(matching_id) == 1: + # station has moved geographically since previous + previous_index = np.argmax(matching_id) + # Update the global list of lat/lon/alts to reflect the move + station_lats[previous_index] = station.station_latitude.data + station_lons[previous_index] = station.station_longitude.data + station_alts[previous_index] = station.station_altitude.data + stations_in_file.append(previous_index) + elif sum(matching_loc) == 1: + # station was renamed since previous + previous_index = np.argmax(matching_loc) + # Update the global list of station identifying information to reflect the rename + station_networks[previous_index] = station.station_network.data + station_codes[previous_index] = station.station_code.data + stations_in_file.append(previous_index) + else: + # station is new + previous_index = -1 + stations_in_file.append(previous_index) + # Find the indices of any newly-discovered stations + indices_of_new_stations = np.where(np.array(stations_in_file) == -1)[0] + # Add the new stations to the known stations + for idx_of_new_station in indices_of_new_stations: + new_station = new_file.isel(number_of_stations=idx_of_new_station) + # The new station is appended to the end of the known stations + new_station_index = len(all_data.number_of_stations) + # Expand all of the previous data to contain the new station. Fill with nan values for all previous times. 
+ all_data = all_data.reindex({'number_of_stations': np.arange(new_station_index+1)}, fill_value=np.nan) + # Update the dimension coordinate to reflect the new station's addition + all_data['number_of_stations'] = ('number_of_stations', np.arange(new_station_index+1)) + # Add the new station's information to the global list of known stations + station_networks = np.append(station_networks, new_station.station_network.data) + station_codes = np.append(station_codes, new_station.station_code.data) + station_lats = np.append(station_lats, new_station.station_latitude.data) + station_lons = np.append(station_lons, new_station.station_longitude.data) + station_alts = np.append(station_alts, new_station.station_altitude.data) + # Update the station index for the new station + stations_in_file[idx_of_new_station] = new_station_index + # Check for any previously-known stations that are no longer in the new file + for old_station_id in all_data.number_of_stations.data: + if old_station_id not in stations_in_file: + # a station has been removed, create a row of nan values for this file to indicate that the station is no longer present + # first create a temporary index for the dead station + temp_station_id = len(new_file.number_of_stations) + # fill the new row with nan values + new_file = new_file.reindex({'number_of_stations': np.arange(temp_station_id+1)}, fill_value=np.nan) + new_file['number_of_stations'] = ('number_of_stations', np.arange(temp_station_id+1)) + # add the dead station to the list of stations from this file so that it is the same length as the all_data dataset + stations_in_file.append(temp_station_id) + # update the mapping of the dead station to the temporary index so that the data can be re-ordered + stations_in_file[old_station_id] = temp_station_id + stations_in_file[temp_station_id] = old_station_id + # Re-order the station data to match the order of the stations in the new file + # This can happen if lma_analysis decides to change the order of the stations, or if new stations are added or removed + new_file = reorder_stations(new_file, stations_in_file) + # Concatenate the new file's station information with the previously-known station information + lma_station_data = xr.concat( + [d.drop_dims(['number_of_events']) for d in [all_data, new_file]], + dim='number_of_files' + ) + # Rebuild the event_contributing_stations array + event_contributing_stations = xr.concat( + [d['event_contributing_stations'] for d in [all_data, new_file]], + dim='number_of_events' + ) + # Attach the event_contributing_stations array to the station dataset + lma_station_data['event_contributing_stations'] = event_contributing_stations + + # Combining the events is a simple concatenation. If only the stations had been this easy... 
+ lma_event_data = xr.concat( + [d.drop_dims(['number_of_stations']).drop_vars( + ['network_center_latitude', + 'network_center_longitude', + 'network_center_altitude', + 'file_start_time'] + ) for d in [all_data, new_file]], + dim='number_of_events' + ) + # merge all the data from this file with the previously-known data + combined_event_data = xr.merge([lma_station_data, lma_event_data]) + all_data = combined_event_data + # Sort the data by file_start_time + all_data = all_data.sortby('file_start_time') + # Update the global attributes + all_data.attrs.update(final_attrs) + # To reduce complexity and resource usage, if the 'number_of_files' dimension is the same for all variables, then the dimension is unnecessary + all_the_same = True + # These variables are expected to change between files, so they are not checked for equality + expected_to_change = ['station_event_fraction', 'station_power_ratio', 'file_start_time'] + for var in all_data.data_vars: + if var in expected_to_change: + continue + if 'number_of_files' in all_data[var].dims: + if (all_data[var] == all_data[var].isel(number_of_files=0)).all(): + pass + else: + all_the_same = False + # If all of the variables are the same for all files, remove the 'number_of_files' dimension. + if all_the_same: + # Some variables need to be averaged + mean_data = all_data.mean(dim='number_of_files') + # ...but most can just be copied over (this is necessary because some are strings which can't be averaged) + all_data = all_data.isel(number_of_files=0) + # the file_start_time variable is not needed in the reduced dataset + all_data = all_data.drop_vars(['file_start_time']) + # replace the copied variables with the averaged variables where necessary + for var in expected_to_change: + if var == 'file_start_time': + continue + all_data[var] = mean_data[var] + return all_data def dataset(filenames, sort_time=True): """ Create an xarray dataset of the type returned by @@ -127,15 +245,13 @@ def dataset(filenames, sort_time=True): # the dimension name, too. resetted = ('number_of_events_', 'number_of_stations_') ds = ds.reset_coords(resetted) - ds = ds.rename({resetted[0]:'event_id', - resetted[1]:'station_code'}) + ds = ds.rename({resetted[0]:'event_id'}) else: # The approach for newer xarray versions requires we explicitly rename only the variables. # The generic "rename" in the block above renames vars, coords, and dims. 
resetted = ('number_of_events', 'number_of_stations') ds = ds.reset_coords(resetted) - ds = ds.rename_vars({resetted[0]:'event_id', - resetted[1]:'station_code'}) + ds = ds.rename_vars({resetted[0]:'event_id'}) if sort_time: ds = ds.sortby('event_time') return ds, starttime @@ -203,7 +319,7 @@ def to_dataset(lma_file, event_id_start=0): ds.attrs['event_algorithm_name'] = lma_file.analysis_program ds.attrs['event_algorithm_version'] = lma_file.analysis_program_version ds.attrs['original_filename'] = path.basename(lma_file.file) - + ds['file_start_time'] = starttime # -- Populate the station mask information -- # int, because NetCDF doesn't have booleans station_mask_bools = np.zeros((N_events, N_stations), dtype='int8') From 043acd80118d620ee9f3c896eb82ddf324c655dc Mon Sep 17 00:00:00 2001 From: Sam Gardner Date: Mon, 1 Jul 2024 01:18:14 -0500 Subject: [PATCH 05/28] preserve event_id == number_of_events (deeplycloudy/xlma-python#43) --- pyxlma/lmalib/io/read.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/pyxlma/lmalib/io/read.py b/pyxlma/lmalib/io/read.py index be1e479..cb5198c 100644 --- a/pyxlma/lmalib/io/read.py +++ b/pyxlma/lmalib/io/read.py @@ -181,8 +181,6 @@ def reorder_stations(dataset, index_mapping): # merge all the data from this file with the previously-known data combined_event_data = xr.merge([lma_station_data, lma_event_data]) all_data = combined_event_data - # Sort the data by file_start_time - all_data = all_data.sortby('file_start_time') # Update the global attributes all_data.attrs.update(final_attrs) # To reduce complexity and resource usage, if the 'number_of_files' dimension is the same for all variables, then the dimension is unnecessary @@ -236,6 +234,10 @@ def dataset(filenames, sort_time=True): except: raise ds = combine_datasets(lma_data) + if sort_time: + ds = ds.sortby('event_time') + if 'file_start_time' in ds.dims: + ds = ds.sortby('file_start_time') ds = ds.reset_index(('number_of_events', 'number_of_stations')) if 'number_of_events_' in ds.coords: # Older xarray versions appended a trailing underscore. reset_coords then dropped @@ -252,8 +254,6 @@ def dataset(filenames, sort_time=True): resetted = ('number_of_events', 'number_of_stations') ds = ds.reset_coords(resetted) ds = ds.rename_vars({resetted[0]:'event_id'}) - if sort_time: - ds = ds.sortby('event_time') return ds, starttime def to_dataset(lma_file, event_id_start=0): From 25385bb5c0a2ea2cd8a1d594ba7da225d16f6798 Mon Sep 17 00:00:00 2001 From: Sam Gardner Date: Mon, 1 Jul 2024 01:35:28 -0500 Subject: [PATCH 06/28] add station_active flag --- pyxlma/lmalib/io/read.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/pyxlma/lmalib/io/read.py b/pyxlma/lmalib/io/read.py index cb5198c..95cf941 100644 --- a/pyxlma/lmalib/io/read.py +++ b/pyxlma/lmalib/io/read.py @@ -279,6 +279,7 @@ def to_dataset(lma_file, event_id_start=0): 'station_altitude':'Alt', 'station_event_fraction':'sources', 'station_power_ratio':'
<P/P_m>
', + 'station_active':'active', } event_mapping = { 'event_latitude':'lat', @@ -335,6 +336,11 @@ def to_dataset(lma_file, event_id_start=0): station_mask_bools[:, i] = lma_data[col] ds['event_contributing_stations'][:] = station_mask_bools + # -- Convert the station_active flag to a psuedo-boolean -- + station_active_data = np.zeros(N_stations, dtype='int8') + station_active_data[ds.station_active.data == b'A'] = 1 + station_active_data[ds.station_active.data == b'NA'] = 0 + ds.station_active.data = station_active_data return ds From 1477a42666f3148dfcdbfc993f6e37a4e8c44a3b Mon Sep 17 00:00:00 2001 From: Sam Gardner Date: Mon, 1 Jul 2024 01:44:43 -0500 Subject: [PATCH 07/28] part 2 to 25385bb, actually commit the cf_netcdf file this time --- pyxlma/lmalib/io/cf_netcdf.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/pyxlma/lmalib/io/cf_netcdf.py b/pyxlma/lmalib/io/cf_netcdf.py index 6ce8d45..01749e4 100644 --- a/pyxlma/lmalib/io/cf_netcdf.py +++ b/pyxlma/lmalib/io/cf_netcdf.py @@ -335,6 +335,11 @@ def new_template_dataset(): 'long_name': "
<P/P_m>
"}, 'dtype': 'float32', }, + 'station_active': {'dims': ('number_of_stations',), + 'attrs': {'_FillValue': np.nan, + 'long_name': "Station active flag"}, + 'dtype': '|S2', + }, }} return __template_dataset From fb0b3abe6a078e6d03e5d4d974230587eaca6aa7 Mon Sep 17 00:00:00 2001 From: Sam Gardner Date: Mon, 1 Jul 2024 02:58:23 -0500 Subject: [PATCH 08/28] fill in known values when stations are offline --- pyxlma/lmalib/io/read.py | 26 ++++++++++++++++++++++++++ 1 file changed, 26 insertions(+) diff --git a/pyxlma/lmalib/io/read.py b/pyxlma/lmalib/io/read.py index 95cf941..5cc89d0 100644 --- a/pyxlma/lmalib/io/read.py +++ b/pyxlma/lmalib/io/read.py @@ -128,6 +128,20 @@ def reorder_stations(dataset, index_mapping): new_station_index = len(all_data.number_of_stations) # Expand all of the previous data to contain the new station. Fill with nan values for all previous times. all_data = all_data.reindex({'number_of_stations': np.arange(new_station_index+1)}, fill_value=np.nan) + for var in all_data.data_vars: + if 'number_of_stations' in all_data[var].dims: + if var in ['station_event_fraction', 'station_power_ratio', 'station_active', 'event_contributing_stations']: + val_to_fill = 0 + else: + val_to_fill = new_station[var].data.item() + if all_data[var].data.dtype != 'object': + filled_data = np.nan_to_num(all_data[var].data, copy=True, nan=val_to_fill) + else: + filled_data = all_data[var].data.copy().astype(new_station[var].data.dtype) + for i in np.ndindex(filled_data.shape): + if filled_data[i] == np.array([np.nan]).astype(new_station[var].data.dtype)[0]: + filled_data[i] = val_to_fill + all_data[var].data = filled_data # Update the dimension coordinate to reflect the new station's addition all_data['number_of_stations'] = ('number_of_stations', np.arange(new_station_index+1)) # Add the new station's information to the global list of known stations @@ -146,6 +160,18 @@ def reorder_stations(dataset, index_mapping): temp_station_id = len(new_file.number_of_stations) # fill the new row with nan values new_file = new_file.reindex({'number_of_stations': np.arange(temp_station_id+1)}, fill_value=np.nan) + for var in new_file.data_vars: + if var == 'number_of_stations': + continue + if 'number_of_stations' in new_file[var].dims: + if var in ['station_event_fraction', 'station_power_ratio', 'station_active', 'event_contributing_stations']: + val_to_fill = 0 + filled_data = np.nan_to_num(new_file[var].data, copy=True, nan=val_to_fill) + new_file[var].data = filled_data + else: + new_file[var].data[temp_station_id] = all_data[var].isel(number_of_stations=old_station_id, number_of_files=-1).data.item() + val_to_fill = all_data[var].isel(number_of_stations=old_station_id).data + val_to_fill = val_to_fill[0] new_file['number_of_stations'] = ('number_of_stations', np.arange(temp_station_id+1)) # add the dead station to the list of stations from this file so that it is the same length as the all_data dataset stations_in_file.append(temp_station_id) From b5555e97b206d09e8be86803876e119070245c5f Mon Sep 17 00:00:00 2001 From: Sam Gardner Date: Fri, 5 Jul 2024 01:53:41 -0500 Subject: [PATCH 09/28] only reduce number_of_files if it exists --- pyxlma/lmalib/io/read.py | 47 ++++++++++++++++++++-------------------- 1 file changed, 24 insertions(+), 23 deletions(-) diff --git a/pyxlma/lmalib/io/read.py b/pyxlma/lmalib/io/read.py index 5cc89d0..88e4194 100644 --- a/pyxlma/lmalib/io/read.py +++ b/pyxlma/lmalib/io/read.py @@ -210,30 +210,31 @@ def reorder_stations(dataset, index_mapping): # Update the global 
attributes all_data.attrs.update(final_attrs) # To reduce complexity and resource usage, if the 'number_of_files' dimension is the same for all variables, then the dimension is unnecessary - all_the_same = True - # These variables are expected to change between files, so they are not checked for equality - expected_to_change = ['station_event_fraction', 'station_power_ratio', 'file_start_time'] - for var in all_data.data_vars: - if var in expected_to_change: - continue - if 'number_of_files' in all_data[var].dims: - if (all_data[var] == all_data[var].isel(number_of_files=0)).all(): - pass - else: - all_the_same = False - # If all of the variables are the same for all files, remove the 'number_of_files' dimension. - if all_the_same: - # Some variables need to be averaged - mean_data = all_data.mean(dim='number_of_files') - # ...but most can just be copied over (this is necessary because some are strings which can't be averaged) - all_data = all_data.isel(number_of_files=0) - # the file_start_time variable is not needed in the reduced dataset - all_data = all_data.drop_vars(['file_start_time']) - # replace the copied variables with the averaged variables where necessary - for var in expected_to_change: - if var == 'file_start_time': + if 'number_of_files' in all_data.dims: + all_the_same = True + # These variables are expected to change between files, so they are not checked for equality + expected_to_change = ['station_event_fraction', 'station_power_ratio', 'file_start_time'] + for var in all_data.data_vars: + if var in expected_to_change: continue - all_data[var] = mean_data[var] + if 'number_of_files' in all_data[var].dims: + if (all_data[var] == all_data[var].isel(number_of_files=0)).all(): + pass + else: + all_the_same = False + # If all of the variables are the same for all files, remove the 'number_of_files' dimension. 
+ if all_the_same: + # Some variables need to be averaged + mean_data = all_data.mean(dim='number_of_files') + # ...but most can just be copied over (this is necessary because some are strings which can't be averaged) + all_data = all_data.isel(number_of_files=0) + # the file_start_time variable is not needed in the reduced dataset + all_data = all_data.drop_vars(['file_start_time']) + # replace the copied variables with the averaged variables where necessary + for var in expected_to_change: + if var == 'file_start_time': + continue + all_data[var] = mean_data[var] return all_data def dataset(filenames, sort_time=True): From d458c2291b7218c6eb60835a7d285b84a30742d8 Mon Sep 17 00:00:00 2001 From: Sam Gardner Date: Wed, 24 Jul 2024 00:33:15 -0500 Subject: [PATCH 10/28] identify unique network configurations instead of individual files --- pyxlma/lmalib/io/read.py | 40 ++++++++++++---------------------------- 1 file changed, 12 insertions(+), 28 deletions(-) diff --git a/pyxlma/lmalib/io/read.py b/pyxlma/lmalib/io/read.py index 7f02c61..45ab04a 100644 --- a/pyxlma/lmalib/io/read.py +++ b/pyxlma/lmalib/io/read.py @@ -169,7 +169,7 @@ def reorder_stations(dataset, index_mapping): filled_data = np.nan_to_num(new_file[var].data, copy=True, nan=val_to_fill) new_file[var].data = filled_data else: - new_file[var].data[temp_station_id] = all_data[var].isel(number_of_stations=old_station_id, number_of_files=-1).data.item() + new_file[var].data[temp_station_id] = all_data[var].isel(number_of_stations=old_station_id, network_configurations=-1).data.item() val_to_fill = all_data[var].isel(number_of_stations=old_station_id).data val_to_fill = val_to_fill[0] new_file['number_of_stations'] = ('number_of_stations', np.arange(temp_station_id+1)) @@ -184,7 +184,7 @@ def reorder_stations(dataset, index_mapping): # Concatenate the new file's station information with the previously-known station information lma_station_data = xr.concat( [d.drop_dims(['number_of_events']) for d in [all_data, new_file]], - dim='number_of_files' + dim='network_configurations' ) # Rebuild the event_contributing_stations array event_contributing_stations = xr.concat( @@ -209,32 +209,16 @@ def reorder_stations(dataset, index_mapping): all_data = combined_event_data # Update the global attributes all_data.attrs.update(final_attrs) - # To reduce complexity and resource usage, if the 'number_of_files' dimension is the same for all variables, then the dimension is unnecessary - if 'number_of_files' in all_data.dims: - all_the_same = True - # These variables are expected to change between files, so they are not checked for equality - expected_to_change = ['station_event_fraction', 'station_power_ratio', 'file_start_time'] - for var in all_data.data_vars: - if var in expected_to_change: - continue - if 'number_of_files' in all_data[var].dims: - if (all_data[var] == all_data[var].isel(number_of_files=0)).all(): - pass - else: - all_the_same = False - # If all of the variables are the same for all files, remove the 'number_of_files' dimension. 
- if all_the_same: - # Some variables need to be averaged - mean_data = all_data.mean(dim='number_of_files') - # ...but most can just be copied over (this is necessary because some are strings which can't be averaged) - all_data = all_data.isel(number_of_files=0) - # the file_start_time variable is not needed in the reduced dataset - all_data = all_data.drop_vars(['file_start_time']) - # replace the copied variables with the averaged variables where necessary - for var in expected_to_change: - if var == 'file_start_time': - continue - all_data[var] = mean_data[var] + # store start and end time and seconds analyzed as attributes, then drop file_start_time and seconds_analyzed dims + all_data.attrs['start_time'] = all_data['file_start_time'].data.min() + all_data = all_data.drop_vars(['file_start_time']) + # somehow this gets set to float64 (probably introduction of nan values above), that's unnecessary, use uint8 + all_data.station_active.data = all_data.station_active.data.astype(np.uint8) + # To reduce complexity and resource usage, if the 'network_configurations' dimension is the same for all variables, then the dimension is unnecessary + if 'network_configurations' in all_data.dims: + # Identify unique network configurations + unique_configs = np.unique(all_data.station_active.data, return_index=True, axis=0)[1] + all_data = all_data.isel(network_configurations=unique_configs) return all_data def dataset(filenames, sort_time=True): From fe6225c351a9d06932ee9ae884cba0968c3c4883 Mon Sep 17 00:00:00 2001 From: Sam Gardner Date: Wed, 24 Jul 2024 01:39:46 -0500 Subject: [PATCH 11/28] start time, end time, and analyzed sec of dataset, warn if mismatch --- pyxlma/lmalib/io/read.py | 32 +++++++++++++++++++++++--------- 1 file changed, 23 insertions(+), 9 deletions(-) diff --git a/pyxlma/lmalib/io/read.py b/pyxlma/lmalib/io/read.py index 45ab04a..1884eee 100644 --- a/pyxlma/lmalib/io/read.py +++ b/pyxlma/lmalib/io/read.py @@ -4,6 +4,7 @@ import gzip import datetime as dt from os import path +import warnings class open_gzip_or_dat: def __init__(self, filename): @@ -199,8 +200,7 @@ def reorder_stations(dataset, index_mapping): [d.drop_dims(['number_of_stations']).drop_vars( ['network_center_latitude', 'network_center_longitude', - 'network_center_altitude', - 'file_start_time'] + 'network_center_altitude'] ) for d in [all_data, new_file]], dim='number_of_events' ) @@ -209,15 +209,14 @@ def reorder_stations(dataset, index_mapping): all_data = combined_event_data # Update the global attributes all_data.attrs.update(final_attrs) - # store start and end time and seconds analyzed as attributes, then drop file_start_time and seconds_analyzed dims - all_data.attrs['start_time'] = all_data['file_start_time'].data.min() - all_data = all_data.drop_vars(['file_start_time']) # somehow this gets set to float64 (probably introduction of nan values above), that's unnecessary, use uint8 all_data.station_active.data = all_data.station_active.data.astype(np.uint8) # To reduce complexity and resource usage, if the 'network_configurations' dimension is the same for all variables, then the dimension is unnecessary if 'network_configurations' in all_data.dims: # Identify unique network configurations unique_configs = np.unique(all_data.station_active.data, return_index=True, axis=0)[1] + if len(unique_configs) == 1: + unique_configs = unique_configs[0] all_data = all_data.isel(network_configurations=unique_configs) return all_data @@ -229,6 +228,8 @@ def dataset(filenames, sort_time=True): filenames = [filenames] 
lma_data = [] starttime = None + analyzed_sec = 0 + endtime = None next_event_id = 0 for filename in filenames: lma_file = lmafile(filename) @@ -236,6 +237,11 @@ def dataset(filenames, sort_time=True): starttime = lma_file.starttime else: starttime = min(lma_file.starttime, starttime) + analyzed_sec += lma_file.analyzed_sec + if endtime is None: + endtime = lma_file.starttime + dt.timedelta(seconds=lma_file.analyzed_sec) + else: + endtime = max(lma_file.starttime + dt.timedelta(seconds=lma_file.analyzed_sec), endtime) # Accounting for empty files try: ds = to_dataset(lma_file, event_id_start=next_event_id).set_index( @@ -245,10 +251,17 @@ def dataset(filenames, sort_time=True): except: raise ds = combine_datasets(lma_data) + ds.attrs['analysis_start_time'] = starttime + ds.attrs['number_of_seconds_analyzed'] = analyzed_sec + ds.attrs['analysis_end_time'] = endtime + if starttime + dt.timedelta(seconds=analyzed_sec) < endtime: + warnings.warn('The number of seconds analyzed does not fully cover the time between the start and end times of the analysis. This may indicate ' + 'incomplete or non-continuous data.') + elif starttime + dt.timedelta(seconds=analyzed_sec) > endtime: + warnings.warn('The start time and end time of the analysis do not match the number of seconds analyzed. If you are combining multiple networks, ' + 'this is normal and expected, otherwise, this may indicate an error in the orignal analysis program.') if sort_time: ds = ds.sortby('event_time') - if 'file_start_time' in ds.dims: - ds = ds.sortby('file_start_time') ds = ds.reset_index(('number_of_events', 'number_of_stations')) if 'number_of_events_' in ds.coords: # Older xarray versions appended a trailing underscore. reset_coords then dropped @@ -331,7 +344,6 @@ def to_dataset(lma_file, event_id_start=0): ds.attrs['event_algorithm_name'] = lma_file.analysis_program ds.attrs['event_algorithm_version'] = lma_file.analysis_program_version ds.attrs['original_filename'] = path.basename(lma_file.file) - ds['file_start_time'] = starttime # -- Populate the station mask information -- # int, because NetCDF doesn't have booleans station_mask_bools = np.zeros((N_events, N_stations), dtype='int8') @@ -496,13 +508,15 @@ def __init__(self,filename): file_created = line.decode().split(':')[1:] self.file_created = ':'.join(file_created)[:-1] if line.startswith(b'Location:'): - self.network_location = ':'.join(line.decode().split(':')[1:])[:-1].replace(' ','') + self.network_location = ':'.join(line.decode().split(':')[1:])[:-1].replace(' ','', 1) if line.startswith(b'Data start time:'): timestring = line.decode().split()[-2:] self.startday = dt.datetime.strptime(timestring[0],'%m/%d/%y') # Full start time and second, likely unneeded self.starttime = dt.datetime.strptime(timestring[0]+timestring[1],'%m/%d/%y%H:%M:%S') # self.startsecond = (starttime-dt.datetime(starttime.year,starttime.month,starttime.day)).seconds + if line.startswith(b'Number of seconds analyzed:'): + self.analyzed_sec = int(line.decode().split(':')[-1]) # Find starting and ending rows for station information if line.startswith(b'Coordinate center'): self.center_lat = float(line.decode().split()[-3]) From 27b3c10c86243a768c63b5875f5bce5903c8e2f1 Mon Sep 17 00:00:00 2001 From: Sam Gardner Date: Wed, 24 Jul 2024 02:50:46 -0500 Subject: [PATCH 12/28] lma dat header (mostly) --- pyxlma/lmalib/io/write.py | 56 ++++++++++++++++++++++++++++++++++++++- 1 file changed, 55 insertions(+), 1 deletion(-) diff --git a/pyxlma/lmalib/io/write.py b/pyxlma/lmalib/io/write.py index 
aa0b8cd..ccaa458 100644 --- a/pyxlma/lmalib/io/write.py +++ b/pyxlma/lmalib/io/write.py @@ -1,4 +1,6 @@ import numpy as np +import datetime as dt +import warnings def dataset(dataset, path): "Write an LMA dataset to a netCDF file" @@ -13,4 +15,56 @@ def dataset(dataset, path): comp = dict(zlib=True, complevel=5) encoding = {var: comp for var in dataset.data_vars} dataset = dataset.chunk('auto') - dataset.to_netcdf(path, encoding=encoding) \ No newline at end of file + dataset.to_netcdf(path, encoding=encoding) + +def lma_dat_file(dataset, path, use_gzip=True): + """ + Write an LMA dataset to a legacy .dat(.gz) file for use in IDL XLMA. + + Parameters + ---------- + dataset : xarray.Dataset + The LMA dataset to write. + path : str + The target path to the file to write. + use_gzip : bool + Whether to compress the file using gzip. + """ + + if 'network_configurations' in dataset.dims: + warnings.warn('Attempting to write dataset with multiple network configurations to a single file. This is highly experimental.\n' + 'Caveats:\n-Location names will be appended and separated by semicolon\n- The center of the network is derived from the mean of the locations of the stations') + center_lon = dataset.station_longitude.data.mean() + center_lat = dataset.station_latitude.data.mean() + center_alt = dataset.station_altitude.data.mean() + num_active = np.sum(np.clip(np.sum(dataset.station_active.data, axis=0), 0, 1)) + else: + center_lon = dataset.network_center_longitude.data.item() + center_lat = dataset.network_center_latitude.data.item() + center_alt = dataset.network_center_altitude.data.item() + num_active = np.sum(dataset.station_active.data) + lma_header = (f'Lightning Mapping Array analyzed data\n' + f'Analysis program: {dataset.attrs["event_algorithm_name"]}\n' + f'Analysis program version: {dataset.attrs["event_algorithm_version"]}\n' + f'File created: {dt.datetime.utcnow().strftime("%a %b %d %H:%M:%S %Y")}\n' + f'Data start time: {dataset.attrs["analysis_start_time"].strftime("%M/%d/%y %H:%M:%S")}\n' + f'Number of seconds analyzed: {dataset.attrs["number_of_seconds_analyzed"]}\n' + f'Location: {"; ".join(np.unique(dataset.station_network.data).astype(str).tolist())}\n' + f'Coordinate center (lat,lon,alt): {center_lat:.7f}, {center_lon:.7f}, {center_alt:.2f}\n' + f'Coordinate frame: cartesian\n' + # f'Maximum diameter of LMA (km): ' + # f'Maximum light-time across LMA (ns): ' + f'Number of stations: {dataset.sizes['number_of_stations']}' + f'Number of active stations: {num_active}\n' + # f'Active stations:' + # f'Minimum number of stations per solution: {}\n' + # f'Maximum reduced chi-squared: {}\n' + # f'Maximum number of chi-squared iterations: {}\n' + ### STATION INFO TABLE + ### STATION DATA TABLE + f'Metric file version: 4\n' + f'Data: time (UT sec of day), lat, lon, alt(m), reduced chi^2, P(dBW), mask\n' + f'Data format: 15.9f 12.8f 13.8f 9.2f 6.2f 5.1f 7x\n' + f'Number of events: {dataset.sizes["number_of_events"]}\n' + f'*** data ***\n' + ) \ No newline at end of file From 3a4c1b33b08394184a325271a003338ea5aec1d0 Mon Sep 17 00:00:00 2001 From: Sam Gardner Date: Sun, 28 Jul 2024 00:45:57 -0500 Subject: [PATCH 13/28] make station event fraction a percentage as described by metadata --- pyxlma/lmalib/io/read.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/pyxlma/lmalib/io/read.py b/pyxlma/lmalib/io/read.py index 1884eee..e0a0a0c 100644 --- a/pyxlma/lmalib/io/read.py +++ b/pyxlma/lmalib/io/read.py @@ -364,6 +364,10 @@ def to_dataset(lma_file, event_id_start=0): 
station_active_data[ds.station_active.data == b'A'] = 1 station_active_data[ds.station_active.data == b'NA'] = 0 ds.station_active.data = station_active_data + + # Convert station event count to station event fraction + if N_events != 0: + ds.station_event_fraction.data = 100*ds.station_event_fraction.data/N_events return ds From 32e90d706942bc22931f11877c52e02770d010c1 Mon Sep 17 00:00:00 2001 From: Sam Gardner Date: Sun, 28 Jul 2024 00:47:10 -0500 Subject: [PATCH 14/28] track data coverage across network configs and gaps in coverage --- pyxlma/lmalib/io/read.py | 260 ++++++++++++++++++--------------------- 1 file changed, 122 insertions(+), 138 deletions(-) diff --git a/pyxlma/lmalib/io/read.py b/pyxlma/lmalib/io/read.py index e0a0a0c..09395ae 100644 --- a/pyxlma/lmalib/io/read.py +++ b/pyxlma/lmalib/io/read.py @@ -4,7 +4,8 @@ import gzip import datetime as dt from os import path -import warnings +import collections +from pyxlma.lmalib.io.cf_netcdf import new_dataset class open_gzip_or_dat: def __init__(self, filename): @@ -59,6 +60,8 @@ def reorder_stations(dataset, index_mapping): } final_attrs = {} for k in all_attrs: + if k.startswith('analysis_') and k.endswith('_time'): + continue attr_vals = all_attrs[k] set_of_values = set(attr_vals) if len(set_of_values) == 1: @@ -68,15 +71,17 @@ def reorder_stations(dataset, index_mapping): # Get the station data from the first dataset and assign each station a unique index lma_data[0]['station_code'] = lma_data[0].number_of_stations lma_data[0]['number_of_stations'] = np.arange(len(lma_data[0].number_of_stations)) + lma_data[0].attrs['config_times'] = [[[lma_data[0].attrs['analysis_start_time'], lma_data[0].attrs['analysis_end_time']]]] all_data = lma_data[0] - - # Set up a few arrays to keep track of all known stations - station_networks = all_data.station_network.data.copy() - station_codes = all_data.station_code.data.copy() - station_lats = all_data.station_latitude.data.copy() - station_lons = all_data.station_longitude.data.copy() - station_alts = all_data.station_altitude.data.copy() - + # Get the attributes attached to each variable in the dataset + dv_attrs = {} + new_ds = new_dataset(1, 1, 1) + for var in new_ds.data_vars: + dv_attrs[var] = new_ds[var].attrs + # Define list of 'properties', things which identify a station and are not expected to change + property_vars = ('station_latitude', 'station_longitude', 'station_altitude', 'station_code', 'station_network') + # Define list of variables to be recalculated for each station after the datasets are combined + recalc_vars = ('station_event_fraction', 'station_power_ratio', 'network_center_latitude', 'network_center_longitude', 'network_center_altitude') # Check each subsequent dataset for new stations for new_file_num in range(1, len(lma_data)): new_file = lma_data[new_file_num] @@ -85,41 +90,26 @@ def reorder_stations(dataset, index_mapping): new_file['number_of_stations'] = np.arange(len(new_file.number_of_stations)) stations_in_file = [] # Check each station in the new file against all known stations + station_is_new = True + old_ids_to_check_for_missing = all_data.number_of_stations.data.tolist() for station_num in range(len(new_file.number_of_stations.data)): station = new_file.isel(number_of_stations=station_num) - matching_networks = station_networks == station.station_network.data - matching_station_code = station_codes == station.station_code.data - matching_lat = station_lats == station.station_latitude.data - matching_lon = station_lons == station.station_longitude.data 
- matching_alt = station_alts == station.station_altitude.data - # If the lat/lon/alt are the same, then the station is in the same location - matching_loc = matching_lat * matching_lon * matching_alt - matching_id = matching_networks * matching_station_code - matching = matching_id * matching_loc - # check for a single match in the previously-known stations - if sum(matching) == 1: - # station completely matches previous - previous_index = np.argmax(matching) - stations_in_file.append(previous_index) - elif sum(matching_id) == 1: - # station has moved geographically since previous - previous_index = np.argmax(matching_id) - # Update the global list of lat/lon/alts to reflect the move - station_lats[previous_index] = station.station_latitude.data - station_lons[previous_index] = station.station_longitude.data - station_alts[previous_index] = station.station_altitude.data - stations_in_file.append(previous_index) - elif sum(matching_loc) == 1: - # station was renamed since previous - previous_index = np.argmax(matching_loc) - # Update the global list of station identifying information to reflect the rename - station_networks[previous_index] = station.station_network.data - station_codes[previous_index] = station.station_code.data - stations_in_file.append(previous_index) - else: - # station is new - previous_index = -1 - stations_in_file.append(previous_index) + old_ids_to_search = collections.deque(range(len(all_data.number_of_stations.data))) + old_ids_to_search.rotate(-1*station_num) # start with the same index because stations USUALLY come in the same order + for old_station_num in old_ids_to_search: + old_station = all_data.isel(number_of_stations=old_station_num) + all_props_match = True + for prop in property_vars: + if station.data_vars[prop].data.item() != old_station.data_vars[prop].data.item(): + all_props_match = False + break + if all_props_match: + station_is_new = False + stations_in_file.append(old_station_num) + old_ids_to_check_for_missing.remove(old_station_num) + break + if station_is_new: + stations_in_file.append(-1) # Find the indices of any newly-discovered stations indices_of_new_stations = np.where(np.array(stations_in_file) == -1)[0] # Add the new stations to the known stations @@ -127,66 +117,50 @@ def reorder_stations(dataset, index_mapping): new_station = new_file.isel(number_of_stations=idx_of_new_station) # The new station is appended to the end of the known stations new_station_index = len(all_data.number_of_stations) - # Expand all of the previous data to contain the new station. Fill with nan values for all previous times. - all_data = all_data.reindex({'number_of_stations': np.arange(new_station_index+1)}, fill_value=np.nan) - for var in all_data.data_vars: + # Expand all of the previous data to contain the new station. Fill with station properties. 
+ fill_vals_dict = {} + for var in new_station.data_vars: if 'number_of_stations' in all_data[var].dims: - if var in ['station_event_fraction', 'station_power_ratio', 'station_active', 'event_contributing_stations']: - val_to_fill = 0 - else: - val_to_fill = new_station[var].data.item() - if all_data[var].data.dtype != 'object': - filled_data = np.nan_to_num(all_data[var].data, copy=True, nan=val_to_fill) + if var in property_vars: + fill_vals_dict[var] = new_station[var].data.item() else: - filled_data = all_data[var].data.copy().astype(new_station[var].data.dtype) - for i in np.ndindex(filled_data.shape): - if filled_data[i] == np.array([np.nan]).astype(new_station[var].data.dtype)[0]: - filled_data[i] = val_to_fill - all_data[var].data = filled_data - # Update the dimension coordinate to reflect the new station's addition - all_data['number_of_stations'] = ('number_of_stations', np.arange(new_station_index+1)) - # Add the new station's information to the global list of known stations - station_networks = np.append(station_networks, new_station.station_network.data) - station_codes = np.append(station_codes, new_station.station_code.data) - station_lats = np.append(station_lats, new_station.station_latitude.data) - station_lons = np.append(station_lons, new_station.station_longitude.data) - station_alts = np.append(station_alts, new_station.station_altitude.data) + fill_vals_dict[var] = 0 + all_data = all_data.reindex({'number_of_stations': np.arange(new_station_index+1)}, fill_value=fill_vals_dict) # Update the station index for the new station stations_in_file[idx_of_new_station] = new_station_index # Check for any previously-known stations that are no longer in the new file - for old_station_id in all_data.number_of_stations.data: - if old_station_id not in stations_in_file: - # a station has been removed, create a row of nan values for this file to indicate that the station is no longer present - # first create a temporary index for the dead station - temp_station_id = len(new_file.number_of_stations) - # fill the new row with nan values - new_file = new_file.reindex({'number_of_stations': np.arange(temp_station_id+1)}, fill_value=np.nan) - for var in new_file.data_vars: - if var == 'number_of_stations': - continue - if 'number_of_stations' in new_file[var].dims: - if var in ['station_event_fraction', 'station_power_ratio', 'station_active', 'event_contributing_stations']: - val_to_fill = 0 - filled_data = np.nan_to_num(new_file[var].data, copy=True, nan=val_to_fill) - new_file[var].data = filled_data - else: - new_file[var].data[temp_station_id] = all_data[var].isel(number_of_stations=old_station_id, network_configurations=-1).data.item() - val_to_fill = all_data[var].isel(number_of_stations=old_station_id).data - val_to_fill = val_to_fill[0] - new_file['number_of_stations'] = ('number_of_stations', np.arange(temp_station_id+1)) - # add the dead station to the list of stations from this file so that it is the same length as the all_data dataset - stations_in_file.append(temp_station_id) - # update the mapping of the dead station to the temporary index so that the data can be re-ordered - stations_in_file[old_station_id] = temp_station_id - stations_in_file[temp_station_id] = old_station_id + for missing_station_id in old_ids_to_check_for_missing: + dead_station_data = all_data.isel(number_of_stations=missing_station_id) + # a station has been removed, create a row of nan values for this file to indicate that the station is no longer present + # first create a temporary index for the 
dead station + temp_station_id = len(new_file.number_of_stations) + fill_vals_dict = {} + for var in dead_station_data.data_vars: + if 'number_of_stations' in all_data[var].dims: + if var in property_vars: + fill_vals_dict[var] = dead_station_data[var].data.item() + else: + fill_vals_dict[var] = 0 + new_file = new_file.reindex({'number_of_stations': np.arange(temp_station_id+1)}, fill_value=fill_vals_dict) + # Update the station index for the dead station + stations_in_file.append(missing_station_id) # Re-order the station data to match the order of the stations in the new file # This can happen if lma_analysis decides to change the order of the stations, or if new stations are added or removed - new_file = reorder_stations(new_file, stations_in_file) + new_file = reorder_stations(new_file, np.argsort(stations_in_file)) # Concatenate the new file's station information with the previously-known station information - lma_station_data = xr.concat( - [d.drop_dims(['number_of_events']) for d in [all_data, new_file]], - dim='network_configurations' - ) + station_to_merge = [d.drop_dims(['number_of_events']) for d in [all_data, new_file]] + lma_station_data = xr.concat(station_to_merge, dim='network_configurations').drop_vars( + ['network_center_latitude', 'network_center_longitude', 'network_center_altitude']) + for var_name in lma_station_data.data_vars: + da = lma_station_data[var_name] + if 'network_configurations' in da.dims: + # black magic from openAI that somehow determines if a data array is identical across the 'network_configurations' variable + if np.all((da == da.isel({'network_configurations': 0})).all('network_configurations')) or var_name in recalc_vars: + # Remove the 'network_configurations' dimension if the data is identical across it + lma_station_data[var_name] = da.isel(network_configurations=0) + if 'network_configurations' in lma_station_data.dims: + unique_netw_configs = np.unique(lma_station_data.station_active.data, return_index=True, axis=0)[1] + lma_station_data = lma_station_data.isel(network_configurations=sorted(unique_netw_configs)) # Rebuild the event_contributing_stations array event_contributing_stations = xr.concat( [d['event_contributing_stations'] for d in [all_data, new_file]], @@ -197,27 +171,50 @@ def reorder_stations(dataset, index_mapping): # Combining the events is a simple concatenation. If only the stations had been this easy... 
lma_event_data = xr.concat( - [d.drop_dims(['number_of_stations']).drop_vars( - ['network_center_latitude', - 'network_center_longitude', - 'network_center_altitude'] - ) for d in [all_data, new_file]], + [d.drop_dims(['number_of_stations']) for d in [all_data, new_file]], dim='number_of_events' ) # merge all the data from this file with the previously-known data combined_event_data = xr.merge([lma_station_data, lma_event_data]) all_data = combined_event_data + # Log continuous intervals of data coverage + if 'network_configurations' in all_data.dims: + this_netw_conf = np.where((all_data.station_active.data == new_file.station_active.data).all(axis=1))[0][0] + else: + this_netw_conf = 0 + if len(all_data.attrs['config_times']) > this_netw_conf: + times_of_this_config = all_data.attrs['config_times'][this_netw_conf] + for gap in times_of_this_config: + if new_file.attrs['analysis_start_time'] == gap[1]: + gap[1] = new_file.attrs['analysis_end_time'] + break + elif new_file.attrs['analysis_end_time'] == gap[0]: + gap[0] = new_file.attrs['analysis_start_time'] + break + else: + times_of_this_config.append([new_file.attrs['analysis_start_time'], new_file.attrs['analysis_end_time']]) + else: + all_data.attrs['config_times'].append([[new_file.attrs['analysis_start_time'], new_file.attrs['analysis_end_time']]]) + all_data.attrs['analysis_start_time'] = min(all_data.attrs['analysis_start_time'], new_file.attrs['analysis_start_time']) + all_data.attrs['analysis_end_time'] = max(all_data.attrs['analysis_end_time'], new_file.attrs['analysis_end_time']) # Update the global attributes all_data.attrs.update(final_attrs) - # somehow this gets set to float64 (probably introduction of nan values above), that's unnecessary, use uint8 - all_data.station_active.data = all_data.station_active.data.astype(np.uint8) # To reduce complexity and resource usage, if the 'network_configurations' dimension is the same for all variables, then the dimension is unnecessary if 'network_configurations' in all_data.dims: # Identify unique network configurations unique_configs = np.unique(all_data.station_active.data, return_index=True, axis=0)[1] if len(unique_configs) == 1: unique_configs = unique_configs[0] - all_data = all_data.isel(network_configurations=unique_configs) + # recalculate variables that depend on the station data + all_data['network_center_longitude'] = all_data.station_longitude.mean(dim='number_of_stations') + all_data['network_center_latitude'] = all_data.station_latitude.mean(dim='number_of_stations') + all_data['network_center_altitude'] = all_data.station_altitude.mean(dim='number_of_stations') + all_data['station_event_fraction'] = 100*all_data.event_contributing_stations.sum(dim='number_of_events')/all_data.number_of_events.shape[0] + all_data['station_power_ratio'] = (all_data.event_contributing_stations * all_data.event_power).sum(dim='number_of_events')/all_data.event_power.sum(dim='number_of_events') + # restore previously cached data var attributes + for var_name in all_data.data_vars: + if var_name in dv_attrs: + all_data[var_name].attrs = dv_attrs[var_name] return all_data def dataset(filenames, sort_time=True): @@ -228,55 +225,42 @@ def dataset(filenames, sort_time=True): filenames = [filenames] lma_data = [] starttime = None - analyzed_sec = 0 - endtime = None + if sort_time: + all_starttimes = [] next_event_id = 0 - for filename in filenames: + for filename in sorted(filenames): lma_file = lmafile(filename) if starttime is None: starttime = lma_file.starttime else: starttime = 
min(lma_file.starttime, starttime) - analyzed_sec += lma_file.analyzed_sec - if endtime is None: - endtime = lma_file.starttime + dt.timedelta(seconds=lma_file.analyzed_sec) - else: - endtime = max(lma_file.starttime + dt.timedelta(seconds=lma_file.analyzed_sec), endtime) - # Accounting for empty files - try: - ds = to_dataset(lma_file, event_id_start=next_event_id).set_index( - {'number_of_stations':'station_code', 'number_of_events':'event_id'}) - lma_data.append(ds) - next_event_id += ds.sizes['number_of_events'] - except: - raise + ds = to_dataset(lma_file, event_id_start=next_event_id).set_index( + {'number_of_stations':'station_code', 'number_of_events':'event_id'}) + ds.attrs['analysis_start_time'] = lma_file.starttime + ds.attrs['analysis_end_time'] = lma_file.starttime + dt.timedelta(seconds=lma_file.analyzed_sec) + lma_data.append(ds) + next_event_id += ds.sizes['number_of_events'] + if sort_time: + all_starttimes.append(lma_file.starttime) + if sort_time: + sorting = np.argsort(all_starttimes) + lma_data = [lma_data[i] for i in sorting] ds = combine_datasets(lma_data) - ds.attrs['analysis_start_time'] = starttime - ds.attrs['number_of_seconds_analyzed'] = analyzed_sec - ds.attrs['analysis_end_time'] = endtime - if starttime + dt.timedelta(seconds=analyzed_sec) < endtime: - warnings.warn('The number of seconds analyzed does not fully cover the time between the start and end times of the analysis. This may indicate ' - 'incomplete or non-continuous data.') - elif starttime + dt.timedelta(seconds=analyzed_sec) > endtime: - warnings.warn('The start time and end time of the analysis do not match the number of seconds analyzed. If you are combining multiple networks, ' - 'this is normal and expected, otherwise, this may indicate an error in the orignal analysis program.') if sort_time: ds = ds.sortby('event_time') - ds = ds.reset_index(('number_of_events', 'number_of_stations')) + ds = ds.reset_index(('number_of_events')) if 'number_of_events_' in ds.coords: - # Older xarray versions appended a trailing underscore. reset_coords then dropped - # converted the coordinate variables into regular variables while not touching - # the original dimension name, allowing us to rename. In newer versions, the variable - # names are never changed at the reset_index step, so the renaming step modifies - # the dimension name, too. + # # Older xarray versions appended a trailing underscore. reset_coords then dropped + # # converted the coordinate variables into regular variables while not touching + # # the original dimension name, allowing us to rename. In newer versions, the variable + # # names are never changed at the reset_index step, so the renaming step modifies + # # the dimension name, too. resetted = ('number_of_events_', 'number_of_stations_') - ds = ds.reset_coords(resetted) ds = ds.rename({resetted[0]:'event_id'}) else: - # The approach for newer xarray versions requires we explicitly rename only the variables. - # The generic "rename" in the block above renames vars, coords, and dims. + # # The approach for newer xarray versions requires we explicitly rename only the variables. + # # The generic "rename" in the block above renames vars, coords, and dims. 
resetted = ('number_of_events', 'number_of_stations') - ds = ds.reset_coords(resetted) ds = ds.rename_vars({resetted[0]:'event_id'}) return ds, starttime @@ -286,7 +270,7 @@ def to_dataset(lma_file, event_id_start=0): returns an xarray dataset of the type returned by pyxlma.lmalib.io.cf_netcdf.new_dataset """ - from pyxlma.lmalib.io.cf_netcdf import new_dataset + lma_data = lma_file.readfile() starttime = lma_file.starttime stations = lma_file.stations From caaf6073bbb482ea9886880aaad050f0b7483063 Mon Sep 17 00:00:00 2001 From: Sam Gardner Date: Sun, 28 Jul 2024 01:11:11 -0500 Subject: [PATCH 15/28] use template cf dataset to restore metadata when combining #11 --- pyxlma/lmalib/io/read.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pyxlma/lmalib/io/read.py b/pyxlma/lmalib/io/read.py index 09395ae..aa4058c 100644 --- a/pyxlma/lmalib/io/read.py +++ b/pyxlma/lmalib/io/read.py @@ -5,7 +5,7 @@ import datetime as dt from os import path import collections -from pyxlma.lmalib.io.cf_netcdf import new_dataset +from pyxlma.lmalib.io.cf_netcdf import new_dataset, new_template_dataset class open_gzip_or_dat: def __init__(self, filename): @@ -75,7 +75,7 @@ def reorder_stations(dataset, index_mapping): all_data = lma_data[0] # Get the attributes attached to each variable in the dataset dv_attrs = {} - new_ds = new_dataset(1, 1, 1) + new_ds = new_template_dataset() for var in new_ds.data_vars: dv_attrs[var] = new_ds[var].attrs # Define list of 'properties', things which identify a station and are not expected to change From 81502fdb3ba60825242cb7656017e20e68c5c2ad Mon Sep 17 00:00:00 2001 From: Sam Gardner Date: Sun, 28 Jul 2024 01:12:03 -0500 Subject: [PATCH 16/28] fix cf comparison --- pyxlma/lmalib/io/cf_netcdf.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyxlma/lmalib/io/cf_netcdf.py b/pyxlma/lmalib/io/cf_netcdf.py index 01749e4..b192e71 100644 --- a/pyxlma/lmalib/io/cf_netcdf.py +++ b/pyxlma/lmalib/io/cf_netcdf.py @@ -466,7 +466,7 @@ def compare_attributes(ds): """ Compare the attributes of all data variables in ds to the CF spec and print a report of any differences. 
""" - dst = __template_dataset['data_vars'] + dst = new_template_dataset()['data_vars'] dsd = ds.to_dict()['data_vars'] for k in dsd.keys(): if dst[k]['attrs'] != dsd[k]['attrs']: From 9491cc2c029ed329e63f21b40312bc55458fe4cf Mon Sep 17 00:00:00 2001 From: Sam Gardner Date: Sun, 28 Jul 2024 01:16:13 -0500 Subject: [PATCH 17/28] that's not an xarray yet --- pyxlma/lmalib/io/read.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/pyxlma/lmalib/io/read.py b/pyxlma/lmalib/io/read.py index aa4058c..c3e22e9 100644 --- a/pyxlma/lmalib/io/read.py +++ b/pyxlma/lmalib/io/read.py @@ -76,8 +76,9 @@ def reorder_stations(dataset, index_mapping): # Get the attributes attached to each variable in the dataset dv_attrs = {} new_ds = new_template_dataset() - for var in new_ds.data_vars: - dv_attrs[var] = new_ds[var].attrs + print(new_ds) + for var in new_ds['data_vars']: + dv_attrs[var] = new_ds['data_vars'][var]['attrs'] # Define list of 'properties', things which identify a station and are not expected to change property_vars = ('station_latitude', 'station_longitude', 'station_altitude', 'station_code', 'station_network') # Define list of variables to be recalculated for each station after the datasets are combined From dffb0fb6c3fa4bc598776af58556643298c3798f Mon Sep 17 00:00:00 2001 From: Sam Gardner Date: Sun, 28 Jul 2024 01:44:30 -0500 Subject: [PATCH 18/28] recalculate station mask after combine --- pyxlma/lmalib/io/read.py | 18 ++++++++++-------- 1 file changed, 10 insertions(+), 8 deletions(-) diff --git a/pyxlma/lmalib/io/read.py b/pyxlma/lmalib/io/read.py index c3e22e9..ef6448b 100644 --- a/pyxlma/lmalib/io/read.py +++ b/pyxlma/lmalib/io/read.py @@ -76,7 +76,6 @@ def reorder_stations(dataset, index_mapping): # Get the attributes attached to each variable in the dataset dv_attrs = {} new_ds = new_template_dataset() - print(new_ds) for var in new_ds['data_vars']: dv_attrs[var] = new_ds['data_vars'][var]['attrs'] # Define list of 'properties', things which identify a station and are not expected to change @@ -212,6 +211,9 @@ def reorder_stations(dataset, index_mapping): all_data['network_center_altitude'] = all_data.station_altitude.mean(dim='number_of_stations') all_data['station_event_fraction'] = 100*all_data.event_contributing_stations.sum(dim='number_of_events')/all_data.number_of_events.shape[0] all_data['station_power_ratio'] = (all_data.event_contributing_stations * all_data.event_power).sum(dim='number_of_events')/all_data.event_power.sum(dim='number_of_events') + mask_strings = np.apply_along_axis(lambda x: ''.join(x), 1, all_data.event_contributing_stations.astype(str).data) # convert each event's contributing stations (T/F) to binary + all_data.event_mask.data = np.vectorize(lambda x: int(x, 2))(mask_strings) # convert bin to dec + all_data.attrs['station_mask_order'] = np.apply_along_axis(lambda x: ''.join(x), 0, all_data.station_code.astype(str).data).item() # restore previously cached data var attributes for var_name in all_data.data_vars: if var_name in dv_attrs: @@ -251,16 +253,16 @@ def dataset(filenames, sort_time=True): ds = ds.sortby('event_time') ds = ds.reset_index(('number_of_events')) if 'number_of_events_' in ds.coords: - # # Older xarray versions appended a trailing underscore. reset_coords then dropped - # # converted the coordinate variables into regular variables while not touching - # # the original dimension name, allowing us to rename. 
In newer versions, the variable - # # names are never changed at the reset_index step, so the renaming step modifies - # # the dimension name, too. + # Older xarray versions appended a trailing underscore. reset_coords then dropped + # converted the coordinate variables into regular variables while not touching + # the original dimension name, allowing us to rename. In newer versions, the variable + # names are never changed at the reset_index step, so the renaming step modifies + # the dimension name, too. resetted = ('number_of_events_', 'number_of_stations_') ds = ds.rename({resetted[0]:'event_id'}) else: - # # The approach for newer xarray versions requires we explicitly rename only the variables. - # # The generic "rename" in the block above renames vars, coords, and dims. + # The approach for newer xarray versions requires we explicitly rename only the variables. + # The generic "rename" in the block above renames vars, coords, and dims. resetted = ('number_of_events', 'number_of_stations') ds = ds.rename_vars({resetted[0]:'event_id'}) return ds, starttime From 8ad6f0976f2a44a09ff8be8855a63185662e288d Mon Sep 17 00:00:00 2001 From: Sam Gardner Date: Sun, 28 Jul 2024 03:42:22 -0500 Subject: [PATCH 19/28] try to make station IDs unique, fix writing, dont recalc center unless necessary --- pyxlma/lmalib/io/read.py | 64 +++++++++++++++++++++++++++++++++++---- pyxlma/lmalib/io/write.py | 4 +++ 2 files changed, 62 insertions(+), 6 deletions(-) diff --git a/pyxlma/lmalib/io/read.py b/pyxlma/lmalib/io/read.py index ef6448b..1e92531 100644 --- a/pyxlma/lmalib/io/read.py +++ b/pyxlma/lmalib/io/read.py @@ -5,6 +5,8 @@ import datetime as dt from os import path import collections +import string +import warnings from pyxlma.lmalib.io.cf_netcdf import new_dataset, new_template_dataset class open_gzip_or_dat: @@ -82,9 +84,15 @@ def reorder_stations(dataset, index_mapping): property_vars = ('station_latitude', 'station_longitude', 'station_altitude', 'station_code', 'station_network') # Define list of variables to be recalculated for each station after the datasets are combined recalc_vars = ('station_event_fraction', 'station_power_ratio', 'network_center_latitude', 'network_center_longitude', 'network_center_altitude') + # Will be set to True if network_center location needs to be recalculated + recalc_center = False # Check each subsequent dataset for new stations for new_file_num in range(1, len(lma_data)): new_file = lma_data[new_file_num] + if (np.all(new_file.network_center_latitude.data != all_data.network_center_latitude.data) + or np.all(new_file.network_center_longitude.data.item() != all_data.network_center_longitude.data) + or np.all(new_file.network_center_altitude.data != all_data.network_center_altitude.data)): + recalc_center = True # Demote station code to a data variable and assign an index to each station in the new file new_file['station_code'] = new_file.number_of_stations new_file['number_of_stations'] = np.arange(len(new_file.number_of_stations)) @@ -197,6 +205,9 @@ def reorder_stations(dataset, index_mapping): all_data.attrs['config_times'].append([[new_file.attrs['analysis_start_time'], new_file.attrs['analysis_end_time']]]) all_data.attrs['analysis_start_time'] = min(all_data.attrs['analysis_start_time'], new_file.attrs['analysis_start_time']) all_data.attrs['analysis_end_time'] = max(all_data.attrs['analysis_end_time'], new_file.attrs['analysis_end_time']) + all_data['network_center_longitude'] = all_data.network_center_longitude.isel(number_of_events=0) + 
all_data['network_center_latitude'] = all_data.network_center_latitude.isel(number_of_events=0) + all_data['network_center_altitude'] = all_data.network_center_altitude.isel(number_of_events=0) # Update the global attributes all_data.attrs.update(final_attrs) # To reduce complexity and resource usage, if the 'network_configurations' dimension is the same for all variables, then the dimension is unnecessary @@ -205,12 +216,52 @@ def reorder_stations(dataset, index_mapping): unique_configs = np.unique(all_data.station_active.data, return_index=True, axis=0)[1] if len(unique_configs) == 1: unique_configs = unique_configs[0] + else: + all_data['station_event_fraction'] = 100*all_data.event_contributing_stations.sum(dim='number_of_events')/all_data.number_of_events.shape[0] + all_data['station_power_ratio'] = (all_data.event_contributing_stations * all_data.event_power).sum(dim='number_of_events')/all_data.event_power.sum(dim='number_of_events') # recalculate variables that depend on the station data - all_data['network_center_longitude'] = all_data.station_longitude.mean(dim='number_of_stations') - all_data['network_center_latitude'] = all_data.station_latitude.mean(dim='number_of_stations') - all_data['network_center_altitude'] = all_data.station_altitude.mean(dim='number_of_stations') - all_data['station_event_fraction'] = 100*all_data.event_contributing_stations.sum(dim='number_of_events')/all_data.number_of_events.shape[0] - all_data['station_power_ratio'] = (all_data.event_contributing_stations * all_data.event_power).sum(dim='number_of_events')/all_data.event_power.sum(dim='number_of_events') + if recalc_center: + all_data['network_center_longitude'] = all_data.station_longitude.mean(dim='number_of_stations') + all_data['network_center_latitude'] = all_data.station_latitude.mean(dim='number_of_stations') + all_data['network_center_altitude'] = all_data.station_altitude.mean(dim='number_of_stations') + # Make sure all station codes are unique + if np.max(np.unique_counts(all_data.station_code.data).counts) > 1: + unique_stat = [] + the_alphabet = string.ascii_letters + for i in range(len(all_data.station_code.data)): + stat = all_data.station_code.data[i] + if stat in unique_stat: + warnings.warn(f'Conflicting station code discovered at index {i}') + stat_str = stat.decode('utf-8') + if stat_str.isupper(): + lowc = stat_str.lower() + if bytes(lowc, 'utf-8') not in unique_stat: + warnings.warn(f'Renaming station {stat_str} to {lowc}') + unique_stat.append(bytes(lowc, 'utf-8')) + else: + for new_letter in the_alphabet: + if bytes(new_letter, 'utf-8') not in unique_stat: + warnings.warn(f'Renaming station {stat_str} to {new_letter}') + unique_stat.append(bytes(new_letter, 'utf-8')) + break + else: + warnings.warn(f'Assigning station a blank ID. This station will not be included in files generated by pyxlma.lmalib.io.write.lma_dat_file') + elif stat_str.islower(): + upc = stat_str.upper() + if bytes(upc, 'utf-8') not in unique_stat: + warnings.warn(f'Renaming station {stat_str} to {upc}') + unique_stat.append(bytes(upc, 'utf-8')) + else: + for new_letter in the_alphabet: + if bytes(new_letter, 'utf-8') not in unique_stat: + warnings.warn(f'Renaming station {stat_str} to {new_letter}') + unique_stat.append(bytes(new_letter, 'utf-8')) + break + else: + warnings.warn(f'Assigning station a blank ID. 
This station will not be included in files generated by pyxlma.lmalib.io.write.lma_dat_file') + else: + unique_stat.append(stat) + all_data.station_code.data = np.array(unique_stat, dtype='S1') mask_strings = np.apply_along_axis(lambda x: ''.join(x), 1, all_data.event_contributing_stations.astype(str).data) # convert each event's contributing stations (T/F) to binary all_data.event_mask.data = np.vectorize(lambda x: int(x, 2))(mask_strings) # convert bin to dec all_data.attrs['station_mask_order'] = np.apply_along_axis(lambda x: ''.join(x), 0, all_data.station_code.astype(str).data).item() @@ -218,6 +269,7 @@ def reorder_stations(dataset, index_mapping): for var_name in all_data.data_vars: if var_name in dv_attrs: all_data[var_name].attrs = dv_attrs[var_name] + all_data.station_active.attrs['_FillValue'] = 255 return all_data def dataset(filenames, sort_time=True): @@ -347,7 +399,7 @@ def to_dataset(lma_file, event_id_start=0): ds['event_contributing_stations'][:] = station_mask_bools # -- Convert the station_active flag to a psuedo-boolean -- - station_active_data = np.zeros(N_stations, dtype='int8') + station_active_data = np.zeros(N_stations, dtype='uint8') station_active_data[ds.station_active.data == b'A'] = 1 station_active_data[ds.station_active.data == b'NA'] = 0 ds.station_active.data = station_active_data diff --git a/pyxlma/lmalib/io/write.py b/pyxlma/lmalib/io/write.py index ccaa458..4ba94dc 100644 --- a/pyxlma/lmalib/io/write.py +++ b/pyxlma/lmalib/io/write.py @@ -12,6 +12,10 @@ def dataset(dataset, path): for var in dataset.data_vars: if np.all(dataset[var].data == np.nan): dataset = dataset.drop_vars(var) + for attr in dataset.attrs: + if type(dataset.attrs[attr]) == dt.datetime: + dataset.attrs[attr] = dataset.attrs[attr].strftime('%Y-%m-%d %H:%M:%S.%f') + dataset.attrs.pop('config_times', None) comp = dict(zlib=True, complevel=5) encoding = {var: comp for var in dataset.data_vars} dataset = dataset.chunk('auto') From 1e739f6a94e2e08bd008b0d7ea2f59aacc4f0161 Mon Sep 17 00:00:00 2001 From: Sam Gardner Date: Sun, 28 Jul 2024 03:43:19 -0500 Subject: [PATCH 20/28] reading multifile test more comprehensive --- tests/test_io.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/tests/test_io.py b/tests/test_io.py index 85e8d74..9277cdb 100644 --- a/tests/test_io.py +++ b/tests/test_io.py @@ -15,7 +15,9 @@ def test_read_mf_dataset(): files_to_read = ['examples/data/'+file for file in files_to_read] dataset, start_date = lma_read.dataset(files_to_read) assert start_date == dt(2023, 12, 24, 0, 57, 1) - assert dataset == xr.open_dataset('tests/truth/lma_netcdf/lma.nc') + truth = xr.open_dataset('tests/truth/lma_netcdf/lma.nc') + for var in truth.data_vars: + compare_dataarrays(dataset, truth, var) def test_read_onefile_dataset(): From e72b3b4369dfd145e08f01a1372dc63d3b9baabb Mon Sep 17 00:00:00 2001 From: Sam Gardner Date: Sun, 28 Jul 2024 04:10:16 -0500 Subject: [PATCH 21/28] vectorize station mask calculation --- pyxlma/lmalib/io/read.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pyxlma/lmalib/io/read.py b/pyxlma/lmalib/io/read.py index 1e92531..787b644 100644 --- a/pyxlma/lmalib/io/read.py +++ b/pyxlma/lmalib/io/read.py @@ -262,8 +262,8 @@ def reorder_stations(dataset, index_mapping): else: unique_stat.append(stat) all_data.station_code.data = np.array(unique_stat, dtype='S1') - mask_strings = np.apply_along_axis(lambda x: ''.join(x), 1, all_data.event_contributing_stations.astype(str).data) # convert each event's contributing stations 
(T/F) to binary - all_data.event_mask.data = np.vectorize(lambda x: int(x, 2))(mask_strings) # convert bin to dec + bin_to_dec = 2**np.flip(np.arange(all_data.event_contributing_stations.data.shape[1])) + all_data.event_mask.data = np.sum((bin_to_dec * all_data.event_contributing_stations.data), axis=1) # convert bin to dec all_data.attrs['station_mask_order'] = np.apply_along_axis(lambda x: ''.join(x), 0, all_data.station_code.astype(str).data).item() # restore previously cached data var attributes for var_name in all_data.data_vars: From 3e762835a5d210e94455ac11fcc9c881f61fb464 Mon Sep 17 00:00:00 2001 From: Sam Gardner Date: Sun, 28 Jul 2024 20:48:54 -0500 Subject: [PATCH 22/28] read station name, board revision, delay, and rec_ch --- pyxlma/lmalib/io/cf_netcdf.py | 20 +++++++++++++++++++ pyxlma/lmalib/io/read.py | 36 +++++++++++++++++++---------------- 2 files changed, 40 insertions(+), 16 deletions(-) diff --git a/pyxlma/lmalib/io/cf_netcdf.py b/pyxlma/lmalib/io/cf_netcdf.py index b192e71..e606bfa 100644 --- a/pyxlma/lmalib/io/cf_netcdf.py +++ b/pyxlma/lmalib/io/cf_netcdf.py @@ -301,6 +301,10 @@ def new_template_dataset(): 'attrs': {'_FillValue': ' '*32, 'long_name': 'LMA network'}, 'dtype': '|S32', }, + 'station_name': {'dims': ('number_of_stations',),# 'string32'), + 'attrs': {'_FillValue': ' '*32, 'long_name': 'Name of receiving station'}, + 'dtype': '|S32', + }, 'station_latitude': {'dims': ('number_of_stations',), 'attrs': {'_FillValue': np.nan, 'valid_range': [-90.0, 90.0], @@ -323,6 +327,22 @@ def new_template_dataset(): 'positive': 'up'}, 'dtype': 'float32', }, + 'station_delay': {'dims': ('number_of_stations',), + 'attrs': {'_FillValue': np.nan, + 'long_name': "Configured delay time for a signal to travel from the antenna to the receiver", + 'units': 'nanoseconds'}, + 'dtype': 'float32', + }, + 'station_board_revision': {'dims': ('number_of_stations',), + 'attrs': {'_FillValue': 255, + 'long_name': "Station board revision number"}, + 'dtype': 'uint8', + }, + 'station_receive_channels': {'dims': ('number_of_stations',), + 'attrs': {'_FillValue': 255, + 'long_name': "Number of channels on the station's receiver"}, + 'dtype': 'uint8', + }, 'station_event_fraction': {'dims': ('number_of_stations',), 'attrs': {'_FillValue': np.nan, 'long_name': "Fraction of events to which this station contributed", diff --git a/pyxlma/lmalib/io/read.py b/pyxlma/lmalib/io/read.py index 787b644..1d1db46 100644 --- a/pyxlma/lmalib/io/read.py +++ b/pyxlma/lmalib/io/read.py @@ -325,7 +325,7 @@ def to_dataset(lma_file, event_id_start=0): returns an xarray dataset of the type returned by pyxlma.lmalib.io.cf_netcdf.new_dataset """ - + lma_data = lma_file.readfile() starttime = lma_file.starttime stations = lma_file.stations @@ -343,6 +343,10 @@ def to_dataset(lma_file, event_id_start=0): 'station_event_fraction':'sources', 'station_power_ratio':'
<P/P_m>
', 'station_active':'active', + 'station_name':'Name', + 'station_board_revision':'Board Revision', + 'station_delay':'Delay Time', + 'station_receive_channels':'Receiver Channels', } event_mapping = { 'event_latitude':'lat', @@ -383,6 +387,9 @@ def to_dataset(lma_file, event_id_start=0): ds.attrs['event_algorithm_name'] = lma_file.analysis_program ds.attrs['event_algorithm_version'] = lma_file.analysis_program_version ds.attrs['original_filename'] = path.basename(lma_file.file) + ds.attrs['max_chi2'] = lma_file.maximum_chi2 + ds.attrs['min_stations'] = lma_file.minimum_stations + ds.attrs['max_chi2_iterations'] = lma_file.maximum_chi2_iter # -- Populate the station mask information -- # int, because NetCDF doesn't have booleans station_mask_bools = np.zeros((N_events, N_stations), dtype='int8') @@ -542,20 +549,17 @@ def __init__(self,filename): with open_gzip_or_dat(self.file) as f: for line_no, line in enumerate(f): if line.startswith(b'Analysis program:'): - analysis_program = line.decode().split(':')[1:] - self.analysis_program = ':'.join(analysis_program)[:-1] + self.analysis_program = line.decode().replace('Analysis program: ','').replace('Analysis program:','') if line.startswith(b'Analysis program version:'): - analysis_program_version = line.decode().split(':')[1:] - self.analysis_program_version = ':'.join(analysis_program_version)[:-1] + self.analysis_program_version = line.decode().replace('Analysis program version: ', '').replace('Analysis program version:', '') if line.startswith(b'File created:') | line.startswith(b'Analysis finished:'): - file_created = line.decode().split(':')[1:] - self.file_created = ':'.join(file_created)[:-1] + self.file_created = line.decode().replace('File created: ', '').replace('File created:', '').replace('Analysis finished: ', '').replace('Analysis finished:', '') if line.startswith(b'Location:'): - self.network_location = ':'.join(line.decode().split(':')[1:])[:-1].replace(' ','', 1) + self.network_location = line.decode().replace('Location: ', '').replace('Location:', '') if line.startswith(b'Data start time:'): timestring = line.decode().split()[-2:] self.startday = dt.datetime.strptime(timestring[0],'%m/%d/%y') - # Full start time and second, likely unneeded + # Full start time and second self.starttime = dt.datetime.strptime(timestring[0]+timestring[1],'%m/%d/%y%H:%M:%S') # self.startsecond = (starttime-dt.datetime(starttime.year,starttime.month,starttime.day)).seconds if line.startswith(b'Number of seconds analyzed:'): @@ -568,7 +572,7 @@ def __init__(self,filename): # Number of active stations if line.startswith(b'Number of active stations:'): self.active_station_c_line = line_no - self.active_staion_c_count = line.decode().split()[-1] + self.active_staion_c_count = int(line.decode().split()[-1]) # Active stations if line.startswith(b'Active stations:'): self.active_station_s_line = line_no @@ -577,7 +581,7 @@ def __init__(self,filename): self.minimum_stations = int(line.decode().split(':')[1]) if line.startswith(b'Maximum reduced chi-squared:'): self.maximum_chi2 = float(line.decode().split(':')[1]) - if line.startswith(b'Maximum chi-squared iterations:'): + if line.startswith(b'Maximum chi-squared iterations:') or line.startswith(b'Maximum number of chi-squared iterations:'): self.maximum_chi2_iter = int(line.decode().split(':')[1]) if line.startswith(b'Station information:'): self.station_info_start = line_no @@ -606,13 +610,13 @@ def __init__(self,filename): # Station overview information stations = pd.DataFrame(self.gen_sta_info(), - 
columns=['ID','Name','Lat','Long','Alt','Delay Time']) + columns=['ID','Name','Lat','Long','Alt','Delay Time','Board Revision','Receiver Channels']) overview = pd.DataFrame(self.gen_sta_data(), columns=['ID','Name','win(us)', 'data_ver', 'rms_error(ns)', - 'sources','percent','
<P/P_m>
','active']) + 'sources','percent','
<P/P_m>
','active']).drop(columns=['Name']) # Drop the station name column that has a redundant station letter code # as part of the name and join on station letter code. - station_combo = stations.set_index('ID').drop(columns=['Name']).join( + station_combo = stations.set_index('ID').join( overview.set_index('ID')) self.stations = station_combo.reset_index(level=station_combo.index.names) @@ -631,7 +635,7 @@ def gen_sta_info(self): parts = line.decode("utf-8").split() name = ' '.join(parts[2:-6]) sta_info, code = parts[0:2] - yield (code, name) + tuple(parts[-6:-2]) + yield (code, name) + tuple(parts[-6:]) def gen_sta_data(self): """ Parse the station data table from the header. Some files do not @@ -680,7 +684,7 @@ def readfile(self): lmad.insert(1,'Datetime',pd.to_timedelta(lmad['time (UT sec of day)'], unit='s')+self.startday) # Parse out which stations contributed into new columns for each station - col_names = self.stations.Name.values + col_names = self.stations.Name.values.copy() self.mask_ints = mask_to_int(lmad["mask"]) for index,items in enumerate(self.maskorder[::-1]): col_names[index] = items+'_'+self.stations.Name.values[index] From 0cb0d9c16e4c190d6515856ad01aeea2585a511b Mon Sep 17 00:00:00 2001 From: Sam Gardner Date: Sun, 28 Jul 2024 20:52:50 -0500 Subject: [PATCH 23/28] this doesn't appear to be used for anything... --- pyxlma/lmalib/io/read.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/pyxlma/lmalib/io/read.py b/pyxlma/lmalib/io/read.py index 1d1db46..d3b21a5 100644 --- a/pyxlma/lmalib/io/read.py +++ b/pyxlma/lmalib/io/read.py @@ -587,8 +587,6 @@ def __init__(self,filename): self.station_info_start = line_no if line.startswith(b'Station data:'): self.station_data_start = line_no - if line.startswith(b'Metric file:'): - self.station_data_end = line_no # Find mask list order if line.startswith(b'Station mask order:'): self.maskorder = line.decode().split()[-1] From 17e89108e57d4a72b8c2f3c1a2f7781af1c61c42 Mon Sep 17 00:00:00 2001 From: Sam Gardner Date: Mon, 29 Jul 2024 00:21:32 -0500 Subject: [PATCH 24/28] make fill values more obvious --- pyxlma/lmalib/io/cf_netcdf.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pyxlma/lmalib/io/cf_netcdf.py b/pyxlma/lmalib/io/cf_netcdf.py index e606bfa..7a83347 100644 --- a/pyxlma/lmalib/io/cf_netcdf.py +++ b/pyxlma/lmalib/io/cf_netcdf.py @@ -334,12 +334,12 @@ def new_template_dataset(): 'dtype': 'float32', }, 'station_board_revision': {'dims': ('number_of_stations',), - 'attrs': {'_FillValue': 255, + 'attrs': {'_FillValue': 0, 'long_name': "Station board revision number"}, 'dtype': 'uint8', }, 'station_receive_channels': {'dims': ('number_of_stations',), - 'attrs': {'_FillValue': 255, + 'attrs': {'_FillValue': 0, 'long_name': "Number of channels on the station's receiver"}, 'dtype': 'uint8', }, From 55c29e12636742c8c6bf3c9790e2170feb9fe567 Mon Sep 17 00:00:00 2001 From: Sam Gardner Date: Mon, 29 Jul 2024 00:21:59 -0500 Subject: [PATCH 25/28] remove newlines from attrs --- pyxlma/lmalib/io/read.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/pyxlma/lmalib/io/read.py b/pyxlma/lmalib/io/read.py index d3b21a5..86af36b 100644 --- a/pyxlma/lmalib/io/read.py +++ b/pyxlma/lmalib/io/read.py @@ -549,13 +549,13 @@ def __init__(self,filename): with open_gzip_or_dat(self.file) as f: for line_no, line in enumerate(f): if line.startswith(b'Analysis program:'): - self.analysis_program = line.decode().replace('Analysis program: ','').replace('Analysis program:','') + self.analysis_program = 
line.decode().replace('Analysis program: ','').replace('Analysis program:','').replace('\n', '') if line.startswith(b'Analysis program version:'): - self.analysis_program_version = line.decode().replace('Analysis program version: ', '').replace('Analysis program version:', '') + self.analysis_program_version = line.decode().replace('Analysis program version: ', '').replace('Analysis program version:', '').replace('\n', '') if line.startswith(b'File created:') | line.startswith(b'Analysis finished:'): - self.file_created = line.decode().replace('File created: ', '').replace('File created:', '').replace('Analysis finished: ', '').replace('Analysis finished:', '') + self.file_created = line.decode().replace('File created: ', '').replace('File created:', '').replace('Analysis finished: ', '').replace('Analysis finished:', '').replace('\n', '') if line.startswith(b'Location:'): - self.network_location = line.decode().replace('Location: ', '').replace('Location:', '') + self.network_location = line.decode().replace('Location: ', '').replace('Location:', '').replace('\n', '') if line.startswith(b'Data start time:'): timestring = line.decode().split()[-2:] self.startday = dt.datetime.strptime(timestring[0],'%m/%d/%y') From 0cc3204b30bd6c4c5b4c69a26ef002ef5fa62c99 Mon Sep 17 00:00:00 2001 From: Sam Gardner Date: Mon, 29 Jul 2024 00:47:14 -0500 Subject: [PATCH 26/28] add new properties --- pyxlma/lmalib/io/read.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyxlma/lmalib/io/read.py b/pyxlma/lmalib/io/read.py index 86af36b..64df2c0 100644 --- a/pyxlma/lmalib/io/read.py +++ b/pyxlma/lmalib/io/read.py @@ -81,7 +81,7 @@ def reorder_stations(dataset, index_mapping): for var in new_ds['data_vars']: dv_attrs[var] = new_ds['data_vars'][var]['attrs'] # Define list of 'properties', things which identify a station and are not expected to change - property_vars = ('station_latitude', 'station_longitude', 'station_altitude', 'station_code', 'station_network') + property_vars = ('station_latitude', 'station_longitude', 'station_altitude', 'station_code', 'station_network', 'station_name', 'station_board_revision', 'station_delay', 'station_receive_channels') # Define list of variables to be recalculated for each station after the datasets are combined recalc_vars = ('station_event_fraction', 'station_power_ratio', 'network_center_latitude', 'network_center_longitude', 'network_center_altitude') # Will be set to True if network_center location needs to be recalculated From 0defdeac53bc82c58e8e18b64469af755d1be421 Mon Sep 17 00:00:00 2001 From: Sam Gardner Date: Mon, 29 Jul 2024 00:52:14 -0500 Subject: [PATCH 27/28] better handle combining new attrs --- pyxlma/lmalib/io/read.py | 15 +++++++++++++-- 1 file changed, 13 insertions(+), 2 deletions(-) diff --git a/pyxlma/lmalib/io/read.py b/pyxlma/lmalib/io/read.py index 64df2c0..a5f5e7a 100644 --- a/pyxlma/lmalib/io/read.py +++ b/pyxlma/lmalib/io/read.py @@ -66,10 +66,21 @@ def reorder_stations(dataset, index_mapping): continue attr_vals = all_attrs[k] set_of_values = set(attr_vals) - if len(set_of_values) == 1: + if len(set_of_values) == 0: + continue + elif len(set_of_values) == 1: final_attrs[k] = tuple(set_of_values)[0] else: - final_attrs[k] = '; '.join(attr_vals) + if type(attr_vals[0]) == str: + final_attrs[k] = '; '.join(attr_vals) + elif k == 'min_stations': + final_attrs[k] = min(attr_vals) + elif k == 'max_chi2': + final_attrs[k] = max(attr_vals) + elif k == 'max_chi2_iterations': + final_attrs[k] = min(attr_vals) + else: + 
warnings.warn(f'Conflicting values for global attribute {k}') # Get the station data from the first dataset and assign each station a unique index lma_data[0]['station_code'] = lma_data[0].number_of_stations lma_data[0]['number_of_stations'] = np.arange(len(lma_data[0].number_of_stations)) From d2b28e37d8f29e34e76f293ee743bd0b25f319a0 Mon Sep 17 00:00:00 2001 From: Sam Gardner Date: Mon, 29 Jul 2024 00:52:38 -0500 Subject: [PATCH 28/28] dat.gz write support! --- pyxlma/lmalib/io/write.py | 245 ++++++++++++++++++++++++++++++++++---- 1 file changed, 219 insertions(+), 26 deletions(-) diff --git a/pyxlma/lmalib/io/write.py b/pyxlma/lmalib/io/write.py index 4ba94dc..4599c0c 100644 --- a/pyxlma/lmalib/io/write.py +++ b/pyxlma/lmalib/io/write.py @@ -1,6 +1,10 @@ +import importlib.metadata import numpy as np import datetime as dt import warnings +import importlib +from functools import reduce +import gzip def dataset(dataset, path): "Write an LMA dataset to a netCDF file" @@ -21,6 +25,177 @@ def dataset(dataset, path): dataset = dataset.chunk('auto') dataset.to_netcdf(path, encoding=encoding) + +def count_decimals_in_array(a): + a = np.array(a).astype(str) + # Find the position of the decimal point in each string + decimal_positions = np.char.find(a, '.') + + # Handle the case where there are no decimal points + decimal_positions[decimal_positions == -1] = np.char.str_len(a[decimal_positions == -1]) + + # Calculate the number of characters after the decimal point + decimal_counts = np.char.str_len(a) - decimal_positions - 1 + + return np.max(decimal_counts) + + +def create_station_info_string(dataset): + """ + Create a string representation of the station information table for an LMA dataset. + + Parameters + ---------- + dataset : xarray.Dataset + The LMA dataset to create the station information table for. + + Returns + ------- + str + The string representation of the station information table. + """ + if 'number_of_stations' not in dataset.dims: + raise ValueError('Dataset does not contain station information.') + header_line = 'Station information: ' + info_to_include = [] + if 'station_code' in dataset.data_vars: + header_line += 'id, ' + info_to_include.append('station_code') + else: + raise ValueError('Stations do not have letter codes. This is incompatible with the XLMA dat format, please assign station codes to all stations.') + + if 'station_name' in dataset.data_vars: + header_line += 'name, ' + info_to_include.append('station_name') + else: + raise ValueError('Stations do not have names. This is incompatible with the XLMA dat format, please assign names to all stations.') + + if 'station_latitude' in dataset.data_vars: + header_line += 'lat(d), ' + info_to_include.append('station_latitude') + else: + raise ValueError('Stations do not have latitudes. This is incompatible with the XLMA dat format, please assign latitudes to all stations.') + + if 'station_longitude' in dataset.data_vars: + header_line += 'lon(d), ' + info_to_include.append('station_longitude') + else: + raise ValueError('Stations do not have longitudes. This is incompatible with the XLMA dat format, please assign longitudes to all stations.') + + if 'station_altitude' in dataset.data_vars: + header_line += 'alt(m), ' + info_to_include.append('station_altitude') + else: + raise ValueError('Stations do not have altitudes. 
This is incompatible with the XLMA dat format, please assign altitudes to all stations.') + + if 'station_delay' in dataset.data_vars: + header_line += 'delay(ns), ' + info_to_include.append('station_delay') + # This is optional + + if 'station_board_revision' in dataset.data_vars: + header_line += 'board_rev, ' + info_to_include.append('station_board_revision') + else: + raise ValueError('Stations do not have board revisions. This is incompatible with the XLMA dat format, please assign board revisions to all stations.') + + if 'station_receive_channels' in dataset.data_vars: + header_line += 'rec_ch, ' + info_to_include.append('station_receive_channels') + # This is optional + header_line = header_line[:-2] + stations_strings = [] + for station_num in range(dataset.sizes['number_of_stations']): + this_station = dataset.isel(number_of_stations=station_num) + station_string = 'Sta_info: ' + for var in info_to_include: + station_string += f'{this_station[var].data.astype(str).item()} ' + stations_strings.append(station_string) + return header_line+'\n'+'\n'.join(stations_strings) + + +def create_station_data_string(dataset): + pass + + +def create_event_data_string(dataset): + if 'analysis_start_time' in dataset.attrs: + start_date = dataset.attrs['analysis_start_time'] + start_date = np.array([start_date]).astype('datetime64[D]') + else: + start_date = dataset.event_time.data.min().astype('datetime64[D]') + columns_line = 'Data: ' + format_line = 'Data format: ' + data_to_combine = [] + + day_sec = (dataset.event_time - start_date).data.astype('timedelta64[ns]').astype(np.float64)/1e9 + day_sec_str = day_sec.astype(str) + day_sec_length = np.max(np.char.str_len(day_sec_str)) + day_sec_str = np.char.ljust(day_sec_str, day_sec_length, fillchar='0') + columns_line += 'time (UT sec of day), ' + format_line += f'{day_sec_length}.9f, ' #time is always in nanoseconds + data_to_combine.append(day_sec_str.T) + + lat = dataset.event_latitude.data.astype(str) + lat_length = np.max(np.char.str_len(lat)) + lat = np.char.ljust(lat, lat_length, fillchar='0') + columns_line += 'lat, ' + lat_dec_count = count_decimals_in_array(lat) + format_line += f'{lat_length}.{lat_dec_count}f, ' + data_to_combine.append(lat.T) + + lon = dataset.event_longitude.data.astype(str) + lon_length = np.max(np.char.str_len(lon)) + lon = np.char.ljust(lon, lon_length, fillchar='0') + columns_line += 'lon, ' + lon_dec_count = count_decimals_in_array(lon) + format_line += f'{lon_length}.{lon_dec_count}f, ' + data_to_combine.append(lon.T) + + alt = dataset.event_altitude.data.astype(str) + alt_length = np.max(np.char.str_len(alt)) + alt = np.char.ljust(alt, alt_length, fillchar='0') + columns_line += 'alt(m), ' + alt_dec_count = count_decimals_in_array(alt) + format_line += f'{alt_length}.{alt_dec_count}f, ' + data_to_combine.append(alt.T) + + chi2 = dataset.event_chi2.data.astype(str) + chi2_length = np.max(np.char.str_len(chi2)) + chi2 = np.char.ljust(chi2, chi2_length, fillchar='0') + columns_line += 'reduced chi^2, ' + chi2_dec_count = count_decimals_in_array(chi2) + format_line += f'{chi2_length}.{chi2_dec_count}f, ' + data_to_combine.append(chi2.T) + + power = dataset.event_power.data.astype(str) + power_length = np.max(np.char.str_len(power)) + power = np.char.ljust(power, power_length, fillchar='0') + columns_line += 'P(dBW), ' + power_dec_count = count_decimals_in_array(power) + format_line += f'{power_length}.{power_dec_count}f, ' + data_to_combine.append(power.T) + + masks = 
np.vectorize(np.base_repr)(dataset.event_mask.data, base=16) + mask_len = np.max(np.char.str_len(masks)) + masks = np.char.rjust(masks, mask_len, fillchar='0') + masks = '0x' + masks + mask_len += 2 + columns_line += 'mask, ' + format_line += f'{mask_len}x, ' + data_to_combine.append(masks.T) + + columns_line = columns_line[:-2] + format_line = format_line[:-2] + N_ev_line = f'Number of events: {dataset.sizes["number_of_events"]}' + data_line = '*** data ***' + data_str = '\n'.join(reduce(lambda x, y:np.char.add(np.char.add(x, ' '), y), data_to_combine)) + return columns_line+'\n'+format_line+'\n'+N_ev_line+'\n'+data_line+'\n'+data_str + + + + + def lma_dat_file(dataset, path, use_gzip=True): """ Write an LMA dataset to a legacy .dat(.gz) file for use in IDL XLMA. @@ -38,37 +213,55 @@ def lma_dat_file(dataset, path, use_gzip=True): if 'network_configurations' in dataset.dims: warnings.warn('Attempting to write dataset with multiple network configurations to a single file. This is highly experimental.\n' 'Caveats:\n-Location names will be appended and separated by semicolon\n- The center of the network is derived from the mean of the locations of the stations') - center_lon = dataset.station_longitude.data.mean() - center_lat = dataset.station_latitude.data.mean() - center_alt = dataset.station_altitude.data.mean() - num_active = np.sum(np.clip(np.sum(dataset.station_active.data, axis=0), 0, 1)) + stations_active = np.sum(dataset.station_active.data, axis=0) + num_active = np.sum(np.clip(stations_active, 0, 1)) + stations_active = ' '.join(np.extract(stations_active, dataset.station_code.data).astype(str).tolist()) else: - center_lon = dataset.network_center_longitude.data.item() - center_lat = dataset.network_center_latitude.data.item() - center_alt = dataset.network_center_altitude.data.item() num_active = np.sum(dataset.station_active.data) - lma_header = (f'Lightning Mapping Array analyzed data\n' - f'Analysis program: {dataset.attrs["event_algorithm_name"]}\n' - f'Analysis program version: {dataset.attrs["event_algorithm_version"]}\n' + stations_active = ' '.join(np.extract(dataset.station_active.data, dataset.station_code.data).astype(str).tolist()) + + center_lon = dataset.network_center_longitude.data.item() + center_lat = dataset.network_center_latitude.data.item() + center_alt = dataset.network_center_altitude.data.item() + + unique_codes, code_counts = np.unique(dataset.station_code.data, return_counts=True) + if np.max(code_counts) > 1: + raise ValueError('Duplicate station codes found in dataset. This is incompatible with the XLMA dat format, please rename conflicting station letters.') + if b'' in unique_codes: + raise ValueError('Empty station codes found in dataset. 
This is incompatible with the XLMA dat format, please assign station codes to all stations.') + + if dataset.attrs["event_algorithm_name"].startswith('pyxlma'): + analysis_program = dataset.attrs["event_algorithm_name"] + else: + analysis_program = 'pyxlma; '+dataset.attrs["event_algorithm_name"] + if dataset.attrs["event_algorithm_version"].startswith('pyxlma'): + analysis_program_version = dataset.attrs["event_algorithm_version"] + else: + analysis_program_version = f'pyxlma-{importlib.metadata.version("pyxlma")}; {dataset.attrs["event_algorithm_version"]}' + + lma_file = (f'pyxlma exported data -- https://github.com/deeplycloudy/xlma-python\n' + f'Analysis program: {analysis_program}\n' + f'Analysis program version: {analysis_program_version}\n' f'File created: {dt.datetime.utcnow().strftime("%a %b %d %H:%M:%S %Y")}\n' - f'Data start time: {dataset.attrs["analysis_start_time"].strftime("%M/%d/%y %H:%M:%S")}\n' - f'Number of seconds analyzed: {dataset.attrs["number_of_seconds_analyzed"]}\n' + f'Data start time: {dataset.attrs["analysis_start_time"].strftime("%m/%d/%y %H:%M:%S")}\n' + f'Number of seconds analyzed: {(dataset.attrs["analysis_end_time"] - dataset.attrs["analysis_start_time"]).total_seconds()}\n' f'Location: {"; ".join(np.unique(dataset.station_network.data).astype(str).tolist())}\n' f'Coordinate center (lat,lon,alt): {center_lat:.7f}, {center_lon:.7f}, {center_alt:.2f}\n' f'Coordinate frame: cartesian\n' - # f'Maximum diameter of LMA (km): ' - # f'Maximum light-time across LMA (ns): ' - f'Number of stations: {dataset.sizes['number_of_stations']}' + f'Number of stations: {dataset.sizes['number_of_stations']}\n' f'Number of active stations: {num_active}\n' - # f'Active stations:' - # f'Minimum number of stations per solution: {}\n' - # f'Maximum reduced chi-squared: {}\n' - # f'Maximum number of chi-squared iterations: {}\n' - ### STATION INFO TABLE + f'Active stations: {stations_active}\n' + f'Minimum number of stations per solution: {dataset.attrs["min_stations"]}\n' + f'Maximum reduced chi-squared: {dataset.attrs["max_chi2"]}\n' + f'Maximum number of chi-squared iterations: {dataset.attrs["max_chi2_iterations"]}\n' + f'{create_station_info_string(dataset)}\n' + f'Station mask order: {dataset.attrs["station_mask_order"]}\n' ### STATION DATA TABLE - f'Metric file version: 4\n' - f'Data: time (UT sec of day), lat, lon, alt(m), reduced chi^2, P(dBW), mask\n' - f'Data format: 15.9f 12.8f 13.8f 9.2f 6.2f 5.1f 7x\n' - f'Number of events: {dataset.sizes["number_of_events"]}\n' - f'*** data ***\n' - ) \ No newline at end of file + f'{create_event_data_string(dataset)}' + ) + if use_gzip: + with gzip.open(path, 'wb') as f: + f.write(lma_file.encode('utf-8')) + else: + with open(path, 'w') as f: + f.write(lma_file) \ No newline at end of file
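A minimal worked sketch of the station-mask arithmetic introduced in PATCH 21 and reused for the hexadecimal mask column by the dat writer in PATCH 28; the toy 4-station network and the contributing-stations array below are hypothetical and not data from these patches:

import numpy as np

# Toy example: 3 events seen by a 4-station network (column order matches station_mask_order).
contributing = np.array([[1, 0, 1, 1],
                         [0, 0, 0, 1],
                         [1, 1, 1, 1]], dtype=np.uint8)

# Same idea as PATCH 21: weight each station column by a power of two
# (leftmost station is the most significant bit) and sum along the station axis.
bin_to_dec = 2 ** np.flip(np.arange(contributing.shape[1]))   # [8, 4, 2, 1]
event_mask = np.sum(bin_to_dec * contributing, axis=1)        # [11, 1, 15]

# PATCH 28 then renders each mask in base 16 with a 0x prefix (0xB, 0x1, 0xF here).
hex_masks = ['0x' + np.base_repr(m, base=16) for m in event_mask]
print(event_mask, hex_masks)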
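And a rough usage sketch of the read/write round trip this patch series builds toward; the file names are hypothetical, while the functions (read.dataset, write.dataset, write.lma_dat_file) are the ones defined in the diffs above:

from pyxlma.lmalib.io import read as lma_read
from pyxlma.lmalib.io import write as lma_write

# Hypothetical LMA source files; plain .dat and gzipped .dat.gz are both handled by the reader.
filenames = ['LYLOUT_231224_005701_0600.dat.gz', 'LYLOUT_231224_010701_0600.dat.gz']

# Returns the combined xarray Dataset and the earliest data start time.
ds, start_time = lma_read.dataset(filenames, sort_time=True)

# Write a compressed netCDF copy ...
lma_write.dataset(ds, 'lma_combined.nc')

# ... or a legacy IDL XLMA-style dat.gz file using the writer added in PATCH 28.
lma_write.lma_dat_file(ds, 'lma_combined.dat.gz', use_gzip=True)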