Skip to content

Commit

Permalink
Merge branch 'develop' into release/2.10.0
Browse files Browse the repository at this point in the history
# Conflicts:
#	pyproject.toml
  • Loading branch information
James Wood authored and James Wood committed May 2, 2024
2 parents 53adc2e + b9742d3 commit e497a68
Show file tree
Hide file tree
Showing 6 changed files with 52 additions and 7 deletions.
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0

## [Unreleased]
### Added
- [issue/260](https://github.com/podaac/l2ss-py/pull/261): Add GPM cleanup function to add a timeMidScan variable if one isn't already present. The function takes the Year, Month, DayOfMonth, etc. ScanTime variables and creates a single time variable using the datetime.datetime library.
### Changed
### Deprecated
### Removed
Expand Down
42 changes: 41 additions & 1 deletion podaac/subsetter/gpm_cleanup.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,10 +3,34 @@
to nscan, nbin, nfreq by using the DimensionNames variable attribute
"""

import datetime
from netCDF4 import date2num # pylint: disable=no-name-in-module

dim_dict = {}


def change_var_dims(nc_dataset, variables=None):
def compute_new_time_data(time_group, nc_dataset):
    """
    Create the data for a time variable, timeMidScan, that is present in
    other GPM collections but not the ENV collections.

    Parameters
    ----------
    time_group : str
        Flattened group prefix (components joined with '__') of the
        ScanTime group whose Year/Month/DayOfMonth/Hour/Minute/Second/
        MilliSecond variables are combined into one time value per scan.
    nc_dataset : netCDF4.Dataset
        Dataset containing the flattened ScanTime component variables.

    Returns
    -------
    tuple
        (list of float seconds since the GPM epoch, the time-unit string).
    """
    # set the time unit for GPM (the GPS epoch, 1980-01-06)
    time_unit_out = "seconds since 1980-01-06 00:00:00"

    # Read each component array from the dataset exactly once; the original
    # re-read every whole variable ([:]) on each loop iteration.
    years = nc_dataset[time_group + '__Year'][:]
    months = nc_dataset[time_group + '__Month'][:]
    days = nc_dataset[time_group + '__DayOfMonth'][:]
    hours = nc_dataset[time_group + '__Hour'][:]
    minutes = nc_dataset[time_group + '__Minute'][:]
    seconds = nc_dataset[time_group + '__Second'][:]
    millisecs = nc_dataset[time_group + '__MilliSecond'][:]

    # convert to a float, seconds variable (milliseconds -> microseconds
    # for the datetime constructor)
    new_time_list = [
        date2num(
            datetime.datetime(year, month, day,
                              hour=hour, minute=minute, second=second,
                              microsecond=millisec * 1000),
            time_unit_out)
        for year, month, day, hour, minute, second, millisec
        in zip(years, months, days, hours, minutes, seconds, millisecs)
    ]

    return new_time_list, time_unit_out


def change_var_dims(nc_dataset, variables=None, time_name="_timeMidScan"):
"""
Go through each variable and get the dimension names from attribute "DimensionNames
If the name is unique, add it as a dimension to the netCDF4 dataset. Then change the
Expand Down Expand Up @@ -62,4 +86,20 @@ def change_var_dims(nc_dataset, variables=None):
# copy the data to the new variable with dimension names
new_mapped_var[var_name][:] = var[:]

if not any(time_name in var for var in var_list):
# if there isn't any timeMidScan variables, create one
scan_time_groups = ["__".join(i.split('__')[:-1]) for i in var_list if 'ScanTime' in i]
for time_group in list(set(scan_time_groups)):
# get the seconds since Jan 6, 1980
time_data, time_unit = compute_new_time_data(time_group, nc_dataset)
# make a new variable for each ScanTime group
new_time_var_name = time_group+time_name
# copy dimensions from the Year variable
var_dims = nc_dataset.variables[time_group+'__Year'].dimensions
comp_args = {"zlib": True, "complevel": 1}
nc_dataset.createVariable(new_time_var_name, 'f8', var_dims, **comp_args)
nc_dataset.variables[new_time_var_name].setncattr('unit', time_unit)
# copy the data in
nc_dataset.variables[new_time_var_name][:] = time_data

return nc_dataset
3 changes: 3 additions & 0 deletions podaac/subsetter/subset.py
Original file line number Diff line number Diff line change
Expand Up @@ -1226,6 +1226,9 @@ def subset(file_to_subset: str, bbox: np.ndarray, output_file: str,
except AttributeError:
pass

if hdf_type == 'GPM':
args['decode_times'] = False

with xr.open_dataset(
xr.backends.NetCDF4DataStore(nc_dataset),
**args
Expand Down
Binary file removed tests/data/GPM/GPM_test_file.HDF5
Binary file not shown.
Binary file added tests/data/GPM/GPM_test_file_2.HDF5
Binary file not shown.
13 changes: 7 additions & 6 deletions tests/test_subset.py
Original file line number Diff line number Diff line change
Expand Up @@ -2288,21 +2288,22 @@ def test_get_unique_groups():
assert expected_groups_single == unique_groups_single
assert expected_diff_counts_single == diff_counts_single

def test_gpm_dimension_map(data_dir, subset_output_dir, request):
"""Test GPM files for dimension mapping and returns the expected netCDF
dataset without the phony dimensions"""

def test_gpm_compute_new_var_data(data_dir, subset_output_dir, request):
"""Test GPM files that have scantime variable to compute the time for seconds
since 1980-01-06"""

gpm_dir = join(data_dir, 'GPM')
gpm_file = 'GPM_test_file.HDF5'
bbox = np.array(((-180, 180), (-90, 90)))
gpm_file = 'GPM_test_file_2.HDF5'
shutil.copyfile(
os.path.join(gpm_dir, gpm_file),
os.path.join(subset_output_dir, gpm_file)
)

nc_dataset, has_groups, file_extension = subset.open_as_nc_dataset(join(subset_output_dir, gpm_file))

nc_dataset = gc.change_var_dims(nc_dataset)
nc_dataset_new = gc.change_var_dims(nc_dataset, variables=None, time_name='__test_time')
assert int(nc_dataset_new.variables["__FS__ScanTime__test_time"][:][0]) == 1306403820

for var_name, var in nc_dataset.variables.items():
dims = list(var.dimensions)
Expand Down

0 comments on commit e497a68

Please sign in to comment.