From 288cfffd29508207ae91313643c711851a00dbd6 Mon Sep 17 00:00:00 2001 From: Danang Date: Tue, 3 Dec 2024 20:09:19 +0000 Subject: [PATCH] read past forecast date for tio shortterm (#281) --- django_project/gap/providers/tio.py | 112 +++++++++---- .../gap/tests/providers/test_tio_zarr.py | 150 +++++++++++++++--- django_project/gap/utils/reader.py | 24 ++- 3 files changed, 238 insertions(+), 48 deletions(-) diff --git a/django_project/gap/providers/tio.py b/django_project/gap/providers/tio.py index 74edc32f..6a75c2ec 100644 --- a/django_project/gap/providers/tio.py +++ b/django_project/gap/providers/tio.py @@ -9,7 +9,7 @@ import logging import os from datetime import datetime, timedelta -from typing import List +from typing import List, Tuple import pytz import requests @@ -132,7 +132,7 @@ def __init__( self, dataset: Dataset, attributes: List[DatasetAttribute], location_input: DatasetReaderInput, start_date: datetime, end_date: datetime, verbose = False, - altitudes: (float, float) = None + altitudes: Tuple[float, float] = None ) -> None: """Initialize Dataset Reader.""" super().__init__( @@ -499,7 +499,7 @@ def get_raw_results(self) -> List[DatasetTimelineValue]: class TioZarrReaderValue(DatasetReaderValue): """Class that convert Tio Zarr Dataset to TimelineValues.""" - date_variable = 'forecast_day' + date_variable = 'date' def __init__( self, val: xrDataset | List[DatasetTimelineValue], @@ -524,22 +524,11 @@ def _post_init(self): if not self._is_xr_dataset: return - # rename attributes and the forecast_day - renamed_dict = { - 'forecast_day_idx': 'forecast_day' - } + renamed_dict = {} for attr in self.attributes: renamed_dict[attr.source] = attr.attribute.variable_name self._val = self._val.rename(renamed_dict) - # replace forecast_day to actualdates - initial_date = pd.Timestamp(self.forecast_date) - forecast_day_timedelta = pd.to_timedelta( - self._val.forecast_day, unit='D') - forecast_day = initial_date + forecast_day_timedelta - self._val = self._val.assign_coords( - forecast_day=('forecast_day', forecast_day)) - def _xr_dataset_to_dict(self) -> dict: """Convert xArray Dataset to dictionary. @@ -582,7 +571,7 @@ def __init__( self, dataset: Dataset, attributes: List[DatasetAttribute], location_input: DatasetReaderInput, start_date: datetime, end_date: datetime, - altitudes: (float, float) = None + altitudes: Tuple[float, float] = None ) -> None: """Initialize TioZarrReader class.""" super().__init__( @@ -609,14 +598,41 @@ def read_forecast_data(self, start_date: datetime, end_date: datetime): if zarr_file is None: return ds = self.open_dataset(zarr_file) + # get latest forecast date self.latest_forecast_date = ds['forecast_date'][-1].values - if np.datetime64(start_date) < self.latest_forecast_date: - return - val = self.read_variables(ds, start_date, end_date) - if val is None: - return - self.xrDatasets.append(val) + + # split date range + ranges = self._split_date_range( + start_date, end_date, + pd.Timestamp(self.latest_forecast_date).to_pydatetime().replace( + tzinfo=pytz.UTC + ) + ) + + if ranges['future']: + val = self.read_variables( + ds, ranges['future'][0], ranges['future'][1] + ) + if val: + dval = val.drop_vars('forecast_date').rename({ + 'forecast_day_idx': 'date' + }) + initial_date = pd.Timestamp(self.latest_forecast_date) + forecast_day_timedelta = pd.to_timedelta(dval.date, unit='D') + forecast_day = initial_date + forecast_day_timedelta + dval = dval.assign_coords(date=('date', forecast_day)) + self.xrDatasets.append(dval) + + if ranges['past']: + val = self.read_variables( + ds, ranges['past'][0], ranges['past'][1] + ) + if val: + val = val.drop_vars(self.date_variable).rename({ + 'forecast_date': 'date' + }) + self.xrDatasets.append(val) def get_data_values(self) -> DatasetReaderValue: """Fetch data values from dataset. @@ -625,8 +641,12 @@ def get_data_values(self) -> DatasetReaderValue: :rtype: DatasetReaderValue """ val = None - if len(self.xrDatasets) > 0: + if len(self.xrDatasets) == 1: val = self.xrDatasets[0] + elif len(self.xrDatasets) == 2: + val = xr.concat(self.xrDatasets, dim='date').sortby('date') + val = val.chunk({'date': 30}) + return TioZarrReaderValue( val, self.location_input, self.attributes, self.latest_forecast_date) @@ -654,6 +674,15 @@ def _read_variables_by_point( :rtype: xrDataset """ point = self.location_input.point + + if start_dt < self.latest_forecast_date: + return dataset[variables].sel( + forecast_date=slice(start_dt, end_dt), + **{self.date_variable: 0} + ).sel( + lat=point.y, + lon=point.x, method='nearest') + min_idx = self._get_forecast_day_idx(start_dt) max_idx = self._get_forecast_day_idx(end_dt) return dataset[variables].sel( @@ -685,6 +714,15 @@ def _read_variables_by_bbox( lat_max = points[1].y lon_min = points[0].x lon_max = points[1].x + + if start_dt < self.latest_forecast_date: + return dataset[variables].sel( + forecast_date=slice(start_dt, end_dt), + lat=slice(lat_min, lat_max), + lon=slice(lon_min, lon_max), + **{self.date_variable: 0} + ) + min_idx = self._get_forecast_day_idx(start_dt) max_idx = self._get_forecast_day_idx(end_dt) # output results is in two dimensional array @@ -712,15 +750,25 @@ def _read_variables_by_polygon( :return: Dataset that has been filtered :rtype: xrDataset """ - min_idx = self._get_forecast_day_idx(start_dt) - max_idx = self._get_forecast_day_idx(end_dt) # Convert the polygon to a format compatible with shapely shapely_multipolygon = shape( json.loads(self.location_input.polygon.geojson)) # Create a mask using regionmask from the shapely polygon mask = regionmask.Regions([shapely_multipolygon]).mask(dataset) + + if start_dt < self.latest_forecast_date: + return dataset[variables].sel( + forecast_date=slice(start_dt, end_dt), + **{self.date_variable: 0} + ).where( + mask == 0, + drop=True + ) + # Mask the dataset + min_idx = self._get_forecast_day_idx(start_dt) + max_idx = self._get_forecast_day_idx(end_dt) return dataset[variables].sel( forecast_date=self.latest_forecast_date, **{self.date_variable: slice(min_idx, max_idx)} @@ -746,8 +794,6 @@ def _read_variables_by_points( :return: Dataset that has been filtered :rtype: xrDataset """ - min_idx = self._get_forecast_day_idx(start_dt) - max_idx = self._get_forecast_day_idx(end_dt) # use the 0 index for it's date variable mask = np.zeros_like(dataset[variables[0]][0][0], dtype=bool) @@ -763,6 +809,18 @@ def _read_variables_by_points( 'lat': dataset['lat'], 'lon': dataset['lon'] }, dims=['lat', 'lon'] ) + + if start_dt < self.latest_forecast_date: + return dataset[variables].sel( + forecast_date=slice(start_dt, end_dt), + **{self.date_variable: 0} + ).where( + mask_da, + drop=True + ) + + min_idx = self._get_forecast_day_idx(start_dt) + max_idx = self._get_forecast_day_idx(end_dt) # Apply the mask to the dataset return dataset[variables].sel( forecast_date=self.latest_forecast_date, diff --git a/django_project/gap/tests/providers/test_tio_zarr.py b/django_project/gap/tests/providers/test_tio_zarr.py index 8f9129f4..c97e07e2 100644 --- a/django_project/gap/tests/providers/test_tio_zarr.py +++ b/django_project/gap/tests/providers/test_tio_zarr.py @@ -6,7 +6,8 @@ """ from django.test import TestCase -from datetime import datetime +from datetime import datetime, timedelta +import pytz import xarray as xr import numpy as np import pandas as pd @@ -66,7 +67,7 @@ def setUp(self): self.mock_location_input = DatasetReaderInput.from_point(point) # Creating filtered xarray dataset - forecast_days = np.array([0, 1, 2]) + forecast_days = pd.date_range(start='2023-01-01', end='2023-01-03') lats = np.array([10, 20]) lons = np.array([30, 40]) temperature_data = np.random.rand( @@ -75,11 +76,11 @@ def setUp(self): self.mock_xr_dataset = xr.Dataset( { "max_temperature": ( - ["forecast_day_idx", "lat", "lon"], temperature_data + ["date", "lat", "lon"], temperature_data ), }, coords={ - "forecast_day_idx": forecast_days, + "date": forecast_days, "lat": lats, "lon": lons, } @@ -87,16 +88,13 @@ def setUp(self): # Mock forecast_date self.forecast_date = np.datetime64('2023-01-01') variables = [ - 'forecast_day_idx', + 'date', 'max_temperature' ] self.mock_xr_dataset = self.mock_xr_dataset[variables].sel( lat=point.y, lon=point.x, method='nearest' - ).where( - (self.mock_xr_dataset['forecast_day_idx'] >= 0) & - (self.mock_xr_dataset['forecast_day_idx'] <= 1), - drop=True) + ) # TioZarrReaderValue initialization with xarray dataset self.tio_reader_value_xr = TioZarrReaderValue( @@ -116,17 +114,20 @@ def test_post_init(self): """Test post initialization method.""" # Check if the renaming happened correctly self.assertIn( - 'forecast_day', self.tio_reader_value_xr.xr_dataset.coords) + 'date', self.tio_reader_value_xr.xr_dataset.coords) self.assertIn( 'max_temperature', self.tio_reader_value_xr.xr_dataset.data_vars) self.assertNotIn( 'forecast_day_idx', self.tio_reader_value_xr.xr_dataset.coords ) + self.assertNotIn( + 'forecast_date', self.tio_reader_value_xr.xr_dataset.coords + ) # Check if forecast_day has been updated to actual dates - forecast_days = pd.date_range('2023-01-01', periods=2) + forecast_days = pd.date_range('2023-01-01', periods=3) xr_forecast_days = pd.to_datetime( - self.tio_reader_value_xr.xr_dataset.forecast_day.values) + self.tio_reader_value_xr.xr_dataset.date.values) pd.testing.assert_index_equal( pd.Index(xr_forecast_days), forecast_days) @@ -217,8 +218,8 @@ def test_read_forecast_data_empty(self, mock_filter): def test_read_forecast_data(self): """Test for reading forecast data.""" - dt1 = datetime(2024, 10, 3) - dt2 = datetime(2024, 10, 5) + dt1 = datetime(2024, 10, 3, tzinfo=pytz.UTC) + dt2 = datetime(2024, 10, 5, tzinfo=pytz.UTC) with patch.object(self.reader, 'open_dataset') as mock_open: mock_open.return_value = mock_open_zarr_dataset() self.reader.read_forecast_data(dt1, dt2) @@ -231,8 +232,8 @@ def test_read_forecast_data(self): def test_read_from_bbox(self): """Test for reading forecast data using bbox.""" - dt1 = datetime(2024, 10, 3) - dt2 = datetime(2024, 10, 5) + dt1 = datetime(2024, 10, 3, tzinfo=pytz.UTC) + dt2 = datetime(2024, 10, 5, tzinfo=pytz.UTC) with patch.object(self.reader, 'open_dataset') as mock_open: mock_open.return_value = mock_open_zarr_dataset() self.reader.location_input = DatasetReaderInput.from_bbox( @@ -254,8 +255,68 @@ def test_read_from_bbox(self): def test_read_from_points(self): """Test for reading forecast data using points.""" - dt1 = datetime(2024, 10, 3) - dt2 = datetime(2024, 10, 5) + dt1 = datetime(2024, 10, 3, tzinfo=pytz.UTC) + dt2 = datetime(2024, 10, 5, tzinfo=pytz.UTC) + with patch.object(self.reader, 'open_dataset') as mock_open: + mock_open.return_value = mock_open_zarr_dataset() + p1 = Point(LON_METADATA['min'], LAT_METADATA['min']) + p2 = Point( + LON_METADATA['min'] + LON_METADATA['inc'], + LAT_METADATA['min'] + LAT_METADATA['inc'] + ) + self.reader.location_input = DatasetReaderInput( + MultiPoint([p1, p2]), + LocationInputType.LIST_OF_POINT + ) + self.reader.read_forecast_data(dt1, dt2) + self.assertEqual(len(self.reader.xrDatasets), 1) + data_value = self.reader.get_data_values() + mock_open.assert_called_once() + self.assertTrue(isinstance(data_value, DatasetReaderValue)) + self.assertTrue(isinstance(data_value._val, xr.Dataset)) + dataset = data_value.xr_dataset + self.assertIn('max_temperature', dataset.data_vars) + + def test_read_past_forecast_data(self): + """Test for reading forecast data.""" + dt1 = datetime(2024, 8, 1, tzinfo=pytz.UTC) + dt2 = datetime(2024, 8, 2, tzinfo=pytz.UTC) + with patch.object(self.reader, 'open_dataset') as mock_open: + mock_open.return_value = mock_open_zarr_dataset() + self.reader.read_forecast_data(dt1, dt2) + self.assertEqual(len(self.reader.xrDatasets), 1) + data_value = self.reader.get_data_values().to_json() + mock_open.assert_called_once() + result_data = data_value['data'] + self.assertEqual(len(result_data), 0) + + def test_read_past_forecast_data_using_bbox(self): + """Test for reading forecast data using bbox.""" + dt1 = datetime(2024, 8, 1, tzinfo=pytz.UTC) + dt2 = datetime(2024, 8, 2, tzinfo=pytz.UTC) + with patch.object(self.reader, 'open_dataset') as mock_open: + mock_open.return_value = mock_open_zarr_dataset() + self.reader.location_input = DatasetReaderInput.from_bbox( + [ + LON_METADATA['min'], + LAT_METADATA['min'], + LON_METADATA['min'] + LON_METADATA['inc'], + LAT_METADATA['min'] + LAT_METADATA['inc'] + ] + ) + self.reader.read_forecast_data(dt1, dt2) + self.assertEqual(len(self.reader.xrDatasets), 1) + data_value = self.reader.get_data_values() + mock_open.assert_called_once() + self.assertTrue(isinstance(data_value, DatasetReaderValue)) + self.assertTrue(isinstance(data_value._val, xr.Dataset)) + dataset = data_value.xr_dataset + self.assertIn('max_temperature', dataset.data_vars) + + def test_read_past_forecast_data_using_points(self): + """Test for reading forecast data using points.""" + dt1 = datetime(2024, 8, 1, tzinfo=pytz.UTC) + dt2 = datetime(2024, 8, 2, tzinfo=pytz.UTC) with patch.object(self.reader, 'open_dataset') as mock_open: mock_open.return_value = mock_open_zarr_dataset() p1 = Point(LON_METADATA['min'], LAT_METADATA['min']) @@ -275,3 +336,56 @@ def test_read_from_points(self): self.assertTrue(isinstance(data_value._val, xr.Dataset)) dataset = data_value.xr_dataset self.assertIn('max_temperature', dataset.data_vars) + + def test_entirely_in_past(self): + """Test split date range entirely in past.""" + start_date = datetime(2024, 11, 20) + end_date = datetime(2024, 11, 25) + now = datetime(2024, 12, 1) + result = self.reader._split_date_range(start_date, end_date, now) + self.assertEqual( + result, {'past': (start_date, end_date), 'future': None} + ) + + def test_entirely_in_future(self): + """Test split date range entirely in future.""" + start_date = datetime(2024, 12, 10) + end_date = datetime(2024, 12, 15) + now = datetime(2024, 12, 1) + result = self.reader._split_date_range(start_date, end_date, now) + self.assertEqual( + result, {'past': None, 'future': (start_date, end_date)} + ) + + def test_split_between_past_and_future(self): + """Test split date range between past and future.""" + start_date = datetime(2024, 11, 20) + end_date = datetime(2024, 12, 10) + now = datetime(2024, 12, 1) + result = self.reader._split_date_range(start_date, end_date, now) + self.assertEqual(result, { + 'past': (start_date, now - timedelta(days=1)), + 'future': (now, end_date) + }) + + def test_now_equals_start_date(self): + """Test split date range now = start_date.""" + start_date = datetime(2024, 12, 1) + end_date = datetime(2024, 12, 10) + now = datetime(2024, 12, 1) + result = self.reader._split_date_range(start_date, end_date, now) + self.assertEqual(result, { + 'past': None, + 'future': (now, end_date) + }) + + def test_now_equals_end_date(self): + """Test split date range now = end_date.""" + start_date = datetime(2024, 11, 20) + end_date = datetime(2024, 12, 1) + now = datetime(2024, 12, 1) + result = self.reader._split_date_range(start_date, end_date, now) + self.assertEqual(result, { + 'past': (start_date, now - timedelta(days=1)), + 'future': (now, now) + }) diff --git a/django_project/gap/utils/reader.py b/django_project/gap/utils/reader.py index f612b85d..0dde5d1a 100644 --- a/django_project/gap/utils/reader.py +++ b/django_project/gap/utils/reader.py @@ -7,8 +7,8 @@ import json import tempfile -from datetime import datetime -from typing import Union, List +from datetime import datetime, timedelta +from typing import Union, List, Tuple import numpy as np import pytz @@ -487,7 +487,7 @@ def __init__( location_input: DatasetReaderInput, start_date: datetime, end_date: datetime, output_type=DatasetReaderOutputType.JSON, - altitudes: (float, float) = None + altitudes: Tuple[float, float] = None ) -> None: """Initialize BaseDatasetReader class. @@ -578,3 +578,21 @@ def read_forecast_data(self, start_date: datetime, end_date: datetime): :type end_date: datetime """ pass + + def _split_date_range( + self, start_date: datetime, end_date: datetime, + now: datetime + ) -> dict: + """Split a date range into past and future ranges.""" + if end_date < now: + # Entire range is in the past + return {'past': (start_date, end_date), 'future': None} + elif start_date >= now: + # Entire range is in the future + return {'past': None, 'future': (start_date, end_date)} + else: + # Split into past and future + return { + 'past': (start_date, now - timedelta(days=1)), + 'future': (now, end_date) + }