forked from DOI-USGS/lake-temperature-model-prep
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy path_targets.R
295 lines (255 loc) · 12.6 KB
/
_targets.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
library(targets)
options(tidyverse.quiet = TRUE)
tar_option_set(packages = c(
"tidyverse",
"sf",
"ncdf4",
"ncdfgeom", # You need >= v1.1.2
"RNetCDF",
"scipiper",
"ggplot2",
"scico",
"geoknife", # You need >= v1.6.6
"arrow",
"retry",
"data.table"
))
source('7_drivers_munge/src/GCM_driver_utils.R')
source('7_drivers_munge/src/GCM_driver_nc_utils.R')
targets_list <- list(
##### Construct overall GCM grids #####
# Hard code GCM grid parameters
tar_target(grid_params, tibble(
crs = "+proj=lcc +lat_0=45 + lon_0=-97 +lat_1=36 +lat_2=52 +x_0=0 +y_0=0 +ellps=WGS84 +units=m",
cellsize = 25000,
xmin = -2700000, # Centroid of bottom left cell
ymin = -1750000, # Centroid of bottom left cell
nx = 217,
ny = 141
)),
# Create larger tiles to use for querying GDP with groups of cells.
# Constructing tiles to be made of 400 cells in a 20x20 grid. Attempts
# at 23x23 and bigger resulted in timeouts.
tar_target(grid_tiles_sf, construct_grid_tiles(grid_params, tile_dim=20)),
# Reconstruct GCM grid using grid parameters from GDP defined above
tar_target(grid_cells_sf, reconstruct_gcm_grid(grid_params)),
tar_target(grid_cell_centroids_sf, sf::st_centroid(grid_cells_sf)),
##### Prepare lake centroids for matching to grid #####
# Identify the lakes that we can model with GLM first
tar_target(nml_rds, gd_get('7_config_merge/out/nml_list.rds.ind'), format='file'),
tar_target(nml_site_ids, names(readRDS(nml_rds))),
# Load and prepare lake centroids for matching to the GCM grid using
# the `centroid_lakes_sf.rds` file from our `scipiper` pipeline
tar_target(lake_centroids_sf_rds, gd_get('2_crosswalk_munge/out/centroid_lakes_sf.rds.ind'), format='file'),
tar_target(lake_to_state_xwalk_rds, gd_get('2_crosswalk_munge/out/lake_to_state_xwalk.rds.ind'), format='file'),
tar_target(query_lake_centroids_sf,
readRDS(lake_centroids_sf_rds) %>%
left_join(readRDS(lake_to_state_xwalk_rds), by = "site_id") %>%
# Subset to just lakes with NMLs, so that we aren't wasting
# time getting driver data for cells with lakes that can't
# be modeled using GLM anyways.
filter(site_id %in% nml_site_ids) %>%
# Lake centroids should be in the same CRS as the GCM grid
sf::st_transform(grid_params$crs)
),
##### Create crosswalks between cells, tiles, and lakes #####
# Get mapping of which cells are in which tiles (as a non-spatial data.frame)
tar_target(cell_tile_xwalk_df, get_cell_tile_xwalk(grid_cell_centroids_sf, grid_tiles_sf)),
# Get mapping of which lakes are in which cells and tiles, based on a spatial join
# (as a non-spatial data.frame). Not currently being used here, except for mapping the query.
tar_target(lake_cell_tile_spatial_xwalk_df,
get_lake_cell_tile_spatial_xwalk(query_lake_centroids_sf, grid_cells_sf, cell_tile_xwalk_df)),
##### Prepare grid cell centroids for querying GDP #####
# Don't need to do everything, only need to do cells where we have lakes. Group query
# to GDP by 10x10 tiles so that we have manageable sized geoknife calls.
# Spatially filter cells to only those cells that contain lake centroids and
# share how many lakes are in each for summarization purposes. Make a second target
# that is only the cell number so that adding lakes in the same cell won't kick
# off any rebuilds.
tar_target(query_cells_info_df, get_query_cells(grid_cells_sf, query_lake_centroids_sf)),
tar_target(query_cells, query_cells_info_df$cell_no),
# Get the centroids for the cells that need to be queried (since we need to use cell
# centroids in the GDP query, not cell polygons) and m
tar_target(query_cell_centroids_sf,
grid_cell_centroids_sf %>%
dplyr::filter(cell_no %in% query_cells) %>%
dplyr::left_join(cell_tile_xwalk_df, by = "cell_no")
),
# Reproject query_cell_centroids to WGS84
tar_target(query_cell_centroids_sf_WGS84,
sf::st_transform(query_cell_centroids_sf, crs = 4326)),
# Split query cells by tile_no to map over
tar_target(query_tiles, unique(query_cell_centroids_sf_WGS84$tile_no)),
tar_target(query_cells_centroids_list_by_tile,
dplyr::filter(query_cell_centroids_sf_WGS84, tile_no == query_tiles),
pattern = map(query_tiles),
iteration = "list"),
##### Create an image showing the full query, with n_lakes per cell #####
tar_target(
query_tile_cell_map_png,
map_tiles_cells(
out_file = '7_drivers_munge/out/query_tile_cell_map.png',
lake_cell_tile_xwalk = lake_cell_tile_spatial_xwalk_df,
query_tiles = query_tiles,
query_cells = query_cells,
grid_tiles = grid_tiles_sf,
grid_cells = grid_cells_sf
),
format='file'
),
##### Download data from the GDP #####
# BUILD QUERY
# Define list of GCMs
tar_target(gcm_names, c('ACCESS', 'GFDL', 'CNRM', 'IPSL', 'MRI', 'MIROC5')),
tar_target(gcm_dates_df,
tibble(
projection_period = c('1980_2099'),
start_datetime = c('1980-01-01 00:00:00'),
# Include midnight on the final day of the time period
end_datetime = c('2099-12-31 11:59:59')
)),
# Notaro variable definitions (see http://gdp-netcdfdev.cr.usgs.gov:8080/thredds/dodsC/notaro_debias_mri.html)
tar_target(gcm_query_vars, c(
"prcp_debias", # Precipitation (mm/day)
"tas_debias", # Air temp (deg C)
"rh_debias", # Relative humidity (%)
"rsds_debias", # Surface shortwave radiation (W/m2)
"rsdl_debias", # Surface longwave radiation (W/m2)
"windspeed_debias" # Wind speed (m/s)
)),
# Download data from GDP for each tile, GCM name, and GCM projection period combination.
# If the cells in a tile don't change, then the tile should not need to rebuild.
tar_target(
gcm_data_raw_feather,
download_gcm_data(
out_file_template = "7_drivers_munge/tmp/7_GCM_%s_%s_tile%s_raw.feather",
query_geom = query_cells_centroids_list_by_tile,
gcm_name = gcm_names,
gcm_projection_period = gcm_dates_df$projection_period,
query_vars = gcm_query_vars,
query_dates = c(gcm_dates_df$start_datetime, gcm_dates_df$end_datetime)
),
pattern = cross(query_cells_centroids_list_by_tile, gcm_names, gcm_dates_df),
format = "file",
error = "continue"
),
##### Munge GDP output into NetCDF files that will feed into GLM #####
# Load the wind transform information created by code from J. Read at
# https://github.com/USGS-R/lake-temperature-model-prep/issues/353#issuecomment-1212216800
tar_target(notaro_wind_transform_rds, '7_drivers_munge/in/notaro_wind_transform.rds', format = 'file'),
tar_target(notaro_wind_transform, readRDS(notaro_wind_transform_rds)),
# Munge GCM variables into useable GLM variables and correct units
# For each tile-gcm combo this function returns a list with 2 elements:
# 1) file_out - the name of the munged output file, which contains data
# only for queried cells in that tile that returned data for all variables, and
# 2) cell_info - a tibble with a row for each queried cell in that tile, indicating
# whether or not that cell is missing data for *any* variable for that gcm
# Note: we are munging and saving the data & identifying missing cells in a single
# target (rather than having a munging target and then a target to write the munged
# data and another to identify cells with missing data) to avoid the I/O overhead of
# returning the munged data as a dataframe, as targets would be writing a file
# beneath the surface to store that object target.
tar_target(
glm_ready_gcm_data_list,
munge_gdp_output(gcm_data_raw_feather, gcm_names, query_tiles, notaro_wind_transform),
pattern = map(gcm_data_raw_feather)
),
# Combine the `file_out` elements of the gcm_ready_gcm_data_list branches
# into a single vector of file targets
tar_target(
glm_ready_gcm_data_feather,
glm_ready_gcm_data_list$file_out,
pattern = map(glm_ready_gcm_data_list),
format = "file"
),
# Combine the `cell_info` elements of the gcm_ready_gcm_data_list branches
# into a single tibble of cell information, with a row per cell-gcm combo
tar_target(
glm_ready_gcm_data_cell_info,
bind_rows(glm_ready_gcm_data_list[grepl("cell_info", names(glm_ready_gcm_data_list))])
),
# For all lakes that fall within the bounding box of all queried cells that returned data,
# re-do the mapping of which lakes are in which cells and tiles, using only the query
# cells that returned data and did not have missing data for any variables for any GCMs.
# Here, the pool of cells to which lakes can be matched are those cells that returned data,
# to ensure that lakes that fell within cells that were missing data are re-assigned to
# a cell that returned data. Lakes are preferentially matched to cells within the same
# grid row and within `x_buffer` columns from the cell that the lake falls within.
# If no cells that returned data meet that criteria, the lake is matched to the closest
# cell that did return data, regardless of row. This xwalk is being used in
# `lake-temperature-process-models` to determine what meteorological data to pull for each lake.
tar_target(lake_cell_tile_xwalk_df,
adjust_lake_cell_tile_xwalk(lake_cell_tile_spatial_xwalk_df, query_lake_centroids_sf, grid_cells_sf,
query_cell_centroids_sf, glm_ready_gcm_data_cell_info, x_buffer=3)
),
##### Create an image showing missing cells from the query #####
tar_target(
query_tile_cell_map_missing_png,
map_missing_cells(
out_file = '7_drivers_munge/out/query_tile_cell_map_missing.png',
lake_cell_tile_xwalk = lake_cell_tile_xwalk_df,
cell_info = glm_ready_gcm_data_cell_info,
grid_cells = grid_cells_sf
),
format='file'
),
# Save the revised lake-cell-tile mapping for use in `lake-temperature-process-models`
tar_target(lake_cell_tile_xwalk_csv, {
out_file <- "7_drivers_munge/out/lake_cell_tile_xwalk.csv"
write_csv(lake_cell_tile_xwalk_df, out_file)
return(out_file)
}, format = 'file'),
# Group daily feather files by GCM to map over and include
# file hashes to trigger rebuilds for groups as needed
tar_target(gcm_data_daily_feather_group_by_gcm,
tibble(
gcm_file = glm_ready_gcm_data_feather,
data = tools::md5sum(gcm_file),
gcm_name = str_extract(gcm_file, paste(gcm_names, collapse="|"))
) %>%
group_by(gcm_name) %>%
tar_group(),
iteration = "group"),
# Setup table of GLM variable definitions to use in NetCDF file metadata
tar_target(glm_vars_info,
tibble(
var_name = c("Rain", "Snow", "AirTemp", "RelHum", "Shortwave", "Longwave", "WindSpeed"),
longname = c("Total daily rainfall",
"Total daily snowfall derived from precipitation and air temp",
"Average daily near surface air temperature",
"Average daily percent relative humidity",
"Average daily surface downward shortwave flux in air",
"Average daily surface downward longwave flux in air",
"Average daily windspeed derived from anemometric zonal and anenometric meridional wind components"),
units = c("m/day", "m/day", "degrees Celcius", "percent", "W/m2", "W/m2", "m/s"),
data_precision = "float",
compression_precision = c('.3','.3','.2','.1','.1','.1','.3' )
)),
# Create single NetCDF files for each of the GCMs
tar_target(
gcm_nc, {
# Grouped by GCM name so this should just return one unique value
gcm_name <- unique(gcm_data_daily_feather_group_by_gcm$gcm_name)
generate_gcm_nc(
nc_file = sprintf("7_drivers_munge/out/GCM_%s.nc", gcm_name),
gcm_raw_files = gcm_data_daily_feather_group_by_gcm$gcm_file,
vars_info = glm_vars_info,
grid_params = grid_params,
spatial_info = query_cell_centroids_sf_WGS84,
gcm_name = gcm_name,
compression = FALSE
)
},
pattern = map(gcm_data_daily_feather_group_by_gcm),
format = 'file',
error = 'continue'
),
##### Get list of final output files to link back to scipiper pipeline #####
tar_target(
gcm_files_out,
c(gcm_nc, lake_cell_tile_xwalk_csv, query_tile_cell_map_png, query_tile_cell_map_missing_png),
format = 'file'
)
)
# Return the complete list of targets
c(targets_list)