forked from JoakimWeill/ca_owrs
-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathindex.Rmd
393 lines (346 loc) · 15.5 KB
/
index.Rmd
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
---
title: "Water Utility Rates in California"
#author: "Team BA"
output:
html_document:
self_contained: no
keep_md: yes
theme: cosmo
toc: TRUE
toc_float: TRUE
toc_depth: 2
code_folding: hide
highlight: "pygments"
---
Welcome to a clearing house of water data. This page explores water utility prices, consumption, sources, and the number of water quality violations each utility has experienced.
To the left is a table of contents for the following page, but most importantly, immediately below is a map of water prices throughout California for the utilities that have greater than 3,000 customers. And for the data intrepid, at the bottom is the underlying data consisting of the annual water production and deliveries reports, and rates.
```{r global, include=FALSE}
# Global
viridis_color <- "C"
# File paths
file_parent <- ""
file_paths <- list(
code = paste0(file_parent, "Code/"),
data = paste0(file_parent, "Data/"),
data_owrs = paste0(file_parent, "Data/OWRS/Open-Water-Rate-Specification-master/full_utility_rates/California/")
); rm(file_parent)
###############
# Open Packages
###############
# General packages needed
library(stringr)
library(lubridate)
library(viridis)
# US Census Data
library(tidycensus)
library(maps)
library(sf)
library(leaflet)
library(ggridges)
# Utilities
library(ggfortify)
library(haven)
# Aesthetics
library(hrbrthemes)
library(gcookbook)
library(DT)
# Scraping
library(rvest)
# Analysis
library(glmnet)
library(lfe)
# Laod last to avoid masking functionality
library(tidyverse)
###############
# Open Data
###############
owrs_summary_full <- read_csv(paste0(file_paths$data, "OWRS/summary_table_cleaned.csv")) %>%
mutate(tier_number = as.character(tier_number))
ear_deliv <- read_csv(paste0(file_paths$data, "EAR/EAR 2013-2016 DELIVERIES FINAL 06-22-2018.csv"))
ear_prod <- read_csv(paste0(file_paths$data, "EAR/EAR 2013-2016 PRODUCTION FINAL 06-22-2018_1.csv"),
col_types = cols(.default = "c")) %>%
bind_rows(read_csv(paste0(file_paths$data, "EAR/EAR 2013-2016 PRODUCTION FINAL 06-22-2018_2.csv"),
col_types = cols(.default = "c"))) %>%
type_convert()
## Geospatial Data
affected <- st_read(paste0(file_paths$data, "affected"), "affected")
util_sf <- st_read(paste0(file_paths$data, "Water_Districts"), "Water_Districts")
###############
# Clean Data
###############
# Clean the OWRS data
owrs_summary_full %<>%
arrange(pwsid) %>%
filter(!is.na(Tier_price)) %>%
mutate(Tier_volume = if_else(bill_type == "Tiered" & is.na(Tier_volume), Inf, Tier_volume))
# Simplify the OWRS data for the shapefile
owrs_summary <- owrs_summary_full %>%
group_by(pwsid) %>%
mutate(tier_count = n()) %>%
ungroup() %>%
distinct(utility_name, pwsid, effective_date, bill_frequency, bill_type,
service_charge, commodity_charge, bill, tier_count)
# Clean EAR data
## Deliveries - i.e. use cases
ear_deliv <- ear_deliv %>%
select(PWSID, Water.System.Classification, Year, Month, Date,
`WATER DELIVERIES TO Single.family.Residential`, `WATER DELIVERIES TO Multi.family.Residential`,
`WATER DELIVERIES Total.Delivered Residential IN REVISED UNITS (Total Does not include Landscape Irrigation, Agricultural or to other PWS)`,
`WATER DELIVERIES TO Commercial.Institutional`:`WATER DELIVERIES TO Industrial`,
`WATER DELIVERIES TO Agricultural`, `WATER DELIVERIES TO Landscape.Irrigation`,
`CALCULATED GPCD (Total delivery to residential in gallons per capita day)`) %>%
rename(classification = Water.System.Classification,
deliv_res = `WATER DELIVERIES TO Single.family.Residential`,
deliv_res_multi = `WATER DELIVERIES TO Multi.family.Residential`,
deliv_com_inst = `WATER DELIVERIES TO Commercial.Institutional`,
deliv_ind = `WATER DELIVERIES TO Industrial`,
deliv_irig = `WATER DELIVERIES TO Landscape.Irrigation`,
deliv_ag = `WATER DELIVERIES TO Agricultural`,
deliv_irig = `WATER DELIVERIES TO Landscape.Irrigation`,
deliv_res_no_irig = `WATER DELIVERIES Total.Delivered Residential IN REVISED UNITS (Total Does not include Landscape Irrigation, Agricultural or to other PWS)`,
deliv_gpcd = `CALCULATED GPCD (Total delivery to residential in gallons per capita day)`) %>%
mutate(deliv_res = select(., deliv_res, deliv_res_multi) %>% apply(1, sum, na.rm = T),
Date = mdy(Date)) %>%
select(-deliv_res_multi) %>%
replace_na(list(deliv_res = 0, deliv_res_no_irig = 0, deliv_com_inst = 0, deliv_ind = 0,
deliv_ag = 0, deliv_irig = 0, deliv_gpcd = 0)) %>%
type_convert() %>%
mutate(deliv_total = select(., deliv_res, deliv_com_inst, deliv_ag) %>% apply(1, sum, na.rm = T)) %>%
mutate_at(vars(deliv_res:deliv_irig), .funs = list(perc = ~ . / deliv_total))
# Production - i.e. sources
ear_prod <- ear_prod %>%
select(PWSID, Year, Month, Date,
`WATER PRODUCED FROM GROUNDWATER`:`WATER SOLD TO ANOTHER PUBLIC WATER SYSTEM`,
`CALCULATED GPCD (Total Potable Produced in gallons per capita day)`) %>%
rename(prod_ground = `WATER PRODUCED FROM GROUNDWATER`,
prod_surface =`WATER PRODUCED FROM SURFACE WATER`,
prod_purch = `FINSIHIED WATER PURCHASED OR RECEIVED FROM ANOTHER PUBLIC WATER SYSTEM`,
prod_sold = `WATER SOLD TO ANOTHER PUBLIC WATER SYSTEM`,
prod_gpcd = `CALCULATED GPCD (Total Potable Produced in gallons per capita day)`) %>%
mutate(Date = mdy(Date)) %>%
mutate_at(vars(prod_ground:prod_gpcd), str_replace, "-", as.character(NA)) %>%
replace_na(list(prod_ground = 0, prod_surface = 0, prod_purch = 0,
prod_sold = 0, prod_gpcd = 0)) %>%
mutate_at(vars(prod_ground:prod_gpcd), str_remove_all, ",") %>%
mutate_at(vars(prod_ground:prod_gpcd), str_remove_all, "\\.00") %>%
mutate_at(vars(prod_ground:prod_gpcd), as.numeric) %>%
mutate(prod_sold = (-prod_sold),
prod_total = select(., prod_ground, prod_surface, prod_purch, prod_sold) %>% apply(1, sum, na.rm = T)) %>%
mutate_at(vars(prod_ground:prod_sold), .funs = list(perc = ~ . / prod_total))
###############
# Match/Join Data
###############
## Join EAR
ear <- ear_deliv %>%
full_join(ear_prod)
# Match Data
wat_ut_xwalk <- util_sf %>%
as_tibble() %>%
select(-geometry) %>%
# Clean names to join
mutate(AGENCYNAME = str_remove_all(AGENCYNAME, "District|Water|Irrigation"),
AGENCYNAME = str_trim(AGENCYNAME, "both")) %>%
# Join
fuzzyjoin::stringdist_left_join(
owrs_summary %>%
# Clean names to joins
mutate(utility_name = str_remove_all(utility_name, "District|Water|Irrigation|City Of"),
utility_name = str_trim(utility_name, "both")) %>%
rename(AGENCYNAME = utility_name)
) %>%
filter(!is.na(pwsid))
# Join the utility data for the visualization
util_sf <- util_sf %>%
mutate(AGENCYNAME = str_to_title(AGENCYNAME)) %>%
# Add water utility data
left_join(wat_ut_xwalk) %>%
filter(!is.na(pwsid)) %>%
# mutate(OBJECTID = as_factor(OBJECTID)) %>%
left_join(
# Add violation counts
affected %>%
select(OBJECTID, Freq) %>%
as_tibble() %>%
select(-geometry) %>%
mutate(OBJECTID = as.numeric(OBJECTID))
) %>%
mutate_at(vars(commodity_charge, service_charge, bill), round, 2)
# Make a total summary
ear_summary <- ear %>%
mutate(quarter = quarter(Date)) %>%
filter(quarter %in% 1:4) %>%
group_by(PWSID, quarter) %>%
summarise_at(vars(deliv_res:prod_sold_perc), mean, na.rm = T) %>%
ungroup() %>%
gather(variable, value, -PWSID, -quarter) %>%
unite(temp, variable, quarter, sep = "_Q") %>%
spread(temp, value) %>%
left_join(
ear %>%
group_by(PWSID) %>%
summarise_at(vars(deliv_res:prod_sold_perc), mean, na.rm = T) %>%
rename_at(vars(deliv_res:prod_sold_perc), funs(paste0(., "_annual")))
) %>%
rename(pwsid = PWSID) %>%
inner_join(wat_ut_xwalk) %>%
left_join(
# Add violation counts
affected %>%
select(OBJECTID, Freq) %>%
as_tibble() %>%
select(-geometry) %>%
mutate(OBJECTID = as.numeric(OBJECTID))
)
# Add source and sales data to master data
util_sf <- util_sf %>%
left_join(
ear_summary %>%
select(pwsid, prod_ground_perc_annual, prod_surface_perc_annual, prod_purch_perc_annual,
prod_gpcd_annual) %>%
mutate_at(vars(prod_ground_perc_annual, prod_surface_perc_annual, prod_purch_perc_annual), ~ (. * 100) %>% round(0)) %>%
mutate(prod_gpcd_annual = prod_gpcd_annual %>% round(0))
)
```
## California's Municipal Water Mapped
*Note*: The map is sparse because it only covers communities with over 3,000 customers.
```{r shiny_app, out.width="100%", echo = FALSE, warning = FALSE, message = FALSE, width = 400, height = 800}
# Creating labels
wat_ut_labels <- sprintf("<strong>%s</strong>
<br/>Bill Type: %s
<br/>Bill Frequency: %s
<br/>Number of Tiers: %s
<br/>Service Charge: %s
<br/>Consumption Charge: %s
<br/>Average Bill: %s
<br/>Ave. Consumption (Gal./Day Percapita): %s
<br/>Water Quality Violations: %s
<br/><strong>Source (Percent): %s</strong>
<br/>Groundwater: %s
<br/>Surface Water: %s
<br/>Purchased: %s",
util_sf$AGENCYNAME, util_sf$bill_type, util_sf$bill_frequency,
util_sf$tier_count, util_sf$service_charge, util_sf$commodity_charge,
util_sf$bill, util_sf$prod_gpcd_annual, util_sf$Freq, "",
util_sf$prod_ground_perc_annual, util_sf$prod_surface_perc_annual,
util_sf$prod_purch_perc_annual) %>%
lapply(htmltools::HTML)
# Put the service charge in bins
## Service Charges
bins_serv <- seq(min(util_sf$service_charge), max(util_sf$service_charge), length.out = 10)
pal_serv <- colorBin("plasma", util_sf$service_charge, bins = bins_serv)
## Commondity
bins_com <- seq(min(util_sf$commodity_charge), max(util_sf$commodity_charge), length.out = 10)
pal_com <- colorBin("plasma", util_sf$commodity_charge, bins = bins_com)
## Total Bill
bins_bil <- seq(min(util_sf$bill), max(util_sf$bill), length.out = 10)
pal_bil <- colorBin("plasma", util_sf$bill, bins = bins_bil)
pal <- colorNumeric("plasma", domain = util_sf$bill)
# Plot
leaflet(util_sf, width = "100%") %>%
addProviderTiles("CartoDB.Positron") %>%
setView(lng = -122, lat = 37.9, zoom = 8) %>%
addPolygons(color = ~pal_bil(bill),
fillOpacity = 0.5,
weight = 1,
label = wat_ut_labels,
labelOptions = labelOptions(
style = list("font-weight" = "normal", padding = "3px 8px"),
textsize = "15px",
direction = "auto"),
group = "Average Bill") %>%
addPolygons(color = ~pal_serv(service_charge),
fillOpacity = 0.5,
weight = 1,
label = wat_ut_labels,
labelOptions = labelOptions(
style = list("font-weight" = "normal", padding = "3px 8px"),
textsize = "15px",
direction = "auto"),
group = "Service Charge") %>%
addPolygons(color = ~pal_com(commodity_charge),
fillOpacity = 0.5,
weight = 1,
label = wat_ut_labels,
labelOptions = labelOptions(
style = list("font-weight" = "normal", padding = "3px 8px"),
textsize = "15px",
direction = "auto"),
group = "Water Price") %>%
# addLegend(colors = ~pal_bil(bill),
# title = "Average Bill") %>%
addLayersControl(
overlayGroups = c("Average Bill", "Service Charge", "Water Price"),
options = layersControlOptions(collapsed = FALSE),
position = "bottomright"
) %>%
hideGroup(c("Service Charge", "Water Price")) %>%
leaflet.extras::addSearchOSM(options = list(position = "topright"))
```
## Water Prices
*Note*: Water price data only covers communities with over 3,000 customers. The average consumer is used to calculate the average service charge (fixed charge paid regardless of your consumption) and consumptive charge (price paid for consumption).
```{r sum_stats, echo = FALSE, warning = FALSE, message = FALSE, fig.height=4.5, fig.width=8, fig.align="center"}
# Distribution of Prices
owrs_summary %>%
select(bill, commodity_charge, service_charge) %>%
gather() %>%
mutate(key = case_when(
key == "bill" ~ "Average Bill",
key == "commodity_charge" ~ "Water Price",
key == "service_charge" ~ "Service Charge",
TRUE ~ key
),
key = as_factor(key)) %>%
ggplot(aes(value, group = key, color= key)) + geom_line(stat = "density") +
scale_color_viridis_d(option = viridis_color, end = 0.8) +
theme_ipsum() +
labs(color = "Bill Type", x = "Price (USD)", y = "Density") +
theme(legend.position = "bottom")
# Summary Statistics
owrs_summary_full %>%
filter(is.finite(Tier_volume)) %>%
# mutate(Tier_volume = if_else(is.infinite(Tier_volume), 100, Tier_volume)) %>%
ggplot(aes(x = Tier_volume, y = tier_number, fill = tier_number)) +
geom_density_ridges(scale = 10, size = 0.25, rel_min_height = 0.03) +
theme_ridges() +
scale_x_continuous(limits=c(0, 150), expand = c(0.01, 0)) +
#scale_y_discrete(expand = c(0.01, 0)) +
scale_fill_brewer(palette = "YlOrRd")
#scale_y_reverse(breaks=c(2000, 1980, 1960, 1940, 1920, 1900), expand = c(0.01, 0))
ggplot(owrs_summary_full, aes(x = Tier_price, y = tier_number, fill = tier_number)) +
geom_density_ridges(scale = 10, size = 0.25, rel_min_height = 0.03) +
theme_ridges() +
scale_x_continuous(limits=c(0, 20), expand = c(0.01, 0)) +
scale_fill_brewer(palette = "YlOrRd")
#scale_y_reverse(breaks=c(2000, 1980, 1960, 1940, 1920, 1900), expand = c(0.01, 0))
owrs_summary_full %>%
ggplot(aes(bill, approximate_median_income)) +
geom_point(size = 0.6) +
theme_ipsum() +
stat_smooth(method = "lm") +
labs(x = "Average Bill (USD)", y = "Median Income (USD)") +
xlim(c(0, 250))
```
## Extensions and Next Steps
* Defining affordability for water services:
+ A function of household income, number of people in HH, bill, regional average consumption ,and household consumption. Ultimately this is the percent of the income that a household spends on water conditional on regional consumption.
* Underlying details for each utility's produciton and deliveries data.
* Use the rate parser to build a measure of the average bill size.
### Download Underlying Data
```{r data_download, echo=FALSE, warning = FALSE, message = FALSE}
# Names for the data set made public
util_sf_colnames <- c("Agency", "PWSID", "Date Updated", "Bill Frequency", "Bill Type",
"Average Fixed Charges", "Average Variable Change", "Average Total Bill",
"Number of Tiers at Agency", "Water Quality Violation Count",
"Source: Groundwater (%)", "Source: Surface Water (%)", "Source: Purchased (%)",
"Average Consumption Percapita per Day (Gallons)")
# Post the data
util_sf %>%
as_tibble() %>%
select(-geometry, -OBJECTID, -AGENCYUNIQ, -MODIFIEDBY, -SOURCECOMM, -Date_Data_,
-GlobalID, -AGENCYNAME.x, -AGENCYNAME.y, -SOURCE, -LASTMODIFI) %>%
datatable(.,
extensions = 'Buttons',
colnames = util_sf_colnames,
options = list(dom = 'Bfrtip',
buttons = c('excel', "csv")))
```