This package curates (downloads, cleans, consolidates, smooths) data from Johns Hopkins for analysing the international outbreak of COVID-19.
It includes several visualizations of the COVID-19 international outbreak.
Yanchang Zhao, COVID-19 Data Analysis with Tidyverse and Ggplot2 - China., 2020.
- COVID19DataProcessor generates curated series
- visualizations by Yanchang Zhao are included in ReportGenerator R6 object
- More visualizations are included in the ReportGeneratorEnhanced R6 object
- Visualizations in ReportGeneratorDataComparison compare all countries, taking epidemic day 0 as the first day on which confirmed cases > n (e.g. n = 100).
Data is still noisy because some regions have missing data on some days. We are working on it.
Release | Usage | Development |
Install the R package using the following commands on the R console:
# install.packages("devtools")
devtools::install_github("rOpenStats/COVID19analytics", build_opts = NULL)
#> Warning: replacing previous import 'ggplot2::Layout' by 'lgr::Layout' when
#> loading 'COVID19analytics'
#> Warning: replacing previous import 'readr::col_factor' by 'scales::col_factor'
#> when loading 'COVID19analytics'
#> Warning: replacing previous import 'magrittr::not' by 'testthat::not' when
#> loading 'COVID19analytics'
#> Warning: replacing previous import 'dplyr::matches' by 'testthat::matches' when
#> loading 'COVID19analytics'
#> Warning: replacing previous import 'readr::edition_get' by
#> 'testthat::edition_get' when loading 'COVID19analytics'
#> Warning: replacing previous import 'magrittr::equals' by 'testthat::equals' when
#> loading 'COVID19analytics'
#> Warning: replacing previous import 'magrittr::is_less_than' by
#> 'testthat::is_less_than' when loading 'COVID19analytics'
#> Warning: replacing previous import 'readr::local_edition' by
#> 'testthat::local_edition' when loading 'COVID19analytics'
#> Warning: replacing previous import 'testthat::matches' by 'tidyr::matches' when
#> loading 'COVID19analytics'
#> Warning: replacing previous import 'magrittr::extract' by 'tidyr::extract' when
#> loading 'COVID19analytics'
#> Attaching package: 'dplyr'
#> The following objects are masked from 'package:stats':
#> filter, lag
#> The following objects are masked from 'package:base':
#> intersect, setdiff, setequal, union
data.processor <- COVID19DataProcessor$new(provider = "JohnsHopkingsUniversity", missing.values = "imputation")
# dummy <- data.processor$preprocess() is equivalent to setupData() + transform(): the preprocessing made by the data provider
dummy <- data.processor$setupData()
#> INFO [07:51:31.629] {stage: `processor-setup`}
#> INFO [07:51:31.865] Checking required downloaded { `2023-01-09`, daily.update.time: `21:00:00`, current.datetime: `2023-01-11 07:51:31`, download.flag: `TRUE`}
#> INFO [07:51:34.888] Checking required downloaded { `2023-01-09`, daily.update.time: `21:00:00`, current.datetime: `2023-01-11 07:51:34`, download.flag: `TRUE`}
#> INFO [07:51:36.562] Checking required downloaded { `2023-01-09`, daily.update.time: `21:00:00`, current.datetime: `2023-01-11 07:51:36`, download.flag: `TRUE`}
#> INFO [07:51:38.103] {stage: `data loaded`}
#> INFO [07:51:38.105] {stage: `data-setup`}
dummy <- data.processor$transform()
#> INFO [07:51:38.108] Executing transform
#> INFO [07:51:38.109] Executing consolidate
#> INFO [07:52:13.160] {stage: `consolidated`}
#> INFO [07:52:13.161] Executing standarize
#> INFO [07:52:17.230] gathering DataModel
#> INFO [07:52:17.231] {stage: `datamodel-setup`}
# Curate is the processing applied by the missing-values method
dummy <- data.processor$curate()
#> INFO [07:52:17.241] {stage: `loading-aggregated-data-model`}
#> Warning in countrycode_convert(sourcevar = sourcevar, origin = origin, destination = dest, : Some values were not matched unambiguously: Antarctica
#> Warning in countrycode_convert(sourcevar = sourcevar, origin = origin, destination = dest, : Some values were not matched unambiguously: Micronesia
#> Warning in countrycode_convert(sourcevar = sourcevar, origin = origin, destination = dest, : Some values were not matched unambiguously: MS Zaandam
#> Warning in countrycode_convert(sourcevar = sourcevar, origin = origin, destination = dest, : Some values were not matched unambiguously: Summer Olympics 2020
#> Warning in countrycode_convert(sourcevar = sourcevar, origin = origin, destination = dest, : Some values were not matched unambiguously: Winter Olympics 2022
#> INFO [07:52:21.891] {stage: `calculating-rates`}
#> INFO [07:52:22.236] {stage: `making-data-comparison`}
#> INFO [07:52:38.136] {stage: `applying-missing-values-method`}
#> INFO [07:52:38.138] {stage: `Starting first imputation`}
#> INFO [07:52:38.192] {stage: `calculating-rates`}
#> INFO [07:52:38.567] {stage: `making-data-comparison-2`}
#> INFO [07:52:54.635] {stage: `calculating-top-countries`}
#> INFO [07:52:54.672] {stage: `curated`}
current.date <- max(data.processor$getData()$date)
rg <- ReportGeneratorEnhanced$new(data.processor)
rc <- ReportGeneratorDataComparison$new(data.processor = data.processor)
top.countries <- data.processor$top.countries
international.countries <- unique(c(data.processor$top.countries,
"China", "Japan", "Singapore", "Korea, South"))
africa.countries <- sort(data.processor$countries$getCountries(division = "continent", name = "Africa"))
# Top 10 daily cases confirmed increment
(data.processor$getData() %>%
  filter(date == max(data.processor$getData()$date)) %>%
  select(country, date, rate.inc.daily, confirmed.inc, confirmed, deaths, deaths.inc) %>%
  arrange(desc(confirmed.inc)) %>%
  filter(confirmed >= 10))[1:10, ]
#> # A tibble: 10 × 7
#> # Groups: country [10]
#> country date confirmed deaths death…¹
#> <chr> <date> <dbl> <int> <int> <int> <int>
#> 1 Japan 2023-01-10 0.0026 78982 30670001 6.04e4 253
#> 2 Brazil 2023-01-10 0.0021 75218 36552432 6.95e5 206
#> 3 US 2023-01-10 0.0006 59695 101345042 1.10e6 864
#> 4 Korea, South 2023-01-10 0.0018 54343 29654090 3.27e4 76
#> 5 Thailand 2023-01-10 0.0062 29164 4752782 3.47e4 1018
#> 6 Taiwan* 2023-01-10 0.0028 25049 9097554 1.56e4 26
#> 7 Germany 2023-01-10 0.0006 22119 37562191 1.63e5 269
#> 8 France 2023-01-10 0.0003 11773 39625664 1.64e5 115
#> 9 China 2023-01-10 0.002 9379 4804906 1.78e4 75
#> 10 Austria 2023-01-10 0.0005 3019 5731160 2.15e4 27
#> # … with abbreviated variable name ¹
# Top 10 daily deaths increment
(data.processor$getData() %>%
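  filter(date == max(data.processor$getData()$date)) %>%
  select(country, date, rate.inc.daily, confirmed.inc, confirmed, deaths, deaths.inc) %>%
  arrange(desc(deaths.inc)))[1:10, ]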
#> # A tibble: 10 × 7
#> # Groups: country [10]
#> country date confirmed deaths death…¹
#> <chr> <date> <dbl> <int> <int> <int> <int>
#> 1 Thailand 2023-01-10 0.0062 29164 4752782 3.47e4 1018
#> 2 US 2023-01-10 0.0006 59695 101345042 1.10e6 864
#> 3 Germany 2023-01-10 0.0006 22119 37562191 1.63e5 269
#> 4 Japan 2023-01-10 0.0026 78982 30670001 6.04e4 253
#> 5 Brazil 2023-01-10 0.0021 75218 36552432 6.95e5 206
#> 6 France 2023-01-10 0.0003 11773 39625664 1.64e5 115
#> 7 Korea, South 2023-01-10 0.0018 54343 29654090 3.27e4 76
#> 8 China 2023-01-10 0.002 9379 4804906 1.78e4 75
#> 9 Russia 2023-01-10 0.0001 3010 21524480 3.86e5 46
#> 10 Australia 2023-01-10 0.0001 1150 11211305 1.74e4 31
#> # … with abbreviated variable name ¹
rg$ggplotTopCountriesStackedBarDailyInc(included.countries = africa.countries,
countries.text = "Africa")
#> Warning: Removed 318 rows containing missing values (`position_stack()`).
rc$ggplotComparisonExponentialGrowth(included.countries = africa.countries, min.cases = 20)
#> Warning: ggrepel: 41 unlabeled data points (too many overlaps). Consider
#> increasing max.overlaps
rg$ggplotCountriesLines(included.countries = africa.countries, countries.text = "Africa countries",
field = "", log.scale = TRUE)
#> Warning: Removed 318 rows containing missing values (`geom_line()`).
#> Warning: ggrepel: 24 unlabeled data points (too many overlaps). Consider
#> increasing max.overlaps
rc$ggplotComparisonExponentialGrowth(included.countries = africa.countries,
field = "deaths", y.label = "deaths", min.cases = 1)
#> Warning: ggrepel: 37 unlabeled data points (too many overlaps). Consider
#> increasing max.overlaps
#> Warning: Removed 69 rows containing missing values (`position_stack()`).
rc$ggplotComparisonExponentialGrowth(included.countries = international.countries,
min.cases = 100)
#> Warning: Removed 2 rows containing missing values (`geom_line()`).
#> Warning: ggrepel: 5 unlabeled data points (too many overlaps). Consider
#> increasing max.overlaps
rg$ggplotCountriesLines(field = "", log.scale = TRUE)
#> Warning: Removed 66 rows containing missing values (`geom_line()`).
rg$ggplotCountriesLines(field = "", log.scale = TRUE)
#> Warning: Transformation introduced infinite values in continuous y-axis
#> Warning in self$trans$transform(x): NaNs produced
#> Warning: Transformation introduced infinite values in continuous y-axis
#> Warning in self$trans$transform(x): NaNs produced
#> Warning: Transformation introduced infinite values in continuous y-axis
#> Warning: Removed 119 rows containing missing values (`geom_line()`).
#> Warning: Removed 1 rows containing missing values (`geom_text_repel()`).
rg$ggplotCountriesBarGraphs(selected.country = "Ethiopia")