README.md
In jnshsrs/corona: Corona Data Import

corona

This package support to import, prepare, model and visualize the data about COVID19 infections from the Johns Hopkins University github repository.

You can install the released version of corona from github with:

install.packages("devtools")
devtools::install_github("jnshsrs/corona")

To load the dataset, just call read_corona(). The function will fetch the latest data from the JHU github repository.

Opposed to the JHU github data, which is structured in a wide format (each day has a column and each row represents a country), this dataset is rearranged into a long format where each country and day reprents a row.

library(dplyr, warn.conflicts = FALSE)
library(corona)

# Import the corona
data <- read_corona()

data
#> # A tibble: 19,875 x 7
#>    country     date         Lat  Long infections deaths recoveries
#>    <chr>       <date>     <dbl> <dbl>      <dbl>  <dbl>      <dbl>
#>  1 Afghanistan 2020-01-22    33    65          0      0          0
#>  2 Afghanistan 2020-01-23    33    65          0      0          0
#>  3 Afghanistan 2020-01-24    33    65          0      0          0
#>  4 Afghanistan 2020-01-25    33    65          0      0          0
#>  5 Afghanistan 2020-01-26    33    65          0      0          0
#>  6 Afghanistan 2020-01-27    33    65          0      0          0
#>  7 Afghanistan 2020-01-28    33    65          0      0          0
#>  8 Afghanistan 2020-01-29    33    65          0      0          0
#>  9 Afghanistan 2020-01-30    33    65          0      0          0
#> 10 Afghanistan 2020-01-31    33    65          0      0          0
#> # … with 19,865 more rows

Additionally, the corona-package comes with reader functions for the three statistics, i.e., number of infections, number of deaths and number of recoveries, but these functions are mainly used as helper functions for the read_corona.


# Import numbers of infection 
read_infections()

# Import numbers of deaths
read_deaths()

# Import number of recoveries
read_recoveries()

Prepare the data

To work with the data, we have to preprocess the data


# Prepare data
data_germany <- data %>% 
  preprocess_corona_data(statistic = "infections", 
                         countries = "Germany", 
                         n = 100)

Predict cases


# Predict the cases
data_germany %>% predict_growth() 
#> # A tibble: 40 x 7
#> # Groups:   country [1]
#>    country   Lat  Long date       statistic   day predicted_cases
#>    <chr>   <dbl> <dbl> <date>         <dbl> <int>           <dbl>
#>  1 Germany    51     9 2020-03-01       130     1            257.
#>  2 Germany    51     9 2020-03-02       159     2            312.
#>  3 Germany    51     9 2020-03-03       196     3            380.
#>  4 Germany    51     9 2020-03-04       262     4            462.
#>  5 Germany    51     9 2020-03-05       482     5            562.
#>  6 Germany    51     9 2020-03-06       670     6            683.
#>  7 Germany    51     9 2020-03-07       799     7            831.
#>  8 Germany    51     9 2020-03-08      1040     8           1011.
#>  9 Germany    51     9 2020-03-09      1176     9           1230.
#> 10 Germany    51     9 2020-03-10      1457    10           1496.
#> # … with 30 more rows

Plot the growth curve of infected cases


# Data pipeline
data %>% 
  preprocess_corona_data(statistic = "infections", 
                         countries = "Spain", 
                         n = 100) %>% 
  predict_growth() %>% 
  plot_country(show_model = TRUE)

Plot the number of deaths

# Data pipeline
data %>% 
  preprocess_corona_data(statistic = "deaths", 
                         countries = "Italy", 
                         n = 10) %>% 
  predict_growth() %>%
  plot_country(show_model = TRUE) +
  ggplot2::ggtitle("Corona  Death Growth Curve in Italy", 
                   subtitle = "Starte date is the first day with > 10 deaths")
#> Warning: Removed 1 row(s) containing missing values (geom_path).

Look at the Growth Model

The function lm_corona takes a preprocessed corona dataset (as tibble or dataframe) and returns a dataframe with the parameters of a exponential growth model.

The column base_rate and growth rate indicte the initial case numbers and the estimated growth across the entire time period.

data %>%
  preprocess_corona_data(
    countries = "Germany",
    statistic = "infections",
    n = 100
  ) %>% 
  lm_corona()
#> # A tibble: 1 x 7
#> # Groups:   country [1]
#>   country models  r_sq lm_intercept lm_slope base_rate growth_rate
#>   <chr>   <list> <dbl>        <dbl>    <dbl>     <dbl>       <dbl>
#> 1 Germany <lm>   0.958         2.32   0.0851      211.        1.22

The funcition plot_country plots the exponential growth model for the given country (note that this function can process only one country, a function to compare countries is not available so far).

# Data pipeline
data %>% 
  preprocess_corona_data(statistic = "deaths", 
                         countries = "Germany", 
                         n = 100) %>% 
  predict_growth() %>%
  plot_country(show_model = TRUE) +
  ggplot2::ggtitle("Number of cumulative deaths in Germany", 
                   "Days since the 100th case included")
#> Warning: Removed 96 row(s) containing missing values (geom_path).

# Data pipeline
data %>% 
  preprocess_corona_data(statistic = "infections", 
                         countries = "Germany", 
                         n = 10000) %>% 
  predict_growth() %>%
  plot_country(show_model = TRUE) +
  ggplot2::ggtitle("Number of cumulative deaths in Germany", 
                   "Days since the 10000th (1e4) case included")
#> Warning: Removed 66 row(s) containing missing values (geom_path).


data %>% 
  group_by(country, date) %>% 
  summarise_at(c("infections", "deaths", "recoveries"), sum)
#> # A tibble: 13,725 x 5
#> # Groups:   country [183]
#>    country     date       infections deaths recoveries
#>    <chr>       <date>          <dbl>  <dbl>      <dbl>
#>  1 Afghanistan 2020-01-22          0      0          0
#>  2 Afghanistan 2020-01-23          0      0          0
#>  3 Afghanistan 2020-01-24          0      0          0
#>  4 Afghanistan 2020-01-25          0      0          0
#>  5 Afghanistan 2020-01-26          0      0          0
#>  6 Afghanistan 2020-01-27          0      0          0
#>  7 Afghanistan 2020-01-28          0      0          0
#>  8 Afghanistan 2020-01-29          0      0          0
#>  9 Afghanistan 2020-01-30          0      0          0
#> 10 Afghanistan 2020-01-31          0      0          0
#> # … with 13,715 more rows

Daily growths rates of infections

library(ggplot2)

data %>% 
  preprocess_corona_data(statistic = "infections",
                         countries = c("Germany",
                                       "Italy", 
                                       "Spain",
                                       "US",
                                       "Vietnam"),
                         n = 100) %>% 
  mutate(daily_growth_rate = statistic / lag(statistic)) %>% 
  filter(!is.na(daily_growth_rate)) %>% 
  ggplot(aes(x = date, y = daily_growth_rate, col = country)) +
  geom_line(alpha = .4) +
  geom_smooth(method = "loess", se = FALSE, span = .55) +
  scale_y_continuous("Daily growth rate (smoothed)") +
  scale_x_date(breaks = seq(min(data$date),
                            max(data$date),
                            by = "2 days"), 
               label = scales::date_format(format = "%d %b")) + 
  scale_color_discrete("Country") +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 90), 
        axis.title.x = element_blank()) +
  ggtitle("Daily growth rates since the 100th case",
          "Observed data is superimposed by smoothed lines")
#> `geom_smooth()` using formula 'y ~ x'

Daily growth rate of deaths

library(ggplot2)

data %>% 
  preprocess_corona_data(statistic = "deaths",
                         countries = c("Germany",
                                       "Italy", 
                                       "Spain",
                                       "US",
                                       "Vietnam"),
                         n = 100) %>% 
  mutate(daily_growth_rate = statistic / lag(statistic)) %>% 
  filter(!is.na(daily_growth_rate)) %>% 
  ggplot(aes(x = date, y = daily_growth_rate, col = country)) +
  geom_line(alpha = .4) +
  geom_smooth(method = "loess", formula = "y ~ x", se = FALSE, span = .55) +
  scale_y_continuous("Daily growth rate (smoothed)") +
  scale_x_date(breaks = seq(min(data$date),
                            max(data$date) + lubridate::days(3),
                            by = "2 days"), 
               label = scales::date_format(format = "%d %b")) + 
  scale_color_discrete("Country") +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 90), 
        axis.title.x = element_blank()) +
  ggtitle("Daily deaths growth rates since the 100th case",
          "Observed data is superimposed by smoothed lines")

data %>% 
  preprocess_corona_data(statistic = "infections",
                         countries = c("Germany",
                                       "Italy", 
                                       "Spain",
                                       "US",
                                       "Vietnam"),
                         n = 7000) %>% 
  mutate(daily_growth_rate = statistic / lag(statistic)) %>% 
  mutate_at("daily_growth_rate", function(x) x - 1) %>% 
  filter(country == c("Germany")) %>% 
  mutate(format = scales::percent(daily_growth_rate, accuracy = 1)) %>% 
  filter(!is.na(daily_growth_rate)) %>% 
  ggplot(aes(x = date, y = statistic)) +
  geom_point(aes(size = daily_growth_rate)) +
  geom_line(linetype = 3, size = .5) +
  geom_text(aes(label = format), nudge_x = 0, nudge_y = .2) +
  scale_y_log10("Cumulative Infections", 
                breaks = c(1e4, 2e4, 5e4, 1e5, 2e5, 5e5),
                limits = c(1e4, 5e5),
                labels = c("10k", "20k", "50k", "100k", "200k", "500k"),
                minor_breaks = NULL) +
  facet_wrap(~ country, ncol = 1) +
  theme_minimal()  +
  scale_size("Daily Growth Rate", labels = scales::percent) +
  scale_x_date(breaks = seq(min(data$date),
                            max(data$date) + lubridate::days(3),
                            by = "2 days"), 
               label = scales::date_format(format = "%d %b")) +
  theme(legend.position = "bottom",
        axis.title.x = element_blank(), 
        axis.text.x = element_text(angle = 90)) +
  ggtitle("Cumulative Infection Count", "Daily Growth Rate for each Day in percent")
#> Warning: Removed 1 rows containing missing values (geom_point).
#> Warning: Removed 1 row(s) containing missing values (geom_path).
#> Warning: Removed 1 rows containing missing values (geom_text).