In jnshsrs/corona: Corona Data Import

knitr::opts_chunk$set(
  collapse = TRUE,
  comment = "#>",
  fig.path = "man/figures/README-",
  fig.retina = 10,
  out.width = "100%"
)

corona

This package support to import, prepare, model and visualize the data about COVID19 infections from the Johns Hopkins University github repository.

Installation

You can install the released version of corona from github with:

install.packages("devtools")
devtools::install_github("jnshsrs/corona")

Import JHU corona data

To load the dataset, just call read_corona(). The function will fetch the latest data from the JHU github repository.

Opposed to the JHU github data, which is structured in a wide format (each day has a column and each row represents a country), this dataset is rearranged into a long format where each country and day reprents a row.

library(dplyr, warn.conflicts = FALSE)
library(corona)

# Import the corona
data <- read_corona()

data

Additionally, the corona-package comes with reader functions for the three statistics, i.e., number of infections, number of deaths and number of recoveries, but these functions are mainly used as helper functions for the read_corona.

# Import numbers of infection 
read_infections()

# Import numbers of deaths
read_deaths()

# Import number of recoveries
read_recoveries()

Prepare the data

To work with the data, we have to preprocess the data

# Prepare data
data_germany <- data %>% 
  preprocess_corona_data(statistic = "infections", 
                         countries = "Germany", 
                         n = 100)

Predict cases

# Predict the cases
data_germany %>% predict_growth()

Plot the growth curve of infected cases

# Data pipeline
data %>% 
  preprocess_corona_data(statistic = "infections", 
                         countries = "Spain", 
                         n = 100) %>% 
  predict_growth() %>% 
  plot_country(show_model = TRUE)

Plot the number of deaths

# Data pipeline
data %>% 
  preprocess_corona_data(statistic = "deaths", 
                         countries = "Italy", 
                         n = 10) %>% 
  predict_growth() %>%
  plot_country(show_model = TRUE) +
  ggplot2::ggtitle("Corona  Death Growth Curve in Italy", 
                   subtitle = "Starte date is the first day with > 10 deaths")

Look at the Growth Model

The function lm_corona takes a preprocessed corona dataset (as tibble or dataframe) and returns a dataframe with the parameters of a exponential growth model.

The column base_rate and growth rate indicte the initial case numbers and the estimated growth across the entire time period.

data %>%
  preprocess_corona_data(
    countries = "Germany",
    statistic = "infections",
    n = 100
  ) %>% 
  lm_corona()

The funcition plot_country plots the exponential growth model for the given country (note that this function can process only one country, a function to compare countries is not available so far).

# Data pipeline
data %>% 
  preprocess_corona_data(statistic = "deaths", 
                         countries = "Germany", 
                         n = 100) %>% 
  predict_growth() %>%
  plot_country(show_model = TRUE) +
  ggplot2::ggtitle("Number of cumulative deaths in Germany", 
                   "Days since the 100th case included")

# Data pipeline
data %>% 
  preprocess_corona_data(statistic = "infections", 
                         countries = "Germany", 
                         n = 10000) %>% 
  predict_growth() %>%
  plot_country(show_model = TRUE) +
  ggplot2::ggtitle("Number of cumulative deaths in Germany", 
                   "Days since the 10000th (1e4) case included")

data %>% 
  group_by(country, date) %>% 
  summarise_at(c("infections", "deaths", "recoveries"), sum)

Daily growths rates of infections

library(ggplot2)

data %>% 
  preprocess_corona_data(statistic = "infections",
                         countries = c("Germany",
                                       "Italy", 
                                       "Spain",
                                       "US",
                                       "Vietnam"),
                         n = 100) %>% 
  mutate(daily_growth_rate = statistic / lag(statistic)) %>% 
  filter(!is.na(daily_growth_rate)) %>% 
  ggplot(aes(x = date, y = daily_growth_rate, col = country)) +
  geom_line(alpha = .4) +
  geom_smooth(method = "loess", se = FALSE, span = .55) +
  scale_y_continuous("Daily growth rate (smoothed)") +
  scale_x_date(breaks = seq(min(data$date),
                            max(data$date),
                            by = "2 days"), 
               label = scales::date_format(format = "%d %b")) + 
  scale_color_discrete("Country") +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 90), 
        axis.title.x = element_blank()) +
  ggtitle("Daily growth rates since the 100th case",
          "Observed data is superimposed by smoothed lines")

Daily growth rate of deaths

library(ggplot2)

data %>% 
  preprocess_corona_data(statistic = "deaths",
                         countries = c("Germany",
                                       "Italy", 
                                       "Spain",
                                       "US",
                                       "Vietnam"),
                         n = 100) %>% 
  mutate(daily_growth_rate = statistic / lag(statistic)) %>% 
  filter(!is.na(daily_growth_rate)) %>% 
  ggplot(aes(x = date, y = daily_growth_rate, col = country)) +
  geom_line(alpha = .4) +
  geom_smooth(method = "loess", formula = "y ~ x", se = FALSE, span = .55) +
  scale_y_continuous("Daily growth rate (smoothed)") +
  scale_x_date(breaks = seq(min(data$date),
                            max(data$date) + lubridate::days(3),
                            by = "2 days"), 
               label = scales::date_format(format = "%d %b")) + 
  scale_color_discrete("Country") +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 90), 
        axis.title.x = element_blank()) +
  ggtitle("Daily deaths growth rates since the 100th case",
          "Observed data is superimposed by smoothed lines")

data %>% 
  preprocess_corona_data(statistic = "infections",
                         countries = c("Germany",
                                       "Italy", 
                                       "Spain",
                                       "US",
                                       "Vietnam"),
                         n = 7000) %>% 
  mutate(daily_growth_rate = statistic / lag(statistic)) %>% 
  mutate_at("daily_growth_rate", function(x) x - 1) %>% 
  filter(country == c("Germany")) %>% 
  mutate(format = scales::percent(daily_growth_rate, accuracy = 1)) %>% 
  filter(!is.na(daily_growth_rate)) %>% 
  ggplot(aes(x = date, y = statistic)) +
  geom_point(aes(size = daily_growth_rate)) +
  geom_line(linetype = 3, size = .5) +
  geom_text(aes(label = format), nudge_x = 0, nudge_y = .2) +
  scale_y_log10("Cumulative Infections", 
                breaks = c(1e4, 2e4, 5e4, 1e5, 2e5, 5e5),
                limits = c(1e4, 5e5),
                labels = c("10k", "20k", "50k", "100k", "200k", "500k"),
                minor_breaks = NULL) +
  facet_wrap(~ country, ncol = 1) +
  theme_minimal()  +
  scale_size("Daily Growth Rate", labels = scales::percent) +
  scale_x_date(breaks = seq(min(data$date),
                            max(data$date) + lubridate::days(3),
                            by = "2 days"), 
               label = scales::date_format(format = "%d %b")) +
  theme(legend.position = "bottom",
        axis.title.x = element_blank(), 
        axis.text.x = element_text(angle = 90)) +
  ggtitle("Cumulative Infection Count", "Daily Growth Rate for each Day in percent")