data-raw/update_data.R

# Prior to running this, one should
# Clone the repo at https://github.com/CSSEGISandData/COVID-19/ into the same level as the covid19 directory (same level, not same directory)

# Pull from JHU
system('cd ../../COVID-19; pwd; git pull;')

Pull from ibesora's webscraping app
system('cd ../../covid-19-data; pwd; git pull;')


library(dplyr)
library(readr)
library(tidyr)
library(readxl)
library(rgdal)
library(sp)
library(raster)



# Johns Hopkins data
# Define the files at https://github.com/CSSEGISandData/COVID-19/blob/master/csse_covid_19_data/csse_covid_19_daily_reports
reports_dir <- '../../COVID-19/csse_covid_19_data/csse_covid_19_daily_reports/'
files <- dir(reports_dir)
files <- files[grepl('.csv', files, fixed = T)]

# Read in all files
out_list <- list()
for(i in 1:length(files)){
  this_file <- files[i]
  this_date <- as.Date(substr(this_file, 1, 10), format = '%m-%d-%Y')
  this_data <- read_csv(paste0(reports_dir, this_file)) 
  # There was a change in formatting, so have to do this in two different ways
  if('Admin2' %in% names(this_data)){
    this_data <-this_data %>%
      dplyr::select(district = Province_State,
                    country = Country_Region,
                    time_stamp = Last_Update,
                    cases = Confirmed,
                    deaths = Deaths,
                    recovered = Recovered)
  } else {
    this_data <-this_data %>%
      dplyr::rename(district = `Province/State`,
                    country = `Country/Region`,
                    time_stamp = `Last Update`,
                    cases = Confirmed,
                    deaths = Deaths,
                    recovered = Recovered)
  }
  
  
  out_list[[i]] <- this_data %>% mutate(time_stamp = as.character(time_stamp)) %>%
    mutate(date = this_date)
}
df <- bind_rows(out_list) %>% dplyr::select(-Latitude, -Longitude, -recovered)

#
# For China, get all on one
df <- df %>% mutate(country = ifelse(country == 'Mainland China',
                                       'China',
                                       country)) %>%
  mutate(country = ifelse(country == 'Korea, South', 'South Korea',
                          ifelse(country == 'Hong Kong SAR', 'Hong Kong',
                                 ifelse(country == 'Republic of Korea', 'North Korea',
                                        ifelse(country == 'Macao SAR', 'Macao',
                                               ifelse(country == 'Czech Republic', 'Czechia',
                                                      ifelse(country == 'Republic of Moldova', 'Moldova',
                                                             ifelse(country == 'Russian Federation',
                                                                    'Russia',
                                                                    ifelse(country == 'Iran (Islamic Republic of)',
                                                                           'Iran', country))))))))) %>%
  mutate(country = ifelse(country == 'Gambia, The', 'Gambia', country)) %>%
  mutate(country = ifelse(country == 'UK', 'United Kingdom', country)) %>%
  mutate(district = ifelse(country %in% have_pop,
                                        district,
                                        NA)) %>%
  group_by(country, district, date) %>%
  summarise(cases = sum(cases, na.rm = TRUE),
            deaths = sum(deaths, na.rm = TRUE)) %>%
  ungroup
  

# For US
# Up until March 9, data shows by sub-states but not States
# Beginning on March 10, shows states, not sub-states
usiso2 <- readLines('usiso2/usiso2.txt')
iso2 <- unlist(lapply(strsplit(usiso2, split = '\t', fixed = T), function(x){x[1]}))
district <- unlist(lapply(strsplit(usiso2, split = '\t', fixed = T), function(x){x[2]}))
usiso2 <- tibble(district, iso2)

us_old <- df %>% filter(country == 'US', date <= '2020-03-09')
us_new <- df %>% filter(country == 'US', date >= '2020-03-10')

# In us_old, keep only the sub-states, since that's where the data is
us_old <- us_old %>% filter(grepl(',', district))
# Do the opposite in us_new (ie, keep only the states)
us_new <- us_new %>% filter(!grepl(',', district))

# In us_old, get the state
us_old$iso2 <- unlist(lapply(strsplit(us_old$district, ', '), function(x){x[length(x)]}))
# now bring in state name
us_old <- left_join(us_old %>% dplyr::select(-district), usiso2) %>% dplyr::select(-iso2)
us_old <- us_old %>%
  group_by(country, date, district) %>%
  summarise(cases = sum(cases, na.rm = T),
            deaths = sum(deaths, na.rm = TRUE)) %>% ungroup
# bring it all together now
df <- df %>%
  filter(country != 'US') %>%
  bind_rows(us_old,
            us_new)

# Get most recent Spanish ministry data
library(gsheet)
url <- 'https://docs.google.com/spreadsheets/d/15UJWpsW6G7sEImE8aiQ5JrLCtCnbprpztfoEwyTNTEY/edit#gid=810081118'
esp_df <- gsheet::gsheet2tbl(url)
right <- esp_df %>%
  group_by(date) %>%
  summarise(cases = sum(cases, na.rm = TRUE),
            deaths  = sum(deaths, na.rm = TRUE))

# Get Italy regional data

# https://code.montera34.com:4443/numeroteca/covid19
ita <- read.delim("https://raw.githubusercontent.com/pcm-dpc/COVID-19/master/dati-regioni/dpc-covid19-ita-regioni.csv",sep = ",")  

# Process data
ita$date <-  as.Date(ita$data)
ita <- ita %>%
  dplyr::select(ccaa = denominazione_regione,
                date,
                cases = totale_casi,
                uci = terapia_intensiva,
                hospitalizations = totale_ospedalizzati,
                deaths = deceduti)
ita$ccaa <- as.character(ita$ccaa)
ita <- ita %>%
  mutate(ccaa = ifelse(ccaa %in% c('P.A. Bolzano', 'P.A. Trento'), 'Trentino-Alto Adige',
                       ccaa)) %>%
  group_by(ccaa, date) %>%
  summarise(cases = sum(cases, na.rm = TRUE),
            uci = sum(uci, na.rm = TRUE),
            hospitalizations = sum(hospitalizations, na.rm = TRUE),
            deaths = sum(deaths, na.rm = TRUE))

# Get the district-level data for Spain and Italy Portugal, France too in (since JHU doesn't have it)
do_italy_spain <- TRUE
if(do_italy_spain){
  # Italy
  left <- ita %>% 
    dplyr::rename(district = ccaa) %>%
    mutate(country = 'Italy')
  right <- df %>% filter(country == 'Italy') %>%
    dplyr::select(date,country)
  add_these <- df %>% filter(country == 'Italy') %>%
    filter(!date %in% left$date,
           date < max(left$date)) # remove any obs AHEAD of ministry data
  joined <- left_join(left, right) %>% bind_rows(add_these)
  df <- df %>%
    mutate(flag = (country == 'Italy' & date %in% joined$date) | (country == 'Italy' & date > max(ita$date))) %>%
    filter(!flag) %>% dplyr::select(-flag) %>%
    bind_rows(joined)
  
  # Spain
  left <- esp_df %>% 
    dplyr::rename(district = ccaa) %>%
    mutate(country = 'Spain') %>%
    dplyr::select( -comment)
  right <- df %>% filter(country == 'Spain') %>%
    dplyr::select(date,country)
  add_these <- df %>% filter(country == 'Spain') %>%
    filter(!date %in% left$date,
           date < max(left$date)) # remove any obs AHEAD of ministry data
  joined <- left_join(left, right) %>% bind_rows(add_these)
  df <- df %>%
    mutate(flag = (country == 'Spain' & date %in% joined$date) | (country == 'Spain' & date > max(esp_df$date))) %>%
    filter(!flag) %>% dplyr::select(-flag) %>%
    bind_rows(joined)
  
  # Portugal
  left <- por_df %>% 
    dplyr::rename(district = ccaa) %>%
    mutate(country = 'Portugal')
  right <- df %>% filter(country == 'Portugal') %>%
    dplyr::select(date,country)
  add_these <- df %>% filter(country == 'Portugal') %>%
    filter(!date %in% left$date,
           date < max(left$date)) # remove any obs AHEAD of ministry data
  joined <- left_join(left, right) %>% bind_rows(add_these)
  df <- df %>%
    mutate(flag = (country == 'Portugal' & date %in% joined$date) | (country == 'Portugal' & date > max(por_df$date))) %>%
    filter(!flag) %>% dplyr::select(-flag) %>%
    bind_rows(joined)
  
  # # France (special case because JHU has some)
  # NOT DOING DUE TO INCOMPLETE DATA FOR FRANCE
  # left <- fra_df %>% 
  #   dplyr::rename(district = ccaa) %>%
  #   mutate(country = 'France')
  # right <- df %>% filter(country == 'France') %>%
  #   dplyr::distinct(date,country)
  # add_these <- df %>% filter(country == 'France') %>%
  #   group_by(date, country) %>% mutate(cases = sum(cases, na.rm = TRUE), deaths = sum(deaths, na.rm = TRUE)) %>%
  #   mutate(district = NA) %>%
  #   filter(!date %in% left$date,
  #          date < max(left$date)) # remove any obs AHEAD of ministry data
  # joined <- left_join(left, right) %>% bind_rows(add_these)
  # df <- df %>%
  #   mutate(flag = (country == 'France' & date %in% joined$date) | (country == 'France' & date > max(fra_df$date))) %>%
  #   filter(!flag) %>% dplyr::select(-flag) %>%
  #   bind_rows(joined)
}

# Decumulate
df <- df %>%
  ungroup %>%
  arrange(country, district, date) %>%
  group_by(country, district) %>%
  mutate(cases_non_cum = cases - dplyr::lag(cases, default = 0),
         deaths_non_cum = deaths - dplyr::lag(deaths, default = 0),
         uci_non_cum = uci - dplyr::lag(uci, default = 0)) %>%
  ungroup

# Deal with errors in US data
df$cases[df$district == 'Nevada' & df$country == 'US' & df$date == '2020-03-17'] <- 50
df$cases[df$district == 'Utah' & df$country == 'US' & df$date == '2020-03-19'] <- 77
df$cases[df$district == 'Washington' & df$country == 'US' & df$date == '2020-03-17'] <- 1000
df$cases[df$district == 'Grand Princess' & df$country == 'US' & df$date == '2020-03-16'] <- 21
df$deaths[df$district == 'Oregon' & df$country == 'US' & df$date == '2020-03-22'] <- 5

# Join all together but by country
df_country <- df %>%
  group_by(country, date) %>%
  dplyr::summarise(cases = sum(cases, na.rm = TRUE),
                        deaths = sum(deaths, na.rm = TRUE),
                        uci = sum(uci, na.rm = TRUE),
                        hospitalizations = sum(hospitalizations, na.rm = TRUE),
                        cases_non_cum = sum(cases_non_cum, na.rm = TRUE),
                        deaths_non_cum = sum(deaths_non_cum, na.rm = TRUE),
                        uci_non_cum = sum(uci_non_cum, na.rm = TRUE))

# Get the iso code for each country
countries <- sort(unique(df$country))
country_codes <- tibble(country = countries)
library(passport)
country_codes$iso <- parse_country(x = countries, to = 'iso3c')
df <- left_join(df, country_codes)
df_country <- left_join(df_country, country_codes)

usethis::use_data(df, overwrite = T)
usethis::use_data(df_country, overwrite = T)

# Make spreadsheet for ISGlobal
isglobal <- df_country %>%
  filter(date == max(date)) %>%
  dplyr::select(date, 
                country, 
                cases,
                deaths) %>%
  left_join(country_codes)


if(!dir.exists('isglobal')){
  dir.create('isglobal')
}
write_csv(isglobal, 'isglobal/isglobal.csv')

write_csv(df, 'isglobal/world_region_data.csv')
write_csv(df_country, 'isglobal/world_data.csv')


# Read population data for Spain's CCAA
esp_pop <- tibble(ccaa = c("Andalucía",
                           "Aragón",
                           "Asturias",
                           "Baleares",
                           "C. Valenciana",
                           "Canarias",
                           "Cantabria",
                           "Cataluña",
                           "Ceuta",
                           "CLM",
                           "CyL",
                           "Extremadura",
                           "Galicia",
                           "La Rioja",
                           "Madrid",
                           "Melilla",
                           "Murcia",
                           "Navarra",
                           "País Vasco"),
                  pop = c(8414240,#  "Andalucía",
                            1319291,#"Aragón",
                            1022800,#"Asturias",
                            1149460,#"Baleares",
                            5003769,#"C. Valenciana",
                            2153389,#"Canarias",
                            581078,#"Cantabria",
                            7675217,#"Cataluña",
                            84777,#"Ceuta",
                          2032863, #"CLM",
                            2399548,#"CyL",
                            1067710,#"Extremadura",
                            2699499,#"Galicia",
                            316798,#"La Rioja",
                            6663394,#"Madrid",
                            86487,#"Melilla",
                            1493898,#"Murcia",
                            654214,#"Navarra",
                            2207776))#"País Vasco"))
usethis::use_data(esp_pop, overwrite = TRUE)


# Interpret missing UCI cases
esp_df <- esp_df %>%
  arrange(date) %>%
  group_by(ccaa) %>%
  mutate(uci = zoo::na.approx(uci, na.rm = F))

# Get non cumulative cases
esp_df <- esp_df %>%
  ungroup %>%
  arrange(ccaa, date) %>%
  group_by(ccaa) %>%
  mutate(cases_non_cum = cases - lag(cases, default = 0),
         deaths_non_cum = deaths - lag(deaths, default = 0),
         uci_non_cum = uci - lag(uci, default = 0)) %>%
  ungroup

# Spot corrections
esp_df <- esp_df %>%
  mutate(cases_non_cum = ifelse(cases_non_cum < 0,
                                          0, 
                                cases_non_cum)) 


# De-cumulate Italy
ita <- ita %>%
  ungroup %>%
  arrange(ccaa, date) %>%
  group_by(ccaa) %>%
  mutate(cases_non_cum = cases - lag(cases, default = 0),
         deaths_non_cum = deaths - lag(deaths, default = 0),
         uci_non_cum = uci - lag(uci, default = 0),
         hospitalizations_non_cum = hospitalizations - lag(hospitalizations, default = 0)) %>%
  ungroup

# De-cumulate france
fra_df <- fra_df %>%
  ungroup %>%
  arrange(ccaa, date) %>%
  group_by(ccaa) %>%
  mutate(cases_non_cum = cases - lag(cases, default = 0),
         deaths_non_cum = deaths - lag(deaths, default = 0)) %>%
  ungroup

# De-cumulate portugal
por_df <- por_df %>%
  ungroup %>%
  arrange(ccaa, date) %>%
  group_by(ccaa) %>%
  mutate(cases_non_cum = cases - lag(cases, default = 0),
         deaths_non_cum = deaths - lag(deaths, default = 0)) %>%
  ungroup


# By province
# https://docs.google.com/spreadsheets/d/1qxbKnU39yn6yYcNkBqQ0mKnIXmKfPQ4lgpNglpJ9frE/edit#gid=0

usethis::use_data(ita, overwrite = T)
usethis::use_data(por_df, overwrite = T)
usethis::use_data(fra_df, overwrite = T)

# Get Italian populations
ita_pop <- tibble(
  ccaa = c('Abruzzo',
           'Basilicata',
           'Calabria',
           'Campania',
           'Emilia-Romagna',
           'Friuli Venezia Giulia',
           'Lazio',
           'Liguria',
           'Lombardia',
           'Marche',
           'Molise',
           'Trentino-Alto Adige',
           'Piemonte',
           'Puglia',
           'Sardegna',
           'Sicilia',
           'Toscana',
           'Umbria',
           'Valle d\'Aosta',
'Veneto'),
pop = c(
  1315000,#'Abruzzo',
  567118,#'Basilicata',
  1957000,#'Calabria',
  5827000,#'Campania',
  4453000,#'Emilia Romagna',
  1216000,#'Friuli Venezia Giulia',
  5897000,#'Lazio',
  1557000,#'Liguria',
  10040000,#'Lombardia',
  1532000,#'Marche',
  308493,#'Molise',
  1070000, # trentino alto
  4376000,#'Piemonte',
  4048000,#'Puglia',
  1648000,#'Sardegna',
  5027000,#'Sicilia',
  3737000,#'Toscana',
  884640,#'Umbria',
  126202,#'Valle d\'Aosta',
  4905000#'Veneto'
)
)
usethis::use_data(ita_pop, overwrite = T)

library(readr)
write_csv(esp_df, 'spain/ccaa_day.csv')
write_csv(esp_df, 'isglobal/ccaa_day.csv')
write_csv(esp_pop, 'isglobal/esp_pop.csv')
write_csv(world_pop, 'isglobal/world_pop.csv')
write_csv(ita_pop, 'isglobal/ita_pop.csv')

# write_csv(esp_uci, 'isglobal/esp_uci.csv')

usethis::use_data(esp_df, overwrite = T)
# Write a map

# Cognoms
cognoms <- read_csv('cognoms/cognoms.csv', skip = 7, locale = readr::locale(encoding = "latin1"))
cognoms <- cognoms$Nombre[1:1000]
surnames <- read_csv('cognoms/surnames.csv')
surnames <- surnames$name[1:1000]
usethis::use_data(cognoms, overwrite = T)
usethis::use_data(surnames, overwrite = T)

# Create a regions pop for Italy, China, Spain, US, Portugal, France
usa_pop <- readxl::read_excel('usapop/usapop.xlsx', 
                              skip = 3)
canada_pop <- 
  tibble(ccaa = c('Alberta',
         'British Columbia',
         'Grand Princess',
         'Manitoba',
         'New Brunswick',
         'Newfoundland and Labrador',
         'Northwest Territories',
         'Nova Scotia',
         'Ontario',
         'Prince Edward Island',
         'Quebec',
         'Saskatchewan'),
         pop = c(4413146,
                 5110917,
                 NA,
                 1377517,
                 779993,
                 521365,
                 44904,
                 977457,
                 14711827,
                 158158,
                 8537674,
                 1181666))
names(usa_pop)[c(1,13)] <- c('ccaa', 'pop')
usa_pop <- usa_pop %>% dplyr::select('ccaa', 'pop')
usa_pop <- usa_pop[6:56,]
usa_pop$ccaa <- gsub('.', '', usa_pop$ccaa, fixed = TRUE)
regions_pop <- 
  bind_rows(esp_pop %>% mutate(country = 'Spain', iso = 'ESP'),
            ita_pop %>% mutate(country = 'Italy', iso = 'ITA'),
            chi_pop %>% mutate(country = 'China', iso = 'CHN'),
            usa_pop %>% mutate(country = 'US', iso = 'USA'),
            fra_pop %>% mutate(country = 'France', iso = 'FRA'),
            por_pop %>% mutate(country = 'Portugal', iso = 'PRT'),
            canada_pop %>% mutate(country = 'Canada', iso = 'CAN'))
usethis::use_data(regions_pop, overwrite = T)

setwd('..')
golem::detach_all_attached()
rm(list=ls(all.names = TRUE))
devtools::install()
setwd('data-raw/')
databrew/covid19USACAN documentation built on April 6, 2020, 12:38 a.m.