data-raw/update_data.R

# Prior to running this, one should
# Clone the repo at https://github.com/CSSEGISandData/COVID-19/ into the same level as the covid19 directory (same level, not same directory)



# Pull from JHU
system('cd ../../COVID-19; pwd; git pull;')

# Pull from ibesora's webscraping app
system('cd ../../covid-19-data; pwd; git pull;')

# Pull from data for French departments
# Consider replacing with this: Updated map (data through 31 March
system('cd ../../FRANCE-COVID-19; git pull')

# Pull data for Portugese regions
system('cd ../../covid19pt-data; git pull')

library(dplyr)
library(readr)
library(tidyr)
library(readxl)
library(rgdal)
library(sp)
library(raster)
library(rgdal)
library(raster)
library(sp)
library(readxl)
library(rmapshaper)
library(sf)

# Get our world in data testin gdata
owid <- read_csv('https://covid.ourworldindata.org/data/owid-covid-data.csv')
usethis::use_data(owid, overwrite = TRUE)


esp0 <- getData(country = 'ESP', level = 0)
usethis::use_data(esp0, overwrite = TRUE)

esp1 <- getData(country = 'ESP', level = 1)
usethis::use_data(esp1, overwrite = TRUE)

# Municipal codes
codes <- read_excel('20codmun.xlsx', skip = 1)

## Get Spanish census data
# Population by age and municipality
# https://www.ine.es/jaxi/Tabla.htm?path=/t20/e244/avance/p02/l0/&file=1mun00.px&L=0
download_url <- 'https://www.ine.es/jaxi/files/_px/es/csv_bdsc/t20/e244/avance/p02/l0/1mun00.csv_bdsc?nocab=1'
download_file <- '1mun00.csv'
if(!download_file %in% dir()){
  download.file(url = download_url,
                destfile = download_file)
}
# Read in
census <- read_delim(download_file, delim = ';')
# Clean up names
names(census) <- c('municipio',
                   'sexo',
                   'edad',
                   'periodio',
                   'total')
census$periodio <- NULL
# Remove unnecessary values
census <- census %>%
  filter(!municipio %in% 'TOTAL NACIONAL',
         !sexo %in% 'Ambos sexos',
         !edad %in% 'Total')
census$total <- ifelse(census$total == '..',
                       0,
                       as.numeric(as.character(gsub('.', '', 
                                                    census$total, fixed = T))))
census$edad <- gsub('años|año', '', census$edad)
census$edad <- gsub(' y más', '', census$edad)
census$edad <- as.numeric(as.character(census$edad))

census$id <- unlist(lapply(strsplit(census$municipio, ' '),
                           function(x){paste0(x[1], collapse = ' ')}))
census$id <- trimws(census$id)
# census$id <- as.numeric(census$id)
census$municipio <- unlist(lapply(strsplit(census$municipio, ' '),
                                  function(x){paste0(x[2:length(x)], collapse = ' ')}))
census$municipio <- trimws(census$municipio)
usethis::use_data(census, overwrite = TRUE)

# Get Spain at municipal level
# Must download teh following as 'lineas_limite.zip'
# http://centrodedescargas.cnig.es/CentroDescargas/catalogo.do?Serie=CAANE
# and then unzip
# unzip('lineas_limite.zip')
municipios <- readOGR('recintos_municipales_inspire_peninbal_etrs89',
                      'recintos_municipales_inspire_peninbal_etrs89', encoding="UTF-8")
# municipios <- st_read('recintos_municipales_inspire_peninbal_etrs89/recintos_municipales_inspire_peninbal_etrs89.shp')
# municipios <- read_sf('recintos_municipales_inspire_peninbal_etrs89/recintos_municipales_inspire_peninbal_etrs89.shp')
library(rgeos)
regions_df <- municipios@data
# x = gSimplify(municipios, tol = 0.05, topologyPreserve = TRUE)
# municipios <- sp::SpatialPolygonsDataFrame(x, regions_df)


# Get an ID in municipios
census_ids <- sort(unique(census$id))
census_ids <- sample(census_ids, length(census_ids))
ids <- substr(municipios$INSPIREID, 20, 24)

municipios$id <- ids
usethis::use_data(municipios, overwrite = TRUE)


# World cities
cities <- read_csv('worldcities.csv')
cities <- cities %>% filter(country == 'Spain')
cities <- cities %>% dplyr::select(city, lat, lng, population)
cities_sp <- cities %>% dplyr::mutate(x = lng, y = lat)
coordinates(cities_sp) <- ~x+y
proj4string(cities_sp) <- proj4string(municipios)
usethis::use_data(cities, overwrite = T)
usethis::use_data(cities_sp, overwrite = T)


# Read in cat municipal level data
muni <- read_csv('https://analisi.transparenciacatalunya.cat/api/views/jj6z-iyrp/rows.csv?accessType=DOWNLOAD&sorting=true')
muni <- muni %>%
  mutate(date = as.Date(TipusCasData, format = '%d/%m/%Y')) %>%
  group_by(date,
           ComarcaCodi,
           ComarcaDescripcio,
           MunicipiCodi,
           MunicipiDescripcio) %>%
  summarise(pcr = sum(NumCasos[TipusCasDescripcio == 'Positiu PCR'], na.rm = TRUE),
            rdt = sum(NumCasos[TipusCasDescripcio == 'Positiu per Test Ràpid'], na.rm = TRUE),
            suspect_cases = sum(NumCasos[TipusCasDescripcio == 'Sospitós'], na.rm = TRUE)) %>%
  ungroup %>%
  mutate(confirmed_cases = pcr + rdt,
         total_cases = pcr + rdt + suspect_cases)
# Get expanded
left <- expand.grid(
  date = seq(as.Date('2020-02-25'),
             max(muni$date),
             by = 1))
out_list <- list()
municipalities <- sort(unique(muni$MunicipiCodi))
for(i in 1:length(municipalities)){
  this_code <- municipalities[i]
  this_data <- muni %>%
    filter(MunicipiCodi == this_code)
  joined <- left_join(left, this_data, by = 'date') %>%
    tidyr::fill(ComarcaCodi, ComarcaDescripcio, MunicipiCodi, MunicipiDescripcio, .direction = 'downup') %>%
    mutate_at(.vars = vars(pcr, rdt, suspect_cases, confirmed_cases, total_cases),
              .funs = function(x){ifelse(is.na(x), 0, x)}) %>%
    dplyr::rename(pcr_non_cum = pcr,
                  rdt_non_cum = rdt,
                  suspect_cases_non_cum = suspect_cases,
                  confirmed_cases_non_cum = confirmed_cases,
                  total_cases_non_cum = total_cases) %>%
    dplyr::mutate(pcr = cumsum(pcr_non_cum),
                  rdt = cumsum(rdt_non_cum),
                  suspect_cases = cumsum(suspect_cases_non_cum),
                  confirmed_cases = cumsum(confirmed_cases_non_cum),
                  total_cases = cumsum(total_cases_non_cum))
  out_list[[i]] <- joined
}
muni <- bind_rows(out_list)

# Get population too into muni
poppy <- census %>%
  group_by(MunicipiCodi = id) %>%
  summarise(pop = sum(total))
muni <- left_join(
  x = muni,
  y = poppy
)

usethis::use_data(muni, overwrite = T)

cristina_bravo <- muni %>%
  mutate(incidencia_per_100k = confirmed_cases_non_cum / pop * 100000)
  
cristina_bravo_bcn <- cristina_bravo %>%
  filter(MunicipiDescripcio == 'Barcelona')

write_csv(cristina_bravo, 'cristina_bravo.csv')
write_csv(cristina_bravo_bcn, 'cristina_bravo_bcn.csv')

# Incidencia
# incidencia <- read_csv('https://analisi.transparenciacatalunya.cat/api/views/623z-r97q/rows.csv?accessType=DOWNLOAD&sorting=true')
# Read in economist excess mortality data
excess <- read_csv('https://github.com/TheEconomist/covid-19-excess-deaths-tracker/raw/master/output-data/historical-deaths/spain_weekly_deaths.csv')
usethis::use_data(excess, overwrite = T)


# Testing data from our world in data
testing <- read_csv('https://raw.githubusercontent.com/owid/covid-19-data/master/public/data/testing/covid-testing-all-observations.csv')
usethis::use_data(testing, overwrite = T)

# Pull from transpariencia catalunya
# https://analisi.transparenciacatalunya.cat/Salut/Registre-de-casos-de-COVID-19-realitzats-a-Catalun/qwj8-xpvk/data
if(!dir.exists('trans_cat/')){
  dir.create('trans_cat/')
}
trans_cat <- read_csv('https://analisi.transparenciacatalunya.cat/api/views/qwj8-xpvk/rows.csv?accessType=DOWNLOAD&sorting=true')
trans_cat <- trans_cat %>%
  mutate(date = as.Date(TipusCasData, format = '%d/%m/%Y'))
write_csv(trans_cat, 'trans_cat/trans_cat.csv')
usethis::use_data(trans_cat, overwrite = T)


# Province level data for all of Spain
provinces <- read_csv('https://github.com/montera34/escovid19data/blob/master/data/output/covid19-provincias-spain_consolidated.csv?raw=true')
usethis::use_data(provinces, overwrite = T)

# Get catalunya regions sanitarias
library(rgdal)
regions_sanitaries <- readOGR('regions_sanitaries/', 'RegionsS_2018', encoding = "latin1")
usethis::use_data(regions_sanitaries, overwrite = T)



# Pull from Spanish ministry
if(!dir.exists('isciii')){
  dir.create('isciii')
}
isc <- read_csv('https://cnecovid.isciii.es/covid19/resources/datos_ccaas.csv',
                col_names = c("CCAA", "date", "cases"),
                col_types = c('ccd'),
                skip = 1) %>%
  filter(!is.na(CCAA))
joiner <- tibble(CCAA = c('AN',
                          'AR',
                          'AS',
                          'CB',
                          'CE',
                          'CL',
                          'CM',
                          'CN',
                          'CT',
                          'EX',
                          'GA',
                          'IB',
                          'MC',
                          'MD',
                          'ML',
                          'NC',
                          'PV',
                          'RI',
                          'VC'),
                 ccaa = c('Andalucía',
                          'Aragón',
                          'Asturias',
                          'Cantabria',
                          'Ceuta',
                          'CyL',
                          'CLM',
                          'Canarias',
                          'Cataluña',
                          'Extremadura',
                          'Galicia',
                          'Baleares',
                          'Murcia',
                          'Madrid',
                          'Melilla',
                          'Navarra',
                          'País Vasco',
                          'La Rioja',
                          'C. Valenciana'))
isc <- left_join(isc, joiner)

isc <- isc %>%
  mutate(cases = ifelse(is.na(cases), 0, cases))

isc$date <- as.Date(isc$date)
isc <- isc %>% filter(!is.na(date))
message('MAX DATE IN SPAIN IS ', max(isc$date))
write_csv(isc, 'isciii/raw.csv')

# Get French data
cases <- read_csv('../../FRANCE-COVID-19/france_coronavirus_time_series-confirmed.csv')
deaths <- read_csv('../../FRANCE-COVID-19/france_coronavirus_time_series-deaths.csv')
# Clean up
clean_france <- function(x){
  x$Date <- as.Date(x$Date, format = '%d/%m/%Y')
  x$Total <- NULL
  x <- x %>% gather(key, value, names(x)[2:ncol(x)])
  x <- x %>% dplyr::rename(date = Date,
                           ccaa = key)
  return(x)
}
cases <- clean_france(cases) %>% dplyr::rename(cases = value)
deaths <- clean_france(deaths) %>% dplyr::rename(deaths = value)
fra_df <- left_join(cases, deaths)
# fra_df <- fra_df %>%
#   mutate(cases = ifelse(is.na(cases), 0, cases),
#          deaths = ifelse(is.na(deaths), 0, deaths))
fra_df <- fra_df %>% arrange(date, ccaa)
# Flag non-updated areas 
fra_df <- fra_df %>%
  group_by(ccaa) %>%
  mutate(flag_cases = cases < dplyr::lag(cases, 1),
         flag_deaths = deaths < dplyr::lag(deaths, 1)) %>%
  ungroup %>%
  mutate(cases = ifelse(flag_cases, NA, cases),
         deaths = ifelse(flag_deaths, NA, deaths)) %>%
  dplyr::select(-flag_deaths,
                -flag_cases)
cases <- fra_df %>% dplyr::select(date, ccaa, cases) %>% filter(!is.na(cases))
deaths <- fra_df %>% dplyr::select(date, ccaa, deaths) %>% filter(!is.na(deaths))
left <- expand.grid(date = sort(unique(c(cases$date, deaths$date))),
                    ccaa = sort(unique(c(cases$ccaa, deaths$ccaa))))
joined <- left_join(left, cases) %>% left_join(deaths)
# Interpolate missing
library(zoo)
fra_df <- joined %>%
  arrange(date) %>%
  group_by(ccaa) %>%
    mutate(cases = na.approx(cases, maxgap = 3, rule = 2),
           deaths = na.approx(deaths, maxgap = 3, rule = 2))

# Get french population
fra_pop <- 
  tibble(ccaa = c("Auvergne-Rhône-Alpes",
                  "Bourgogne-Franche-Comté",
                  "Bretagne",
                  "Centre-Val de Loire",
                  "Corse",
                  "Grand Est",
                  "Guadeloupe",
                  "Guyane",
                  "Hauts-de-France",
                  "Île-de-France",
                  "La Réunion",
                  "Martinique",
                  "Mayotte",
                  "Normandie",
                  "Nouvelle-Aquitaine",
                  "Nouvelle-Calédonie",
                  "Occitanie",
                  "Pays de la Loire",
                  "Provence-Alpes-Côte d'Azur",
                  "Saint-Barthélémy",
                  "Saint-Martin"),
         pop = c(
           8032377, #"Auvergne-Rhône-Alpes",
           2783039, #"Bourgogne-Franche-Comté",
           3340379, #"Bretagne",
           2559073, #"Centre-Val de Loire",
           344679, #"Corse",
           5511747, #"Grand Est",
           395700, #"Guadeloupe",
           290691, #"Guyane",
           5962662, #"Hauts-de-France",
           12278210, #"Île-de-France",
           859959, #"La Réunion",
           376480, #"Martinique",
           270372, #"Mayotte",
           3303500, #"Normandie",
           5999982, #"Nouvelle-Aquitaine",
           280460, #"Nouvelle-Calédonie",
           5924858, #"Occitanie",
           3801797, #"Pays de la Loire",
           5055651, #"Provence-Alpes-Côte d'Azur",
           9131, # "Saint-Barthélémy",
           32125# "Saint-Martin"
         ))
# Get french map
map_fra <- raster::getData(country = 'FRA', level = 1)
# Get Spanish map
map_esp <- raster::getData(country = 'ESP', level = 1)
# Get Italian map
map_ita <- raster::getData(country = 'ITA', level = 1)
ccaas <- map_ita@data$NAME_1
ccaas <- ifelse(ccaas == "Apulia", 'Puglia',
                # ifelse(ccaas == "Emilia-Romagna", 'Emilia Romagna',
                       ifelse(ccaas == "Friuli-Venezia Giulia", 'Friuli Venezia Giulia',
                              ifelse(ccaas == "Sicily", 'Sicilia',
                                            ccaas)))
map_ita@data$ccaa <- ccaas
usethis::use_data(map_ita, overwrite = T)
# Andorra map
map_and <- raster::getData(country = 'AND', level = 0)
usethis::use_data(map_and, overwrite = T)
# Fix names
names_df <- tibble(NAME_1 = c('Andalucía',
                              'Aragón',
                              'Cantabria',
                              'Castilla-La Mancha',
                              'Castilla y León',
                              'Cataluña',
                              'Ceuta y Melilla',
                              'Comunidad de Madrid',
                              'Comunidad Foral de Navarra',
                              'Comunidad Valenciana',
                              'Extremadura',
                              'Galicia',
                              'Islas Baleares',
                              'La Rioja',
                              'País Vasco',
                              'Principado de Asturias',
                              'Región de Murcia',
                              'Islas Canarias'),
                   ccaa = c('Andalucía',
                            'Aragón',
                            'Cantabria',
                            'CLM',
                            'CyL',
                            'Cataluña',
                            'Melilla',
                            'Madrid',
                            'Navarra',
                            'C. Valenciana',
                            'Extremadura',
                            'Galicia',
                            'Baleares',
                            'La Rioja',
                            'País Vasco',
                            'Asturias',
                            'Murcia',
                            'Canarias'))
map_esp@data <- left_join(map_esp@data, names_df)

usethis::use_data(map_esp, overwrite = TRUE)
usethis::use_data(fra_pop, overwrite = TRUE)
usethis::use_data(fra_df, overwrite = TRUE)
usethis::use_data(map_fra, overwrite = TRUE)

# Get Portuguese data
#https://github.com/dssg-pt/covid19pt-data
# Shpaefiles
map_por <- readOGR('../../covid19pt-data/extra/mapas/portugal/', 'portugal')
# raw data
portugal <- read_csv('../../covid19pt-data/data.csv')
# Chop down variables
places <- c('_arscentro','_arslvt','arsalentejo','arsalgarve','acores','madeira','estrangeiro', 'arsnorte')
places <- paste0(places, collapse = '|')
keep_vars <- which(grepl(places, names(portugal)) & !grepl('recuperados', names(portugal)))
portugal <- portugal[,c(1, keep_vars)]
portugal$data <- as.Date(portugal$data, format = '%d-%m-%Y')
portugal <- gather(portugal, key, value, names(portugal)[2:ncol(portugal)])
ccaa <- unlist(lapply(strsplit(portugal$key, '_'), function(x){x[2]}))
key <- unlist(lapply(strsplit(portugal$key, '_'), function(x){x[1]}))
portugal$key <- ifelse(key == 'obitos', 'deaths', 'cases')
portugal$ccaa <- ccaa
deaths <- portugal %>% filter(key == 'deaths') %>% dplyr::select(-key) %>% dplyr::rename(deaths = value)
cases <- portugal %>% filter(key == 'cases') %>% dplyr::select(-key) %>% dplyr::rename(cases = value)
joined <- full_join(cases, deaths)
joined <- joined %>%
  mutate(ccaa = ifelse(ccaa == 'acores', 'Açores',
                       ifelse(ccaa == 'arsalgarve', 'Algarve',
                              ifelse(ccaa == 'arsalentejo', 'Alentejo',
                                     ifelse(ccaa == 'arslvt', 'RLVT',
                                            ifelse(ccaa == 'madeira', 'Madeira',
                                                   ifelse(ccaa == 'arsnorte', 'Norte',
                                                          ifelse(ccaa == 'arscentro', 'Centro', 'Estrangeiro'))))))))
por_df <- joined
por_df <- por_df %>%
  mutate(cases = ifelse(is.na(cases), 0, cases),
         deaths = ifelse(is.na(deaths), 0, deaths))
por_df <- por_df %>% dplyr::rename(date = data)
por_df <- por_df %>% arrange(date, ccaa)
# Portugal population
por_pop <- tibble(
  ccaa = c('Açores',
           'Alentejo',
           'Algarve',
           'Madeira',
           'Norte',
           'Centro',
           'RLVT'),
  pop = c(242846,
          705478,
          438864,
          253945,
          3572583,
          2327000,
          3447173)
)
usethis::use_data(por_pop, overwrite = TRUE)
usethis::use_data(por_df, overwrite = TRUE)
usethis::use_data(map_por, overwrite = TRUE)


# Get ibesora's data (from acquas application)
the_dir <- '../../covid-19-data/'
csvs <- dir(the_dir)
csvs <- csvs[grepl('.csv', csvs, fixed = T)]
if(file.exists('catalonia.csv')){
  data_list <- list()
  for(i in 1:length(csvs)){
    data <- read_csv(paste0(the_dir, csvs[i])) %>%
      mutate(file_name = gsub('.csv', '', csvs[i], fixed = TRUE))
    data_list[[i]] <- data
  }
}

# Get world populationd data
world_pop <- read_csv('unpop/API_SP.POP.TOTL_DS2_en_csv_v2_866861.csv',
                      skip = 4)
world_pop <- world_pop %>%
  dplyr::select(country = `Country Name`,
                iso = `Country Code`,
                pop = `2018`)

# Add regions
iso_regions <- read_csv('isoregions.csv') %>%
  dplyr::select(iso = `alpha-3`,
                region,
                sub_region = `sub-region`)
world_pop <- left_join(world_pop, iso_regions)
usethis::use_data(world_pop, overwrite = TRUE)


# Get chinese pop by region
chi_pop <- tibble(
  ccaa = c('Anhui',
           'Beijing',
           'Chongqing',
           'Fujian',
           'Gansu',
           'Guangdong',
           'Guangxi',
           'Guizhou',
           'Hainan',
           'Hebei',
           'Heilongjiang',
           'Henan',
           'Hong Kong',
           'Hubei',
           'Hunan',
           'Inner Mongolia',
           'Jiangsu',
           'Jiangxi',
           'Jilin',
           'Liaoning',
           'Macau',
           'Ningxia',
           'Qinghai',
           'Shaanxi',
           'Shandong',
           'Shanghai',
           'Shanxi',
           'Sichuan',
           'Tianjin',
           'Tibet',
           'Xinjiang',
           'Yunnan',
           'Zhejiang'),
  pop = c(63.24, #'Anhui',
          21.54, #'Beijing',
          31.02, #'Chongqing',
          39.41, #'Fujian',
          26.37, #'Gansu',
          113.46, #'Guangdong',
          49.26, #'Guangxi',
          36, #'Guizhou',
          9.34, #'Hainan',
          75.56, #'Hebei',
          37.73, #'Heilongjiang',
          96.05, #'Henan',
          7.39, #'Hong Kong',
          59.17, #'Hubei',
          68.99, #'Hunan',
          25.34, #'Inner Mongolia',
          80.51, #'Jiangsu',
          46.48, #'Jiangxi',
          27.04, #'Jilin',
          43.59, #'Liaoning',
          0.62, #'Macau',
          6.88, #'Ningxia',
          6.03, #'Qinghai',
          38.64, #'Shaanxi',
          100.47, #'Shandong',
          24.24, #'Shanghai',
          37.18, #'Shanxi',
          83.41, #'Sichuan',
          15.6, #'Tianjin',
          3.44, #'Tibet',
          24.87, #'Xinjiang',
          48.3, #'Yunnan',
          57.37)) #'Zhejiang')
# Multiply by 1 million
chi_pop$pop <- chi_pop$pop * 1000000
usethis::use_data(chi_pop, overwrite = TRUE)

# Johns Hopkins data
# Define the files at https://github.com/CSSEGISandData/COVID-19/blob/master/csse_covid_19_data/csse_covid_19_daily_reports
reports_dir <- '../../COVID-19/csse_covid_19_data/csse_covid_19_daily_reports/'
files <- dir(reports_dir)
files <- files[grepl('.csv', files, fixed = T)]

# Read in all files
out_list <- list()
for(i in 1:length(files)){
  this_file <- files[i]
  this_date <- as.Date(substr(this_file, 1, 10), format = '%m-%d-%Y')
  this_data <- read_csv(paste0(reports_dir, this_file)) 
  # There was a change in formatting, so have to do this in two different ways
  if('Admin2' %in% names(this_data)){
    this_data <-this_data %>%
      dplyr::select(district = Province_State,
                    country = Country_Region,
                    time_stamp = Last_Update,
                    cases = Confirmed,
                    deaths = Deaths,
                    recovered = Recovered)
  } else {
    this_data <-this_data %>%
      dplyr::rename(district = `Province/State`,
                    country = `Country/Region`,
                    time_stamp = `Last Update`,
                    cases = Confirmed,
                    deaths = Deaths,
                    recovered = Recovered)
  }
  
  
  out_list[[i]] <- this_data %>% mutate(time_stamp = as.character(time_stamp)) %>%
    mutate(date = this_date)
}
df <- bind_rows(out_list) %>% dplyr::select(-Latitude, -Longitude, -recovered)

# Define which countries we have sub-national population data for
have_pop <- c('US', 'Italy', 'Spain', 'China', 'Canada', 'France', 'Portugal')

# For China, get all on one
df <- df %>% mutate(country = ifelse(country == 'Mainland China',
                                       'China',
                                       country)) %>%
  mutate(country = ifelse(country == 'Korea, South', 'South Korea',
                          ifelse(country == 'Hong Kong SAR', 'Hong Kong',
                                 ifelse(country == 'Republic of Korea', 'North Korea',
                                        ifelse(country == 'Macao SAR', 'Macao',
                                               ifelse(country == 'Czech Republic', 'Czechia',
                                                      ifelse(country == 'Republic of Moldova', 'Moldova',
                                                             ifelse(country == 'Russian Federation',
                                                                    'Russia',
                                                                    ifelse(country == 'Iran (Islamic Republic of)',
                                                                           'Iran', country))))))))) %>%
  mutate(country = ifelse(country == 'Gambia, The', 'Gambia', country)) %>%
  mutate(country = ifelse(country == 'UK', 'United Kingdom', country)) %>%
  mutate(district = ifelse(country %in% have_pop,
                                        district,
                                        NA)) %>%
  group_by(country, district, date) %>%
  summarise(cases = sum(cases, na.rm = TRUE),
            deaths = sum(deaths, na.rm = TRUE)) %>%
  ungroup
  

# For US
# Up until March 9, data shows by sub-states but not States
# Beginning on March 10, shows states, not sub-states
usiso2 <- readLines('usiso2/usiso2.txt')
iso2 <- unlist(lapply(strsplit(usiso2, split = '\t', fixed = T), function(x){x[1]}))
district <- unlist(lapply(strsplit(usiso2, split = '\t', fixed = T), function(x){x[2]}))
usiso2 <- tibble(district, iso2)

us_old <- df %>% filter(country == 'US', date <= '2020-03-09')
us_new <- df %>% filter(country == 'US', date >= '2020-03-10')

# In us_old, keep only the sub-states, since that's where the data is
us_old <- us_old %>% filter(grepl(',', district))
# Do the opposite in us_new (ie, keep only the states)
us_new <- us_new %>% filter(!grepl(',', district))

# In us_old, get the state
us_old$iso2 <- unlist(lapply(strsplit(us_old$district, ', '), function(x){x[length(x)]}))
# now bring in state name
us_old <- left_join(us_old %>% dplyr::select(-district), usiso2) %>% dplyr::select(-iso2)
us_old <- us_old %>%
  group_by(country, date, district) %>%
  summarise(cases = sum(cases, na.rm = T),
            deaths = sum(deaths, na.rm = TRUE)) %>% ungroup
# bring it all together now
df <- df %>%
  filter(country != 'US') %>%
  bind_rows(us_old,
            us_new)

# Get most recent Spanish ministry data
library(gsheet)
# url <- 'https://docs.google.com/spreadsheets/d/15UJWpsW6G7sEImE8aiQ5JrLCtCnbprpztfoEwyTNTEY/edit#gid=810081118'
# esp_df <- gsheet::gsheet2tbl(url)
na_to_zero <- function(x){ifelse(is.na(x), 0, x)}
esp_df <- isc
esp_df$uci <- esp_df$deaths <- NA
esp_df <- esp_df %>% mutate(cases = na_to_zero(cases),
                         uci = na_to_zero(uci),
                         deaths = na_to_zero(deaths))
# Get rolling sum
esp_df <- esp_df %>%
  arrange(date) %>%
  group_by(ccaa) %>%
  mutate(cases = cumsum(cases)) %>%
  ungroup 

right <- esp_df %>%
  group_by(date) %>%
  summarise(cases = sum(cases, na.rm = TRUE),
            deaths  = sum(deaths, na.rm = TRUE))

# Get Italy regional data

# https://code.montera34.com:4443/numeroteca/covid19
ita <- read.delim("https://raw.githubusercontent.com/pcm-dpc/COVID-19/master/dati-regioni/dpc-covid19-ita-regioni.csv",sep = ",")  

# Process data
ita$date <-  as.Date(ita$data)
ita <- ita %>%
  dplyr::select(ccaa = denominazione_regione,
                date,
                cases = totale_casi,
                uci = terapia_intensiva,
                hospitalizations = totale_ospedalizzati,
                deaths = deceduti)
ita$ccaa <- as.character(ita$ccaa)
ita <- ita %>%
  mutate(ccaa = ifelse(ccaa %in% c('P.A. Bolzano', 'P.A. Trento'), 'Trentino-Alto Adige',
                       ccaa)) %>%
  group_by(ccaa, date) %>%
  summarise(cases = sum(cases, na.rm = TRUE),
            uci = sum(uci, na.rm = TRUE),
            hospitalizations = sum(hospitalizations, na.rm = TRUE),
            deaths = sum(deaths, na.rm = TRUE))

# Get the district-level data for Spain and Italy Portugal, France too in (since JHU doesn't have it)
do_italy_spain <- TRUE
if(do_italy_spain){
  # Italy
  left <- ita %>% 
    dplyr::rename(district = ccaa) %>%
    mutate(country = 'Italy')
  right <- df %>% filter(country == 'Italy') %>%
    dplyr::distinct(date,country)
  add_these <- df %>% filter(country == 'Italy') %>%
    filter(!date %in% left$date,
           date < max(left$date)) # remove any obs AHEAD of ministry data
  joined <- left_join(left, right) %>% bind_rows(add_these)
  df <- df %>%
    mutate(flag = (country == 'Italy' & date %in% joined$date) | (country == 'Italy' & date > max(ita$date))) %>%
    filter(!flag) %>% dplyr::select(-flag) %>%
    bind_rows(joined)
  
  # Spain
  left <- esp_df %>% 
    dplyr::rename(district = ccaa) %>%
    mutate(country = 'Spain')
  right <- df %>% filter(country == 'Spain') %>%
    dplyr::distinct(date,country)
  add_these <- df %>% filter(country == 'Spain') %>%
    filter(!date %in% left$date,
           date < max(left$date)) # remove any obs AHEAD of ministry data
  joined <- left_join(left, right) %>% bind_rows(add_these)
  df <- df %>%
    mutate(flag = (country == 'Spain' & date %in% joined$date) | (country == 'Spain' & date > max(esp_df$date))) %>%
    filter(!flag) %>% dplyr::select(-flag) %>%
    bind_rows(joined)
  
  # Portugal
  left <- por_df %>% 
    dplyr::rename(district = ccaa) %>%
    mutate(country = 'Portugal')
  right <- df %>% filter(country == 'Portugal') %>%
    dplyr::distinct(date,country)
  add_these <- df %>% filter(country == 'Portugal') %>%
    filter(!date %in% left$date,
           date < max(left$date)) # remove any obs AHEAD of ministry data
  joined <- left_join(left, right) %>% bind_rows(add_these)
  df <- df %>%
    mutate(flag = (country == 'Portugal' & date %in% joined$date) | (country == 'Portugal' & date > max(por_df$date))) %>%
    filter(!flag) %>% dplyr::select(-flag) %>%
    bind_rows(joined)
  
  # # France (special case because JHU has some)
  # NOT DOING DUE TO INCOMPLETE DATA FOR FRANCE
  # left <- fra_df %>% 
  #   dplyr::rename(district = ccaa) %>%
  #   mutate(country = 'France')
  # right <- df %>% filter(country == 'France') %>%
  #   dplyr::distinct(date,country)
  # add_these <- df %>% filter(country == 'France') %>%
  #   group_by(date, country) %>% mutate(cases = sum(cases, na.rm = TRUE), deaths = sum(deaths, na.rm = TRUE)) %>%
  #   mutate(district = NA) %>%
  #   filter(!date %in% left$date,
  #          date < max(left$date)) # remove any obs AHEAD of ministry data
  # joined <- left_join(left, right) %>% bind_rows(add_these)
  # df <- df %>%
  #   mutate(flag = (country == 'France' & date %in% joined$date) | (country == 'France' & date > max(fra_df$date))) %>%
  #   filter(!flag) %>% dplyr::select(-flag) %>%
  #   bind_rows(joined)
}

# Decumulate
df <- df %>%
  ungroup %>%
  arrange(country, district, date) %>%
  group_by(country, district) %>%
  mutate(cases_non_cum = cases - dplyr::lag(cases, default = 0),
         deaths_non_cum = deaths - dplyr::lag(deaths, default = 0),
         uci_non_cum = uci - dplyr::lag(uci, default = 0)) %>%
  ungroup

# Deal with errors in US data
df$cases[df$district == 'Nevada' & df$country == 'US' & df$date == '2020-03-17'] <- 50
df$cases[df$district == 'Utah' & df$country == 'US' & df$date == '2020-03-19'] <- 77
df$cases[df$district == 'Washington' & df$country == 'US' & df$date == '2020-03-17'] <- 1000
df$cases[df$district == 'Grand Princess' & df$country == 'US' & df$date == '2020-03-16'] <- 21
df$deaths[df$district == 'Oregon' & df$country == 'US' & df$date == '2020-03-22'] <- 5

# Join all together but by country
df_country <- df %>%
  group_by(country, date) %>%
  dplyr::summarise(cases = sum(cases, na.rm = TRUE),
                        deaths = sum(deaths, na.rm = TRUE),
                        uci = sum(uci, na.rm = TRUE),
                        hospitalizations = sum(hospitalizations, na.rm = TRUE),
                        cases_non_cum = sum(cases_non_cum, na.rm = TRUE),
                        deaths_non_cum = sum(deaths_non_cum, na.rm = TRUE),
                        uci_non_cum = sum(uci_non_cum, na.rm = TRUE))

# Get the iso code for each country
countries <- sort(unique(df$country))
country_codes <- tibble(country = countries)
library(passport)
country_codes$iso <- parse_country(x = countries, to = 'iso3c')
df <- left_join(df, country_codes)
df_country <- left_join(df_country, country_codes)

usethis::use_data(df, overwrite = T)
usethis::use_data(df_country, overwrite = T)

# Make spreadsheet for ISGlobal
isglobal <- df_country %>%
  filter(date == max(date)) %>%
  dplyr::select(date, 
                country, 
                cases,
                deaths) %>%
  left_join(country_codes)


if(!dir.exists('isglobal')){
  dir.create('isglobal')
}
write_csv(isglobal, 'isglobal/isglobal.csv')

write_csv(df, 'isglobal/world_region_data.csv')
write_csv(df_country, 'isglobal/world_data.csv')


# Read population data for Spain's CCAA
esp_pop <- tibble(ccaa = c("Andalucía",
                           "Aragón",
                           "Asturias",
                           "Baleares",
                           "C. Valenciana",
                           "Canarias",
                           "Cantabria",
                           "Cataluña",
                           "Ceuta",
                           "CLM",
                           "CyL",
                           "Extremadura",
                           "Galicia",
                           "La Rioja",
                           "Madrid",
                           "Melilla",
                           "Murcia",
                           "Navarra",
                           "País Vasco"),
                  pop = c(8414240,#  "Andalucía",
                            1319291,#"Aragón",
                            1022800,#"Asturias",
                            1149460,#"Baleares",
                            5003769,#"C. Valenciana",
                            2153389,#"Canarias",
                            581078,#"Cantabria",
                            7675217,#"Cataluña",
                            84777,#"Ceuta",
                          2032863, #"CLM",
                            2399548,#"CyL",
                            1067710,#"Extremadura",
                            2699499,#"Galicia",
                            316798,#"La Rioja",
                            6663394,#"Madrid",
                            86487,#"Melilla",
                            1493898,#"Murcia",
                            654214,#"Navarra",
                            2207776))#"País Vasco"))
usethis::use_data(esp_pop, overwrite = TRUE)


# Interpret missing UCI cases
esp_df <- esp_df %>%
  arrange(date) %>%
  group_by(ccaa) %>%
  mutate(uci = zoo::na.approx(uci, na.rm = F))

# Get non cumulative cases
esp_df <- esp_df %>%
  ungroup %>%
  arrange(ccaa, date) %>%
  group_by(ccaa) %>%
  mutate(cases_non_cum = cases - lag(cases, default = 0),
         deaths_non_cum = deaths - lag(deaths, default = 0),
         uci_non_cum = uci - lag(uci, default = 0)) %>%
  ungroup

# Spot corrections
esp_df <- esp_df %>%
  mutate(cases_non_cum = ifelse(cases_non_cum < 0,
                                          0, 
                                cases_non_cum)) 


# De-cumulate Italy
ita <- ita %>%
  ungroup %>%
  arrange(ccaa, date) %>%
  group_by(ccaa) %>%
  mutate(cases_non_cum = cases - lag(cases, default = 0),
         deaths_non_cum = deaths - lag(deaths, default = 0),
         uci_non_cum = uci - lag(uci, default = 0),
         hospitalizations_non_cum = hospitalizations - lag(hospitalizations, default = 0)) %>%
  ungroup

# De-cumulate france
fra_df <- fra_df %>%
  ungroup %>%
  arrange(ccaa, date) %>%
  group_by(ccaa) %>%
  mutate(cases_non_cum = cases - lag(cases, default = 0),
         deaths_non_cum = deaths - lag(deaths, default = 0)) %>%
  ungroup

# De-cumulate portugal
por_df <- por_df %>%
  ungroup %>%
  arrange(ccaa, date) %>%
  group_by(ccaa) %>%
  mutate(cases_non_cum = cases - lag(cases, default = 0),
         deaths_non_cum = deaths - lag(deaths, default = 0)) %>%
  ungroup


# By province
# https://docs.google.com/spreadsheets/d/1qxbKnU39yn6yYcNkBqQ0mKnIXmKfPQ4lgpNglpJ9frE/edit#gid=0

usethis::use_data(ita, overwrite = T)
usethis::use_data(por_df, overwrite = T)
usethis::use_data(fra_df, overwrite = T)

# Get Italian populations
ita_pop <- tibble(
  ccaa = c('Abruzzo',
           'Basilicata',
           'Calabria',
           'Campania',
           'Emilia-Romagna',
           'Friuli Venezia Giulia',
           'Lazio',
           'Liguria',
           'Lombardia',
           'Marche',
           'Molise',
           'Trentino-Alto Adige',
           'Piemonte',
           'Puglia',
           'Sardegna',
           'Sicilia',
           'Toscana',
           'Umbria',
           'Valle d\'Aosta',
'Veneto'),
pop = c(
  1315000,#'Abruzzo',
  567118,#'Basilicata',
  1957000,#'Calabria',
  5827000,#'Campania',
  4453000,#'Emilia Romagna',
  1216000,#'Friuli Venezia Giulia',
  5897000,#'Lazio',
  1557000,#'Liguria',
  10040000,#'Lombardia',
  1532000,#'Marche',
  308493,#'Molise',
  1070000, # trentino alto
  4376000,#'Piemonte',
  4048000,#'Puglia',
  1648000,#'Sardegna',
  5027000,#'Sicilia',
  3737000,#'Toscana',
  884640,#'Umbria',
  126202,#'Valle d\'Aosta',
  4905000#'Veneto'
)
)
usethis::use_data(ita_pop, overwrite = T)

library(readr)
write_csv(esp_df, 'spain/ccaa_day.csv')
write_csv(esp_df, 'isglobal/ccaa_day.csv')
write_csv(esp_pop, 'isglobal/esp_pop.csv')
write_csv(world_pop, 'isglobal/world_pop.csv')
write_csv(ita_pop, 'isglobal/ita_pop.csv')

# write_csv(esp_uci, 'isglobal/esp_uci.csv')

usethis::use_data(esp_df, overwrite = T)
# Write a map

# Cognoms
cognoms <- read_csv('cognoms/cognoms.csv', skip = 7, locale = readr::locale(encoding = "latin1"))
cognoms <- cognoms$Nombre[1:1000]
surnames <- read_csv('cognoms/surnames.csv')
surnames <- surnames$name[1:1000]
usethis::use_data(cognoms, overwrite = T)
usethis::use_data(surnames, overwrite = T)

# Create a regions pop for Italy, China, Spain, US, Portugal, France
usa_pop <- readxl::read_excel('usapop/usapop.xlsx', 
                              skip = 3)
canada_pop <- 
  tibble(ccaa = c('Alberta',
         'British Columbia',
         'Grand Princess',
         'Manitoba',
         'New Brunswick',
         'Newfoundland and Labrador',
         'Northwest Territories',
         'Nova Scotia',
         'Ontario',
         'Prince Edward Island',
         'Quebec',
         'Saskatchewan'),
         pop = c(4413146,
                 5110917,
                 NA,
                 1377517,
                 779993,
                 521365,
                 44904,
                 977457,
                 14711827,
                 158158,
                 8537674,
                 1181666))
names(usa_pop)[c(1,13)] <- c('ccaa', 'pop')
usa_pop <- usa_pop %>% dplyr::select('ccaa', 'pop')
usa_pop <- usa_pop[6:56,]
usa_pop$ccaa <- gsub('.', '', usa_pop$ccaa, fixed = TRUE)
regions_pop <- 
  bind_rows(esp_pop %>% mutate(country = 'Spain', iso = 'ESP'),
            ita_pop %>% mutate(country = 'Italy', iso = 'ITA'),
            chi_pop %>% mutate(country = 'China', iso = 'CHN'),
            usa_pop %>% mutate(country = 'US', iso = 'USA'),
            fra_pop %>% mutate(country = 'France', iso = 'FRA'),
            por_pop %>% mutate(country = 'Portugal', iso = 'PRT'),
            canada_pop %>% mutate(country = 'Canada', iso = 'CAN'))
usethis::use_data(regions_pop, overwrite = T)

# Write to the covidcount directory
write_csv(df_country, '../../covidcount/data/jhu.csv')
write_csv(world_pop, '../../covidcount/data/world_pop.csv')
jeff = excess %>% filter(country == 'Spain')
readr::write_csv(jeff, 'jeff.csv')
jeff2 = jeff %>% filter(end_date >= '2020-03-01') %>%
  group_by(region) %>%
  summarise(total_deaths = sum(total_deaths),
            expected_deaths = sum(expected_deaths)) %>%
  mutate(percent_excess = total_deaths / expected_deaths * 100) %>%
  arrange(desc(percent_excess))
readr::write_csv(jeff2, 'jeff2.csv')
setwd('..')
golem::detach_all_attached()
rm(list=ls(all.names = TRUE))
devtools::install()
setwd('data-raw/')
databrew/covid19 documentation built on Aug. 24, 2020, 10:39 a.m.