R/update_data.r

Defines functions update_data

Documented in update_data

#' update package data from HERA via HDX
#'
#' BEWARE code in africovid-creation.r is more up-to-date than this
#' *in progress
#'
#' will try to download data from HDX and use this to update locally the data stored in the package
#'
# @param country a character vector of country names or iso3c character codes.
# @param plot
#'
#'
# @examples
# update_data()
#'
#' @return dataframe
# @importFrom ggplot2 ggplot aes_string geom_tile theme_classic labs scale_fill_distiller scale_x_date theme element_blank
# @importFrom lubridate parse_date_time dmy
# @importFrom rhdx set_rhdx_config get_rhdx_config search_datasets get_resources read_resource
# @importFrom usethis use_data
# @importFrom magrittr %>%
# @importFrom purrr pluck
# @importFrom readr locale
# @importFrom dplyr select anti_join left_join
# @importFrom sf st_drop_geometry
#' @export
#'
#'
update_data <- function()
{


  rhdx::set_rhdx_config(hdx_site = "prod")
  rhdx::get_rhdx_config()

  # this does return hera datasets
  #ds <- search_datasets("hera", rows=99)

  #21 subnational datasets (2020-12-14 & 2021-01-17)
  #26 subnational datasets (2021-05-16)
  ds <- search_datasets("hera subnational", rows=99)
  #below returned 49 including city level datasets
  #ds <- search_datasets("hera Coronavirus subnational", rows=99)

  #trying to read them all into single dataframe

  #2021-01-17
  #11 Liberia has problems with DATE column, stops in May, doesn't have an ID column
  #15 is called a web app and fails to load
  #16 & later are cumulative for all Africa & different columns
  #to_exclude <- c(11,15,16,17,18,19,20,21)

  #2021-05-16 finding which datsets to include/exclude
  #here sapply provides a simplified vector of all dataset titles
  sapply(ds,function(x) x$data$title)

  #which to exclude by visually inspecting list
  to_exclude <- c(2,14,18:26)
  # [1] "Nigeria: Coronavirus  (Covid-19) Subnational"
  # [2] "Guinea: Ebola (2021) Subnational cases"
  # [3] "Mali: Coronavirus (Covid-19) Subnational"
  # [4] "Ghana: Coronavirus (Covid-19) Subnational"
  # [5] "Niger: Coronavirus (Covid-19) Subnational"
  # [6] "Mauritania: Coronavirus (Covid-19) Subnational"
  # [7] "Senegal : Coronavirus (Covid-19) Subnational"
  # [8] "Togo: Coronavirus (Covid-19) Subnational"
  # [9] "Gambia: Coronavirus (Covid-19) Subnational"
  # [10] "Bénin: Coronavirus (Covid-19) Subnational"
  # [11] "Burkina Faso: Coronavirus (Covid-19) Subnational"
  # [12] "Liberia: Coronavirus (Covid-19) Subnational"
  # [13] "Guinea: Coronavirus (Covid-19) Subnational"
  # [14] "Democratic Republic of the Congo: Ebola (2021) Subnational cases"
  # [15] "Democratic Republic of the Congo: Coronavirus  (Covid-19) Subnational"
  # [16] "Côte d'Ivoire: Coronavirus (Covid-19) Subnational"
  # [17] "Sierra Leone: Coronavirus (Covid-19) Subnational"
  # [18] "Africa: Coronavirus (COVID-19) Subnational Cases"
  # [19] "Democratic Republic of the Congo : Ebola (2021) Cumulative cases (national)"
  # [20] "Guinea: Ebola (2021) Cumulative cases (national)"
  # [21] "Africa: Covid-19 Infections (National)"
  # [22] "Africa: Covid-19 Cumulative Recoveries (National)"
  # [23] "Africa: Covid-19 Cumulative infections (National)"
  # [24] "Africa: Covid-19 Cumulative Deaths (National)"
  # [25] "Africa: Covid-19 Recoveries (National)"
  # [26] "Africa: Covid-19 Death cases (National)"

  ds <- ds[-to_exclude]

  dfall <- NULL

  for( i in 1:length(ds))
  {
    cat(i)

    # df1 <- get_resource(ds[[i]], 1) %>% ## 1st resource is XLS (csv is ; delimited causing problems)
    #        read_resource() # read into R

    #df1 <- get_resource(ds[[i]], 2) %>% ## 2nd resource is csv
    df1 <- get_resources(ds[[i]], format = "csv") %>%
      pluck(1) %>% ## select the first csv
      # read into R, force_download to make sure get latest version
      read_resource(delim = ";", locale = locale(decimal_mark = ","), force_download = TRUE)

    #should work too
    #get_resources(format = "csv") %>%

    #Liberia stops in May, doesn't have an ID column
    #also gives this error, probably to do with dates
    #Error in as.POSIXlt.character(x, tz, ...) :
    #character string is not in a standard unambiguous format
    #skip Liberia above
    #2021-05-16 Cote d'Ivoire doesn't have an ID column either
    if (names(df1)[1]=='DATE')
    {
      #next #to miss out Liberia
      #df1 <- NULL #set to NULL to miss Liberia out for now
      dfid <- tibble(ID=1:nrow(df1))
      df1 <- cbind(dfid,df1)
    }

    #some, but not all xls files, read rownames into first column
    if (df1[[1,1]] == 'ID')
    {
      #set column names from 1st row
      names(df1) <- as.character(df1[1,])
      #remove first row
      df1 <- df1[-1,]
    }

    #Benin stops in October and has different named columns, title case rather than upper case
    #patch to fix it
    #if ("Femme" %in% names(df1))
    #  names(df1) <- names(dfall)

    #remove this column that occurs in just some ds e.g Gambia
    if ('LIEN SOURCE' %in% names(df1))
      df1 <- dplyr::select(df1, !`LIEN SOURCE`)

    #2021-05-16 senegal now has added vaccination columns, breaks reading into combined dataframe
    #remove thos columns for now
    if ("NOUVEAUX_INDIVIDUS_VACCINES" %in% names(df1))
      df1 <- dplyr::select(df1, !"NOUVEAUX_INDIVIDUS_VACCINES")

    if ("TOTAL_INDIVIDUS_VACCINES (1 dose)" %in% names(df1))
      df1 <- dplyr::select(df1, !`TOTAL_INDIVIDUS_VACCINES (1 dose)`)

    #remove these columns that appear just in some case Sierra Leone
    if ('LIEN WEB' %in% names(df1))
      df1 <- dplyr::select(df1, !`LIEN WEB`)

    if ('x15' %in% names(df1))
      df1 <- dplyr::select(df1, !`x15`)
    if ('X15' %in% names(df1))
      df1 <- dplyr::select(df1, !`X15`)

    #2021-05-16 correcting fieldnames for Cote d'Ivoire
    if ('Contaminés' %in% names(df1))
      df1 <- dplyr::rename(df1, CONTAMINES = Contaminés)
    if ('Décès' %in% names(df1))
      df1 <- dplyr::rename(df1, DECES = Décès)
    if ('Guéris' %in% names(df1))
      df1 <- dplyr::rename(df1, GUERIS = Guéris)
    if ('Femme' %in% names(df1))
      df1 <- dplyr::rename(df1, CONTAMINES_FEMME = Femme)
    if ('Homme' %in% names(df1))
      df1 <- dplyr::rename(df1, CONTAMINES_HOMME = Homme)
    if ("Genre_non spécifié" %in% names(df1))
      df1 <- dplyr::rename(df1, CONTAMINES_GENRE_NON_SPECIFIE = `Genre_non spécifié`)

    #bind these country rows onto all country rows
    dfall <- rbind(dfall, df1)
  }


  #2021-05-16 694 rows with NA for PAYS ISO_3 ID_PAYS etc., some are Nigeria
  tstna <- filter(dfall,is.na(PAYS))
  # TODO fill in these NAs
  # for now filter out the NAs
  dfall <- filter(dfall,!is.na(PAYS))


  #7 Gambia with csv
  # Warning: 1365 parsing failures.
  #  row                           col expected actual                                                                              file
  # 1081 CONTAMINES                    a double   null 'C:/Users/andy.south/AppData/Local/Cache/R/rhdx/gmb_subnational_covid19_hera.csv'

  #saving the data object for all countries
  dfhera <- dfall

  # unique(dfhera$PAYS)
  # [1] "Nigéria"      "Ghana"        "Mali"         "Niger"        "Mauritanie"   "Sénégal"      "Gambie"
  # [8] "Togo"         "Bénin"        "Burkina Faso"

  names(dfhera)
  # [1] "ID"                            "DATE"                          "ISO_3"
  # [4] "PAYS"                          "ID_PAYS"                       "REGION"
  # [7] "ID_REGION"                     "CONTAMINES"                    "DECES"
  # [10] "GUERIS"                        "CONTAMINES_FEMME"              "CONTAMINES_HOMME"
  # [13] "CONTAMINES_GENRE_NON_SPECIFIE" "SOURCE"

  #convert dates to useable format
  #library(lubridate)
  dfhera$date <- lubridate::dmy(dfhera$DATE)

  #convert numeric columns to numeric - seesm this shouldn't be necessary, tibble attributes indicate col_double()
  dfhera$CONTAMINES <- as.numeric(dfhera$CONTAMINES)
  dfhera$DECES <- as.numeric(dfhera$DECES)
  dfhera$GUERIS <- as.numeric(dfhera$GUERIS)
  dfhera$CONTAMINES_FEMME <- as.numeric(dfhera$CONTAMINES_FEMME)
  dfhera$CONTAMINES_HOMME <- as.numeric(dfhera$CONTAMINES_HOMME)
  dfhera$CONTAMINES_GENRE_NON_SPECIFIE <- as.numeric(dfhera$CONTAMINES_GENRE_NON_SPECIFIE)

  ##add countrycodes & english names

  # library(countrycode)
  # this doesn't work to convert from french destination
  # below I use afrilearndata instead
  # dfhera$iso3c <- countrycode(dfhera$PAYS, origin = 'country.name.fr', destination = 'iso3c')
  # Error in countrycode(dfhera$PAYS, origin = "country.name.fr", destination = "iso3c") :
  #   Origin code not supported by countrycode or present in the user-supplied custom_dict.

  # library(fuzzyjoin)
  # library(afrilearndata)
  # library(sf)

  #create dataframe of nam, name_fr & iso_a3
  #df1 <- dplyr::select(sf::st_drop_geometry(africountries), c(name,iso_a3,name_fr))
  df1 <- dplyr::select(sf::st_drop_geometry(afrilearndata::africountries), c(name,iso_a3))
  names(df1)[1] <- 'name_en'

  #test whether standard antijoin & fuzzyjoin copes with all the French names, should return 0 rows
  #dplyr::anti_join(dfhera, df1, by=c(PAYS='name_fr'))
  #stringdist_anti_join(dfhera, df1, by=c(PAYS='name_fr')) #, mode='left')

  #arg stringdist_join matches Nigeria & Liberia !! detect by more rows in the df
  #df2 <- stringdist_left_join(dfhera, df1, by=c(PAYS='name_fr')) #, mode='left')

  #standard left join works based on either name_fr safer to use ISO_3
  #dfhera <- left_join(dfhera, df1, by=c(PAYS='name_fr')) #, mode='left')
  dfhera <- dplyr::left_join(dfhera, df1, by=c(ISO_3='iso_a3')) #, mode='left')

  # checking and correcting a few negative values
  # 3 rows with negative values
  # the one for Benin messes up Benin plot
  dfneg <- dfhera[which(dfhera$CONTAMINES < 0),]

  # ID DATE  ISO_3 PAYS  ID_PAYS REGION ID_REGION CONTAMINES DECES GUERIS CONTAMINES_FEMME CONTAMINES_HOMME
  # 2276 23/0~ GHA   Ghana       4 Bono          40         -4    NA      0               NA               NA
  # 3575 08/1~ GHA   Ghana       4 Great~        30        -44    NA    -15               NA               NA
  #  845 19/0~ BEN   Bénin      10 Non s~       108       -209     0    -26               NA               NA

  # here I assume that the neg symbol shouldn't be there so I remove it
  negs <- which(dfhera$CONTAMINES < 0)
  dfhera$CONTAMINES[negs] <- abs(dfhera$CONTAMINES[negs])
  negs <- which(dfhera$GUERIS < 0)
  dfhera$GUERIS[negs] <- abs(dfhera$GUERIS[negs])
  negs <- which(dfhera$CONTAMINES_GENRE_NON_SPECIFIE < 0)
  dfhera$CONTAMINES_GENRE_NON_SPECIFIE[negs] <- abs(dfhera$CONTAMINES_GENRE_NON_SPECIFIE[negs])

  #2021-02-07 dim(dfhera) 44827    16
  #2021-05-16 dim(dfhera) 64986    16

  #names(dfhera)
  # [1] "ID"                            "DATE"                          "ISO_3"
  # [4] "PAYS"                          "ID_PAYS"                       "REGION"
  # [7] "ID_REGION"                     "CONTAMINES"                    "DECES"
  # [10] "GUERIS"                        "CONTAMINES_FEMME"              "CONTAMINES_HOMME"
  # [13] "CONTAMINES_GENRE_NON_SPECIFIE" "SOURCE"                        "date"
  # [16] "name_en"

  #to get country totals
  #as.data.frame(table(dfhera$PAYS))

  # joining geoboundaries subnat names on to make it easier to map regions
  # BEWARE this uses dfhera that hasn't been saved to the package yet
  dfhera2 <- join_all_subnat_to_map(dfhera)

  dfhera <- dfhera2

  usethis::use_data(dfhera, overwrite = TRUE)

  dfhera
}
afrimapr/africovid documentation built on June 2, 2021, 8:57 a.m.