R/01-ph_datadrop.R

Defines functions: ph_get_daily, ph_get_tests, ph_get_cases, ph_get_fields, ph_gdrive_files

Documented in: ph_gdrive_files, ph_get_cases, ph_get_daily, ph_get_fields, ph_get_tests

################################################################################
#
#' Get Department of Health COVID-19 Data Drop Google Drive specific directory
#' information
#'
#' A wrapper to googledrive functions to extract information on the contents
#' of a specific COVID-19 Data Drop Google Drive directory
#'
#' @param version A character value specifying whether to get the most
#'   currently available dataset (\code{"current"}) or to get archive data
#'   (\code{"archive"}). Defaults to \code{"current"}.
#' @param date Date in <YYYY-MM-DD> format; the date up to which the extracted
#'   data reports. Only used when \code{version} is set to \code{"archive"},
#'   otherwise ignored.
#'
#' @return A tibble containing information on the various files in a specified
#'   COVID-19 Data Drop Google Drive directory
#'
#' @examples
#' ph_gdrive_files()
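#' ## Illustrative only (not run): list the archived Data Drop directory for a
#' ## given date; the date shown is an assumed example
#' \dontrun{
#' ph_gdrive_files(version = "archive", date = "2020-05-01")
#' }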
#'
#' @export
#'
#'
#
################################################################################

ph_gdrive_files <- function(version = "current", date = NULL) {
  ## Google Drive deauthorisation
  googledrive::drive_deauth()

  ## Get current data link folder information and contents
  dropCurrent <- googledrive::drive_ls(googledrive::drive_get(id = "1ZPPcVU4M7T-dtRyUceb0pMAd8ickYf8o"))

  ## Get dropDate
  dropDate <- stringr::str_extract(string = dropCurrent$name,
                                   pattern = "[0-9]{2}/[0-9]{2}|[0-9]{2}\\_[0-9]{2}") %>%
    paste("2020", sep = "/") %>%
    lubridate::mdy()

  if(version == "archive") dropDate <- date

  ## Provide message to user
  message(
    paste("Getting information on Google Drive directory structure of the DoH Data Drop for latest available data up to ",
          dropDate, ".", sep = "")
  )

  ## Create temporary file
  destFile <- tempfile()

  ## Create link for download of README
  link <- sprintf(fmt = "https://docs.google.com/uc?id=%s",
                  dropCurrent$id)

  googledrive::drive_download(file = googledrive::as_id(link),
                              path = destFile, verbose = FALSE)

  ## Extract information from PDF on link to folder of current data
  readme <- pdftools::pdf_text(pdf = destFile) %>%
    stringr::str_split(pattern = "\n|\r\n") %>%
    unlist()

  ## Get id for current data Google Drive folder
  x <- stringr::word(readme[stringr::str_detect(string = readme,
                                                pattern = "bit.ly/*")][1], -1)

  if(!stringr::str_detect(string = x, pattern = "http")) {
    x <- paste("http://", x, sep = "")
  }

  x <- x %>%
    stringr::str_replace(pattern = "https", replacement = "http") %>%
    RCurl::getURL() %>%
    stringr::str_extract_all(pattern = "[A-Za-z0-9@%#&()+*$,._\\-]{33}") %>%
    unlist()

  ## Get Google Drive directory structure and information
  y <- googledrive::drive_ls(googledrive::drive_get(id = x))

  if(version == "archive") {
    if(is.null(date)) {
      stop("Date needs to be specified if version is set to archive. Please try again.",
           call. = TRUE)
    }

    if(lubridate::ymd(date) == Sys.Date()) {
      stop("Date should be earlier than current date to access data archive. Please try again.",
           call. = TRUE)
    }

    ## Check whether date is within range
    if(!lubridate::ymd(date) %within% lubridate::interval(lubridate::ymd("2020-04-14"), Sys.Date())) {
      stop("Earliest COVID-19 Data Drop record is for 2020-04-14.
         Only provide dates as early as 2020-04-14 or later. Please try again.",
           call. = TRUE)
    }

    ## List contents of Data Drop Archive Google Drive Folder
    dropArchive <- googledrive::drive_ls(googledrive::drive_get(id = "1w_O-vweBFbqCgzgmCpux2F0HVB4P6ni2"))

    ## Get gdriveID based on date
    dropArchiveMonth <- stringr::str_remove_all(string = dropArchive$name,
                                                pattern = "DOH COVID-19 Data Drop \\(|\\)") %>%
      lubridate::parse_date_time(orders = "my") %>%
      lubridate::month()

    gdriveID <- dropArchive$id[dropArchiveMonth == lubridate::month(lubridate::ymd(date))]

    ## Process date
    date <- stringr::str_remove_all(string = date, pattern = "-")

    ## Get list of contents directory specified by gdriveID
    w <- googledrive::drive_ls(googledrive::drive_get(id = gdriveID))

    ## Get the unique identifier for directory for specified date
    x <- w$id[stringr::str_extract(string = w$name, pattern = "[0-9]{8}") == date]

    ## Get listing of contents of specified directory
    y <- googledrive::drive_ls(googledrive::drive_get(id = x))
  }

  ## Return list of contents
  return(y)
}


################################################################################
#
#' Pull Philippines data fields information from the publicly available
#' Department of Health COVID-19 Data Drop
#'
#' A wrapper to googledrive functions to pull data from the Philippines'
#' COVID-19 Data Drop resource that is publicly distributed via Google Drive
#'
#' @param version A character value specifying whether to get the most
#'   currently available dataset (\code{"current"}) or to get archive data
#'   (\code{"archive"}). Defaults to \code{"current"}.
#' @param date Date in <YYYY-MM-DD> format; the date up to which the extracted
#'   data reports. Only used when \code{version} is set to \code{"archive"},
#'   otherwise ignored.
#'
#' @return A tibble of metadata for the fields used in the various datasets
#'   distributed via the COVID-19 Data Drop
#'
#' @examples
#' ph_get_fields()
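#' ## Illustrative only (not run): pull the fields metadata from an archived
#' ## Data Drop; the date shown is an assumed example
#' \dontrun{
#' ph_get_fields(version = "archive", date = "2020-05-01")
#' }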
#'
#' @export
#'
#
################################################################################

ph_get_fields <- function(version = "current", date = NULL) {
  ## Get list of contents of specified Google drive directory
  y <- ph_gdrive_files(version = version, date = date)

  ## Get identifier of Fields data
  z <- y$id[stringr::str_detect(string = y$name, pattern = "Fields.csv")]

  ## Get fields data CSV link
  link <- sprintf(fmt = "https://docs.google.com/uc?id=%s", z)

  ## Create temporary file
  destFile <- tempfile()

  ## Download Fields.csv to temp directory
  googledrive::drive_download(file = googledrive::as_id(link),
                              path = destFile, verbose = FALSE)

  ## Read fields CSV
  fields <- utils::read.csv(file = destFile, stringsAsFactors = FALSE)

  ## Convert to tibble
  fields <- tibble::tibble(fields)

  ## Return dataset
  return(fields)
}


################################################################################
#
#' Pull Philippines data on cases information from the publicly available
#' Department of Health COVID-19 Data Drop
#'
#' A wrapper to googledrive functions to pull data from the Philippines'
#' COVID-19 Data Drop resource that is publicly distributed via Google Drive
#'
#' @param version A character value specifying whether to get the most
#'   currently available dataset (\code{"current"}) or to get archive data
#'   (\code{"archive"}). Defaults to \code{"current"}.
#' @param date Date in <YYYY-MM-DD> format; the date up to which the extracted
#'   data reports. Only used when \code{version} is set to \code{"archive"},
#'   otherwise ignored.
#'
#' @return A tibble of case information on confirmed COVID-19 cases distributed
#'   via the COVID-19 Data Drop
#'
#' @examples
#' ph_get_cases()
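#' ## Illustrative only (not run): pull case information from an archived
#' ## Data Drop; the date shown is an assumed example
#' \dontrun{
#' ph_get_cases(version = "archive", date = "2020-05-01")
#' }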
#'
#' @export
#'
#
################################################################################

ph_get_cases <- function(version = "current", date = NULL) {
  ## Get list of contents of specified Google Drive directory
  y <- ph_gdrive_files(version = version, date = date)

  ## Check if file named Case Information is available
  if(!any(stringr::str_detect(string = y$name, pattern = "Case Information.csv"))) {
    stop(paste("No cases information on ", date,
               ". Try a date earlier or later than date specified.",
               sep = ""),
         call. = TRUE)
  }

  ## Get unique identifier for Cases data
  z <- y$id[stringr::str_detect(string = y$name, pattern = "Case Information.csv")]

  ## Get link for Case Information.csv
  link <- sprintf(fmt = "https://docs.google.com/uc?id=%s", z)

  ## Create temporary file
  destFile <- tempfile()

  ## Download Case Information.csv
  googledrive::drive_download(file = googledrive::as_id(link),
                              path = destFile, verbose = FALSE)

  ## Read cases CSV
  cases <- utils::read.csv(file = destFile, stringsAsFactors = FALSE)

  ## Convert to tibble
  cases <- tibble::tibble(cases)

  ## Return dataset
  return(cases)
}


################################################################################
#
#' Pull Philippines data on testing aggregates from the publicly available
#' Department of Health COVID-19 Data Drop
#'
#' A wrapper to googledrive functions to pull data from the Philippines'
#' COVID-19 Data Drop resource that is publicly distributed via Google Drive
#'
#' @param version A character value specifying whether to get the most
#'   currently available dataset (\code{"current"}) or to get archive data
#'   (\code{"archive"}). Defaults to \code{"current"}.
#' @param date Date in <YYYY-MM-DD> format; the date up to which the extracted
#'   data reports. Only used when \code{version} is set to \code{"archive"},
#'   otherwise ignored.
#'
#' @return A tibble of information on testing aggregates distributed
#'   via the COVID-19 Data Drop
#'
#' @examples
#' ph_get_tests()
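#' ## Illustrative only (not run): pull testing aggregates from an archived
#' ## Data Drop; the date shown is an assumed example
#' \dontrun{
#' ph_get_tests(version = "archive", date = "2020-05-01")
#' }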
#'
#' @export
#'
#
################################################################################

ph_get_tests <- function(version = "current", date = NULL) {
  ## Get list of contents of specified Google Drive directory
  y <- ph_gdrive_files(version = version, date = date)

  ## Get unique identifier for testing aggregates dataset
  z <- y$id[stringr::str_detect(string = y$name, pattern = "Testing Aggregates.csv")]

  ## Get testing aggregates CSV link
  link <- sprintf(fmt = "https://docs.google.com/uc?id=%s", z)

  ## Create temporary file
  destFile <- tempfile()

  ## Download Testing Aggregates.csv
  googledrive::drive_download(file = googledrive::as_id(link),
                              path = destFile, verbose = FALSE)

  ## Read testing aggregates CSV file
  tests <- utils::read.csv(file = destFile, stringsAsFactors = FALSE)

  ## Convert to tibble
  tests <- tibble::tibble(tests)

  ## Return dataset
  return(tests)
}


################################################################################
#
#' Pull Philippines data on daily facilities status from the publicly available
#' Department of Health COVID-19 Data Drop
#'
#' A wrapper to googledrive functions to pull data from the Philippines'
#' COVID-19 Data Drop resource that is publicly distributed via Google Drive
#'
#' @param version A character value specifying whether to get the most
#'   currently available dataset (\code{"current"}) or to get archive data
#'   (\code{"archive"}). Defaults to \code{"current"}.
#' @param date Date in <YYYY-MM-DD> format; the date up to which the extracted
#'   data reports. Only used when \code{version} is set to \code{"archive"},
#'   otherwise ignored.
#'
#' @return A tibble of daily facilities status distributed via the COVID-19
#'   Data Drop
#'
#' @examples
#' ph_get_daily()
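#' ## Illustrative only (not run): pull the daily facilities status report
#' ## from an archived Data Drop; the date shown is an assumed example
#' \dontrun{
#' ph_get_daily(version = "archive", date = "2020-05-01")
#' }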
#'
#' @export
#'
#
################################################################################

ph_get_daily <- function(version = "current", date = NULL) {
  ## Get list of contents of specified Google Drive directory
  y <- ph_gdrive_files(version = version, date = date)

  ## Get unique identifier for Daily Report CSV
  z <- y$id[stringr::str_detect(string = y$name, pattern = "Daily Report.csv")]

  ## Get link for daily CSV
  link <- sprintf(fmt = "https://docs.google.com/uc?id=%s", z)

  ## Create temporary file
  destFile <- tempfile()

  ## Download Daily Report.csv
  googledrive::drive_download(file = googledrive::as_id(link),
                              path = destFile, verbose = FALSE)

  ## Read daily report CSV
  daily <- utils::read.csv(file = destFile, stringsAsFactors = FALSE)

  ## Convert to tibble
  daily <- tibble::tibble(daily)

  ## Return dataset
  return(daily)
}