diseasystore: Google Health COVID-19 Open Data

# Chunk options applied to the chunks of this vignette
knitr::opts_chunk$set(collapse = TRUE, comment = "#>")

library(diseasystore)

# Print fewer tibble rows and turn off diseasystore's verbose option.
# Without withr, options() hands back the previous values, which are kept in
# `opts` so they can be restored manually.
if (!rlang::is_installed("withr")) {
  opts <- options("tibble.print_min" = 5, "diseasystore.verbose" = FALSE)
} else {
  withr::local_options(
    "tibble.print_min" = 5,
    "diseasystore.verbose" = FALSE
  )
}

# duckdb is effectively a "hard" dependency for rendering parts of this vignette
suggests_available <- rlang::is_installed("duckdb")

# TRUE in interactive sessions, or when NOT_CRAN or CI is set to "true"
not_on_cran <- any(
  interactive(),
  identical(Sys.getenv("NOT_CRAN"), "true"),
  identical(Sys.getenv("CI"), "true")
)

The Google COVID-19 data repository is a comprehensive open repository of COVID-19 data.

This vignette shows how to use (some of) this data through the diseasystore package.

First, it is a good idea to copy the relevant Google COVID-19 data files locally and store that location as an option for the package. `?DiseasystoreGoogleCovid19` uses only the age-stratified metrics for COVID-19, so only a subset of the repository needs to be downloaded.

# First we set the path we want to use as an option
options(
  "diseasystore.DiseasystoreGoogleCovid19.source_conn" =
    file.path("local", "path")
)

# Ensure folder exists
source_conn <- diseasyoption("source_conn", "DiseasystoreGoogleCovid19")
if (!dir.exists(source_conn)) {
  dir.create(source_conn, recursive = TRUE, showWarnings = FALSE)
}

# Look up the remote location once, rather than once per file
remote_conn <- diseasyoption("remote_conn", "DiseasystoreGoogleCovid19")

# Define the Google files to download
google_files <- c("by-age.csv", "demographics.csv", "index.csv", "weather.csv")

# Download each file that is not already stored locally
purrr::walk(google_files, \(file) {
  destfile <- file.path(source_conn, file)

  if (!file.exists(destfile)) {
    # mode = "wb" writes the downloaded bytes unchanged on every platform
    download.file(paste0(remote_conn, file), destfile, mode = "wb")
  }
})
# Names of the data files, which are stored remotely in Google's API
google_files <- paste0(c("by-age", "demographics", "index", "weather"), ".csv")
remote_conn <- diseasyoption("remote_conn", "DiseasystoreGoogleCovid19")

# In practice a local copy of the data is used. It is kept in the
# "vignette_data" folder when that folder exists (preferred, please create
# the folder in the package folder); otherwise tempdir() is used.
local_conn <- if (checkmate::test_directory_exists("vignette_data")) {
  "vignette_data"
} else {
  tempdir()
}

# Point the diseasystore at the local copy and set its n_max option to 1000.
# Without withr, the previous option values are added to `opts`.
if (!rlang::is_installed("withr")) {
  opts <- append(
    opts,
    options(
      "diseasystore.DiseasystoreGoogleCovid19.source_conn" = local_conn,
      "diseasystore.DiseasystoreGoogleCovid19.n_max" = 1000
    )
  )
} else {
  withr::local_options(
    "diseasystore.DiseasystoreGoogleCovid19.source_conn" = local_conn,
    "diseasystore.DiseasystoreGoogleCovid19.n_max" = 1000
  )
}

# Fetch the first 1000 rows of every data set that is not yet stored locally.
# The whole step runs inside try(), so an error here does not stop the script.
try({
  purrr::walk(
    purrr::discard(
      google_files,
      \(file) checkmate::test_file_exists(file.path(local_conn, file))
    ),
    \(file) {
      first_rows <- readr::read_csv(
        paste0(remote_conn, file),
        n_max = 1000,
        show_col_types = FALSE,
        progress = FALSE
      )
      readr::write_csv(first_rows, file.path(local_conn, file))
    }
  )
})

# After the download attempt, the data counts as available only when every
# expected file is present in the local folder
files_missing <- !purrr::every(
  google_files,
  \(file) checkmate::test_file_exists(file.path(local_conn, file))
)
data_available <- !files_missing

The diseasystores require a database to store their features in. The database connection should be configured before use and can be stored in the package's options.

# target_conn is a function taking no arguments; calling it opens a
# DBI connection to a duckdb database
target_conn <- function() {
  DBI::dbConnect(duckdb::duckdb())
}
options("diseasystore.DiseasystoreGoogleCovid19.target_conn" = target_conn)
# Function that opens a DBI connection to a duckdb database when called
target_conn <- function() {
  DBI::dbConnect(duckdb::duckdb())
}

# Register it as the target_conn option. Without withr, the previous option
# value is added to `opts`.
if (!rlang::is_installed("withr")) {
  opts <- append(
    opts,
    options("diseasystore.DiseasystoreGoogleCovid19.target_conn" = target_conn)
  )
} else {
  withr::local_options(
    "diseasystore.DiseasystoreGoogleCovid19.target_conn" = target_conn
  )
}

Once the files are downloaded and the target DB is configured, we can initialize the diseasystore that uses the Google COVID-19 data.

# Create the diseasystore. No arguments are passed here, so it presumably
# picks up its configuration from the package options (TODO confirm).
ds <- DiseasystoreGoogleCovid19$new()

Once configured in this way, we can use the feature store directly to get data.

# List all the features that the feature store makes available
ds$available_features

# Retrieve one of them, "n_hospital", for 2020-01-01 through 2020-06-01
ds$get_feature(
  feature = "n_hospital",
  start_date = as.Date("2020-01-01"),
  end_date = as.Date("2020-06-01")
)
# Drop the diseasystore object if it was created, then run the garbage
# collector
if (exists("ds")) {
  rm(ds)
}
gc()

# Without withr, put back the option values saved in `opts`
if (!rlang::is_installed("withr")) {
  options(opts)
}


Try the diseasystore package in your browser

Any scripts or data that you put into this service are public.

diseasystore documentation built on April 4, 2025, 5:56 a.m.