#' @rdname io
#'
#' @title Retrieve and read MixPanel CSV files
#'
#' @details `retrieve()` visits the Google Drive folder where AnVIL
#' MixPanel data are stored, and downloads all `*.csv` files to a
#' local cache. Files already present in the cache are not
#' re-localized. You will be prompted to authenticate with an
#' account that has been granted shared access to the
#' `MixPanel 2021` Google Drive folder.
#'
#' @param destination character(1) location of the local cache. The
#' default is the standard operating system location for R
#' packages.
#'
#' @return `retrieve()` returns a tibble with columns year, month,
#' title, and path to the localized file. The year, month, and
#' title are parsed from the file name.
#'
#' @import futile.logger
#'
#' @importFrom tools R_user_dir
#'
#' @importFrom googledrive as_id drive_ls drive_download
#'
#' @importFrom dplyr .data filter mutate select arrange anti_join
#' bind_rows everything
#'
#' @importFrom tibble tibble
#'
#' @export
retrieve <- function(destination = R_user_dir("mixpanel", "cache")) {
  ## Download any MixPanel csv files missing from the local cache and
  ## return a tibble (year, month, title, path) describing every csv
  ## file found in the remote folder.

  ## R_user_dir() returns a nested, per-user path whose parent
  ## directories may not exist yet, so create the whole hierarchy.
  if (!dir.exists(destination)) {
    dir.create(destination, recursive = TRUE)
  }

  ## List the remote folder and keep only the csv files.
  MixPanel_2021_id <- as_id("1JzgYx1QQYDPZRZjkbqzMHy5jH0JD9R7i")
  files <- drive_ls(MixPanel_2021_id)
  csv_files <- filter(files, endsWith(.data$name, ".csv"))

  ## `pattern` is a regular expression, not a glob; anchor on a literal
  ## '.csv' suffix so that only real csv files count as already cached.
  current <- tibble(name = dir(destination, pattern = "\\.csv$"))
  download <- anti_join(csv_files, current, by = "name")
  flog.info(
    "discovered %d files, downloading %d",
    nrow(csv_files), nrow(download)
  )

  ## Fetch only the files that are not yet in the cache.
  local_path <- file.path(destination, download$name)
  Map(drive_download, download$id, local_path)

  ## Parse the file names by position: characters 1-3 are the month
  ## abbreviation, 4-7 the year, and 9 up to the '.csv' suffix the
  ## title (assumes names like 'Jan2021<sep>Title.csv' -- TODO confirm).
  ## Column names are given as strings because `.data$` is deprecated
  ## inside `select()`.
  tibble(path = file.path(destination, csv_files$name)) |>
    mutate(
      month = factor(
        substr(basename(.data$path), 1, 3),
        levels = month.abb
      ),
      year = substr(basename(.data$path), 4, 7),
      title = substr(
        basename(.data$path), 9, nchar(basename(.data$path)) - 4L
      )
    ) |>
    select("year", "month", "title", "path") |>
    arrange(.data$year, .data$month, .data$title)
}
#' @rdname io
#'
#' @details `read()` reads several csv files into tibbles, and then
#' binds rows into a single tibble. Column headers containing the
#' separator `" - "`, typically date ranges spanning the month, are
#' renamed to `DateRangeCount`.
#'
#' @param tbl A tibble returned by `retrieve()`, typically filtered to
#' select a title of interest.
#'
#' @param \dots Additional arguments passed to `read_csv()`.
#'
#' @param is_retention_report logical(1). When `FALSE` (default)
#' process files returned by `read_csv()` without further
#' modification. When `TRUE`, remove the `$average` 'Date' present
#' in some files.
#'
#' @return `read()` returns a single tibble.
#'
#' @importFrom readr read_csv
#'
#' @export
read <- function(tbl, ..., is_retention_report = FALSE) {
  ## Read every csv file listed in `tbl$path`, bind the rows into one
  ## tibble, and prepend a `Month` column taken from `tbl$month`.
  flog.info("reading %d csv files", nrow(tbl))

  read_one <- function(path, ...) {
    csv <- read_csv(path, show_col_types = FALSE, ...)

    ## Drop the '$average' row by reading twice: when it is present the
    ## Date column parses as character, so re-read skipping the header
    ## and that row, re-using the column names from the first pass.
    ## `[[` (unlike `$`) does not warn when there is no Date column.
    if (is_retention_report && is.character(csv[["Date"]])) {
      ## Forward the caller's extra arguments to the re-read as well,
      ## minus the two that are set explicitly here.
      extra_args <- list(...)
      extra_args[["skip"]] <- NULL
      extra_args[["col_names"]] <- NULL
      csv <- do.call(
        read_csv,
        c(
          list(
            path,
            show_col_types = FALSE, skip = 2, col_names = names(csv)
          ),
          extra_args
        )
      )
    }

    ## re-name date ranges as 'DateRangeCount'
    is_date_range <- grepl(" - ", names(csv), fixed = TRUE)
    names(csv)[is_date_range] <- "DateRangeCount"
    csv
  }

  tbls <- lapply(tbl$path, read_one, ...)
  nrows <- vapply(tbls, nrow, integer(1))

  ## Repeat each file's month once per row read from that file. Plain
  ## assignment is used so that a csv column that happens to be called
  ## `tbl` or `nrows` cannot mask the local variables.
  combined <- bind_rows(tbls)
  combined$Month <- rep(tbl$month, nrows)

  ## Column name given as a string because `.data$` is deprecated
  ## inside `select()`.
  select(combined, "Month", everything())
}
## Add the following code to your website.
## For more information on customizing the embed code, read Embedding Snippets.