R/google_drive.R

Defines functions gd_put gd_config

Documented in gd_config gd_put

# These functions download or upload any file you want, using the
# project-specific configuration for a Google Drive folder

#' Create or overwrite a Google Drive configuration file
#'
#' @param folder character name of the Google Drive folder where all
#'   files for this project are to be stored, nested in a file structure
#'   parallel to the local project file structure
#' @param config_file character name of the YAML file where this configuration
#'   information should be written
#' @export
gd_config <- function(folder, config_file=getOption("scipiper.gd_config_file")) {
  require_libs('googledrive')
  
  # write the given information to the specified config_file
  cfg <- list(folder=folder)
  if(!dir.exists(dirname(config_file))) dir.create(dirname(config_file), recursive=TRUE)
  readr::write_lines(yaml::as.yaml(cfg), config_file)
  
  # check for credentials
  cred_file <- '.httr-oauth'
  if(!file.exists(cred_file)) {
    googledrive::drive_auth(cache=TRUE)
  }
  
  if(config_file != getOption("scipiper.gd_config_file")) {
    warning("config_file != default; consider setting options('scipiper.gd_config_file') in .Rprofile")
  }
}

#' Upload a file to Google Drive
#'
#' Upload (create or overwrite) a file to the project bucket and path. Writes an
#' indicator file exactly corresponding to the data_file path and name (but with
#' indicator file extension).
#'
#' @param remote_ind character name of the indicator file to write locally, but
#'   which describes the status of the remote file once the file has been
#'   uploaded by this function. The remote data file will have a name
#'   corresponding to this ind_file (without the indicator extension, but with
#'   same path and basename).
#' @param local_source character name of EITHER a data file to upload OR the
#'   indicator file of a data file to upload. Using the same value for both
#'   remote_ind and local_source (or setting local_source to the data file name
#'   corresponding to the indicator in remote_ind) will only work (in remake) if
#'   you are calling `gd_put` from within the same function that created the
#'   data_file. If instead you have separate recipes for (a) creating the
#'   original data_file, (b) posting the data_file, and (c) retrieving the
#'   data_file from google drive, then the 'a' and 'c' recipes must have
#'   different targets and this function's local_source argument should match
#'   the target of the 'a' recipe while this function's remote_ind argument
#'   should match the target of this recipe (=='b') and the data_file target of
#'   the 'c' recipe. See the examples. Nonetheless, because we have commonly
#'   adopted the 2-target option where remote_ind and local_source _can_ be the
#'   same, the default for this argument is to set `local_source=remote_ind`.
#' @param mock_get character. if remote_ind and local_source imply different
#'   local file locations, should the current local file (implied by
#'   local_source) be left alone ('none'), or copied ('copy') or moved ('move')
#'   to the location implied by remote_ind? If 'copy' or 'move' are used, and if
#'   gd_get will be called in an upcoming command, this argument may help to
#'   avoid an unnecessary download from Google Drive back to this computer
#'   because `gd_get` skips the download if there's already a local file in the
#'   right place with the right contents (MD5 hash).
#' @param on_exists what to do if the file already exists - update, replace, or
#'   throw an error? The default is to update (using google drive's versioning
#'   functionality). Note that while replacing might be cleaner than updating in
#'   some ways, it has the drawback that only the owner (or Google Teams
#'   organizer) of an item can delete it. Since 'replace' here means delete the
#'   old file and post the new one, 'replace' doesn't work for collaboration on
#'   standard Drive folders owned by a single person, unless only that owner
#'   will ever be trying to replace the file in question.
#' @param type media type as passed to drive_upload or drive_update
#' @param verbose logical, used in gd_put and passed onto
#'   googledrive::drive_update, drive_upload, and/or drive_rm
#' @param dry_put logical. If TRUE, calls to this function won't actually push
#'   anything to Google Drive; they'll just pretend they've done it.
#' @param config_file character name of the YAML file containing
#'   project-specific configuration information for Google Drive
#' @param ind_ext the indicator file extension to expect at the end of
#'   remote_ind
#' @export
#' @examples
#' \dontrun{
#' #### using 2 recipes
#'
#' ## remake file
#' # create and post 1_data/out/mydata.rds (and an indicator for it) at once
#' 1_data/out/mydata.rds.ind:
#'   command: create_and_post_mydata(target_name)
#' # retrieve data file on demand
#' 1_data/out/mydata.rds:
#'   command: gd_get('1_data/out/mydata.rds.ind')
#'
#' ## function definitions
#' create_and_post_mydata <- function(ind_file) {
#'   # create data file (no need to make indicator yet)
#'   data_file <- as_data_file(ind_file)
#'   mydata # <- ...compute mydata here...
#'   write.csv(mydata, data_file)
#'   # post and create indicator file
#'   gd_put(remote_ind=ind_file, local_source=data_file)
#' }
#'
#' #### using 3 recipes
#'
#' ## remake file
#' # create 1_data/cache/mydata.rds (and an indicator for it) locally
#' 1_data/tmp/mydata.rds.ind:
#'   command: create_mydata(target_name)
#' # post 1_data/cache/mydata.rds to 1_data/out/mydata.rds on Drive
#' 1_data/out/mydata.rds.ind:
#'   command: gd_put(remote_ind=target_name, local_source='1_data/tmp/mydata.rds.ind')
#' # retrieve data file on demand
#' 1_data/out/mydata.rds:
#'   command: gd_get('1_data/out/mydata.rds.ind')
#'
#' ## function definitions
#' create_mydata <- function(ind_file) {
#'   data_file <- as_data_file(ind_file)
#'   mydata # <- ...compute mydata here...
#'   write.csv(mydata, data_file)
#'   sc_indicate(ind_file, data_file=data_file)
#' }
#'
#' }
gd_put <- function(
  remote_ind, local_source=remote_ind, mock_get=c('copy','move','none'),
  on_exists=c('update','replace','stop'), type=NULL, verbose=FALSE,
  dry_put=getOption("scipiper.dry_put"),
  config_file=getOption("scipiper.gd_config_file"),
  ind_ext=getOption("scipiper.ind_ext")) {
  
  # check arguments
  mock_get <- match.arg(mock_get)
  
  # tell R CMD check not to worry about symbols used for dplyr non-standard eval
  . <- name <- drive_resource <- '.dplyr.var'
  
  # decide whether local_source is an indicator or data file and find the data file
  local_file <- find_local_file(local_source, ind_ext)
  
  # identify the remote data file to be indicated by remote_ind
  data_file <- as_data_file(remote_ind, ind_ext=ind_ext)
  
  # allow for dry runs of gd_put, where we move and create files locally but do
  # nothing on Google Drive
  if(isTRUE(dry_put)) {
    sc_indicate(ind_file=remote_ind, warning="dry_put=TRUE; not actually pushed", data_file=data_file)
    mock_move_copy(mock_get, local_file, data_file)
    return()
  }
  
  # prepare to use google drive
  require_libs('googledrive')
  gd_config <- yaml::yaml.load_file(config_file)
  
  # determine whether and where the remote file exists
  remote_path <- gd_locate_file(data_file, config_file)
  remote_id <- tail(remote_path$id, 1)
  
  # determine the last known parent, which is either already or soon to be the
  # proximate parent of the item to create
  parent <- remote_path %>% slice(nrow(remote_path)-1) %>% pull(id)
  
  # create the parent folder[s] on google drive as needed
  if(is.na(remote_id)) {
    # determine what exists
    remote_dirs <- remote_path %>%
      slice(-1) %>%slice(-nrow(.)) %>% # chop off the top parent-dir row and the bottom all-NA row
      pull(name)
    final_dirs <- strsplit(dirname(get_relative_path(data_file)), split='/')[[1]]
    
    # double-check that any overlapping path elements agree
    stopifnot(all.equal(final_dirs[seq_along(remote_dirs)], remote_dirs))
    
    # create any needed directories on google drive, updating the parent until
    # it's the proximate parent
    if(length(final_dirs) > length(remote_dirs)) {
      needed_dirs <- if(length(remote_dirs) == 0) {
        final_dirs
      } else {
        final_dirs[-seq_along(remote_dirs)]
      }
      # add the needed directories in order
      for(i in seq_along(needed_dirs)) {
        parent <- googledrive::drive_mkdir(name=needed_dirs[i], parent=googledrive::as_id(parent))$id
      }
    }
  }
  
  # post the file (create or update) from the local data file to Google Drive
  remote_file_changed <- TRUE # assume we'll be changing it
  if(is.na(remote_id)) {
    if(verbose) message("Uploading ", local_file, " to Google Drive")
    remote_id <- googledrive::drive_upload(media=local_file, path=googledrive::as_id(parent), type=type, verbose=verbose)$id
  } else {
    on_exists <- match.arg(on_exists)
    if(on_exists == 'stop') {
      stop('File already exists and on_exists==stop')
    } else {
      # check Drive to see whether the file we want to post is identical to what's already up there
      local_hash <- unname(tools::md5sum(local_file))
      remote_hash <- remote_path %>% slice(n()) %>% pull(drive_resource) %>% .[[1]] %>% .[['md5Checksum']]
      # if the local file is different from the file on Drive, update or replace it
      if(local_hash == remote_hash) {
        if(verbose) message("Not re-posting identical ", local_file, " to Google Drive")
        remote_file_changed <- FALSE
      } else {
        switch(
          on_exists,
          update={
            if(verbose) message("Updating ", local_file, " on Google Drive")
            remote_id <- googledrive::drive_update(googledrive::as_id(remote_id), media=local_file, verbose=verbose)$id
          },
          replace={
            if(verbose) message("Replacing ", local_file, " on Google Drive")
            googledrive::drive_rm(googledrive::as_id(remote_id), verbose=verbose)
            remote_id <- googledrive::drive_upload(media=local_file, path=googledrive::as_id(parent), type=type, verbose=verbose)$id
          }
        )
      }
    }
  }
  
  # write the indicator file
  if(remote_file_changed) {
    # most common case. involves another check on Google Drive, with patience in
    # case Drive doesn't fully recognized the new file right away
    retry_patiently(gd_confirm_posted(ind_file=remote_ind, config_file=config_file), verbose=verbose)
  } else {
    # special case. if the remote file was already identical to the local, then
    # we did not repost the file, and we don't need to check Drive again
    sc_indicate(ind_file=remote_ind, hash=remote_hash)
  }
  
  # if posting was successful, potentially bypass a superfluous download from
  # google drive by copying or moving local_file to data_file (the gd_get
  # destination)
  mock_move_copy(mock_get, local_file, data_file)
  
  invisible()
}

mock_move_copy <- function(mock_get, local_file, data_file) {
  if(mock_get %in% c('copy','move')) {
    if(data_file != local_file) {
      file.copy(from=local_file, to=data_file, overwrite=TRUE)
      if(mock_get == 'move') {
        file.remove(local_file)
      }
    }
  }
}

#' Download a file from Google Drive if needed
#'
#' Download a file from Google Drive to the local project based on the
#' information implied by the indicator file (including the location on google
#' drive and the local destination location). Skips the download if the local
#' file already exists and the remote and local hashes are identical.
#'
#' @param ind_file character name of the indicator file for which data should be
#'   downloaded. downloads the Google Drive object whose key equals the
#'   data_file basename
#' @param type see `type` argument to `googledrive::drive_download()`
#' @param overwrite see `overwrite` argument to `googledrive::drive_download()`
#' @param verbose see `verbose` argument to `googledrive::drive_download()`;
#'   also used to determine whether to include messages specific to `gd_get()`
#' @param config_file character name of the YAML file containing
#'   project-specific configuration information for Google Drive
#' @param ind_ext the indicator file extension to expect at the end of ind_file
#' @examples
#' \dontrun{
#' gd_get('0_test/test_sheet.ind', type='xlsx', overwrite=TRUE)
#' }
#' @importFrom utils tail
#' @export
gd_get <- function(ind_file, type=NULL, overwrite=TRUE, verbose=FALSE,
                   config_file=getOption("scipiper.gd_config_file"),
                   ind_ext=getOption("scipiper.ind_ext")) {
  
  # infer the data file name from the ind_file. gd_get always downloads to that
  # location if it downloads at all
  data_file <- as_data_file(ind_file, ind_ext=ind_ext)
  
  # bypass the download from google drive if the right local file already exists
  if(file.exists(data_file)) {
    remote_hash <- yaml::yaml.load_file(ind_file)$hash
    local_hash <- unname(tools::md5sum(data_file))
    if(remote_hash == local_hash) return(data_file)
  }
  
  require_libs('googledrive')
  
  # figure out whether and where the file exists on gdrive
  remote_path <- gd_locate_file(data_file, config_file)
  remote_id <- tail(remote_path$id, 1)
  
  # download the file from Google Drive to the local data_file
  if(!is.na(remote_id)) {
    if(verbose) message("Downloading ", data_file, " from Google Drive")
    if(!dir.exists(dirname(data_file))) dir.create(dirname(data_file), recursive = TRUE)
    googledrive::drive_download(
      file=googledrive::as_id(remote_id), path=data_file,
      type=type, overwrite=overwrite, verbose=verbose)
  } else {
    stop(paste0("Could not locate ", data_file, " for download from Google Drive"))
  }
  
  return(data_file)
}

# Locate a file along a path relative to the gd_config folder, or return NA if not found
gd_locate_file <- function(file, config_file=getOption("scipiper.gd_config_file")) {
  # tell R CMD check not to worry about symbols used for dplyr non-standard eval
  drive_resource <- parents <- name <- '.dplyr.var'
  
  # load the project's googledrive configuration
  gd_config <- yaml::yaml.load_file(config_file)
  
  # normalize the relative path for this file so we can use in confidently as a
  # relative path from both the local working directory and the google drive
  # parent folder
  relative_path <- get_relative_path(file)
  relative_path_escaped <- relative_path %>% 
    gsub(pattern = '[', replacement = '\\[', fixed = TRUE) %>% 
    gsub(pattern = ']', replacement = '\\]', fixed = TRUE) %>% 
    gsub(pattern = '.', replacement = '\\.', fixed=TRUE) %>% 
    gsub(pattern = '/', replacement = '$|^') 
  
  # query google drive for all possibly relevant files and add their parents as
  # a simple column
  relevant_files <- bind_rows(
    googledrive::drive_get(
      id=googledrive::as_id(gd_config$folder)),
    googledrive::drive_ls(
      path=googledrive::as_id(gd_config$folder), 
      pattern=sprintf("^%s$", relative_path_escaped),
      verbose=FALSE,
      recursive=TRUE)
  ) %>%
    dplyr::mutate(parents=lapply(drive_resource, function(dr) {
      parent <- unlist(dr$parents)
      if(is.character(parent)) parent else NA
    })) %>%
    tidyr::unnest(parents) # make it a single row per item-parent combination
  
  # navigate from the outermost directory down to the file to identify the file
  # by both its name and its directory location
  path_elements <- strsplit(relative_path, split='/')[[1]]
  path_df <- filter(relevant_files, id==googledrive::as_id(gd_config$folder))
  for(i in seq_along(path_elements)) {
    elem <- path_elements[i]
    parent <- path_df[[i,'id']]
    elem_row <- filter(relevant_files, name==elem, parents==parent)
    if(nrow(elem_row) == 1) {
      path_exists <- TRUE
      path_df <- bind_rows(path_df, elem_row)
    } else if(nrow(elem_row) == 0) {
      path_exists <- FALSE
      path_df <- bind_rows(path_df, tibble(id=NA))
      break
    } else if(nrow(elem_row) > 0) {
      stop(sprintf('Found multiple copies of %s in Drive',
                   do.call(file.path, as.list(path_elements[seq_len(i)]))))
    }
  }
  
  return(path_df)
}

get_relative_path <- function(file) {
  
  # tell R CMD check not to worry about symbols used for dplyr non-standard eval
  . <- '.dplyr.var'
  
  file %>% 
    normalizePath(winslash='/', mustWork=FALSE) %>%
    gsub(normalizePath(getwd(), winslash='/'), '', .) %>% # remove the working directory if present
    gsub('^/', '', .) # remove the leading slash if present
}

#' List the Google Drive objects for this project
#' 
#' List the Google Drive objects in the project bucket/path as given in config_file
#' 
#' @param ... arguments passed to googledrive::drive_ls
#' @param config_file character name of the YAML file containing project-specific
#'   configuration information
#' @export
gd_list <- function(..., config_file=getOption("scipiper.gd_config_file")) {
  
  require_libs('googledrive')
  
  message("Listing project files on Google Drive")
  gd_config <- yaml::yaml.load_file(config_file)
  folder_df <- googledrive::drive_ls(path=googledrive::as_id(gd_config$folder), ...)
  
  return(folder_df)  
}

#' Check whether a file is on Google Drive, and if so, write an indicator file
#'
#' @param ind_file character name of the indicator file to write locally once
#'   the file has been uploaded; will exactly correspond to the data_file on GD
#' @param config_file character name of the YAML file containing
#'   project-specific configuration information
#' @param ind_ext the indicator file extension to expect at the end of ind_file
#' @importFrom utils tail
#' @export
gd_confirm_posted <- function(
  ind_file,
  config_file=getOption("scipiper.gd_config_file"),
  ind_ext=getOption("scipiper.ind_ext")) {
  
  require_libs('googledrive')
  
  # tell R CMD check not to worry about symbols used for dplyr non-standard eval
  . <- drive_resource <- '.dplyr.var'
  
  # look on Google Drive for the specified file
  data_file <- as_data_file(ind_file, ind_ext=ind_ext)
  remote_path <- gd_locate_file(data_file, config_file)
  remote_id <- tail(remote_path$id, 1)
  if(is.na(remote_id)){
    stop(paste('failed to find Google Drive file:', data_file))
  } else {
    remote_info <- remote_path %>% slice(n()) %>% pull(drive_resource) %>% .[[1]]
    sc_indicate(ind_file, hash=remote_info$md5Checksum)
  }
}
aappling-usgs/scipiper documentation built on Aug. 1, 2020, 3:11 p.m.