R/utils-retriever-data.R

Defines functions append_data_citations append_retriever_citation prepare_datasets download_datasets import_retriever_data install_retriever_data use_default_data_path get_default_data_path check_default_data_path

Documented in append_data_citations append_retriever_citation check_default_data_path download_datasets get_default_data_path import_retriever_data install_retriever_data prepare_datasets use_default_data_path

#' @title Check if a default data path is set
#' 
#' @description See \code{portalr::\link[portalr]{check_default_data_path}} for details.
#' 
#' @export
check_default_data_path <- function()
{
    portalr::check_default_data_path(ENV_VAR = "MATSS_DATA_PATH", DATA_NAME = "MATSS data")
}

#' @title What is the default data path?
#'
#' @description See \code{portalr::\link[portalr]{get_default_data_path}} for details.
#' @inheritParams portalr::get_default_data_path
#'
#' @export
get_default_data_path <- function(fallback = "~")
{
    portalr::get_default_data_path(fallback, ENV_VAR = "MATSS_DATA_PATH")
}

#' @title Manage the default path for downloading MATSS Data into
#'
#' @description See \code{portalr::\link[portalr]{use_default_data_path}} for details.
#' @inheritParams portalr::use_default_data_path
#'
#' @export
use_default_data_path <- function(path = NULL)
{
    portalr::use_default_data_path(path, ENV_VAR = "MATSS_DATA_PATH")
}

#' @title Download data from the data retriever
#' @aliases import_retriever_data
#'  
#' @description \code{install_retriever_data} downloads retriever datasets and 
#'   is a wrapper around \code{rdataretriever::\link[rdataretriever]{install}}
#'   
#' @param force_install whether to install the dataset if the correctly named 
#'   folder already exists
#' @inheritParams rdataretriever::install
#' 
#' @return NULL
#' 
#' @examples
#' \dontrun{
#'   install_retriever_data("veg-plots-sdl")
#' }
#' @export
#'
install_retriever_data <- function(dataset, path = get_default_data_path(), 
                                   force_install = FALSE)
{

    # check for existence of data_path
    path <- normalizePath(path, mustWork = FALSE)
    if (!dir.exists(path))
    {
        if(usethis::ui_yeah(paste0(usethis::ui_path(path), 
                                   " does not exist. Create it?")))
        {
            dir.create(path)
        } else {
            stop(usethis::ui_oops(paste("Please choose a valid path or set a different default location with\n", 
                                        usethis::ui_code("set_default_data_path()"))))
        }
    }
    
    # where to put the retriever data
    folder_path <- file.path(path, dataset)
    if (dir.exists(folder_path) && !force_install)
    {
        message("A folder already exists for \"", dataset, "\"... skipping.\n", 
                "Use `force_install = TRUE` to overwrite it with a fresh install.")
    } else {
        # make the folder
        dir.create(folder_path)
        
        # install the retriever data
        tryCatch({
            rdataretriever::install_csv(dataset, data_dir = folder_path)
            data_citation <- rdataretriever::get_citation(dataset)
            raw_citation <- sub("^Citation:[[:space:]]*", "", data_citation)
            cat(raw_citation, file = file.path(folder_path, "CITATION"))
        }, 
        error = function(e) {
            unlink(folder_path, recursive = TRUE)
            e
        }
        )
    }
}

#' @rdname install_retriever_data
#'
#' @description \code{import_retriever_data} loads a previously downloaded 
#'   retriever dataset
#' 
#' @param path the overarching folder in which to download datasets; OR the 
#'   full path to the folder containing the data (when `dataset == NULL`)
#' @inheritParams install_retriever_data
#' 
#' @return NULL
#' 
#' @examples
#' \dontrun{
#'   import_retriever_data("veg-plots-sdl")
#' }
#' @export
#'
import_retriever_data <- function(dataset = NULL, path = get_default_data_path())
{
    if (!is.null(dataset))
    {
        path <- file.path(path, dataset)
    }
    
    # check for existence of data_path
    files <- dir(path)
    if (length(files) == 0) # check for presence of downloaded files
    {
        warning("Didn't find any downloaded data in ", path, ".\n", 
                "Did you run get_retriever_data() first?")
        return(NULL)
    }
    
    # load each csv and return a list
    files <- setdiff(files, "CITATION")
    tempdata <- vector('list', length(files))
    names(tempdata) <- sub('.csv', '', files)
    for (j in seq_along(files))
    {
        tempdata[[j]] <- utils::read.csv(file.path(path, files[j]), 
                                         stringsAsFactors = FALSE, 
                                         fileEncoding = "latin1")
    }
    return(tempdata)
}

#' @rdname install_retriever_data
#' 
#' @description \code{download_datasets} is a wrapper around 
#'   \code{\link{install_retriever_data}} to download multiple datasets, with 
#'   the default to download all of the datasets that are supported.
#'   
#' @inheritParams install_retriever_data
#' 
#' @return NULL
#' 
#' @examples
#' \dontrun{
#'   download_datasets()
#' }
#' @export
#' 
download_datasets <- function(dataset = c("jornada-lter-rodent", 
                                          "shortgrass-steppe-lter", 
                                          "veg-plots-sdl", 
                                          "mapped-plant-quads-mt", 
                                          "ushio-maizuru-fish-community", 
                                          "global-population-dynamics", 
                                          "breed-bird-survey", 
                                          "biotimesql"), 
                              path = get_default_data_path(), 
                              force_install = FALSE)
{
    purrr::walk(dataset, 
                install_retriever_data, 
                path = path, 
                force_install = force_install)
}

#' Prepare datasets for usage
#' 
#' @description This wraps all the functions that prepare datasets from specific 
#'   databases (e.g. [`prepare_bbs_ts_data`], [`prepare_biotime_data`]).
#'   
#' @param dataset what datasets to prepare (must follow the naming convention 
#'   of [`download_datasets`])
#' @param bbs_params params to pass to [`prepare_bbs_ts_data`]
#' @param biotime_params params to pass to [`prepare_biotime_data`]
#'
#' @return NULL
#' 
#' @export
#' 
prepare_datasets <- function(dataset = c("breed-bird-survey", 
                                         "biotimesql"), 
                             bbs_params = list(path = get_default_data_path(), start_yr = 1965, end_yr = 2018, min_num_yrs = 10, data_subset = NULL), 
                             biotime_params = list(path = get_default_data_path(), data_subset = NULL))
{
    if ("breed-bird-survey" %in% dataset)
    {
        do.call(prepare_bbs_ts_data, bbs_params)
    }
    
    if ("biotimesql" %in% dataset)
    {
        do.call(prepare_biotime_data, biotime_params)
    }
    
    invisible()
}

#' @title Append citation info to a formatted dataset
#' 
#' @description Given an existing formatted dataset, and the path to the 
#'   downloaded dataset, from retriever, and via `import_retriever_data()`, 
#'   read in the citation info and add it to the metadata for the dataset
#'   
#' @param formatted_data a dataset that already follows the `MATSS`` standard
#' @param path where to load the raw data files from
#' 
#' @return the same dataset, with the citation appended to `metadata`
#'
#' @export
#' 
append_retriever_citation <- function(formatted_data, path)
{
    citation_file <- file.path(path, "CITATION")
    if (file.exists(citation_file))
    {
        citation_text <- readLines(citation_file, warn = FALSE)
        formatted_data$metadata$citation <- c(formatted_data$metadata$citation, 
                                              citation_text)
    }
    return(formatted_data)
}

#' @title Generate a vector of citations.
#' 
#' @description Given an existing vector of citations (or the NULL default), 
#'   add the citations that are specified in the paths of `citation_files`
#'   
#' @param citations a vector of strings containing existing citations to append
#' @param citation_files a vector of filepaths to the citation files
#' 
#' @return a vector of strings containing the citations
#'
#' @export
#' 
append_data_citations <- function(citations = NULL, 
                                  citation_files)
{
    new_citations <- vapply(citation_files, function(filepath) {
        f <- file(filepath, open = "r")
        out <- readLines(f, warn = FALSE)
        unlink(f)
        return(out)
    }, "", USE.NAMES = FALSE)
    return(c(citations, new_citations))
}
weecology/MATSS documentation built on May 15, 2020, 7:03 p.m.