Nothing
#' fetch_data function
#'
#' Download a data set from the PMLB repository, (optionally) store it locally, and return the data set.
#' You must be connected to the internet if you are fetching a data set that is not cached locally.
#'
#' @param dataset_name The name of the data set to load from PMLB
#' @param return_X_y Boolean. Whether to return the data with the features and labels stored
#' in separate data structures or a single structure (can be TRUE or FALSE, defaults to FALSE)
#' @param local_cache_dir The directory on your local machine to store the data files in
#' (defaults to NA, indicating cache will not be used)
#' @param dropna Boolean. Whether rows with NAs should be automatically dropped. Default to TRUE.
#' @seealso \code{\link{summary_stats}}.
#' @export
#' @examples
#' # Features and labels in single data frame
#' penguins <- fetch_data("penguins")
#' head(penguins)
#'
#' # Features and labels stored in separate data structures
#' penguins <- fetch_data("penguins", return_X_y = TRUE)
#' penguins$x # data frame
#' penguins$y # vector
fetch_data <- function(dataset_name, return_X_y = FALSE, local_cache_dir = NA, dropna = TRUE) {
GITHUB_URL <- "https://github.com/EpistasisLab/pmlb/raw/master/datasets"
SUFFIX <- ".tsv.gz"
if (!dataset_name %in% dataset_names) {
stop("'dataset_name' ", dataset_name, " not found in PMLB.\n * Check spelling, capitalisation etc.", call. = FALSE)
}
if (!(is.logical(return_X_y) && length(return_X_y) == 1)) {
stop("'return_X_y' must be TRUE or FALSE:\n * return_X_y is ", return_X_y, ".", call. = FALSE)
}
dataset_url <- paste0(
GITHUB_URL, "/",
dataset_name, "/",
dataset_name,
SUFFIX
)
if (is.na(local_cache_dir)) {
tmp <- tempfile()
utils::download.file(dataset_url, tmp)
dataset <- utils::read.csv(gzfile(tmp),
sep = "\t",
header = TRUE,
stringsAsFactors = FALSE
)
} else {
if (!file.exists(local_cache_dir)) {
dir.create(file.path(local_cache_dir))
}
dataset_path <- file.path(local_cache_dir, paste0(dataset_name, SUFFIX))
# read file from cache
if (file.exists(dataset_path)) {
dataset <- utils::read.csv(dataset_path,
sep = "\t",
header = TRUE,
stringsAsFactors = FALSE
)
# download file to cache and read it
} else {
utils::download.file(dataset_url, dataset_path)
dataset <- utils::read.csv(dataset_path,
sep = "\t",
header = TRUE,
stringsAsFactors = FALSE
)
}
}
if (return_X_y) {
dataset <- list(
x = dataset[, names(dataset) != "target"],
y = dataset$target
)
}
if (dropna) {
return(stats::na.omit(dataset))
}
dataset
}
#' pmlb: R interface to the Penn Machine Learning Benchmarks data repository
#'
#' The \href{https://github.com/EpistasisLab/pmlb}{PMLB} repository contains a curated collection of data sets for evaluating and
#' comparing machine learning algorithms.
#' These data sets cover a range of applications, and include binary/multi-class classification problems and regression problems,
#' as well as combinations of categorical, ordinal, and continuous features. There are approximately 290 data sets included in the PMLB repository
#' and there are no missing values in these data sets.
#'
#' This R library includes summaries of the classification and regression data sets but does NOT
#' include any of the PMLB data sets. The data sets can be downloaded using the \code{\link{fetch_data}} function which
#' is similar to the corresponding PMLB python function.
#'
#' See \code{\link{fetch_data}}, \code{\link{summary_stats}} for usage examples and further information.
#'
#' If you use PMLB in a scientific publication, please consider citing the following paper:
#'
#' Randal S. Olson, William La Cava, Patryk Orzechowski, Ryan J. Urbanowicz, and Jason H. Moore (2017).
#'
#' PMLB: a large benchmark suite for machine learning evaluation and comparison
#'
#' https://biodatamining.biomedcentral.com/articles/10.1186/s13040-017-0154-4
#'
#' BioData Mining 10, page 36.
#'
#' I have no affiliation with the authors of PMLB or the University of Pennsylvania.
#'
#' @docType package
#' @name pmlb
NULL
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.