#' fetch_data Function
#'
#' Download a data set from the PMLB repository, (optionally) store it locally, and return the data set.
#' You must be connected to the internet if you are fetching a data set that is not cached locally.
#'
#' @param dataset_name The name of the data set to load from PMLB
#' @param return_X_y Whether to return the data with the features and labels stored in separate data structures or a single structure (can be TRUE or FALSE, defaults to FALSE)
#' @param local_cache_dir The directory on your local machine to store the data files in (defaults to NA, indicating cache will not be used)
#' @seealso \code{\link{classification_summary}} and \code{\link{regression_summary}}
#' @export
#' @examples
#'\dontrun{
#' # Features and labels in single data frame
#' iris <- fetch_data('iris')
#' iris
#'
#' # Features and labels stored in separate data structures
#' iris <- fetch_data('iris', return_X_y=TRUE)
#' iris$x # data frame
#' iris$y # vector
#'}
fetch_data <- function(dataset_name, return_X_y=FALSE, local_cache_dir=NA) {
GITHUB_URL <- 'https://github.com/EpistasisLab/penn-ml-benchmarks/raw/master/datasets'
SUFFIX <- '.tsv.gz'
if ( dataset_name %in% classification_dataset_names ) {
data_type <- 'classification'
} else if ( dataset_name %in% regression_dataset_names ) {
data_type <- 'regression'
} else {
stop("'dataset_name', ", dataset_name, " not found in PMLB.\n * Check spelling, capitalisation etc.", call.=FALSE)
}
if ( return_X_y != TRUE && return_X_y != FALSE ) {
stop("'return_X_y' must be TRUE or FALSE:\n * return_X_y is ", return_X_y, ".", call.=FALSE)
}
dataset_url <- paste0(GITHUB_URL, '/',
data_type, '/',
dataset_name, '/',
dataset_name,
SUFFIX)
if ( is.na(local_cache_dir) ) {
tmp <- tempfile()
utils::download.file(dataset_url, tmp)
dataset <- utils::read.csv( gzfile(tmp),
sep="\t",
header=TRUE,
stringsAsFactors=FALSE)
} else {
if ( !file.exists(local_cache_dir) ) {
dir.create(file.path(local_cache_dir))
}
dataset_path <- file.path(local_cache_dir, paste0(dataset_name, SUFFIX))
# read file from cache
if ( file.exists(dataset_path) ) {
dataset <- utils::read.csv( dataset_path,
sep="\t",
header=TRUE,
stringsAsFactors=FALSE)
# download file to cache and read it
} else {
utils::download.file(dataset_url, dataset_path)
dataset <- utils::read.csv( dataset_path,
sep="\t",
header=TRUE,
stringsAsFactors=FALSE)
}
}
if ( isTRUE(return_X_y) ) {
x <- dataset[, names(dataset) != "target"]
y <- dataset$target
dataset <- list(x=x, y=y)
}
return(dataset)
}
#' pmlblite: R interface to the Penn Machine Learning Benchmarks data repository
#'
#' The \href{https://github.com/EpistasisLab/penn-ml-benchmarks}{PMLB} repository contains a curated collection of data sets for evaluating and
#' comparing machine learning algorithms.
#' These data sets cover a range of applications, and include binary/multi-class classification problems and regression problems,
#' as well as combinations of categorical, ordinal, and continuous features. There are approximately 290 data sets included in the PMLB repository
#' and there are no missing values in these data sets.
#'
#' This R library includes summaries of the classification and regression data sets but does NOT
#' include any of the PMLB data sets. The data sets can be downloaded using the \code{\link{fetch_data}} function which
#' is similar to the corresponding PMLB python function.
#'
#' See \code{\link{fetch_data}}, \code{\link{classification_summary}} and \code{\link{regression_summary}} for usage examples and further information.
#'
#' If you use PMLB in a scientific publication, please consider citing the following paper:
#'
#' Randal S. Olson, William La Cava, Patryk Orzechowski, Ryan J. Urbanowicz, and Jason H. Moore (2017).
#'
#' PMLB: a large benchmark suite for machine learning evaluation and comparison
#'
#' https://biodatamining.biomedcentral.com/articles/10.1186/s13040-017-0154-4
#'
#' BioData Mining 10, page 36.
#'
#' I have no affiliation with the authors of PMLB or the University of Pennsylvania.
#'
#' @docType package
#' @name pmlblite
NULL
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.