#' @title Get an OpenML data set.
#'
#' @description Returns the \code{\link{OMLDataSet}} that belongs to the given
#' data set ID or data set name. The data set is downloaded if it is not
#' already in the cache.
#'
#' An \code{\link{OMLDataSet}} does not contain data splits or any other
#' task-related information. Use \code{\link{getOMLTask}} to download tasks.
#'
#' @note
#' Exactly one of \code{data.id} and \code{data.name} must be passed.
#'
#' @template arg_data.id
#' @param data.name [\code{character(1)}]\cr
#'   Data set name.
#'   This is an alternative to \code{data.id}.
#'   Default is \code{NULL}.
#' @param data.version [\code{integer(1)}]\cr
#'   Version number of the data set with name \code{data.name}.
#'   Default is \code{NULL}.
#'   Ignored if \code{data.id} is passed.
#' @template arg_cache_only
#' @template arg_verbosity
#' @return [\code{\link{OMLDataSet}}].
#' @family downloading functions
#' @family data set-related functions
#' @example inst/examples/getOMLDataSet.R
#' @export
getOMLDataSet = function(data.id = NULL, data.name = NULL, data.version = NULL, cache.only = FALSE, verbosity = NULL) {
  has.id = !is.null(data.id)
  has.name = !is.null(data.name)

  # exactly one of the two identifiers has to be supplied
  if (has.id == has.name) {
    stopf("You must provide either a data.id or a data.name, but not both.")
  }
  assertFlag(cache.only)

  if (has.name) {
    # lookup by name (data.version is only used on this path)
    getOMLDataSetByName(data.name = data.name, data.version = data.version, cache.only = cache.only, verbosity = verbosity)
  } else {
    # lookup by ID, which has to be coercible to a non-negative integer
    getOMLDataSetById(data.id = asInt(data.id, lower = 0), cache.only = cache.only, verbosity = verbosity)
  }
}
# Helper function to get data set by data name (and version number).
# (Makes use of getOMLDataSetById)
#
# Resolves `data.name` (and optionally `data.version`) to a data ID using the
# listing returned by .listOMLDataSets and then delegates to getOMLDataSetById.
#
# @param data.name Name of the data set; compared with `==` against the
#   `name` column of the listing.
# @param data.version Optional version number. If NULL, the entry with the
#   highest version among the name matches is used.
# @param cache.only,verbosity Passed on unchanged to getOMLDataSetById
#   (verbosity is also passed to .listOMLDataSets and showInfo).
# @return Whatever getOMLDataSetById returns for the resolved data ID.
# Stops if no listed data set has the given name, or if `data.version` is
# given but no name match has that version.
getOMLDataSetByName = function(data.name = NULL, data.version = NULL, cache.only = FALSE, verbosity = NULL) {
  # get list of datasets RESTRICTED to the given name
  data.sets = .listOMLDataSets(data.name = data.name, verbosity = verbosity)

  # match by name
  matching.ids = which(data.sets$name == data.name)
  matching.sets = data.sets[matching.ids, , drop = FALSE] # nolint
  n.matches = length(matching.ids)

  # Fail early on an unknown name. This check has to come before the version
  # is resolved: with no matches there is nothing to pick a version from.
  if (n.matches == 0)
    stopf("No dataset with name '%s' found.", data.name)

  if (is.null(data.version)) {
    # no version requested: default to the newest one and only mention that
    # a choice was made if there actually was more than one candidate
    if (n.matches > 1)
      showInfo(verbosity, "Multiple version available, but no data.version passed! Returning the newest version.")
    data.id = matching.sets[getMaxIndex(matching.sets$version), "data.id"]
  } else {
    data.version = asInt(data.version, lower = 0)
    # which() drops NA comparisons so that rows with a missing version can
    # never be selected
    data.id = matching.sets[which(matching.sets$version == data.version), "data.id"]
    if (length(data.id) == 0) {
      stopf("Version %i does not exist for dataset '%s'. Available versions: %s",
        data.version, data.name, collapse(matching.sets$version, sep = ", "))
    }
  }

  getOMLDataSetById(data.id = data.id, cache.only = cache.only, verbosity = verbosity)
}
# Helper function to get data set by data ID.
#
# Downloads the "data" object for `data.id` via downloadOMLObject, parses its
# description, reads the ARFF file and wraps everything with makeOMLDataSet.
# Column names are made syntactically valid and unique; the original names are
# handed to makeOMLDataSet as `colnames.old`.
getOMLDataSetById = function(data.id = NULL, cache.only = FALSE, verbosity = NULL) {
  downloaded = downloadOMLObject(data.id, object = "data", cache.only = cache.only, verbosity = verbosity)

  # turn the downloaded description document into a description object
  desc = parseOMLDataSetDescription(downloaded$doc)

  # tell the user about data sets that are not (or not yet) active
  status = desc$status
  if (status == "deactivated") {
    warningf("Data set has been deactivated.")
  } else if (status == "in_preparation") {
    warningf("Data set is in preparation and will be activated soon.")
  }

  # read the actual data
  data = arff.reader(downloaded$files$dataset.arff$path)

  # a row id attribute is always part of the ignore list
  row.id = desc$row.id.attribute
  if (!is.na(row.id)) {
    desc$ignore.attribute = na.omit(union(desc$ignore.attribute, row.id))
  }

  # row names are zero-based row indices
  data = setRowNames(data, as.character(seq_row(data) - 1L))

  # remember the original column names and locate the target columns by their
  # original names before the columns are renamed
  colnames.old = colnames(data)
  target.ind = which(colnames.old %in% desc$default.target.attribute)
  colnames.new = make.names(colnames.old, unique = TRUE)
  colnames(data) = colnames.new

  # the default target attribute must refer to the actual (renamed) columns
  desc$default.target.attribute = colnames.new[target.ind]

  makeOMLDataSet(
    desc = desc,
    data = data,
    colnames.old = colnames.old,
    colnames.new = colnames.new,
    target.features = desc$default.target.attribute
  )
}
# Parses a data set description document into a data set description object.
#
# Reads the individual elements below /oml:data_set_description from `doc`
# and passes them as named arguments to makeOMLDataSetDescription.
#
# @param doc The description document as accepted by the xml*Val* helpers
#   (the exact class is not visible here).
# @return The result of makeOMLDataSetDescription.
#
# Notes:
# - Comma-separated default target / ignore attributes are split into
#   character vectors. A missing default target attribute becomes "".
# - The xmlOVal* getters can return NULL (see the is.null checks below);
#   NULL entries are passed through filterNull before the constructor is
#   called, presumably so that the constructor defaults apply.
parseOMLDataSetDescription = function(doc) {
  # all elements live directly below the root element
  path = function(element) paste0("/oml:data_set_description/oml:", element)

  default.target.attribute = xmlOValS(doc, path("default_target_attribute"))
  default.target.attribute = if (!is.null(default.target.attribute)) unlist(strsplit(default.target.attribute, ",")) else ""

  ignore.attribute = xmlOValsMultNsS(doc, path("ignore_attribute"))
  # stays NULL (and is filtered out below) if no ignore attribute is present
  ignore.attribute = if (!is.null(ignore.attribute)) unlist(strsplit(ignore.attribute, ","))

  args = filterNull(list(
    id = xmlRValI(doc, path("id")),
    name = xmlRValS(doc, path("name")),
    version = xmlRValS(doc, path("version")),
    description = xmlRValS(doc, path("description")),
    format = xmlRValS(doc, path("format")),
    creator = xmlOValsMultNsS(doc, path("creator")),
    contributor = xmlOValsMultNsS(doc, path("contributor")),
    collection.date = xmlOValS(doc, path("collection_date")),
    upload.date = xmlRValD(doc, path("upload_date")),
    language = xmlOValS(doc, path("language")),
    licence = xmlOValS(doc, path("licence")),
    url = xmlRValS(doc, path("url")),
    default.target.attribute = default.target.attribute,
    row.id.attribute = xmlOValS(doc, path("row_id_attribute")),
    ignore.attribute = ignore.attribute,
    version.label = xmlOValS(doc, path("version_label")),
    citation = xmlOValS(doc, path("citation")),
    visibility = xmlOValS(doc, path("visibility")),
    original.data.url = xmlOValS(doc, path("original_data_url")),
    # XML element names use underscores; only the R argument names use dots
    paper.url = xmlOValS(doc, path("paper_url")),
    update.comment = xmlOValS(doc, path("update_comment")),
    md5.checksum = xmlRValS(doc, path("md5_checksum")),
    status = xmlRValS(doc, path("status")),
    tags = xmlOValsMultNsS(doc, path("tag"))
  ))
  do.call(makeOMLDataSetDescription, args)
}
# Add the following code to your website.
# For more information on customizing the embed code, read Embedding Snippets.