Nothing
#' Get a catalog of datasets
#'
#' Crunch datasets are collected in folders called "projects". `datasets()` can
#' be used to filter a project's contents to see only datasets (and not other
#' projects). You can also use it to pull a catalog of datasets from search
#' results.
#'
#' The `datasets()<-` assignment function provides an alternative method for
#' moving a dataset into a project. This may be more convenient in some cases
#' than using [mv()].
#' @param x a `ProjectFolder` or `SearchResults` that may contain datasets
#' @param value For assignment, a `CrunchDataset` to move
#' @return When `x` is a `ProjectFolder`, `datasets()` returns the folder with
#' its "index" filtered to contain only datasets; otherwise, it returns an
#' object of class `DatasetCatalog`. The assignment function returns the
#' project `x` with the given dataset added to it.
#' @name datasets
#' @export
#' @examples
#' \dontrun{
#' # Get the names of the datasets contained in a project
#' projects() %>%
#' cd("Important Clients") %>%
#' datasets() %>%
#' names()
#' # The assignment method lets you move a dataset to a project
#' proj <- cd(projects(), "Important Clients")
#' ds <- loadDataset("New important client survey", project = "Studies")
#' datasets(proj) <- ds
#' }
datasets <- function(x = getAPIRoot()) {
    # TODO: deprecate the dataset catalog, i.e. `datasets()`

    ## Search results: wrap their `datasets` element in a minimal
    ## "shoji"-classed list, which is close enough to a dataset catalog
    if (inherits(x, "SearchResults")) {
        search_index <- structure(list(index = x$datasets), class = "shoji")
        return(DatasetCatalog(search_index))
    }

    ## ProjectFolder: hand back the folder itself, subset to the entries
    ## whose type is "dataset"
    ## TODO: explicit test, though this is called in loadDataset
    if (is.project(x)) {
        is_dataset <- types(x) %in% "dataset"
        return(x[is_dataset])
    }

    ## Anything else: GET the "all" resource under x's datasets catalog URL
    all_datasets_url <- paste0(shojiURL(x, "catalogs", "datasets"), "all")
    DatasetCatalog(crGET(all_datasets_url))
}
#' Get the names of datasets in a project
#'
#' `listDatasets()` is a convenience function for quickly seeing what datasets
#' are in a project. It is equivalent to `names(datasets(proj))`, with some
#' additional optional arguments.
#'
#' @param kind character specifying whether to look in active, archived, or all
#' datasets. Default is "active", i.e. non-archived.
#' @param project `ProjectFolder` entity, character name of a project, or
#' `NULL`, the default. If a Project entity or reference is supplied, the
#' function will list the datasets in that project. If `NULL`,
#' your personal folder will be used.
#' @param refresh logical: should the function check the Crunch API for new
#' datasets? Default is FALSE.
#' @param shiny deprecated, no longer works
#' @return A character vector of dataset names, each of which would be a valid
#' input for [loadDataset()]
#' @export
listDatasets <- function(kind = c("active", "all", "archived"),
                         project = NULL,
                         refresh = FALSE,
                         shiny = FALSE) {
    ## `shiny` is kept in the signature only so that old calls fail loudly
    if (shiny) {
        halt("listDatasets(shiny = TRUE) is no longer supported")
    }

    ## Validate `kind` first so that a bad value errors before the cache is
    ## dropped or any request is made
    kind <- match.arg(kind)

    ## Keep the call so the error below can report what the caller supplied
    Call <- match.call()

    if (refresh) {
        dropDatasetsCache()
    }

    if (is.null(project)) {
        warn_once(
            "As of crunch 1.26.0, listDatasets() with no project specified ",
            "only lists your 'personal' datasets (those that you created) ",
            "and not those that were shared with you.",
            option = "crunch.list.personal.msg"
        )
        ## "~" is the path used for the personal folder
        project <- "~"
    }
    if (is.character(project)) {
        ## Resolve a path/name to a project by walking from the top level
        project <- cd(projects(), project)
    }
    if (!is.project(project)) {
        halt(
            "Project ", deparseAndFlatten(eval.parent(Call$project)),
            " is not valid"
        )
    }

    ## Grab just the datasets, then subset as indicated ("all" keeps
    ## everything)
    dscat <- datasets(project)
    dscat <- switch(kind,
        active = active(dscat),
        archived = archived(dscat),
        dscat
    )
    names(dscat)
}
#' Load a Crunch Dataset
#'
#' This function gives you a Dataset object, which refers to a dataset hosted on
#' the Crunch platform. With this Dataset, you can perform lots of data cleaning
#' and analysis as if the dataset were fully resident on your computer, without
#' having to pull data locally.
#'
#' You can specify a dataset to load by its human-friendly "name", within
#' the project (folder) to find it in. This makes code more
#' readable, but it does mean that if the dataset is renamed or moved to a
#' different folder, your code may no longer work. The fastest, most reliable
#' way to use `loadDataset()` is to provide a URL to the dataset--the dataset's
#' URL will never change.
#'
#' @param dataset character, the path to a Crunch dataset to load, or a
#' dataset URL. If `dataset` is a path to a dataset in a project, the path will
#' be parsed and walked, relative to `project`, and the function will look
#' for the dataset inside that project. If `dataset` is just a string and `project`
#' is set to `NULL`, the function will assume that `dataset` is the dataset id.
#' @param kind character specifying whether to look in active, archived, or all
#' datasets. Default is "active", i.e. non-archived.
#' @param project `ProjectFolder` entity, character name (path) to a project.
#' Defaults to the project set in `envOrOption('crunch.default.project')`
#' or "./" (the project root), if the default is not set.
#' @param refresh logical: should the function check the Crunch API for new
#' datasets? Default is `FALSE`.
#' @return An object of class `CrunchDataset`.
#'
#' @examples
#' \dontrun{
#' ds <- loadDataset("A special dataset", project = "Studies")
#' ds2 <- loadDataset("~/My dataset", project = "Studies")
#' ds3 <- loadDataset("My dataset", project = projects()[["Studies"]]) # Same as ds2
#' ds4 <- loadDataset("https://app.crunch.io/api/datasets/bd3ad2/")
#' }
#' @export
#' @seealso See [cd()] for details of parsing and walking dataset folder/project
#' paths.
loadDataset <- function(dataset,
                        kind = c("active", "all", "archived"),
                        project = defaultCrunchProject("."),
                        refresh = FALSE) {
    ## A catalog tuple already points at exactly one dataset
    if (inherits(dataset, "DatasetTuple")) {
        return(entity(dataset))
    }
    if (refresh) {
        dropDatasetsCache()
    }

    if (is.character(dataset)) {
        if (is.crunchURL(dataset)) {
            ## Just load it, no other querying needed
            return(loadDatasetFromURL(dataset))
        }
        ## Validate `kind` before the lookup so a bad value fails fast
        kind <- match.arg(kind)
        ## See if "dataset" is a path or name
        found <- lookupDataset(dataset, project = project)
        ## Subset as indicated.
        found <- switch(kind,
            active = active(found),
            all = found,
            archived = archived(found)
        )
        if (length(found) == 0) {
            if (missing(project)) {
                warn_once(
                    "Finding datasets by name without specifying a path is no longer supported.",
                    option = "find.dataset.no.project"
                )
            }
            halt(dQuote(dataset), " not found")
        }
        ## This odd selecting behavior handles the multiple matches case:
        ## take whatever is stored under the first name
        out <- found[[names(found)[1]]]
        if (!is.dataset(out)) {
            ## There is inconsistency btw DatasetCatalog and ProjectFolder
            out <- entity(out)
        }
        return(out)
    } else if (is.whole(dataset)) {
        warning(
            "'dataset' should be a character dataset name, path, or URL. ",
            "Loading by numeric index is deprecated."
        )
        ## Translate the index to a name, then load by name
        dsname <- listDatasets(kind = kind, project = project)[dataset]
        if (is.na(dsname)) {
            halt("subscript out of bounds")
        }
        return(loadDataset(
            dsname,
            kind = kind,
            project = project
        ))
    } else {
        ## class() can return several classes; collapse them so the message
        ## does not run them together (e.g. "tbl_dftbldata.frame")
        halt(
            "'dataset' should be a character dataset name, path, or URL, ",
            "not an object of class ", paste(class(dataset), collapse = "/")
        )
    }
}
is.crunchURL <- function(x) {
    # Only a single character string can qualify
    if (!is.character(x) || length(x) != 1L) {
        return(FALSE)
    }
    # Accept anything starting with "http", or with "/api/": the latter is for
    # redacted mocks that prune the scheme://host. The /api/ check is not used
    # on its own because web app URLs don't include /api/.
    # Note that URLs to resources inside a dataset, such as
    # /api/datasets/123/variables/, also pass; call `datasetReference()` on
    # them to get a dataset URL.
    grepl("^http|^/api/", x) # nolint
}
loadDatasetFromURL <- function(url) {
    ## Load a dataset straight from its URL, without touching a dataset
    ## catalog
    has_api_segment <- grepl("/api/", url) # nolint
    if (!has_api_segment) {
        ## It's a web app URL, probably. Turn it into an API URL
        url <- datasetReference(url)
    }
    ds <- CrunchDataset(crGET(url))
    ## Attach a tuple built from the entity itself: its own URL, its body,
    ## and its parent catalog URL
    tuple(ds) <- DatasetTuple(
        entity_url = self(ds),
        body = ds@body,
        index_url = shojiURL(ds, "catalogs", "parent")
    )
    ds
}
lookupDataset <- function(x, project = NULL) {
    # x is assumed to be either a dataset name or a path to a dataset by name
    # return is a dataset catalog with dataset tuples matching that name,
    # potentially scoped to a specified project

    # First, see if there is a path, and if so, walk it, possibly relative to
    # `project`. The last path segment is the dataset name (or id).
    dspath <- parseFolderPath(x)
    x <- tail(dspath, 1)
    if (length(dspath) == 1 && is.null(project)) {
        # This code path used to use the datasets by_name endpoint. However
        # As of 2024-11, that endpoint is no longer very useful because it only
        # surfaces datasets that are in personal folders (going away very soon) &
        # direct dataset shares (deprecated).
        # So we use this to load by dataset id, a nice convenience feature.
        # To get here, a user had to explicitly set `project=NULL` so they're
        # presumably not here accidentally
        pseudo_shoji <- tryCatch({
            ds_base_url <- absoluteURL("datasets/", envOrOption("crunch.api"))
            # NOTE(review): "datasets/" already ends in a slash and another
            # "/" is added here; if absoluteURL() keeps the trailing slash this
            # requests ".../datasets//<id>" -- confirm that is intended
            ds_entity <- crGET(paste0(ds_base_url, "/", x))
            # Need to make this pseudo DatasetCatalog to match old API call (because `loadDataset`
            # will later pass it through `active()`/`archived()`)
            structure(list(
                self = ds_base_url,
                index = setNames(list(ds_entity$body), ds_entity$self)
            ), class = "shoji")
        }, error = function(...) NULL) # But if we don't find it just return empty catalog
        return(DatasetCatalog(pseudo_shoji))
    }
    # Resolve `project`: NULL means start from the top-level projects; anything
    # that is not already a project is resolved to a URL and fetched
    if (is.null(project)) {
        project <- projects()
    } else if (!is.project(project)) {
        project <- ProjectFolder(crGET(resolveProjectURL(project)))
    }
    # Walk every path segment except the last one, which is the dataset name
    if (length(dspath) > 1) {
        project <- cd(project, dspath[-length(dspath)])
    }
    # Filter the project folder to be only datasets matching this name
    datasets(project[names(project) == x])
}
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.