#' Convert Variables to local R objects
#'
#' Crunch Variables reside on the server, allowing you to work with
#' datasets that are too big to bring into memory on your machine. Many
#' functions, such as `max`, `mean`, and [crtabs()], translate your commands
#' into API queries and return only the result. But, not every operation you'll
#' want to perform has been implemented on the Crunch servers. If you need to do
#' something beyond what is currently supported, you can bring a variable's
#' data into R with `as.vector(ds$var)` and work with it like any
#' other R vector.
#'
#' `as.vector` transfers data from Crunch to a local R session. Note:
#' `as.vector` returns the vector in the row order of the dataset. If filters
#' are set that specify an order that is different from the row order of the
#' dataset, the results will ignore that order. If you need the vector ordered
#' in that way, use syntax like `as.vector(ds$var)[c(10, 5, 2)]` instead.
#'
#' @param x a CrunchVariable
#' @param mode for Categorical variables, one of either "factor" (default,
#' which returns the values as factor); "numeric" (which returns the numeric
#' values); or "id" (which returns the category ids). If "id", values
#' corresponding to missing categories will return as the underlying integer
#' codes; i.e., the R representation will not have any `NA` elements. Otherwise,
#' missing categories will all be returned `NA`. For non-Categorical
#' variables, the `mode` argument is ignored.
#' @return an R vector of the type corresponding to the Variable. E.g.
#' CategoricalVariable yields type factor by default, NumericVariable yields
#' numeric, etc.
#' @seealso [as.data.frame][as.data.frame.CrunchDataset] for another interface
#' for (lazily) fetching data from the server as needed; [exportDataset()] for
#' pulling all of the data from a dataset.
#' @name as-vector
#' @aliases as.vector
NULL
setGeneric("as.vector")
#' @rdname as-vector
#' @export
setMethod("as.vector", "CrunchVariable", function(x, mode) {
f <- zcl(activeFilter(x))
# TODO: this will return in dataset order even if there is a filter is
# specified that is not in the same order as rows
# (eg as.vector(ds$v1[c(10:1)])) as.vector should re-order by default
# see CrunchDataFrame for one way this could be accomplished
columnParser(type(x))(
getValues(x, filter = toJSON(f, for_query_string = TRUE)),
x,
mode
)
})
#' @rdname as-vector
#' @export
setMethod("as.vector", "CrunchExpr", function(x, mode) {
payload <- list(query = toJSON(list(out = zcl(x)), for_query_string = TRUE))
if (length(x@filter)) {
payload[["filter"]] <- toJSON(x@filter, for_query_string = TRUE)
} else {
payload$filter <- "{}"
}
out <- paginatedGET(paste0(x@dataset_url, "table/"),
query = payload, table = TRUE, limit = .crunchPageSize(x)
)
## pass in the variable metadata to the column parser
variable <- VariableEntity(structure(list(body = out$metadata$out),
class = "shoji"
))
return(columnParser(out$metadata$out$type)(out$data$out, variable, mode))
})
parse_column <- list(
numeric = function(col, variable, mode) {
missings <- vapply(col, Negate(is.numeric), logical(1))
missings[names(col) %in% "?"] <- TRUE
col[missings] <- NA_real_
return(as.numeric(unlist(col)))
},
text = function(col, variable, mode) {
missings <- vapply(col, Negate(is.character), logical(1))
col[missings] <- NA_character_
return(as.character(unlist(col)))
},
categorical = function(col, variable, mode = NULL) {
vartype <- "categorical_factor"
## Deal with mode. Valid modes: factor (default), numeric, id
if (is.null(mode)) {
## Default from the base method is "any", which means nothing to us
mode <- "any"
}
if (mode == "numeric") {
vartype <- "categorical_numeric_values"
} else if (mode == "id") {
vartype <- "categorical_ids"
} else if (type(variable) == "categorical" && is.3vl(variable)) {
## Temporary: restrict on type==categorical so MRs don't get
## turned into logicals (until we're ready to flip that switch)
vartype <- "logical"
}
return(columnParser(vartype)(col, variable))
},
categorical_factor = function(col, variable, mode = NULL) {
out <- columnParser("numeric")(col)
cats <- na.omit(categories(variable))
out <- factor(names(cats)[match(out, ids(cats))], levels = names(cats))
return(out)
},
categorical_ids = function(col, variable, mode) {
missings <- vapply(col, is.list, logical(1)) ## for the {?:values}
col[missings] <- lapply(col[missings], function(x) x[["?"]])
return(as.numeric(unlist(col)))
},
categorical_numeric_values = function(col, variable, mode) {
out <- columnParser("numeric")(col)
cats <- na.omit(categories(variable))
out <- values(cats)[match(out, ids(cats))]
return(out)
},
logical = function(col, variable, mode = NULL) {
out <- columnParser("numeric")(col)
return(as.logical(out))
},
categorical_array = function(col, variable, mode) {
out <- columnParser("categorical")(unlist(col), variable, mode)
ncols <- length(tuple(variable)$subvariables)
nvals <- length(out)
out <- lapply(seq_len(ncols), function(iii) {
out[seq(iii, to = nvals, by = ncols)]
})
out <- as.data.frame(out)
if (namekey(variable) == "alias") {
names(out) <- aliases(subvariables(variable))
} else {
names(out) <- names(subvariables(variable))
}
return(out)
},
numeric_array = function(col, variable, mode) {
out <- columnParser("numeric")(unlist(col), variable, mode)
ncols <- length(tuple(variable)$subvariables)
nvals <- length(out)
out <- lapply(seq_len(ncols), function(iii) {
out[seq(iii, to = nvals, by = ncols)]
})
out <- as.data.frame(out)
if (namekey(variable) == "alias") {
names(out) <- aliases(subvariables(variable))
} else {
names(out) <- names(subvariables(variable))
}
return(out)
},
datetime = function(col, variable, mode) {
out <- columnParser("text")(col)
return(from8601(out))
}
)
columnParser <- function(vartype) {
if (vartype == "multiple_response") {
vartype <- "categorical_array"
}
return(parse_column[[vartype]] %||% parse_column[["numeric"]])
}
## Pulled to a function so that it can be mocked in tests
.categoricalPageSize <- function() 200000L
.crunchPageSize <- function(variable) {
## Determine a safe page size for paginating GET values/
categorical.size <- .categoricalPageSize()
if (is.variable(variable)) {
vartype <- type(variable)
if (is.Array(variable)) {
## It's effectively categorical, so let's just divide by number
## of subvars
return(categorical.size %/% length(subvariableURLs(tuple(variable))))
} else if (vartype == "categorical") {
return(categorical.size)
} else if (vartype %in% c("numeric", "datetime")) {
return(categorical.size %/% 2L)
} else if (vartype == "text") {
## Throttle aggressively
return(categorical.size %/% 40L)
}
} else {
## Crunch Expression. Probably fine, but let's throttle a bit to be safe
return(categorical.size %/% 4L)
}
}
getValues <- function(x, ...) {
paginatedGET(shojiURL(x, "views", "values"), list(...),
limit = .crunchPageSize(x)
)
}
paginatedGET <- function(url, query, offset = 0, limit = 1000, table = FALSE) {
## Paginate the GETting of values. Called both from getValues and in
## the as.vector.CrunchExpr method in expressions.R
query$offset <- offset
query$limit <- limit
out <- list()
keep.going <- TRUE
i <- 1
## Function to determine number of values received, depending on whether
## we have a crunch:table or shoji:view
if (table) {
len <- function(x) length(x$data$out)
} else {
len <- length
}
with(temp.option(scipen = 15), {
## Mess with scipen so that the query string formatter doesn't
## convert an offset like 100000 to '1+e05', which server rejects
while (keep.going) {
## Wrap the GET in a parser function, default no-op, so we can
## get data out of a crunch:table
out[[i]] <- crGET(url, query = query)
if (len(out[[i]]) < limit) {
keep.going <- FALSE
} else {
query$offset <- query$offset + limit
i <- i + 1
}
}
})
## Collect the result
if (table) {
out[[1]]$data$out <- unlist(lapply(out, function(x) x$data$out),
recursive = FALSE
)
out <- out[[1]]
} else {
out <- unlist(out, recursive = FALSE)
}
return(out)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.