Nothing
#' Check and update schema descriptions
#'
#' This function takes a raw schema description and updates values that were
#' only given as wildcard or implied values. It is automatically called by
#' \code{reorganise}, but can also be used in concert with the getters to debug
#' a schema.
#' @param input [\code{data.frame(1)}]\cr an input for which to check a schema
#' description.
#' @param schema [\code{symbol(1)}]\cr the schema description.
#' @details The core idea of a schema description is that it can be written in a
#' very generic way, as long as it describes sufficiently where in a table
#' what variable can be found. A very generic way can be via using the
#' function \code{\link{.find}} to identify the initially unknown
#' cell-locations of a variable on-the-fly, for example when it is merely
#' known that a variable must be in the table, but not where it is.
#'
#' \code{validateSchema} matches a schema with an input table and inserts the
#' accordingly evaluated positions (of clusters, filters and variables),
#' adapts some of the meta-data and ensures formal consistency of the schema.
#' @return An updated schema description
#' @examples
#' # build a schema for an already tidy table
#' (tidyTab <- tabs2shift$tidy)
#'
#' schema <-
#' setIDVar(name = "territories", col = 1) %>%
#' setIDVar(name = "year", col = .find(pattern = "period")) %>%
#' setIDVar(name = "commodities", col = 3) %>%
#' setObsVar(name = "harvested", col = 5) %>%
#' setObsVar(name = "production", col = 6)
#'
#' # before ...
#' schema
#'
#' # ... after
#' validateSchema(schema = schema, input = tidyTab)
#'
#' @importFrom checkmate assertNames assertClass assertNumeric
#' @importFrom rlang is_quosure
#' @importFrom dplyr mutate across ungroup n right_join
#' @importFrom tidyr replace_na everything
#' @importFrom purrr map_int map_lgl map
#' @importFrom methods new
#' @export
validateSchema <- function(schema = NULL, input = NULL){
assertDataFrame(x = input)
assertClass(x = schema, classes = "schema")
filter <- schema@filter
groups <- schema@groups
tabDim <- dim(input)
variables <- schema@variables
# 1. complete cluster information ----
clusters <- schema@clusters
# set cluster start if it is NULL or a qousure
if(is.null(clusters$row)){
clusters$row <- 1
} else if(is.list(clusters$row)){
clusters$row <- .eval_find(input = input, row = clusters$row, clusters = clusters)
}
if(is.null(clusters$col)){
clusters$col <- 1
} else if(is.list(clusters$col)){
clusters$col <- .eval_find(input = input, col = clusters$col, clusters = clusters)
}
if(is.null(clusters$width)){
nPos <- table(clusters$col)
dist <- diff(c(unique(clusters$col), tabDim[2]+1))
clusters$width <- rep(dist, times = nPos)
}
if(is.null(clusters$height)){
if(length(clusters$row) > 1){
nPos <- table(clusters$row)
dist <- diff(c(unique(clusters$row), tabDim[1]+1))
clusters$height <- rep(dist, times = nPos)
} else {
clusters$height <- tabDim[1]+1 - min(clusters$row)
}
}
nClusters <- max(lengths(clusters))
if(nClusters == 0) nClusters <- 1
# make sure that all elements occur the same number of times
clusters$row <- rep(x = clusters$row, length.out = nClusters)
clusters$col <- rep(x = clusters$col, length.out = nClusters)
clusters$width <- rep(x = clusters$width, length.out = nClusters)
clusters$height <- rep(x = clusters$height, length.out = nClusters)
# 2. evaluate filter ----
allRows <- 1:dim(input)[1]
if(!is.null(filter$row)){
filter$row <- .eval_find(input = input, row = filter$row)
}
if(!is.null(filter$col)){
filter$col <- .eval_find(input = input, col = filter$col)
}
# 3. adjust variables ----
outsideCluster <- filterOut <- isAbs <- NULL
selectRows <- selectCols <- idCols <- NULL
clusterID <- clusters$id
groupID <- clusters$group
# first, evaluate whether any variable other than clusterID or groupID has a 'row' set
headerRows <- map(.x = seq_along(variables), .f = function(ix){
tempName <- names(variables)[ix]
if(!tempName %in% c(groupID, clusterID)){
temp <- variables[[ix]]
if(temp$type == "observed"){
temp$row
} else {
NULL
}
}
})
headerRows <- unlist(headerRows, use.names = FALSE)
for(i in seq_along(variables)){
varProp <- variables[[i]]
varName <- names(variables)[i]
# resolve quosures from grep-ing unknown col/rows ----
if(!is.null(varProp$row)){
if(is.list(varProp$row)){
varProp$row <- .eval_find(input = input, row = varProp$row, clusters = clusters)
# ignore header rows
varProp$row <- varProp$row[!varProp$row %in% headerRows]
}
}
if(!is.null(varProp$col)){
if(is.list(varProp$col)){
varProp$col <- .eval_find(input = input, col = varProp$col, clusters = clusters)
}
}
# check whether the variable is wide ----
if(varProp$type == "observed"){
isWide <- map_lgl(.x = seq_along(idCols), function(ix){
if(length(varProp$col) == length(idCols[[ix]])){
all(varProp$col == idCols[[ix]])
} else {
FALSE
}
})
if(any(isWide) & is.null(varProp$key)){
varProp$key <- 0
varProp$value <- "{all_rows}"
}
}
# figure out which rows to filter out
if(!varProp$dist & !varName %in% c(groupID, clusterID)){
if(varProp$type == "observed"){
if(is.null(varProp$row)){
if(is.null(varProp$key)){
varProp$row <- clusters$row
} else {
varProp$row <- 1
}
}
}
if(!is.null(varProp$row)){
if(is.null(names(filter$row[[1]]))){
filterOut <- sort(unique(c(filterOut, varProp$row)))
}
}
}
if(varProp$type == "id"){
if(!is.null(varProp$val)){
varProp$dist <- TRUE
}
idCols <- c(idCols, list(varProp$col))
}
# identify all selected columns ----
selectCols <- unique(c(selectCols, varProp$col))
# make sure that all elements occur the same number of times ----
if(!is.null(varProp$row)){
if(length(varProp$row) == 1){
varProp$row <- rep(x = varProp$row, length.out = nClusters)
}
if(any(varName == groupID)){
varProp$row <- varProp$row[clusters$member]
}
}
if(any(varName == groupID)){
if(!is.null(varProp$row)){
varProp$col <- rep(x = varProp$col, length.out = length(varProp$row))
}
} else {
if(!is.null(varProp$col)){
if(length(varProp$col) == 1){
varProp$col <- rep(x = varProp$col, length.out = nClusters)
}
}
}
# make sure that cluster or group IDs are set to NA ----
# that their rows can be recognised as removable, in case there is nothing
# else in that row
if(any(varName %in% c(clusterID, groupID))){
for(j in seq_along(varProp$col)){
input[varProp$row[j], varProp$col[j]] <- NA
}
}
# adapt rows and columns if there are groups ----
varProp$row <- .eval_sum(input = input, groups = groups,
data = varProp$row)
variables[[i]] <- varProp
names(variables)[i] <- varName
}
# 4. remove empty rows ----
testRows <- input[,selectCols]
emptyRows <- which(rowSums(is.na(testRows)) == ncol(testRows))
# 5. adapt filter and cluster position to groups ----
clusters$row <- .eval_sum(input = input, groups = groups,
data = clusters$row)
clusters$height <- .eval_sum(input = input, groups = groups,
data = clusters$height)
filterOut <- .eval_sum(input = input, groups = groups,
data = filterOut)
allRows <- .eval_sum(input = input, groups = groups,
data = allRows)
if(!is.null(filter$row)){
filter$row <- filter$row[filter$row %in% sort(unique(allRows[!allRows %in% c(filterOut, emptyRows)]))]
filter$row <- .eval_sum(input = input, groups = groups,
data = filter$row)
} else {
filter$row <- sort(unique(allRows[!allRows %in% c(filterOut, emptyRows)]))
}
# 6. write it all ----
out <- new(Class = "schema",
clusters = clusters,
format = schema@format,
groups = schema@groups,
filter = filter,
variables = variables,
validated = TRUE)
return(out)
}
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.