# doRegroupings ----------------------------------------------------------------
#' Apply Regrouping of Values in a Data Frame
#'
#' @param Data data frame
#' @param regroup.actual default: settings$regroup.actual
#' @param regroup.config default: settings$regroup.config
#' @param settings list of settings that may contain the elements
#' \code{regroup.actual} and \code{regroup.config}
#' @param checkRemaining if TRUE (default) it is checked if all values that
#' occurred in a column to be regrouped have been considered in the regrouping
#' @param to.factor if \code{TRUE} the new values are converted to
#' \code{factor}. The default is \code{FALSE}.
#' @param to.numeric (default: \code{TRUE}, overrides \code{to.factor}!), passed
#' to \code{\link{regroup}}
#' @param dbg if \code{TRUE} (default) debug messages are shown
#'
#' @export
#'
doRegroupings <- function(
Data, regroup.actual = kwb.utils::selectElements(settings, "regroup.actual"),
regroup.config = kwb.utils::selectElements(settings, "regroup.config"),
settings = NULL, checkRemaining = TRUE, to.factor = FALSE, to.numeric = TRUE,
dbg = TRUE
)
{
#checkRemaining=TRUE;dbg=TRUE
if (kwb.utils::isNullOrEmpty(regroup.actual)) {
message("No regroupings specified -> nothing to do.")
}
#i<-13;actual <- regroup.actual[[i]]
skipped <- character()
for (i in seq_along(regroup.actual)) {
actual <- regroup.actual[[i]]
if (actual$from %in% names(Data)) {
message_if(dbg, sprintf("%02d. %s", i, actual$to))
Data <- applyRegrouping(
Data,
actual,
regroup.config,
to.factor = to.factor,
to.numeric = to.numeric,
dbg = dbg
)
} else {
skipped <- c(skipped, kwb.utils::hsQuoteChr(actual$from))
}
} # end of for (actual in regroup.actual)
message_if(
dbg && (n <- length(skipped)) > 0L,
sprintf("%d actual regroupings have been skipped ", n),
"since the following columns were missing:\n- ",
paste(unique(skipped), collapse = "\n- ")
)
Data
}
# applyRegrouping --------------------------------------------------------------
applyRegrouping <- function(
Data, actual, regroup.config, to.factor, to.numeric, dbg
)
{
kwb.utils::catIf(dbg, sprintf(
paste(
"Creating column '%s' from column '%s' using",
"config '%s' with %s ...\n"
),
actual$to, actual$from, actual$name, actual$labels
))
values <- kwb.utils::selectColumns(Data, actual$from)
config <- regroup.config[[actual$name]]
# Check for (and message about!) untreated values only if dbg is TRUE
if (! is.null(config)) {
values <- regroupedValues(
values = values,
config = c(config, list(checkRemaining = dbg)),
labels = actual$labels,
to.factor = to.factor,
to.numeric = to.numeric,
dbg = dbg
)
} else {
message_if(dbg, sprintf(
"No config '%s' available -> Just copying...", actual$name
))
}
Data[[actual$to]] <- values
kwb.utils::catIf(dbg, sprintf("-- Column '%s' ok.\n", actual$to))
Data
}
# regroupedValues --------------------------------------------------------------
#' Regroup Values According to Configuration
#'
#' @param values vector of values
#' @param config configuration (list) describing how to regroup. If the list
#' contains an element \code{breaks} the function \code{\link{groupByBreaks}}
#' is called to group values together that belong to the same intervals that
#' are defined by the breaks. Otherwise the list must contain an element
#' \code{values} and an element of the name given in \code{labels} (default:
#' "labels1"). These are given to the function \code{\link{regroup}} that
#' performs a "value to label"-regrouping.
#' @param labels default: "labels1"
#' @param to.factor if \code{TRUE} the new values are converted to
#' \code{factor}. The default is \code{FALSE}.
#' @param to.numeric (default: \code{TRUE}, overrides \code{to.factor}!), passed
#' to \code{\link{regroup}}
#' @param dbg if \code{TRUE} (default) debug messages are shown
#'
#' @export
#'
regroupedValues <- function(
values, config = NULL, labels = "labels1", to.factor = FALSE,
to.numeric = TRUE, dbg = TRUE
)
{
# If config is NULL or an empty list, return the original values
if (is.null(config) || length(config) == 0) {
kwb.utils::catIf(dbg, sprintf(
"Returning original values for %s.\n", deparse(substitute(values))
))
return (values)
}
# If no breaks are given, use regroup
if (is.null(config$breaks)) {
assignments <- toAssignments2(
values = kwb.utils::selectElements(config, "values"),
labels = kwb.utils::selectElements(config, labels)
)
values.new <- regroup(
x = values,
assignments = assignments$assignments,
ignore.case = config$ignore.case, # may be NULL
to.factor = to.factor,
to.numeric = to.numeric
)
# check for untreated values if not specified differently
check <- kwb.utils::defaultIfNULL(config$checkRemaining, TRUE)
if (check) {
kwb.utils::catIf(dbg, "-- Checking for untreated values... ")
ok <- messageOnRemaining(
x = sort(unique(values)), assignments = assignments
)
kwb.utils::catIf(dbg && ok, "ok.\n")
}
} else { # If breaks are given, use groupByBreaks
values.new <- groupByBreaks(
x = values,
breaks = kwb.utils::selectElements(config, "breaks"),
values = kwb.utils::selectElements(config, labels),
right = ifelse (is.null(config$right), TRUE, config$right),
to.factor = to.factor
)
}
values.new
}
# toAssignments2 ---------------------------------------------------------------
toAssignments2 <- function(values, labels)
{
groups <- unique(labels)
assignments <- lapply(groups, FUN = function(group) values[labels == group])
list(assignments = stats::setNames(assignments, nm = groups))
}
# regroup ----------------------------------------------------------------------
#' Assign Values to Groups of Values
#'
#' @param x vector of values
#' @param assignments list of assignments of the form \<key\> = \<values\> with
#' \<values\> being a vector of elements to be looked up in \code{x} and to be
#' replaced with \<key\> in the output vector
#' @param ignore.case if \code{TRUE} the case is ignored when comparing values
#' @param to.factor if \code{TRUE} the new values are converted to
#' \code{factor}. The default is \code{FALSE}.
#' @param to.numeric if \code{TRUE} (the default!) and independent of
#' \code{to.factor} (!) the returned values are converted to numeric values if
#' all assigned (even though string) values "look like" numeric values, such
#' as "1", "2", "3.4", "5.67".
#' @return vector with as many elements as there are elements in \code{x}. The
#' vector contains \<key\> at positions where the elements in \code{x} appeared
#' in the vector \<values\> of a \<key\> = \<values\> assignment of
#' \code{assignments}
#'
#' @export
#'
#' @examples
#' regroup(c("A", "B", "C", "D"), assignments = list(
#' "AB" = c("A", "B"),
#' "CD" = c("C", "D")
#' ))
#'
#'
#' x <- c("A", "B", "C", "D", "E", "A")
#' assignments <- list(
#' "1" = c("A", "B"),
#' "2" = c("C", "D")
#' )
#'
#' regroup(x, assignments)
#'
#' # to.factor is ignored...
#' regroup(x, assignments, to.factor = TRUE)
#'
#' # ... unless to.numeric is FALSE!
#' regroup(x, assignments, to.factor = TRUE, to.numeric = FALSE)
#'
regroup <- function(
x, assignments, ignore.case = NULL, to.factor = FALSE, to.numeric = TRUE
)
{
# Set default for ignore.case
ignore.case <- kwb.utils::defaultIfNULL(ignore.case, FALSE)
xnew <- rep(NA, times = length(x))
keys <- names(assignments)
for (key in keys) {
matching <- if (ignore.case) {
tolower(x) %in% tolower(assignments[[key]])
} else {
x %in% assignments[[key]]
}
if (any(matching)) {
xnew[matching] <- key
}
}
typeConverted(xnew, to.factor, factorLevels = keys, to.numeric = to.numeric)
}
# typeConverted ----------------------------------------------------------------
typeConverted <- function(x, to.factor, factorLevels, to.numeric = TRUE)
{
# Convert "<NA>" to NA
x[x == "<NA>"] <- NA
# If all new values look like numeric convert to numeric
if (isTRUE(to.numeric) &&
all(kwb.utils::hsValidValue(kwb.utils::hsTrim(x), lng = "en"))) {
as.numeric(x)
} else if (isTRUE(to.factor)) {
factor(x, factorLevels)
} else {
x
}
}
# messageOnRemaining -----------------------------------------------------------
messageOnRemaining <- function(x, assignments)
{
remaining <- setdiff(x, unlist(assignments, use.names = FALSE))
ok <- (length(remaining) == 0L)
message_if(
! ok,
"Untreated value(s): ", kwb.utils::stringList(remaining, qchar = '"')
)
ok
}
# groupByBreaks ----------------------------------------------------------------
#' Group Values Belonging to Intervals
#'
#' Group values together that belong to the same intervals being defined by
#' breaks
#'
#' @param x vector of values or a data frame. If \code{x} is a data frame, the
#' function is applied to each column given in \code{columns} (all numeric
#' columns by default)
#' @param breaks vector of breaks
#' @param values values to be assigned
#' @param right if TRUE the intervals are right-closed, else left-closed.
#' @param add.Inf.limits if TRUE (default), -Inf and Inf are added to the left
#' and right, respectively, of \code{breaks}
#' @param to.factor if \code{TRUE} the new values are converted to
#' \code{factor}. The default is \code{FALSE}.
#' @param columns \code{NULL} or vector of column names (if \code{x} is a data
#' frame)
#' @param keyFields \code{NULL} or vector of column names (if \code{x} is a data
#' frame). If not \code{NULL}, a data frame with these columns coming first
#' and the interval labels in the last column is returned.
#'
#' @export
#'
#' @examples
#' groupByBreaks(1:10, breaks = 5, values = c("<= 5", "> 5"))
#' groupByBreaks(1:10, breaks = 5, right = FALSE, values = c("< 5", ">= 5"))
#'
#' # Prepare a simple data frame
#' x <- kwb.utils::noFactorDataFrame(
#' id = c("A", "B", "C"),
#' value = c(10, 20, 30)
#' )
#'
#' # Keep the ID column of the data frame
#' groupByBreaks(x, breaks = 20, keyFields = "id")
#'
groupByBreaks <- function(
x, breaks, values = breaksToIntervalLabels(breaks), right = TRUE,
add.Inf.limits = TRUE, to.factor = FALSE, columns = NULL, keyFields = NULL
)
{
# If x is a data frame, apply this function to each given or numeric
# column and return
if (is.data.frame(x)) {
basis <- if (! is.null(keyFields)) {
kwb.utils::selectColumns(x, keyFields, drop = FALSE)
}
if (is.null(columns)) {
columns <- names(which(sapply(x, is.numeric)))
}
categoricals <- lapply(
x[columns], groupByBreaks, breaks = breaks, values = values,
right = right, add.Inf.limits = add.Inf.limits, to.factor = to.factor
)
categoricals <- kwb.utils::asNoFactorDataFrame(categoricals)
return(cbind(basis, categoricals))
}
if (add.Inf.limits) {
breaks <- c(-Inf, breaks, Inf)
}
groups <- data.frame(
row = seq_along(x),
groupnumber = cut(x = x, breaks = breaks, labels = FALSE, right = right)
)
### TODO: use cut with labels = values instead of merge!
#cut(x = x, breaks = breaks, labels = values, right = right)
### cut gives a warning if there are duplicated values in "labels"...
duplicates <- values[duplicated(values)]
if (length(duplicates) > 0) {
message(
"groupByBreaks: There are duplicated values (",
kwb.utils::stringList(duplicates), ") in the labels to be given: ",
kwb.utils::stringList(values),
".\n-> Can you modify the breaks to prevent this?"
)
}
groupToValue <- data.frame(
groupnumber = seq_len(length(breaks) - 1),
values = values,
stringsAsFactors = FALSE
)
result <- merge(groups, groupToValue, sort = FALSE, all.x = TRUE)
xnew <- result$values[order(result$row)]
typeConverted(xnew, to.factor, factorLevels = values)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.