# checkGrouping ----------------------------------------------------------------
#' Compare Two Columns of a Data Frame (Raw Vs Regrouped)
#'
#' @param data data frame
#' @param column_raw name of column in \code{data} containing original (raw)
#' values
#' @param column_cat name of column in \code{data} containing the result of
#' regrouping the raw value into categories
#'
checkGrouping <- function(data, column_raw, column_cat)
{
# Select only the two columns to be compared (1: raw, 2: cat)
data <- kwb.utils::selectColumns(data, c(column_raw, column_cat))
# Show the unique value combinations
kwb.utils::printIf(
TRUE, kwb.utils::resetRowNames(unique(data[order(data[, 1]), ])),
caption = "Raw values and assigned categories"
)
# Show the levels of the categoised variable
kwb.utils::printIf(
TRUE, levels(data[, 2]),
caption = "\nLevels of category factor"
)
}
# checkNumberOfUnique ----------------------------------------------------------
#' Show Number of Unique Values in Selected Columns
#'
#' @param data data frame
#' @param columns names of columns in \code{data} for which to count unique
#' values
#'
checkNumberOfUnique <- function(data, columns = names(data))
{
cat("Number of unique values in column\n")
for (column in columns) {
cat(sprintf(
"- %s: %d\n", column, length(unique(kwb.utils::selectColumns(data, column)))
))
}
}
# dataFramesToTextMatrix -------------------------------------------------------
#' Convert List of Data Frames to Character Matrix
#'
#' In the character matrix the data frames appear one below the other. Each
#' data frame has a header and each data frame is separated from the following
#' data frame by an empty row.
#'
#' @param data_frames list of data frames
#'
#' @export
#'
#' @examples
#' data_frames <- list(
#' data.frame(a = 1:3, b = 2:4),
#' data.frame(a = 1:5, b = 2:6, c = 3:7)
#' )
#'
#' dataFramesToTextMatrix(data_frames)
#'
dataFramesToTextMatrix <- function(data_frames)
{
stopifnot(is.list(data_frames))
stopifnot(all(sapply(data_frames, inherits, "data.frame")))
ncol <- max(sapply(data_frames, ncol))
nrows <- sapply(data_frames, nrow)
nrow <- sum(nrows) + 2 * length(data_frames) - 1
text_matrix <- matrix("", nrow, ncol)
offsets <- c(0, cumsum(nrows + 2))
for (i in seq_along(data_frames)) {
from_row <- 1 + offsets[i]
to_row <- from_row + nrows[i]
text_area <- as.matrix(data_frames[[i]])
text_matrix[from_row, seq_len(ncol(text_area))] <- names(data_frames[[i]])
text_matrix[(from_row + 1):to_row, seq_len(ncol(text_area))] <- text_area
}
text_matrix
}
# fillUpNA ---------------------------------------------------------------------
#' Fill NA in First Vector With Values From Second Vector
#'
#' @param x first vector
#' @param y second vector
#' @param dbg if \code{TRUE} a debug message is shown
#' @param name_x name of x
#' @param name_y name of y
#' @return \code{x} with \code{NA} replaced by the values in \code{y} at the
#' same positions
#'
fillUpNA <- function(x, y, dbg = TRUE, name_x = NULL, name_y = NULL)
{
name_x <- getname(name_x, substitute(x))
name_y <- getname(name_y, substitute(y))
stopifnot(length(x) == length(y))
is_na <- is.na(x)
number_of_na_before <- sum(is_na)
x[is_na] <- y[is_na]
kwb.utils::catIf(dbg, sprintf(
"%d NAs in '%s' filled up with values from '%s'\n",
number_of_na_before - sum(is.na(x)), name_x, name_y
))
x
}
# getChangesOfDuplicates -------------------------------------------------------
#' Get Changes of Rows That Are Duplicated in Selected Columns
#'
#' @param df a data frame
#' @param columns names of columns in \code{df} in which to look for duplicate
#' value combinations
#' @param add_columns names of additional columns that shall appear in the
#' output even if there are no changes in these columns
#'
#' @return list of data frames. The list has as many elements as there are
#' different value combinations in \code{columns} that appear more than once
#' in \code{df}. Each element is a data frame with all rows from \code{df}
#' that have the same value combination in \code{columns}. By default the data
#' frame contains the columns given in \code{columns} and those columns out of
#' \code{df} in which there is at least one change over the values in the
#' different rows.
#'
#' @export
#'
#' @examples
#' df <- data.frame(
#' id = 1:7,
#' name = c("one", "one", "two", "two", "three", "three", "three"),
#' type = c("A", "A", "B", "C", "D", "D", "D"),
#' size = c(10, 11, 12, 12, 13, 13, 14),
#' height = c(1, 1, 2, 3, 4, 4, 5)
#' )
#'
#' df
#'
#' getChangesOfDuplicates(df, "name")
#' getChangesOfDuplicates(df, c("name", "type"))
#'
getChangesOfDuplicates <- function(df, columns, add_columns = columns)
{
fetch <- function(df, cols) kwb.utils::selectColumns(df, cols, drop = FALSE)
candidates <- fetch(df, columns)
groups <- unique(candidates[duplicated(candidates), , drop = FALSE])
groups <- kwb.utils::fullySorted(groups)
groups[[".group"]] <- seq_len(nrow(groups))
y <- merge(df, groups, by = columns)
if (any(is.na(y[[".group"]]))) {
stop("Unexpected error in getChangesOfDuplicates(): .group is NA!")
}
#y <- y[! is.na(y[[".group"]]), , drop = FALSE]
nm <- names(y)
cols <- c(columns, add_columns)
result <- lapply(split(y, y[[".group"]]), function(x) {
fetch(x, unique(c(cols, nm[! sapply(x, kwb.utils::allAreEqual)])))
})
lapply(unname(result), count_unique)
}
# getYearFromColumn ------------------------------------------------------------
#' Get Integer Year Number from Column
#'
#' @param data data frame
#' @param column representing a date or date and time
#'
#' @return vector of integer as long as the number of rows in \code{data}
#'
#' @export
#'
getYearFromColumn <- function(data, column)
{
values <- kwb.utils::selectColumns(data, column)
if (inherits(values, "Date")) {
as.integer(format(values, "%Y"))
} else {
result <- as.integer(substr(values, 1, 4))
year_range <- range(unique(stats::na.omit(result)))
warning(
"Found text when I was expecting date. Returning first four characters:",
year_range[1], "-", year_range[2]
)
result
}
}
# overwriteIfNotNA -------------------------------------------------------------
#' Use Non-NA Values from Source Column in Target Column
#'
#' Overwrite the values in the target column with the values in the source
#' column at indices where the values in the source column are not NA
#'
#' @param data data frame
#' @param target_column name of target column
#' @param source_column name of source column
#'
#' @export
#'
overwriteIfNotNA <- function(data, target_column, source_column)
{
data[[target_column]] <- replaceUnlessNA(
x = kwb.utils::selectColumns(data, target_column),
substitute = kwb.utils::selectColumns(data, source_column)
)
data
}
# replaceByCondition -----------------------------------------------------------
#' Replace Values in Column in Rows Matching Condition
#'
#' @param df data frame in which to do substitutions
#' @param file path to CSV file with columns \code{group},
#' \code{target_column}, \code{condition}, \code{replacement}
#' @param group group name. If given, only rows in \code{file} that have
#' this group name in column \code{group} are considered.
#' @param config optional. Data frame containing the configuration as being read
#' from \code{file}.
#' @param dbg if \code{TRUE} debug messages are shown
#' @export
#'
#' @examples
#' # Create a very simple data frame
#' df <- data.frame(a = 1:3)
#'
#' # Create a very simple configuration
#' config <- read.table(sep = ",", header = TRUE, text = c(
#' "group,target,condition,replacement",
#' "g1,a,a==2,22",
#' "g2,a,a==3,33"
#' ))
#'
#' # Write the configuration to a temporary file
#' file <- tempfile()
#' write.csv(config, file)
#'
#' # Apply all replacements configured in the file ...
#' replaceByCondition(df, file)
#'
#' # ... or in the configuration
#' replaceByCondition(df, config = config)
#'
#' # Apply selected replacements
#' replaceByCondition(df, file, group = "g1")
#' replaceByCondition(df, file, group = "g2")
#'
replaceByCondition <- function(
df, file = NULL, group = NULL, config = NULL, dbg = TRUE
)
{
if (is.null(file) && is.null(config)) {
clean_stop("Either 'file' or 'config' must be given.")
}
if (is.null(config)) {
config <- utils::read.csv(
file, comment.char = "#", stringsAsFactors = FALSE
)
}
fetch <- kwb.utils::createAccessor(config)
if (! is.null(group)) {
groups <- fetch("group")
stopIfNotIn(group, unique(groups), singular = "group", do_stop = FALSE)
config <- config[groups == group, , drop = FALSE]
fetch <- kwb.utils::createAccessor(config)
}
if (nrow(config) == 0L) {
return(structure(df, metadata = data.frame(
target = character(0),
n_replaced = integer(0),
condition = character(0),
replacement = character(0)
)))
}
#stopifnot(! anyDuplicated(fetch("target_column")))
# Evaluate all criteria at once
matches <- kwb.utils::getAttribute("details", x = kwb.utils::matchesCriteria(
Data = df,
criteria = as.character(fetch("condition")),
na.to.false = TRUE,
add.details = TRUE,
dbg = FALSE
))
# Provide column vectors of "config"
columns <- as.character(fetch("target"))
replacements <- fetch("replacement")
# Apply the replacements
for (i in seq_along(columns)) {
df[which(matches[, i]), columns[i]] <- replacements[i]
}
# Provide metadata on the results of the applied replacements
metadata <- kwb.utils::resetRowNames(cbind(
kwb.utils::removeColumns(config, "group"),
n_replaced = colSums(matches, na.rm = TRUE)
))
# Print the metadata
kwb.utils::catIf(dbg, paste(collapse = "", sprintf(
"Data correction in '%s': %d values with \"%s\" set to '%s'\n",
metadata$target,
metadata$n_replaced,
metadata$condition,
metadata$replacement
)))
structure(df, metadata = metadata)
}
# replaceUnlessNA ------------------------------------------------------------
#' Use Elements of Substitute at Indices Where Substitutes Are Not NA
#'
#' @param x vector in which to substitute
#' @param substitute vector containing substitutions
#'
replaceUnlessNA <- function(x, substitute)
{
stopifnot(length(x) == length(substitute))
ok <- ! is.na(substitute)
x[ok] <- substitute[ok]
x
}
# reportNA ---------------------------------------------------------------------
#' Count NA in a Column and Give a Message about It
#'
#' @param data data frame
#' @param column name of column in \code{data}
#' @param subject value for placeholder \emph{subject} in output: "NAs
#' \emph{subject}: \emph{count_NA}"
#'
reportNA <- function(data, column, subject = "in data")
{
number_of_na <- kwb.utils::countNaInColumn(data, column)
cat(sprintf("NAs %s: %d\n", subject, number_of_na))
number_of_na
}
# stopIfNotIn ------------------------------------------------------------------
#' Stop with Info If Element Is Not in Expected Set
#'
#' Stop with info message if element is not in expected set of elements
#'
#' @param element element to be looked for in \code{elements}
#' @param elements vector of possible elements
#' @param singular name of object to appear in error message. Default:
#' \code{"option"}
#' @param plural name of object (plural) to appear in error message. Default:
#' \code{paste0(singular, "s")}
#' @param do_stop if \code{FALSE} (the default is \code{TRUE}) program execution
#' does not stop. Instead a message is shown.
#' @export
#'
stopIfNotIn <- function(
element, elements, singular = "option", plural = paste0(singular, "s"),
do_stop = TRUE
)
{
if (! element %in% elements) {
msg <- sprintf(
"No such %s: '%s'. Available %s: %s",
singular, element, plural, kwb.utils::stringList(elements)
)
if (do_stop) {
clean_stop(msg)
} else {
message(msg)
}
}
}
# stopOnDuplicates -------------------------------------------------------------
#' Stop If There Are Duplicates over given Columns
#'
#' @param data data frame
#' @param columns names of columns over which to look for duplicates. By
#' default, all columns in \code{data} are used.
#' @param name name of data
#' @export
#'
stopOnDuplicates <- function(data, columns = names(data), name = NULL)
{
name <- getname(name, substitute(data))
changes <- getChangesOfDuplicates(data, columns)
if (length(changes) > 0) {
print(changes)
stopf(
"There are duplicates in column(s) %s of %s (see above)!",
kwb.utils::stringList(columns), name
)
}
}
# writeStandardCsv -------------------------------------------------------------
#' Write CSV File in a Standardised Manner
#'
#' @param x data frame
#' @param file path to CSV file to be written
#' @param \dots additional arguments passed to \code{\link[utils]{write.table}}
#'
#' @export
#'
writeStandardCsv <- function(x, file, ...)
{
utils::write.table(
x, file = file, col.names = TRUE, row.names = FALSE,
dec = kwb.utils::argsCsv("en")$dec,
sep = kwb.utils::argsCsv("de")$sep,
quote = FALSE, ...
)
}
# removeRowsThatAreNaInColumn --------------------------------------------------
#' Remove Rows That are NA in Given Column
#'
#' @param data data frame
#' @param column column name
#' @param dbg it \code{TRUE} debug messages are shown
#'
#' @return \code{data} with rows removed that are \code{NA} in
#' \code{data[[column]]}
#'
#' @export
#'
#' @examples
#' df <- data.frame(a = c(1, NA, 3), b = c(11, 22, NA))
#' df
#'
#' removeRowsThatAreNaInColumn(df, "a")
#' removeRowsThatAreNaInColumn(df, "b")
#'
removeRowsThatAreNaInColumn <- function(data, column, dbg = TRUE)
{
is_na <- is.na(kwb.utils::selectColumns(data, column))
kwb.utils::catAndRun(
dbg = dbg,
sprintf("Removing %d rows that are NA in column '%s'", sum(is_na), column),
data[! is_na, ]
)
}
# printNumberOfNA --------------------------------------------------------------
#' Print Number of NA Values in Given Column
#'
#' @param data data frame
#' @param column column name
#' @param name name of data
#' @export
#'
printNumberOfNA <- function(data, column, name = NULL)
{
name <- getname(name, substitute(data))
cat(sprintf("\nis.na(%s$%s):\n", name, column))
print(table(is.na(kwb.utils::selectColumns(data, column))))
}
# printTableForColumn ----------------------------------------------------------
#' Print Result of table() for Given Column
#'
#' @param data data frame
#' @param column column name
#' @param name name of data
#' @export
#'
printTableForColumn <- function(data, column, name = NULL)
{
name <- getname(name, substitute(data))
cat(sprintf("\ntable(%s$%s):\n", name, column))
print(table(kwb.utils::selectColumns(data, column), useNA = "always"))
}
# logicalToYesNo ---------------------------------------------------------------
#' Convert Vector of Logical to Vector of "Ja"/"Nein"
#'
#' @param x vector of logical
#' @param yesno vector of character of length two giving the strings to be used
#' for \code{TRUE} and \code{FALSE}, respectively
#'
#' @return vector of character
#'
#' @export
#'
#' @examples
#' logicalToYesNo(c(TRUE, FALSE, TRUE))
#' logicalToYesNo(c(TRUE, FALSE, TRUE), yesno = c("Yeah!", "Oh no!"))
#'
logicalToYesNo <- function(x, yesno = c("Ja", "Nein"))
{
ifelse(x, yesno[1], yesno[2])
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.