R/post_processing.R

Defines functions process_truncate_by_threshold process_truncate_by_iqr do_processing

Documented in do_processing process_truncate_by_iqr process_truncate_by_threshold

# Post-processing Automation ########################################
#' @title Post-processing of datamatrix
#'
#' @description
#' Applies functions to a matrix or data.frame.
#'
#' @param x
#' Matrix or Data.frame.
#' @param functions
#' List of lists, specifying functions to be applied as well as their
#' arguments. See details.
#'
#' @details
#' Functions are passed into the post-processor as a named list. The name
#' `f` of the list entry is the function to be applied via
#' \code{\link[base:do.call]{base::do.call}}.
#' The list entry itself is another named list, specifying the arguments
#' to the function `f` as named arguments.
#'
#' The functions must take a matrix or data.frame as first argument and
#' return another matrix or data.frame of the same dimensions as
#' single output.
#'
#' Examples of post-processing steps are truncation
#' (\code{\link{process_truncate_by_iqr}},
#' \code{\link{process_truncate_by_threshold}}) or
#' centering / standardizing data (via \code{\link{scale}},
#' see example section below).
#'
#' Can be useful to apply on simulated datasets, even outside of the
#' simulation function (e.g. when standardization is only required at the
#' modeling step).
#'
#' @note
#' Use with caution - no error checking is done for now so the user has
#' to take care of everything themselves! Furthermore, output of the
#' functions is not checked either.
#'
#' @return
#' Matrix or data.frame with post-processing applied.
#'
#' @examples
#' do_processing(diag(5), 
#'     functions = list(scale = list(center = TRUE, scale = FALSE)))
#'
#' @export
do_processing <- function(x, functions = list()) {

    # apply post-processing functions
    for (f in names(functions)) {
        x <- do.call(f, utils::modifyList(list(x), functions[[f]]))
    }

    x
}

# Post-processing Functions #########################################
#' @title Truncate columns of datamatrix at datamatrix specific thresholds
#'
#' @description
#' Truncation based on the interquartile range to be applied to a dataset.
#'
#' @param x
#' Matrix or Data.frame.
#' @param truncate_multipliers
#' Vector of truncation parameters. Either a single value which is
#' replicated as necessary or of same dimension as `ncol(x)`.
#' If any vector entry is NA, the corresponding column will not be
#' truncated. If named, then the names must correspond to columnnames in `x`,
#' and only specified columns will be processed. See details.
#' @param only_numeric
#' If TRUE and if `x` is a data.frame, then only columns of type `numeric` will
#' be processed. Otherwise all columns will be processed (e.g. also in the
#' case that `x` is a matrix).
#'
#' @details
#' Truncation is processed as follows:
#' \enumerate{
#' \item Compute the 1st and 3rd quartile q1 / q3 of variables in `x`.
#' \item Multiply these quantities by values in `truncate_multipliers` to obtain
#' _L_ and _U_. If a value is NA, the corresponding variable is not truncated.
#' \item Set any value smaller / larger than _L_ / _U_ to _L_ / _U_.
#' }
#'
#' Truncation multipliers can be specified in three ways (note that whenever
#' `only_numeric` is set to TRUE, then only numeric columns are affected):
#'
#' \itemize{
#' \item A single numeric - then all columns will be processed in the same way
#' \item A numeric vector without names - it is assumed that the length can be
#' replicated to the number of columns in `x`, each column is processed by the
#' corresponding value in the vector
#' \item A numeric vector with names - length can differ from the columns in
#' `x` and only the columns for which the names occur in the vector are
#' processed
#' }
#'
#' @return
#' Matrix or data.frame of same dimensions as input.
#'
#' @export
process_truncate_by_iqr <- function(x,
                                    truncate_multipliers = NA,
                                    only_numeric = TRUE) {

    # if input vector has names -> remember them
    truncate_names <- NULL
    if (!is.null(names(truncate_multipliers)) & !is.null(names(x)))
        truncate_names <- names(truncate_multipliers)

    # truncation vector
    truncate_vector <- rep_len(NA, ncol(x))
    names(truncate_vector) <- names(x)

    # set values in vector if available
    if (!is.null(truncate_names)) {
        truncate_vector[truncate_names] <- truncate_multipliers
    } else truncate_vector <- rep_len(truncate_multipliers, ncol(x))

    if (is.data.frame(x) & only_numeric) {
        ind_numeric <- sapply(x, class) == "numeric"
        truncate_vector[!ind_numeric] <- NA
    }

    # which columns should be truncated
    do_trunc <- which(!sapply(truncate_vector, is.na))

    if (length(do_trunc) > 0) {
        # determine thresholds to truncate at
        quantiles <- apply(x[, do_trunc, drop = FALSE], 2,
            stats::quantile, c(0.25, 0.75))
        iqr <- apply(quantiles, 2, diff)
        truncate_upper <- quantiles[2, ] +
            truncate_vector[do_trunc] * iqr
        truncate_lower <- quantiles[1, ] -
            truncate_vector[do_trunc] * iqr

        # do truncation
        for (i in seq_along(do_trunc)) {
            x[x[, do_trunc[i]] > truncate_upper[i], do_trunc[i]] <-
                truncate_upper[i]
            x[x[, do_trunc[i]] < truncate_lower[i], do_trunc[i]] <-
                truncate_lower[i]
        }
    }

    x
}

#' @title Truncate columns of datamatrix at specified thresholds
#'
#' @description
#' Truncation based on fixed thresholds to be applied to a dataset. Allows
#' to implement truncation by measures derived from the overall data generating
#' mechanism.
#'
#' @param x
#' Matrix or Data.frame.
#' @param truncate_lower,truncate_upper
#' Vectors of truncation parameters, i.e. lower and upper tresholds for
#' truncation.
#' Either a single value which is replicated as necessary or of same dimension
#' as `ncol(x)`. If any vector entry is NA, the corresponding column will not be
#' truncated. Truncation at lower and upper thresholds is treated independently.
#' If named, then the names must correspond to columnnames in `x`,
#' and only specified columns will be processed. See details.
#' @param only_numeric
#' If TRUE and if `x` is a data.frame, then only columns of type `numeric` will
#' be processed. Otherwise all columns will be processed (e.g. also in the
#' case that `x` is a matrix).
#'
#' @details
#' Truncation is defined by setting all values below or above the truncation
#' threshold to the truncation threshold.
#'
#' Truncation parameters can be specified in three ways (note that whenever
#' `only_numeric` is set to TRUE, then only numeric columns are affected):
#'
#' \itemize{
#' \item A single numeric - then all columns will be processed in the same way
#' \item A numeric vector without names - it is assumed that the length can be
#' replicated to the number of columns in `x`, each column is processed by the
#' corresponding value in the vector
#' \item A numeric vector with names - length can differ from the columns in
#' `x` and only the columns for which the names occur in the vector are
#' processed
#' }
#'
#' @return
#' Matrix or data.frame of same dimensions as input.
#'
#' @export
process_truncate_by_threshold <- function(x,
                                          truncate_lower = NA,
                                          truncate_upper = NA,
                                          only_numeric = TRUE) {

    # if input vectors have names -> remember them
    truncate_lower_names <- NULL
    if (!is.null(names(truncate_lower)) & !is.null(names(x)))
        truncate_lower_names <- names(truncate_lower)
    truncate_upper_names <- NULL
    if (!is.null(names(truncate_upper)) & !is.null(names(x)))
        truncate_upper_names <- names(truncate_upper)

    # truncation vectors
    truncate_lower_vector <- rep_len(NA, ncol(x))
    names(truncate_lower_vector) <- names(x)
    truncate_upper_vector <- rep_len(NA, ncol(x))
    names(truncate_upper_vector) <- names(x)

    # set values in vector if available
    if (!is.null(truncate_lower_names)) {
        truncate_lower_vector[truncate_lower_names] <- truncate_lower
    } else truncate_lower_vector <- rep_len(truncate_lower, ncol(x))
    if (!is.null(truncate_upper_names)) {
        truncate_upper_vector[truncate_upper_names] <- truncate_upper
    } else truncate_upper_vector <- rep_len(truncate_upper, ncol(x))

    if (is.data.frame(x) & only_numeric) {
        ind_numeric <- sapply(x, class) == "numeric"
        truncate_lower_vector[!ind_numeric] <- NA
        truncate_upper_vector[!ind_numeric] <- NA
    }

    # which columns should be truncated
    do_trunc <- which(!sapply(truncate_lower_vector, is.na))
    if (length(do_trunc) > 0) {
        # do truncation
        for (i in do_trunc) {
            x[x[, i] < truncate_lower_vector[i], i] <-
                truncate_lower_vector[i]
        }
    }

    do_trunc <- which(!sapply(truncate_upper_vector, is.na))
    if (length(do_trunc) > 0) {
        # do truncation
        for (i in do_trunc) {
            x[x[, i] > truncate_upper_vector[i], i] <-
                truncate_upper_vector[i]
        }
    }

    x
}
matherealize/simdata documentation built on Dec. 5, 2024, 4:17 a.m.