flipData: Functions for extracting and describing data

Documented in AddDummyVariablesForNAs CleanSubset CleanWeights ErrorIfInfinity ErrorIfMissingDataFound ExcludeCasesWithAnyMissingData ExcludeCasesWithCompletelyMissingData MissingDataFail MissingValuesByVariable NoData RemoveCasesWithAllNA RemoveCasesWithAnyNA

#' \code{RemoveCasesWithAnyNA}
#'
#' @description Remove rows which contain NA.
#' @param x The input dataframe.
#' @export
RemoveCasesWithAnyNA <- function(x)
{
    x[apply(is.na(x), 1, sum) == 0, , drop = FALSE]
}

#' \code{RemoveCasesWithAllNA}
#'
#' @description Remove rows which are all NA.
#' @param x The input dataframe.
#' @export
RemoveCasesWithAllNA <- function(x)
{
    x[apply(is.na(x), 1, sum) < ncol(x), , drop = FALSE]
}

#' \code{ExcludeCasesWithAnyMissingData}
#' @description This is a wrapper for \code{\link{na.omit}}
#' @param data A \code{\link{data.frame}}.
#' @examples
#' df <- data.frame(x = c(NA, 1), y = 1:2)
#' ExcludeCasesWithAnyMissingData(df)
#' @importFrom stats na.omit
#' @export
ExcludeCasesWithAnyMissingData <- function(data)
{
    result <- na.omit(data)
    if(nrow(result) == 0)
        NoData()
    result
}

#' \code{ExcludeCasesWithCompletelyMissingData}
#' @description Create a copy of a data frame where any cases which have only missing values
#' are removed.
#' @param data A \code{\link{data.frame}}.
#' @examples
#' my.df <- data.frame("A" = c(1, 2, 3, 4, NA), "B" = c(NA, 1, 2, 3, NA), "C" = c(NA, NA, 1, 2, NA))
#' ExcludeCasesWithCompletelyMissingData(my.df)
#' @importFrom verbs SumEachRow
#' @export
ExcludeCasesWithCompletelyMissingData <- function(data)
{
    result <- data[SumEachRow(is.na(data), remove.missing = FALSE) < ncol(data), ]
    if (nrow(result) == 0)
    {
        NoData()
    }
    return(result)
}

#' \code{ErrorIfMissingDataFound}
#' @description This is a wrapper for \code{\link{na.fail}}
#' @param data A \code{\link{data.frame}}.
#'# @examples
#'# df = data.frame(x = c(NA, 1), y = 1:2)
#'# ErrorIfMissingDataFound(df)
#' @export
ErrorIfMissingDataFound <- function(data)
{
    if (any(is.na(data)))
        MissingDataFail()
    data
}

notAvailable <- function(unavailable.function.name)
{
    stop(paste(unavailable.function.name,"is not available for this analysis. Please contact support if you believe this option should be available."))
}

#' \code{NoData}
#' @description Error thrown when all cases contain missing data and no data is
#'     available for use as the "Exclude cases with missing data" option was
#'     selected.
#' @export
NoData <- function()
{
    stop("All observations contain some missing data, so an analysis is not ",
         "possible. Check to see if there are any 'missing' options that can ",
         "be used.")
}

#' \code{MissingDataFail}
#' @description Error thrown when missing values are present and the
#'     "Error if missing data" option was selected.
#' @export
MissingDataFail <- function()
{
    stop("The data contains missing values. Change the 'missing' option to run the analysis.")
}

#' \code{MissingValuesByVariable}
#' @description Computes the number of missing values by variable.
#' @param data A \code{\link{data.frame}}.
#' @importFrom flipFormat FormatAsPercent
#' @export
MissingValuesByVariable <- function(data)
{
    n <- nrow(data)
    missing <- apply(is.na(data), 2, sum)
    result <- data.frame("Missing" = missing,
        proportion = FormatAsPercent(missing / n))
    names(result)[2] <- paste0("Percent (of ", n, ")")
    result}


#' \code{CleanSubset}
#' Takes a QSubset variable and turns it into a logical vector with no missing values
#' @param subset A QSubset variable from Displayr or Q.
#' @param n.total The total number of observations.
#' @importFrom verbs Sum
#' @export
CleanSubset <- function(subset, n.total)
{
    new.subset <- NULL
    if (is.null(subset))
    {
        n.subset <- n.total
        new.subset <- rep(TRUE, n.total)
    }
    else
    {
        subset.length <- length(subset)
        if(subset.length == 1)
        {
            n.subset <- n.total
            new.subset <- rep(subset, n.total)
        }
        else
        {
            if (subset.length != n.total)
                stop("subset.length != n.total")
            subset[is.na(subset)] <- FALSE
            subset[is.na(subset)] <- FALSE

        }
    }
    if (!is.null(new.subset))
        subset <- CopyAttributes(new.subset, subset)
    n.subset <- Sum(subset, remove.missing = FALSE)
    attr(subset, "n.subset") = n.subset
    subset
}

#' \code{CleanWeights}
#' Takes a QSubset variable and turns it into a logical vector with no missing values
#' @param weights A weights variable from Displayr or Q (i.e., \code{QcalibratedWeight}
#' or \code{QPopulationWeight}).
#' @export
CleanWeights <- function(weights)
{
    if (is.null(weights))
        return(weights)
    weights[is.na(weights) | weights < 0] <- 0
    weights
}


#' Returns an error if any column(s) contain +/- ininity.
#'
#' @param data A \code{data.frame}.
#' @export
ErrorIfInfinity <- function(data)
{
    infinite.cols <- sapply(data, function(x) any(is.infinite(x)))
    if (any(infinite.cols))
    {
        infinite.vars <- paste(names(infinite.cols)[infinite.cols], collapse = ", ")
        stop("Variable(s) ", infinite.vars, " contain infinite values.",
             " Either recode the infinities to finite values or set them as missing data.")
    }
}

#' @name AddDummyVariablesForNAs
#'
#' @title Add dummy variables to a \code{data.frame} suitable for regression models.
#'
#' @description Appends a matrix to a \code{data.frame} containing columns of dummy variables. It assumes
#'   the input data is suitable for regression and contains a single column for an outcome variable.
#'   Other columns would be for individual predictors used in the regression. The appended columns of dummy
#'   variables indicate the missing status of each predictor. i.e. 1 if the predictor is missing in that case,
#'   zero otherwise. A column is added for each predictor that has at least one case of missing data.
#'
#' @param data A \code{data.frame} of the data to be used for a Regression model assuming a single outcome.
#' @param outcome.name A \code{characater} of the name of the outcome variable in the data.
#' @param checks A \code{logical} to determine if further checks are done to remove cases from the data.
#'   If \code{TRUE}, cases are removed if all predictors are missing or if the response is missing.
#' @export
AddDummyVariablesForNAs <- function(data, outcome.name, checks = TRUE)
{
    outcome <- data[which(names(data) == outcome.name)]
    predictor.df <- data[-which(names(data) == outcome.name)]
    # Create dummy variable matrix, only create column if necessary
    dummy.variable.df <- lapply(predictor.df, function(x) {
        z <- is.na(x)
        if (any(z)) as.integer(z)
    })
    # Remove the NULL elements (no missing)
    dummy.variable.df <- Filter(length, dummy.variable.df)
    missing.outcomes <- is.na(outcome[[1]])
    # If no missing data in predictors, return original data, trimming missing outcomes if req
    if (length(dummy.variable.df) == 0)
    {
        if (any(missing.outcomes) && checks)
            data <- data[!missing.outcomes, ]
        return(data)
    }
    dummy.variable.df <- checkAndMapDummyVariables(dummy.variable.df)
    if (ncol(data) == 2)
        cases.all.predictors.missing <- rep(FALSE, nrow(predictor.df))
    else
        cases.all.predictors.missing <- apply(predictor.df, 1, function(x) all(is.na(x)))

    dummy.variable.df <- as.data.frame(dummy.variable.df, check.names = FALSE)
    names(dummy.variable.df) <- paste0(names(dummy.variable.df), ".dummy.var_GQ9KqD7YOf")
    # replace NAs in predictor df with means or reference level
    predictor.df <- remapDataFrame(predictor.df)
    # Create new data.frame
    new.data <- cbind.data.frame(outcome, predictor.df, dummy.variable.df)
    if (checks)
    {
        # Check if all predictors missing, if so, subset, attributes to be preserved in EstimationData
        if (any(cases.all.predictors.missing) || any(missing.outcomes))
            new.data <- new.data[!cases.all.predictors.missing & !missing.outcomes, ]
        # Inspect dummy matrix and remove dummy variables no longer required
        dummy.parts <- names(new.data) %in% names(dummy.variable.df)
        if (any(empty.dummy.vars <- lapply(new.data[dummy.parts], sum) == 0))
            new.data[names(which(empty.dummy.vars))] <- NULL
        # Copy attributes from the dummy variable (required for aliased dummy mapping)
        new.data <- CopyAttributes(new.data, dummy.variable.df)
    }
    new.data
}

# original.data should be data.frame input with numeric and factor columns
remapDataFrame <- function(dataframe)
{
    remapped.list <- lapply(dataframe, function(x) {
        if (is.numeric(x))
            x[is.na(x)] <- mean(x, na.rm = TRUE)
        else if (is.factor(x))
            x[is.na(x)] <- levels(x)[1]
        else
            stop("Unexpected class when using dummy variable adjustment. ",
                 "Supplied variable should be 'numeric' or 'factor', ",
                 "however suppled variable is ", sQuote(class(x)))
        return(x)
    })
    as.data.frame(remapped.list, check.names = FALSE)
}

# Helper function for the dummy variable adjustment.
# This function adds an attribute which contains a character vector mapping the dummy variable
# to each variable that are linked to its binary 0/1 values.
# This is usually redundant since it typically only maps to a single variable and that information
# is evident in the dummy variable name. The exception is in the case of aliased
# Dummy variables. That is, predictors have precisely the same missing cases, causing the
# dummy variables for those predictors will be identical (aliased). In this case, a single
# dummy variable is created for the group of predictors with identical missing value structure
# and the mapping contains a character vector of all variables in that group that correspond to it.
checkAndMapDummyVariables <- function(data)
{
    x <- 1L
    while(x < length(data))
    {
        current.duplicates <- vapply(names(data)[-(1L:x)],
                                     function(y) all(data[[y]] == data[[x]]), logical(1))
        names.to.map <- names(data)[x]
        if (any(current.duplicates))
        {
            names.to.map <- c(names.to.map, names(which(current.duplicates)))
            data[names(which(current.duplicates))] <- NULL
        }
        attr(data[[x]], "predictors.matching.dummy") <- names.to.map
        x <- x + 1
    }
    # Add the attribute for the last list element if it wasn't added in the prior steps
    if (is.null(attr(data[[length(data)]], "predictors.matching.dummy")))
        attr(data[[length(data)]], "predictors.matching.dummy") <- names(data)[length(data)]
    data
}

Displayr/flipData documentation built on March 2, 2024, 3:49 a.m.

rdrr.io home R language documentation Run R code online

CRAN packages Bioconductor packages R-Forge packages GitHub packages

Note that we can't provide technical support on individual packages. You should contact the package authors for that.

Displayr/flipData
Functions for extracting and describing data

R/missingdata.R
In Displayr/flipData: Functions for extracting and describing data

Documented in AddDummyVariablesForNAs CleanSubset CleanWeights ErrorIfInfinity ErrorIfMissingDataFound ExcludeCasesWithAnyMissingData ExcludeCasesWithCompletelyMissingData MissingDataFail MissingValuesByVariable NoData RemoveCasesWithAllNA RemoveCasesWithAnyNA

R Package Documentation

Browse R Packages

We want your feedback!

Displayr/flipData Functions for extracting and describing data

R/missingdata.R In Displayr/flipData: Functions for extracting and describing data

Documented in AddDummyVariablesForNAs CleanSubset CleanWeights ErrorIfInfinity ErrorIfMissingDataFound ExcludeCasesWithAnyMissingData ExcludeCasesWithCompletelyMissingData MissingDataFail MissingValuesByVariable NoData RemoveCasesWithAllNA RemoveCasesWithAnyNA

R Package Documentation

Browse R Packages

We want your feedback!

Displayr/flipData
Functions for extracting and describing data

R/missingdata.R
In Displayr/flipData: Functions for extracting and describing data