R/utils.R
In coreSim: Core Functionality for Simulating Quantities of Interest from Generalised Linear Models

Documented in qi_slimmer

#' Find maximum, minimum, and median values for each scenario found using
#' \code{\link{qi_builder}}
#'
#' @param df a data frame of simulated quantities of interest created by
#' \code{\link{qi_builder}}.
#' @param scenario_var character string of the variable name marking the
#' scenarios.
#' @param qi_var character string of the name of the variable with the
#' simulated quantity of interest values.
#'
#' @return A data frame with the fitted values and the minimum (\code{qi_min}),
#' median (\code{qi_median}), and maximum (\code{qi_max}) values from the
#' central interval specified with \code{ci} in \code{\link{qi_builder}}.
#'
#' @details This funciton slims down a simulation data set to some of its key
#' features (minimun, median, and maximum value for each fitted scenario) so that
#' it takes up less memory and can be easily plotted.
#'
#' The function is incorporated into \code{\link{qi_builder}} and can be run
#' using \code{slim = TRUE}.
#'
#' @examples
#' library(car)
#'
#' # Normal linear model
#' m1 <- lm(prestige ~ education + type, data = Prestige)
#'
#' # Simulate coefficients
#' m1_sims <- b_sim(m1)
#'
#' # Create fitted values
#' fitted_df <- expand.grid(education = 6:16, typewc = 1)
#'
#' # Find predicted outcomes (95% central interval, by default)
#' linear_qi <- qi_builder(b_sims = m1_sims, newdata = fitted_df, slim = FALSE)
#'
#' # Slim data set
#' linear_slim <- qi_slimmer(linear_qi)
#'
#' @importFrom stats median
#' @importFrom dplyr group_by summarise %>%
#'
#' @export

qi_slimmer <- function(df, scenario_var = 'scenario_', qi_var = 'qi_'){
    qi_ <- scenario_ <- NULL

    names(df)[names(df) == qi_var] <- 'qi_'
    names(df)[names(df) == scenario_var] <- 'scenario_'

    if (!(names(df)[[ncol(df)]] == 'scenario_'))
        df$scenario_ <- interaction(df[, 1:(ncol(df)-1)], drop = TRUE)

    df_out <- df %>% group_by(scenario_) %>%
        summarise(qi_min = min(qi_),
                  qi_median = median(qi_),
                  qi_max = max(qi_)
        ) %>%
        data.frame

    scenarios_df <- df[!duplicated(df$scenario_), 1:(ncol(df)-2)] %>%
        data.frame(row.names = NULL)
    df_out <- cbind(scenarios_df, df_out)
    df_out$scenario_ <- NULL

    names(df_out)[names(df_out) == 'qi_'] <- qi_var
    return(df_out)
}


#' Find interactions in fitted data frame
#' @noRd

interaction_builder <- function(b_sims, newdata) {
    fitted_names <- names(newdata)
    sim_names <- names(b_sims)

    possible_inter <- max_period_count(sim_names)

    if (possible_inter != 0) {
        possible_inter <- possible_inter + 1
        if (length(fitted_names) < possible_inter)
            possible_inter <- length(fitted_names)

        for (k in 2:possible_inter) {
            interactions <- intersect(
                possible_interaction_terms(fitted_names, n = k), sim_names)

            if (length(interactions) != 0) {
                for (i in interactions) {
                    if (length(intersect(i, fitted_names)) == 0) {
                        sub_df <- data.frame()
                        for (u in fitted_names) {
                            if (grepl(u, i))
                                sub_df <- cbind_fill(sub_df, u = newdata[[u]])
                        }
                        newdata[, i] <- apply(sub_df, 1, prod)
                    }
                }
            }
        }
    }
    return(newdata)
}


#' Bind column to empty data.frame
#' @source http://stackoverflow.com/a/26685092/1705044
#' @noRd

cbind_fill <- function(...){
    nm <- list(...)
    nm <- lapply(nm, as.matrix)
    n <- max(sapply(nm, nrow))
    do.call(cbind, lapply(nm, function (x)
        rbind(x, matrix(, n-nrow(x), ncol(x)))))
}

#' Find the maximum number of periods in the elements of a character string
#' @noRd

max_period_count <- function(x) {
    x <- x[!(grepl('^I\\..*\\.[1-9].*$', x))]
    max(nchar(x) - nchar(gsub('\\.', '', x)))
}

#' Find all possible interaction terms
#' @noRd

possible_interaction_terms <- function(x, n = 2) {

    if (n > length(x))
        stop('Number of possible interaction terms cannot exceed number of base terms.')

    one_set <- function(base_names, x2) {
        if (missing(x2)) x2 <- base_names
        comb <- vector()
        for (i in base_names) {
            temp_vec <- x2[-grep(i, x2)]
            temp_comb1 <- paste(i, temp_vec, sep = '.')
            temp_comb2 <- paste(i, rev(temp_vec), sep = '.')
            comb <- c(comb, temp_comb1, temp_comb2)
            comb <- comb[!duplicated(comb)]
        }
        return(comb)
    }

    two_way <- one_set(x)
    temp_list <- list(two_way)

    if (n > 2) {
        for (u in 2:(n-1)) {
            if (u == 2) temp_list[[u]] <- one_set(x, unlist(temp_list[1]))
            else temp_list[[u]] <- one_set(x, unlist(temp_list[u-1]))
        }
    }

    out <- unlist(temp_list[[n-1]])

    return(out)
}

#' Convert \code{ci} interval from percent to proportion and check if valid
#' @noRd

ci_check <- function(x) {
    if (x > 1 & x <= 100) x <- x / 100
    if (x <= 0 | x > 1) {
        stop(sprintf("%s is not a valid central interval.", x),
             call. = FALSE)
    }
    return(x)
}

#' Check the user supplied QI function for consistency
#' @noRd

FUN_check <- function(x) {
    if (!is.function(x))
        stop('FUN must be a function.', call. = FALSE)
    if (length(formals(x)) != 1)
        stop('FUN can only have one argument.\nThis argument must accept only a numeric vector of the fitted linear component calculated from the simulations.',
             call. = FALSE)
}

#' Check results from user supplied function
#' @noRd

FUN_results_check <- function(x) {
    if (!is.vector(x))
        stop('FUN did not return a vector.', call. = FALSE)
    if (!is.numeric(x))
        stop('FUN returned a non-numeric vector.', call. = FALSE)
}

#' Find scenarios to simulate from fitted model objects
#' @noRd

find_scenarios <- function(obj, nsim, large_computation = FALSE) {
    # Supported model types for scenario extraction
    supported <- c('lm', 'glm')
    if (!any(supported %in% class(obj)))
        stop(sprintf('Scenarios cannot currently be extracted from %s models.\n Please define scenarios with newdata.',
            class(obj)), call. = FALSE)

    out <- obj$model[, -1]

    out <- out[!duplicated(out[, 1:ncol(out)]), ]

    if (!isTRUE(large_computation)) {
        unique_scenarios <- nrow(out)
        nout_sims <- nsim * unique_scenarios
            if (nout_sims > 100000)
                stop(sprintf('%s unique scenarios were found.\nWith %s simulations, %s simulated scenarios would be created.\n\nFor manageable computation, please supply a smaller number of scenarios to newdata.',
                             unique_scenarios, nsim, nout_sims), call. = FALSE)
    }
    return(out)
}

#' Convert factor levels into binary categorical values
#' @importFrom splines bs
#' @noRd

factorise <- function(x, b_sims) {
    x <- data.frame(x)
    comb <- x
    for (i in names(x)) {
        if(is.character(x[[i]]) | is.factor(x[[i]])) {
            unique_levels <- unique(x[[i]])
            new_cols <- sprintf('%s%s', i, unique_levels)

            cat_df <- data.frame(
                        matrix(0, nrow = nrow(x), ncol = length(new_cols)))
            names(cat_df) <- new_cols

            for (u in new_cols) cat_df[, u][x[[i]] == gsub(i, '', u)] <- 1
            comb <- comb[, !(names(comb) %in% i), drop = FALSE]
            comb <- cbind(comb, cat_df)
        }
    }
    comb <- comb[, names(comb) %in% names(b_sims), drop = FALSE]
    return(comb)
}

#' Find non-linear transformations of variables in newdata
#' @noRd

non_linear_transformer <- function(x, b_sims) {
    sim_names <- colnames(b_sims)

    # Polynomials
    ## Note: assumes poly <= 9
    poly_pattern <- '^I\\..*\\.[1-9]\\.$'
    if (any(grepl(poly_pattern, sim_names))) {
        sub_names <- extract_names(x = x, sim_names = sim_names,
                                   pattern = poly_pattern)
        for (i in sub_names) {
            var_i <- gsub('^I\\.', '', i)
            var_i <- gsub('\\.[1-9]\\.$', '', var_i)

            if((var_i %in% colnames(x))) {
                pow_i <- as.numeric(substr(i, nchar(i)-1, nchar(i)-1))
                x[, i] <- x[, var_i]^pow_i
            }
        }
    }

    # Natural logarithms
    log_pattern <- '^log\\..*\\.$'
    if (any(grepl(log_pattern, sim_names))) {
        sub_names <- extract_names(x = x, sim_names = sim_names,
                                   pattern = log_pattern)
        for (i in sub_names) {
            var_i <- gsub('^log\\.', '', i)
            var_i <- gsub('\\.$', '', var_i)

            if((var_i %in% colnames(x))) {
                x[, i] <- log(x[, var_i])
            }
        }
    }

    # B-Spline basis for polynomial splines
    bs_pattern <- '^bs\\..*\\.[1-9]$'
    if (any(grepl(bs_pattern, sim_names))) {
        sub_names <- extract_names(x = x, sim_names = sim_names,
                                   pattern = bs_pattern)
        base_names <- gsub('^bs\\.', '', sub_names)
        base_names <- gsub('\\.[1-9]$', '', base_names)
        base_names <- gsub('\\.\\.degree\\.\\.\\.[1-9]', '', base_names)
        base_names <- unique(base_names)
        for (i in base_names) {
            i_names <- sub_names[grep(i, sub_names)]
            d <- as.numeric(substr(sub_names, nchar(sub_names),
                                         nchar(sub_names)))
            d <- max(d)
            temp_bs <- as.data.frame(bs(x[, i], degree = d))
            names(temp_bs) <- i_names
            x <- cbind(x, temp_bs)
        }
    }

    return(x)
}

#' Extract column names matching pattern
#' @noRd

extract_names <- function(x, sim_names, pattern) {
    sub_names <- sim_names[grepl(pattern, sim_names)]
    sub_names <- sub_names[!(sub_names %in% names(x))]
    return(sub_names)
}

#' Constrict a data frame of simulated values to a central interval
#' @param sims_scenarios a data frame of simulated quantities of interest and
#' a column grouping them by fitted scenario.
#' @param scenario_var character string of the variable name marking the
#' scenarios.
#' @param qi_var character string of the name of the variable with the
#' simulated quantity of interest values.
#' @param ci numeric value indicating the central interval. Must be in (0, 1].
#'
#' @importFrom stats quantile
#' @importFrom dplyr bind_rows
#' @noRd

qi_central_interval <- function(sims_scenarios, scenario_var = 'scenario_',
                                qi_var = 'qi_', ci = 0.95)
{
    qi_ <- NULL

    lower <- (1 - ci)/2
    upper <- 1 - lower

    names(sims_scenarios)[names(sims_scenarios) == qi_var] <- 'qi_'

    qi_list <- split(sims_scenarios, sims_scenarios[[scenario_var]])
    qi_list <- lapply(seq_along(qi_list), function(x){
        lower_bound <- quantile(qi_list[[x]][, 'qi_'], prob = lower)
        upper_bound <- quantile(qi_list[[x]][, 'qi_'], prob = upper)
        subset(qi_list[[x]], qi_ >= lower_bound & qi_ <= upper_bound)
    })

    out <- data.frame(bind_rows(qi_list))
    names(out)[names(out) == 'qi_'] <- qi_var

    return(out)
}

Any scripts or data that you put into this service are public.

coreSim documentation built on May 2, 2019, 5:11 a.m.

rdrr.io home R language documentation Run R code online

CRAN packages Bioconductor packages R-Forge packages GitHub packages

Note that we can't provide technical support on individual packages. You should contact the package authors for that.

coreSim
Core Functionality for Simulating Quantities of Interest from Generalised Linear Models

R/utils.R
In coreSim: Core Functionality for Simulating Quantities of Interest from Generalised Linear Models

Defines functions qi_slimmer interaction_builder cbind_fill max_period_count possible_interaction_terms ci_check FUN_check FUN_results_check find_scenarios factorise non_linear_transformer extract_names qi_central_interval

Documented in qi_slimmer

Try the coreSim package in your browser

R Package Documentation

Browse R Packages

We want your feedback!

coreSim Core Functionality for Simulating Quantities of Interest from Generalised Linear Models

R/utils.R In coreSim: Core Functionality for Simulating Quantities of Interest from Generalised Linear Models

Defines functions qi_slimmer interaction_builder cbind_fill max_period_count possible_interaction_terms ci_check FUN_check FUN_results_check find_scenarios factorise non_linear_transformer extract_names qi_central_interval

Documented in qi_slimmer

Try the coreSim package in your browser

R Package Documentation

Browse R Packages

We want your feedback!

coreSim
Core Functionality for Simulating Quantities of Interest from Generalised Linear Models

R/utils.R
In coreSim: Core Functionality for Simulating Quantities of Interest from Generalised Linear Models