R/01_prepare_data.R

Defines functions add_gps trim_none trim_crump trim_sturmer trim_walker trim_data calculate_weight add_all_weights prepare_data

Documented in add_all_weights add_gps calculate_weight prepare_data trim_crump trim_data trim_none trim_sturmer trim_walker

###
### Data preparation functions for three-group trimming paper
################################################################################

###
### Functions for PS estimation
################################################################################

###  Generalized PS Function
##' Add generalized PS
##'
##' .. content for details ..
##'
##' @param data data_frame
##' @param formula formula for PS model
##' @param family \code{multinomial(parallel = FALSE)} should be used.
##' @param subset subset expression if estimating PS only for the subset
##' @param ps_prefix string used as the prefix. Use to distinguish the re-estimated PS from the original PS.
##'
##' @return data_frame containing three additional columns for PS.
##'
##' @export
add_gps <- function(data,
                    formula,
                    family,
                    subset,
                    ps_prefix) {

    assertthat::assert_that("data.frame" %in% class(data))
    assertthat::assert_that(class(formula) == "formula")
    assertthat::assert_that(is.character(ps_prefix))
    assertthat::assert_that(length(ps_prefix) == 1)

    ## Logical
    if (missing(subset)) {
        ## If NULL, using all the sample.
        subset_logical <- rep(TRUE, nrow(data))
    } else {
        ## Otherwise, subset appropriately.
        subset_logical <- eval(substitute(subset), envir = data)
    }

    ## Fit multinomial logistic regression
    res_vglm <- try(VGAM::vglm(formula = formula,
                               data    = data[subset_logical,],
                               family  = family))

    if (is.error(res_vglm)) {

        message("## vglm errored. Checking for a design matrix rank deficiency.\n")
        ## Design matrix
        X <- model.matrix(object = formula, data = data)
        ## X can have no rows, in which case Matrix::rankMatrix(X) fails
        X_rank <- try(Matrix::rankMatrix(X))
        ## Look for columns that are linearly dependent.
        if (!is.error(X_rank)) {
            for (i in seq_len(ncol(X))) {
                ## drop = FALSE to avoid vector degeneration.
                if (X_rank == Matrix::rankMatrix(X[, -i, drop = FALSE])) {
                    message("##  Dropping ", colnames(X)[i], " does not reduce rank.\n")
                }
            }
        } else {
            message("## Matrix rank assessment errored. Possibly zero rows.\n")
        }

        message("## Returning data as is without adding PS.\n")
        return(data)

    } else {

        ## Calculate PS for all groups.
        ps_data <- as_data_frame(VGAM::predict(res_vglm, newdata = data[subset_logical,], type = "response"))
        ## Add prefix
        names(ps_data) <- paste0(ps_prefix, names(ps_data))
        ##
        ps_data_full_size <- data_frame(v1 = rep(as.numeric(NA), nrow(data)),
                                        v2 = v1,
                                        v3 = v1)
        names(ps_data_full_size) <- names(ps_data)
        ## Put the calculated PS in appropriate rows
        ps_data_full_size[subset_logical,] <- ps_data
        ## Add to original data frame
        return(bind_cols(data, ps_data_full_size))
    }
}


###
### Functions for PS trimming
################################################################################

###   None (a dummy function to return 1's)
##' Dummy function to conduct no trimming at all.
##'
##' .. content for details ..
##'
##' @param A Treatment indicator vector having three levels indicated in \code{levels}.
##' @param ps0 PS vector for the first level.
##' @param ps1 PS vector for the second level.
##' @param ps2 PS vector for the third level.
##' @param levels Character vector holding three elements corresponding to levels in \code{A}
##' @param thres Threshold for the trimming strategy.
##'
##' @return A numeric indicator vector. 1 for kept and 0 for dropped.
##'
##' @export
trim_none <- function(A, ps0, ps1, ps2, levels, thres) {
    ## All of them has to be in [thres, 1.0].
    ## Treatment vector is not used.
    as.numeric(rep(1, length(ps0)))
}

###   Crump
##' Crump trimming function
##'
##' .. content for details ..
##'
##' @inheritParams trim_none
##'
##' @return A numeric indicator vector. 1 for kept and 0 for dropped.
##'
##' @export
trim_crump <- function(A, ps0, ps1, ps2, levels, thres) {
    ## All of them has to be in [thres, 1.0].
    ## Treatment vector is not used.
    as.numeric(ps0 >= thres & ps1 >= thres & ps2 >= thres)
}

###   Sturmer
##' Sturmer trimming function
##'
##' .. content for details ..
##'
##' @inheritParams trim_none
##'
##' @return A numeric indicator vector. 1 for kept and 0 for dropped.
##'
##' @export
trim_sturmer <- function(A, ps0, ps1, ps2, levels, thres) {

    ## Calculate the lower bounds as thres-quantile in respective group.
    l0 <- quantile(ps0[A == levels[1]], prob = thres)
    l1 <- quantile(ps1[A == levels[2]], prob = thres)
    l2 <- quantile(ps2[A == levels[3]], prob = thres)

    ## All three must be satisfied
    result <- as.numeric(ps0 >= l0 & ps1 >= l1 & ps2 >= l2)

    attributes(result) <- list(bounds = c(l0, l1, l2))

    result
}

###   Walker
##' Walker trimming function
##'
##' .. content for details ..
##'
##' @inheritParams trim_none
##'
##' @return A numeric indicator vector. 1 for kept and 0 for dropped.
##'
##' @export
trim_walker <- function(A, ps0, ps1, ps2, levels, thres) {

    ## Prevalence of treatment
    p0 <- mean(A == levels[1])
    p1 <- mean(A == levels[2])
    p2 <- mean(A == levels[3])

    ## Preference scores
    pi0 <- (ps0 / p0) / ((ps0 / p0) + (ps1 / p1) + (ps2 / p2))
    pi1 <- (ps1 / p1) / ((ps0 / p0) + (ps1 / p1) + (ps2 / p2))
    pi2 <- (ps2 / p2) / ((ps0 / p0) + (ps1 / p1) + (ps2 / p2))

    ## All three must be satisfied
    as.numeric(pi0 >= thres & pi1 >= thres & pi2 >= thres)
}

###   Generic trimming function (returns data with keep vector enclosed in a list)
##' Dummy function to conduct no trimming at all.
##'
##' .. content for details ..
##'
##' @param data data_frame
##' @param trim_method_name Function used for trimming that takes \code{A}, \code{ps0}, \code{ps1}, \code{ps2}, \code{levels}, and \code{thres} as arguments. Either one of the string: \code{\link{none}}, \code{\link{crump}}, \code{\link{sturmer}}, \code{\link{walker}}.
##' @param thres Threshold for the trimming strategy.
##' @param A_name Treatment indicator name in \code{data}.
##' @param ps0_name Name of the column in \code{data} for the PS for the first level.
##' @param ps1_name Name of the column in \code{data} for the PS for the second level.
##' @param ps2_name Name of the column in \code{data} for the PS for the third level.
##' @param levels Character vector holding three elements corresponding to levels in \code{A}
##'
##' @return trimmed data_frame with fewer rows
##'
##' @export
trim_data <- function(data, trim_method_name, thres,
                      A_name, ps0_name, ps1_name, ps2_name,
                      levels) {

    trim_fun <- get(paste0("trim_", trim_method_name))

    data$keep <- trim_fun(A   = unlist(data[,A_name]),
                          ps0 = unlist(data[,ps0_name]),
                          ps1 = unlist(data[,ps1_name]),
                          ps2 = unlist(data[,ps2_name]),
                          levels = levels,
                          thres = thres)

    data <- data %>%
        filter(keep == 1)

    ## It was a singleton list with data_frame for the empirical analyses.
    ## list(data)
    data
}


###
### Functions for PS weighting
################################################################################

###   Return one type of weight
##' Calculate one type of PS weight given PS
##'
##' .. content for details ..
##'
##' @inheritParams trim_none
##' @param weight_type Either one of \code{iptw}, \code{mw}, and \code{ow}.
##'
##' @return A numeric vector of individual PS weights.
##'
##' @export
calculate_weight <- function(A, ps0, ps1, ps2, levels, weight_type) {

    iptw <- (1/ps0 * (A == levels[1])) + (1/ps1 * (A == levels[2])) + (1/ps2 * (A == levels[3]))

    if (weight_type == "iptw") {
        ## IPTW
        return(iptw)

    } else if (weight_type == "mw") {
        ## Matching weights (MW)
        return(pmin(ps0, ps1, ps2) * iptw)

    } else if (weight_type == "ow") {
        ## Overlap weights (OW)
        return((1/(1/ps0 + 1/ps1 + 1/ps2)) * iptw)

    } else {
        stop("Invalid option for weight_type: ", weight_type)

    }
}

###   Add all weights
##' Calculate one type of PS weight given PS
##'
##' .. content for details ..
##'
##' @inheritParams trim_none
##' @param data data_frame
##' @param ps_prefix1 Prefix for the PS estimated in the entire cohort.
##' @param ps_prefix2 Prefix for the PS estimated in the trimmed cohort.
##'
##' @return data_frame containing IPTW, MW, and OW estimated in the entire cohort as well as the trimmed cohort.
##'
##' @export
add_all_weights <- function(data, A_name, levels, ps_prefix1 = "ps1_", ps_prefix2 = "ps2_") {

    ps1_names <- paste0(ps_prefix1, levels)

    data$iptw1 <- rep(as.numeric(NA), nrow(data))
    data[data$keep == 1,]$iptw1 <- calculate_weight(A = unlist(data[data$keep == 1, A_name]),
                                                    ps0 = unlist(data[data$keep == 1, ps1_names[1]]),
                                                    ps1 = unlist(data[data$keep == 1, ps1_names[2]]),
                                                    ps2 = unlist(data[data$keep == 1, ps1_names[3]]),
                                                    levels = levels,
                                                    weight_type = "iptw")
    data$mw1 <- rep(as.numeric(NA), nrow(data))
    data[data$keep == 1,]$mw1 <- calculate_weight(A = unlist(data[data$keep == 1, A_name]),
                                                  ps0 = unlist(data[data$keep == 1, ps1_names[1]]),
                                                  ps1 = unlist(data[data$keep == 1, ps1_names[2]]),
                                                  ps2 = unlist(data[data$keep == 1, ps1_names[3]]),
                                                  levels = levels,
                                                  weight_type = "mw")
    data$ow1 <- rep(as.numeric(NA), nrow(data))
    data[data$keep == 1,]$ow1 <- calculate_weight(A = unlist(data[data$keep == 1, A_name]),
                                                  ps0 = unlist(data[data$keep == 1, ps1_names[1]]),
                                                  ps1 = unlist(data[data$keep == 1, ps1_names[2]]),
                                                  ps2 = unlist(data[data$keep == 1, ps1_names[3]]),
                                                  levels = levels,
                                                  weight_type = "ow")

    ps2_names <- paste0(ps_prefix2, levels)
    ## Proceed if ps2_* are all available
    if (all(ps2_names %in% names(data))) {

        data$iptw2 <- rep(as.numeric(NA), nrow(data))
        data[data$keep == 1,]$iptw2 <- calculate_weight(A = unlist(data[data$keep == 1, A_name]),
                                                        ps0 = unlist(data[data$keep == 1, ps2_names[1]]),
                                                        ps1 = unlist(data[data$keep == 1, ps2_names[2]]),
                                                        ps2 = unlist(data[data$keep == 1, ps2_names[3]]),
                                                        levels = levels,
                                                        weight_type = "iptw")
        data$mw2 <- rep(as.numeric(NA), nrow(data))
        data[data$keep == 1,]$mw2 <- calculate_weight(A = unlist(data[data$keep == 1, A_name]),
                                                      ps0 = unlist(data[data$keep == 1, ps2_names[1]]),
                                                      ps1 = unlist(data[data$keep == 1, ps2_names[2]]),
                                                      ps2 = unlist(data[data$keep == 1, ps2_names[3]]),
                                                      levels = levels,
                                                      weight_type = "mw")
        data$ow2 <- rep(as.numeric(NA), nrow(data))
        data[data$keep == 1,]$ow2 <- calculate_weight(A = unlist(data[data$keep == 1, A_name]),
                                                      ps0 = unlist(data[data$keep == 1, ps2_names[1]]),
                                                      ps1 = unlist(data[data$keep == 1, ps2_names[2]]),
                                                      ps2 = unlist(data[data$keep == 1, ps2_names[3]]),
                                                      levels = levels,
                                                      weight_type = "ow")

    }
    ## Return data in the end
    return(data)
}


###
### Functions for PS matching
################################################################################


###
### Functions for coordinating the entire data preparation
################################################################################
##' One-stop function for data preparation
##'
##' Takes a data_frame and
##'
##' @param data data_frame
##' @param formula1 PS model formula for the entire cohort estimation.
##' @param formula2 PS model formula for the trimmed cohort estimation.
##' @param family PS model family
##' @param ps_prefix1 PS variable prefix for the entire cohort estimation
##' @param ps_prefix2 PS variable prefix for the trimmed cohort estimation
##' @param df_trim_thres data_frame with columns \code{trim_method_name} and \code{thres} to indicate
##' @param A_name Treatment indicator name in \code{data}.
##' @param ps0_name Name of the column in \code{data} for the PS for the first level.
##' @param ps1_name Name of the column in \code{data} for the PS for the second level.
##' @param ps2_name Name of the column in \code{data} for the PS for the third level.
##' @param levels Character vector holding three elements corresponding to levels in \code{A}
##'
##' @return nested data_frame containing
##'
##' @export
prepare_data <- function(data,
                         formula1,
                         formula2,
                         family,
                         ps_prefix1,
                         ps_prefix2,
                         df_trim_thres,
                         A_name,
                         levels) {

    ## Add first-step GPS based on the entire cohort.
    data <- add_gps(data = data,
                    formula = formula1,
                    family = family,
                    ps_prefix = ps_prefix1)

    ## Trim data by various methods and hold in a nested data frame
    nested_df <- df_trim_thres %>%
        group_by(trim_method_name, thres) %>%
        ## Put it in a list column.
        mutate(trimmed_data = list(trim_data(data = data,
                                             ## String is used to retrieve the right function.
                                             trim_method_name = trim_method_name,
                                             thres = thres,
                                             A_name = A_name,
                                             ps0_name = paste0(ps_prefix1, levels)[1],
                                             ps1_name = paste0(ps_prefix1, levels)[2],
                                             ps2_name = paste0(ps_prefix1, levels)[3],
                                             levels = levels)))

    ## Conduct PS re-estimation within each trimmed cohort.
    nested_df <- nested_df %>%
        group_by(trim_method_name, thres) %>%
        mutate(trimmed_data = map(trimmed_data, function(data) {
            add_gps(data = data,
                    formula = formula2,
                    family = VGAM::multinomial(parallel = FALSE),
                    ps_prefix = ps_prefix2)
        }))

    ## Add weights
    nested_df <- nested_df %>%
        group_by(trim_method_name, thres) %>%
        mutate(trimmed_data = map(trimmed_data, function(data) {
            add_all_weights(data = data,
                            A_name = A_name,
                            levels = levels,
                            ps_prefix1 = ps_prefix1,
                            ps_prefix2 = ps_prefix2)
        }))

    ## Add matching id


    ## Return nested df indexed by trimming methods.
    return(nested_df)
}
kaz-yos/trim3 documentation built on May 3, 2019, 7:40 p.m.