R/detect_idiosyncratic.R

Defines functions get_p_value detect_idiosyncratic

Documented in detect_idiosyncratic get_p_value

## Set global variables for foreach
globalVariables('b')

#' detect_idiosyncratic
#'
#' Test for systematic treatment effect heterogeneity using Fisherian
#' permutation inference methods.
#' @param formula An object of class formula, as in lm(), such as Y ~ Z with
#'   only the treatment variable on the right-hand side.
#' @param data A data.frame, tbl_df, or data.table with the input data.
#' @param interaction.formula A right-sided formula with pre-treatment
#'   covariates to model treatment effects for on the right hand side, such as ~
#'   x1 + x2 + x3. Defaultis NULL (no interactions modeled)
#' @param control.formula A right-sided formula with pre-treatment covariates to
#'   adjust for on the right hand side, such as ~ x1 + x2 + x3. Default is NULL
#'   (no variables adjusted for)
#' @param plugin Whether to calculate the plug-in p-value without sweeping over
#'   range of possible treatment effect magnitudes. Default is FALSE.
#' @param tau.hat The value of the plug-in treatment effect. Default is sample
#'   average treatment effect.
#' @param test.stat  Test statistic function to use on the data. Default is
#'   shifted Kolmogorov-Smirnov statistic, potentially with covariate adjustment
#'   depending on passed arguments.  test.stat can be a string name for a test statistic function, or the function itself.
#' @param te.vec Vector of taus to examine if you want to override generating
#'   ones automatically. Default is NULL.
#' @param B  Number of permutations to take. Default is 500.
#' @param gamma How wide of a CI to make around tau-hat for search. Default is
#'   0.0001.
#' @param grid.gamma Parameter to govern where the grid points are sampled.
#'   Bigger values means more samples towards the estimated tau-hat. Default is
#'   100*gamma.
#' @param grid.size Number of points in the grid. Default is 151.
#' @param return.matrix  Whether to return the matrix of all the imputed
#'   statistics.  Default is FALSE.
#' @param na.rm A logical flag indicating whether to list-wise delete missing
#'   data. The function will report an error if missing data exist. Default is
#'   FALSE.
#' @param n.cores Number of cores to use to parallelize permutation step.
#'   Default is 1.
#' @param verbose  Whether to print out progress bar when fitting and other
#'   diagnostics. Default is TRUE.
#' @param ... Extra arguments passed to the generate_permutations function and
#'   test.stat functions.
#'
#' @return If plug-in, the value of the test and the associated p-value. If not,
#'   a list with the value of the test statistic on the observed data, the value
#'   of the CI-adjusted p-value, the plug-in p-value, and other information on
#'   the test.
#'
#' @examples
#' Z <- rep(c(0, 1), 100)
#' tau <- 4
#' Y <- ifelse(Z, rnorm(100, tau), rnorm(100, 0))
#' df <- data.frame(Y=Y, Z=Z)
#' tst <- detect_idiosyncratic(Y ~ Z, df, B = 50, grid.size = 50)
#' @export
#' @importFrom graphics abline lines plot rug
#' @importFrom stats binom.test binomial coef confint ecdf glm lm lowess predict
#'   qchisq qnorm var vcov na.omit
#' @importFrom utils setTxtProgressBar txtProgressBar
#' @importFrom quantreg rq
#' @importFrom mvtnorm rmvnorm
#' @importFrom foreach "%do%" "%dopar%" foreach
#' @importFrom parallel makePSOCKcluster stopCluster
#' @importFrom doParallel registerDoParallel
#' @importFrom ggplot2 ggplot aes facet_grid geom_point geom_smooth labs
detect_idiosyncratic <- function(formula, data,
                                 interaction.formula = NULL, control.formula = NULL,
                                 plugin = FALSE, tau.hat = NULL,
                                 test.stat = ifelse( is.null(W) & is.null(X), "SKS_stat", ifelse( is.null(W), "SKS_stat_cov", "SKS_stat_int_cov" ) ),
                                 te.vec = NULL,
                                 B = 500, gamma = 0.0001, grid.gamma = 100*gamma,
                                 grid.size = 151, return.matrix = FALSE, na.rm = FALSE,
                                 n.cores = 1, verbose = TRUE, ...){
    
    ## ---------------------------
    ## Get variables from formulas
    ## and check formulas
    ## ---------------------------
    if(inherits(data, "data.frame")) {
        data <- as.data.frame(data)
    }else{
        stop("The input data must be of class tbl_df, data.table, or data.frame.")
    }
    
    if(length(lhs_vars(formula)) != 1 | length(rhs_vars(formula)) != 1){
        stop("The formula argument must be of the form outcome ~ treatment.")
    }
    main.vars <- get_vars(formula,data=data)
    if(any(!(main.vars %in% colnames(data)))){
        stop("Some variables in formula are not present in your data.")
    }
    
    if(!is.null(interaction.formula)){
        if(length(lhs_vars(interaction.formula)) != 0){
            stop("Do not provide an outcome variable in interaction.formula.")
        }
        interaction.vars <- get_vars(interaction.formula,data=data)
        if(any(!(interaction.vars %in% colnames(data)))){
            stop("Some variables in interaction.formula are not present in your data.")
        }
        if(any(main.vars %in% interaction.vars)){
            stop("Either your outcome variable or your treatment variable is present in your interaction formula.")
        }
    }else{
        interaction.vars <- NULL
    }

    if(!is.null(control.formula)){
        if(length(lhs_vars(control.formula)) != 0){
            stop("Do not provide an outcome variable in control.formula.")
        }
        control.vars <- get_vars(control.formula,data=data)
        if(any(!(control.vars %in% colnames(data)))){
            stop("Some variables in control.formula are not present in your data.")
        }
        if(any(main.vars %in% interaction.vars)){
            stop("Either your outcome variable or your treatment variable is present in your control formula.")
        }
        if(!is.null(interaction.formula)){
            if(any(control.vars %in% interaction.vars)){
                stop("You have variables in your interaction formula that are also present in your control formula.")
            }
        }
    }else{
        control.vars <- NULL
    }

    ## NA handling
    vars <- c(main.vars, interaction.vars, control.vars)
    if(na.rm == TRUE){
        data <- na.omit(data[,vars])
    }else{
        if(sum(is.na(data[,vars])) > 0){
            stop("You have missing values in your input data. Either set na.rm = TRUE or check your input data for missingness.")
        }
    }

    ## Extract data
    Y <- data[,main.vars[1]]
    Z <- data[,main.vars[2]]
    if(!is.null(interaction.formula)){
        W <- as.matrix(data[,interaction.vars])
    }else{
        W <- NULL
    }
    if(!is.null(control.formula)){
        X <- as.matrix(data[,control.vars])
    }else{
        X <- NULL
    }
    
    ## ------------------------
    ## Run checks on input data
    ## ------------------------
    ## Class checks on Y and Z
    if(!inherits(Y, "numeric")){
        stop("Y must be a numeric vector.")
    }
    if(inherits(Z, "logical")){
        Z <- ifelse(Z, 1, 0)
    }else if(inherits(Z, "factor") | inherits(Z, "character")){
        Z <- as.character(Z)
        tcat <- unique(Z)[1]
        Z <- ifelse(Z == tcat, 1, 0)
        cat("Z is factor or character class, setting", tcat, "as treatment category.\n")
    }

    ## Class and input checks on W and X
    if(plugin & !is.null(interaction.formula)){
        warning("Running plug-in test but covariates provided for interaction.formula, will not adjust for specified heterogeneity.")
    }
    if(plugin & is.null(tau.hat)){
        warning("Using plug-in test with no argument passed to tau.hat, will estimate tau.hat from the data.\n")
    }
    if(!is.null(te.vec) & !is.null(W)){
        warning("Adjusting for existing heterogeneity but treatment effect vector provided, ignoring provided treatment effect vector.")
    }
    if(!is.null(W)){
        if(!is.function(test.stat) && test.stat %in% c("WSKS_t", "SKS_pool_t", "WSKS.t", "SKS.pool.t")){
            if(ncol(W) > 1){
                stop("Can only adjust for a single (categorical) covariate when using WSKS_t or SKS_pool_t as test statistics.")
            }
            W <- as.factor(W)
        }
        if(!inherits(W, "factor")){
            unq.check <- apply(W, 2, function(x){length(unique(x))})
        }else{
            unq.check <- length(unique(W))
        }
        if(any(unq.check == 1)){
            stop("Some variables specified for the interaction model in interaction.formula have no variation.")
        }
    }
    if(!is.null(X)){
        if(!(inherits(X, "data.frame") | inherits(X, "matrix"))){
            stop("X must be either a data frame or a matrix.")
        }
        unq.check <- apply(X, 2, function(x){length(unique(x))})
        if(any(unq.check == 1)){
            stop("Some variables specified for adjustment in control.formula have no variation.")
        }
    }

    ## Checks on functions
    ## Accept both old dot-case and new snake_case names
    no.adj.funs <- c("KS_stat", "SKS_stat", "rq_stat",
                     "KS.stat", "SKS.stat", "rq.stat")
    adj.funs <- c("SKS_stat_cov_pool", "SKS_stat_cov", "SKS_stat_cov_rq", "rq_stat_cond_cov", "rq_stat_uncond_cov",
                  "SKS.stat.cov.pool", "SKS.stat.cov", "SKS.stat.cov.rq", "rq.stat.cond.cov", "rq.stat.uncond.cov")
    adj.int.funs <- c("SKS_stat_int_cov_pool", "SKS_stat_int_cov",
                      "SKS.stat.int.cov.pool", "SKS.stat.int.cov")
    int.funs <- c("WSKS_t", "SKS_pool_t",
                  "WSKS.t", "SKS.pool.t")
    if ( is.function( test.stat ) ) {
      forms.l <-  formals( test.stat )
        forms <- names( forms.l )
        if ( is.null(W) ) {
          if ( "W" %in% forms && !is.null( forms.l["W"]  ) ) {
            warning( "No W passed but test.stat calls for passed W" )
          }
        } else {
          if ( !( "W" %in% forms ) ) {
            warning( "W passed, but test.stat does not provide for W" )
          }
        }
        if ( is.null(X) ) {
          if ( "X" %in% forms && !is.null( forms.l["X"]  ) ) {
            warning( "No X passed but test.stat calls for passed X" )
          }
        } else {
          if ( !( "X" %in% forms ) ) {
            warning( "X passed, but test.stat does not provide for X" )
          }
        }
    } else if(is.null(W) & is.null(X)){
        if(!(test.stat %in% no.adj.funs)){
            stop("You have provided an invalid test statistic when not adjusting for covariates or specifying interactions. Must provide one of KS_stat, SKS_stat, or rq_stat. Please see test_stat_info() for more information.")
        }
    } else if(is.null(W)){
        if(!(test.stat %in% adj.funs)){
            stop("You have provided an invalid test statistic when adjusting for covariates in control.formula but not specifying interactions. Must provide one of SKS_stat_cov_pool, SKS_stat_cov, SKS_stat_cov_rq, rq_stat_cond_cov, or rq_stat_uncond_cov. Please see test_stat_info() for more information.")
        }
    } else if(is.null(X)){
        just.int.funs <- c(adj.int.funs, int.funs)
        if(!(test.stat %in% just.int.funs)){
            stop("You have provided an invalid test statistic when specifying interactions in interaction.formula but not adjusting for covariates. Must provide one of SKS_stat_int_cov_pool, SKS_stat_int_cov, WSKS_t, or SKS_pool_t. Please see test_stat_info() for more information.")
        }
    } else{
        if(!(test.stat %in% adj.int.funs)){
            stop("You have provided an invalid test statistic when specifying interactions in interaction.formula and adjusting for covariates in control.formula. Must provide one of SKS_stat_int_cov_pool or SKS_stat_int_cov. Please see test_stat_info() for more information.")
        }
    }
    
    ## ------------------------------------------------
    ## Detect whether to run plugin, FRTCI, or interact
    ## ------------------------------------------------
    if(plugin){ ## Plug-in test
        if(is.null(tau.hat)){
            tau.hat <- mean(Y[Z == 1]) - mean(Y[Z == 0])
        }
        fpi_out <- FRTplug(Y = Y, Z = Z, test.stat = match.fun(test.stat), tau.hat = tau.hat, verbose= verbose, ...)
    }else if(is.null(W)){ ## FRTCI - with or without adjusting
        fpi_out <- FRTCI(Y = Y, Z = Z, X = X, test.stat = match.fun(test.stat), B = B, gamma = gamma,
                         grid.gamma = grid.gamma, grid.size = grid.size,
                         te.vec = te.vec, return.matrix = return.matrix,
                         n.cores = n.cores, verbose = verbose, ...)
    }else{ ## FRTCI_interact (test for variation beyond covariates)
        fpi_out <- FRTCI_interact(Y = Y, Z = Z, W = W, X = X,
                                  test.stat = match.fun(test.stat), B = B, gamma = gamma,
                                  grid.gamma = grid.gamma, grid.size = grid.size,
                                  return.matrix = return.matrix, n.cores = n.cores,
                                  verbose = verbose, ...)
    }

    ## Add to output
    fpi_out$call <- match.call()
    if ( is.character( test.stat ) ) {
      fpi_out$test.stat <- test.stat
    } else {
      fpi_out$test.stat <- as.character( substitute(test.stat) )
    }
    
    return(fpi_out)

}

#' get p-value along with uncertainty on p-value
#' 
#' Give confidence bounds (from monte carlo simulation error) for the p-values returned by a test
#' 
#' @param tst A FRTCI.test object from detect_idiosyncratic()
#' 
#' @return p-value and range of p-values due to monte carlo error.
#'
#' @examples
#' Z <- rep(c(0, 1), 100)
#' tau <- 4
#' Y <- ifelse(Z, rnorm(100, tau), rnorm(100, 0))
#' df <- data.frame(Y=Y, Z=Z)
#' tst <- detect_idiosyncratic(Y ~ Z, df, B = 50, grid.size = 50)
#' get_p_value( tst )
#'
#' @export
get_p_value <- function( tst ) {
    cnts <- (tst$ci.p - tst$gamma) * tst$B
    bts <- vapply( cnts, function( cnt ) {
        bt <- binom.test( cnt, tst$B )
        as.numeric( bt$conf.int )
    }, numeric(2) )
    stopifnot( tst$p.value == max( tst$ci.p ) )
    c( p.value=max( tst$ci.p ), min.p= min( bts[1,] ), max.p=max( bts[2,] ), plug=tst$p.value.plug )
}

Try the hettx package in your browser

Any scripts or data that you put into this service are public.

hettx documentation built on Feb. 24, 2026, 5:08 p.m.