R/difference_in_means.R

Defines functions difference_in_means_internal weighted_var_internal difference_in_means

Documented in difference_in_means

#' Built-in Estimators: Difference-in-means
#'
#' @param formula An object of class "formula", such as Y ~ Z
#' @param blocks An optional bare (unquoted) name of the block variable. Use for blocked designs only.
#' @param clusters An optional bare (unquoted) name of the variable that corresponds to the clusters in the data; used for cluster randomized designs. For blocked designs, clusters must be within blocks.
#' @param data A data.frame.
#' @param weights An optional bare (unquoted) name of the weights variable.
#' @param subset An optional bare (unquoted) expression specifying a subset of observations to be used.
#' @param alpha The significance level, 0.05 by default.
#' @param condition1 names of the conditions to be compared. Effects are estimated with condition1 as control and condition2 as treatment. If unspecified, condition1 is the "first" condition and condition2 is the "second" according to r defaults.
#' @param condition2 names of the conditions to be compared. Effects are estimated with condition1 as control and condition2 as treatment. If unspecified, condition1 is the "first" condition and condition2 is the "second" according to r defaults.
#'
#'
#' @details This function implements difference-in-means estimation, with and without blocking. Standard errors are estimated as the square root of the sum of the within-group variances, divided by their respective sample sizes (Equation 3.6 in Gerber and Green 2012). If blocked, the difference in means estimate is taken in each block, then averaged together according to block size.
#'
#' @export
#'
#' @examples
#'
#'  df <- data.frame(Y = rnorm(100),
#'                   Z = sample(1:3, 100, replace = TRUE),
#'                   block = sample(c("A", "B", "C"), 100, replace = TRUE))
#'
#'  difference_in_means(Y ~ Z, data = df)
#'  difference_in_means(Y ~ Z, condition1 = 3, condition2 = 2, data = df)
#'
#'  difference_in_means(Y ~ Z, blocks = block, data = df)
#'  difference_in_means(Y ~ Z, blocks = block, condition1 = 3, condition2 = 2, data = df)
#'
difference_in_means <-
  function(formula,
           blocks,
           clusters,
           condition1 = NULL,
           condition2 = NULL,
           data,
           weights,
           subset,
           alpha = .05) {

    if (length(all.vars(formula[[3]])) > 1) {
      stop(
        "The formula should only include one variable on the right-hand side: the treatment variable."
      )
    }

    where <- parent.frame()
    model_data <- eval(substitute(
      clean_model_data(
        formula = formula,
        data = data,
        subset = subset,
        cluster = clusters,
        block = blocks,
        weights = weights,
        where = where
      )
    ))

    data <- data.frame(y = model_data$outcome,
                       t = model_data$design_matrix[, ncol(model_data$design_matrix)])
    data$cluster <- model_data$cluster
    data$weights <- model_data$weights
    data$block <- model_data$block

    rm(model_data)

    if (is.null(data$block)){

      return_frame <- difference_in_means_internal(
        condition1 = condition1,
        condition2 = condition2,
        data = data,
        alpha = alpha
      )

      ## todo: add inflation from GG fn 20 ch 3
      if (is.na(return_frame$df)) {
        return_frame$df <- with(return_frame,
                                N - 2)
      }

      return_frame$p <- with(return_frame,
                             2 * pt(abs(est / se), df = df, lower.tail = FALSE))
      return_frame$ci_lower <- with(return_frame,
                                    est - qt(1 - alpha / 2, df = df) * se)
      return_frame$ci_upper <- with(return_frame,
                                    est + qt(1 - alpha / 2, df = df) * se)

      return_list <- as.list(return_frame)

    } else {

      pair_matched <- FALSE

      if (!is.null(data$cluster)) {

        ## Check that clusters nest within blocks
        if (!all(tapply(data$block, data$cluster, function(x)
          all(x == x[1])))) {
          stop("All units within a cluster must be in the same block.")
        }

        ## get number of clusters per block
        clust_per_block <- tapply(data$cluster,
                                  data$block,
                                  function(x) length(unique(x)))
      } else {
        clust_per_block <- tabulate(as.factor(data$block))
      }

      ## Check if design is pair matched
      if (any(clust_per_block == 1)) {
        stop(
          "Some blocks have only one unit or cluster. Blocks must have multiple units or clusters."
        )
      } else if (all(clust_per_block == 2)) {
        pair_matched <- TRUE
      } else if (any(clust_per_block == 2) & any(clust_per_block > 2)) {
        stop(
          "Some blocks have two units or clusters, while others have more units or clusters. Design must either be paired or all blocks must be of size 3 or greater."
        )
      }

      block_dfs <- split(data, data$block)

      block_estimates <- lapply(block_dfs, function(x) {
        difference_in_means_internal(
          data = x,
          condition1 = condition1,
          condition2 = condition2,
          pair_matched = pair_matched,
          alpha = alpha
        )
      })

      block_estimates <- do.call(rbind, block_estimates)

      N_overall <- with(block_estimates, sum(N))

      # Blocked design, (Gerber Green 2012, p73, eq3.10)
      diff <- with(block_estimates, sum(est * N/N_overall))

      df <- NA
      n_blocks <- nrow(block_estimates)

      if (pair_matched) {

        if (is.null(data$cluster)) {
          # Pair matched, cluster randomized (Gerber Green 2012, p77, eq3.16)
          se <-
            with(
              block_estimates,
              sqrt( (1 / (n_blocks * (n_blocks - 1))) * sum((est - diff)^2) )
            )
        } else {
          # Pair matched, unit randomized (Imai, King, Nall 2009, p36, eq6)
          se <-
            with(
              block_estimates,
              sqrt(
                ( n_blocks / ((n_blocks - 1) * N_overall^2) ) *
                  sum( (N * est - (N_overall * diff)/n_blocks)^2 )
              )
            )

          # from  (Imai, King, Nall 2009, p37)
          df <- n_blocks - 1
        }

      } else {
        # Block randomized (Gerber and Green 2012, p. 74, footnote 17)
        se <- with(block_estimates, sqrt(sum(se^2 * (N/N_overall)^2)))
      }

      if(is.na(df)) {
        ## we don't know if this is correct!
        df <- N_overall - n_blocks
      }

      p <- 2 * pt(abs(diff / se), df = df, lower.tail = FALSE)
      ci_lower <- diff - qt(1 - alpha / 2, df = df) * se
      ci_upper <- diff + qt(1 - alpha / 2, df = df) * se

      return_list <-
        list(
          est = diff,
          se = se,
          p = p,
          ci_lower = ci_lower,
          ci_upper = ci_upper,
          df = df
        )

    }

    return_list <- dim_like_return(return_list,
                                   alpha = alpha,
                                   formula = formula)

    attr(return_list, "class") <- "difference_in_means"

    return(return_list)

  }



weighted_var_internal <- function(w, x, xWbar){
  wbar <- mean(w)
  n <- length(w)
  return(
    n / ((n - 1) * sum(w) ^ 2) *
      (
        sum((w * x - wbar * xWbar) ^ 2)
        - 2 * xWbar * sum((w - wbar) * (w * x - wbar * xWbar))
        + xWbar ^ 2 * sum((w - wbar) ^ 2)
      )
  )
}



difference_in_means_internal <-
  function(condition1 = NULL,
           condition2 = NULL,
           data,
           pair_matched = FALSE,
           alpha = .05) {

    if (is.factor(data$t)) {
      condition_names <- levels(data$t)
    } else{
      condition_names <- sort(unique(data$t))
    }

    if (length(condition_names) == 1) {
      stop("Must have units with both treatment conditions within each block.")
    }

    if (is.null(condition1) & is.null(condition2)) {
      condition1 <- condition_names[1]
      condition2 <- condition_names[2]
    }

    # Check that treatment status is uniform within cluster, checked here
    # so that the treatment vector t doesn't have to be built anywhere else
    if (!is.null(data$cluster)) {

      if (is.factor(data$cluster)) {
        data$cluster <- droplevels(data$cluster)
      }

      if (!all(tapply(data$t, data$cluster, function(x) all(x == x[1])))) {
        stop(
          "All units within a cluster must have the same treatment condition."
        )
      }
    }

    N <- length(data$y)

    Y2 <- data$y[data$t == condition2]
    Y1 <- data$y[data$t == condition1]

    ## Check to make sure multiple in each group if pair matched is false
    if (!pair_matched & (length(Y2) == 1 | length(Y1) == 1)) {
      stop(
        "Not a pair matched design and one treatment condition only has one value, making standard errors impossible to calculate."
      )
    }

    df <- NA

    if (is.null(data$weights)) {

      diff <- mean(Y2) - mean(Y1)

      if (pair_matched) {
        # Pair matched designs
        se <- NA
      } else {

        if (is.null(data$cluster)) {
        # Non-pair matched designs, unit level randomization
        se <- sqrt(var(Y2) / length(Y2) + var(Y1) / length(Y1))

        } else {
          # Non-pair matched designs, cluster randomization
          # (Gerber and Green 2012, p. 83, eq. 3.23)
          k <- length(unique(data$cluster))

          # In the below we set na.rm = T because if cluster is a factor, subsetting
          # it will keep the other levels
          se <- sqrt(
            (var(tapply(Y2, data$cluster[data$t == condition2], mean), na.rm = T) * N) /
              (k * length(Y2)) +
            (var(tapply(Y1, data$cluster[data$t == condition1], mean), na.rm = T) * N) /
              (k * length(Y1))
          )
        }

        df <- se^4 /
          (
            (var(Y2) / length(Y2))^2 / (length(Y2) - 1) +
              (var(Y1) / length(Y1))^2 / (length(Y1) - 1)
          )
      }


    } else {

      # TODO: weights and clusters
      # TODO: weights and matched pair

      w2 <- data$weights[data$t == condition2]
      w1 <- data$weights[data$t == condition1]

      mean2 <- weighted.mean(Y2, w2)
      mean1 <- weighted.mean(Y1, w1)
      diff <-  mean2 - mean1

      var2 <- weighted_var_internal(w2, Y2, mean2)
      var1 <- weighted_var_internal(w1, Y1, mean1)

      se <- sqrt(var2 + var1)

      # todo: check welch approximation with weights
      df <- se^4 /
        (
          (var2^2 / (length(Y2)-1)) +
          (var1^2 / (length(Y1)-1))
        )
    }

    return_frame <-
      data.frame(
        est = diff,
        se = se,
        N = N,
        df = df
      )

    return(return_frame)

  }
DeclareDesign/DDestimate documentation built on Jan. 13, 2018, 2:19 a.m.