R/overall.R
In latenetwork: Inference on LATEs under Network Interference of Unknown Form

Documented in overall

#' Inference on Average Overall Effect Parameters
#'
#' Inference on the average overall effect of the IV on the outcome,
#' that on the treatment receipt, and the local average overall effect
#' in the presence of network spillover of unknown form
#'
#' @details
#' The `overall()` function estimates the average overall effect of the IV
#' on the outcome, that on the treatment receipt, and
#' the local average overall effect via inverse probability weighting
#' in the approximate neighborhood interference framework.
#' The function also computes the standard errors and the confidence intervals
#' for the target parameters based on the network HAC estimation and
#' the wild bootstrap.
#' For more details, see Hoshino and Yanagi (2023).
#' The lengths of `Y`, `D`, `Z`, `S` and
#' of the row and column of `A` must be the same.
#' `K` must be a positive integer.
#' `bw` must be `NULL` or a non-negative number.
#' `B` must be `NULL` or a positive number.
#' `alp` must be a positive number between 0 and 0.5.
#'
#' @param Y An n-dimensional outcome vector
#' @param D An n-dimensional binary treatment vector
#' @param Z An n-dimensional binary instrumental vector
#' @param S An n-dimensional logical vector to indicate whether each unit
#' belongs to the sub-population S
#' @param A An n times n symmetric binary adjacency matrix
#' @param K A scalar to indicate the range of neighborhood
#' used for constructing the interference set.
#' Default is 1.
#' @param bw A scalar of the bandwidth used for the HAC estimation and
#' the wild bootstrap.
#' If `bw = NULL`, the rule-of-thumb bandwidth proposed by Leung (2022) is used.
#' Default is NULL.
#' @param B The number of bootstrap repetitions.
#' If `B = NULL`, the wild bootstrap is skipped.
#' Default is NULL.
#' @param alp The significance level. Default is 0.05.
#'
#' @returns A data.frame containing the following elements:
#' \item{est}{The parameter estimate}
#' \item{HAC_SE}{The standard error computed by the network HAC estimation}
#' \item{HAC_CI_L}{The lower bound of the confidence interval computed by
#' the network HAC estimation}
#' \item{HAC_CI_U}{The upper bound of the confidence interval computed by
#' the network HAC estimation}
#' \item{wild_SE}{The standard error computed by the wild bootstrap}
#' \item{wild_CI_L}{The lower bound of the confidence interval computed by
#' the wild bootstrap}
#' \item{wild_CI_U}{The upper bound of the confidence interval computed by
#' the wild bootstrap}
#' \item{bw}{The bandwidth used for the HAC estimation
#' and the wild bootstrap}
#' \item{size}{The size of the subpopulation S}
#'
#' @examples
#' # Generate artificial data
#' set.seed(1)
#' n <- 2000
#' data <- latenetwork::datageneration(n = n)
#'
#' # Arguments
#' Y   <- data$Y
#' D   <- data$D
#' Z   <- data$Z
#' S   <- rep(TRUE, n)
#' A   <- data$A
#' K   <- 1
#' bw  <- NULL
#' B   <- NULL
#' alp <- 0.05
#'
#' # Estimation
#' latenetwork::overall(Y = Y,
#'                      D = D,
#'                      Z = Z,
#'                      S = S,
#'                      A = A,
#'                      K = K,
#'                      bw = bw,
#'                      B = B,
#'                      alp = alp)
#'
#' @references Hoshino, T., & Yanagi, T. (2023).
#' Causal inference with noncompliance and unknown interference.
#' arXiv preprint arXiv:2108.07455.
#'
#' Leung, M.P. (2022).
#' Causal inference under approximate neighborhood interference.
#' Econometrica, 90(1), pp.267-293.
#'
#' @export
#'
overall <- function(Y,
                    D,
                    Z,
                    S,
                    A,
                    K = 1,
                    bw = NULL,
                    B = NULL,
                    alp = 0.05) {

  # Error handling -------------------------------------------------------------

  error <- errorhandling(Y = Y,
                         D = D,
                         Z = Z,
                         IEM = NULL,
                         S = S,
                         A = A,
                         bw = bw,
                         B = B,
                         alp = alp)

  # Variable definitions -------------------------------------------------------

  # Distance matrix
  graph0 <- igraph::graph_from_adjacency_matrix(adjmatrix = A,
                                                mode = "undirected")

  distance0 <- igraph::distances(graph = graph0,
                                 v  = igraph::V(graph0),
                                 to = igraph::V(graph0),
                                 algorithm = "dijkstra")

  # Variable definitions
  Y0 <- Y
  D0 <- D
  Z0 <- Z

  Y <- Y[S]
  D <- D[S]
  Z <- Z[S]
  A <- A[S, S]
  graph <- igraph::graph_from_adjacency_matrix(adjmatrix = A,
                                               mode = "undirected")
  distance <- distance0[S, S]

  # Size of the sub-population S
  size <- sum(S)

  # List of neighbors
  neighbor_list <- list()
  for (i in which(S)) {
    neighbor <- which(distance0[i, ] >= 0 & distance0[i, ] <= K)
    neighbor_list <- append(neighbor_list, list(neighbor))
  }

  # Bandwidth ------------------------------------------------------------------

  if (is.null(bw)) {

    # Find largest connected component
    connect <- igraph::components(graph = graph)
    connect_member <-
      which(connect$membership == statip::mfv(connect$membership))

    # Average path length
    average_path_length <-
      sum(distance[connect_member, connect_member]) /
      (length(connect_member) * (length(connect_member) - 1))

    # Average degree
    average_degree <- sum(A) / size

    # Rule-of-thumb bandwidth proposed by Leung (2022)
    if (average_degree > 1) {

      bw0 <- ifelse(average_path_length < 2 * log(size) / log(average_degree),
                    0.5 * average_path_length,
                    average_path_length^(1/3))

      bw <- round(max(bw0, 2 * K))

    } else {

      bw <- 2 * K

    }

  }

  # Causal inference -----------------------------------------------------------

  # Function to compute the generalized propensity score
  # Input "z": A value of IV
  # Output: A value of GPS
  GPS <- function(z) {
    mean(Z == z)
  }

  # Function to compute the conditional outcome mean
  # Input "z": A value of IV
  # Output: A value of the conditional outcome mean
  mu_Y <- function(z) {
    summand <- rep(0, size)
    for (i in 1:size) {
      neighbor <- neighbor_list[[i]]
      if (length(neighbor) > 0) {
        summand[i] <- sum(Y0[neighbor]) * (Z[i] == z) / GPS(z = z)
      }
    }
    return(mean(summand))
  }

  # Function to compute the conditional treatment mean for AOED
  # Input "z": A value of IV
  # Output: A value of the conditional treatment mean for AOED
  mu_D_AOED <- function(z) {
    summand <- rep(0, size)
    for (i in 1:size) {
      neighbor <- neighbor_list[[i]]
      if (length(neighbor) > 0) {
        summand[i] <- sum(D0[neighbor]) * (Z[i] == z) / GPS(z = z)
      }
    }
    return(mean(summand))
  }

  # Function to compute the conditional treatment mean for ADED
  # Input "z": A value of IV
  # Output: A value of the conditional treatment mean for ADED
  mu_D_ADED <- function(z) {
    mean(D * (Z == z)) / GPS(z = z)
  }

  # Function to compute AOEY
  # Input: Nothing
  # Output: A value of AOEY
  AOEY <- function() {
    mu_Y(z = 1) - mu_Y(z = 0)
  }

  # Function to compute AOED
  # Input: Nothing
  # Output: A value of AOED
  AOED <- function() {
    mu_D_AOED(z = 1) - mu_D_AOED(z = 0)
  }

  # Function to compute ADED
  # Input: Nothing
  # Output: A value of ADED
  ADED <- function() {
    mu_D_ADED(z = 1) - mu_D_ADED(z = 0)
  }

  # Function to compute LAOE
  # Input: Nothing
  # Output: A value of LAOE
  LAOE <- function() {
    AOEY() / ADED()
  }

  # Estimates
  est_AOEY <- AOEY()
  est_AOED <- AOED()
  est_ADED <- ADED()
  est_LAOE <- LAOE()

  # Variable definitions
  W_Z1 <- (Z == 1)
  W_Z0 <- (Z == 0)

  W_Y <- W_D_AOED <- W_D_ADED <- rep(0, size)
  for (i in 1:size) {

    neighbor <- neighbor_list[[i]]

    multiplier <- (Z[i] == 1) / GPS(z = 1) - (Z[i] == 0) / GPS(z = 0)

    W_Y[i]      <- sum(Y0[neighbor]) * multiplier
    W_D_AOED[i] <- sum(D0[neighbor]) * multiplier
    W_D_ADED[i] <- D[i] * multiplier

  }

  V_AOEY <- W_Y -
    W_Z1 * mu_Y(z = 1) / GPS(z = 1) +
    W_Z0 * mu_Y(z = 0) / GPS(z = 0)

  V_AOED <- W_D_AOED -
    W_Z1 * mu_D_AOED(z = 1) / GPS(z = 1) +
    W_Z0 * mu_D_AOED(z = 0) / GPS(z = 0)

  V_ADED <- W_D_ADED -
    W_Z1 * mu_D_ADED(z = 1) / GPS(z = 1) +
    W_Z0 * mu_D_ADED(z = 0) / GPS(z = 0)

  V_LAOE <- V_AOEY / est_ADED - V_ADED * est_AOEY / est_ADED^2

  # Function to compute HAC standard error and confidence interval
  # Input: Nothing
  # Output: A list containing HAC standard error and confidence interval
  HAC_func <- function() {

    # Standard error
    neighbor_mat <- (distance <= bw)

    SE_AOEY <- sqrt(t(V_AOEY) %*% neighbor_mat %*% V_AOEY / size^2)
    SE_AOED <- sqrt(t(V_AOED) %*% neighbor_mat %*% V_AOED / size^2)
    SE_ADED <- sqrt(t(V_ADED) %*% neighbor_mat %*% V_ADED / size^2)
    SE_LAOE <- sqrt(t(V_LAOE) %*% neighbor_mat %*% V_LAOE / size^2)

    # Critical value based on asymptotic normality
    cvnorm <- stats::qnorm(1 - alp / 2)

    # Confidence interval
    CI_AOEY <- c(est_AOEY - cvnorm * SE_AOEY,
                 est_AOEY + cvnorm * SE_AOEY)

    CI_AOED <- c(est_AOED - cvnorm * SE_AOED,
                 est_AOED + cvnorm * SE_AOED)

    CI_ADED <- c(est_ADED - cvnorm * SE_ADED,
                 est_ADED + cvnorm * SE_ADED)

    CI_LAOE <- c(est_LAOE - cvnorm * SE_LAOE,
                 est_LAOE + cvnorm * SE_LAOE)

    # Result
    HAC_AOEY <- c(SE_AOEY, CI_AOEY)
    HAC_AOED <- c(SE_AOED, CI_AOED)
    HAC_ADED <- c(SE_ADED, CI_ADED)
    HAC_LAOE <- c(SE_LAOE, CI_LAOE)

    return(list(AOEY = HAC_AOEY,
                AOED = HAC_AOED,
                ADED = HAC_ADED,
                LAOE = HAC_LAOE))

  }

  # Function to compute the Omega matrix used for wild bootstrap
  # Input: Nothing
  # Output: the Omega matrix
  Omega_func <- function() {

    # Compute the numerator
    neighbor_mat <- (distance <= bw)
    numerator <- NULL
    for (i in 1:size) {

      intersect_mat <- sweep(x = neighbor_mat,
                             MARGIN = 2,
                             FUN = "*",
                             neighbor_mat[i, ])

      numerator <- rbind(numerator,
                         apply(intersect_mat, MARGIN = 1, FUN = sum))

    }

    # Compute the denominator
    denominator <- sum(neighbor_mat) / size

    return(numerator/denominator)

  }

  # Function to compute wild bootstrap standard error and confidence interval
  # Input: Nothing
  # Output: A list containing bootstrap standard error and confidence interval
  wild_func <- function() {

    # Square root matrix of Omega
    Omega <- Omega_func()
    Omega <- methods::as(Omega, "dgCMatrix")
    E <- eigen(Omega)
    E$values[abs(E$values) < 1e-10] <- 0
    sqrt_Omega <- E$vectors %*% diag(sqrt(E$values)) %*% t(E$vectors)

    # Bootstrap
    AOEY_boot <- AOED_boot <- ADED_boot <- LAOE_boot <- NULL
    for (r in 1:B) {

      zeta <- stats::rnorm(size)

      R_vector <- sqrt_Omega %*% zeta

      V_AOEY_boot <- V_AOEY * R_vector
      V_AOED_boot <- V_AOED * R_vector
      V_ADED_boot <- V_ADED * R_vector
      V_LAOE_boot <- V_LAOE * R_vector

      AOEY_boot <- c(AOEY_boot, mean(V_AOEY_boot))
      AOED_boot <- c(AOED_boot, mean(V_AOED_boot))
      ADED_boot <- c(ADED_boot, mean(V_ADED_boot))
      LAOE_boot <- c(LAOE_boot, mean(V_LAOE_boot))

    }

    # Standard error
    SE_AOEY <- stats::sd(AOEY_boot)
    SE_AOED <- stats::sd(AOED_boot)
    SE_ADED <- stats::sd(ADED_boot)
    SE_LAOE <- stats::sd(LAOE_boot)

    # Confidence interval
    q_AOEY <- stats::quantile(AOEY_boot,
                              probs = c(alp / 2, 1 - alp / 2))

    q_AOED <- stats::quantile(AOED_boot,
                              probs = c(alp / 2, 1 - alp / 2))

    q_ADED <- stats::quantile(ADED_boot,
                              probs = c(alp / 2, 1 - alp / 2))

    q_LAOE <- stats::quantile(LAOE_boot,
                              probs = c(alp / 2, 1 - alp / 2))

    CI_AOEY <- c(est_AOEY + q_AOEY[1],
                 est_AOEY + q_AOEY[2])

    CI_AOED <- c(est_AOED + q_AOED[1],
                 est_AOED + q_AOED[2])

    CI_ADED <- c(est_ADED + q_ADED[1],
                 est_ADED + q_ADED[2])

    CI_LAOE <- c(est_LAOE + q_LAOE[1],
                 est_LAOE + q_LAOE[2])

    # Result
    wild_AOEY <- c(SE_AOEY, CI_AOEY)
    wild_AOED <- c(SE_AOED, CI_AOED)
    wild_ADED <- c(SE_ADED, CI_ADED)
    wild_LAOE <- c(SE_LAOE, CI_LAOE)

    return(list(AOEY = wild_AOEY,
                AOED = wild_AOED,
                ADED = wild_ADED,
                LAOE = wild_LAOE))

  }

  # HAC estimation
  HAC <- HAC_func()

  # Wild bootstrap
  if (!is.null(B)) {

    wild <- wild_func()

  } else {

    wild <- list(AOEY = rep(NA, 3),
                 AOED = rep(NA, 3),
                 ADED = rep(NA, 3),
                 LAOE = rep(NA, 3))

  }

  # Result
  Estimate <- rbind(c(est_AOEY, HAC$AOEY, wild$AOEY, bw, size),
                    c(est_AOED, HAC$AOED, wild$AOED, bw, size),
                    c(est_ADED, HAC$ADED, wild$ADED, bw, size),
                    c(est_LAOE, HAC$LAOE, wild$LAOE, bw, size)
  )

  colnames(Estimate) <- c("est",
                          "HAC_SE",
                          "HAC_CI_L",
                          "HAC_CI_U",
                          "wild_SE",
                          "wild_CI_L",
                          "wild_CI_U",
                          "bw",
                          "size")

  rownames(Estimate) <- c("AOEY", "AOED", "ADED", "LAOE")

  Estimate <- as.data.frame(Estimate)

  return(Estimate)

}