R/compute.att_gt.R
In did: Treatment Effects with Multiple Periods and Groups

Documented in compute.att_gt

#' @title Compute Group-Time Average Treatment Effects
#'
#' @description `compute.att_gt` does the main work for computing
#'  multiperiod group-time average treatment effects
#'
#' @param dp A DIDparams object
#'
#' @return a list with length equal to the number of groups times the
#'  number of time periods; each element of the list contains an
#'  object that contains group-time average treatment effect as well
#'  as which group it is for and which time period it is for. It also exports
#'  the influence function which is used externally to compute
#'  standard errors.
#'
#' @keywords internal
#'
#' @export
compute.att_gt <- function(dp) {
  #-----------------------------------------------------------------------------
  # unpack DIDparams
  #-----------------------------------------------------------------------------
  data <- data.table::as.data.table(dp$data)
  yname <- dp$yname
  tname <- dp$tname
  idname <- dp$idname
  xformla <- dp$xformla
  weightsname <- dp$weightsname
  est_method <- dp$est_method
  base_period <- dp$base_period
  panel <- dp$panel
  true_repeated_cross_sections <- dp$true_repeated_cross_sections
  print_details <- dp$print_details
  control_group <- dp$control_group
  anticipation <- dp$anticipation
  gname <- dp$gname
  n <- dp$n
  nT <- dp$nT
  nG <- dp$nG
  tlist <- dp$tlist
  glist <- dp$glist

  #-----------------------------------------------------------------------------
  # main computations
  #-----------------------------------------------------------------------------

  # will populate with all att(g,t)
  attgt.list <- list()

  # place holder in lists
  counter <- 1

  # number of time periods
  tlist.length <- ifelse(base_period != "universal", length(tlist) - 1, length(tlist))
  tfac <- ifelse(base_period != "universal", 1, 0)

  # influence function
  inffunc <- Matrix::Matrix(data = 0, nrow = n, ncol = nG * (nT - tfac), sparse = TRUE)

  # list of collect sparse matrix updates
  inffunc_updates <- list()
  # counter for keeping track of updates
  update_counter <- 1

  # never treated option
  nevertreated <- (control_group[1] == "nevertreated")

  if (nevertreated) {
    data[, .C := as.integer(get(gname) == 0)]
  }
  data[, .y := get(yname), .SDcols = yname]

  # loop over groups
  for (g in 1:nG) {
    # Set up .G once
    data[, .G := as.numeric(get(gname) == glist[[..g]]), .SDcols = gname]

    # loop over time periods
    for (t in 1:tlist.length) {
      #-----------------------------------------------------------------------------
      # Set pret

      # varying base period
      pret <- t

      # universal base period
      if (base_period == "universal") {
        # use same base period as for post-treatment periods
        pret <- tail(which((tlist + anticipation) < glist[g]), 1)
      }

      # use "not yet treated as control"
      # that is, never treated + units that are eventually treated,
      # but not treated by the current period (+ anticipation)
      if (!nevertreated) {
        # Capture loop variable t before data.table evaluation to avoid scoping issues
        # (data.table expressions access columns by name, so 't' would refer to tname column)
        time_threshold <- tlist[max(t, pret) + tfac] + anticipation
        data[, .C := as.integer((get(gname) == 0) |
          ((get(gname) > time_threshold) &
            (get(gname) != glist[[..g]])))]
      }


      # check if in post-treatment period
      if ((glist[g] <= tlist[(t + tfac)])) {
        # update pre-period if in post-treatment period to
        # be  period (g-delta-1)
        pret <- tail(which((tlist + anticipation) < glist[g]), 1)

        # print a warning message if there are no pre-treatment period
        if (length(pret) == 0) {
          warning(paste0("There are no pre-treatment periods for the group first treated at ", glist[g], "\nUnits from this group are dropped"))

          # if there are not pre-treatment periods, code will
          # jump out of this loop
          break
        }
      }


      #-----------------------------------------------------------------------------
      # if we are in period (g-1), normalize results to be equal to 0
      # and break without computing anything
      if (base_period == "universal") {
        if (tlist[pret] == tlist[(t + tfac)]) {
          attgt.list[[counter]] <- list(att = 0, group = glist[g], year = tlist[(t + tfac)], post = 0)
          # inffunc[,counter] <- rep(0,n)
          # counter <- counter+1
          inffunc_updates[[update_counter]] <- list(
            indices = rep(TRUE, n), # Apply to all units
            values = as.matrix(rep(0, n)) # Zero influence function
          )

          # Update the counters
          update_counter <- update_counter + 1
          counter <- counter + 1
          next
        }
      }

      # print the details of which iteration we are on
      if (print_details) {
        cat(paste("current period:", tlist[(t + tfac)]), "\n")
        cat(paste("current group:", glist[g]), "\n")
        cat(paste("set pretreatment period to be", tlist[pret]), "\n")
      }

      #-----------------------------------------------------------------------------
      # results for the case with panel data
      #-----------------------------------------------------------------------------

      # post treatment dummy variable
      post.treat <- 1 * (glist[g] <= tlist[t + tfac])

      # total number of units (not just included in G or C)
      # disdat <- data[data[,tname] == tlist[t+tfac] | data[,tname] == tlist[pret],]
      target_times <- c(tlist[t + tfac], tlist[pret])
      disdat <- data[get(tname) %in% target_times]


      if (panel) {
        # transform  disdat it into "cross-sectional" data where one of the columns
        # contains the change in the outcome over time.
        disdat <- get_wide_data(disdat, yname, idname, tname)

        # still total number of units (not just included in G or C)
        n <- nrow(disdat)

        # pick up the indices for units that will be used to compute ATT(g,t)
        disidx <- disdat$.G == 1 | disdat$.C == 1

        # pick up the data that will be used to compute ATT(g,t)
        disdat <- disdat[disidx]

        n1 <- nrow(disdat) # num obs. for computing ATT(g,t)

        # drop missing factors
        disdat <- droplevels(disdat)

        # give short names for data in this iteration
        G <- disdat$.G
        C <- disdat$.C

        # handle pre-treatment universal base period differently
        # we need to do this because panel2cs2 just puts later period
        # in .y1, but if we are in a pre-treatment period with a universal
        # base period, then the "base period" is actually the later period
        Ypre <- if (tlist[(t + tfac)] > tlist[pret]) disdat$.y0 else disdat$.y1
        Ypost <- if (tlist[(t + tfac)] > tlist[pret]) disdat$.y1 else disdat$.y0
        w <- disdat$.w

        # matrix of covariates
        covariates <- model.matrix(xformla, data = disdat)

        #-----------------------------------------------------------------------------
        # more checks for enough observations in each group

        # if using custom estimation method, skip this part
        custom_est_method <- class(est_method) == "function"

        if (!custom_est_method) {
          pscore_problems_likely <- FALSE
          reg_problems_likely <- FALSE

          # checks for pscore based methods
          if (est_method %in% c("dr", "ipw")) {
            # preliminary_logit <- glm(G ~ -1 + covariates, family=binomial(link=logit))
            preliminary_logit <- fastglm::fastglm(G ~ -1 + covariates, family = binomial())
            preliminary_pscores <- predict(preliminary_logit, type = "response")
            if (max(preliminary_pscores) >= 0.999) {
              pscore_problems_likely <- TRUE
              warning(paste0("overlap condition violated for ", glist[g], " in time period ", tlist[t + tfac]))
            }
          }

          # check if can run regression using control units
          if (est_method %in% c("dr", "reg")) {
            control_covs <- covariates[G == 0, , drop = FALSE]
            # if (determinant(t(control_covs)%*%control_covs, logarithm=FALSE)$modulus < .Machine$double.eps) {
            if (rcond(t(control_covs) %*% control_covs) < .Machine$double.eps) {
              reg_problems_likely <- TRUE
              warning(paste0("Not enough control units for group ", glist[g], " in time period ", tlist[t + tfac], " to run specified regression"))
            }
          }

          if (reg_problems_likely | pscore_problems_likely) {
            attgt.list[[counter]] <- list(att = NA, group = glist[g], year = tlist[(t + tfac)], post = post.treat)
            # inffunc[,counter] <- NA
            # counter <- counter+1
            inffunc_updates[[update_counter]] <- list(
              indices = rep(TRUE, n), # Apply to all units
              values = as.matrix(rep(NA, n)) # NA influence function
            )

            # Update the counters
            update_counter <- update_counter + 1
            counter <- counter + 1
            next
          }
        }

        #-----------------------------------------------------------------------------
        # code for actually computing att(g,t)
        #-----------------------------------------------------------------------------

        if (inherits(est_method, "function")) {
          # user-specified function
          attgt <- est_method(
            y1 = Ypost, y0 = Ypre,
            D = G,
            covariates = covariates,
            i.weights = w,
            inffunc = TRUE
          )
        } else if (est_method == "ipw") {
          # inverse-probability weights
          attgt <- DRDID::std_ipw_did_panel(Ypost, Ypre, G,
            covariates = covariates,
            i.weights = w,
            boot = FALSE, inffunc = TRUE
          )
        } else if (est_method == "reg") {
          # regression
          attgt <- DRDID::reg_did_panel(Ypost, Ypre, G,
            covariates = covariates,
            i.weights = w,
            boot = FALSE, inffunc = TRUE
          )
        } else {
          # doubly robust, this is default
          attgt <- DRDID::drdid_panel(Ypost, Ypre, G,
            covariates = covariates,
            i.weights = w,
            boot = FALSE, inffunc = TRUE
          )
        }

        # adjust influence function to account for only using
        # subgroup to estimate att(g,t)
        attgt$att.inf.func <- (n / n1) * attgt$att.inf.func
      } else { # repeated cross sections / unbalanced panel

        # pick up the indices for units that will be used to compute ATT(g,t)
        # these conditions are (1) you are observed in the right period and
        # (2) you are in the right group (it is possible to be observed in
        # the right period but still not be part of the treated or control
        # group in that period here
        rightids <- disdat$.rowid[disdat$.G == 1 | disdat$.C == 1]

        # this is the fix for unbalanced panels; 2nd criteria shouldn't do anything
        # with true repeated cross sections, but should pick up the right time periods
        # only with unbalanced panel
        disidx <- (data$.rowid %in% rightids) & ((data[[tname]] == tlist[t + tfac]) | (data[[tname]] == tlist[pret]))

        # pick up the data that will be used to compute ATT(g,t)
        disdat <- data[disidx, ]

        # drop missing factors
        disdat <- droplevels(disdat)

        # give short names for data in this iteration
        G <- disdat$.G
        C <- disdat$.C
        Y <- disdat[[yname]]
        post <- 1 * (disdat[[tname]] == tlist[t + tfac])
        # num obs. for computing ATT(g,t), have to be careful here
        n1 <- sum(G + C)
        w <- disdat$.w

        #-----------------------------------------------------------------------------
        # checks to make sure that we have enough observations
        skip_this_att_gt <- FALSE
        if (sum(G * post) == 0) {
          warning(paste0("No units in group ", glist[g], " in time period ", tlist[t + tfac]))
          skip_this_att_gt <- TRUE
        }
        if (sum(G * (1 - post)) == 0) {
          warning(paste0("No units in group ", glist[g], " in time period ", tlist[t]))
          skip_this_att_gt <- TRUE
        }
        if (sum(C * post) == 0) {
          warning(paste0("No available control units for group ", glist[g], " in time period ", tlist[t + tfac]))
          skip_this_att_gt <- TRUE
        }
        if (sum(C * (1 - post)) == 0) {
          warning(paste0("No availabe control units for group ", glist[g], " in time period ", tlist[t]))
          skip_this_att_gt <- TRUE
        }

        if (skip_this_att_gt) {
          attgt.list[[counter]] <- list(att = NA, group = glist[g], year = tlist[(t + tfac)], post = post.treat)
          # inffunc[,counter] <- NA
          # counter <- counter+1
          inffunc_updates[[update_counter]] <- list(
            indices = rep(TRUE, n), # Apply to all units
            values = as.matrix(rep(NA, n)) # NA influence function
          )

          # Update the counters
          update_counter <- update_counter + 1
          counter <- counter + 1
          next
        }


        # matrix of covariates
        covariates <- model.matrix(xformla, data = disdat)

        #-----------------------------------------------------------------------------
        # code for actually computing att(g,t)
        #-----------------------------------------------------------------------------

        if (inherits(est_method, "function")) {
          # user-specified function
          attgt <- est_method(
            y = Y,
            post = post,
            D = G,
            covariates = covariates,
            i.weights = w,
            inffunc = TRUE
          )
        } else if (est_method == "ipw") {
          # inverse-probability weights
          attgt <- DRDID::std_ipw_did_rc(
            y = Y,
            post = post,
            D = G,
            covariates = covariates,
            i.weights = w,
            boot = FALSE, inffunc = TRUE
          )
        } else if (est_method == "reg") {
          # regression
          attgt <- DRDID::reg_did_rc(
            y = Y,
            post = post,
            D = G,
            covariates = covariates,
            i.weights = w,
            boot = FALSE, inffunc = TRUE
          )
        } else {
          # doubly robust, this is default
          attgt <- DRDID::drdid_rc(
            y = Y,
            post = post,
            D = G,
            covariates = covariates,
            i.weights = w,
            boot = FALSE, inffunc = TRUE
          )
        }

        # n/n1 adjusts for estimating the
        # att_gt only using observations from groups
        # G and C
        attgt$att.inf.func <- (n / n1) * attgt$att.inf.func

        # If ATT is NaN, replace it with NA, and make Influence functions equal to zero
        if (is.nan(attgt$ATT)) {
          attgt$ATT <- NA
          attgt$att.inf.func <- 0 * attgt$att.inf.func
        }
      } # end panel if

      # save results for this att(g,t)
      attgt.list[[counter]] <- list(
        att = attgt$ATT, group = glist[g], year = tlist[(t + tfac)], post = post.treat
      )


      # populate the influence function in the right places
      if (panel) {
        # inf.func[disidx] <- attgt$att.inf.func
        # Collect the indices and corresponding values for the update
        inffunc_updates[[update_counter]] <- list(
          indices = disidx,
          values = attgt$att.inf.func
        )
      } else {
        # aggregate inf functions by id (order by id)
        aggte_inffunc <- suppressWarnings(stats::aggregate(attgt$att.inf.func, list(rightids), sum))
        disidx <- (unique(data$.rowid) %in% aggte_inffunc[, 1])
        # inf.func[disidx] <- aggte_inffunc[,2]
        inffunc_updates[[update_counter]] <- list(
          indices = disidx,
          values = aggte_inffunc[, 2]
        )
      }


      # save it in influence function matrix
      # inffunc[g,t,] <- inf.func
      # inffunc[,counter] <- inf.func
      update_counter <- update_counter + 1

      # update counter
      counter <- counter + 1
    } # end looping over t
  } # end looping over g

  # Apply the updates to the influence function matrix
  update_inffunc <- rbindlist(lapply(seq_along(inffunc_updates), function(i) {
    update <- inffunc_updates[[i]]
    data.table(row = which(update$indices), col = i, value = update$values)
  }), fill = TRUE)
  # Apply updates to the sparse matrix
  inffunc[cbind(update_inffunc$row, update_inffunc$col)] <- update_inffunc$value


  return(list(attgt.list = attgt.list, inffunc = inffunc))
}