PSLB: Propensity Score Analysis with Local Balance

Documented in T4LB

#' Tuning for Local Balance (T4LB) Algorithm
#' 
#' The T4LB algorithm optimze the balancing property of the estimated propensity score by minimizing a local balance
#' criteria, while still controlling for the global balance. In addition to the the absolute standardized difference (S/D) 
#' of the covariates in the whole population, the T4LB selects the model with the minimum "mean" or "max" of the S/D in the 
#' pre-specified local neighborhoods. Hence, the selected model would have the best balancing property among the candidate models,
#' and the estimated scores retain the interpretation as an appropriate propensity score, and the weights have the intepretation 
#' as an inverse probability weights (IPW). T4LB can be used in propensity score estimation approaches that requires parameter 
#' tuning, such as using machine learning approaches to estimate propensity scores. 
#' 
#' @param ps A matrix contains the estimated propensity scores from candidate models. Each column of the "ps" matrix is the 
#'           propensity scores estimated from each candidate model corresponding to each set of parameters. 
#' @param weight A matrix contains the weight calculated from the corresponding input propensity score. Each column of 
#'               the "weight" matrix should correspond to each column of the "ps" matrix. If weight = NULL, the IPW of ATE
#'               will be calculated using the input propensity scores.
#' @param X The covariate matrixes used for balance evaluation. It can be one or a list of covariate matrixes. If one covariate 
#'          matrix is provided, this matrix will be used for balance evaluation for all candidate models. If a list of 
#'          covariate matrixes are provided, each element of the list will be used for balance evaluation for each candidate models.
#'          The length of the "X" list must equal to the number of column of the "ps"/"weight" matrix. The rows of "X" matrix 
#'          correspond to the subjects/participants, and the columns of the "X" matrix correspond to the covariates.
#' @param Z The binary treatment indicator vector. A vector with 2 unique numeric values in 0 = untreated and 1 = treated.
#' @param ej The matrix of the local neighborhoods, which contains two columns of positive values greater or equal to 0 and less or equal to 1.
#'           The rows of ej represent the neighborhoods. The first column is the start point of the local neighborhoods.
#'           The second column is the end point of the local neighborhoods.
#' @param para The matrix whose each row contains the set of parameters for each candidate model.
#' @param criteria Choose "mean" to use the mean of the absolute S/D among all covariates and neighborhoods as the criteria of local balance;
#'                 choose "max" to use the max of the absolute S/D among all covariates and neighborhoods as the criteria of local balance.
#'                 Default is "mean".
#' 
#' @return A list containing the following components:
#'         \itemize{
#'           \item{"weight"}: The optimal weight from the selected model with the best local balance.
#'           \item{"propensity.score"}: The optimal propensity score from the selected model with the best local balance.
#'           \item{"parameter"}: The set of parameters of the selected model with the best local balance.
#'           \item{"balance"}: The matrix containing the global and local balance of the selected propensity score model.
#'                             The first row contains the absolute S/D of each covariates in the whole study population, 
#'                             which represents the global covariate balance. The rest rows contain the absolute
#'                             S/D of the covariates in the PS-stratified sub-population corresponding to the loal neighborhoods of "ej", 
#'                             which represent the local covariate balance. 
#'         }
#' 
#' @references Lee, B. K., Lessler, J. and Stuart, E. A. (2009) Improving propensity score weighting using
#'             machine learning. Statistics in Medicine, 29, 337-346.

#' @examples
#' KS = Kang_Schafer_Simulation(n = 1500, seeds = 371834)
#' # Misspecified propensity score model
#' X = KS$Data[,7:10]
#' Z = KS$Data[,2]
#' # Local neighborhoods
#' ej = cbind(seq(0,0.8,0.2), seq(0.2,1,0.2))
#' print(ej)
#' # Transform X with Gaussian kernel using 10 sigma
#' x.k = Gaussian_Kernel_feature(X = X, n_sigma = 10)
#' sigma = rep(x.k$sigma_seq, each = 3)
#' r = as.vector(t(x.k$r))
#' para = cbind("sigma" = sigma, "l" = r)
#' # Fitting the kernelized logistic regression model
#' m = matrix(seq(1,30), nrow = 10, ncol = 3, byrow = T)
#' k.ps = matrix(nrow = nrow(X), ncol = nrow(para))
#' k.w = k.ps
#' for (i in 1:10) {
#'   for (j in 1:3) {
#'     log.fit = glm(Z~x.k$features[[i]][[j]], family = "binomial")
#'     A = weight.calculate.ps(Z, log.fit$fitted.values, standardize = T)
#'     k.ps[,m[i,j]] = A$ps
#'     k.w[,m[i,j]] = A$weight
#'   }
#' }
#' rm(A,log.fit,i,j,m)
#' best.glm = T4LB(ps = k.ps, weight = k.w, X = X, Z = Z, ej = ej, para = para)
#' str(t4lb.find)
#' best.glm$balance
#' 
#' @export
T4LB = function(ps, weight, X, Z, ej, para, criteria = "mean") {
  n = nrow(ps)
  p = ncol(ps)
  if (p != nrow(para)) {
    stop("Please make sure there is one set of propensity score for each set of parameters")
  }
  if (is.list(X) && length(X)!= p) {
    stop("The number of the covariate matrix for balance checking should be one or as many as the number of sets of parameters.")
  }
  # IPW for ATE will be calculated if weight is not provided. Weight will be standardized.
  if (is.null(weight)) {
    weight = matrix(nrow = n, ncol = p)
    for (i in 1:p) {
      weight[,i] = weight.calculate.ps(Z = Z, ps = ps[,i], standardize = TRUE)
    }
  } else if (!is.null(weight)) {
    weight = weight
  }
  
  std = list()
  std.summary = matrix(nrow = p, ncol = 5)
  for (i in 1:p) {
    if (is.matrix(X)) {
      A = abs(Fitted.strata.diff(X = X, Z = Z, ps = ps[,i], weight = weight[,i], ej))
    } else if (is.list(X)) {
      A = abs(Fitted.strata.diff(X = X[[i]], Z = Z, ps = ps[,i], weight = weight[,i], ej))
    }
    std[[i]] = A
    std.summary[i,] = Standardized.Diff.Stat(std = A, ej = ej, p1 = 15, p2 = 5)
  }
  best.I = CBPS.BestIndex(std.list = std.summary, para)
  if (is.na(best.I$best.index[1]) | is.na(best.I$best.index[2])) {
    print("Uncontrolledly large global covariate balance")
    out = NULL
  } else {
    if (criteria == "mean") {
      I = best.I$best.index[1]
    } else if (criteria == "max") {
      I = best.I$best.index[2]
    }
    out.ps = ps[,I]
    out.w = weight[,I]
    out.para = para[I,]
    names(out.para) = colnames(para)
    out = list(weight = out.w, propensity.score = out.ps, parameter = out.para,
               balance = std[[I]])
  }
  return(out)
}