MultiObjMatch: Multi-objective Matching Algorithm

Documented in build.dist.struct data_precheck obj.to.match

#' Internal helper function that deals with missing values and redundant columns in the input dataframe.
#'
#' @param X a dataframe that the user initially inputs for matching. 
#'
#' @return a dataframe with modified data if necessary
data_precheck <- function(X){
  if (is.vector(X)) X <- matrix(X, length(X), 1)
  if(!(length(z) == (dim(X)[1]))){
    stop("Length of z does not match row count in X")
  }
  if(!(length(exact) == length(z))){
    stop("Length of exact does not match length of z")
  }
  if(!(all((z == 1) | (z == 0)))){
    stop("The z argument must contain only 1s and 0s")
  }
  
  if(is.data.frame(X) || is.character(X)){
    if(!is.data.frame(X)) X <- as.data.frame(X)
    X.chars <- which(laply(X, function(y) 'character' %in% class(y)))
    if(length(X.chars) > 0){
      if (verbose) print('Character variables found in X, converting to factors.')
      for(i in X.chars){
        X[,i] <- factor(X[,i])
        
      }
    }
    #if some variables are factors convert to dummies
    X.factors <-  which(laply(X, function(y) 'factor' %in% class(y)))
    
    #handle missing data
    for(i in which(laply(X, function(x) any(is.na(x))))){
      if (verbose) print(paste('Missing values found in column', i ,'of X; imputing and adding missingness indicators'))
      if(i %in% X.factors){
        #for factors, make NA a new factor level
        X[,i] <- addNA(X[,i])
      }else{
        #for numeric/logical, impute means and add a new indicator for missingness
        X[[paste(colnames(X)[i],'NA', sep = '')]] <- is.na(X[,i])
        X[which(is.na(X[,i])),i] <- mean(X[,i], na.rm = TRUE)
      }
    }
    for(i in rev(X.factors)){
      dummyXi <- model.matrix(as.formula(
        paste('~',colnames(X)[i], '-1')),data=X)
      X <- cbind(X[,-i], dummyXi)
    }
    
  }else{
    #handle missing data
    for(i in c(1:ncol(X))){
      if(any(is.na(X[,i]))){
        X <- cbind(X,is.na(X[,i]))
        colnames(X)[ncol(X)] <- paste(colnames(X)[i],'NA', sep = '')
        X[which(is.na(X[,i])),i] <- mean(X[,i], na.rm = TRUE)    
      }
    }
    
  }
  #get rid of columns that do not vary
  varying <- apply(X,2, function(x) length(unique(x)) > 1)
  if(!all(varying) && verbose) print('Constant-value columns found in X, they will not be used to calculate Mahalanobis distance.')
  X <- X[,which(varying),drop = FALSE]
  
  return(X)
}


#' An internal helper function that generates the data abstraction for the edge weights of the main
#' network structure. 
#'
#' @param z a vector of treatment and control indicators, 1 for treatment and 0 for control.
#' @param X a data frame or a numeric or logical matrix containing covariate information for treated and control units. Its row count must be equal to the length of z. 
#' @param distMat a matrix of pair-wise distance specified by the user  
#' @param exact an optional vector of the same length as z. If this argument is specified, treated
#' units will only be allowed to match to control units that have equal values in
#' the corresponding indices of the exact vector. For example, to match patients
#' within hospitals only, one could set exact equal to a vector of hospital IDs for
#' each patient.
#' @param dist.type one of ('propensity','user','none'). If ’propensity’ is specified (the default option), the function estimates a propensity score via logistic regression of
#' z on X and imposes a propensity score caliper. If ’user’ is specified, the user
#' must provide a vector of values on which a caliper will be enforced using the
#' calip.cov argument. If ’none’ is specified no caliper is used.
#' @param calip.option a character indicating the type of caliper used 
#' @param calip.cov see calip.option.
#' @param caliper a numeric value that gives the size of the caliper when the user specifies the calip.option argument
#' as ’propensity’ or ’calip.cov’.
#' @param verbose a boolean value whether to print(cat) debug information. Default: FALSE
#'
#' @return a distance structure used for constructing the main network flow problem
build.dist.struct<-
  function(z, X, distMat, exact = NULL, dist.type = "Mahalanobis", calip.option = 'propensity', calip.cov = NULL, caliper = 0.2, verbose = FALSE){
    cal.penalty <- 100


    if(is.null(exact)) exact = rep(1, length(z))
    if(!(calip.option %in% c('propensity','user','none'))){
      stop('Invalid calip.option specified.')
    }
    if (is.vector(X)) X <- matrix(X, length(X), 1)
    if(!(length(z) == (dim(X)[1]))){
      stop("Length of z does not match row count in X")
    }
    if(!(length(exact) == length(z))){
      stop("Length of exact does not match length of z")
    }
    if(!(all((z == 1) | (z == 0)))){
      stop("The z argument must contain only 1s and 0s")
    }

    if(is.data.frame(X) || is.character(X)){
      if(!is.data.frame(X)) X <- as.data.frame(X)
      X.chars <- which(laply(X, function(y) 'character' %in% class(y)))
      if(length(X.chars) > 0){
        if (verbose) print('Character variables found in X, converting to factors.')
        for(i in X.chars){
          X[,i] <- factor(X[,i])

        }
      }
      #if some variables are factors convert to dummies
      X.factors <-  which(laply(X, function(y) 'factor' %in% class(y)))

      #handle missing data
      for(i in which(laply(X, function(x) any(is.na(x))))){
        if (verbose) print(paste('Missing values found in column', i ,'of X; imputing and adding missingness indicators'))
        if(i %in% X.factors){
          #for factors, make NA a new factor level
          X[,i] <- addNA(X[,i])
        }else{
          #for numeric/logical, impute means and add a new indicator for missingness
          X[[paste(colnames(X)[i],'NA', sep = '')]] <- is.na(X[,i])
          X[which(is.na(X[,i])),i] <- mean(X[,i], na.rm = TRUE)
        }
      }
      for(i in rev(X.factors)){
        dummyXi <- model.matrix(as.formula(
          paste('~',colnames(X)[i], '-1')),data=X)
        X <- cbind(X[,-i], dummyXi)
      }

    }else{
      #handle missing data
      for(i in c(1:ncol(X))){
        if(any(is.na(X[,i]))){
          X <- cbind(X,is.na(X[,i]))
          colnames(X)[ncol(X)] <- paste(colnames(X)[i],'NA', sep = '')
          X[which(is.na(X[,i])),i] <- mean(X[,i], na.rm = TRUE)
        }
      }

    }


    #get rid of columns that do not vary if the distance measure is Mahalanobis distance
    if(dist.type == "Mahalanobis"){
      varying <- apply(X,2, function(x) length(unique(x)) > 1)
      if(!all(varying) && verbose) print('Constant-value columns found in X, they will not be used to calculate Mahalanobis distance.')
      X <- X[,which(varying),drop = FALSE]
    }

    if (calip.option == 'propensity') {
      calip.cov <- glm.fit(cbind(rep(1, nrow(X)),X), z, family = binomial())$linear.predictors
      cal <- sd(calip.cov) * caliper
    }else if(calip.option == 'user'){
      stopifnot(!is.null(calip.cov))

      cal <- sd(calip.cov) * caliper

    }

    nobs <- length(z)
    rX <- as.matrix(X)
    if(dist.type=="Mahalanobis"){
      for (j in 1:(dim(rX)[2])) rX[, j] <- rank(rX[, j])
      cv <- cov(rX)
      vuntied <- var(1:nobs)
      rat <- sqrt(vuntied/diag(cv))
      if(length(rat) == 1){
        cv <- as.matrix(rat) %*% cv %*% as.matrix(rat)
      }else{
        cv <- diag(rat) %*% cv %*% diag(rat)
      }
      icov <- ginv(cv)
    }
    nums <- 1:nobs
    ctrl.nums <- 1:(sum(z == 0))
    treated <- nums[z == 1]

    #find distance between each treated and each control it will be connected to and store in a distance structure
    dist.struct <- list()
    if(dist.type == "Euclidean"){
      distMat = as.matrix(dist(rX))
    }
    for (i in c(1:length(treated))) {
      controls <- nums[(z == 0) & (exact == exact[treated[i]])]
      control.names <- ctrl.nums[exact[z == 0] == exact[treated[i]]]
      if(dist.type == "Mahalanobis"){
        costi <- mahalanobis(rX[controls, ,drop=FALSE], rX[treated[i], ], icov, inverted = T)
      } else if(dist.type == "Euclidean"){
        costi <- as.vector(distMat[treated[i], controls])
      } else{
        costi <- as.vector(distMat[treated[i], controls])
      }

      if (calip.option != 'none') {
        calip.update <- rep(0, length(costi))
        calip.update[abs(calip.cov[treated[i]] - calip.cov[controls]) - cal > 0] <- Inf
        costi <- costi + calip.update
      }
      names(costi) <- control.names

      dist.struct[[i]] <- costi[is.finite(costi)]
    }

    if (sum(laply(dist.struct, length)) == 0) stop('All matches forbidden. Considering using a wider caliper?')
    return(dist.struct)
  }


#' An internal helper function that transforms the output from the RELAX algorithm 
#' to a data structure that is more interpretable for the output of the main matching function 
#'
#' @param out.elem an object from the output of RELAX algorithm
#' @param already.done a factor indicating the index of matches already been transformed
#' @param prev.obj an object of previously transformed matches 
#'
#' @return a named list with elements containing matching information useful for the main matching function
obj.to.match <- function(out.elem, already.done = NULL, prev.obj = NULL){
  tcarcs <- length(unlist(out.elem$net$edgeStructure))
  edge.info <- extractEdges(out.elem$net)
  one.sol <- function(sol){
    x <- sol[1:tcarcs]
    match.df <- data.frame(treat = as.factor(edge.info$startn[1:tcarcs]), x = x, control = edge.info$endn[1:tcarcs])
    matched.or.not <- daply(match.df, .(match.df$treat), function(treat.edges) c(as.numeric(as.character(treat.edges$treat[1])),
                                                                                 sum(treat.edges$x)), .drop_o = FALSE)
    if (any(matched.or.not[, 2] == 0)) {
      match.df <- match.df[-which(match.df$treat %in% matched.or.not[which(matched.or.not[,
                                                                                          2] == 0), 1]), ]
    }
    match.df$treat <- as.factor(as.character(match.df$treat))
    matches <- as.matrix(daply(match.df, .(match.df$treat), function(treat.edges) treat.edges$control[treat.edges$x ==
                                                                                                        1], .drop_o = FALSE))
    matches - length(out.elem$net$treatedNodes)
  }
  if(is.null(already.done)) return(llply(out.elem$solutions, one.sol))
  new.ones <- setdiff(1:length(out.elem$solutions), already.done)
  out.list <- list()
  out.list[already.done] <- prev.obj
  out.list[new.ones] <- llply(out.elem$solutions[new.ones], one.sol)
  return(out.list)
}


#' An internal helper function that generates the data abstraction for the edge weights of the main
#' network structure using the distance matrix passed by the user. 
#' @param z a vector indicating whether each unit is in treatment or control group
#' @param distMat a matrix of pair-wise distance
#' @param verbose a boolean value whether to print(cat) debug information. Default: FALSE
#'
#' @return a distance structure used for constructing the main network flow problem
build.dist.struct_user<-
  function(z, distMat, verbose = FALSE){
    if(!(all((z == 1) | (z == 0)))){
      stop("The z argument must contain only 1s and 0s")
    }

    nobs <- length(z)

    nums <- 1:nobs
    ctrl.nums <- 1:(sum(z == 0))
    treated <- nums[z == 1]

    #find distance between each treated and each control it will be connected to and store in a distance structure
    dist.struct <- list()
    for (i in c(1:length(treated))) {
      controls <- nums[(z == 0)]
      control.names <- ctrl.nums
      costi <- as.vector(distMat[treated[i], controls])
      names(costi) <- control.names
      dist.struct[[i]] <- costi[is.finite(costi)]
    }

    if (sum(laply(dist.struct, length)) == 0) stop('All matches forbidden. Considering using a wider caliper?')
    return(dist.struct)
  }


#' An internal helper function that combines two distance object
#'
#' @param a a distance structure object 
#' @param b a distance structure object
#'
#' @return a new distance structure object whose edge weights are the sum of the corresponding edge weigths in a and b
combine_dist <- function(a, b){
  if(length(a) != length(b)) stop("Make sure both distance structure have the same amount of treated units.")
  res = list()
  n = length(a)
  for(i in 1:n){
    ai = a[[i]]
    bi = b[[i]]
    intersectedIndex = intersect(names(ai), names(bi))
    res[[i]] = ai[intersectedIndex] + bi[intersectedIndex]
    if((length(intersectedIndex) != length(ai)) || (length(intersectedIndex) != length(bi))){
      stop("The two distance matrices and parameters do not provide the same set of edges in the network. Check your input.")
    }
  }
  return(res)
}