funcly: Useful Functions in R for All-Purpose Analysis

#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#    https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# author: Reza Hosseini

## an implementation of matching using Matchit and fitting via zelig
# TODO: Reza Hosseini: implement the exact match faster and without MatchIt / Zelig

library("MatchIt")
library("Zelig")
library("dplyr")
library("data.table")

# it filters out continuous columns
# which become degenrate if we fit a glm model
# cols can be continuous or binary (0 / 1)
FilterContiCols_ifDegenrateGlm = function(
    df, cols, valueCol, family="gaussian") {

   formulaStr = paste0(
      valueCol,
      "~",
      paste(cols, collapse="+"))

  formula = as.formula(formulaStr)
  predRegMod = glm(formula, data=df, family=family)
  predRegMod_summ = summary(predRegMod)

  x = predRegMod[["coefficients"]]
  cols2 = names(x)[is.na(x)]
  cols = setdiff(cols, cols2)

  return(cols)
}

TestFilterContiCols_ifDegenrateGlm = function() {

  n = 10
  x1 = rnorm(n)
  x2 = rnorm(n)
  x3 = x1 + x2
  y = 2*x1 + x2 + x3 + rnorm(n)
  df = data.frame("x1"=x1, "x2"=x2, "x3"=x3, "y"=y)

  FilterContiCols_ifDegenrateGlm(
      df=df,
      cols=c("x1", "x2", "x3"),
      valueCol="y",
      family="gaussian")

  n = 10
  x1 = rnorm(n)
  x2 = rnorm(n)
  x3 = x1 + x2
  y = 2*x1 + x2 + x3 + rnorm(n) > 0
  df = data.frame("x1"=x1, "x2"=x2, "x3"=x3, "y"=y)

  FilterContiCols_ifDegenrateGlm(
      df=df,
      cols=c("x1", "x2", "x3"),
      valueCol="y",
      family="binomial")
}

# this function is for causal studies
FilterContCols_forCausal = function(
    binDf,
    predCols_bin,
    valueCol) {

  # first take out columns with only one value
  predCols_bin2 = predCols_bin
  omitCols = NULL
  for (col in predCols_bin2) {
    tab = table(binDf[ , col])
    if (length(tab) < 2) {
      warning(paste(
          "The column:",
          col,
          "had only one value. Therefore was eliminated."))
      omitCols = c(omitCols, col)
    }
  }

  if (length(omitCols) > 0) {
    predCols_bin2 = setdiff(predCols_bin2, omitCols)
  }

  predCols_bin = predCols_bin2

  # some cont columns could be degenerate causing issues
  # we take them out in first stage by fitting a model
  # for treat as response
  # this will eliminate cont columns which are already collinaer
  # even with treat being a predictor as in the final model
  predCols_bin2 = FilterContiCols_ifDegenrateGlm(
      df=binDf,
      cols=predCols_bin,
      valueCol="treat",
      family="binomial")

  if (length(predCols_bin2) < length(predCols_bin)) {
    warnings(paste(
        "Some columns in matching were degenerate. ",
        "We have omitted these columns: ",
        paste(setdiff(predCols_bin, predCols_bin2), collapse="--")))
  }

  predCols_bin = predCols_bin2

  # some control columns could be degenerate causing issues
  # these will have coefficient of NA in models
  # some models can handle that but Zelig cannot as of now
  # we omit those
  # we include treat as well and take it out afterwards
  predCols_bin2 = FilterContiCols_ifDegenrateGlm(
      df=binDf,
      cols=c("treat", predCols_bin),
      valueCol=valueCol,
      family="gaussian")

  if (!("treat" %in% predCols_bin2)) {
    warning(paste(
       "treatment col is too collinear with the pred cols you provided.",
       "the alg tried to fix that by removing some columns from design matrix.",
       "however it could not prevent degenerate models.",
       "this might be caused by categorical vars with two many labels.",
       collapse="\n"))
  }

  predCols_bin2 = setdiff(predCols_bin2, "treat")

   if (length(predCols_bin2) < length(predCols_bin)) {
    warnings(paste(
        "Some columns in matching were degenerate. ",
        "We have omitted these columns: ",
        paste(setdiff(predCols_bin2, predCols_bin), collapse="--")))
  }

 return(predCols_bin2)
}


# estimate raw effect.
# the valueCol must be 0/1 and continuous
RawEffectCi = function(df, valueCol, treatCol) {
  mod = glm(as.formula(paste0(valueCol, "~", treatCol)), data=df)
  rawEffect_summ = summary(mod)
  coef = rawEffect_summ[["coefficients"]]
  rowName = rownames(coef)[grep(treatCol, rownames(coef))]
  coefDf = data.frame(coef)
  coefDf[ , "var"] = rownames(coef)
  rawEffect = coefDf[(coefDf[ , "var"] %in% rowName), ]
  x = rawEffect
  lower = x[["Estimate"]] - 1.96 * x[["Std..Error"]]
  upper = x[["Estimate"]] + 1.96 * x[["Std..Error"]]
  raw_ci = c(lower, upper)
  return(c(lower, upper))
}

TestRawEffectCi = function() {

  n = 5000
  treat = sample(c("drug", "placebo"), n, replace=TRUE)
  u1 = 2*(treat == "drug") + 5 * rnorm(n)
  u2 = sample(c("cat", "dog", "horse"), n, replace=TRUE)
  y = 3*(treat == "drug") + u1 - 2 * (u2 == "dog") + rnorm(n)
  df = data.frame("treat"=treat, "u1"=u1, "u2"=u2, "y"=y)
  valueCol = "y"
  predCols_categ = c("u2")
  predCols_conti = c("u1")
  treatCol = "treat"

  RawEffectCi(df=df, valueCol=valueCol, treatCol=treatCol)

}

# idCol should be provided to join the matched data set later if needed
# if idCol is not provided we define an idCol: "id"
Process_andMatch = function(
    df,
    idCol=NULL,
    predCols_categ=NULL,
    predCols_conti=NULL,
    treatCol,
    treatPair=c(0, 1),
    figsPath=NULL,
    method="nearest",
    dichomNum=4
    ) {

  if (is.null(idCol)) {
    idCol = "row_num_orig"
    df[ , idCol] = 1:nrow(df)
  }

  # only keep necessary columns
  df = df[ , c(idCol, predCols_conti, predCols_categ, treatCol)]

  # only keep treatment pair
  df = df[df[ , treatCol] %in% treatPair, ]
  if (nrow(df) == 0) {
    warnings(paste(
        "The data size became 0 after picking the treatment pairs.",
        "Make sure you have the correct treatment pairs."))
  }

  newCols = NULL
  if (!is.null(predCols_conti)) {
    ## we turn the only two continuous/ordered control variables to categorical
    # to do that we dichotomize each variable using quantiles
    # the function below for each continuous variable will add
    # a new column with the column name being the same
    # with an extra suffix "_categ"

    res = Add_dichomVarMulti(
      dt=df,
      cols=predCols_conti,
      num=dichomNum)
    newCols = res[["newCols"]]
    dt = res[["dt"]]
    ## these are the new columns added
    #print(newCols)
    df = data.frame(dt)

    for (col in newCols) {
      #print(table(df[ , col]))
    }
  }

  ## defining treatment
  predCols = c(predCols_categ, newCols)
  df = df[ , c(idCol, treatCol, predCols)]

  ## define a new column called treat and assign 0, 1
  df[ , "treat"] = plyr::mapvalues(
      df[ , treatCol],
      from=treatPair,
      to=c("0", "1"))

  #df[ , "treat"] = as.numeric(df[ , "treat"])

  ## check the number of units on control and treatment
  print("data size per treatment arm for raw data")
  ssTab = table(df[ , "treat"])
  print(ssTab)

  #### Explore control variables:
  # (1) if their distribution is different
  # (2) if they "effect" the response
  # matching is usually necessary if both (1) and (2) hold

  ## Check if predictors distributions really differ across treatment groups
  if (!is.null(figsPath)) {
    for (col in predCols) {
      #print(col)
      df2 = df
      df2[ , "treat"] = as.character(df2[ , "treat"])
      res = PltCompare_categDist(df=df2, labelCol=col, compareCol="treat")
      p = res[["p"]]
      fn = paste0(figsPath, "compare_distbn_", col, ".png")
      png(file=fn)
      print(p)
      dev.off()
      #Pause()
    }
  }

  # set reference for the treatment to be always the control arm: 0
  # first removing factor ordering to avoid errors
  df[ , "treat"] = factor(df[ , "treat"], ordered=FALSE)
  df = within(df, treat <- relevel(treat, ref="0"))

  ## matching

  ## step 0: remove missing or re-assign
  ## below we calculate the percentage (out of 100%) of missing
  # for each of our predictor/control variables
  # then we assign the mode of observed data to them
  # if the missing is more than 1%
  # since missing could be informative we add a new value "not_avail"
  for (col in predCols) {
    df[ , col] = as.character(df[ , col])
  }

  for (col in predCols) {
    print(col)
    x = sum(is.na(df[ , col]))
    print(100 * x / nrow(df))

    if (x > 0 & x < 1) {
      print("less than 1 percent missing. mode was assigned.")
      df[is.na(df[ , col]) , col] = Zelig::Mode(df[ , col])
      } else if (x >= 1) {
        print("more than 1 percent missing. not_avail as assigned.")
        df[is.na(df[ , col]) , col] = "not_avail"
      }
  }

  # for treatment column we remove NAs as well
  df = df[!is.na(df[ , "treat"]), ]
  #dim(df)
  #table(df[ , "treat"])

  print("data size per treatment after removing NAs")
  ssTab_afterRemoveNA = table(df[ , "treat"])
  print(ssTab_afterRemoveNA)

  # step 1

  # building binary predictors
  formulaStr = paste0("~", paste(predCols, collapse="+"))
  formula = as.formula(formulaStr)
  binDf = model.matrix(formula, data=df)
  cols = colnames(binDf)

  # making sure the column names are not
  # causing issues for "formula" comprehension
  cols = AbbrStringVec(
      strings=cols,
      wordLength=50,
      replaceStrings=c("Inf", "\\(", "\\)", ",", "\\[", "\\]"),
      sep="_")

  cols = AbbrStringVec(
      strings=cols,
      wordLength=50,
      replaceStrings=c("-"),
      sep="minus")

  cols = AbbrStringVec(
      strings=cols,
      wordLength=50,
      replaceStrings=c("\\+"),
      sep="plus")

  cols = AbbrStringVec(
      strings=cols,
      wordLength=50,
      replaceStrings=c("\\."),
      sep="dot")

  names(cols) = NULL
  colnames(binDf) = cols

  ## removing intercept
  predCols_bin = setdiff(colnames(binDf), "Intercept")
  binDf = cbind(binDf, df[ , c("treat", idCol)])

  # zelig expects binary variables even for treatCol
  # it causes errors later down the line if we use factor
  # df[ , "treat"] = as.numeric(df[ , "treat"])

  binDf[ , "dummy_response"] = (1:nrow(binDf)) / nrow(binDf)
  print("remove collinearity before matching")
  predCols_bin = FilterContCols_forCausal(
      binDf=binDf,
      predCols_bin=predCols_bin,
      valueCol="dummy_response")
  predCols_bin_withTreat = c("treat", predCols_bin)

  binDf = SubsetCols(binDf, dropCols="dummy_response")

  ### matching phase
  matchStr = paste0("treat ~ ", paste(predCols_bin, collapse="+"))
  matchFormula = as.formula(matchStr)
  match = MatchIt::matchit(
      matchFormula,
      data=binDf,
      method=method)
      #subclass=6)

  match_summ = summary(match)
  matchDf = MatchIt::match.data(match)

  dataPerc_afterMatching = round(100 * nrow(matchDf) / nrow(binDf), 1)
  ssTab_afterMatching = table(matchDf[ , "treat"])

  # lets remove the NA causing variables again after matching
  matchDf0 = matchDf
  matchDf0[ , "dummy_response"] = (1:nrow(matchDf0)) / nrow(matchDf0)
  predCols_bin = FilterContCols_forCausal(
      binDf=matchDf0,
      predCols_bin=predCols_bin,
      valueCol="dummy_response")
  predCols_bin_withTreat = c("treat", predCols_bin)

  print("survived data percent after matching")
  print(dataPerc_afterMatching)

  return(list(
      "ssTab"=ssTab,
      "ssTab_afterRemoveNA"=ssTab_afterRemoveNA,
      "ssTab_afterMatching"=ssTab_afterMatching,
      "binDf"=binDf,
      "predCols_bin"=predCols_bin,
      "predCols_bin_withTreat"=predCols_bin_withTreat,
      "match"=match,
      "match_summ"=match_summ,
      "matchDf"=matchDf,
      "dataPerc_afterMatching"=dataPerc_afterMatching
      ))
}

TestProcess_andMatch = function() {

  # test 1
  # x1 has effect on y event after matching for x2
  # but the effect should be 2 after matching
  # and should almost (1 + 2) before
  #library("matchit")
  n = 5000
  treat = sample(c("drug", "placebo"), n, replace=TRUE)
  u1 = 2*(treat == "drug") + 5 * rnorm(n)
  u2 = sample(c("cat", "dog", "horse"), n, replace=TRUE)
  y = 3*(treat == "drug") + u1 - 2 * (u2 == "dog") + rnorm(n)
  df = data.frame("treat"=treat, "u1"=u1, "u2"=u2, "y"=y)

  valueCol = "y"
  predCols_categ = c("u2")
  predCols_conti = c("u1")
  treatCol = "treat"
  treatPair = c("placebo", "drug")
  figsPath = NULL
  method = "exact"
  df[ , "id"] = 1:nrow(df)

  res = Process_andMatch(
      df=df,
      idCol="id",
      predCols_categ=predCols_categ,
      predCols_conti=predCols_conti,
      treatCol=treatCol,
      treatPair=treatPair,
      figsPath=figsPath,
      method=method)

  summary(res[["match"]])

  n = 5000
  treat = sample(c("drug", "placebo"), n, replace=TRUE)
  u1 = 2*(treat == "drug") + 5 * rnorm(n)
  u2 = sample(c("cat", "dog", "horse"), n, replace=TRUE)
  y = 3*(treat == "drug") + u1 - 2 * (u2 == "dog") + rnorm(n)

  df = data.frame("treat"=treat, "u1"=u1, "u2"=u2, "y"=y)

  valueCol = "y"
  predCols_categ = c("u2")
  predCols_conti = c("u1")
  treatCol = "treat"
  treatPair = c("placebo", "drug")
  figsPath = NULL
  method = "nearest"


  res = Process_andMatch(
      df=df,
      idCol=NULL,
      predCols_categ=predCols_categ,
      predCols_conti=predCols_conti,
      treatCol=treatCol,
      treatPair=treatPair,
      figsPath=figsPath,
      method=method)

  summary(res[["match"]])

  res[["binDf"]]

}


# this fits models to matched data
# valueCol is extracted from original data
# for the exact method you use should set use_subclass to be TRUE
Fit_matchedData = function(
    df,
    valueCol,
    binDf, # binary data before matching to calculate raw metrics
    matchDf,
    idCol="row_num_orig",
    predCols_bin,
    use_subclass=FALSE,
    balance_wrtSubclass=FALSE) {

  # if df does not have the idCol, the index must be simply the row num
  if (!(idCol %in% colnames(df))) {
    df[ , idCol] = 1:nrow(df)
  }

  # adding valueCol from original data to matched data
  matchDf = merge(
      matchDf,
      df[ , c(idCol, valueCol)],
      by=idCol,
      all.x=TRUE,
      all.y=FALSE)

  binDf = merge(
      binDf,
      df[ , c(idCol, valueCol)],
      by=idCol,
      all.x=TRUE,
      all.y=FALSE)

  # Check if predictors are influential on response
  formulaStr = paste0(valueCol,  "~", paste(predCols_bin, collapse="+"))
  formula = as.formula(formulaStr)
  predRegMod = glm(formula, data=binDf)
  predRegMod_summ = summary(predRegMod)

  # fitting a simple model (only treat) with treatment before matching
  raw_ci = RawEffectCi(df=binDf, valueCol=valueCol, treatCol="treat")

  ## fitting another model with treatment and other covariates
  predCols_bin_withTreat = c("treat", predCols_bin)

  # fitting a glm model with preds and treatment
  formulaStr = paste0(
      valueCol,
      "~",
      paste(predCols_bin_withTreat, collapse="+"))

  formula = as.formula(formulaStr)
  regMod = glm(formula, data=binDf)
  regMod_summ = summary(regMod)
  coef = regMod_summ[["coefficients"]]
  rowName = rownames(coef)[grep("treat", rownames(coef))]
  coefDf = data.frame(coef)
  coefDf[ , "var"] = rownames(coef)
  regEffect = coefDf[(coefDf[ , "var"] %in% rowName), ]
  x = regEffect
  lower = x[["Estimate"]] - 1.96 * x[["Std..Error"]]
  upper = x[["Estimate"]] + 1.96 * x[["Std..Error"]]
  reg_ci = c(lower, upper)

  matchDf0 = matchDf
  if (use_subclass) {
    if (!("subclass" %in% colnames(matchDf))) {
      warning(paste(
          "subclass is not in matchDf.",
          "subclass is only avail for some matching methods."))
    }
    matchDf0 = matchDf[ , "subclass", drop=FALSE]
    matchDf0[ , "subclass"] = as.character(matchDf[ , "subclass"])
    formulaStr = "~subclass"
    formula = as.formula(formulaStr)
    binDf_subclass = model.matrix(formula, data=matchDf0)
    rm(matchDf0)
    binDf_subclass = SubsetCols(binDf_subclass, drop="(Intercept)")
    predCols_bin = colnames(binDf_subclass)
    matchDf = cbind(matchDf[ , c("treat", valueCol)], binDf_subclass)
    predCols_bin_withTreat = c("treat", predCols_bin)
  }

  ## fit zelig with treatment
  formulaStr = paste0(
      valueCol,
      "~",
      paste(predCols_bin_withTreat, collapse="+"))

  formula = as.formula(formulaStr)

  matchDf_balanced = NULL
  if (balance_wrtSubclass) {
    res = BalanceSampleSize_wrtCols(
        df=matchDf,
        sliceCols="treat",
        wrtCols="subclass",
        itemCols=NULL,
        sliceCombinValues_toBalance=NULL)
    matchDf_balanced = res[["subDf"]]
    matchDf = matchDf_balanced
  }

  raw_matched_ci = RawEffectCi(
      df=matchDf, valueCol=valueCol, treatCol="treat")

  ## fitting a model
  zeligMod = Zelig:::zelig(
      formula,
      data=matchDf,
      model="ls",
      cite=FALSE)

  zeligMod_summ = summary(zeligMod)

  res = GetEffect_fromZeligMod(zeligMod=zeligMod)

  estimEffect = res[["estimEffect"]]
  estimEffect_summ = res[["estimEffect_summ"]]
  fd_ci = res[["fd_ci"]]
  fd_mean = res[["fd_mean"]]

  return(list(
      "matchDf_balanced"=matchDf_balanced,
      "zeligMod"=zeligMod,
      "zeligMod_summ"=zeligMod_summ,
      "estimEffect"=estimEffect,
      "estimEffect_summ"=estimEffect_summ,
      "raw_ci"=raw_ci,
      "reg_ci"=reg_ci,
      "raw_matched_ci"=raw_matched_ci,
      "fd_ci"=fd_ci,
      "fd_mean"=fd_mean
      ))
}


TestFit_matchedData = function() {


  # test 1
  # x1 has effect on y event after matching for x2
  # but the effect should be 2 after matching
  # and should almost (1 + 2) before
  #library("matchit")
  n = 5000
  treat = sample(c("drug", "placebo"), n, replace=TRUE)
  u1 = 2*(treat == "drug") + 5 * rnorm(n)
  u2 = sample(c("cat", "dog", "horse"), n, replace=TRUE)
  y = 3*(treat == "drug") + u1 - 2 * (u2 == "dog") + rnorm(n)

  df = data.frame("treat"=treat, "u1"=u1, "u2"=u2, "y"=y)

  valueCol = "y"
  predCols_categ = c("u2")
  predCols_conti = c("u1")
  treatCol = "treat"
  treatPair = c("placebo", "drug")
  figsPath = NULL
  method = "exact"

  matchRes = Process_andMatch(
      df=df,
      idCol=NULL,
      predCols_categ=predCols_categ,
      predCols_conti=predCols_conti,
      treatCol=treatCol,
      treatPair=treatPair,
      figsPath=figsPath,
      method=method)

  binDf = matchRes[["binDf"]]
  matchDf = matchRes[["matchDf"]]
  predCols_bin = matchRes[["predCols_bin"]]

  fitRes = Fit_matchedData(
      df=df,
      valueCol="y",
      binDf=binDf, # binary data before matching
      matchDf=matchDf,
      idCol="row_num_orig",
      predCols_bin=predCols_bin,
      use_subclass=FALSE)


  ## for the exact method we should set use_subclass to be TRUE
  fitRes = Fit_matchedData(
      df=df,
      valueCol="y",
      binDf=binDf, # binary data before matching
      matchDf=matchDf,
      idCol="row_num_orig",
      predCols_bin=predCols_bin,
      use_subclass=TRUE)


  # test 2
  # x1 has effect on y event after matching for x2
  # but the effect should be 2 after matching
  # and should almost (1 + 2) before
  #library("matchit")
  n = 100
  treat = sample(c("drug", "placebo"), n, replace=TRUE)
  u1 = 2*(treat == "drug") + 5 * rnorm(n)
  u2 = sample(paste0("label", as.character(1:5)), n, replace=TRUE)
  y = 3*(treat == "drug") + u1 - 2 * (u2 == "label1") + rnorm(n)

  df = data.frame("treat"=treat, "u1"=u1, "u2"=u2, "y"=y)

  valueCol = "y"
  predCols_categ = c("u2")
  predCols_conti = c("u1")
  treatCol = "treat"
  treatPair = c("placebo", "drug")
  figsPath = NULL
  method = "exact"

  matchRes = Process_andMatch(
      df=df,
      idCol=NULL,
      predCols_categ=predCols_categ,
      predCols_conti=predCols_conti,
      treatCol=treatCol,
      treatPair=treatPair,
      figsPath=figsPath,
      method=method)

  binDf = matchRes[["binDf"]]
  matchDf = matchRes[["matchDf"]]
  predCols_bin = matchRes[["predCols_bin"]]

  matchDt = data.table(matchDf)
  SortDf(matchDt[ , .N, c("treat", "subclass")], "subclass")

  res = BalanceSampleSize(
      df=matchDf,
      sliceCols=c("treat", "subclass"))

  matchDf_balanced =  res[["subDf"]]
  matchDt_balanced = data.table(matchDf_balanced)
  SortDf(matchDt_balanced[ , .N, c("treat", "subclass")], "subclass")


  fitRes = Fit_matchedData(
      df=df,
      valueCol="y",
      binDf=binDf, # binary data before matching
      matchDf=matchDf,
      idCol="row_num_orig",
      predCols_bin=predCols_bin,
      use_subclass=FALSE)

  fitRes[["raw_ci"]]
  fitRes[["fd_ci"]]

  fitRes = Fit_matchedData(
      df=df,
      valueCol="y",
      binDf=binDf, # binary data before matching
      matchDf=matchDf,
      idCol="row_num_orig",
      predCols_bin=predCols_bin,
      use_subclass=FALSE)

  fitRes[["raw_ci"]]
  fitRes[["fd_ci"]]

  ## balance sample size
  n = 2000
  treat = sample(c("drug", "placebo"), n, replace=TRUE)
  u1 = 2*(treat == "drug") + 5 * rnorm(n)
  u2 = sample(paste0("label", as.character(1:5)), n, replace=TRUE)
  y = 3*(treat == "drug") + u1 - 2 * (u2 == "label1") + rnorm(n)

  df = data.frame("treat"=treat, "u1"=u1, "u2"=u2, "y"=y)

  valueCol = "y"
  predCols_categ = c("u2")
  predCols_conti = c("u1")
  treatCol = "treat"
  treatPair = c("placebo", "drug")
  figsPath = NULL
  method = "exact"

  matchRes = Process_andMatch(
      df=df,
      idCol=NULL,
      predCols_categ=predCols_categ,
      predCols_conti=predCols_conti,
      treatCol=treatCol,
      treatPair=treatPair,
      figsPath=figsPath,
      method=method)

  binDf = matchRes[["binDf"]]
  matchDf = matchRes[["matchDf"]]
  predCols_bin = matchRes[["predCols_bin"]]


  ## for the exact method we should set use_subclass to be TRUE
  fitRes = Fit_matchedData(
      df=df,
      valueCol="y",
      binDf=binDf, # binary data before matching
      matchDf=matchDf,
      idCol="row_num_orig",
      predCols_bin=predCols_bin,
      use_subclass=FALSE,
      balance_wrtSubclass=TRUE
      )

  fitRes[["raw_ci"]]
  fitRes[["fd_ci"]]
  fitRes[["raw_matched_ci"]]
  matchDf_balanced = fitRes[["matchDf_balanced"]]
  dt = data.table(matchDf_balanced)
  SortDf(dt[ , .N, c("treat", "subclass")], "subclass")

}


# this does both steps of matching and fitting
# if the method is exact use_subclass should be FALSE or left to be NULL
Match_andFit = function(
    df,
    valueCol,
    predCols_categ=NULL,
    predCols_conti=NULL,
    treatCol,
    treatPair=c(0, 1),
    figsPath=NULL,
    method="nearest",
    dichomNum=4,
    use_subclass=FALSE,
    balance_wrtSubclass=FALSE
    ) {

  # use_subclass should be TRUE if the method is exact
  #if (is.null(use_subclass) & method != "exact") {
  #  use_subclass = FALSE
  #}

  #if (is.null(use_subclass) & method == "exact") {
  #  use_subclass = TRUE
  #}

  matchRes = Process_andMatch(
      df,
      idCol=NULL,
      predCols_categ=predCols_categ,
      predCols_conti=predCols_conti,
      treatCol=treatCol,
      treatPair=treatPair,
      figsPath=figsPath,
      method=method,
      dichomNum=dichomNum)

  binDf = matchRes[["binDf"]]
  matchDf = matchRes[["matchDf"]]
  predCols_bin = matchRes[["predCols_bin"]]

  fitRes = Fit_matchedData(
      df=df,
      valueCol=valueCol,
      binDf=binDf, # binary data before matching
      matchDf=matchDf,
      idCol="row_num_orig",
      predCols_bin=predCols_bin,
      use_subclass=use_subclass,
      balance_wrtSubclass=balance_wrtSubclass)

  res = c(matchRes, fitRes)
  return(res)

}


TestMatch_andFit = function() {

  # test 1
  # x1 has effect on y event after matching for x2
  # but the effect should be 2 after matching
  # and should almost (1 + 2) before
  #library("matchit")
  n = 5000
  treat = sample(c("drug", "placebo"), n, replace=TRUE)
  u1 = 2*(treat == "drug") + 5 * rnorm(n)
  u2 = sample(c("cat", "dog", "horse"), n, replace=TRUE)
  y = 3*(treat == "drug") + u1 - 2 * (u2 == "dog") + rnorm(n)

  df = data.frame("treat"=treat, "u1"=u1, "u2"=u2, "y"=y)

  valueCol = "y"
  predCols_categ = c("u2")
  predCols_conti = c("u1")
  treatCol = "treat"
  treatPair = c("placebo", "drug")
  figsPath = NULL
  method = "nearest"

  res = Match_andFit(
    df=df,
    valueCol=valueCol,
    predCols_categ=predCols_categ,
    predCols_conti=predCols_conti,
    treatCol=treatCol,
    treatPair=treatPair,
    figsPath=figsPath,
    method=method,
    use_subclass=FALSE)

  summary(res[["match"]])

  res[["estimEffect"]]
  res[["raw_ci"]]
  res[["reg_ci"]]
  res[["fd_ci"]]
  res[["fd_mean"]]

  # x1 should not have any effect after controlling for x2
  n = 200
  treat = sample(c("1", "0"), n, replace=TRUE)
  u1 = 1*(treat == "1") + 2*rnorm(n)
  u2 = sample(c("cat", "dog", "horse"), n, replace=TRUE)
  y = 2 * u1 - 2 * (u2 == "dog") + rnorm(n)
  df = data.frame("treat"=treat, "u1"=u1, "u2"=u2, "y"=y)

  valueCol = "y"
  predCols_categ = c("u2")
  predCols_conti = "u1"
  treatCol = "treat"
  treatPair = c("0", "1")
  figsPath = NULL

  res = Match_andFit(
    df=df,
    valueCol=valueCol,
    predCols_categ=predCols_categ,
    predCols_conti=predCols_conti,
    treatCol=treatCol,
    treatPair=treatPair,
    figsPath=figsPath,
    method="exact")

  res[["raw_ci"]]
  res[["reg_ci"]]
  res[["fd_ci"]]

  res = Match_andFit(
    df=df,
    valueCol=valueCol,
    predCols_categ=predCols_categ,
    predCols_conti=predCols_conti,
    treatCol=treatCol,
    treatPair=treatPair,
    figsPath=figsPath,
    method="exact",
    use_subclass=TRUE)

  res[["raw_ci"]]
  res[["reg_ci"]]
  res[["fd_ci"]]

  s = summary(res[["match"]])
  s
  sum(s[["q.table"]]["Total"])

  ##
  n = 2000
  treat = sample(c("1", "0"), n, replace=TRUE)
  u1 = 1*(treat == "1") + 2*rnorm(n)
  u2 = sample(c("cat", "dog", "horse"), n, replace=TRUE)
  u3 = u2
  y = 2 * u1 - 2 * (u2 == "dog") + rnorm(n)
  df = data.frame("treat"=treat, "u1"=u1, "u2"=u2, "u3"=u3, "y"=y)

  valueCol = "y"
  predCols_categ = c("u2", "u3")
  predCols_conti = "u1"
  treatCol = "treat"
  treatPair = c("0", "1")
  figsPath = NULL

  res = Match_andFit(
    df=df,
    valueCol=valueCol,
    predCols_categ=predCols_categ,
    predCols_conti=predCols_conti,
    treatCol=treatCol,
    treatPair=treatPair,
    figsPath=figsPath,
    method="exact",
    balance_wrtSubclass=TRUE)

  res[["raw_ci"]]
  res[["reg_ci"]]
  res[["fd_ci"]]
}

# get the output from zelig
GetEffect_fromZeligMod = function(zeligMod) {

  library(dplyr)
  # we use try as the zelig set values can fail due to NA's
  # in the coefs
  trial1 = try({
      xCont = Zelig:::setx(zeligMod, treat=0)
  })
  trial2 = try({
      xTreat = Zelig:::setx(zeligMod, treat=1)
  })

  if (class(trial1) == "try-error" || class(trial2) == "try-error") {
    return(list(
        "estimEffect"=NULL,
        "estimEffect_summ"=NULL,
        "fd_ci"="",
        "fd_mean"=NULL,
        "att_ci"=att_ci))
  }

  estimEffect = Zelig:::sim(zeligMod, x=xCont, x1=xTreat)
  estimEffect_summ = summary(estimEffect)
  zqi = zeligMod %>%
              Zelig:::setx(treat="0") %>%
              Zelig:::setx1(treat="1") %>%
              Zelig:::sim()

  #pv0 = zqi$get_qi(qi ="pv", xvalue = "x")
  #ev0 = zqi$get_qi(qi = "ev", xvalue = "x")
  #pv1 = zqi$get_qi(qi = "pv", xvalue = "x")
  #ev1 = zqi$get_qi(qi = "ev", xvalue = "x1")
  fd = zqi$get_qi(qi="fd", xvalue="x1")
  fd_ci = quantile(fd, c(0.025, 0.975))
  fd_mean = mean(fd)

  att_ci = NULL
  #att = (zeligMod %>%
  #  ATT(treatment="treat",  treat=1) %>%
  #  get_qi(qi="ATT", xvalue="TE"))
  #att_ci = quantile(att, c(0.025, 0.0975))

  return(list(
      "estimEffect"=estimEffect,
      "estimEffect_summ"=estimEffect_summ,
      "fd_ci"=fd_ci,
      "fd_mean"=fd_mean,
      "att_ci"=att_ci))
}