R/Inference.R

#----------------------------------------------------------------------------------
# Estimate IC-based Variance (as. Var) and CIs (based on f_W and fY)
# New fast method for as var calculation (matrix vs)
#----------------------------------------------------------------------------------
# use estimates of fWi (hold all W's fixed at once),
# loop over all intersecting friends networks
# calculate R_ij*(fW_i*fW_j) - see page 33 Network paper vdL
#----------------------------------------------------------------------------------
# unified estimator naming used throughout the package:
# c("TMLE", "h_IPTW", "MLE")
#----------------------------------------------------------------------------------


# ------------------------------------------------------------------------------
# THIS FUNCTION IS NOT USED, ONLY HERE FOR TESTING SPARSE CONNECTIVITY MATRIX CONVERSION / CREATION
get_sparse_Fiintersectmtx <- function() {
  # ------------------------------------------------------------------------------
  # Simulate some network data
  # ------------------------------------------------------------------------------
  n <- 20000
  # n <- 10000
  # n <- 1000
  # fvec_i <- abs(rnorm(n))
  fvec_i <- rnorm(n)
  # fvec_i <- rep_len(1,n)
  # NetInd_k <- matrix(NA, nrow=n, ncol=3)
  # NetInd_k[1,] <- c(2,3,NA)
  # NetInd_k[4,] <- c(2,NA,NA)
  # nF <- rep.int(0,n)
  # nF[1] <- 2; nF[4] <- 1
  # Kmax <- 150
  Kmax <- 200
  # Kmax <- 15
  NetInd_k <- t(replicate(n, sample(1:n, Kmax, replace = FALSE)))
  nF <- rep.int(Kmax, n)
  # ------------------------------------------------------------------------------

  # ------------------------------------------------------------------------------
  # Using sparse matrix implementation to get conn_ind_mtx_1st_indir
  # ------------------------------------------------------------------------------
  # estvartime <- system.time({
    # NetInd as sparse adjacency matrix (new version returns pattern sparse mat nngCMatrix):
    sparse_mat <- NetInd.to.sparseAdjMat(NetInd_k, nF = nF, add_diag = TRUE)
    # Second pass over columns of connectivity mtx to connect indirect intersections (i and j have a common friend but are not friends themselves):
    sparse_mat_ind <- Matrix::crossprod(sparse_mat) # t(sparse_mat)%*%sparse_mat returns nsCMatrix (only non-zero entries)
    # MOVED TO est.sigma_sparse():
    Dstar_crossprod_new <- sum_crossprod_Fij(sparseAdjMat = sparse_mat_ind, fvec_i = fvec_i)
    Dstar_new <- (1/n) * ((2*Dstar_crossprod_new) + sum(fvec_i^2))
  # })
  # print("estvartime"); print(estvartime)

  # print(object.size(NetInd_k), units="Mb")
  # 3906.4 Kb # 0 Gb
  # sparse_mat as dgCMatrix:
  # print(object.size(sparse_mat), units="Mb")
  # 12,032.1 Kb
  # system.time({
  #   conn_ind_mtx_1st_indir <- get.Fiintersectmtx(n = n)$conn_ind_mtx_1st
  #   Dstar_old <- est.sigma_fsum(get.crossprodmtx(fvec_i), conn_ind_mtx_1st_indir)
  # })
  # print(all.equal(Dstar_new, Dstar_old))
  # # print(all.equal(Dstar, Dstar_old)); print(Dstar);
  # print(Dstar_new); print(Dstar_old)
  return(list(conn_ind_mtx_1st = sparse_mat_ind))
}

# sparse matrix class from Matrix package:
# dgCMatrix
  # class is a class of sparse numeric matrices in the compressed, sparse, column-oriented format.
# nsparseMatrix-classes
# ?'nsCMatrix-class'
# ngCMatrix, nsCMatrix, and ntCMatrix
  # class of sparse pattern matrices, i.e., binary matrices conceptually with TRUE/FALSE entries. Only the positions of the elements that are TRUE are stored.
  # Objects can be created by calls of the form new("ngCMatrix", ...) and so on.
  # More frequently objects are created by coercion of a numeric sparse matrix to the pattern form for use in the symbolic
  # analysis phase of an algorithm involving sparse matrices. Such algorithms often involve two phases: a symbolic phase wherein the
  # positions of the non-zeros in the result are determined and a numeric phase wherein the actual results are calculated.
  # During the symbolic phase only the positions of the non-zero elements in any operands are of interest, hence numeric sparse matrices can be treated as sparse pattern matrices.
# lsparseMatrix-classes {Matrix}
# lsCMatrix
# The second letter in the name of these non-virtual classes indicates general, symmetric, or triangular.
# The lsparseMatrix class is a virtual class of sparse matrices with TRUE/FALSE or NA entries. Only the positions of the elements that are TRUE are stored.
# Copied from simcausal, then modified to work as sparse pattern matrix:
# return pattern sparse matrix, no @x is recorded (ngMatrix):
NetInd.to.sparseAdjMat <- function(NetInd_k, nF, add_diag = FALSE) {
  nobs <- nrow(NetInd_k)
  sdims <- c(nobs, nobs)
  nnonzero <- sum(!is.na(NetInd_k))
  sparse_p <- as.integer(c(0, cumsum(nF)))
  sparse_iwNA <- as.vector(t(NetInd_k))
  sparse_i <- sparse_iwNA[!is.na(sparse_iwNA)] - 1
  out_sparseAdjMat <-  Matrix::sparseMatrix(i = sparse_i, p = sparse_p, dims = sdims, index1 = FALSE, giveCsparse = TRUE)
  if (add_diag) diag(out_sparseAdjMat) <- TRUE
  return(out_sparseAdjMat)
}

# For correlated i,j (s.t. i<j), add a term f_vec[i]*f_vec[j] to the cumulative sum
# Helper function to calculate cross product sum of correlated f_Wi (see p.33 of vdL)
sum_crossprod_Fij <- function(sparseAdjMat, fvec_i, excludeIDs = NULL) {
  # sparseAdjMat:
  # nnzero: number of non-zero elements
  # i: These are the 0-based row numbers for each non-zero element in the matrix.
  # "integer" of length nnzero, 0- based row numbers for each non-zero element in the matrix, i.e., i must be in 0:(nrow(.)-1).
  # p: integer vector for providing pointers, one for each column, to the initial (zero-based) index of elements in the column.
  # .@p is of length ncol(.) + 1, with p[1] == 0 and
  # p[length(p)] == nnzero, such that in fact, diff(.@p) are the number of non-zero elements for each column.
  assertthat::assert_that(is(sparseAdjMat, "sparseMatrix"))
  # 1) The number of friends for each observation:
  nF <- as.integer(diff(sparseAdjMat@p))
  # 2) Column based cummulative number of non-zero entries (cummulative nF)
  cumFindx <- sparseAdjMat@p
  # 3) All non-zero elements as a vector of 0-based row numbers:
  base0_IDrownums <- sparseAdjMat@i
  # 4) For each observation i that has non-zero nF (friends), add fvec_i[i]*fvec_Fj for each friend Fj of i:
  non0nF.idx <- which(nF > 1L) # don`t care if nF[i]=1 since it means i has 0 actual friends (i itself is included in nF)
  # non0nF.idx <- which(nF > 0L)

  Dstar_crossprod <- 0
  for (idx in non0nF.idx) {
    Fidx_base0 <- (cumFindx[idx]) : (cumFindx[idx + 1] - 1)
    FriendIDs <- base0_IDrownums[Fidx_base0 + 1] + 1
    # always remove the diag term (idx,idx) from FriendIDs (always the last entry),
    # since we only need to sum all fvec_i[i]^2 once (outside this fun)
    if (!is.null(excludeIDs)) {
      exclude_now <- c(which(FriendIDs %in% excludeIDs), length(FriendIDs))
    } else {
      exclude_now <- length(FriendIDs)
    }
    # FriendIDs <- FriendIDs[-length(FriendIDs)]
    FriendIDs <- FriendIDs[-exclude_now]
    Dstar_crossprod <- Dstar_crossprod + sum(fvec_i[idx] * fvec_i[FriendIDs])
  }
  return(Dstar_crossprod)
}

# Sum the cross prod vector over connectivity mtx (prod will only appear if (i,j) entry is 1):
est.sigma_sparse <- function(fvec_i, sparse_connectmtx, excludeIDs = NULL)  {
  n <- length(fvec_i)
  # sum of fvec_i[i]*fvec[j] for correlated cross-product terms (i,j), s.t., i<j
  Dstar_crossprod <- sum_crossprod_Fij(sparseAdjMat = sparse_connectmtx, fvec_i = fvec_i, excludeIDs = excludeIDs)
  # double cross prod sum + sum of squares over i=1,...,n
  Dstar <- (1/n) * ((2*abs(Dstar_crossprod)) + sum(fvec_i^2))
  return(Dstar)
}

est_sigmas <- function(estnames, n, NetInd_k, nF, obsYvals, ests_mat, QY_mat, wts_mat, fWi_mat, MC.tmle.eval) {
  `%+%` <- function(a, b) paste0(a, b)
  fWi <- fWi_mat[, "fWi_Qinit"]
  QY.init <- QY_mat[, "QY.init"]
  h_wts <- wts_mat[, "h_wts"]

  # NetInd as sparse adjacency matrix (new version returns pattern sparse mat ngCMatrix):
  sparse_mat <- NetInd.to.sparseAdjMat(NetInd_k, nF = nF, add_diag = TRUE)
  # Second pass over columns of connectivity mtx to connect indirect intersections (i and j have a common friend but are not friends themselves):
  connectmtx_1stO <- Matrix::crossprod(sparse_mat) # t(sparse_mat)%*%sparse_mat returns nsCMatrix (only non-zero entries)

  ## EIC for unconditional parameter psi_0 (marginilizing over W):
  IC_tmle <- h_wts * (obsYvals - QY.init) + (fWi - MC.tmle.eval)
  # IC_tmle <- h_wts * (obsYvals - QY.init) + (fWi - ests_mat[rownames(ests_mat)%in%"TMLE",])
  ## EIC for data-adaptive / conditional parameter psi_0(W) (conditioning on all W):
  IC_Q_tmle <- h_wts * (obsYvals - QY.init)
  ## EIC for IPTW (based on the mixture density clever covariate (h)):
  IC_iptw_h <- h_wts * (obsYvals) - (ests_mat[rownames(ests_mat)%in%"h_IPTW",])

  ## TMLE variance for psi_0 (a double sum to adjust for correlated W and Y):
  var_tmle <- est.sigma_sparse(IC_tmle, connectmtx_1stO)
  # var_tmle_indep <- (1/n) * sum(IC_tmle^2)
  ## TMLE variance for psi_0(W) (a double sum to adjust for correlated Y), gives inference CONDITIONAL all W, assuming dependent Q | A^s,W^s:
  var_tmle_W <- est.sigma_sparse(IC_Q_tmle, connectmtx_1stO)
  # var_tmle_W_corr <- est.sigma_sparse(abs(IC_Q_tmle), connectmtx_1stO)

  ## Gives inference CONDITIONAL all W, assuming independent Q | A^s,W^s:
  var_tmle_W_indQ <- (1/n) * sum(IC_Q_tmle^2)
  ## IPW inference:
  var_iptw_h <- est.sigma_sparse(IC_iptw_h, connectmtx_1stO)

  ## record variance ests for marginal parameter (psi_0):
  IC.dep.vars <- c(abs(var_tmle), abs(var_iptw_h), NA)
  ## record variance ests for parameter conditional on W (allowing for dependent Q)
  condW.vars <- c(abs(var_tmle_W), NA, NA)
  ## Inference conditional on W (assuming only independent Q, conditional on all W,A)
  condW.indepQ.vars <- c(abs(var_tmle_W_indQ), NA, NA)

  # Simple estimator of the iid asymptotic IC-based variance (no adjustment made when two observations i!=j are dependent):
  iid_var_tmle <- (1/n) * sum(IC_tmle^2)
  iid_var_iptw_h <- mean((IC_iptw_h)^2)
  # IID inference (ignores all dependence)
  iid.vars = c(abs(iid_var_tmle), # no adjustment for correlations i,j for tmle
               abs(iid_var_iptw_h), # no adjustment for correlations i,j for iptw
               NA)

  # Alternative TMLE variance estimators using decomposition of the EIC
  ## variance for the 2nd componenent of the EIC:
  IC_W_tmle <- (fWi - MC.tmle.eval)
  var_IC_W_tmle <- est.sigma_sparse(IC_W_tmle, connectmtx_1stO)
  aux.vars <- c(factorized_DY_DW = var_tmle_W_indQ + var_IC_W_tmle)

  # return(list(as.var_mat = as.var_mat))
  return(list(IC.dep.vars = as.matrix(IC.dep.vars),
              condW.vars = as.matrix(condW.vars),
              condW.indepQ.vars = as.matrix(condW.indepQ.vars),
              iid.vars = as.matrix(iid.vars),
              aux.vars = as.matrix(aux.vars)))
}

# recursively check for outvar name until found, then return the appropriate SummaryModel object:
findRegSummaryObj <- function(fit.obj, outvar) {
  if (is.list(fit.obj$outvar) | (length(fit.obj$outvar)>1)) {
    return(unlist(lapply(fit.obj$getPsAsW.models(), findRegSummaryObj, outvar)))
  } else if (fit.obj$outvar %in% outvar) {
    return(fit.obj)
  }
}

fit_reg.forms <- function(boot.form, DatNet.ObsP0, sW, sA) {
  boot.form.sVars <- process_regforms(boot.form, sW.map = sW$sVar.names.map, sA.map = sA$sVar.names.map)
  pred_nms <- boot.form.sVars$predvars
  boot_outvar_nms <- boot.form.sVars$outvars
  check.pred.exist <- unlist(lapply(unlist(pred_nms), function(sWname) sWname %in% c(DatNet.ObsP0$datnetW$names.sVar,DatNet.ObsP0$datnetA$names.sVar)))
  if (!all(check.pred.exist)) stop("the following predictors from boot.form regression could not be located in sW/sA summary measures: " %+%
                                    paste0(pred_nms[!check.pred.exist], collapse = ","))
  check.outvar.exist <- unlist(lapply(boot_outvar_nms, function(sAname) sAname %in% c(DatNet.ObsP0$datnetW$names.sVar,DatNet.ObsP0$datnetA$names.sVar)))
  if (!all(check.outvar.exist)) stop("the following outcomes from boot.form regression could not be located in sW/sA summary measures: " %+%
                                    paste0(boot_outvar_nms[!check.outvar.exist], collapse = ","))
  outvar_class <- lapply(boot_outvar_nms, function(sA_nms) DatNet.ObsP0$datnetA$type.sVar[sA_nms])
  subsets_expr <- lapply(boot_outvar_nms, function(var) lapply(var, function(var) {var}))
  regclass.boot <- RegressionClass$new(sep_predvars_sets = TRUE,
                                     outvar.class = outvar_class,
                                     outvar = boot_outvar_nms,
                                     predvars = pred_nms,
                                     subset = subsets_expr)
  regclass.boot$S3class <- "generic"
  summeas_boot <- newsummarymodel(reg = regclass.boot, DatNet.sWsA.g0 = DatNet.ObsP0)$fit(data = DatNet.ObsP0)
  return(list(boot_outvar_nms = boot_outvar_nms, summeas_boot = summeas_boot))
}


# --------------------------------------------------------------------------------------------------------
# Evaluating the D_Wi component of the EIC IC with Monte-Carlo sampling (requires evaluation of psi_i_n)
# --------------------------------------------------------------------------------------------------------
MCeval_fWi <- function(n.MC, DatNet.ObsP0, tmle_g1_out, tmle_g2_out) {
  mean_obs_gcomp_g1 <- vector(mode = "numeric", length = DatNet.ObsP0$nobs)
  mean_obs_gcomp_g2 <- vector(mode = "numeric", length = DatNet.ObsP0$nobs)
  mean_obs_tmle_B_g1 <- vector(mode = "numeric", length = DatNet.ObsP0$nobs)
  mean_obs_tmle_B_g2 <- vector(mode = "numeric", length = DatNet.ObsP0$nobs)
  mean_obs_gcomp_g1[] <- mean_obs_gcomp_g2[] <- mean_obs_tmle_B_g1[] <- mean_obs_tmle_B_g2[] <- 0L
  psi.evaluator <- tmle_g1_out$psi.evaluator
  # -----------------------------------------------------------------------------------------------
  # Always start with the observed Anodes
  # Save the original input data.table OdataDT, otherwise it will be over-written:
  # Note we are not saving the original saved Anodes and sA -> these fields NULLed during bootstrap
  # -----------------------------------------------------------------------------------------------
  if (!DatNet.ObsP0$Odata$curr_data_A_g0) {
    DatNet.ObsP0$Odata$restoreAnodes()
    DatNet.ObsP0$Odata$curr_data_A_g0 <- TRUE
  }
  OdataDT.P0 <- DatNet.ObsP0$Odata$OdataDT
  noNA.Ynodevals.P0 <- DatNet.ObsP0$noNA.Ynodevals
  det.Y.P0 <- DatNet.ObsP0$det.Y
  DatNet.gstar <- tmle_g1_out$DatNet.gstar

  # -----------------------------------------------------------------------------------------------
  # loop over n.MC
  # -----------------------------------------------------------------------------------------------
  for (i in (1:n.MC)) {
    # 1. Resample W (with replacement) as if IID (re-purposing the instance of DatNet.gstar); Re-evaluate baseline summaries sW;
    boot_idx <- sample.int(n = DatNet.ObsP0$nobs, replace = TRUE)
    DatNet.ObsP0$Odata$OdataDT <- OdataDT.P0[boot_idx, ]
    # Need to NULL previously backed-up values of A and sA, since they no longer correspond with bootstrapped sample
    DatNet.ObsP0$Odata$A_g0_DT <- NULL
    DatNet.ObsP0$Odata$sA_g0_DT <- NULL
    DatNet.ObsP0$Odata$save_sA_Vars <- NULL
    DatNet.ObsP0$datnetW$make.sVar(sVar.object = tmle_g1_out$sW)
    DatNet.ObsP0$datnetW$fixmiss_sVar() # permanently replace NA values in sW with 0
    DatNet.ObsP0$det.Y <- det.Y.P0[boot_idx]
    DatNet.ObsP0$noNA.Ynodevals <- noNA.Ynodevals.P0[boot_idx]

    # 2. USE old observed A's (no re-sampling)
    for (Anode in DatNet.ObsP0$nodes$Anodes) {
      DatNet.ObsP0$Odata$replaceOneAnode(AnodeName = Anode, newAnodeVal = OdataDT.P0[[Anode]])
    }
    # Re-evaluate exposure summaries based on resampled W and old (observed) A's
    DatNet.ObsP0$datnetA$make.sVar(sVar.object = tmle_g1_out$sA)
    DatNet.ObsP0$Odata$curr_data_A_g0 <- TRUE

    # 4. Predict P(Y_i=1|sW,sA) using m.Q.init (the initial fit \bar{Q}_N) based on newly resampled W and re-computed summaries (sW,sA)
    detY.boot <- DatNet.ObsP0$det.Y
    QY.init.boot <- DatNet.ObsP0$noNA.Ynodevals
    QY.init.boot[!detY.boot] <- tmle_g1_out$m.Q.init$predict(newdata = DatNet.ObsP0)$getprobA1[!detY.boot] # getting predictions P(Y=1) for non-DET Y
    # off.boot <- qlogis(QY.init.boot)  # offset

    # 8. Re-create DatNet.gstar with boostrapped W and sA generated under f.gstar:
    # However, first need to save (Anodes, sA) from the observed sample, otherwise they are forever lost
    DatNet.ObsP0$Odata$backupAnodes(sA = tmle_g1_out$sA)

    # Will over-write Anodes/sA in DatNet.ObsP0$Odata with tmle_g1_out$new.sA:
    DatNet.gstar$make.dat.sWsA(p = 1, new.sA.object = tmle_g1_out$new.sA, sA.object = tmle_g1_out$sA, DatNet.ObsP0 = DatNet.ObsP0)

    # 9. Evaluate the substitution estimator and the components of the EIC D_Y and D_W:
    fWi.g1 <- psi.evaluator$get.gcomp(m.Q.init = tmle_g1_out$m.Q.init)
    tmle.g1 <- psi.evaluator$get.tmleB(m.Q.starB = tmle_g1_out$MC_fit_params$m.Q.star)

    mean_obs_gcomp_g1 <- mean_obs_gcomp_g1 + fWi.g1
    mean_obs_tmle_B_g1 <- mean_obs_tmle_B_g1 + tmle.g1

    # 10. If (!is.null(tmle_g2_out)) then the above steps 8 & 9 are repeated for tmle_g2_out and the ATE.
    # First restore (Anodes,sA) that were generated in the observed bootstrapped sample!
    # In this case the intervention summaries, s.a, "def_new_sA(A = A)" will correctly use the observed bootstrapped values of A
    # -----------------------------------------------------------------------------------------------------------------------------
    if (!is.null(tmle_g2_out)) {
      # * restore the observed boostrapped Anodes and sA
      DatNet.ObsP0$Odata$restoreAnodes()
      # * verify sA's were also restored and if not, regenerate them
      if (!DatNet.ObsP0$Odata$restored_sA_Vars) DatNet.ObsP0$datnetA$make.sVar(sVar.object = tmle_g2_out$sA)
      DatNet.gstar$make.dat.sWsA(p = 1, new.sA.object = tmle_g2_out$new.sA, sA.object = tmle_g2_out$sA, DatNet.ObsP0 = DatNet.ObsP0)

      fWi.g2 <- psi.evaluator$get.gcomp(m.Q.init = tmle_g2_out$m.Q.init)
      tmle.g2 <- psi.evaluator$get.tmleB(m.Q.starB = tmle_g2_out$MC_fit_params$m.Q.star)
      mean_obs_gcomp_g2 <- mean_obs_gcomp_g2 + fWi.g2
      mean_obs_tmle_B_g2 <- mean_obs_tmle_B_g2 + tmle.g2
    }
  }

  mean_obs_gcomp_g1 <- mean_obs_gcomp_g1 / n.MC
  mean_obs_gcomp_g2 <- mean_obs_gcomp_g2 / n.MC
  mean_obs_tmle_B_g1 <- mean_obs_tmle_B_g1 / n.MC
  mean_obs_tmle_B_g2 <- mean_obs_tmle_B_g2 / n.MC

  # Restore the original data.table OdataDT, Y values and indicator of deterministic Y values
  # Otherwise it messes up the IC-based inference, since it uses the observed Yvals to est. the IC
  DatNet.ObsP0$Odata$OdataDT <- OdataDT.P0
  DatNet.ObsP0$noNA.Ynodevals <- noNA.Ynodevals.P0
  DatNet.ObsP0$det.Y <- det.Y.P0
  # these values are based on bootstrapped and or resampled W, so better to erase them:
  DatNet.ObsP0$Odata$curr_data_A_g0 <- TRUE
  DatNet.ObsP0$Odata$A_g0_DT <- NULL
  DatNet.ObsP0$Odata$sA_g0_DT <- NULL
  DatNet.ObsP0$Odata$save_sA_Vars <- NULL

  # mean_obs_gcomp_g1 = mean_obs_gcomp_g1,
  out_mean_tmleB <- list(EY_gstar1 = mean_obs_tmle_B_g1, EY_gstar2 = mean_obs_tmle_B_g2, ATE = mean_obs_tmle_B_g1 - mean_obs_tmle_B_g2)
  return(out_mean_tmleB)
}

# --------------------------------------------------------------------------------------------------------
# Bootstrap with resampling of (sW,sA,Y) with replacement (as if iid) - DOESN'T WORK FOR DEPENDENT DATA
# --------------------------------------------------------------------------------------------------------
iid_bootstrap_tmle <- function(n.boot, estnames, DatNet.ObsP0, tmle_g_out, QY_mat, wts_mat) {
  QY.init <- QY_mat[, "QY.init"]
  off <- qlogis(QY.init)  # offset
  DatNet.gstar <- tmle_g_out$DatNet.gstar
  psi.evaluator <- tmle_g_out$psi.evaluator
  m.Q.init <- tmle_g_out$m.Q.init
  m.h.fit <- tmle_g_out$m.h.fit
  h_wts <- wts_mat[, "h_wts"]
  Y <- DatNet.ObsP0$noNA.Ynodevals
  determ.Q <- DatNet.ObsP0$det.Y

  #************************************************
  # BOOTSTRAP TMLE UPDATES:
  #************************************************
  # n.boot <- 100
  boot_eps <- vector(mode = "numeric", length = n.boot)
  boot_tmle_B <- vector(mode = "numeric", length = n.boot)
  for (i in (1:n.boot)) {
      boot_idx <- sample.int(n = DatNet.ObsP0$nobs, replace = TRUE)
      boot.tmle.obj <- tmle.update(estnames = estnames,
                                   Y = Y[boot_idx], off = off[boot_idx], h_wts = h_wts[boot_idx],
                                   determ.Q = determ.Q[boot_idx], predictQ = FALSE)
      boot_eps[i] <- boot.tmle.obj$m.Q.star.coef
      boot_tmle_B[i] <- mean(psi.evaluator$get.boot.tmleB(m.Q.starB = boot_eps[i], boot_idx = boot_idx))
  }
  var_tmleB_boot <- var(boot_tmle_B)
  return(var_tmleB_boot)
}

# --------------------------------------------------------------------------------------------------------
# Parametric bootstrap: sampling W as iid, boot.nodes from m.h.fit or special fit obj and Y from m.Q.init.N;
# Parametric bootstrap: need to explicitly spec. fitted nodes which are cond independent and then resample from those fits.
# --------------------------------------------------------------------------------------------------------
# Could be very different from Anodes or part of Anodes. If no reg form was specified, finds it among hforms by default;
# However, Y's are always re-sampled from m.Q.init
# Allows boot.nodes to be different from Anodes with their own regression formulas
# These regressions are specified with "boot.form" arg: specifies regression forms for conditionally independent nodes to be resampled from such fits
par_bootstrap_tmle <- function(n.boot, boot.nodes, boot.form, estnames, DatNet.ObsP0, tmle_g1_out, tmle_g2_out) {
  # ******** REPLACED this with the actual model fit g.N *********
  f.g0 <- NULL

  boot_eps_g1 <- vector(mode = "numeric", length = n.boot)
  boot_eps_g2 <- vector(mode = "numeric", length = n.boot)

  boot_gcomp_g1 <- vector(mode = "numeric", length = n.boot)
  boot_gcomp_g2 <- vector(mode = "numeric", length = n.boot)
  boot_gcomp_ATE <- vector(mode = "numeric", length = n.boot)

  boot_tmle_B_g1 <- vector(mode = "numeric", length = n.boot)
  boot_tmle_B_g2 <- vector(mode = "numeric", length = n.boot)
  boot_tmle_B_ATE <- vector(mode = "numeric", length = n.boot)

  boot_IC_tmle <- vector(mode = "numeric", length = n.boot)

  psi.evaluator <- tmle_g1_out$psi.evaluator

  # Always start with the observed Anodes
  if (!DatNet.ObsP0$Odata$curr_data_A_g0) {
    DatNet.ObsP0$Odata$restoreAnodes()
    DatNet.ObsP0$Odata$curr_data_A_g0 <- TRUE
  }

  # -----------------------------------------------------------------------------------------------
  # Save the original input data.table OdataDT, otherwise it will be over-written:
  # Note we are not saving the original saved Anodes and sA -> these fields NULLed during bootstrap
  # -----------------------------------------------------------------------------------------------
  OdataDT.P0 <- DatNet.ObsP0$Odata$OdataDT
  noNA.Ynodevals.P0 <- DatNet.ObsP0$noNA.Ynodevals
  det.Y.P0 <- DatNet.ObsP0$det.Y
  DatNet.gstar <- tmle_g1_out$DatNet.gstar

  # -----------------------------------------------------------------------------------------------
  # fit any regressions specified in boot.form:
  # -----------------------------------------------------------------------------------------------
  if (!is.null(boot.form)) {
    boot.form_fit <- fit_reg.forms(boot.form, DatNet.ObsP0, tmle_g1_out$sW, tmle_g1_out$sA)
    summeas_boot <- boot.form_fit$summeas_boot
    boot_outvar_nms <- unlist(boot.form_fit$boot_outvar_nms)
  } else {
    boot_outvar_nms <- NULL
  }

  # -----------------------------------------------------------------------------------------------
  # loop over n.boot
  # -----------------------------------------------------------------------------------------------
  for (i in (1:n.boot)) {
    # 1. Resample W (with replacement) by re-purposing the instance of DatNet.gstar; Re-evaluate baseline summaries sW; Re-shuffle pre-saved values of Y and det.Y;
    boot_idx <- sample.int(n = DatNet.ObsP0$nobs, replace = TRUE)
    DatNet.ObsP0$Odata$OdataDT <- OdataDT.P0[boot_idx, ]

    # Need to NULL previously backed-up values of A and sA, since they no longer correspond with bootstrapped sample
    DatNet.ObsP0$Odata$A_g0_DT <- NULL
    DatNet.ObsP0$Odata$sA_g0_DT <- NULL
    DatNet.ObsP0$Odata$save_sA_Vars <- NULL

    DatNet.ObsP0$datnetW$make.sVar(sVar.object = tmle_g1_out$sW)
    DatNet.ObsP0$datnetW$fixmiss_sVar() # permanently replace NA values in sW with 0
    DatNet.ObsP0$det.Y <- det.Y.P0[boot_idx]
    DatNet.ObsP0$noNA.Ynodevals <- noNA.Ynodevals.P0[boot_idx]

    # 2. Generate new A's from g0 or g.N (replace A with sampled A's in DatNet.ObsP0); Re-evaluate exposure summaries (sA) based on new DatNet.ObsP0:
    if (is.null(f.g0)) {
      for (Anode in boot.nodes) {
        # If Anode was referenced in boot.form, then sample Anode from this new fit, otherwise sample from hforms fit
        if (Anode %in% boot_outvar_nms) {
          model.sVar.gN <- findRegSummaryObj(summeas_boot, outvar = Anode)
        } else {
          # NOTE: new RegressionClass implementation will allow pulling top level outvar names directly
          # (regardless of how the regressions were specified ("A1+A2~W" or c("A1~W","A2~W")))
          # will allow checking directly if model for Anode exists: (Anode %in% unlist(tmle_g1_out$m.h.fit$summeas.g0$reg$outvar))
          # current implementation doesn't allow that, so need to check the model is not NULL
          model.sVar.gN <- findRegSummaryObj(tmle_g1_out$m.h.fit$summeas.g0, outvar = Anode)
        }
        if (is.list(model.sVar.gN)) model.sVar.gN <- model.sVar.gN[[1]]
        if (is.null(model.sVar.gN)) stop("Model fit for parametric bootstrap variable '" %+% Anode %+% "' could not be located. " %+%
          "Make sure to specify the regression formula for every variable in 'boot.nodes', either as part of regression formulas in 'hform.g0'/'hform.gstar'" %+%
          "or 'boot.form'")
        A.sample.gN <- model.sVar.gN$sampleA(newdata = DatNet.ObsP0)
        DatNet.ObsP0$Odata$replaceOneAnode(AnodeName = Anode, newAnodeVal = A.sample.gN)
      }
    } else if (!is.null(f.g0)) {
      DatNet.ObsP0$make.dat.sWsA(p = 1, f.g_fun = f.g0, sA.object = tmle_g1_out$sA, DatNet.ObsP0 = DatNet.ObsP0)
    }

    DatNet.ObsP0$datnetA$make.sVar(sVar.object = tmle_g1_out$sA) # re-evaluate exposure summaries based on bootstraped W and re-sampled A's
    DatNet.ObsP0$Odata$curr_data_A_g0 <- TRUE

    # 4. Predict P(Y_i=1|sW,sA) using m.Q.init (the initial fit \bar{Q}_N) based on newly resampled (sW,sA):
    detY.boot <- DatNet.ObsP0$det.Y
    QY.init.boot <- DatNet.ObsP0$noNA.Ynodevals
    QY.init.boot[!detY.boot] <- tmle_g1_out$m.Q.init$predict(newdata = DatNet.ObsP0)$getprobA1[!detY.boot] # getting predictions P(Y=1) for non-DET Y
    off.boot <- qlogis(QY.init.boot)  # offset

    # 5. Sample a vector of new (Y_i, i=1,...,N):
    # **** TO BE REPLACED WITH SOMETHING LIKE B psi.evaluator$sampleY(Qprob = QY.init) ****
    Y.boot <- rbinom(n = length(QY.init.boot), size = 1, prob = QY.init.boot)

    # 6. Predict new weights h_wts = P_{\bar{g}^*_N}(sA | sW)/P_{\bar{g}_0}(sA | sW) :
    # using previously fitted m.h.fit$summeas.g0 and m.h.fit$summeas.gstar and the newly resampled (sW,sA) under Q.W.N, m.g.N
    h_wts_g1.boot <- predict.hbars(newdatnet = DatNet.ObsP0, m.h.fit = tmle_g1_out$m.h.fit)

    # 7. Fit a TMLE update epsilon on this new bootstapped dataset.
    boot.tmle_g1.obj <- tmle.update(estnames = estnames,
                                    Y = Y.boot, off = off.boot, h_wts = h_wts_g1.boot,
                                    determ.Q = detY.boot, predictQ = FALSE)
    boot_eps_g1[i] <- boot.tmle_g1.obj$m.Q.star.coef

    # If tmle_g2_out is present, do the same, before DatNet.ObsP0 is overwritten
    if (!is.null(tmle_g2_out)) {
      h_wts_g2.boot <- predict.hbars(newdatnet = DatNet.ObsP0, m.h.fit = tmle_g2_out$m.h.fit)
      boot.tmle_g2.obj <- tmle.update(estnames = estnames,
                                      Y = Y.boot, off = off.boot, h_wts = h_wts_g2.boot,
                                      determ.Q = detY.boot, predictQ = FALSE)
      boot_eps_g2[i] <- boot.tmle_g2.obj$m.Q.star.coef
    }

    # 8. Re-create DatNet.gstar with boostrapped summaires sW and sA generated under f.gstar:
    # However, first need to save (Anodes,sA) from the observed bootstrapped sample, otherwise they are forever lost
    DatNet.ObsP0$Odata$backupAnodes(sA = tmle_g1_out$sA)
    # Will over-write Anodes/sA in DatNet.ObsP0$Odata:
    DatNet.gstar$make.dat.sWsA(p = 1, new.sA.object = tmle_g1_out$new.sA, sA.object = tmle_g1_out$sA, DatNet.ObsP0 = DatNet.ObsP0)

    # 9. Evaluate the substitution estimator and the components of the EIC D_Y and D_W:
    fWi.boot_g1 <- psi.evaluator$get.gcomp(m.Q.init = tmle_g1_out$m.Q.init)
    boot_gcomp_g1[i] <- mean(fWi.boot_g1)
    boot_tmle_B_g1[i] <- mean(psi.evaluator$get.tmleB(m.Q.starB = boot.tmle_g1.obj$m.Q.star.coef))

    obsYvals.boot = DatNet.ObsP0$noNA.Ynodevals
    boot_IC_tmle[i] <- mean(h_wts_g1.boot * (obsYvals.boot - QY.init.boot) + (fWi.boot_g1 - boot_tmle_B_g1[i]))

    # 10. If (!is.null(tmle_g2_out)) then the above steps 8 & 9 are repeated for tmle_g2_out and the ATE.
    # First restore (Anodes,sA) that were generated in the observed bootstrapped sample!
    # In this case the intervention summaries, s.a, "def_new_sA(A = A)" will correctly use the observed bootstrapped values of A
    # -----------------------------------------------------------------------------------------------------------------------------
    # Q: How do we get the bootsrap var for the ATE?
      # 1) We can generate a new TMLE update that directly corresponds to the EIC for ATE (need to figure out EIC)
      # 2) We could also evaluate the ATE as a plug-in estimator from two separate TMLE updates - using this approach.
    # Both approaches should be equivalent, except for situations with 2 "incompatible" target parameters
    # -----------------------------------------------------------------------------------------------------------------------------
    if (!is.null(tmle_g2_out)) {
      # * restore the observed boostrapped Anodes and sA
      DatNet.ObsP0$Odata$restoreAnodes()
      # * verify sA's were also restored and if not, regenerate them
      if (!DatNet.ObsP0$Odata$restored_sA_Vars) DatNet.ObsP0$datnetA$make.sVar(sVar.object = tmle_g2_out$sA)
      DatNet.gstar$make.dat.sWsA(p = 1, new.sA.object = tmle_g2_out$new.sA, sA.object = tmle_g2_out$sA, DatNet.ObsP0 = DatNet.ObsP0)

      fWi.boot_g2 <- psi.evaluator$get.gcomp(m.Q.init = tmle_g2_out$m.Q.init)
      boot_gcomp_g2[i] <- mean(fWi.boot_g2)
      boot_gcomp_ATE[i] <- boot_gcomp_g1[i] - boot_gcomp_g2[i]

      boot_tmle_B_g2[i] <- mean(psi.evaluator$get.tmleB(m.Q.starB = boot.tmle_g2.obj$m.Q.star.coef))
      boot_tmle_B_ATE[i] <- boot_tmle_B_g1[i] - boot_tmle_B_g2[i]
    }
  }

  var_gcomp_boot_g1 <- var(boot_gcomp_g1)
  var_gcomp_boot_g2 <- var(boot_gcomp_g2)
  var_gcomp_boot_ATE <- var(boot_gcomp_ATE)

  var_tmleB_boot_g1 <- var(boot_tmle_B_g1)
  var_tmleB_boot_g2 <- var(boot_tmle_B_g2)
  var_tmleB_boot_ATE <- var(boot_tmle_B_ATE)

  if (!is.null(tmle_g2_out)) {
    gcomp_g1_boot_col <- rbind(tmle_g1_out$ests_mat["MLE",],                                  mean(boot_gcomp_g1), var(boot_gcomp_g1))
    gcomp_g2_boot_col <- rbind(tmle_g2_out$ests_mat["MLE",],                                  mean(boot_gcomp_g2), var(boot_gcomp_g2))
    gcomp_ATE_boot_col <- rbind(tmle_g1_out$ests_mat["MLE",]-tmle_g2_out$ests_mat["MLE",],    mean(boot_gcomp_ATE), var(boot_gcomp_ATE))
    tmle_B_g1_boot_col <- rbind(tmle_g1_out$ests_mat["TMLE",],                                mean(boot_tmle_B_g1), var(boot_tmle_B_g1))
    tmle_B_g2_boot_col <- rbind(tmle_g2_out$ests_mat["TMLE",],                                mean(boot_tmle_B_g2), var(boot_tmle_B_g2))
    tmle_B_ATE_boot_col <- rbind(tmle_g1_out$ests_mat["TMLE",]-tmle_g2_out$ests_mat["TMLE",], mean(boot_tmle_B_ATE), var(boot_tmle_B_ATE))

    res_mat <- cbind(gcomp_g1 = gcomp_g1_boot_col, gcomp_g2 = gcomp_g2_boot_col, gcomp_ATE = gcomp_ATE_boot_col,
                     tmle_B_g1 = tmle_B_g1_boot_col, tmle_B_g2 = tmle_B_g2_boot_col, tmle_B_ATE = tmle_B_ATE_boot_col)

    rownames(res_mat) <- c("Mean.Est", "Boot.Mean.Est", "Boot.Var")
    colnames(res_mat) <- c("gcomp_g1","gcomp_g2","gcomp_ATE","tmle_B_g1","tmle_B_g2","tmle_B_ATE")
    print("Parametric Bootstrap Inference: "); print(res_mat)

    est_mat <- cbind(tmle_g1_out$ests_mat, tmle_g2_out$ests_mat, tmle_g1_out$ests_mat - tmle_g2_out$ests_mat)
    colnames(est_mat) <- c("g1", "g2", "ATE")
    print("est_mat"); print(est_mat)
  }

  # Restore the original data.table OdataDT, Y values and indicator of deterministic Y values
  # Otherwise it messes up the IC-based inference, since it uses the observed Yvals to est. the IC
  DatNet.ObsP0$Odata$OdataDT <- OdataDT.P0
  DatNet.ObsP0$noNA.Ynodevals <- noNA.Ynodevals.P0
  DatNet.ObsP0$det.Y <- det.Y.P0
  # these values are based on bootstrapped and or resampled W, so better to erase them:
  DatNet.ObsP0$Odata$curr_data_A_g0 <- TRUE
  DatNet.ObsP0$Odata$A_g0_DT <- NULL
  DatNet.ObsP0$Odata$sA_g0_DT <- NULL
  DatNet.ObsP0$Odata$save_sA_Vars <- NULL

  out_var_tmleB_boot <- list(EY_gstar1 = var_tmleB_boot_g1, EY_gstar2 = var_tmleB_boot_g2, ATE = var_tmleB_boot_ATE)
  return(out_var_tmleB_boot)
}

# create output object with param ests of EY_gstar, vars and CIs for given gstar (or ATE if two tmle obj are passed)
# boot.var, n.boot,
make_EYg_obj <- function(estnames, estoutnames, alpha, DatNet.ObsP0, tmle_g_out, tmle_g2_out=NULL, MC.tmle.eval = NULL, var_tmleB_boot) {
  nobs <- DatNet.ObsP0$nobs
  NetInd_k <- DatNet.ObsP0$netind_cl$NetInd_k
  nF <- DatNet.ObsP0$netind_cl$nF
  ests_mat <- tmle_g_out$ests_mat
  QY_mat <- tmle_g_out$QY_mat
  fWi_mat <- tmle_g_out$fWi_mat
  wts_mat <- tmle_g_out$wts_mat

  boot.as.var_tmleB <- var_tmleB_boot * nobs # parametric bootstrap-based asymptotic variance (est. outside this function)

  if (!is.null(tmle_g2_out)) {
    ests_mat <- tmle_g_out$ests_mat - tmle_g2_out$ests_mat
    fWi_mat <- tmle_g_out$fWi_mat - tmle_g2_out$fWi_mat
    wts_mat <- tmle_g_out$wts_mat - tmle_g2_out$wts_mat
  }

  # ------------------------------------------------------------------------------------------
  # get the IC-based asymptotic variance estimates:
  # ------------------------------------------------------------------------------------------
  # getVar_time <- system.time(
  as.vars_obj <- est_sigmas(estnames = estnames, n = nobs,
                            NetInd_k = NetInd_k, nF = nF,
                            obsYvals = DatNet.ObsP0$noNA.Ynodevals,
                            MC.tmle.eval = MC.tmle.eval,
                            ests_mat = ests_mat,
                            QY_mat = QY_mat,
                            wts_mat = wts_mat, fWi_mat = fWi_mat)
  # )
  # if (gvars$verbose) {
  #   print("time to estimate Vars: "); print(getVar_time)
  # }
  # ------------------------------------------------------------------------------------------
  # parametric bootstrap-based asymptotic variance estimates matrix:
  # ------------------------------------------------------------------------------------------
  boot.as.var_mat <- matrix(nrow = nrow(as.vars_obj$IC.dep.vars), ncol = 1)
  boot.as.var_mat[1,1] <- boot.as.var_tmleB
  rownames(boot.as.var_mat) <- rownames(as.vars_obj$IC.dep.vars)
  colnames(boot.as.var_mat) <- colnames(as.vars_obj$IC.dep.vars)

  get_CI <- function(xrow, n) {
    f_est_CI <- function(n, psi, sigma2_N) { # get CI
      z_alpha <- qnorm(1-alpha/2)
      CI_est <- c(psi - z_alpha*sqrt(sigma2_N) / sqrt(n), psi + z_alpha*sqrt(sigma2_N) / sqrt(n))
      return(CI_est)
    }
    psi <- xrow["estimate"];
    sigma2_N <- xrow[2];
    return(f_est_CI(n = n, psi = psi, sigma2_N = sigma2_N))
  }

  # CIs based on bootstrapped variance:
  boot.CIs_mat <- t(apply(cbind(ests_mat, boot.as.var_mat), 1, get_CI, n = nobs))
  colnames(boot.CIs_mat) <- c("LBCI_"%+%as.character(alpha/2), "UBCI_"%+%as.character(1-alpha/2))

  CIs_mat <- t(apply(cbind(ests_mat, as.vars_obj$IC.dep.vars), 1, get_CI, n = nobs))
  colnames(CIs_mat) <- c("LBCI_"%+%as.character(alpha/2), "UBCI_"%+%as.character(1-alpha/2))

  # CIs based on conditional on W variance (allowing dep Q):
  condW.CIs_mat <- t(apply(cbind(ests_mat, as.vars_obj$condW.vars), 1, get_CI, n = nobs))
  colnames(condW.CIs_mat) <- c("LBCI_"%+%as.character(alpha/2), "UBCI_"%+%as.character(1-alpha/2))

  # CIs based on conditional on W variance (assuming indep Q):
  condW.indepQ.CIs_mat <- t(apply(cbind(ests_mat, as.vars_obj$condW.indepQ.vars), 1, get_CI, n = nobs))
  colnames(condW.indepQ.CIs_mat) <- c("LBCI_"%+%as.character(alpha/2), "UBCI_"%+%as.character(1-alpha/2))

  # CIs based on IID variance:
  iid.CIs <- t(apply(cbind(ests_mat, as.vars_obj$iid.vars), 1, get_CI, n = nobs))
  colnames(iid.CIs) <- c("LBCI_"%+%as.character(alpha/2), "UBCI_"%+%as.character(1-alpha/2))

  # ------------------------------------------------------------------------------------------
  # Rename estimators for the final output:
  # ------------------------------------------------------------------------------------------
  rownames(ests_mat) <- estoutnames
  rownames(as.vars_obj$IC.dep.vars) <- rownames(boot.as.var_mat) <- rownames(as.vars_obj$condW.vars) <- rownames(as.vars_obj$condW.indepQ.vars) <- rownames(as.vars_obj$iid.vars) <- estoutnames
  rownames(CIs_mat) <- rownames(boot.CIs_mat) <- rownames(condW.CIs_mat) <- rownames(condW.indepQ.CIs_mat)<- rownames(iid.CIs) <- estoutnames

  EY_g.star <- list(estimates = ests_mat,

                    boot.vars = (boot.as.var_mat / nobs), # parametric bootstrap variance
                    IC.vars = (as.vars_obj$IC.dep.vars / nobs), # IC-based variance (dependent)
                    condW.IC.vars = (as.vars_obj$condW.vars / nobs), # IC-based variance conditional on W
                    condW.indepQ.IC.vars = (as.vars_obj$condW.indepQ.vars / nobs), # IC-based variance conditional on W
                    iid.vars = (as.vars_obj$iid.vars / nobs), # iid Variance
                    aux.vars = (as.vars_obj$aux.vars / nobs), # auxilary (additional variance estimators)

                    boot.CIs = boot.CIs_mat,
                    IC.CIs = CIs_mat,
                    condW.CIs = condW.CIs_mat,
                    condW.indepQ.CIs = condW.indepQ.CIs_mat,
                    iid.CIs = iid.CIs,

                    h_g0_SummariesModel = NULL,
                    h_gstar_SummariesModel = NULL)

  if (is.null(tmle_g2_out)) {
    EY_g.star[["h_g0_SummariesModel"]] <- tmle_g_out$h_g0_SummariesModel
    EY_g.star[["h_gstar_SummariesModel"]] <- tmle_g_out$h_gstar_SummariesModel
  }

  return(EY_g.star)
}
osofr/tmlenet documentation built on May 24, 2019, 4:58 p.m.