R/genU.R
In causal.decomp: Causal Decomposition Analysis

# Generation of U based on the stochastic EM algorithm
genU <- function(k.y, k.m, Iternum, benchmark, data, covariates, group, intermediates, risk.factor, outcome, nsim, num_cores,
                 mods_formula = NULL, moderators = NULL) {

  if (is.null(mods_formula)) {
    if (is.null(moderators) || length(moderators) == 0) {
      stop("genU(): Provide `mods_formula` or a non-empty `moderators` vector.")
    }
    mods_formula <- as.formula(paste0(" ~ ", paste(moderators, collapse = " + ")))
  }
  
  ## Generate b.m and b.y

  #print("k.y:"); print(k.y)
  #print("k.m:"); print(k.m)
  #print("benchmark:"); print(benchmark)
  #print("names(data):"); print(names(data))
  #print("covariates:"); print(covariates)
  #print("group:"); print(group)
  #print("intermediates:"); print(intermediates)
  #print("risk.factor:"); print(risk.factor)
  #print("outcome:"); print(outcome)
  #print("formula:"); print(paste0(outcome, " ~ ", paste(c(group, intermediates, covariates), collapse = " + ")))
  #print("nsim:"); print(nsim)
  #print("num_cores:"); print(num_cores)

  fit.y.rxc = lm(as.formula(paste0(outcome, " ~ ", paste(c(group, intermediates, covariates), collapse = " + "))), data = data)
  # Y ~ R + X + M + C
  fit.y.rxmc = lm(paste0(outcome, " ~ ", paste(c(group, intermediates, risk.factor, covariates), collapse = " + ")), data = data)
  # M ~ R + X + C
  fit.m.rxc = glm(as.formula(paste0(risk.factor, " ~ ", paste(c(group, intermediates, covariates), collapse = " + "))), family = binomial(logit), data = data)
  b.m.xj = coef(fit.m.rxc)[benchmark]
  b.y.xj = coef(fit.y.rxc)[benchmark]
  b.y.m = coef(fit.y.rxmc)[paste0(risk.factor, levels(data[[risk.factor]])[2])]
  sigma.m = sd(residuals(fit.m.rxc))
  #intermediates_adj <- setdiff(intermediates, benchmark)  # NOTE: remove benchmark from the left side of the model, otherwise R will give warnings
  sigma.xj = sd(residuals(lm(paste0(benchmark, " ~ ", paste(setdiff(c(group, intermediates, covariates), benchmark), collapse = " + ")), data = data)))

  DATA.xj1 = DATA.xj0 = data
  DATA.xj1[, benchmark] = 1
  DATA.xj0[, benchmark] = 0

  b.m = log(k.m) + b.m.xj
  coef.m = c(coef(fit.m.rxc), U = b.m)
  u_fake = rnorm(length(data[[outcome]]), mean = 0, sd = sigma.xj)
  pre.m1 = 1 / (1 + exp(- (cbind(model.matrix(fit.m.rxc), U = u_fake + 1) %*% coef.m)))
  pre.m0 = 1 / (1 + exp(- (cbind(model.matrix(fit.m.rxc), U = u_fake) %*% coef.m)))
  diff.pi = mean(pre.m1 - pre.m0)
  rsq.u.m = (log(k.m) + b.m.xj)/sqrt((log(k.m) + b.m.xj)^2 + pi^2/(3 * sigma.m^2))
  b.y = (k.y * b.y.xj - b.y.m * diff.pi) *
    sigma.m / (sigma.m - abs(rsq.u.m/sqrt(1 - rsq.u.m)) * sigma.xj * diff.pi)

  ## Generate continuous U from its prior distribution first
  U = rnorm(nrow(data), 0, 0.5) #sigma.u
  #sigma.urxc: sens para

  coef.y.updated = NULL
  coef.m.updated = NULL

  ## EM steps
  for(iter in 1:Iternum) {
    #cat(iter, "\r")

    ## scale.y = "continuous"
    # interaction term: risk.factor : (X1 + X2 + ...)
    mods_vars <- attr(terms(mods_formula), "term.labels")
    mods_rhs  <- if (length(mods_vars)) paste(paste0(risk.factor, ":", mods_vars), collapse = " + ") else ""
    #l.y = lm(as.formula(paste0(outcome, " ~ ", paste(c(covariates,group, intermediates, risk.factor, paste0(risk.factor, ":", moderators)), collapse = " + "))),
    #         offset = b.y * U, data = data)
    l.y <- lm(
      as.formula(
        paste0(
          outcome, " ~ ",
          paste(c(covariates, group, intermediates, risk.factor), collapse = " + "),
          if (nzchar(mods_rhs)) paste0(" + ", mods_rhs) else ""
        )
      ),
      offset = b.y * U, data = data
    )
    coef.y = c(l.y$coef, U = b.y)
    sd.y = sigma(l.y) # This is the sd of Y conditional on t, m, C, X, and U.
    df.y = summary(l.y)$df[2]

    ## scale.m = "binary"
    l.m = glm(as.formula(paste0(risk.factor, " ~ ", paste(c(group, intermediates, covariates), collapse = " + "))),
              offset = b.m * U, data = data, family = binomial(link = "logit"))
    coef.m = c(l.m$coef, U = b.m)

    ## For continuous U
    # Calculate the integral in the denominator
    integral_res <- mclapply(1:nrow(data), FUN = function(i) {
      integrand = function(u){
        risk.factor_numeric <- as.numeric(data[i, risk.factor]) - 1
        dnorm(data[i, outcome], mean = c(model.matrix(l.y)[i, ] %*% l.y$coef) + u * b.y, sd = sd.y) *
          (1/(1 + exp(-(c(model.matrix(l.m)[i, ] %*% l.m$coef) + u * b.m))))^risk.factor_numeric *
          (1 - 1/(1 + exp(-(c(model.matrix(l.m)[i, ] %*% l.m$coef) + u * b.m))))^(1 - risk.factor_numeric) *
          dnorm(u, mean = 0, sd = 1)
      }
      res = integrate(integrand, lower = -10, upper = 10)$value
    }, mc.cores = num_cores)
    integral = unlist(integral_res)

    # Generate random values of U
    U_res <- mclapply(1:nrow(data), FUN = function(i) {
      # = NULL
      #U.final.nsim = matrix(NA, nrow(data), nsim)
      # Obtain the condition probability of U
      conditional.u = function(u){
        risk.factor_numeric <- as.numeric(data[i, risk.factor]) - 1
        dnorm(data[i, outcome], mean = c(model.matrix(l.y)[i, ] %*% l.y$coef) + u * b.y, sd = sd.y) *
          (1/(1 + exp(-(c(model.matrix(l.m)[i, ] %*% l.m$coef) + u * b.m))))^risk.factor_numeric *
          (1 - 1/(1 + exp(-(c(model.matrix(l.m)[i, ] %*% l.m$coef) + u * b.m))))^(1 - risk.factor_numeric) *
          dnorm(u, mean = 0, sd = 1)/integral[i] #sigma.u
      }
      dist = AbscontDistribution(d = conditional.u)  # signature for a dist with pdf ~ conditional.u
      rdist = r(dist)# function to create random variates from conditional.u
      if(iter < Iternum){
        U = rdist(1)
        U.final.nsim = NA
      } else if (iter == Iternum){
        U = rdist(1)
        U.final.nsim = rdist(nsim)
      }
      return(list(U = U, U.final.nsim = U.final.nsim))
    }, mc.cores = num_cores)

    U = sapply(U_res, "[[", "U")
    U.final.nsim = sapply(U_res, "[[", "U.final.nsim")
    data$U = U #apply(U_res, 1, mean)

  }
  coef.y.updated = rbind(coef.y.updated, coef.y)
  coef.m.updated = rbind(coef.m.updated, coef.m)

  #print("genU_inside function:"); print(list(U = U, b.m= b.m, b.y=b.y,
  #                           coef.y.updated = coef.y.updated,
  #                           coef.m.updated = coef.m.updated,
  #                           U.final.nsim = U.final.nsim))

  return(list(U = U, b.m= b.m, b.y=b.y,
              coef.y.updated = coef.y.updated,
              coef.m.updated = coef.m.updated,
              U.final.nsim = U.final.nsim))
}