# R/BinaryOutModelClass.R

logit_linkinv <- function(x) { plogis(x) }
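
# A minimal sanity check (illustrative only): the inverse-logit maps a linear
# predictor eta back to a probability in (0, 1).
if (FALSE) {
  stopifnot(all.equal(logit_linkinv(0), 0.5))
  stopifnot(all.equal(logit_linkinv(log(3)), 0.75))  # odds of 3 <=> probability 0.75
}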

#----------------------------------------------------------------------------------
# Classes for predicting based on fitted regression models with binary outcome Bin ~ Xmat
#----------------------------------------------------------------------------------
# predict_single_reg <- function(self) UseMethod("predict_single_reg")
predict_single_reg <- function(self) {
  fitfunname <- self$getfit$fitfunname
  if (fitfunname %in% c("speedglm", "glm")) { return(predict_single_reg.glm(self)) }
  if (fitfunname == "h2o.ensemble") { return(predict_single_reg.h2o(self)) }
  if (fitfunname == "sl3_pipelines") { return(predict_single_reg.sl3(self)) }
  if (fitfunname == "SuperLearner") { return(predict_single_reg.SL(self)) }
}

predict_single_reg.long <- function(self) {
  fitfunname <- self$getfit$fitfunname
  if (fitfunname %in% c("speedglm", "glm")) { return(predict_single_reg.glm.long(self)) }
  if (fitfunname == "h2o.ensemble") { return(predict_single_reg.h2o(self)) }
  if (fitfunname == "SuperLearner") { return(predict_single_reg.SL(self)) }
}

# Generic prediction fun for logistic regression coefs, predicts P(A = 1 | newXmat)
predict_single_reg.glm <- function(self) {  # Does not handle cases with deterministic Anodes in the original data.
  model.fit <- self$getfit
  Xmat <- self$getXmat  
  assert_that(!is.null(Xmat)); assert_that(!is.null(self$subset_idx))
  pAout <- rep.int(gvars$misval, self$n)  # Set to default missing value for A[i] degenerate/deterministic/misval
  if (sum(self$subset_idx) > 0) {
    eta <- Xmat[,!is.na(model.fit$coef), drop = FALSE] %*% model.fit$coef[!is.na(model.fit$coef)]
    pAout[self$subset_idx] <- match.fun(FUN = model.fit$linkfun)(eta)
  }
  return(pAout)
}
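
# Illustrative sketch (hypothetical data) of how the coefficient-based
# prediction above works: NA coefficients (e.g., from a rank-deficient fit)
# are dropped before forming the linear predictor, then the inverse link is applied.
if (FALSE) {
  coefs <- c(Intercept = -0.5, W1 = 1.2, W2 = NA)
  Xmat  <- cbind(Intercept = 1, W1 = rnorm(5), W2 = rnorm(5))
  eta   <- Xmat[, !is.na(coefs), drop = FALSE] %*% coefs[!is.na(coefs)]
  pA1   <- logit_linkinv(eta)  # predicted P(A = 1 | Xmat), all values in (0, 1)
}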

predict_single_reg.glm.long <- function(self) {
  ID <- NULL  # To fix problem of no visible binding for global variable "ID"
  model.fit <- self$getfit
  Xmat <- self$getXmat  
  Y_vals <- self$getY
  assert_that(!is.null(Xmat)); assert_that(nrow(Xmat)==length(Y_vals)); assert_that(!is.null(self$subset_idx))
  pAout <- rep.int(gvars$misval, self$n)
  if (sum(self$subset_idx) > 0) {
    # -----------------------------------------------------------------
    # OBTAINING PREDICTIONS FOR LONG FORMAT P(Ind_j = 1 | Bin_j, W) BASED ON EXISTING POOLED FIT:
    # -----------------------------------------------------------------
    eta <- Xmat[,!is.na(model.fit$coef), drop = FALSE] %*% model.fit$coef[!is.na(model.fit$coef)]
    probA1 <- match.fun(FUN = model.fit$linkfun)(eta)
    # -----------------------------------------------------------------
    # GETTING ID-BASED PREDICTIONS (n) as cumprod of P(Ind_j = 1 | Bin_j, W) for j = 1, ..., K
    # -----------------------------------------------------------------    
    ProbAeqa_long <- as.vector(probA1^(Y_vals) * (1L - probA1)^(1L - Y_vals))    
    res_DT <- data.table(ID = self$ID, ProbAeqa_long = ProbAeqa_long)
    res_DT <- res_DT[, list(cumprob = cumprod(ProbAeqa_long)), by = ID]
    data.table::setkeyv(res_DT, c("ID")) # sort by ID
    res_DT_short <- res_DT[unique(res_DT[, key(res_DT), with = FALSE]), mult = 'last']
    ProbAeqa <- res_DT_short[["cumprob"]]
    pAout[self$subset_idx] <- ProbAeqa
  }
  return(pAout)
}
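
# Illustrative sketch (hypothetical values) of the cumprod-by-ID step above:
# per-subject likelihoods are the cumulative product of per-bin likelihoods,
# keeping the last row of each ID.
if (FALSE) {
  library(data.table)
  res_DT <- data.table(ID = c(1L, 1L, 2L, 2L), ProbAeqa_long = c(0.9, 0.8, 0.7, 0.5))
  res_DT <- res_DT[, list(cumprob = cumprod(ProbAeqa_long)), by = ID]
  data.table::setkeyv(res_DT, "ID")
  res_DT[unique(res_DT[, key(res_DT), with = FALSE]), mult = "last"]
  # => cumprob = 0.72 for ID 1 and 0.35 for ID 2
}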

predict_single_reg.h2o <- function(self) {
  model.fit <- self$getfit$model.fit
  Xmat <- self$getXmat  
  assert_that(!is.null(Xmat)); assert_that(!is.null(self$subset_idx))
  pAout <- rep.int(gvars$misval, self$n)
  if (inherits(model.fit, "h2o.ensemble")) {
    if (sum(self$subset_idx) > 0) {
      test <- h2o::as.h2o(Xmat)
      predictions <- predict(model.fit, newdata = test)
      if (model.fit$family == "gaussian") { 
        pAout[self$subset_idx] <-  as.vector(predictions$pred)
      } else if (model.fit$family == "binomial") {
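        # h2o binomial predictions typically have columns (predict, p0, p1); column 3 is P(Y = 1)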
        pAout[self$subset_idx] <-  as.vector(predictions$pred[, 3])
      }
    }
  }
  return(pAout)
}

predict_single_reg.SL <- function(self) {
  model.fit <- self$getfit$model.fit
  Xmat <- self$getXmat  
  assert_that(!is.null(Xmat)); assert_that(!is.null(self$subset_idx))
  pAout <- rep.int(gvars$misval, self$n)
  if (inherits(model.fit, "SuperLearner")) {
    if (sum(self$subset_idx) > 0) {
      test <- data.frame(Xmat)
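      # onlySL = TRUE: only compute predictions for library algorithms with non-zero SL weights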
      predictions <- predict(model.fit, newdata = test, onlySL = TRUE)
      pAout[self$subset_idx] <-  as.vector(predictions$pred)
    }
  }
  return(pAout)
}

predict_single_reg.sl3 <- function(self) {
  model.fit <- self$getfit$model.fit
  # If you want to use new Xmat, you have to set savespace = FALSE, otherwise Xmat will be wiped
  # Xmat <- self$getXmat  
  # data <- as.data.frame(Xmat)
  # task <- sl3::sl3_Task$new(data, covariates = Wnodes, outcome = NULL)
  assert_that(!is.null(self$subset_idx))
  pAout <- rep.int(gvars$misval, self$n)
  if (inherits(model.fit, "Lrnr_sl")) {
    if (sum(self$subset_idx) > 0) {
      # predictions <- model.fit$predict(task)
      predictions <- model.fit$predict()  # Use the train dataset
      pAout[self$subset_idx] <-  predictions
    }
  }
  return(pAout)
}

#----------------------------------------------------------------------------------
# Classes for fitting regression models with binary outcome Bin ~ Xmat
#----------------------------------------------------------------------------------
# fit_single_reg <- function(self) UseMethod("fit_single_reg")
fit_single_reg <- function(self) {
  estimator <- self$estimator
  if (estimator == "glm__glm") { return(fit_single_reg.glmS3(self)) }
  if (estimator == "speedglm__glm") { return(fit_single_reg.speedglmS3(self)) }
  if (estimator == "h2o__ensemble") { return(fit_single_reg.h2oS3(self)) } 
  if (estimator == "sl3_pipelines") { return(fit_single_reg.sl3S3(self)) }
  if (estimator == "SuperLearner") { return(fit_single_reg.SLS3(self)) }
}
      
# S3 method for glm binomial family fit, takes BinaryOutModel object:
fit_single_reg.glmS3 <- function(self) {
  Xmat <- self$getXmat
  Y_vals <- self$getY
  wt_vals <- self$getWeight
  if (gvars$verbose) {
    print("calling glm.fit...")
    print("number of observations: " %+% nrow(Xmat))
  }
  
  if (nrow(Xmat) == 0L) {  # Xmat has 0 rows: return NA's and avoid throwing exception:
    model.fit <- list(coef = rep.int(NA_real_, ncol(Xmat)))
  } else {
    ctrl <- glm.control(trace=FALSE, maxit=1000)
    # scale weights because there were rare problems where large weights caused convergence problems
    model.fit <- stats::glm.fit(x = Xmat, y = Y_vals, weights = as.vector(scale(wt_vals, center = FALSE)), 
                                family = quasibinomial(), control = ctrl)
  }
  fit <- list(coef = model.fit$coef, linkfun = "logit_linkinv", fitfunname = "glm")
  if (gvars$verbose) print(fit$coef)
  class(fit) <- c(class(fit), c("glmS3"))
  return(fit)
}
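
# Why the weights are rescaled above (a sketch assuming nothing beyond base R):
# scale(w, center = FALSE) divides w by its root-mean-square, shrinking very
# large weights that occasionally break IRLS convergence while preserving the
# relative weighting.
if (FALSE) {
  w <- c(1, 2, 1000)
  w_scaled <- as.vector(scale(w, center = FALSE))
  all.equal(w_scaled, w / sqrt(sum(w^2) / (length(w) - 1)))  # TRUE
}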
        
# S3 method for speedglm binomial family fit, takes BinaryOutModel object:
fit_single_reg.speedglmS3 <- function(self) {
  Xmat <- self$getXmat
  Y_vals <- self$getY
  wt_vals <- self$getWeight
  if (gvars$verbose) {
    print("calling speedglm.wfit...")    
    print("number of observations: " %+% nrow(Xmat))
  }
        
  if (nrow(Xmat) == 0L) { # Xmat has 0 rows: return NA's and avoid throwing exception
    model.fit <- list(coef = rep.int(NA_real_, ncol(Xmat)))
  } else {
    model.fit <- try(
      # scale weights because there were rare problems where large weights caused convergence problems
      speedglm::speedglm.wfit(X = Xmat, y = Y_vals, weights = as.vector(scale(wt_vals, center = FALSE)), 
                              family = quasibinomial()), silent = TRUE)
    if (inherits(model.fit, "try-error")) {  # if failed, fall back on stats::glm
      message("speedglm::speedglm.wfit failed, falling back on stats:glm.fit; ", model.fit)
      return(fit_single_reg.glmS3(self))
    } 
  }
  if (gvars$verbose) print(model.fit$coef)
  fit <- list(coef = model.fit$coef, linkfun = "logit_linkinv", fitfunname = "speedglm")
  class(fit) <- c(class(fit), c("speedglmS3"))
  return(fit)
}        
      
# S3 method for h2o.ensemble binomial family fit, takes BinaryOutModel object:
fit_single_reg.h2oS3 <- function(self) {
  Xmat <- self$getXmat
  Y_vals <- self$getY
  wt_vals <- self$getWeight  # Cannot be used: h2oEnsemble provides no weights argument
  if (length(unique(wt_vals)) > 1) {
    message("Observation weights are ignored because h2o.ensemble has NO weights argument; choose another estimator if weights are needed.")
  }
  h2olearner <- getopt("h2olearner")
  h2ometalearner <- getopt("h2ometalearner")
  CVfolds <- getopt("CVfolds")
  if (gvars$verbose) {
    print("calling h2o.ensemble...")
    print(length(h2olearner) %+% " machine learning algorithm(s): " %+% paste0(h2olearner, collapse = '; '))
    print("h2o metalearner: " %+% h2ometalearner)
    print("number of observations: " %+% nrow(Xmat))
  } 
  
  if (nrow(Xmat) == 0L) { # Xmat has 0 rows: return NA's and avoid throwing an exception
    model.fit <- list(coef = rep.int(NA_real_, ncol(Xmat)))
    if (gvars$verbose) {
      cat("#######################################################################\n")
      cat("No observations fall into the bin" %+% " So NO MODEL FITTED for this stratum " %+% self$outvar %+% "\n")
    }
  } else if (nrow(Xmat) != 0L && length(unique(Y_vals)) == 1) {  # Responses are constant
    if (gvars$verbose) {
      cat("#######################################################################\n")
      cat("Responses are constant. So NO MODEL FITTED for this stratum " %+% self$outvar %+% ";\n" %+% "falling back on speedglm::speedglm.wfit.\n")
    }
    return(fit_single_reg.speedglmS3(self))
  } else {
    localH2O <- h2o::h2o.init(nthreads = -1)
    # h2o.removeAll()
    if (!all(grepl("glm", h2olearner))) {
      message("================================================================")
      message("Caution: h2olearner includes algorithms other than GLM (and GLMNET),\n" %+%
                "such as randomForest, GBM and deep learning; these aggressive\n" %+%
                "algorithms may give different results than using glm alone.\n")
      message("================================================================")
    }
    train <- data.frame(Xmat[, colnames(Xmat)[colnames(Xmat) != "Intercept"]], Y_vals)
    names(train)[ncol(train)] <- self$outvar
    train <- h2o::as.h2o(train)
    if (all(Y_vals %in% 0:1)) {
      train[, self$outvar] <- h2o::as.factor(train[, self$outvar])
      h2oFamily <- "binomial"
    } else {
      h2oFamily <- "gaussian"
    }
    x <- setdiff(names(train), self$outvar) 
    model.fit <- try(h2oEnsemble::h2o.ensemble(
      x = x, y = self$outvar, training_frame = train, family = h2oFamily,  
      learner = h2olearner, metalearner = h2ometalearner, cvControl = list(V=CVfolds)))
    if (inherits(model.fit, "try-error")) {  # if failed, fall back on sl3_pipelines
      message("h2oEnsemble::h2o.ensemble failed, falling back on sl3_pipelines; \n", model.fit)
      return(fit_single_reg.sl3S3(self)) 
    } 
    if (gvars$verbose) {
      if (h2ometalearner == "h2o.glm.wrapper") {
        print("the coefficients of the model " %+% self$outvar); print(h2o::h2o.coef(model.fit$metafit))
        print("Note, the first value is for the intercept")
      } else {
        print("there is no coefficient here since the metalearner is not GLM")
      }
    }       
  }
  fit <- list(model.fit = model.fit, coef = NULL, h2o.library = h2olearner, fitfunname = "speedglm")
  if (inherits(model.fit, "h2o.ensemble") && h2ometalearner == "h2o.glm.wrapper") fit$coef <- h2o::h2o.coef(model.fit$metafit)
  if (inherits(model.fit, "h2o.ensemble")) {
    class(fit) <- c(class(fit), "h2o.ensemble")
    fit$fitfunname <- "h2o.ensemble"
    fit$family <- h2oFamily
  }
  return(fit)
}

# S3 method for sl3 binomial/Gaussian family fit, takes a BinaryOutModel object:
fit_single_reg.sl3S3 <- function(self) {
  Xmat <- self$getXmat
  Y_vals <- self$getY
  wt_vals <- self$getWeight
  learners <- getopt("sl3_learners")
  metalearner <- getopt("sl3_metalearner")    
  if (gvars$verbose) {
      print("calling sl3...")
      print(length(learners) %+% " base machine learning algorithm(s): " %+% paste0(names(learners), collapse = '; '))
      print("number of observations: " %+% nrow(Xmat))
    }
  
  if (nrow(Xmat) == 0L) {  # Xmat has 0 rows: return NA's and avoid throwing an exception
    model.fit <- list(coef = rep.int(NA_real_, ncol(Xmat)))
    if (gvars$verbose) {
      cat("#######################################################################\n")
      cat("No observations fall into the bin. So NO MODEL FITTED for this stratum " %+% self$outvar %+% "\n")
    }
  } else if (nrow(Xmat) != 0L && length(unique(Y_vals)) == 1) {  # Responses are constant
    if (gvars$verbose) {
      cat("#######################################################################\n")
      cat("Responses are constant. So NO MODEL FITTED for this stratum " %+% self$outvar %+% ";\n" %+% "falling back on speedglm::speedglm.wfit.\n")
    }
    return(fit_single_reg.speedglmS3(self))
  } else {
    if (length(unique(Y_vals)) == 2) { sl3Family <- "binomial" } else { sl3Family <- "gaussian" }
    # if (length(learners) > 1) { stacked_learners <- make_learner(Stack, learners) } else { stacked_learners <- learners }
    if (sl3Family == "gaussian") {
      if (!identical(metalearner$params$learner_function, sl3::metalearner_linear)) {
        metalearner <- sl3::make_learner(sl3::Lrnr_optim, loss_function = sl3::loss_squared_error,
                                         learner_function = sl3::metalearner_linear)
        cat("#######################################################################\n")
        cat("We have to use sl3::metalearner_linear() as learner_function in sl3 if the outcome is gaussian\n")
        cat("So we change the learner_function to sl3::metalearner_linear().\n")
      }
      print("metalearner algorithm: metalearner_linear")
    } else if (sl3Family == "binomial") {
      if (!identical(metalearner$params$learner_function, sl3::metalearner_logistic_binomial)) {
        metalearner <- sl3::make_learner(sl3::Lrnr_optim, loss_function = sl3::loss_loglik_binomial,
                                         learner_function = sl3::metalearner_logistic_binomial)
        cat("#######################################################################\n")
        cat("We have to use sl3::metalearner_logistic_binomial() as learner_function in sl3 if the outcome is binomial\n")
        cat("So we change the learner_function to sl3::metalearner_logistic_binomial().\n")
      }
      print("metalearner algorithm: metalearner_logistic_binomial")
    }
    
    n <- length(Y_vals)
    X <- data.frame(Xmat[, colnames(Xmat)[colnames(Xmat) != "Intercept"]])
    data <- as.data.frame(cbind(X, Y = Y_vals, weights = wt_vals))
    Wnodes <- names(X)
    Anode <- "Y"
    task <- sl3::sl3_Task$new(data, covariates = Wnodes, outcome = Anode, weights = "weights")
    
    # define Super Learner
    binom_sl <- sl3::make_learner(Lrnr_sl, learners, metalearner)
    
    model.fit <- try(binom_sl$train(task))
    
    if (inherits(model.fit, "try-error")) { # if failed, fall back on SuperLearner::SuperLearner
      message(" failed, falling back on SuperLearner::SuperLearner; ", model.fit)
      return(fit_single_reg.SLS3(self))
    }
  }
  fit <- list(model.fit = model.fit, coef = NULL, learner = learners, metalearner = metalearner, fitfunname = "speedglm")
  if (inherits(model.fit, "Lrnr_sl")) {
    fit$coef <- model.fit$coefficients
    class(fit) <- c(class(fit), "sl3_pipelines")
    fit$fitfunname <- "sl3_pipelines"
  }
  return(fit)
}

# S3 method for SuperLearner binomial family fit, takes BinaryOutModel objects:
fit_single_reg.SLS3 <- function(self) {
  Xmat <- self$getXmat
  Y_vals <- self$getY
  wt_vals <- self$getWeight
  SL.library <- getopt("SL.library")
  CVfolds <- getopt("CVfolds")
  if (gvars$verbose) {
    print("calling SuperLearner...")
    print(length(SL.library) %+% " base machine learning algorithm(s): " %+% paste0(SL.library, collapse = '; '))
    print("number of observations: " %+% nrow(Xmat))
  }    
  
  if (nrow(Xmat) == 0L) {  # Xmat has 0 rows: return NA's and avoid throwing an exception
    model.fit <- list(coef = rep.int(NA_real_, ncol(Xmat)))
    if (gvars$verbose) {
      cat("#######################################################################\n")
      cat("No observations fall into the bin. So NO MODEL FITTED for this stratum " %+% self$outvar %+% "\n")
    }
  } else if (nrow(Xmat) != 0L && length(unique(Y_vals)) == 1) {  # Responses are constant
    if (gvars$verbose) {
      cat("#######################################################################\n")
      cat("Responses are constant. So NO MODEL FITTED for this stratum " %+% self$outvar %+% ";\n" %+% "falling back on speedglm::speedglm.wfit.\n")
    }
    return(fit_single_reg.speedglmS3(self))
  } else {
    if (all(Y_vals >= 0 & Y_vals <= 1)) { SLFamily <- binomial() } else { SLFamily <- gaussian() }
    n <- length(Y_vals)
    X <- data.frame(Xmat[, colnames(Xmat)[colnames(Xmat) != "Intercept"]])
    model.fit <- try(SuperLearner::SuperLearner( 
      Y = Y_vals, X = X, family = SLFamily, verbose = gvars$verbose, SL.library = SL.library, 
      obsWeights = as.vector(scale(wt_vals, center = FALSE)), cvControl = list(V=CVfolds), 
      control = list(saveFitLibrary=TRUE)))
    
    if (inherits(model.fit, "try-error")) { # if failed, fall back on speedglm::speedglm.wfit
      message("SuperLearner::SuperLearner failed, falling back on speedglm::speedglm.wfit; ", model.fit)
      return(fit_single_reg.speedglmS3(self))
    }
  }
  fit <- list(model.fit = model.fit, coef = NULL, SL.library = SL.library, fitfunname = "speedglm")
  if (inherits(model.fit, "SuperLearner")) {
    fit$coef <- model.fit$coef
    class(fit) <- c(class(fit), "SuperLearner")
    fit$fitfunname <- "SuperLearner"
  }
  return(fit)
}

# S3 methods for getting coefs and summaries from a fitted BinaryOutModel class object
coef.BinaryOutModel <- function(object, ...) {
  assert_that(object$is.fitted)
  object$getfit$coef
}

summary.BinaryOutModel <- function(object, ...) {
  assert_that(object$is.fitted)
  fit <- object$getfit
  append(list(reg = object$show()), fit)
}
  
# -------------------------------------- binirized.to.DTlong --------------------------------
# Purpose: Convert an existing Bin matrix (bin indicators) for continuous self$outvar into a long-format data.table with 3 columns:
# ID - row number; sVar_allB.j - bin indicators collapsed into one column; bin_ID - bin number identifying that column.
# Automatically removes all missing (degenerate) bin indicators.
# -------------------------------------------------------------------------------------------
binirized.to.DTlong <- function(BinsDat_wide, binID_seq, ID, bin_names, pooled_bin_name, name.sVar) {
  # Convert Bin matrix into a data.table (without data.frame as intermediate), with new row ID column:
  DT_BinsDat_wide <- data.table::as.data.table(BinsDat_wide)[, c("ID") := ID]
  data.table::setcolorder(DT_BinsDat_wide, c("ID", names(DT_BinsDat_wide)[-ncol(DT_BinsDat_wide)]))
  # melt into long format:
  # Only the bin where the obs falls into receives 1, others receive 0
  sVar_melt_DT <- melt(DT_BinsDat_wide,
                      id.vars = "ID",
                      measure.vars = bin_names,
                      value.name = pooled_bin_name,  
                      variable.name = name.sVar,
                      variable.factor = FALSE,
                      na.rm = FALSE)
  nbin_rep <- rep(binID_seq, each = nrow(BinsDat_wide))
  # 1) Add bin_ID; 2) remove a column with Bin names; 3) remove all rows with NA value for outcome (degenerate bins)
  if (!is.data.table(sVar_melt_DT)) {
    stop("sVar_melt_DT is not a data.table; its class is: " %+% paste(class(sVar_melt_DT), collapse = ", "))
  }
  # Remove the column named name.sVar; remove rows that have NA under the column named pooled_bin_name
  sVar_melt_DT <- sVar_melt_DT[, c("bin_ID") := list(nbin_rep)][, (name.sVar) := NULL][!is.na(get(pooled_bin_name))]
  data.table::setkeyv(sVar_melt_DT, c("ID", "bin_ID"))  # sort by ID, bin_ID to prepare for merge with predictors (W)
  return(sVar_melt_DT)
}
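
# Worked sketch (hypothetical bins): two subjects, three bin indicators each;
# the NA marks a degenerate bin, whose long-format row is dropped.
if (FALSE) {
  BinsDat_wide <- matrix(c(1, 0, NA,
                           0, 1, 0), nrow = 2, byrow = TRUE,
                         dimnames = list(NULL, c("A_B.1", "A_B.2", "A_B.3")))
  sVar_melt_DT <- binirized.to.DTlong(BinsDat_wide = BinsDat_wide, binID_seq = 1:3, ID = 1:2,
                                      bin_names = c("A_B.1", "A_B.2", "A_B.3"),
                                      pooled_bin_name = "A_allB.j", name.sVar = "A")
  sVar_melt_DT  # 5 rows keyed by (ID, bin_ID); the (ID = 1, bin_ID = 3) row was removed
}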
     
# ------------------------------------------- join.Xmat -------------------------------------
# Purpose: Prepare predictors (W/X_mat) as data.table, adding row IDs for a join
# Join with sVar_melt_DT that is already in long format; Check that created IDs match exactly for both datasets
# -------------------------------------------------------------------------------------------
join.Xmat <- function(X_mat, sVar_melt_DT, ID) {
  nIDs <- length(unique(sVar_melt_DT[["ID"]]))
  assert_that(nIDs == nrow(X_mat))
  X_mat_DT <- data.table::as.data.table(X_mat)[, c("ID") := ID]
  data.table::setkeyv(X_mat_DT, c("ID")) # sort by ID
  sVar_melt_DT <- sVar_melt_DT[X_mat_DT] # Merge long format (self$pooled_bin_name, binIDs) with predictors (W)
  return(sVar_melt_DT)
}      
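
# Continuing the sketch above (hypothetical): join baseline predictors onto the
# long-format bins by ID; each subject's predictor row is recycled across its bins.
if (FALSE) {
  X_mat <- matrix(c(0.3, 1.1), ncol = 1, dimnames = list(NULL, "W1"))
  join.Xmat(X_mat = X_mat, sVar_melt_DT = sVar_melt_DT, ID = 1:2)
}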
        

## ---------------------------------------------------------------------
#' R6 class for modeling (fitting and predicting) for a single binary regression model P(B | PredVars)
#'
#'  \code{BinaryOutModel} stores and manages the (binarized/discretized) design matrix Xmat and the outcome Bin for the binary regression
#'  P(Bin|Xmat). The field \code{self$estimator} selects among candidate estimators for fitting and prediction,
#'  such as data-adaptive super learner algorithms and parametric logistic regression. When fitting one pooled regression across multiple
#'  bins, it provides a method to convert the data from wide to long format on request (for computational efficiency).
#'
#' @docType class
#' @format An \code{\link{R6Class}} generator object
#' @keywords R6 class
#' @details
#' \itemize{
#' \item{bin_names} - Character vector of names of the bins.
#' \item{ID} - Integer vector of observation IDs used for pooling. \code{1:n}.
#' \item{pooled_bin_name} - Original name of the continuous covariate that was discretized into bins and then pooled.
#' \item{nbins} - Number of bins used for estimation of a continuous outvar, defined in ContinModel$new().
#' \item{estimator} - Character, one of "speedglm__glm" (default), "glm__glm", "h2o__ensemble", "SuperLearner".
#' \item{outvar} - Character, outcome name.
#' \item{predvars} - Character vector of predictor names.
#' \item{cont.sVar.flag} - Logical. If TRUE, indicate the original outcome variable is continuous.
# \item{fitclass} - (NOT IMPLEMENTED) Fit class based on estimator type.
#' \item{bw.j} - Bin width of a bin indicator obtained from the discretization of a continous covariate.
#' \item{is.fitted} - Logical. If TRUE, indicate the \code{BinaryOutModel} class object is fitted already.
#' \item{pool_cont} - Logical. If TRUE, perform pooling of bins.
#' \item{outvars_to_pool} - Character vector of outcome bin names for pooling.
#' \item{ReplMisVal0} - Logical. If TRUE, user-supplied gvars$misXreplace (Default to 0) will be used to replace all gvars$misval 
#'   among predictors. \code{ReplMisVal0} in \code{RegressionClass} will be used when instantiating an new object of \code{BinaryOutModel}. 
#' \item{n} - Number of rows in the input data.
#' \item{subset_expr} - Vector of length \code{n} that specifies a subset of data to be used in the fitting process. 
#'   Either logical, expression or indices.
#' \item{subset_idx} - Logical version of \code{subset_expr}. 
#' }
#' @section Methods:
#' \describe{
#'   \item{\code{new(reg)}}{Use \code{reg} (a \code{\link{RegressionClass}} class object) to instantiate a new \code{BinaryOutModel} object
#'     for a single binary regression.}
#'   \item{\code{newdata(newdata, getoutvar = TRUE, ...)}}{Evaluate the subset expression and perform the corresponding subsetting of the data
#'     to construct X_mat, Y_vals & wt_vals.}
#'   \item{\code{define.subset.idx(data)}}{Create a logical vector converted from subset_expr.}
#'   \item{\code{fit(overwrite = FALSE, data, predict = FALSE, savespace = TRUE, ...)}}{Fit a binary regression. \code{overwrite} is
#'     logical; if \code{FALSE} (default), a previously fitted model cannot be overwritten by a new fit. \code{savespace} is logical;
#'     if \code{TRUE} (default), wipe out all internal data (to save RAM when doing many stacked regressions).}
#'   \item{\code{copy.fit(bin.out.model)}}{Take a fitted BinaryOutModel object as input and save its fit to itself.}
#'   \item{\code{predict(newdata, savespace = TRUE, ...)}}{Predict the response P(A = 1|W = w, E = e).}
#'   \item{\code{copy.predict(bin.out.model)}}{Take a BinaryOutModel object that contains the predictions for P(A=1|w,e) and save them to itself.}
#'   \item{\code{predictAeqa(newdata, bw.j.sA_diff, savespace = TRUE, wipeProb = TRUE)}}{Predict the response P(A = a|W = w, E = e) for
#'     observed A, W, E. \code{wipeProb} is a logical argument passed to self$wipe.alldat; if FALSE, the vectors probA1 & probAeqa are kept.}
#'   \item{\code{show()}}{Print the regression formula, including outcome and predictor names.}
#' } 
#' @section Active Bindings:
#' \describe{
#'   \item{\code{wipe.alldat(wipeProb = TRUE)}}{Wipe out X_mat, Y_vals and subset_idx (and also probA1 & probAeqa when wipeProb = TRUE).}
#'   \item{\code{getfit}}{Return the model fit object (private$model.fit).}
#'   \item{\code{getprobA1}}{Return the predicted probabilities P(A = 1 | X_mat).}
#'   \item{\code{getprobAeqa}}{Return the likelihood of observing A = a conditional on X_mat.}
#'   \item{\code{emptydata}}{Set X_mat to NULL.}
#'   \item{\code{emptyY}}{Set Y_vals to NULL.}
#'   \item{\code{emptyWeight}}{Set wt_vals to NULL.}
#'   \item{\code{emptySubset_idx}}{Set subset_idx to NULL.}
#'   \item{\code{getXmat}}{Return the design matrix X_mat.}
#'   \item{\code{getY}}{Return the outcome vector Y_vals.}
#'   \item{\code{getWeight}}{Return the observation weights wt_vals.}
#' }
#' @seealso \code{\link{DatKeepClass}}, \code{\link{RegressionClass}}, \code{\link{tmleCom_Options}}
#' @example tests/examples/1_BinaryOutModel_examples.R
#' @export
BinaryOutModel  <- R6Class(classname = "BinaryOutModel",
  cloneable = TRUE, # changing to TRUE to make it easy to clone input h_g0/h_gstar model fits
  portable = TRUE,
  class = TRUE,
  public = list(
    bin_names = NULL,
    ID = NULL,
    pooled_bin_name = NULL,
    nbins = integer(),
    estimator = character(), 
    outvar = character(),   # Outcome name(s)
    predvars = character(), # Names of predictor vars
    cont.sVar.flag = logical(),
    # fitclass = character(), # Fit class based on estimator type
    bw.j = numeric(),
    is.fitted = FALSE,   
    pool_cont = logical(),
    outvars_to_pool = character(),
    ReplMisVal0 = logical(),
    n = NA_integer_,        # Number of rows in the input data
    subset_expr = NULL,     # PASS THE LOGICAL EXPRESSIONS TO self$subset WHICH WILL BE EVALUATED IN THE ENVIRONMENT OF THE data
    subset_idx = NULL,      # Logical vector of length n (TRUE = include the obs)
 
    initialize = function(reg, ...) {
      assert_that(is.string(reg$outvar))
      assert_that(is.character(reg$predvars))
      self$outvar <- reg$outvar
      self$predvars <- reg$predvars
      self$pool_cont <- reg$pool_cont
      self$outvars_to_pool <- reg$outvars_to_pool
      self$ReplMisVal0 <- reg$ReplMisVal0
      self$nbins <- reg$nbins
      self$subset_expr <- reg$subset_vars
      if (is.null(reg$subset_vars)) {self$subset_expr <- TRUE}
      assert_that(is.logical(self$subset_expr) || is.call(self$subset_expr) || is.character(self$subset_expr))
      self$estimator <- reg$estimator
      # self$fitclass <- getopt("fitclass")      
      
      # Get the bin width (interval length) for the current bin name self$outvar (for discretized continuous A only):
      self$cont.sVar.flag <- self$outvar %in% names(reg$intrvls.width)
      if (self$cont.sVar.flag) {
        intrvl.idx <- which(names(reg$intrvls.width) %in% self$outvar)
        if (length(intrvl.idx) > 1) stop("non-unique names for intrvls.width in RegressionClass")
        self$bw.j <- reg$intrvls.width[intrvl.idx]
      } else {
        self$bw.j <- 1L
      }
      invisible(self)
    },
    
    # ----------------------------------------- newdata ------------------------------------------- 
    # Purpose: Sets X_mat, Y_vals, wt_vals; evaluates the subset and performs the corresponding subsetting of the data
    # --------------------------------------------------------------------------------------------- 
    newdata = function(newdata, getoutvar = TRUE, ...) {
      assert_that(is.DatKeepClass(newdata))
      # CALL self$setdata.long() when: 1) self$pool_cont is TRUE & 2) more than one outvars_to_pool
      if (self$pool_cont && length(self$outvars_to_pool)>1) {
        private$setdata.long(data = newdata, ...)
      } else {
        private$setdata(data = newdata, getoutvar, ...)
      }
      invisible(self)
    },
  
    # ------------------------------------ define.subset.idx -------------------------------------- 
    # Purpose: return a logical vector converted from self$subset_expr
    # --------------------------------------------------------------------------------------------- 
    define.subset.idx = function(data) {
      if (is.logical(self$subset_expr)) {
        subset_idx <- self$subset_expr
      } else if (is.call(self$subset_expr)) {
        subset_idx <- data$evalsubst(subset_exprs = self$subset_expr)
      } else if (is.character(self$subset_expr)) {
        subset_idx <- data$evalsubst(subset_vars = self$subset_expr)
      }
      assert_that(is.logical(subset_idx))
      if ((length(subset_idx) < self$n) && (length(subset_idx) > 1L)) {
        if (gvars$verbose) message("subset_idx has smaller length than self$n; repeating subset_idx p times, for p: " %+% data$p)
        subset_idx <- rep.int(subset_idx, data$p)
        if (length(subset_idx) != self$n) stop("BinaryOutModel$define.subset.idx: self$n is not equal to nobs*p!")
      }
      assert_that((length(subset_idx) == self$n) || (length(subset_idx) == 1L))
      return(subset_idx)
    },   

    fit = function(overwrite = FALSE, data, predict = FALSE, savespace = TRUE, ...) { # Move overwrite to a field? ... self$overwrite
      if (gvars$verbose) print("fitting the model: " %+% self$show())
      if (!overwrite) assert_that(!self$is.fitted) # do not allow overwrite of a previously fitted model unless explicitly asked
      self$newdata(newdata = data, ...) # populate bindat with X_mat & Y_vals & wt_vals
      private$model.fit <- fit_single_reg(self = self) 
      self$is.fitted <- TRUE
      # **********************************************************************
      # to save RAM space when doing many stacked regressions no longer predicting in fit:
      # **********************************************************************
      if (savespace) self$wipe.alldat
      invisible(self)
    },

    # take fitted BinaryOutModel object as an input and save the fits to itself
    copy.fit = function(bin.out.model) {
      assert_that("BinaryOutModel" %in% class(bin.out.model))
      private$model.fit <- bin.out.model$getfit
      self$is.fitted <- TRUE
      invisible(self)
    },

    # Predict the response P(Bin = 1|W = w);
    # Does not need to know the actual values of the binary outcome Bin to do prediction ($newdata(, getoutvar = FALSE))
    # P(A[i]=1|W=w): uses private$model.fit to generate predictions for newdata:
    predict = function(newdata, savespace = TRUE, ...) {
      assert_that(self$is.fitted)
      if (missing(newdata)) { stop("must provide newdata for BinaryOutModel$predict()") }
      # re-populate self with new X_mat:
      self$newdata(newdata = newdata, getoutvar = FALSE, ...) 
      if (self$pool_cont && length(self$outvars_to_pool) > 1) {
        stop("BinaryOutModel$predict is not applicable to pooled regression, call BinaryOutModel$predictAeqa instead")
      } else {
        private$probA1 <- predict_single_reg(self = self)
      }
      if (savespace) self$emptydata  # Xmat is no longer needed, but subset, outvar & probA1 may be needed for private$probA1
      invisible(self)
    },

    # take BinaryOutModel object that contains the predictions for P(A=1|W) and save these predictions to self$
    copy.predict = function(bin.out.model) {
      assert_that("BinaryOutModel" %in% class(bin.out.model))
      assert_that(self$is.fitted)
      private$probA1 <- bin.out.model$getprobA1
    },

    # Predict the response P(Bin = b|W = w), which is returned invisibly;
    # Needs to know the values of b for prediction ($newdata(, getoutvar = TRUE))
    # WARNING: This method cannot be chained together with methods that follow (e.g., class$predictAeqa()$fun())
    predictAeqa = function(newdata, bw.j.sA_diff, savespace = TRUE, wipeProb = TRUE) { 
      # P(A[i]=a|W=w) - calculating the likelihood for indA[i] (n vector of a's)
      if (missing(newdata) && !is.null(private$probAeqa)) { return(private$probAeqa) }
      assert_that(self$is.fitted)
      assert_that(!missing(newdata))
      self$newdata(newdata = newdata, getoutvar = TRUE) # populate bindat with new design matrix covars X_mat
      assert_that(is.logical(self$subset_idx))
      n <- newdata$nobs
      # obtain predictions (likelihood) for response on fitted data (from long pooled regression):
      if (self$pool_cont && length(self$outvars_to_pool) > 1) {
        probAeqa <- predict_single_reg.long(self = self) # overwrite probA1 with new predictions:
      } else {
        # get predictions for P(A[j]=1|W=newdata) from newdata:
        probA1 <- predict_single_reg(self = self)
        indA <- newdata$get.outvar(self$subset_idx, var = self$outvar) # Always a vector of 0/1
        assert_that(is.integerish(indA)) # check that obsdat.A is always a vector of integers
        probAeqa <- rep.int(1L, n) # for missing, the likelihood is always set to P(A = a) = 1.
        assert_that(!any(is.na(probA1[self$subset_idx]))) # check that predictions P(A=1 | dmat) exist for all obs.
        probA1 <- probA1[self$subset_idx]
        # discrete version for the joint density:
        probAeqa[self$subset_idx] <- probA1^(indA) * (1 - probA1)^(1L - indA)
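        # (e.g., probA1 = 0.8 gives likelihood 0.8 when indA = 1 and 0.2 when indA = 0)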
        # continuous version for the joint density:
        # probAeqa[self$subset_idx] <- (probA1^indA) * exp(-probA1)^(1 - indA)
        # Alternatively, integrating the last hazard chunk up to x:
        # difference of A value and its left most bin cutoff: x - b_{j-1}
        if (!missing(bw.j.sA_diff)) {
          # + integrating the constant hazard all the way up to value of each sa:
          # probAeqa[self$subset_idx] <- probAeqa[self$subset_idx] * (1 - bw.j.sA_diff[self$subset_idx]*(1/self$bw.j)*probA1)^(indA)
          # cont. version of above:
          probAeqa[self$subset_idx] <- probAeqa[self$subset_idx] * exp(-bw.j.sA_diff[self$subset_idx]*(1/self$bw.j)*probA1)^(indA)
        }
        private$probA1 <- probA1
      }
      private$probAeqa <- probAeqa
      # **********************************************************************
      # to save RAM space when doing many stacked regressions wipe out all internal data:
      if (savespace) { self$wipe.alldat <- wipeProb } #  if wipeProb = FALSE, probA1 & probAeqa will be kept
      # **********************************************************************
      return(probAeqa)
    },
      
    show = function() { "P(" %+% self$outvar %+% "|" %+% paste(self$predvars, collapse=",") %+% ")" }
  ),
        
  active = list(
    wipe.alldat = function(wipeProb = TRUE) {
      if (wipeProb) { private$probA1 <- NULL }
      if (wipeProb) { private$probAeqa <- NULL }
      self$emptydata
      self$emptyY
      # self$emptyWeight
      self$emptySubset_idx
      return(self)
    },
    getfit = function() { private$model.fit },
    getprobA1 = function() { private$probA1 },
    getprobAeqa = function() { private$probAeqa },    
    emptydata = function() { private$X_mat <- NULL },
    emptyY = function() { private$Y_vals <- NULL},
    emptySubset_idx = function() { self$subset_idx <- NULL },
    emptyWeight = function() { private$wt_vals <- NULL },
    getXmat = function() { private$X_mat },
    getY = function() { private$Y_vals },
    getWeight = function() { private$wt_vals }
  ),
        
  private = list(
    model.fit = list(),   # the model fit (either coefficients or the model fit object)
    probA1 = NULL,        # Predicted probA=1 conditional on X_mat
    probAeqa = NULL,      # Likelihood of observing a particular value A=a conditional on X_mat
    X_mat = NULL,
    Y_vals = NULL,
    wt_vals = NULL,
    
    setdata = function(data, getoutvar, ...) {
      # everything is performed using data$ methods (data is of class DatKeepClass)
      assert_that(is.DatKeepClass(data))
      self$n <- data$nobs
      self$subset_idx <- self$define.subset.idx(data)
      private$wt_vals <- data$get.obsweights(rowsubset = self$subset_idx)
      if (getoutvar) private$Y_vals <- data$get.outvar(rowsubset = self$subset_idx, var = self$outvar) # Always a vector
      if (sum(self$subset_idx) == 0L) {  # When nrow(X_mat) == 0L avoids exception (when nrow == 0L => prob(A=a) = 1)
        private$X_mat <- matrix(, nrow = 0L, ncol = (length(self$predvars) + 1))
        colnames(private$X_mat) <- c("Intercept", self$predvars)
      } else {
        # *** THIS IS THE ONLY LOCATION IN THE PACKAGE WHERE CALL TO DatKeepClass$get.dat.sVar() IS MADE ***
        private$X_mat <- as.matrix(cbind(Intercept = 1, data$get.dat.sVar(self$subset_idx, self$predvars)))        
        if (self$ReplMisVal0) private$X_mat[gvars$misfun(private$X_mat)] <- gvars$misXreplace  # To find and replace misvals in X_mat
      }
      invisible(self)
    },
        
    setdata.long = function(data, ...) {
      assert_that(is.DatKeepClass(data))
      self$n <- data$nobs
      self$subset_idx <- self$define.subset.idx(data)
      if (!(data$active.bin.sVar %in% self$outvar)) { stop("currently binarized sVar does not match self$outvar argument") }
      # Setting up object fields related to pooling of continuous A:
      self$pooled_bin_name <- data$pooled.bin.nm.sVar(self$outvar)
      self$bin_names <- self$outvars_to_pool

      if (gvars$verbose) {
        print("self$bin_names: "); print(self$bin_names)
        print("self$pooled_bin_name: "); print(self$pooled_bin_name)
        print("self$outvar: "); print(self$outvar)
        print("self$nbins: "); print(self$nbins)
      }

      binID_seq <- 1L : self$nbins
      BinsDat_wide <- data$get.dat.sVar(self$subset_idx, covars = self$outvars_to_pool)
      self$ID <- as.integer(1:nrow(BinsDat_wide))

      # To grab bin Ind mat directly (prob a bit faster): BinsDat_wide <- data$dat.bin.sVar[self$subset_idx, ]
      BinsDat_long <- binirized.to.DTlong(BinsDat_wide = BinsDat_wide, binID_seq = binID_seq, ID = self$ID,
                                          bin_names = self$bin_names, pooled_bin_name = self$pooled_bin_name,
                                          name.sVar = self$outvar)
      X_mat.addwt <- cbind(data$get.dat.sVar(self$subset_idx, covars = self$predvars), weights = data$get.obsweights(self$subset_idx))
      sVar_melt_DT <- join.Xmat(X_mat = X_mat.addwt, sVar_melt_DT = BinsDat_long, ID = self$ID)
      # prepare design matrix for modeling w/ glm.fit or speedglm.wfit:
      # select bin_ID + predictors, add intercept column
      X_mat <- sVar_melt_DT[, c("bin_ID", self$predvars), with=FALSE][, c("Intercept") := 1] 
      setcolorder(X_mat, c("Intercept", "bin_ID", self$predvars)) # re-order columns by reference (no copy)
      self$ID <- sVar_melt_DT[["ID"]]
      private$X_mat <- as.matrix(X_mat)
      private$Y_vals <- sVar_melt_DT[, self$pooled_bin_name, with = FALSE][[1]] # outcome vector
      private$wt_vals <- sVar_melt_DT[, "weights", with = FALSE][[1]] # observation weight vector
      if (self$ReplMisVal0) private$X_mat[gvars$misfun(private$X_mat)] <- gvars$misXreplace
      if (gvars$verbose) {
        print("private$X_mat[1:10,]"); print(private$X_mat[1:10,])
        print("head(private$Y_vals)"); print(head(private$Y_vals, 100))
        print("head(private$wt_vals)"); print(head(private$wt_vals, 100))
      }
      invisible(self)
    } 
  )
)
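
# Hedged usage sketch (illustrative only): `reg` and `dat` are assumed to be a
# RegressionClass and a DatKeepClass object, constructed as in
# tests/examples/1_BinaryOutModel_examples.R.
if (FALSE) {
  m.binout <- BinaryOutModel$new(reg = reg)
  m.binout$fit(data = dat, savespace = FALSE)
  coef(m.binout)                        # coefficients (glm / speedglm estimators)
  m.binout$predictAeqa(newdata = dat)   # likelihood P(A = a | W) for the observed A
}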