R/GenericModelClasses.R

Defines functions newsummarymodel newsummarymodel.generic newsummarymodel.contin newsummarymodel.categor newsummarymodel.binary def_regs_subset

# ---------------------------------------------------------------------------------
# Generic S3 constructors for the summary model classes:
# ---------------------------------------------------------------------------------
newsummarymodel <- function(reg, DatKeepClass.g0, ...) { UseMethod("newsummarymodel") }
# Summary model constructor for generic regression with multivariate outcome, but one set of predictors
newsummarymodel.generic <- function(reg, DatKeepClass.g0, ...) GenericModel$new(reg = reg, DatKeepClass.g0 = DatKeepClass.g0, ...)
# Summary model constructor for continuous outcome A[j]:
newsummarymodel.contin <- function(reg, DatKeepClass.g0, ...) ContinModel$new(reg = reg, DatKeepClass.g0 = DatKeepClass.g0, ...)
# Summary model constructor for categorical outcome A[j]:
newsummarymodel.categor <- function(reg, DatKeepClass.g0, ...) CategorModel$new(reg = reg, DatKeepClass.g0 = DatKeepClass.g0, ...)
# Summary model constructor for binary outcome A[j]:
newsummarymodel.binary <- function(reg, ...) BinaryOutModel$new(reg = reg, ...)


## ---------------------------------------------------------------------
#' R6 class for defining regression models that evaluate multivariate joint conditional density \eqn{P(A|W,E)} 
#'  (or \eqn{P(A|W)} if non-hierarchical structure)
#'
#'  \code{RegressionClass} provides multiple options used when estimating a conditional density \eqn{P(A|W,E)}. \code{A} 
#'  can be multivariate, if so, hazard specification will factorize \code{P(A|W,E)} = \code{P(A[1],..., } \code{A[M]|W,E)} as a 
#'  sequence \code{P(A[1]|W,E)} * \code{P(A[2]|W, E, A[1])} * ... * \code{P(A[M]|W, E, A[1],...,}  \code{A[M-1])}, where each of the 
#'  compoenents \code{A[m]} can be either binary, categorical or continuous, and each of the conditional densities 
#'  \code{P(A[m]|W, E, A[1],...,A[m-1])} will be controlled  by a new instance of \code{\link{GenericModel}} class. If \code{A[m]}
#'  binary, \code{P(A[m]|W, E, A[1],...,A[m-1])} will be esimtated by a user-specific library of candidate algorithms, including 
#'  parametric estimators such as logistic model with only main terms, and data-adaptive estimator such as super-learner algorithms. 
#'  If \code{A[m]} continuous (or categorical), \code{P(A[m]|W, E, A[1],...,A[m-1])} will then be controlled by a new instance of 
#'  \code{\link{ContinModel}} class (or \code{\link{CategorModel}} class). Note that each \code{GenericModel}, \code{ContinModel} 
#'  and \code{CategorModel} class will accompany with an adjunctive clone of a parent \code{RegressionClass} class. The automatically 
#'  recursive process of defining new instances of \code{GenericModel} and cloning \code{RegressionClass} classes won't stop until 
#'  the entire sequence of binary regressions that represents \code{P(A|W,E)} is constructed.
#'
#' @docType class
#' @format An \code{\link{R6Class}} generator object
#' @keywords R6 class
#' @details
#' \itemize{
#' \item{\code{outvar.class}} - Character vector of outcome variable classes (of length(outvar)): one of \code{bin}, \code{cont}, \code{cat}.
#' \item{\code{outvar}} - Character vector of regression outcome variable names.
#' \item{\code{predvars}} - Character vector of regression-specific predictor names or a pool of all available predictor names.
#' \item{{reg_hazard}} - Logical, hazard fitting method. If TRUE, factorize P(outvar | predvars) into \\prod_{j}{P(outvar[j] | predvars)} for each j.
#' \item{\code{subset_vars}} - Named list for subset variables/ expression (later converted to logical vector).
#' \item{\code{ReplMisVal0}} - Logical. If TRUE, user-supplied gvars$misXreplace (Default to 0) will be used to replace all gvars$misval 
#'   among predictors (Default to TRUE).
#' \item{\code{nbins}} - Number of bins used for estimation of a continuous outvar, defined in ContinModel$new().
#' \item{\code{estimator}} - Character, one of "speedglm__glm" (default), "glm__glm", "h2o__ensemble", "SuperLearner". The estimator for which to fit 
#'   regression model. For "h2o__ensemble" and "SuperLearner", users can specify the data-adaptive algorithms through \code{tmleCom_Options}.
#' \item{\code{parfit}} - Logical. If TRUE, use parallel computing on binary regressions. See \code{foreach::foreach}.                            
#' \item{\code{pool_cont}} - Logical. If TRUE, pool over bins of a continuous outvar and fit one regression, along with bin_ID as an extra variable. 
#' \item{\code{outvars_to_pool}} - Character vector of bin names of a continuous outvars, should be identical to \code{bin_nms}.
#' \item{\code{intrvls}} - Numeric vector defining the number and positions of the bins or a named list of numeric vectors if 2 or more outvars.
#'   If not specified and outvar continuous, intervals will be determined in \code{ContinModel} through \code{DatKeepClass$detect.sVar.intrvls}.
#' \item{\code{intrvls.width}} - Named numeric vector of bin widths for each bin in \code{self$intrvls}. If not specified, default to 1 if
#'   outvar binary, default to \code{diff(self$intrvls)} if outvar continuous, 
#' }
#' @section Methods:
#' \describe{
#'   \item{\code{new(outvar.class = gvars$sVartypes$bin, outvar, predvars, subset_vars, intrvls,
#'                   ReplMisVal0 = TRUE, estimator = getopt("Qestimator"), parfit = getopt("parfit"),
#'                   pool_cont = getopt("poolContinVar")}}{Instantiate an new instance of \code{RegressionClass}} 
#'   \item{\code{ChangeManyToOneRegresssion(k_i, reg)}}{Clone the parent \code{RegressionClass} (\code{reg}) that include \code{length(self$outvar)} 
#'   regressions, and reset self to a single univariate \code{k_i} regression for outcome \code{self$outvar[[k_i]]}.}
#'   \item{\code{resetS3class()}}{Reset the object class to "RegressionClass" and "R6".}
#' }
#' @section Active Bindings:
#' \describe{
#'   \item{\code{S3class(newclass)}}{...}
#'   \item{\code{get.reg}}{...}
#' }
#' @seealso \code{\link{DatKeepClass}}, \code{\link{GenericModel}}, \code{\link{ContinModel}}, \code{\link{CategorModel}}
#' @example tests/examples/5_RegressionClass_examples.R
#' @export
RegressionClass <- R6Class("RegressionClass",
  class = TRUE,
  portable = TRUE,
  public = list(
    outvar.class = character(),    # Vector for classes of the outcome vars: bin / cont / cat
    outvar = character(),          # Vector of regression outcome variable names
    predvars = character(),        # Either a pool of all predictors (W) or regression-specific predictor names
    reg_hazard = FALSE,            # If TRUE, the joint P(outvar|predvars) is factorized as \prod_{j}{P(outvar[j] | predvars)} for each j outvar 
    subset_vars = NULL,            # Named LIST for subset vars (later evaluated to logical vector in the envir of the data)
    ReplMisVal0 = TRUE,            # If TRUE all gvars$misval among predicators are replaced with with gvars$misXreplace (0) 
    nbins = NULL,                  # Actual nbins used, for cont. outvar, defined in ContinModel$new()
    estimator = character(),       # Specify default estimator for model fitting
    parfit = logical(),            # TRUE for fitting binary regressions in parallel
    pool_cont = logical(),         # Pool binned cont outvar obs into long format (adding bin_ID as a covaraite)
    outvars_to_pool = character(), # Names of the binned continuous sVars, should match bin_nms
    intrvls = numeric(),           # Vector of numeric cutoffs defining the bins or a named list of numeric intervals (for length(self$outvar) > 1)
    intrvls.width = 1L,            # Named vector of bin-widths (bw_k : k=1,...,M) for each each bin in self$intrvls
	                           # When A is not continuous, intrvls.width IS SET TO 1.
                                   # When A is continuous, intrvls.width is SET TO self$intrvls.width 
    initialize = function(outvar.class = gvars$sVartypes$bin,
                          outvar, predvars, subset_vars, intrvls,
                          ReplMisVal0 = TRUE, # add ReplMisVal0 = TRUE 
                          estimator = getopt("Qestimator"),  # By defautl, g and Q use the same estimator
                          parfit = getopt("parfit"),
			  nbins = getopt("nbins"),
                          pool_cont = getopt("poolContinVar")
                          ) {
      assert_that(length(outvar.class) == length(outvar))

      self$outvar.class <- outvar.class
      self$outvar <- outvar
      self$predvars <- predvars
      self$ReplMisVal0 <- ReplMisVal0
      self$estimator <- estimator
      self$parfit <- parfit
      self$nbins <- nbins
      self$pool_cont <- pool_cont

      n_regs <- length(outvar)

      if (!missing(intrvls)) {
        assert_that(is.list(intrvls))
        assert_that(length(outvar) == length(intrvls))
        assert_that(all(names(intrvls) %in% outvar))
        self$intrvls <- intrvls
      } else {
        self$intrvls <- NULL
      }

      if (!missing(subset_vars)) {
        self$subset_vars <- subset_vars
        if (length(subset_vars) < n_regs) {
          self$subset_vars <- rep_len(subset_vars, n_regs)
        } else if (length(subset_vars) > n_regs) {
          if (!is.logical(subset_vars)) stop("not implemented")
	  self$subset_vars <- subset_vars[1:n_regs]  # chose the first n_regs elements in subset_vars
          # ... TO FINISH ... increase n_regs to all combinations of (n_regs x subset_vars)
        }
      } else {
        self$subset_vars <- rep_len(list(TRUE), n_regs)
      }
    },

    # take the clone of a parent RegressionClass (reg) for length(self$outvar) regressions
    # and set self to a single univariate k_i regression for outcome self$outvar[[k_i]]
    ChangeManyToOneRegresssion = function(k_i, reg) {
      assert_that(!missing(k_i))
      if (missing(reg)) stop("reg must be also specified when k_i is specified")
      assert_that(is.count(k_i))
      assert_that(k_i <= length(reg$outvar))

      n_regs <- length(reg$outvar)
      self$outvar.class <- reg$outvar.class[[k_i]] # Class of the outcome var: binary, categorical, continuous:
      self$outvar <- reg$outvar[[k_i]] # An outcome variable that is being modeled:

      if (self$reg_hazard) {
        # when modeling bin hazard indicators, no need to condition on previous outcomes as they will all be degenerate
        self$predvars <- reg$predvars # Regression covars (predictors):
      } else {
        self$predvars <- c(reg$outvar[-c(k_i:n_regs)], reg$predvars) # Regression covars (predictors):
      }

      # the subset is a list when RegressionClass specifies several regression models at once,
      # obtain the appropriate subset for this regression k_i and set it to self
      if (is.list(reg$subset_vars)) {
        self$subset_vars <- reg$subset_vars[[k_i]]
      }
      if (is.list(reg$intrvls)) {
        outvar_idx <- which(names(reg$intrvls) %in% self$outvar)
        self$intrvls <- reg$intrvls[[outvar_idx]]
      }
      self$S3class <- self$outvar.class # Set class on self for S3 dispatch...
      return(invisible(self))
    },

    resetS3class = function() class(self) <- c("RegressionClass", "R6")
  ),

  active = list(
    # For S3 dispatch on newsummarymodel():
    S3class = function(newclass) {
      if (!missing(newclass)) {
        if (length(newclass) > 1) stop("cannot set S3 class on RegressionClass with more than one outvar variable")
        if (length(class(self)) > 2) stop("S3 dispatch class on RegressionClass has already been set")
        class(self) <- c(class(self), newclass)
      } else {
        return(class(self))
      }
    },

    get.reg = function() {
      list(outvar.class = self$outvar.class,
          outvar = self$outvar,
          predvars = self$predvars,
          subset_vars = self$subset_vars)
    }
  )
)
	
	
## ---------------------------------------------------------------------
#' Generic R6 class for modeling (fitting and predicting) \eqn{P(A|W,E)} where \eqn{A} can be multivariate \eqn{(A[1], ..., A[M])}
#'
#'  \code{GenericModel} defines and models the conditional density \eqn{P(A=a|W=w,E=e)}, where \eqn{a} are generated under \eqn{g_star}
#'  or \eqn{g_0}. By calling \code{self$new(reg)}, it utilizes estimation options defined in \code{\link{RegressionClass}} class, and 
#'  automatically factorizes the multivariate conditional probability model \eqn{P(A|W,E)} into \eqn{M} univariate conditional probability 
#'  models (can be binary, categorical or continuous) and finally an entire tree of binary regression models (see details in
#'  \code{\link{tmleCommunity}}), where a new instance of \code{\link{BinaryOutModel}} class will be initialized for each binary 
#'  regression (If one outcome variable \eqn{A[m]} is already binary, then immediately call a new instance of \code{BinaryOutModel}). 
#'  By calling \code{self$fit(data)} and \code{self$predict(newdata)},  where \code{data} and \code{newdata}
#'  are \code{\link{DatKeepClass}} class objects, it fits \eqn{P(A|W,E)} and predicts \eqn{P(A=1|W=w,E=e)}, where values of \eqn{(w,e)}     
#'  are from \code{newdata}. Moreover, it predicts likelihood function \eqn{P(A=a| W=w,E=e)} through \code{self$predictAeqa(newdata)},   
#'  where \eqn{(a,w,e)} are from \code{newdata} (also a \code{DatKeepClass} class).
#'
#' @docType class
#' @format An \code{\link{R6Class}} generator object
#' @keywords R6 class
#' @details
#' \itemize{
#' \item{\code{n_regs}} - Total number of regression models.
#' \item{\code{parfit_allowed}} - Logical. If TRUE, allow parallel computing to fit multivariate outvar.
#' }
#' @section Methods:
#' \describe{
#'   \item{\code{new(reg, ...)}}{Use \code{reg} (a \code{\link{RegressionClass}} object) to instantiate an new object of \code{GenericModel}}
#'   \item{\code{length}}{Get the number of regression models (i.e., the number of exposure viarables)}
#'   \item{\code{getPsAsW.models}}{Get all model objects (one model object per outcome var \code{A[m]})}
#'   \item{\code{getcumprodAeqa}}{Get joint prob as a vector of the cumulative prod over j for \code{P(A[m]=a[m]|W,E)}}
#'   \item{\code{fit(data, savespace = TRUE)}}{...}
#'   \item{\code{copy.fit(Generic.Model)}}{...}
#'   \item{\code{predict(newdata, savespace = TRUE)}}{...}
#'   \item{\code{predictAeqa(newdata, savespace = TRUE, wipeProb = TRUE, ...)}}{...}
#' }
#' @section Active Bindings:
#' \describe{
#'   \item{\code{wipe.alldat(wipeProb = TRUE)}}{...}
#' }
#' @seealso \code{\link{DatKeepClass}}, \code{\link{RegressionClass}}, \code{\link{ContinModel}}, 
#'  \code{\link{CategorModel}}, \code{\link{BinaryOutModel}}
#' @example tests/examples/4_GenericModel_examples.R
#' @export
GenericModel <- R6Class(classname = "GenericModel",
  portable = TRUE,
  class = TRUE,
  public = list(
    n_regs = integer(),        # Total no. of reg. models (Nonparametric / logistic regressions)
    parfit_allowed = FALSE,    # Allow parallel fit of multivar outvar when 1) reg$parfit = TRUE & 2) all.outvar.bin = TRUE
    is.fitted = FALSE,         
    initialize = function(reg, ...) {
      self$n_regs <- length(reg$outvar) # Number of sep. logistic regressions to run
      all.outvar.bin <-  all(reg$outvar.class %in% gvars$sVartypes$bin)

      if (reg$parfit & all.outvar.bin & (self$n_regs > 1)) self$parfit_allowed <- TRUE

      if (gvars$verbose) {
        print("#----------------------------------------------------------------------------------")
        print("New instance of GenericModel:")
        print("#----------------------------------------------------------------------------------")
        print("Outcomes: " %+% paste(reg$outvar, collapse = ", "))
        print("Predictors: " %+% paste(reg$predvars, collapse = ", "))
        print("No. of regressions: " %+% self$n_regs)
        print("All outcomes binary? " %+% all.outvar.bin)
        if (self$parfit_allowed) print("Performing parallel fits: " %+% self$parfit_allowed)
        print("#----------------------------------------------------------------------------------")
      }

      # factorize the joint into univariate regressions, by dimensionality of the outcome variable (A_nms):
      for (k_i in 1:self$n_regs) {
	reg_i <- reg$clone()
	reg_i$ChangeManyToOneRegresssion(k_i, reg)
        # Calling the constructor for the summary model P(A[j]|\bar{A}[j-1], W}), dispatching on reg_i class
	regS3class <- reg_i$S3class
        if (is.null(regS3class)) {
          reg_i$S3class <- "generic"; class(reg_i$S3class) <- "generic"
        }
        PsAsW.model <- newsummarymodel(reg = reg_i, ...)
        private$PsAsW.models <- append(private$PsAsW.models, list(PsAsW.model))
        names(private$PsAsW.models)[k_i] <- "P(A|W)."%+%k_i
      }
      invisible(self)
    },
    length = function(){ base::length(private$PsAsW.models) },
    getPsAsW.models = function() { private$PsAsW.models },  # get all summary model objects (one model object per outcome var A[j])
    getcumprodAeqa = function() { private$cumprodAeqa },  # get joint prob as a vector of the cumulative prod over j for P(A[j]=a[j]|W)

    # -------------------------------------------------------------------------------------
    # Methods for fitting regression models - fit, copy.fit
    # -------------------------------------------------------------------------------------
    fit = function(data, savespace = TRUE) {  
      assert_that(is.DatKeepClass(data))
      # serial loop over all regressions in PsAsW.models:
      if (!self$parfit_allowed) {
        for (k_i in seq_along(private$PsAsW.models)) {
          private$PsAsW.models[[k_i]]$fit(data = data, savespace = savespace)
        }
        # parallel loop over all regressions in PsAsW.models:
      } else if (self$parfit_allowed) {
        val <- checkpkgs(pkgs=c("foreach", "doParallel", "matrixStats"))
        mcoptions <- list(preschedule = FALSE)
        # NOTE: Each fitRes[[k_i]] will contain a copy of every single R6 object that was passed by reference ->
        # *** the size of fitRes is 100x the size of private$PsAsW.models ***
        fitRes <- foreach::foreach(k_i = seq_along(private$PsAsW.models), .options.multicore = mcoptions) %dopar% {
          private$PsAsW.models[[k_i]]$fit(data = data, savespace = savespace)
        }
        # copy the fits one by one from BinOutModels above into private field for BinOutModels
        for (k_i in seq_along(private$PsAsW.models)) {
          private$PsAsW.models[[k_i]]$copy.fit(fitRes[[k_i]])
        }
      }
      invisible(self)
    },  

    # take fitted GenericModel class object as an input and copy all the model fits down the line
    copy.fit = function(Generic.Model) {
      assert_that("GenericModel" %in% class(Generic.Model))
      private$PsAsW.models <- Generic.Model$getPsAsW.models()
      self$is.fitted <- TRUE
      invisible(self)
    },      
	      
    # P(A=1|W=w): uses private$m.fit to generate predictions
    predict = function(newdata, savespace = TRUE) {
      if (missing(newdata)) stop("must provide newdata")
      assert_that(is.DatKeepClass(newdata))
      # serial loop over all regressions in PsAsW.models:
      if (!self$parfit_allowed) { 
        for (k_i in seq_along(private$PsAsW.models)) {
	  private$PsAsW.models[[k_i]]$predict(newdata = newdata, savespace = savespace)
	}
      # parallel loop over all regressions in PsAsW.models:
      } else if (self$parfit_allowed) {
        val <- checkpkgs(pkgs=c("foreach", "doParallel", "matrixStats"))
        mcoptions <- list(preschedule = FALSE)
        # NOTE: Each predRes[[k_i]] will contain a copy of every single R6 object that was passed by reference ->
        # *** the size of fitRes is 100x the size of private$PsAsW.models ***
        predRes <- foreach::foreach(k_i = seq_along(private$PsAsW.models), .options.multicore = mcoptions) %dopar% {
          private$PsAsW.models[[k_i]]$predict(newdata = newdata, savespace = savespace)
        }
        # copy the predictions one by one from BinOutModels above into private field for BinOutModels
        for (k_i in seq_along(private$PsAsW.models)) {
          private$PsAsW.models[[k_i]]$copy.predict(predRes[[k_i]])
        }
      }
      invisible(self)
    },

    # WARNING: This method cannot be chained together with other methods (s.a, class$predictAeqa()$fun())
    # Uses daughter objects (stored from prev call to fit()) to get predictions for P(A=obsdat.A|W=w)
    # Invisibly returns the joint probability P(A=a|W=w), also saves it as a private field "cumprodAeqa"
    # P(A=a|W=w) - calculating the likelihood for obsdat.A[i] (n vector of a's):
    predictAeqa = function(newdata, savespace = TRUE, wipeProb = TRUE, ...) {
      if (missing(newdata)) stop("must provide newdata")
      assert_that(is.DatKeepClass(newdata))
      n <- newdata$nobs
      if (!self$parfit_allowed) {
	cumprodAeqa <- rep.int(1L, n)
        # loop over all regressions in PsAsW.models:
	for (k_i in seq_along(private$PsAsW.models)) {
	  cumprodAeqa <- cumprodAeqa * private$PsAsW.models[[k_i]]$predictAeqa(newdata = newdata, savespace = savespace, wipeProb = wipeProb, ...)
	}
      } else if (self$parfit_allowed) {
        val <- checkpkgs(pkgs=c("foreach", "doParallel", "matrixStats"))
        mcoptions <- list(preschedule = TRUE)
        probAeqa_list <- foreach::foreach(k_i = seq_along(private$PsAsW.models), .options.multicore = mcoptions) %dopar% {
          private$PsAsW.models[[k_i]]$predictAeqa(newdata = newdata, savespace = savespace, wipeProb = wipeProb, ...)
        }
        # cbind_t <- system.time(
        probAeqa_mat <- do.call('cbind', probAeqa_list)
        # rowProds_t <- system.time(
        cumprodAeqa <- matrixStats::rowProds(probAeqa_mat)
      }
      private$cumprodAeqa <- cumprodAeqa
      return(cumprodAeqa)
    }
  ),
  
  active = list(
    # recursively call all saved daughter model fits and wipe out any traces of saved data
    wipe.alldat = function(wipeProb = TRUE) {
      for (k_i in seq_along(private$PsAsW.models)) {
	private$PsAsW.models[[k_i]]$wipe.alldat <- wipeProb
      }
      return(self)
    }
  ),

  private = list(
    deep_clone = function(name, value) {
      # if value is is an environment, quick way to copy:
      # list2env(as.list.environment(value, all.names = TRUE), parent = emptyenv())
      # if a list of R6 objects, make a deep copy of each:
      if (name == "PsAsW.models") {
        lapply(value, function(PsAsW.model) PsAsW.model$clone(deep=TRUE))
        # to check the value is an R6 object:
      } else if (inherits(value, "R6")) {
        value$clone(deep=TRUE)
      } else {
        value  # For all other fields, just return the value  
      }
    },
    PsAsW.models = list(),
    fitted.pbins = list(),
    cumprodAeqa = NULL
  )
)
	      

# ---------------------------------------- def_regs_subset ----------------------------------
# Purpose: Define subset evaluation for new bins:
# same code in ContinModel$new and CategorModel$new replaced with outside function:
# -------------------------------------------------------------------------------------------
def_regs_subset <- function(self) {
  bin_regs <- self$reg$clone() # instead of defining new RegressionClass now cloning parent reg object and then ADDING new SETTINGS
  bin_regs$reg_hazard <- TRUE # don't add degenerate bins as predictors in each binary regression
  
  if (!self$reg$pool_cont) {
    bin_regs$outvar.class <- as.list(rep_len(gvars$sVartypes$bin, self$nbins))
    bin_regs$outvar <- self$bin_nms 
    bin_regs$predvars <- self$reg$predvars
    bin_regs$subset_vars <- lapply(self$bin_nms, function(var) { c(var, self$reg$subset_vars)})
    names(bin_regs$subset_vars) <- names(bin_regs$outvar.class) <- self$bin_nms
    
  } else {  # Same but when pooling across bin indicators:
    bin_regs$outvar.class <- gvars$sVartypes$bin
    bin_regs$outvar <- self$outvar
    bin_regs$outvars_to_pool <- self$bin_nms
    if (gvars$verbose)  {
      print("pooled bin_regs$outvar: "); print(bin_regs$outvar)
      print("bin_regs$outvars_to_pool: "); print(bin_regs$outvars_to_pool)
      print("bin_regs$subset_vars: "); print(bin_regs$subset_vars)
    }
  }
  bin_regs$resetS3class()
  return(bin_regs)
}
	      
	      
## -------------------------------------------------------------------------------------------
#' R6 class for modeling (fitting and predicting) joint probability for a univariate continuous outcome \code{A[m]}
#'
#'  \code{ContinModel} inherits from \code{\link{GenericModel}} class, defining and modeling a joint conditional density 
#'  \eqn{P(A[m]|W,E,...)} where \code{A[m]} is univariate and continuous. By calling \code{self$new()}, \code{A[m]} will be 
#'  discretized into \code{nbins} bins via one of the 3 bin cutoff approaches (See Details for \code{\link{tmleCommunity}}). 
#'  By calling \code{self$fit()}, it fits hazard regressoin \code{Bin_A[m][k] ~ W + E} on \code{data} (a \code{\link{DatKeepClass}} 
#'  class), which is the hazard probaility of the the observation of \code{A[m]} belongs to bin \code{Bin_A[m][k]}, given covariates 
#'  \eqn{(W, E)} and that observation doesn't belong to any precedent bins \code{Bin_A[m][1]}, \code{Bin_A[m][2]}, ..., 
#'  \code{Bin_A[m][k-1]}.
#' 
#' @docType class
#' @format An \code{\link{R6Class}} generator object
#' @keywords R6 class
#' @details
#' \itemize{
#'  \item{\code{reg}} - .
#'  \item{\code{outvar}} - .
#'  \item{\code{nbins}} - 
#'  \item{\code{bin_nms}} - Character vector of column names of bin indicators.
#'  \item{\code{intrvls}} - 
#'  \item{\code{intrvls.width}} - 
#'  \item{\code{bin_weights}} - .
#' }
#' @section Methods:
#' \describe{
#'   \item{\code{new(reg, DataStorageClass.g0, DataStorageClass.gstar, ...)}}{Instantiate an new instance of \code{ContinModel} for a univariate continuous outcome \code{A[m]}}
#'   \item{\code{fit(data, savespace = TRUE)}}{...}
#'   \item{\code{predict(newdata, savespace = TRUE)}}{...}
#'   \item{\code{predictAeqa(newdata, savespace = TRUE, wipeProb = TRUE)}}{...}
#' }
#' @section Active Bindings:
#' \describe{
#'   \item{\code{cats}}{...}
#' }
#' @seealso \code{\link{DatKeepClass}}, \code{\link{RegressionClass}}, \code{\link{GenericModel}}, \code{\link{BinaryOutModel}}
#' @export
ContinModel <- R6Class(classname = "ContinModel",
  inherit = GenericModel,
  portable = TRUE,
  class = TRUE,
  public = list(
    reg = NULL,
    outvar = character(),        # The name of the continous outcome var (A[m])
    nbins = NULL,                # Actual nbins used, for cont. outvar
    bin_nms = character(),       # Column names for bin indicators
    intrvls = numeric(),         # Vector of numeric cutoffs defining the bins or a named list of numeric intervals 
	                         # (for length(self$outvar) > 1)
    intrvls.width = NULL,        # Named vector of bin-widths (bw_k : k=1,...,K) for each each bin in self$intrvls
    bin_weights = NULL,
    # Define settings for fitting contin A and then call $new for super class (GenericModel)
    initialize = function(reg, DatKeepClass.g0, DatKeepClass.gstar, ...) {
      self$reg <- reg
      self$outvar <- reg$outvar
 
      assert_that(is.DatKeepClass(DatKeepClass.g0))
      self$intrvls <- DatKeepClass.g0$detect.sVar.intrvls(reg$outvar,
                                                          nbins = self$reg$nbins,
                                                          bin_bymass = (getopt("bin.method") %in% "equal.mass"),
                                                          bin_bydhist = (getopt("bin.method") %in% "dhist"),
                                                          max_nperbin = as.integer(getopt("maxNperBin")))
      if (!missing(DatKeepClass.gstar)) {
        assert_that(is.DatKeepClass(DatKeepClass.gstar))
        gstar.intrvls <- DatKeepClass.gstar$detect.sVar.intrvls(reg$outvar,
                                                                nbins = self$reg$nbins,
                                                                bin_bymass = (getopt("bin.method") %in% "equal.mass"),
                                                                bin_bydhist = (getopt("bin.method") %in% "dhist"),
                                                                max_nperbin = as.integer(getopt("maxNperBin")))
        self$intrvls <- unique(sort(union(self$intrvls, gstar.intrvls)))
      }
        
      # Define the number of bins (no. of binary regressions to run),
      self$nbins <- self$reg$nbins <- length(self$intrvls) - 1
      # new outvar var names (bin names); all predvars remain unchanged;                                                    
      self$bin_nms <- DatKeepClass.g0$bin.nms.sVar(reg$outvar, self$nbins)
      # Save bin widths in reg class (naming the vector entries by bin names):
      self$intrvls.width <- diff(self$intrvls)
      self$intrvls.width[self$intrvls.width <= gvars$tolerr] <- 1
      self$reg$intrvls.width <- self$intrvls.width						  
      names(self$reg$intrvls.width) <- names(self$intrvls.width) <- self$bin_nms
      if (gvars$verbose)  {
        print("ContinModel outcome: " %+% self$outvar)
        print("ContinModel nbins: " %+% self$nbins)
      }
      bin_regs <- def_regs_subset(self = self)                                                          
      super$initialize(reg = bin_regs, ...)
    },

    # Transforms data for continous outcome to discretized bins A[j] -> BinsA[1], ..., BinsA[M] and calls $super$fit on that transformed data
    # Gets passed redefined subsets that exclude degenerate Bins (prev subset is defined for names in A - names have changed though)    
    fit = function(data, savespace = TRUE) {
      assert_that(is.DatKeepClass(data))
      # Binirizes & saves binned matrix inside DatNet.sWsA
      data$binirize.sVar(name.sVar = self$outvar, intervals = self$intrvls, nbins = self$nbins, bin.nms = self$bin_nms)
      if (gvars$verbose) {
        print("performing fitting for continuous outcome: " %+% self$outvar)
        print("freq counts by bin for continuous outcome: "); print(table(data$ord.sVar))
        print("binned dataset: "); print(head(cbind(data$ord.sVar, data$dat.bin.sVar), 5))
      }
      super$fit(data, savespace = savespace) # call the parent class fit method
      if (gvars$verbose) message("fit for outcome " %+% self$outvar %+% " succeeded...")
      if (savespace) data$emptydat.bin.sVar # wiping out binirized mat in data DatKeepClass object...
      if (savespace) self$wipe.alldat # wiping out all data traces in ContinModel...
      invisible(self)
    },
    # P(A=1|W=w): uses private$m.fit to generate predictions
    predict = function(newdata, savespace = TRUE) {
      if (gvars$verbose) print("performing prediction for continuous outcome: " %+% self$outvar)
      if (missing(newdata)) stop("must provide newdata")
      assert_that(is.DatKeepClass(newdata))   
      # mat_bin doesn't need to be saved (even though its invisibly returned); 
      # mat_bin is automatically saved in DatKeepClass - a potentially dangerous side-effect!!!
      newdata$binirize.sVar(name.sVar = self$outvar, intervals = self$intrvls, nbins = self$nbins, bin.nms = self$bin_nms)
      super$predict(newdata, savespace = savespace)
      if (savespace) newdata$emptydat.bin.sVar # wiping out binirized mat in newdata DatKeepClass object...
      invisible(self)
    },
    # Convert contin. A vector into matrix of binary cols, then call parent class method: super$predictAeqa()
    # Invisibly return cumm. prob P(A=a|W=w)
    predictAeqa = function(newdata, savespace = TRUE, wipeProb = TRUE) { # P(A=a|W=w) - calculating the likelihood for obsdat.A[i] (n vector of a's)
      if (gvars$verbose) print("performing prediction for continuous outcome: " %+% self$outvar)
      assert_that(is.DatKeepClass(newdata))
      newdata$binirize.sVar(name.sVar = self$outvar, intervals = self$intrvls, nbins = self$nbins, bin.nms = self$bin_nms)
      bws <- newdata$get.sVar.bw(name.sVar = self$outvar, intervals = self$intrvls)
      self$bin_weights <- (1 / bws) # weight based on 1 / (sVar bin widths)
      # Option 1: ADJUST FINAL PROB by bw.j TO OBTAIN density at a point f(sa|sw) = P(sA=sa|sW=sw):
      cumprodAeqa <- super$predictAeqa(newdata = newdata, savespace = savespace, wipeProb = wipeProb) * self$bin_weights
      # Alternative 2: ALso integrate the difference of sA value and its left most bin cutoff: x - b_{j-1} and pass it
      # This is done so that we can integrate the constant hazard all the way to the value of x:
        # * (1 - bw.j.sA_diff*(1/self$bin_weights)*probA1) (discrete)
        # * exp(-bw.j.sA_diff*(1/self$bin_weights)*probA1) (continuous)
      # bw.j.sA_diff <- newdata$get.sVar.bwdiff(name.sVar = self$outvar, intervals = self$intrvls)
      # cumprodAeqa <- super$predictAeqa(newdata = newdata, bw.j.sA_diff = bw.j.sA_diff) * self$bin_weights
      if (savespace) newdata$emptydat.bin.sVar # wiping out binirized mat in newdata object...
      if (savespace) self$bin_weights <- NULL # wiping out self$bin_weights...
      if (savespace) self$wipe.alldat <- wipeProb # wiping out all data traces in ContinModel...
      private$cumprodAeqa <- cumprodAeqa
      return(cumprodAeqa)
    }
  ),
  active = list(
    cats = function() {seq_len(self$nbins)}
  )
)


## ---------------------------------------------------------------------
#' R6 class for modeling (fitting and predicting) joint probability for a univariate categorical outcome \code{A[m]}
#'
#'  \code{CategorModel} inherits from \code{\link{GenericModel}} class, defining and modeling a conditional density \eqn{P(A[m]|W,E...)}
#'  where \code{A[m]} is univariate and categorical. By calling \code{self$new()}, \code{A[m]} will be redefined into number of bins 
#'  \code{length(levels)} (i.e., number of unique categories in \code{A[m]}). By calling \code{self$fit()}, it fits hazard regressoin
#'  \code{Bin_A[m][k] ~ W + E} on \code{data} (a \code{\link{DatKeepClass}} class), which is the hazard probaility of the observation 
#'  of A[m] belongs to bin \code{Bin_A[m][t]}, given covariates \eqn{(W, E)} and that observation doesn't belong to any precedent bins 
#'  \code{Bin_A[m][1]}, \code{Bin_A[m][2]}, ..., \code{Bin_A[m][k-1]}.
#'
#' @docType class
#' @format An \code{\link{R6Class}} generator object
#' @keywords R6 class
#' @details
#' \itemize{
#' \item{\code{reg}} - .
#' \item{\code{outvar}} - .
#' \item{\code{levels}} - Numeric vector of all unique categories in outcome outvar. 
#' \item{\code{nbins}} - .
#' \item{\code{bin_nms}} - .
#' }
#' @section Methods:
#' \describe{
#'   \item{\code{new(reg, DatKeepClass.g0, ...)}}{Instantiate an new instance of \code{CategorModel} for a univariate categorical outcome \code{A[m]}}
#'   \item{\code{fit(data)}}{...}
#'   \item{\code{predict(newdata)}}{...}
#'   \item{\code{predictAeqa(newdata)}}{...}
#' }
#' @section Active Bindings:
#' \describe{
#'   \item{\code{cats}}{...}
#' }
#' @seealso \code{\link{DatKeepClass}}, \code{\link{RegressionClass}}, \code{\link{GenericModel}}, \code{\link{BinaryOutModel}}
#' @export
CategorModel <- R6Class(classname = "CategorModel",
  inherit = GenericModel,
  portable = TRUE,
  class = TRUE,
  public = list(
    reg = NULL,
    outvar = character(),     # the name of the categorical outcome var (A[m])
    levels = numeric(),       # all unique values for A[j] sorted in increasing order
    nbins = integer(),
    bin_nms = character(),
    # Define settings for fitting cat sA and then call $new for super class (SummariesModel)
    initialize = function(reg, DatKeepClass.g0, ...) {
      self$reg <- reg
      self$outvar <- reg$outvar
      # Define the number of bins (no. of binary regressions to run) based on number of unique levels for categorical sVar:
      # all predvars remain unchanged
      assert_that(is.DatKeepClass(DatKeepClass.g0))
      self$levels <- DatKeepClass.g0$detect.cat.sVar.levels(reg$outvar)

      self$nbins <- self$reg$nbins <- length(self$levels)
      self$bin_nms <- DatKeepClass.g0$bin.nms.sVar(reg$outvar, self$nbins)
      if (gvars$verbose)  {
        print("CategorSummaryModel outcome: "%+%self$outvar)
        print("CategorSummaryModel levels: "); print(self$levels)
      }
      bin_regs <- def_regs_subset(self = self)
      super$initialize(reg = bin_regs, ...)
    },
    # Transforms data for categorical outcome to bin indicators A[m] -> BinA[1], ..., BinA[K] and calls $super$fit on that transformed data
    # Gets passed redefined subsets that exclude degenerate Bins (prev subset is defined for names in A - names have changed though)
    fit = function(data, savespace = TRUE) {
      assert_that(is.DatKeepClass(data))
      # Binirizes & saves binned matrix inside DatKeepClass for categorical sVar
      data$binirize.cat.sVar(name.sVar = self$outvar, levels = self$levels)
      if (gvars$verbose) {
        print("performing fitting for categorical outcome: " %+% self$outvar)
        print("freq counts by bin for categorical outcome: "); print(table(data$get.sVar(self$outvar)))
        print("binned dataset: "); print(head(cbind(sA = data$get.sVar(self$outvar), data$dat.bin.sVar), 5))
      }
      super$fit(data, savespace = savespace) # call the parent class fit method
      if (gvars$verbose) message("fit for " %+% self$outvar %+% " var succeeded...")
      if (savespace) data$emptydat.bin.sVar # wiping out binirized mat in data DatKeepClass object...
      # self$wipe.alldat # wiping out all data traces in CategorModel...
      invisible(self)
    },

    # P(A=1|W=w): uses private$m.fit to generate predictions
    predict = function(newdata, savespace = TRUE) {
      if (gvars$verbose) print("performing prediction for categorical outcome: " %+% self$outvar)
      if (missing(newdata)) stop("must provide newdata")  
      assert_that(is.DatKeepClass(newdata))
      newdata$binirize.cat.sVar(name.sVar = self$outvar, levels = self$levels)
      super$predict(newdata, savespace = savespace)
      if (savespace) newdata$emptydat.bin.sVar # wiping out binirized mat in newdata DatKeepClass object...
      invisible(self)
    },

    # Invisibly return cumm. prob P(A=a|W=w)
    # P(A=a|W=w) - calculating the likelihood for obsdat.A[m] (n vector of a's):
    predictAeqa = function(newdata, savespace = TRUE, wipeProb = TRUE) {
      if (gvars$verbose) print("performing prediction for categorical outcome: " %+% self$outvar)
      assert_that(is.DatKeepClass(newdata))
      newdata$binirize.cat.sVar(name.sVar = self$outvar, levels = self$levels)
      cumprodAeqa <- super$predictAeqa(newdata = newdata, savespace = savespace, wipeProb = wipeProb)
      if (savespace) newdata$emptydat.bin.sVar # wiping out binirized mat in newdata object...
      # self$wipe.alldat # wiping out all data traces in CategorModel...
      private$cumprodAeqa <- cumprodAeqa
      return(cumprodAeqa)
    }
  ),
  active = list(
    cats = function() {seq_len(self$nbins)}
  )
)
chizhangucb/tmleCommunity documentation built on May 20, 2019, 3:34 p.m.