Nothing
#----------------------------------------------------------------------------------
# Classes that control modelling of the multivariate joint probability model P(sA|sW).
#----------------------------------------------------------------------------------
# **********
# TO DO (ContinSummaryModel)
# x) (low priority) see how to generalize to pooled fits, k-specific fits, etc (use subset definitions + ?)
# **********
# TO DO (ContinSummaryModel, binirize)
# * (***BUG***) Currently make.bins_mtx_1 fails on binary A with automatic bin detection (all values are placed in last (2nd) bin)
# * Implement regression for categorical outvar: the process is identical to contin, except that normalize, define.intervals() & discretize() is skipped
# * Need to test that make.bins_mtx_1 will do the right thing when x_cat is (0, ..., ncats) instead of (1, ..., ncats) (IT SHOULD WORK)
#' @importFrom assertthat assert_that
# Generic S3 constructor for the summary model classes:
newsummarymodel <- function(reg, DatNet.sWsA.g0, ...) { UseMethod("newsummarymodel") }
# Summary model constructor for binary outcome sA[j]:
newsummarymodel.binary <- function(reg, ...) {
# if (gvars$verbose) print("Calling BinOutModel constructor for binary outcome: " %+% reg$outvar)
BinOutModel$new(reg = reg, ...)
}
# Summary model constructor for continuous outcome sA[j]:
newsummarymodel.contin <- function(reg, DatNet.sWsA.g0, ...) {
# if (gvars$verbose) print("Calling ContinSummaryModel constructor for continuous outcome:" %+% reg$outvar)
ContinSummaryModel$new(reg = reg, DatNet.sWsA.g0 = DatNet.sWsA.g0, ...)
}
# Summary model constructor for categorical outcome sA[j]:
newsummarymodel.categor <- function(reg, DatNet.sWsA.g0, ...) {
# if (gvars$verbose) print("Calling CategorSummaryModel constructor for categorical outcome: " %+% reg$outvar)
CategorSummaryModel$new(reg = reg, DatNet.sWsA.g0 = DatNet.sWsA.g0, ...)
}
## ---------------------------------------------------------------------
#' R6 class that defines regression models evaluating P(sA|sW), for summary measures (sW,sA)
#'
#' This R6 class defines fields and methods that controls all the parameters for non-parametric
#' modeling and estimation of multivariate joint conditional probability model \code{P(sA|sW)} for summary measures \code{(sA,sW)}.
#' Note that \code{sA} can be multivariate and any component of \code{sA[j]} can be either binary, categorical or continuous.
#' The joint probability for \code{P(sA|sA)} = \code{P(sA[1],...,sA[k]|sA)} is first factorized as
#' \code{P(sA[1]|sA)} * \code{P(sA[2]|sA, sA[1])} * ... * \code{P(sA[k]|sA, sA[1],...,sA[k-1])},
#' where each of these conditional probability models is defined by a new instance of a \code{\link{SummariesModel}} class
#' (and a corresponding instance of the \code{RegressionClass} class).
#' If \code{sA[j]} is binary, the conditional probability \code{P(sA[j]|sW,sA[1],...,sA[j-1])} is evaluated via logistic regression model.
#' When \code{sA[j]} is continuous (or categorical), its estimation will be controlled by a new instance of
#' the \code{\link{ContinSummaryModel}} class (or the \code{\link{CategorSummaryModel}} class), as well as the accompanying new instance of the
#' \code{RegressionClass} class. The range of continuous \code{sA[j]} will be fist partitioned into \code{K} bins and the corresponding \code{K}
#' bin indicators (\code{B_1,...,B_K}), with \code{K} new instances of \code{\link{SummariesModel}} class, each instance defining a
#' single logistic regression model for one binary bin indicator outcome \code{B_j} and predictors (\code{sW, sA[1],...,sA[k-1]}).
#' Thus, the first instance of \code{RegressionClass} and \code{SummariesModel} classes will automatically
#' spawn recursive calls to new instances of these classes until the entire tree of binary logistic regressions that defines
#' the joint probability \code{P(sA|sW)} is build.
#'
#' @docType class
#' @format An \code{\link{R6Class}} generator object
#' @keywords R6 class
#' @details
#' \itemize{
#' \item{\code{outvar.class}} - Character vector indicating a class of each outcome var: \code{bin} / \code{cont} / \code{cat}.
#' \item{\code{outvar}} - Character vector of regression outcome variable names.
#' \item{\code{predvars}} - Either a pool of all character predictors (\code{sW}) or regression-specific predictor names.
#' \item{{reg_hazard}} - Logical, if TRUE, the joint probability model P(outvar | predvars) is factorized as
#' \\prod_{j}{P(outvar[j] | predvars)} for each j outvar (for fitting hazard).
#' \item{\code{subset}} - Subset expression (later evaluated to logical vector in the envir of the data).
#' \item{\code{ReplMisVal0}} - Logical, if TRUE all gvars$misval among predicators are replaced with with gvars$misXreplace (0).
#' \item{\code{nbins}} - Integer number of bins used for a continuous outvar, the intervals are defined inside
#' \code{ContinSummaryModel$new()} and then saved in this field.
#' \item{\code{bin_nms}} - Character vector of column names for bin indicators.
#' \item{\code{useglm}} - Logical, if TRUE then fit the logistic regression model using \code{\link{glm.fit}},
#' if FALSE use \code{\link{speedglm.wfit}}..
#' \item{\code{parfit}} - Logical, if TRUE then use parallel \code{foreach::foreach} loop to fit and predict binary logistic
#' regressions (requires registering back-end cluster prior to calling the fit/predict functions)..
#' \item{\code{bin_bymass}} - Logical, for continuous outvar, create bin cutoffs based on equal mass distribution.
#' \item{\code{bin_bydhist}} - Logical, if TRUE, use dhist approach for bin definitions. See Denby and Mallows "Variations on the
#' Histogram" (2009)) for more..
#' \item{\code{max_nperbin}} - Integer, maximum number of observations allowed per one bin.
#' \item{\code{pool_cont}} - Logical, pool binned continuous outvar observations across bins and only fit only regression model
#' across all bins (adding bin_ID as an extra covaraite)..
#' \item{\code{outvars_to_pool}} - Character vector of names of the binned continuous outvars, should match \code{bin_nms}.
#' \item{\code{intrvls.width}} - Named numeric vector of bin-widths (\code{bw_j : j=1,...,M}) for each each bin in \code{self$intrvls}.
#' When \code{sA} is not continuous, \code{intrvls.width} IS SET TO 1. When sA is continuous and this variable \code{intrvls.width}
#' is not here, the intervals are determined inside \code{ContinSummaryModel$new()} and are assigned to this variable as a list,
#' with \code{names(intrvls.width) <- reg$bin_nms}. Can be queried by \code{BinOutModel$predictAeqa()} as: \code{intrvls.width[outvar]}.
#' \item{\code{intrvls}} - Numeric vector of cutoffs defining the bins or a named list of numeric intervals for \code{length(self$outvar) > 1}.
#' \item{\code{cat.levels}} - Numeric vector of all unique values in categorical outcome variable.
#' Set by \code{\link{CategorSummaryModel}} constructor.
#' }
#' @section Methods:
#' \describe{
#' \item{\code{new(outvar.class = gvars$sVartypes$bin,
#' outvar, predvars, subset, intrvls,
#' ReplMisVal0 = TRUE,
#' useglm = getopt("useglm"),
#' parfit = getopt("parfit"),
#' nbins = getopt("nbins"),
#' bin_bymass = getopt("bin.method")%in%"equal.mass",
#' bin_bydhist = getopt("bin.method")%in%"dhist",
#' max_nperbin = getopt("maxNperBin"),
#' pool_cont = getopt("poolContinVar")}}{Uses the arguments to instantiate an object of R6 class and define the future regression model.}
#' \item{\code{ChangeManyToOneRegresssion(k_i, reg)}}{ Take a clone of a parent \code{RegressionClass} (\code{reg}) for \code{length(self$outvar)} regressions
#' and set self to a single univariate \code{k_i} regression for outcome \code{self$outvar[[k_i]]}.}
#' \item{\code{ChangeOneToManyRegresssions(regs_list)}}{ Take the clone of a parent \code{RegressionClass} for univariate (continuous outvar) regression
#' and set self to \code{length(regs_list)} bin indicator outcome regressions.}
#' \item{\code{resetS3class()}}{...}
#' }
#' @section Active Bindings:
#' \describe{
#' \item{\code{S3class}}{...}
#' \item{\code{get.reg}}{...}
#' }
#' @export
RegressionClass <- R6Class("RegressionClass",
class = TRUE,
portable = TRUE,
public = list(
outvar.class = character(), # vector for classes of the outcome vars: bin / cont / cat
outvar = character(), # vector of regression outcome variable names
predvars = character(), # either a pool of all predictors (sW) or regression-specific predictor names
reg_hazard = FALSE, # If TRUE, the joint P(outvar|predvars) is factorized as \prod_{j}{P(outvar[j] | predvars)} for each j outvar (for fitting hazard)
subset = NULL, # subset expression (later evaluated to logical vector in the envir of the data)
ReplMisVal0 = TRUE, # if TRUE all gvars$misval among predicators are replaced with with gvars$misXreplace (0)
nbins = NULL, # actual nbins used, for cont. outvar, defined in ContinSummaryModel$new()
bin_nms = NULL, # column names for bin indicators
useglm = logical(), # TRUE to fit reg with glm.fit(), FALSE to fit with speedglm.wfit
parfit = logical(), # TRUE for fitting binary regressions in parallel
bin_bymass = logical(), # for cont outvar, create bin cutoffs based on equal mass distribution?
bin_bydhist = logical(), # if TRUE, use dhist approach for bin definitions
max_nperbin = integer(), # maximum n observations allowed per binary bin
pool_cont = logical(), # Pool binned cont outvar obs into long format (adding bin_ID as a covaraite)
outvars_to_pool = character(), # Names of the binned continuous sVars, should match bin_nms
intrvls.width = 1L, # Named vector of bin-widths (bw_j : j=1,...,M) for each each bin in self$intrvls
# When sA is not continuous, intrvls.width IS SET TO 1.
# When sA is continuous, intrvls.width is SET TO self$intrvls.width INSIDE ContinSummaryModel$new() with names(intrvls.width) <- reg$bin_nms
# CAN BE QUERIED BY BinOutModel$predictAeqa() as: intrvls.width[outvar]
intrvls = numeric(), # Vector of numeric cutoffs defining the bins or a named list of numeric intervals (for length(self$outvar) > 1)
levels = NULL,
# family = NULL, # (NOT IMPLEMENTED) to run w/ other than "binomial" family
# form = NULL, # (NOT IMPLEMENTED) reg formula, if provided run using the usual glm / speedglm functions
initialize = function(outvar.class = gvars$sVartypes$bin,
outvar, predvars, subset, intrvls,
ReplMisVal0 = TRUE, # Needed to add ReplMisVal0 = TRUE for case sA = (netA, sA[j]) with sA[j] continuous, was causing an error otherwise:
useglm = getopt("useglm"),
parfit = getopt("parfit"),
nbins = getopt("nbins"),
bin_bymass = getopt("bin.method")%in%"equal.mass",
bin_bydhist = getopt("bin.method")%in%"dhist",
max_nperbin = getopt("maxNperBin"),
pool_cont = getopt("poolContinVar")
) {
assert_that(length(outvar.class) == length(outvar))
self$outvar.class <- outvar.class
self$outvar <- outvar
self$predvars <- predvars
self$ReplMisVal0 <- ReplMisVal0
self$useglm <- useglm
self$parfit <- parfit
self$nbins <- nbins
self$bin_bymass <- bin_bymass
self$bin_bydhist <- bin_bydhist
self$max_nperbin <- max_nperbin
self$pool_cont <- pool_cont
n_regs <- length(outvar)
if (!missing(intrvls)) {
assert_that(is.list(intrvls))
assert_that(length(outvar) == length(intrvls))
assert_that(all(names(intrvls) %in% outvar))
self$intrvls <- intrvls
} else {
self$intrvls <- NULL
}
self$levels <- NULL
if (!missing(subset)) {
self$subset <- subset
if (length(subset) < n_regs) {
self$subset <- rep_len(subset, n_regs)
} else if (length(subset) > n_regs) {
# ... TO FINISH ...
if (!is.logical(subset)) stop("not implemented")
# increase n_regs to all combinations of (n_regs x subset)
}
} else {
self$subset <- rep_len(list(TRUE), n_regs)
}
},
# take the clone of a parent RegressionClass (reg) for length(self$outvar) regressions
# and set self to a single univariate k_i regression for outcome self$outvar[[k_i]]
ChangeManyToOneRegresssion = function(k_i, reg) {
assert_that(!missing(k_i))
if (missing(reg)) stop("reg must be also specified when k_i is specified")
assert_that(is.count(k_i))
assert_that(k_i <= length(reg$outvar))
n_regs <- length(reg$outvar)
self$outvar.class <- reg$outvar.class[[k_i]] # Class of the outcome var: binary, categorical, continuous:
self$outvar <- reg$outvar[[k_i]] # An outcome variable that is being modeled:
if (self$reg_hazard) {
# when modeling bin hazard indicators, no need to condition on previous outcomes as they will all be degenerate
self$predvars <- reg$predvars # Regression covars (predictors):
} else {
self$predvars <- c(reg$outvar[-c(k_i:n_regs)], reg$predvars) # Regression covars (predictors):
}
# the subset is a list when RegressionClass specifies several regression models at once,
# obtain the appropriate subset for this regression k_i and set it to self
if (is.list(reg$subset)) {
self$subset <- reg$subset[[k_i]]
}
if (is.list(reg$intrvls)) {
outvar_idx <- which(names(reg$intrvls) %in% self$outvar)
self$intrvls <- reg$intrvls[[outvar_idx]]
}
self$S3class <- self$outvar.class # Set class on self for S3 dispatch...
return(invisible(self))
},
# take the clone of a parent RegressionClass for univariate (cont outvar) regression
# and set self to length(regs_list) bin indicator outcome regressions
ChangeOneToManyRegresssions = function(regs_list) {
self$outvar.class <- regs_list$outvar.class # Vector of class(es) of outcome var(s): binary, categorical, continuous
self$outvar <- regs_list$outvar # An outcome variable that is being modeled:
self$predvars <- regs_list$predvars
self$subset <- regs_list$subset
return(invisible(self))
},
resetS3class = function() class(self) <- c("RegressionClass", "R6")
),
active = list(
# For S3 dispatch on newsummarymodel():
S3class = function(newclass) {
if (!missing(newclass)) {
if (length(newclass) > 1) stop("cannot set S3 class on RegressionClass with more than one outvar variable")
if (length(class(self)) > 2) stop("S3 dispatch class on RegressionClass has already been set")
class(self) <- c(class(self), newclass)
} else {
return(class(self))
}
},
get.reg = function() {
list(outvar.class = self$outvar.class,
outvar = self$outvar,
predvars = self$predvars,
subset = self$subset)
}
)
)
## ---------------------------------------------------------------------
#' R6 class for fitting and predicting model P(sA|sW) under g.star or g.0
#'
#' This R6 class Class for defining, fitting and predicting the probability model
#' \code{P(sA|sW)} under \code{g_star} or under \code{g_0} for summary measures
#' (\code{sW,sA}). Defines and manages the factorization of the multivariate conditional
#' probability model \code{P(sA=sa|...)} into univariate regression models
#' \code{sA[j] ~ sA[j-1] + ... + sA[1] + sW}. The class \code{self$new} method automatically
#' figures out the correct joint probability factorization into univariate conditional
#' probabilities based on name ordering provided by (\code{sA_nms}, \code{sW_nms}).
#' When the outcome variable \code{sA[j]} is binary, this class will automatically call
#' a new instance of \code{\link{BinOutModel}} class.
#' Provide \code{self$fit()} function argument \code{data} as a \code{\link{DatNet.sWsA}} class object.
#' This data will be used for fitting the model \code{P(sA|sW)}.
#' Provide \code{self$fit()} function argument \code{newdata} (also as \code{DatNet.sWsA} class) for predictions of the type
#' \code{P(sA=1|sW=sw)}, where \code{sw} values are coming from \code{newdata} object.
#' Finally, provide \code{self$predictAeqa} function \code{newdata} argument
#' (also \code{DatNet.sWsA} class object) for getting the likelihood predictions \code{P(sA=sa|sW=sw)}, where
#' both, \code{sa} and \code{sw} values are coming from \code{newdata} object.
#'
#' @docType class
#' @format An \code{\link{R6Class}} generator object
#' @keywords R6 class
#' @details
#' \itemize{
#' \item{\code{n_regs}} - .
#' \item{\code{parfit_allowed}} - .
#' }
#' @section Methods:
#' \describe{
#' \item{\code{new(reg, ...)}}{...}
#' \item{\code{length}}{...}
#' \item{\code{getPsAsW.models}}{...}
#' \item{\code{getcumprodAeqa}}{...}
#' \item{\code{copy.fit(SummariesModel)}}{...}
#' \item{\code{fit(data)}}{...}
#' \item{\code{predict(newdata)}}{...}
#' \item{\code{predictAeqa(newdata, ...)}}{...}
#' }
#' @section Active Bindings:
#' \describe{
#' \item{\code{wipe.alldat}}{...}
#' }
#' @export
SummariesModel <- R6Class(classname = "SummariesModel",
portable = TRUE,
class = TRUE,
public = list(
n_regs = integer(), # total no. of reg. models (logistic regressions)
parfit_allowed = FALSE, # allow parallel fit of multivar outvar when 1) reg$parfit = TRUE & 2) all.outvar.bin = TRUE
initialize = function(reg, ...) {
self$n_regs <- length(reg$outvar) # Number of sep. logistic regressions to run
all.outvar.bin <- all(reg$outvar.class %in% gvars$sVartypes$bin)
if (reg$parfit & all.outvar.bin & (self$n_regs > 1)) self$parfit_allowed <- TRUE
if (gvars$verbose) {
print("#----------------------------------------------------------------------------------")
print("New instance of SummariesModel:")
print("#----------------------------------------------------------------------------------")
print("Outcomes: " %+% paste(reg$outvar, collapse = ", "))
print("Predictors: " %+% paste(reg$predvars, collapse = ", "))
print("No. of regressions: " %+% self$n_regs)
print("All outcomes binary? " %+% all.outvar.bin)
if (self$parfit_allowed) print("Performing parallel fits: " %+% self$parfit_allowed)
print("#----------------------------------------------------------------------------------")
}
# factorize the joint into univariate regressions, by dimensionality of the outcome variable (sA_nms):
for (k_i in 1:self$n_regs) {
reg_i <- reg$clone()
reg_i$ChangeManyToOneRegresssion(k_i, reg)
# Calling the constructor for the summary model P(sA[j]|\bar{sA}[j-1], sW}), dispatching on reg_i class
PsAsW.model <- newsummarymodel(reg = reg_i, ...)
private$PsAsW.models <- append(private$PsAsW.models, list(PsAsW.model))
names(private$PsAsW.models)[k_i] <- "P(sA|sW)."%+%k_i
}
invisible(self)
},
length = function(){ base::length(private$PsAsW.models) },
getPsAsW.models = function() { private$PsAsW.models }, # get all summary model objects (one model object per outcome var sA[j])
getcumprodAeqa = function() { private$cumprodAeqa }, # get joint prob as a vector of the cumulative prod over j for P(sA[j]=a[j]|sW)
# # **********************************************************************
# # TO DO: Add $copy.fit(SummariesModel) method that will propagate copy all the model fits down the line
# copy.fit = function(SummariesModel) {},
# # **********************************************************************
fit = function(data) {
assert_that(is.DatNet.sWsA(data))
# serial loop over all regressions in PsAsW.models:
if (!self$parfit_allowed) {
for (k_i in seq_along(private$PsAsW.models)) {
private$PsAsW.models[[k_i]]$fit(data = data)
}
# parallel loop over all regressions in PsAsW.models:
} else if (self$parfit_allowed) {
val <- checkpkgs(pkgs=c("foreach", "doParallel", "matrixStats"))
mcoptions <- list(preschedule = FALSE)
# NOTE: Each fitRes[[k_i]] will contain a copy of every single R6 object that was passed by reference ->
# *** the size of fitRes is 100x the size of private$PsAsW.models ***
fitRes <- foreach::foreach(k_i = seq_along(private$PsAsW.models), .options.multicore = mcoptions) %dopar% {
private$PsAsW.models[[k_i]]$fit(data = data)
}
# copy the fits one by one from BinOutModels above into private field for BinOutModels
for (k_i in seq_along(private$PsAsW.models)) {
private$PsAsW.models[[k_i]]$copy.fit(fitRes[[k_i]])
}
}
invisible(self)
},
# TO DO rename to:
# predictP_1 = function(newdata) { # P(A^s=1|W^s=w^s): uses private$m.fit to generate predictions
predict = function(newdata) {
if (missing(newdata)) {
stop("must provide newdata")
}
assert_that(is.DatNet.sWsA(newdata))
# serial loop over all regressions in PsAsW.models:
if (!self$parfit_allowed) {
for (k_i in seq_along(private$PsAsW.models)) {
private$PsAsW.models[[k_i]]$predict(newdata = newdata)
}
# parallel loop over all regressions in PsAsW.models:
} else if (self$parfit_allowed) {
val <- checkpkgs(pkgs=c("foreach", "doParallel", "matrixStats"))
mcoptions <- list(preschedule = FALSE)
# NOTE: Each predRes[[k_i]] will contain a copy of every single R6 object that was passed by reference ->
# *** the size of fitRes is 100x the size of private$PsAsW.models ***
predRes <- foreach::foreach(k_i = seq_along(private$PsAsW.models), .options.multicore = mcoptions) %dopar% {
private$PsAsW.models[[k_i]]$predict(newdata = newdata)
}
# copy the predictions one by one from BinOutModels above into private field for BinOutModels
for (k_i in seq_along(private$PsAsW.models)) {
private$PsAsW.models[[k_i]]$copy.predict(predRes[[k_i]])
}
}
invisible(self)
},
# WARNING: This method cannot be chained together with other methods (s.a, class$predictAeqa()$fun())
# Uses daughter objects (stored from prev call to fit()) to get predictions for P(sA=obsdat.sA|sW=sw)
# Invisibly returns the joint probability P(sA=sa|sW=sw), also saves it as a private field "cumprodAeqa"
# P(A^s=a^s|W^s=w^s) - calculating the likelihood for obsdat.sA[i] (n vector of a's):
predictAeqa = function(newdata, ...) {
assert_that(!missing(newdata))
assert_that(is.DatNet.sWsA(newdata))
n <- newdata$nobs
if (!self$parfit_allowed) {
cumprodAeqa <- rep.int(1, n)
# loop over all regressions in PsAsW.models:
for (k_i in seq_along(private$PsAsW.models)) {
cumprodAeqa <- cumprodAeqa * private$PsAsW.models[[k_i]]$predictAeqa(newdata = newdata, ...)
}
} else if (self$parfit_allowed) {
val <- checkpkgs(pkgs=c("foreach", "doParallel", "matrixStats"))
mcoptions <- list(preschedule = TRUE)
probAeqa_list <- foreach::foreach(k_i = seq_along(private$PsAsW.models), .options.multicore = mcoptions) %dopar% {
private$PsAsW.models[[k_i]]$predictAeqa(newdata = newdata, ...)
}
# cbind_t <- system.time(
probAeqa_mat <- do.call('cbind', probAeqa_list)
# )
# rowProds_t <- system.time(
cumprodAeqa <- matrixStats::rowProds(probAeqa_mat)
# )
}
private$cumprodAeqa <- cumprodAeqa
return(cumprodAeqa)
}
),
active = list(
# recursively call all saved daughter model fits and wipe out any traces of saved data
wipe.alldat = function() {
for (k_i in seq_along(private$PsAsW.models)) {
private$PsAsW.models[[k_i]]$wipe.alldat
}
return(self)
}
),
private = list(
deep_clone = function(name, value) {
# if value is is an environment, quick way to copy:
# list2env(as.list.environment(value, all.names = TRUE), parent = emptyenv())
# if a list of R6 objects, make a deep copy of each:
if (name == "PsAsW.models") {
lapply(value, function(PsAsW.model) PsAsW.model$clone(deep=TRUE))
# to check the value is an R6 object:
} else if (inherits(value, "R6")) {
value$clone(deep=TRUE)
} else {
# For all other fields, just return the value
value
}
},
PsAsW.models = list(),
fitted.pbins = list(),
cumprodAeqa = NULL
)
)
# -------------------------------------------------------------------------------------------
# same code in ContinSummaryModel$new and CategorSummaryModel$new replaced with outside function:
# Define subset evaluation for new bins:
# -------------------------------------------------------------------------------------------
def_regs_subset <- function(self) {
bin_regs <- self$reg$clone() # instead of defining new RegressionClass now cloning parent reg object and then ADDING new SETTINGS
bin_regs$reg_hazard <- TRUE # don't add degenerate bins as predictors in each binary regression
if (!self$reg$pool_cont) {
add.oldsubset <- TRUE
new.subsets <- lapply(self$reg$bin_nms,
function(var) {
res <- var
if (add.oldsubset) res <- c(res, self$reg$subset)
res
})
new.sAclass <- as.list(rep_len(gvars$sVartypes$bin, self$reg$nbins))
names(new.sAclass) <- self$reg$bin_nms
bin_regs$ChangeOneToManyRegresssions(regs_list = list(outvar.class = new.sAclass,
outvar = self$reg$bin_nms,
predvars = self$reg$predvars,
subset = new.subsets))
# Same but when pooling across bin indicators:
} else {
bin_regs$outvar.class <- gvars$sVartypes$bin
bin_regs$outvar <- self$outvar
bin_regs$outvars_to_pool <- self$reg$bin_nms
if (gvars$verbose) {
print("pooled bin_regs$outvar: "); print(bin_regs$outvar)
print("bin_regs$outvars_to_pool: "); print(bin_regs$outvars_to_pool)
print("bin_regs$subset: "); print(bin_regs$subset)
}
}
bin_regs$resetS3class()
return(bin_regs)
}
# -------------------------------------------------------------------------------------------
## ---------------------------------------------------------------------
#' R6 class for fitting and predicting joint probability for a univariate continuous summary measure sA[j]
#'
#' This R6 class defines and fits a conditional probability model \code{P(sA[j]|sW,...)} for a univariate
#' continuous summary measure \code{sA[j]}. This class inherits from \code{\link{SummariesModel}} class.
#' Defines the fitting algorithm for a regression model \code{sA[j] ~ sW + ...}.
#' Reconstructs the likelihood \code{P(sA[j]=sa[j]|sW,...)} afterwards.
#' Continuous \code{sA[j]} is discretized using either of the 3 interval cutoff methods,
#' defined via \code{\link{RegressionClass}} object \code{reg} passed to this class constructor.
#' The fitting algorithm estimates the binary regressions for hazard \code{Bin_sA[j][i] ~ sW},
#' i.e., the probability that continuous \code{sA[j]} falls into bin \code{i}, \code{Bin_sA[j]_i},
#' given that \code{sA[j]} does not belong to any prior bins \code{Bin_sA[j]_1, ..., Bin_sA[j]_{i-1}}.
#' The dataset of discretized summary measures (\code{BinsA[j]_1,...,BinsA[j]_M}) is created
#' inside the passed \code{data} or \code{newdata} object while discretizing \code{sA[j]} into \code{M} bins.
#'
#' @docType class
#' @format An \code{\link{R6Class}} generator object
#' @keywords R6 class
#' @details
#' \itemize{
#' \item{\code{reg}} - .
#' \item{\code{outvar}} - .
#' \item{\code{intrvls}} - .
#' \item{\code{intrvls.width}} - .
#' \item{\code{bin_weights}} - .
#' }
#' @section Methods:
#' \describe{
#' \item{\code{new(reg, DatNet.sWsA.g0, DatNet.sWsA.gstar, ...)}}{...}
#' \item{\code{fit(data)}}{...}
#' \item{\code{predict(newdata)}}{...}
#' \item{\code{predictAeqa(newdata)}}{...}
#' }
#' @section Active Bindings:
#' \describe{
#' \item{\code{cats}}{...}
#' }
#' @export
ContinSummaryModel <- R6Class(classname = "ContinSummaryModel",
inherit = SummariesModel,
portable = TRUE,
class = TRUE,
public = list(
reg = NULL,
outvar = character(), # the name of the continous outcome var (sA[j])
intrvls = NULL,
intrvls.width = NULL,
bin_weights = NULL,
# Define settings for fitting contin sA and then call $new for super class (SummariesModel)
initialize = function(reg, DatNet.sWsA.g0, DatNet.sWsA.gstar, ...) {
self$reg <- reg
self$outvar <- reg$outvar
if (is.null(reg$intrvls)) {
assert_that(is.DatNet.sWsA(DatNet.sWsA.g0))
self$intrvls <- DatNet.sWsA.g0$detect.sVar.intrvls(reg$outvar,
nbins = self$reg$nbins,
bin_bymass = self$reg$bin_bymass,
bin_bydhist = self$reg$bin_bydhist,
max_nperbin = self$reg$max_nperbin)
if (!missing(DatNet.sWsA.gstar)) {
assert_that(is.DatNet.sWsA(DatNet.sWsA.gstar))
gstar.intrvls <- DatNet.sWsA.gstar$detect.sVar.intrvls(reg$outvar,
nbins = self$reg$nbins,
bin_bymass = self$reg$bin_bymass,
bin_bydhist = self$reg$bin_bydhist,
max_nperbin = self$reg$max_nperbin)
self$intrvls <- unique(sort(union(self$intrvls, gstar.intrvls)))
}
# Define the number of bins (no. of binary regressions to run),
# new outvar var names (bin names); all predvars remain unchanged;
self$reg$intrvls <- self$intrvls
} else {
self$intrvls <- self$reg$intrvls
}
self$reg$nbins <- length(self$intrvls) - 1
self$reg$bin_nms <- DatNet.sWsA.g0$bin.nms.sVar(reg$outvar, self$reg$nbins)
# Save bin widths in reg class (naming the vector entries by bin names):
self$intrvls.width <- diff(self$intrvls)
self$intrvls.width[self$intrvls.width <= gvars$tolerr] <- 1
self$reg$intrvls.width <- self$intrvls.width
names(self$reg$intrvls.width) <- names(self$intrvls.width) <- self$reg$bin_nms
if (gvars$verbose) {
print("ContinSummaryModel outcome: "%+%self$outvar)
# print("ContinSummaryModel reg$nbins: " %+% self$reg$nbins)
}
bin_regs <- def_regs_subset(self = self)
super$initialize(reg = bin_regs, ...)
},
# Transforms data for continous outcome to discretized bins sA[j] -> BinsA[1], ..., BinsA[M] and calls $super$fit on that transformed data
# Gets passed redefined subsets that exclude degenerate Bins (prev subset is defined for names in sA - names have changed though)
fit = function(data) {
assert_that(is.DatNet.sWsA(data))
# Binirizes & saves binned matrix inside DatNet.sWsA
data$binirize.sVar(name.sVar = self$outvar, intervals = self$intrvls, nbins = self$reg$nbins, bin.nms = self$reg$bin_nms)
if (gvars$verbose) {
print("performing fitting for continuous outcome: " %+% self$outvar)
print("freq counts by bin for continuous outcome: "); print(table(data$ord.sVar))
print("binned dataset: "); print(head(cbind(data$ord.sVar, data$dat.bin.sVar), 5))
}
super$fit(data) # call the parent class fit method
if (gvars$verbose) message("fit for outcome " %+% self$outvar %+% " succeeded...")
data$emptydat.bin.sVar # wiping out binirized mat in data DatNet.sWsA object...
self$wipe.alldat # wiping out all data traces in ContinSummaryModel...
invisible(self)
},
# TO DO: rename to:
# predictP_1 = function(newdata) { # P(A^s=1|W^s=w^s): uses private$m.fit to generate predictions
predict = function(newdata) {
if (missing(newdata)) {
stop("must provide newdata")
}
assert_that(is.DatNet.sWsA(newdata))
if (gvars$verbose) print("performing prediction for continuous outcome: " %+% self$outvar)
# mat_bin doesn't need to be saved (even though its invisibly returned); mat_bin is automatically saved in datnet.sW.sA - a potentially dangerous side-effect!!!
newdata$binirize.sVar(name.sVar = self$outvar, intervals = self$intrvls, nbins = self$reg$nbins, bin.nms = self$reg$bin_nms)
super$predict(newdata)
newdata$emptydat.bin.sVar # wiping out binirized mat in newdata DatNet.sWsA object...
invisible(self)
},
# Convert contin. sA vector into matrix of binary cols, then call parent class method: super$predictAeqa()
# Invisibly return cumm. prob P(sA=sa|sW=sw)
predictAeqa = function(newdata) { # P(A^s=a^s|W^s=w^s) - calculating the likelihood for obsdat.sA[i] (n vector of a's)
assert_that(is.DatNet.sWsA(newdata))
newdata$binirize.sVar(name.sVar = self$outvar, intervals = self$intrvls, nbins = self$reg$nbins, bin.nms = self$reg$bin_nms)
if (gvars$verbose) print("performing prediction for categorical outcome: " %+% self$outvar)
bws <- newdata$get.sVar.bw(name.sVar = self$outvar, intervals = self$intrvls)
self$bin_weights <- (1 / bws) # weight based on 1 / (sVar bin widths)
# Option 1: ADJUST FINAL PROB by bw.j TO OBTAIN density at a point f(sa|sw) = P(sA=sa|sW=sw):
cumprodAeqa <- super$predictAeqa(newdata = newdata) * self$bin_weights
# Alternative 2: ALso integrate the difference of sA value and its left most bin cutoff: x - b_{j-1} and pass it
# This is done so that we can integrate the constant hazard all the way to the value of x:
# * (1 - bw.j.sA_diff*(1/self$bin_weights)*probA1) (discrete)
# * exp(-bw.j.sA_diff*(1/self$bin_weights)*probA1) (continuous)
# bw.j.sA_diff <- newdata$get.sVar.bwdiff(name.sVar = self$outvar, intervals = self$intrvls)
# cumprodAeqa <- super$predictAeqa(newdata = newdata, bw.j.sA_diff = bw.j.sA_diff) * self$bin_weights
newdata$emptydat.bin.sVar # wiping out binirized mat in newdata object...
self$bin_weights <- NULL # wiping out self$bin_weights...
self$wipe.alldat # wiping out all data traces in ContinSummaryModel...
private$cumprodAeqa <- cumprodAeqa
return(cumprodAeqa)
}
),
active = list(
cats = function() {seq_len(self$reg$nbins)}
)
)
## ---------------------------------------------------------------------
#' R6 class for fitting and predicting joint probability for a univariate categorical summary measure sA[j]
#'
#' This R6 class defines and fits a conditional probability model \code{P(sA[j]|sW,...)} for a univariate
#' categorical summary measure \code{sA[j]}. This class inherits from \code{\link{SummariesModel}} class.
#' Defines the fitting algorithm for a regression model \code{sA[j] ~ sW + ...}.
#' Reconstructs the likelihood \code{P(sA[j]=sa[j]|sW,...)} afterwards.
#' Categorical \code{sA[j]} is first redefined into \code{length(levels)} bin indicator variables, where
#' \code{levels} is a numeric vector of all unique categories in \code{sA[j]}.
#' The fitting algorithm estimates the binary regressions for hazard for each bin indicator, \code{Bin_sA[j][i] ~ sW},
#' i.e., the probability that categorical \code{sA[j]} falls into bin \code{i}, \code{Bin_sA[j]_i},
#' given that \code{sA[j]} does not fall in any prior bins \code{Bin_sA[j]_1, ..., Bin_sA[j]_{i-1}}.
#' The dataset of bin indicators (\code{BinsA[j]_1,...,BinsA[j]_M}) is created
#' inside the passed \code{data} or \code{newdata} object when defining \code{length(levels)} bins for \code{sA[j]}.
#'
#' @docType class
#' @format An \code{\link{R6Class}} generator object
#' @keywords R6 class
#' @details
#' \itemize{
#' \item{\code{reg}} - .
#' \item{\code{outvar}} - .
#' \item{\code{levels}} - .
#' \item{\code{nbins}} - .
#' }
#' @section Methods:
#' \describe{
#' \item{\code{new(reg, DatNet.sWsA.g0, ...)}}{...}
#' \item{\code{fit(data)}}{...}
#' \item{\code{predict(newdata)}}{...}
#' \item{\code{predictAeqa(newdata)}}{...}
#' }
#' @section Active Bindings:
#' \describe{
#' \item{\code{cats}}{...}
#' }
#' @export
CategorSummaryModel <- R6Class(classname = "CategorSummaryModel",
inherit = SummariesModel,
portable = TRUE,
class = TRUE,
public = list(
reg = NULL,
outvar = character(), # the name of the categorical outcome var (sA[j])
levels = numeric(), # all unique values for sA[j] sorted in increasing order
nbins = integer(),
# Define settings for fitting cat sA and then call $new for super class (SummariesModel)
initialize = function(reg, DatNet.sWsA.g0, ...) {
self$reg <- reg
self$outvar <- reg$outvar
# Define the number of bins (no. of binary regressions to run) based on number of unique levels for categorical sVar:
# all predvars remain unchanged
if (is.null(reg$levels)) {
assert_that(is.DatNet.sWsA(DatNet.sWsA.g0))
self$levels <- self$reg$levels <- DatNet.sWsA.g0$detect.cat.sVar.levels(reg$outvar)
} else {
self$levels <- self$reg$levels
}
self$nbins <- self$reg$nbins <- length(self$levels)
self$reg$bin_nms <- DatNet.sWsA.g0$bin.nms.sVar(reg$outvar, self$reg$nbins)
if (gvars$verbose) {
print("CategorSummaryModel outcome: "%+%self$outvar)
# print("CategorSummaryModel reg$levels: "); print(self$reg$levels)
# print("CategorSummaryModel reg$nbins: " %+% self$reg$nbins)
}
bin_regs <- def_regs_subset(self = self)
super$initialize(reg = bin_regs, ...)
},
# Transforms data for categorical outcome to bin indicators sA[j] -> BinsA[1], ..., BinsA[M] and calls $super$fit on that transformed data
# Gets passed redefined subsets that exclude degenerate Bins (prev subset is defined for names in sA - names have changed though)
fit = function(data) {
assert_that(is.DatNet.sWsA(data))
# Binirizes & saves binned matrix inside DatNet.sWsA for categorical sVar
data$binirize.cat.sVar(name.sVar = self$outvar, levels = self$levels)
if (gvars$verbose) {
print("performing fitting for categorical outcome: " %+% self$outvar)
print("freq counts by bin for categorical outcome: "); print(table(data$get.sVar(self$outvar)))
print("binned dataset: "); print(head(cbind(sA = data$get.sVar(self$outvar), data$dat.bin.sVar), 5))
}
super$fit(data) # call the parent class fit method
if (gvars$verbose) message("fit for " %+% self$outvar %+% " var succeeded...")
data$emptydat.bin.sVar # wiping out binirized mat in data DatNet.sWsA object...
self$wipe.alldat # wiping out all data traces in ContinSummaryModel...
invisible(self)
},
# TO DO: rename to:
# predictP_1 = function(newdata) { # P(A^s=1|W^s=w^s): uses private$m.fit to generate predictions
predict = function(newdata) {
if (missing(newdata)) {
stop("must provide newdata")
}
assert_that(is.DatNet.sWsA(newdata))
if (gvars$verbose) print("performing prediction for categorical outcome: " %+% self$outvar)
newdata$binirize.cat.sVar(name.sVar = self$outvar, levels = self$levels)
super$predict(newdata)
newdata$emptydat.bin.sVar # wiping out binirized mat in newdata DatNet.sWsA object...
invisible(self)
},
# Invisibly return cumm. prob P(sA=sa|sW=sw)
# P(A^s=a^s|W^s=w^s) - calculating the likelihood for obsdat.sA[i] (n vector of a's):
predictAeqa = function(newdata) {
assert_that(is.DatNet.sWsA(newdata))
if (gvars$verbose) print("performing prediction for categorical outcome: " %+% self$outvar)
newdata$binirize.cat.sVar(name.sVar = self$outvar, levels = self$levels)
cumprodAeqa <- super$predictAeqa(newdata = newdata)
newdata$emptydat.bin.sVar # wiping out binirized mat in newdata object...
self$wipe.alldat # wiping out all data traces in ContinSummaryModel...
private$cumprodAeqa <- cumprodAeqa
return(cumprodAeqa)
}
),
active = list(
cats = function() {seq_len(self$reg$nbins)}
)
)
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.