#' DensityEstimation
#'
#' This class performs the actual density estimation for each of the relevant
#' variables provided to it.
#'
#' @docType class
#' @importFrom R6 R6Class
#' @importFrom condensier DataStore fit_density predict_probability sample_value speedglmR6
#' @section Methods:
#' \describe{
#' \item{\code{initialize(nbins = 30, bin_estimator = NULL, online = FALSE, name = 'default', verbose = FALSE) }}{
#' Creates a new density estimator. One can provide various hyper
#' parameters. First of all the number of bins the estimator uses can be
#' configured, by default this is 30. Then one can define the actual
#' estimator the density estimator uses for estimating the conditional
#' densities. By default it uses speedglm. Finally, one can select whether
#' to treat the algorithm as an online or batch algorithm. If online is set
#' to false, we will keep a record of the data that has been used to train
#' an estimator. Note that this is currently very inefficient.
#'
#' @param nbins (default = 30) integer the number of bins to use for the
#' estimator.
#'
#' @param bin_estimator (default = NULL) ML.Base the actual estimator used
#' for fitting the conditional density
#'
#' @param online (default = FALSE) boolean does the algorithm have an
#' updating possibility? And if so, should we treat it as an online
#' algorithm?
#'
#' @param name (default = 'default') the name to use for the density
#' estimator.
#'
#' @param verbose (default = FALSE) the verbosity (log level) to use while running.
#' }
#'
#' \item{\code{predict(data, sample = FALSE, subset = NULL, plot = FALSE, check = FALSE) }}{
#' Method to perform the prediction on all (or a subset of) the relevant
#' variables / conditional densities. One can provide the option to
#' \code{sample}, which means that if this is set, the predict function
#' will sample new values from the underlying distributions. If this
#' argument is set to false, this function will return predicted
#' probabilities.
#'
#' @param data the data from which to predict the outcome. It depends on
#' the goal of the prediction what this needs to be. If the goal is to
#' sample a new relevant variable, the value for the relevant variables to
#' sample does not need to be set in the data table (they can be NA).
#' However, if one wants to know the probability of an instance of a relevant
#' variable given the other variables, ($P(Y | X_1, X2_)$), then the relevant
#' variable cannot be empty.
#'
#' @param sample (default = FALSE) boolean would we like to sample a value
#' (true) or a probability (false) from the conditional density
#'
#' @param subset (default = NULL) stringarray do we want to perform predictions for all
#' variables? (NULL), or just a subset thereof?
#'
#' @param plot (default = FALSE) boolean plot the predicted outcomes to a
#' file in tmp. This can be used for debugging (i.e., it shows the
#' sampled distribution over the actual distribution.
#'
#' @return list a list containing the predictions, where each entry is one
#' of the relevantvariables for which a conditional distribution was fit.
#' }
#'
#' \item{\code{predict_probability(datO, X, Y, plot = FALSE, check = FALSE) }}{
#' Internal method used by the \code{predict} function. This function
#' predicts a $P(Y | X)$. These arguments are therefore instances of the
#' RelevantVariable class. The data of these relevant variables needs to be
#' included in the \code{datO} argument.
#'
#' @param datO the data from which to predict the probability.
#' As we want to predict the probability of Y given a set of X, ($P(Y |
#' X_1, X2_)$), the relevant variable column in \code{datO} cannot be empty.
#'
#' @param sample (default = FALSE) boolean would we like to sample a value
#' (true) or a probability (false) from the conditional density
#'
#' @param subset (default = NULL) stringarray do we want to perform predictions for all
#' variables? (NULL), or just a subset thereof?
#'
#' @param plot (default = FALSE) boolean plot the predicted outcomes to a
#' file in tmp. This can be used for debugging (i.e., it shows the
#' sampled distribution over the actual distribution.
#'
#' @return list a list containing the predictions, where each entry is one
#' of the relevantvariables for which a conditional distribution was fit.
#' }
#'
#' \item{\code{getConditionalDensities(outcome = NULL) }}{
#' Function to get all fitted conditional densities. By default it will
#' return the full list of conditional densities (when \code{outcome =
#' NULL}). One could also provide a subset of relevant variables to the
#' function. Note that if only a single variable is returned (e.g.
#' \code{outcome = 'Y'}), it will return a single conditional density
#' (i.e., this outcome is not encapsulated in a list). If a vector of
#' outcomes is provided it will return a list.
#'
#' @param outcome (default = NULL) the subset of outcomes for which a
#' conditional density needs to be returned. When \code{NULL} it will
#' return all outcomes.
#'
#' @return either all conditional densities in a list, a subset (in a
#' list), or a single density.
#' }
#'
#' \item{\code{is_online()}}{
#' Active method. returns whether the estimator is initialized to be online
#' or not.
#'
#' @return boolean true if the estimator is fitted as an online estimator.
#' }
#'
#' \item{\code{get_bin_estimator()}}{
#' Active method. Returns the algorithm that is used for fitting the bins
#' (i.e., the machine learning algorithm).
#'
#' @return ML.base the actual algorithm used to fit the density.
#' }
#'
#' \item{\code{get_nbins()}}{
#' Active method. the number of bins used to split the continuous density
#' distribution.
#'
#' @return integer the number of bins.
#' }
#'
#' \item{\code{get_name()}}{
#' Active method. Returns the set name for the current estimator.
#'
#' @return string the name of the estimator.
#' }
#'
#' \item{\code{get_raw_conditional_densities()}}{
#' Active method. Returns the conditional densities. Note that this method
#' should preferably not be used. Using the
#' \code{getConditionalDensities()} method is prefered.
#'
#' @return list the conditional densities
#' }
#'
#' \item{\code{get_estimator_type()}}{
#' Active method. returns a list with two elements. First \code{fitfunname}
#' the name of the function used to fit the density, and \code{lmclass} the
#' lmclass for each of the algorithms.
#'
#' @return list with the \code{fitfunname} and the \code{lmclass}
#' }
#' }
DensityEstimation <- R6Class ("DensityEstimation",
#class = FALSE,
cloneable = FALSE,
portable = FALSE,
public =
list(
initialize = function(nbins = 30, bin_estimator = NULL, online = FALSE, name = 'default', verbose = FALSE) {
private$verbose <- Arguments$getVerbose(verbose)
private$is_online_estimator <- Arguments$getLogical(online)
private$nbins <- Arguments$getIntegers(as.numeric(nbins), c(1, Inf))
if (is.null(bin_estimator)) { bin_estimator <- condensier::glmR6$new() }
private$bin_estimator <- Arguments$getInstanceOf(bin_estimator, 'logisfitR6')
private$conditional_densities <- list()
self$set_name(name = name)
},
##TODO: Implement a way to run the prediction on a subset of outcomes
predict = function(data, sample = FALSE, subset = NULL, plot = FALSE, check = FALSE) {
if (check) {
data <- Arguments$getInstanceOf(data, 'data.table')
plot <- Arguments$getLogical(plot)
if (is.null(self$get_relevant_variables) || length(self$get_raw_conditional_densities) == 0) {
throw('The conditional_densities need to be fit first!')
}
}
results <- lapply(self$get_relevant_variables, function(rv) {
current_outcome <- rv$getY
## Return NA if we want to skip this iteration (next is not available in lapply)
if (!is.null(subset) && !(current_outcome %in% subset)) return(NA)
if(sample) {
self$sample(datO=data, Y = current_outcome, X = rv$getX, plot = plot)
} else {
self$predict_probability(datO = data, Y = current_outcome, X = rv$getX, plot = plot, check = check)
}
})
results[!is.na(results)]
},
predict_probability = function(datO, X, Y, plot = FALSE, check = FALSE) {
if(check && !(Y %in% names(datO))) throw('In order to predict the probability of an outcome, we also need the outcome')
yValues <- datO[[Y]]
conditionalDensity <- self$getConditionalDensities(Y)
## TODO: Why would we want to use predict over Aeqa?
## This fix might not be necessary. It seems that whenever we have 1 row of data and 1 covariate, everythin
## crashes and burns. Using this simple fix actually fixes that problem. Unfortunately, when using this fix,
## it doesnt work when there are more than 1 covariate.
##
## THIS WAS A TOUGH ONE, BUT NOW APPEARS FIXED. ALSO ADDED TEST FOR PREDICTIONS WITH ONLY ONE ROW OF DATA.
##
## fixed <- FALSE
## if(nrow(datO) == 1) {
## datO <- rbind(datO, datO)
## fixed <- TRUE
## }
## Predict the instances where A=A (i.e., the outcome is the outcome)
## NOTE! These estimated probabilities contain NAs whenever an estimator was fitted without any oata.
estimated_probabilities <- condensier::predict_probability(
model_fit = conditionalDensity,
newdata = datO
)
## We undo our fix here:
## if(fixed) { estimated_probabilities <- estimated_probabilities[[1]] }
if (plot && length(yValues) > 1) {
OutputPlotGenerator.create_density_plot(
yValues = yValues,
estimated_probabilities = estimated_probabilities,
output = paste(self$get_name, Y)
)
}
estimated_probabilities
},
## Generates a sample given the provided data.
sample = function(datO, X, Y, plot = FALSE, check = FALSE) {
## TODO: Implement sampling from a non conditional distribution
if(length(X) == 0) throw('Sampling from non conditional distribution is not yet supported!')
#print(paste('Sampling from:',Y, 'conditional on',paste(X, collapse='+')))
yValues <- datO[[Y]]
conditionalDensity <- self$getConditionalDensities(Y)
## TODO: BUG! For some weird reason, the code doesn't work with NA
## FIXED
## if(anyNA(datO)) {
## # TODO: warning('Some NA values were replaced with zeros!')
## # datO[is.na(datO)] <- 0
## }
# Outcome (sampled_data) is a vector of samples
sampled_data <- condensier::sample_value(
model_fit = conditionalDensity,
newdata = datO
)
if (plot && length(yValues) > 1) {
density_for_plot <- density(x = sampled_data)
OutputPlotGenerator.create_density_plot(
yValues = yValues,
estimated_probabilities = density_for_plot$y,
estimated_y_values = density_for_plot$x,
output = paste('sampled', self$get_name, Y, sep = '-')
)
}
sampled_data
},
fit = function(datO, relevantVariables){
if(is.null(self$get_relevant_variables)) {
self$set_relevant_variables(relevantVariables = relevantVariables)
}
## Fit conditional density for all of the relevantVariables
for(rv in private$relevantVariables) {
## TODO: Currently it is is not yet possible to sample from an non-conditional distribution!
## OS: Maybe the following hack (its probably not a very good one):
## 1) Fit unconditional density using the same method (histogram) with intercept only GLMs
## 2) Sample from that fit just like conditional density
X <- rv$getX
Y <- rv$getY
family <- rv$getFamily
if(length(X) > 0) {
dens_fit <- self$fit_single_rv(
datO = datO,
X = X,
Y = Y,
family = family
)
private$store_conditional_density(Y = Y, density = dens_fit)
}
}
TRUE
},
fit_single_rv = function(datO, X, Y, family) {
if (family == 'binomial') {
bins <- 2
} else {
bins <- private$nbins
}
private$verbose && cat(private$verbose, 'Fitting density: ', Y, ' on ', self$get_name)
dens_fit <- condensier::fit_density(
X = X,
Y = Y,
input_data = datO,
nbins = bins,
bin_estimator = private$bin_estimator
)
return(dens_fit)
},
set_relevant_variables = function(relevantVariables) {
# Convert the formula in vectors (can probably be done easier)
for (rv in relevantVariables) {
private$relevantVariables[[rv$getY]] <- Arguments$getInstanceOf(rv, 'RelevantVariable')
}
},
# Updates the used condistional density
update = function(newdata) {
for (rv in private$relevantVariables) {
## TODO: Currently it is is not yet possible to sample from an non-conditional distribution!
## OS: Maybe the following hack (its probably not a very good one):
## 1) Fit unconditional density using the same method (histogram) with intercept only GLMs
## 2) Sample from that fit just like conditional density
X <- rv$getX
Y <- rv$getY
if(length(X) > 0) {
private$verbose && enter(private$verbose, 'Updating density: ', Y)
data_obj <- private$create_data_store(newdata = newdata, Y = Y, X = X)
dens_fit <- self$getConditionalDensities(Y)
## Update the fit with the new data
updated_dens_fit <- dens_fit$update(newdata = data_obj)
private$store_conditional_density(Y = Y, density = updated_dens_fit)
private$verbose && exit(private$verbose)
}
}
TRUE
},
getConditionalDensities = function(outcome = NULL) {
if(length(private$conditional_densities) == 0) throw('Densities not yet fitted')
## If no outcome type is provided, just return all of them
if(is.null(outcome)) return(private$conditional_densities)
outcome <- Arguments$getCharacters(outcome)
if(!(all(outcome %in% names(private$conditional_densities)))) throw(paste(outcome, 'is not a fitted outcome'))
if (length(outcome) == 1) return(private$conditional_densities[[outcome]])
return(private$conditional_densities[outcome])
},
set_name = function(name) {
private$name <- Arguments$getCharacters(name)
}
),
active =
list(
is_online = function() {
return(private$is_online_estimator)
},
get_relevant_variables = function() {
private$relevantVariables
},
get_bin_estimator = function() {
return(private$bin_estimator)
},
get_nbins = function() {
return(private$nbins)
},
get_name = function() {
return(private$name)
},
get_raw_conditional_densities = function() {
return(private$conditional_densities)
},
get_estimator_type = function() {
list(
fitfunname = private$bin_estimator$fitfunname,
lmclass = private$bin_estimator$lmclass
)
}
),
private =
list(
## Variables
# =========
conditional_densities = NULL,
data = NULL,
nbins = NULL,
verbose = NULL,
relevantVariables = NULL,
bin_estimator = NULL,
is_online_estimator = NULL,
name = NULL,
# Functions
# =========
store_conditional_density = function(Y, density) {
private$conditional_densities[Y] <- list(density)
},
create_data_store = function(newdata, Y, X) {
condensier::DataStore$new(input_data = newdata, Y = Y, X = X, auto_typing = FALSE)
}
)
)
#' Static method to check whether all included estimators are in fact specified
#' as being online estimators. If this is not the case, we should keep a cache
#' of all data somewhere.
#' @param estimators list of all estimator objects (should be density estimator objects)
#' @return boolean TRUE if all estimators are online, FALSE if not
#' @export
DensityEstimation.are_all_estimators_online = function(estimators) {
for (estimator in estimators) {
if(!estimator$is_online) {
return(FALSE)
}
}
return(TRUE)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.