#' Closure generating function to calculate gains for splits using a random forest classifier
#'
#' @param control an object of type \code{hdcd_control} returned by \link{hdcd_control}
#'
#' @return A closure with parameters \code{x}, \code{start} and \code{end}, that when evaluated
#' will itself return a closure with parameter \code{split_point}. This calculates the gain when
#' splitting the segment \code{(start, end]} of \code{x} at \code{split_point}. If the closure is
#' additionally supplied with \code{evaluate_all = TRUE}, an array of length \code{nrow(x)} is
#' returned with predictions for each observation in \code{(start, end]} when split at
#' \code{split_point}.
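#' @examples
#' \dontrun{
#' # A minimal usage sketch, not run by checks: it assumes the default settings
#' # returned by hdcd_control() and an installed ranger package; the data,
#' # segment bounds and split point are illustrative.
#' set.seed(1)
#' x <- matrix(rnorm(200 * 5), ncol = 5)
#' gain_function <- get_rf_gain_function(hdcd_control())
#' segment_gain <- gain_function(x, start = 0, end = 200)
#' segment_gain(split_point = 100)                       # gain for splitting (0, 200] at 100
#' segment_gain(split_point = 100, evaluate_all = TRUE)  # per-observation predictions
#' }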
get_rf_gain_function <- function(control){

  # prepare parameters
  n_tree          <- control$random_forest_n_tree
  mtry            <- control$random_forest_mtry
  sample_fraction <- control$random_forest_sample_fraction

  function(x, start, end, ...){

    function(split_point, evaluate_all = FALSE, ...){

      n0 <- split_point - start
      n1 <- end - split_point

      # labels: 0 for observations left of the split, 1 for observations right of it
      y <- c(rep(0, n0), rep(1, n1))

      # fit a random forest classifying the two sides of the split
      fit <- ranger::ranger(
        y ~ ., data = data.frame(x = x[(start + 1) : end, ], y = y),
        num.trees = n_tree, mtry = mtry,
        sample.fraction = sample_fraction, keep.inbag = TRUE
      )

      if (!evaluate_all){
        # +1 for label 1 and -1 for label 0 (named sgn to avoid masking base::c)
        sgn <- 2 * y - 1
        # number of observations drawn from the left segment in each tree
        counts <- sapply(lapply(fit$inbag.counts, '[', 1 : n0), sum)
        # (n0 + n1) x n_tree matrix with M[i, j] TRUE if observation i was out of bag in tree j
        M <- sapply(fit$inbag.counts, '==', 0)
        # expected out-of-bag prediction for each observation
        t_0 <- 1 - M %*% counts / apply(M, 1, sum) / (n0 + n1)
        # measure of how much better than random the out-of-bag predictions are
        mean((fit$predictions - t_0) * sgn)
      } else {
        # return the out-of-bag prediction for each observation in (start, end]
        gain <- rep(NA, nrow(x))
        gain[(start + 1) : end] <- fit$predictions
        gain
      }
    }
  }
}
#' Closure generating function to calculate gains when splitting and learning a glasso model
#'
#' @param control an object of type \code{hdcd_control} returned by \link{hdcd_control}
#'
#' @return A closure with parameters \code{x}, \code{start} and \code{end}, that when evaluated
#' will itself return a closure with parameter \code{split_point}. This calculates the gain when
#' splitting the segment \code{(start, end]} of \code{x} at \code{split_point}. If the closure is
#' additionally supplied with \code{evaluate_all = TRUE}, an array of length \code{nrow(x)} is
#' returned with the difference in log-likelihood for each observation in \code{(start, end]} when
#' splitting at \code{split_point}.
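#' @examples
#' \dontrun{
#' # A minimal usage sketch, not run by checks: it assumes the default settings
#' # returned by hdcd_control(); the data and the value of lambda are illustrative.
#' set.seed(1)
#' x <- matrix(rnorm(200 * 5), ncol = 5)
#' gain_function <- get_glasso_gain_function(hdcd_control())
#' segment_gain <- gain_function(x, start = 0, end = 200, lambda = 0.1)
#' segment_gain(split_point = 100)                       # gain for splitting (0, 200] at 100
#' segment_gain(split_point = 100, evaluate_all = TRUE)  # per-observation log-likelihood differences
#' }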
get_glasso_gain_function <- function(control){

  function(x, start, end, lambda){

    if (is.null(lambda)){
      stop('Please supply a value for lambda to the glasso gain function')
    }

    control$n_obs <- nrow(x)
    fit_global <- get_glasso_fit(x[(start + 1) : end, , drop = FALSE], lambda = lambda, control = control)

    function(split_point, evaluate_all = FALSE){

      fit_left  <- get_glasso_fit(x[(start + 1) : split_point, , drop = FALSE], lambda = lambda, control = control)
      fit_right <- get_glasso_fit(x[(split_point + 1) : end, , drop = FALSE], lambda = lambda, control = control)

      # work in progress
      if (evaluate_all){
        # Here we need to remove the bias introduced by overfitting. To do so we permute the
        # observations in the segment, fit on both sides of the split and record the mean
        # difference in log-likelihood on each side. This bias is then subtracted from the
        # difference of the log-likelihoods computed without permuting.
        j <- sample((start + 1) : end)
        j_left  <- j[1 : (split_point - start)]
        j_right <- j[(split_point - start + 1) : (end - start)]
        fit_left_null  <- get_glasso_fit(x[j_left, , drop = FALSE], lambda = lambda, control = control)
        fit_right_null <- get_glasso_fit(x[j_right, , drop = FALSE], lambda = lambda, control = control)

        # only use coordinates (inds) that were used in both the left and the right fit
        inds      <- fit_left$inds & fit_right$inds
        inds_null <- fit_left_null$inds & fit_right_null$inds

        # initiate gains array
        gain <- array(NA, control$n_obs)

        # get log-likelihoods
        loglik_left <- loglikelihood(x[(start + 1) : end, inds, drop = FALSE],
                                     fit_left$mu[inds, drop = FALSE],
                                     fit_left$w[inds, inds, drop = FALSE],
                                     fit_left$wi[inds, inds, drop = FALSE])
        loglik_right <- loglikelihood(x[(start + 1) : end, inds, drop = FALSE],
                                      fit_right$mu[inds, drop = FALSE],
                                      fit_right$w[inds, inds, drop = FALSE],
                                      fit_right$wi[inds, inds, drop = FALSE])
        loglik_left_null <- loglikelihood(x[j, inds_null, drop = FALSE],
                                          fit_left_null$mu[inds_null, drop = FALSE],
                                          fit_left_null$w[inds_null, inds_null, drop = FALSE],
                                          fit_left_null$wi[inds_null, inds_null, drop = FALSE])
        loglik_right_null <- loglikelihood(x[j, inds_null, drop = FALSE],
                                           fit_right_null$mu[inds_null, drop = FALSE],
                                           fit_right_null$w[inds_null, inds_null, drop = FALSE],
                                           fit_right_null$wi[inds_null, inds_null, drop = FALSE])

        bias <- c(rep(mean(loglik_left_null[1 : (split_point - start)] -
                           loglik_right_null[1 : (split_point - start)]),
                      split_point - start),
                  rep(mean(loglik_left_null[(split_point - start + 1) : (end - start)] -
                           loglik_right_null[(split_point - start + 1) : (end - start)]),
                      end - split_point))

        gain[(start + 1) : end] <- loglik_left - loglik_right - bias
        gain
      } else {
        # TODO: the cached log-likelihood shortcut in the else branch below does not work yet,
        # so TRUE forces the explicit computation for now
        if (TRUE || !all(fit_left$inds[fit_global$inds], fit_right$inds[fit_global$inds])){
          (sum(loglikelihood(x[(start + 1) : split_point, fit_left$inds, drop = FALSE],
                             fit_global$mu[fit_left$inds, drop = FALSE],
                             fit_global$wi[fit_left$inds, fit_left$inds, drop = FALSE])) +
             sum(loglikelihood(x[(split_point + 1) : end, fit_right$inds, drop = FALSE],
                               fit_global$mu[fit_right$inds, drop = FALSE],
                               fit_global$wi[fit_right$inds, fit_right$inds, drop = FALSE])) -
             sum(loglikelihood(x[(start + 1) : split_point, fit_left$inds, drop = FALSE],
                               fit_left$mu[fit_left$inds, drop = FALSE],
                               fit_left$wi[fit_left$inds, fit_left$inds, drop = FALSE])) -
             sum(loglikelihood(x[(split_point + 1) : end, fit_right$inds, drop = FALSE],
                               fit_right$mu[fit_right$inds, drop = FALSE],
                               fit_right$wi[fit_right$inds, fit_right$inds, drop = FALSE]))) / nrow(x)
        } else {
          fit_global$loglik - fit_left$loglik - fit_right$loglik
        }
      }
    }
  }
}