rfcd: Random Forest Change Point Detection

#' Closure generating function to calculate gains for splits using a random forest classifier
#' 
#' @param control an object of type \code{hdcd_control} returned by \link{hdcd_control}
#' 
#' @return A closure with parameters \code{x}, \code{start} and \code{end}, that when evaluated
#' will itself return a closure with parameter \code{split_point}. This calculates the gain when
#' splitting the segment \code{(start, end]} of \code{x} at \code{split_point}. If the closure is
#' additionally supplied with \code{evaluate_all = TRUE}, an array of length \code{nrow(x)} is
#' returned with predictions for each observation in \code{(start, end]} when split at 
#' \code{split_point}.
get_rf_gain_function <- function(control){
  
  # prepare parameters
  n_tree <- control$random_forest_n_tree
  mtry <- control$random_forest_mtry
  sample_fraction <- control$random_forest_sample_fraction
  
  function(x, start, end, ...){
    
    function(split_point, evaluate_all = F, ...){
  
      n0 <- split_point - start
      n1 <- end - split_point
      
      # prepare labels
      y <- c(rep(0, n0), rep(1, n1))
      
      # normalize predictions to [-1/2, 1/2] such that expected prediction is 0, 
      #fit$predictions <- ifelse(fit$predictions < expected_prediction,
      #                          fit$predictions * 0.5 / expected_prediction - 0.5,
      #                          0.5 * (fit$predictions - expected_prediction) / (1 - expected_prediction))
      
      if(!evaluate_all){
        
        # +1 for label 1 and -1 for label 0
        c <- 2 * y - 1

        # fit a random forest
        fit <- ranger::ranger(y ~ ., data = data.frame(x = x[(start + 1) : end, ], y = y),
                              num.trees = n_tree, mtry = mtry, sample.fraction = sample_fraction, keep.inbag = TRUE)
        
        # count number of observations drawn from left segment for each tree
        counts <- sapply(lapply(fit$inbag.counts, '[', 1 : n0), sum)
        
        # matrix of dimension (n0 + n1) x n_tree with TRUE  in M_ij if the ith observation was OOB in tree j
        M <- sapply(fit$inbag.counts, '==', 0)
        
        # expected prediction for each observation (or its oob prediction)
        t_0 <- 1 - M %*% counts / apply(M, 1, sum) / (n0 + n1)
        
        # measure of how much better than random the prediction was
        #mean((fit$predictions - t_0)[1 : n0]) + mean((fit$predictions - t_0)[(n0 + 1) : (n0 + n1)])
        mean((fit$predictions - t_0) * c)
        
      } else {
        fit <- ranger::ranger(y ~ ., data = data.frame(x = x[(start + 1) : end, ], y = y),
                              num.trees = n_tree, mtry = mtry, sample.fraction = sample_fraction, keep.inbag = TRUE)
        
        
        gain <- rep(NA, nrow(x))
        #res <- c(0, cumsum(fit$predictions) )
        #res <- res[length(res)] - 2 * res
        gain[(start + 1) : end] <- fit$predictions #(res / length(res))
        gain
      }
    }
  }
}


#' Closure generating function to calculate gains when splitting and learning a glasso model
#' 
#' @param control an object of type \code{hdcd_control} returned by \link{hdcd_control}
#' 
#' @return A closure with parameters \code{x}, \code{start} and \code{end}, that when evaluated
#' will itself return a closure with parameter \code{split_point}. This calculates the gain when
#' splitting the segment \code{(start, end]} of \code{x} at \code{split_point}. If the closure is
#' additionally supplied with \code{evaluate_all = TRUE}, an array of length \code{nrow(x)} is
#' returned with differences of loglikelihoods for each observation in \code{(start, end]} when split at 
#' \code{split_point}.
get_glasso_gain_function <- function(control){
  
  function(x, start, end, lambda) {
    
    if(is.null(lambda)){
      stop('Please supply a value for lambda to the glasso gain function')
    }
    
    control$n_obs <- nrow(x)
    
    fit_global <- get_glasso_fit(x[(start + 1) : end, , drop = F], lambda = lambda, control = control)
    
    function(split_point, evaluate_all = FALSE){
      
      fit_left <- get_glasso_fit(x[(start + 1) : split_point, , drop = F], lambda = lambda, control = control)
      fit_right <- get_glasso_fit(x[(split_point + 1) : end, , drop = F], lambda = lambda, control = control)
    
      # work in progress
      if(evaluate_all){
        
        # Here we need to remove the bias given through overfitting. For this we resample the observations in the 
        # segment, learn on the split and record the means of the difference in loglikelihood left and right of 
        # the split. We then remove this afterwards from the difference of the loglikelihoods before/after split
        # without resampling.
        j <- sample((start + 1) : end)
        j_left <- j[(start + 1) : split_point - start]
        j_right <- j[(split_point + 1) : end - start]
        fit_left_null <- get_glasso_fit(x[j_left, , drop = F], lambda = lambda, control = control)
        fit_right_null <- get_glasso_fit(x[j_right, , drop = F], lambda = lambda, control = control)


        # Only use inds that were both used in estimation left and right!
        inds <- fit_left$inds & fit_right$inds
        inds_null <- fit_left_null$inds & fit_right_null$inds

        # initiate gains array
        gain  <- array(NA, control$n_obs)
        
        # get loglikelihoods
        loglik_left <- loglikelihood(x[(start + 1) : end, inds, drop = F],
                                           fit_left$mu[inds, drop = F],
                                           fit_left$w[inds, inds, drop = F],
                                           fit_left$wi[inds,inds, drop = F])
        loglik_right <- loglikelihood(x[(start + 1) : end, inds, drop = F],
                                             fit_right$mu[inds, drop = F],
                                             fit_right$w[inds, inds, drop = F],
                                             fit_right$wi[inds, inds, drop = F])
        
        loglik_left_null <- loglikelihood(x[j, inds_null, drop = F],
                                     fit_left_null$mu[inds_null, drop = F],
                                     fit_left_null$w[inds_null, inds_null, drop = F],
                                     fit_left_null$wi[inds_null, inds_null, drop = F])
        loglik_right_null <- loglikelihood(x[j, inds_null, drop = F],
                                      fit_right_null$mu[inds_null, drop = F],
                                      fit_right_null$w[inds_null, inds_null, drop = F],
                                      fit_right_null$wi[inds_null, inds_null, drop = F])
        
        bias <- c(rep(mean(loglik_left_null[1 : (split_point - start)] - loglik_right_null[1 : (split_point - start)]), split_point - start),
                  rep(mean(loglik_left_null[(split_point - start + 1) : (end - start)] - loglik_right_null[(split_point - start + 1) : (end - start)]), end - split_point)
        )
         


        gain[(start + 1) : end] <- loglik_left - loglik_right - bias
        
        gain
        
      } else {
        
        # currently does ot work! TODO
        if(TRUE | !all(fit_left$inds[fit_global$inds], fit_right$inds[fit_global$inds])){
          
          (sum(loglikelihood(x[(start + 1) : split_point, fit_left$inds, drop = F],
                        fit_global$mu[fit_left$inds, drop = F],
                        fit_global$wi[fit_left$inds, fit_left$inds, drop = F]
                        #,fit_global$wi[fit_left$inds, fit_left$inds, drop = F]
                        )) +
            sum(loglikelihood(x[(split_point + 1) : end, fit_right$inds, drop = F],
                          fit_global$mu[fit_right$inds, drop = F],
                          fit_global$wi[fit_right$inds, fit_right$inds, drop = F]
                          #,fit_global$wi[fit_right$inds, fit_right$inds, drop = F]
                          )) -
            sum(loglikelihood(x[(start + 1) : split_point, fit_left$inds, drop = F],
                          fit_left$mu[fit_left$inds, drop = F],
                          fit_left$wi[fit_left$inds, fit_left$inds, drop = F]
                          #,fit_left$wi[fit_left$inds, fit_left$inds, drop = F]
                          )) -
            sum(loglikelihood(x[(split_point + 1) : end, fit_right$inds, drop = F],
                          fit_right$mu[fit_right$inds, drop = F],
                          fit_right$wi[fit_right$inds, fit_right$inds, drop = F]
                          #,fit_right$wi[fit_right$inds, fit_right$inds, drop = F]
                          ))) / nrow(x)
        } else {
          
          fit_global$loglik - fit_left$loglik - fit_right$loglik
          
        }
      }
    }
  }
}