rfcd: Random Forest Change Point Detection

#' Sample Folds
#' 
#' @param n number of observations
#' @param k number of folds
#' @param randomize should folds be randomized? equispaced folds are returned if randomize = F
#' @export
sample_folds <- function(n, k, randomize = FALSE){
  if (k == 1){
    as.factor(rep(0, n))
  } else  if (randomize){
    random_draw <- runif(n)
    k_quantiles <- quantile(random_draw, 0:k/k)
    cut(random_draw, k_quantiles, labels = 1:k, include.lowest = TRUE)
  } else {
    as.factor(rep(1:k, ceiling(n/k))[1:n])
  }
}

#' print.binary_segmentation_tree
#'
#' S3 method for printing a binary_segmentation_tree object.
#'
#' Decorate the print method of the data.tree package to see more details at each node.
#'
#' @param x A data.tree node.
#' @param ... Further arguments passed to print generic.
#' @export
print.binary_segmentation_tree <- function(x, ...) {
  
  # temp <- list('max_gain')
  # if(!is.null(x$model_selection_statistic)){
  #   temp[[length(temp) + 1]] <- 'model_selection_statistic'
  # }
  NextMethod(generic = NULL, object = NULL, 'split_point', 'model_selection_statistic', 'cv_loss', 'cv_improvement', 'relative_cv_improvement',  'lambda',  ...)
}

plot.binary_segmentation_tree <- function(x, ...){
  par(mfrow = c(2,1))
  plot(c(1 : x$root$n_obs, 1 : x$root$n_obs), unlist(x$predictions), type = 'n')
  points(x$predictions[[2]], col = 'red')
  points(x$predictions[[1]], col = 'blue')
  abline(v = x$splits[2], col = 'red')
  abline(v = x$splits[1], col = 'blue')
  
  plot(x$gain[[2]], col = 'red')
  points(x$gain[[1]], col = 'blue')
  abline(v = x$splits[2], col = 'red')
  abline(v = x$splits[1], col = 'blue')
}


#' Logarithmically Scaled Sequence Generation
#' 
#' Generates a logarithmically scaled sequence
#' 
#' @param from the starting value of the sequence
#' @param to the end value of the sequence
#' @param length.out the length of the sequence
#' @export
log_space <- function(from, to, length.out){
  exp(seq(from = log(from), to = log(to), length.out = length.out))
}

#' Get Change Points from a binary_segmentation_tree
#'
#' Utility function to get the change points with positive value for some variable from a binary_segmentation_tree
#'
#' @param tree An object of class \strong{binary_segmentation_tree}
#' @param variable Name of the variable with respect to which should be pruned
#' @export
#' @return A vector with the sorted changepoints.
get_change_points_from_tree <- function(tree, variable = 'cv_improvement'){
  
  alpha <- tree$Get('split_point', filterFun = function(x){!is.na(x[[variable]]) && !is.null(x[[variable]]) &&  x[[variable]] > 0})
  
  unname(sort(alpha))
}

#'Rand type performance indices
#'
#' Calculate rand type performance indices for two sets of changepoints. Typically one
#' of them will be the oracle estimate. See clues package for more details.
#'
#' @param cpts_a A sequence of changepoints.
#' @param cpts_b A sequence of changepoints.
#' @param n Total size of dataset from which both changepoint estimates originate.
#' @importFrom clues adjustedRand
#' @return Returns a vector of the index values.
#' @export
#'
#' @examples
#' CompareChangepointsRand(c(20, 50), c(30, 70), 100)
compare_change_points <- function(cpts_a, cpts_b, n) {
  cpts_a <- sort(cpts_a)[!duplicated(sort(cpts_a))]
  cpts_b <- sort(cpts_b)[!duplicated(sort(cpts_b))]
  
  MarkGroupings <- function(cpts) {
    diffs <- c(cpts, n) - c(0, cpts)
    rep(1:length(diffs), diffs)
  }
  
  clues::adjustedRand(MarkGroupings(cpts_a), MarkGroupings(cpts_b))
}


#' Draw Random segments
#' 
#' Draw \code{n_segments} segments with start and endpoints drawn uniformly in \code{start}, ..., \code{end} and possibly
#' switched. Segments are redwrawn if their length is smaller than \code{minimal_segment_length}.
#' 
#' @export 
draw_segments <- function(start, end, n_segments, minimal_segment_length = 1, include_full_segment = FALSE){
  
  if(n_segments == 0){
    return(data.table::data.table())
  }
  
  if(include_full_segment){
    return(
      rbind(data.table::data.table(start = start, end = end),
            draw_sequences(start, end, n_segments - 1, minimal_segment_length, FALSE))
    )
  }
  
  indices <- sample(start : end, 2 * n_segments, replace = T)
  
  dt <- data.table::data.table(start = pmin(indices[1 : n_segments], indices[(n_segments + 1) : (2 * n_segments)]),
                   end = pmax(indices[(1) : n_segments], indices[(n_segments + 1) : (2 * n_segments)]))
  
  dt <- dt[end - start >= minimal_segment_length]
  
  rbind(dt, draw_sequences(start, end, n_segments - nrow(dt), minimal_segment_length))
}