R/find_best_split_change_in_mean.R

#' find best split shift in mean and variance
#'
#' Efficiently calculates the gain in a one-dimensional shift in mean and variance scenario.
#'
#' @param x Array with entries that are assumed to have a shift in mean and variance at some split point.
#' @return An array on length \code{length(x)} with gains resulting from splitting at that specific split point.
#' @export
#' 
#' The negative gaussian loglikelihood of observations x for estimated mean and variance is \eqn{-n/2 * (log(2 \pi \hat\sigma^2) + 1)}.
#' The gaussian maximum likelihood estimate \code{\hat\sigma^2} is \code{(sum(x^2) - sum(x)^2/length(x))/length(x)}
find_best_split_shift_in_mean_and_variance <- function(x, ...){
  y <- x[!is.na(x)]
  n <- length(y)
  
  cumsum_x <- cumsum(y)
  cumsum_x_2 <- cumsum(y ^ 2)
  
  sigma_1 <- cumsum_x_2 / (1 : n) - cumsum_x ^ 2 / (1 : n) ^ 2
  sigma_2 <- (cumsum_x_2[n] - cumsum_x_2) / ((n - 1) : 0) - (cumsum_x[n] - cumsum_x) ^ 2 / ((n - 1) : 0)^2
  
  sigma_1 <- sigma_1 + 0.0001 * max(sigma_1[is.finite(sigma_1)])
  sigma_2 <- sigma_2 + 0.0001 * max(sigma_2[is.finite(sigma_2)])
  
  sigma_1[1] <- sigma_2[n] <- sigma_2[n-1] <- NA
  gain <- rep(NA, length(x))
  gain[!is.na(x)] <- log(sigma_1[n]) - (1:n)/n * log(sigma_1) - ((n-1):0)/n * log(sigma_2)
  gain
}


#' find_best_split_rank
#' 
#' rank based detection of splits in onedimensional scenario
find_best_split_rank <- function(x, ...){
  y <- x[!is.na(x)]
  n <- length(y)
  
  ranks <- rank(x)
  R <- cumsum(ranks)
  n_12 <- (1 : n) * c((n - 1) : 0)
  U_1 <- (R - (1 : n) * (1 : n + 1) / 2 - n_12 / 2) / sqrt((n_12 * (n + 1)) / 12)
  gain <- rep(NA, length(x))
  gain[!is.na(x)] <- U_1
  gain
}

#' find_best_split_shift_in_mean
#' 
#' Efficently calculates the gain i a one-dimensional shift in mean scenario. Variance ist assumed to be constant.
find_best_split_shift_in_mean <- function(x, ...){
  y <- x[!is.na(x)]
  n <- length(y)
  
  cumsum_x <- cumsum(y)
  #cumsum_x_2 <- cumsum(y ^ 2)
  
  gain <- rep(NA, length(x))
  gain[!is.na(x)] <- abs(sqrt(((n - 1) : 0) / n / (1 : (n))) * cumsum_x - 
    sqrt((1 : (n)) / n / (n - 1) : 0) * (cumsum_x[n] - cumsum_x))
  gain
}


find_best_split_shift_in_median <- function(x, ...){
  y <- x[!is.na(x)]
  n <- length(y)
  gain <- rep(NA, length(x))
  
  f <- function(s){
    sum(abs(y[1 : s] - median(y[1 : s]))) + sum(abs(y[(s + 1) : n] - median(y[(s + 1) : n])))
  }
  
  gain[!is.na(x)] <- sapply(1 : n, f)
  gain
}

find_best_split_carlstein <- function(x, ...){
  y <- x[!is.na(x)]
  n <- length(y)
  
  # result is an n x n matrix with entries (x_i <= x_j)_i,j
  d_0 <- sapply(y, function(z) y >= z)
  
  d_1 <- apply(d_0, 1, cumsum)
  
  d_2 <-abs(d_1 / 1 : n - t(d_1[n, ] - t(d_1)) / c((n - 1) : 1, 1))
  
  gain <- rep(NA, length(x))
  gain[!is.na(x)] <-  sqrt( (1 : n) * c((n - 1) : 1, 1) ) *  apply(d_2, 1, sum)
  
}

find_best_split_cusum <- function(x, ...){
  y <- x[!is.na(x)]
  n <- length(y)
  
  cumsum_x <- cumsum(x)
  
  gain <- rep(NA, length(x))
  gain[!is.na(x)] <-  cumsum_x[n] * (1 : n)/n - cumsum_x
}

# find_best_classifier_split <- function(y, initial_guess){
#   x <- y[!is.na(x)]
#   n <- length(x)
#   
#   gain <- array(NA, length(x))
#   
#   # recreate the labels used for learning
#   y <- c(rep(1, initial_guess), rep(-1, n - initial_guess))
#   c <- (y + 1) / 2
#   
#   # expected prediction given no information. This is (initial_guess - 1) / (n - 1) for i <= initial_guess
#   # and initial_guess / (n - 1) for i > initial guess, as (x_i, y_i) is not used for OOB prediction.
#   t_0 <- (initial_guess - c) / (n - 1)
#   
#   # store l = l(t_i, 1). Then -l = l(t_i, -1)
#   l <- -  (x - t_0)
#   f <- function(s){
#     y_cur <-  c(rep(1, s), rep(-1, n - s))
#     
#     s <- s - (y_cur == 1)
#     initial_guess <- initial_guess - c(rep(1, initial_guess), rep(0, n - initial_guess))
#     n <- n - 1
#     
#     rho_plus <- pmax(0, (s - initial_guess) / s)
#     rho_minus <- pmax(0, (initial_guess - s) / (n - s))
#     sum(y_cur * l - rho_plus*l + rho_minus * l)
#   }
#   
#   gain[!is.na(y)] <- sapply(1 : n, f)
# }  
  

# find_best_01_loss_split <- function(t, initial_guess){
#   n <- length(t)
#   
#   #thres
#   
#   # recreate the labels used for learning
#   y <- c(rep(1, initial_guess), rep(-1, n - initial_guess))
#   c <- (y + 1) / 2
#   
#   f <- function(s){
#     y_cur <-  c(rep(T, s), rep(F, n - s))
#     
#     s <- s - c(rep(1, s), rep(0, n - s))
#     initial_guess <- initial_guess - c(rep(1, initial_guess), rep(0, n - initial_guess))
#     n <- n - 1
#     
#     rho_plus <- pmax(0, (s - initial_guess) / s)
#     rho_minus <- pmax(0, (initial_guess - s) / (n - s))
#     
#     predictions <- t >= ifelse(s >= initial_guess, 0.5 + 0.4 * (s - initial_guess) / (n - initial_guess), 0.1 + 0.4 * s / initial_guess)
#     sum(predictions  == y_cur)
#   }
#   
#   sapply(1 : n, f)
# }  
MalteLond/rfcd documentation built on June 19, 2019, 2:52 p.m.