#' find best split shift in mean and variance
#'
#' Efficiently calculates the gain in a one-dimensional shift in mean and variance scenario.
#'
#' @param x Array with entries that are assumed to have a shift in mean and variance at some split point.
#' @return An array on length \code{length(x)} with gains resulting from splitting at that specific split point.
#' @export
#'
#' The negative gaussian loglikelihood of observations x for estimated mean and variance is \eqn{-n/2 * (log(2 \pi \hat\sigma^2) + 1)}.
#' The gaussian maximum likelihood estimate \code{\hat\sigma^2} is \code{(sum(x^2) - sum(x)^2/length(x))/length(x)}
find_best_split_shift_in_mean_and_variance <- function(x, ...){
y <- x[!is.na(x)]
n <- length(y)
cumsum_x <- cumsum(y)
cumsum_x_2 <- cumsum(y ^ 2)
sigma_1 <- cumsum_x_2 / (1 : n) - cumsum_x ^ 2 / (1 : n) ^ 2
sigma_2 <- (cumsum_x_2[n] - cumsum_x_2) / ((n - 1) : 0) - (cumsum_x[n] - cumsum_x) ^ 2 / ((n - 1) : 0)^2
sigma_1 <- sigma_1 + 0.0001 * max(sigma_1[is.finite(sigma_1)])
sigma_2 <- sigma_2 + 0.0001 * max(sigma_2[is.finite(sigma_2)])
sigma_1[1] <- sigma_2[n] <- sigma_2[n-1] <- NA
gain <- rep(NA, length(x))
gain[!is.na(x)] <- log(sigma_1[n]) - (1:n)/n * log(sigma_1) - ((n-1):0)/n * log(sigma_2)
gain
}
#' find_best_split_rank
#'
#' rank based detection of splits in onedimensional scenario
find_best_split_rank <- function(x, ...){
y <- x[!is.na(x)]
n <- length(y)
ranks <- rank(x)
R <- cumsum(ranks)
n_12 <- (1 : n) * c((n - 1) : 0)
U_1 <- (R - (1 : n) * (1 : n + 1) / 2 - n_12 / 2) / sqrt((n_12 * (n + 1)) / 12)
gain <- rep(NA, length(x))
gain[!is.na(x)] <- U_1
gain
}
#' find_best_split_shift_in_mean
#'
#' Efficently calculates the gain i a one-dimensional shift in mean scenario. Variance ist assumed to be constant.
find_best_split_shift_in_mean <- function(x, ...){
y <- x[!is.na(x)]
n <- length(y)
cumsum_x <- cumsum(y)
#cumsum_x_2 <- cumsum(y ^ 2)
gain <- rep(NA, length(x))
gain[!is.na(x)] <- abs(sqrt(((n - 1) : 0) / n / (1 : (n))) * cumsum_x -
sqrt((1 : (n)) / n / (n - 1) : 0) * (cumsum_x[n] - cumsum_x))
gain
}
find_best_split_shift_in_median <- function(x, ...){
y <- x[!is.na(x)]
n <- length(y)
gain <- rep(NA, length(x))
f <- function(s){
sum(abs(y[1 : s] - median(y[1 : s]))) + sum(abs(y[(s + 1) : n] - median(y[(s + 1) : n])))
}
gain[!is.na(x)] <- sapply(1 : n, f)
gain
}
find_best_split_carlstein <- function(x, ...){
y <- x[!is.na(x)]
n <- length(y)
# result is an n x n matrix with entries (x_i <= x_j)_i,j
d_0 <- sapply(y, function(z) y >= z)
d_1 <- apply(d_0, 1, cumsum)
d_2 <-abs(d_1 / 1 : n - t(d_1[n, ] - t(d_1)) / c((n - 1) : 1, 1))
gain <- rep(NA, length(x))
gain[!is.na(x)] <- sqrt( (1 : n) * c((n - 1) : 1, 1) ) * apply(d_2, 1, sum)
}
find_best_split_cusum <- function(x, ...){
y <- x[!is.na(x)]
n <- length(y)
cumsum_x <- cumsum(x)
gain <- rep(NA, length(x))
gain[!is.na(x)] <- cumsum_x[n] * (1 : n)/n - cumsum_x
}
# find_best_classifier_split <- function(y, initial_guess){
# x <- y[!is.na(x)]
# n <- length(x)
#
# gain <- array(NA, length(x))
#
# # recreate the labels used for learning
# y <- c(rep(1, initial_guess), rep(-1, n - initial_guess))
# c <- (y + 1) / 2
#
# # expected prediction given no information. This is (initial_guess - 1) / (n - 1) for i <= initial_guess
# # and initial_guess / (n - 1) for i > initial guess, as (x_i, y_i) is not used for OOB prediction.
# t_0 <- (initial_guess - c) / (n - 1)
#
# # store l = l(t_i, 1). Then -l = l(t_i, -1)
# l <- - (x - t_0)
# f <- function(s){
# y_cur <- c(rep(1, s), rep(-1, n - s))
#
# s <- s - (y_cur == 1)
# initial_guess <- initial_guess - c(rep(1, initial_guess), rep(0, n - initial_guess))
# n <- n - 1
#
# rho_plus <- pmax(0, (s - initial_guess) / s)
# rho_minus <- pmax(0, (initial_guess - s) / (n - s))
# sum(y_cur * l - rho_plus*l + rho_minus * l)
# }
#
# gain[!is.na(y)] <- sapply(1 : n, f)
# }
# find_best_01_loss_split <- function(t, initial_guess){
# n <- length(t)
#
# #thres
#
# # recreate the labels used for learning
# y <- c(rep(1, initial_guess), rep(-1, n - initial_guess))
# c <- (y + 1) / 2
#
# f <- function(s){
# y_cur <- c(rep(T, s), rep(F, n - s))
#
# s <- s - c(rep(1, s), rep(0, n - s))
# initial_guess <- initial_guess - c(rep(1, initial_guess), rep(0, n - initial_guess))
# n <- n - 1
#
# rho_plus <- pmax(0, (s - initial_guess) / s)
# rho_minus <- pmax(0, (initial_guess - s) / (n - s))
#
# predictions <- t >= ifelse(s >= initial_guess, 0.5 + 0.4 * (s - initial_guess) / (n - initial_guess), 0.1 + 0.4 * s / initial_guess)
# sum(predictions == y_cur)
# }
#
# sapply(1 : n, f)
# }
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.