R/u_statistic_model_selection.R

#' u_statistic_model_selection_funciton
#' 
#' returns a test statistic based on oob u statistics
#' 
u_statistic_model_selection_function <- function(){
  
  function(x, start, split_point, end, lambda = 0){
    n <- end - start
    y <-  c(rep(0, split_point - start), rep(1, end - split_point))
    t_value <- get_t_value(x[(start + 1) : end, ], y, 10000, 25, 1000)
    list(is_significant = t_value > 2, statistic = t_value)
  }
}

estimate_sigma_1 <- function(x, y, n_z, n_MC){
  n <- nrow(x)
  k <- sample(1 : n, n_z)
  res <- matrix(NA, ncol = n, nrow = n_z)
  for (i in 1:n_z){
    w <- rep(0.001 / (n-1), n)
    w[k[i]] <- 0.999 #dirty fix
    fit <- ranger::ranger(y ~ ., data = data.frame(x = x, y = y), num.trees = n_MC, replace = FALSE, sample.fraction = 1 / sqrt(n), case.weights = w)
    res[i, ] <- fit$predictions
    res[i, k[i]] <- NA
  }
  cov(res, use = 'pairwise')
}

estimate_sigma_n <- function(x, y, n_z){
  n <- nrow(x)
  fit <- ranger::ranger(y ~ ., data = data.frame(x = x, y = y), num.trees = n_z * 10, replace = FALSE, sample.fraction = 1 / sqrt(n), keep.inbag = T)
  res <- predict(fit, data.frame(x = x), predict.all = T)$predictions
  inbag <- matrix(1 == unlist(fit$inbag.counts), nrow = n)
  res[inbag] <- NA
  cov(t(res), use = 'pair')
}

get_u_stat <- function(x, y, n_tree, n_z, n_MC){
  n <- nrow(x)
  fit <- ranger::ranger(y ~ ., data.frame(x = x, y = y), num.trees = n_tree, replace = F, sample.fraction = 1 / sqrt(n))
  sigma_1 <- estimate_sigma_1(x, y, n_z, n_MC)
  sigma_n <- estimate_sigma_n(x, y, n_MC)
  #sigma_n <- 0
  list(fit$predictions, (n_tree * sigma_1 + sigma_n) / n_tree)
}

get_t_value <- function(x, y, n_tree, n_z, n_MC){
  n <- nrow(x)
  varU <- get_u_stat(x, y, n_tree, n_z, n_MC)
  c <- 2 * y - 1
  v <- 1 #sqrt(diag(varU[[2]])) #individual stds
  t <- sum((varU[[1]] - 0.5 + c / (2 * n - 2)) * c / v ) / sqrt(t(c / v) %*% varU[[2]] %*% (c / v))
  t
}
MalteLond/rfcd documentation built on June 19, 2019, 2:52 p.m.