R/best_split_functions.R

Defines functions best_split_function_from_gain_function kNN_best_split_function classifier_best_split_function

Documented in best_split_function_from_gain_function classifier_best_split_function kNN_best_split_function

#' Get a get_best_split function from a classifier
#'
#' Returns a closure with formal arguments \code{x}, \code{start}, \code{end} and \code{split_candidates} that finds the
#' maximizer of the gain_function given data \code{x}, \code{start} and \code{end} on \code{split_candidates}
#' using the optimizer \code{optimizer}.
#'
#' @param classifier A function with formal arguments \code{x_train}, \code{y_train} and \code{control} that returns class probabilities in
#' a matrix of size nrow(x_train) x k, where k is the number of classes in y_train.
#' @inheritParams hdcd
classifier_best_split_function <-
  function(classifier, optimizer, control) {
    stopifnot(optimizer %in% c('section_search', 'line_search', 'two_step_search'))
    
    function(x, start, end, split_candidates, lambda = 0) {
      # Sequence of checks to avoid unwanted behaviour
      stopifnot(start >= 0)
      stopifnot(end <= nrow(x))
      stopifnot(start < min(split_candidates))
      stopifnot(end > max(split_candidates))
      
      gain_function <-
        classifier_gain_function(
          x = x,
          start = start,
          end = end,
          classifier = classifier,
          control = control
        )
      n <- nrow(x)
      # Apply optimizer
      if (optimizer == 'section_search') {
        res <-
          section_search(
            gain_function = function(x)
              gain_function(x)$max_gain,
            split_candidates = split_candidates,
            n = n,
            control = control
          )
        #  } else if(optimizer == 'smooth_section_search'){
        #    res <- smooth_section_search(gain_function = function(x) gain_function(x)$gain[x],
        #                                 split_candidates = split_candidates, n = n, control = control)
      } else if (optimizer == 'line_search') {
        res <-
          line_search(
            gain_function = function(x)
              gain_function(x)$max_gain,
            split_candidates = split_candidates,
            n = n,
            control = control
          )
      } else if (optimizer == 'two_step_search') {
        res <- two_step_search(
          gain_function = gain_function,
          split_candidates = split_candidates,
          n = n,
          control = control
        )
      }
      
      # In some situations (e.g. with missing values) the gain function cannot be evaluated at any of the
      # split_candidates. In such a situation NA is returned as best_split and binary_segmentation is stopped.
      if (is.na(res$best_split)) {
        warning(
          'the gain_function could not be evaluated at any of the split_candidates. NA is returned and binary_segmentation
          stopped for segment (',
          start,
          ', ',
          end,
          ']',
          sep = ''
          )
      }
      
      res
    }
  }

#' Get a get_best_split function from the kNN classifier
#'
#' Returns a closure with formal arguments \code{x}, \code{start}, \code{end} and \code{split_candidates} that finds the
#' maximizer of the gain_function given data \code{x}, \code{start} and \code{end} on \code{split_candidates}
#' using the optimizer \code{optimizer}.
#'
#' @inheritParams hdcd
kNN_best_split_function <- function(x, control) {
  distance_matrix <-
    as.matrix(dist(x)) # calculate pairwise distances of all observations
  
  function(x, start, end, split_candidates, lambda = 0) {
    n <-  end - start
    
    k <- min(ceiling(sqrt(n)), floor(n / 2))
    
    # get matrix of neighbors. the ijs entry is monotonely increasing in the distance of the ith observation to
    # the jth observation. Entries are unique in the columns
    # The ith observation is the j's (neighbors[i, j])-nearest neighbor
    neighbors <-
      apply(distance_matrix[(start + 1):end, (start + 1):end], 1, function(x)
        order(order(x)))
    
    neighbors <- neighbors >= 2 & neighbors <= k + 1
    # The k-th nearest neigbors are (in rows) those entries >=2 and <= k+1.
    # The prediction for the jth observation is the proportion of how many of its k-nn are before the split,
    # i.e the jth entry of the rowwise sum of the k-NN matrix up to split, i.e. sum(kNN_matrix[1 : split, j])
    
    # thus predictions[j, ] are the predictions if the split_point is set at j.
    predictions <- apply(neighbors, 2, cumsum) / k
    
    predictions <-
      (n - 1) * ((predictions / c(1, 1:(n - 1)) * !upper.tri(predictions)) + ((1 - predictions) / c((n - 1):1, 1) * upper.tri(predictions)))
    gain <- rep(NA, nrow(x))
    gain[(start + 1):end] <- rowSums(log_eps(predictions)) / nrow(x)
    best_split <-
      split_candidates[which.max(gain[split_candidates])]
    max_gain <- gain[best_split]
    
    n_perm <- control$permutation_test_n
    
    permutation_test <- array(NA, n_perm)
    set.seed(0)
    for (i in 1:n_perm) {
      sigma <- sample(1:nrow(x))
      sigma <- sigma[sigma > start & sigma <= end] - start
      neighbors_perm <- neighbors[sigma, sigma]
      predictions_perm <- apply(neighbors_perm, 2, cumsum) / k
      predictions_perm <-
        (n - 1) * ((
          predictions_perm / c(1, 1:(n - 1)) * !upper.tri(predictions_perm)
        ) + ((1 - predictions_perm) / c((n - 1):1, 1) * upper.tri(predictions_perm)
        ))
      permutation_test[i] <-
        max(rowSums(log_eps(predictions_perm))[split_candidates - start]) / nrow(x)
    }
    
    list(
      gain = gain,
      max_gain = max_gain,
      best_split = best_split,
      permutation_test = permutation_test,
      pval = (1 + sum(permutation_test >= max_gain)) / (1 + length(permutation_test))
    )
  }
}

#' Get a best_split_function from a gain_function
#'
#' @inheritParams hdcd
#' @param gain_function A function with argiument \code{x}, \code{start}, \code{end}, \code{lambda}, \code{control} that returns a closure with arguments
#'  split_point and possibly split_candidates that returns the gain after splitting the segment (\code{start}, \code{end}]
#'  at \code{split_point} given data \code{x} and tuning parameter \code{lambda}.
#' @return A function with formal arguments \code{x}, \code{start}, \code{end}, \code{split_candidates} and \code{lambda} that
#'  uses the optimizer specified to search for a maximum of the gain_function on the \code{split_candidates} given a segment
#'  (\code{start}, end], data \code{x} and a tuning parameter \code{lambda} and returns a list with arguments \code{gain}, \code{max_gain},
#'  \code{best_split} and possible \code{permutation_test} and \code{pval}.
best_split_function_from_gain_function <-
  function(gain_function, optimizer, control) {
    stopifnot(
      optimizer %in% c(
        'section_search',
        'smooth_section_search',
        'line_search',
        'two_step_search',
        'two_step_search_shift_in_mean',
        'two_step_search_adjusted'
      )
    )
    
    # Stop with error if gain_function does not take required arguments
    if (!all(c('x', 'start', 'end') %in% methods::formalArgs(gain_function))) {
      stop(
        'gain_function is not of the required form. Make sure that gain_function takes formal arguments x, start, end, lambda.'
      )
    }
    
    # Return closure that estimates /  calculates the location of the best split (with maximum gain) within (start, end]
    function(x, start, end, split_candidates, lambda = 0) {
      # Sequence of checks to avoid unwanted behaviour
      stopifnot(start >= 0)
      stopifnot(end <= nrow(x))
      stopifnot(start < min(split_candidates))
      stopifnot(end > max(split_candidates))
      
      n <- nrow(x)
      
      gain_function <-
        gain_function(
          x = x,
          start = start,
          end = end,
          lambda = lambda,
          control = control
        )
      
      # Apply optimizer
      if (optimizer == 'section_search') {
        res <-
          section_search(function(x)
            gain_function(x)$max_gain,
            split_candidates,
            n,
            control)
      } else if (optimizer == 'line_search') {
        res <-
          line_search(function(x)
            gain_function(x)$max_gain,
            split_candidates,
            n,
            control)
      } else if (optimizer == 'two_step_search') {
        res <- two_step_search(gain_function, split_candidates, n, control)
      } else {
        stop(
          'Make sure that optimizer is one of \'section_search\', \'line_search\' or \'two_step_search\''
        )
      }
      
      # In some situations (e.g. with missing values) the gain function cannot be evaluated at any of the
      # split_candidates. In such a situation NA is returned as best_split and binary_segmentation is stopped.
      if (is.na(res$best_split)) {
        warning(
          'the gain_function could not be evaluated at any of the split_candidates. NA is returned and binary_segmentation
          stopped for segment (',
          start,
          ', ',
          end,
          ']',
          sep = ''
          )
      }
      
      res
    }
  }
mlondschien/hdcd documentation built on Jan. 5, 2021, 11:26 p.m.