#' Change Point Detection Algorithm
#'
#' Find change points in a design matrix.
#'
#' @param x A design matrix
#' @param method The method used to find splits. One of \code{glasso}, \code{random_forest}, \code{elastic_net} or \code{custom}.
#' @param optimizer The optimizer used to find change points. Supply \code{line_search} for the most reliable results (except for the \code{random_forest} method),
#' \code{section_search} for a faster approach based on optimistic binary search (OBS), or \code{two_step_search} for an extremely fast method that is best suited to the \code{random_forest} method.
#' @param delta The minimal relative segment length.
#' @param lambda A regularisation parameter for methods that require one, used in the evaluation of the gain curve. If a
#' \code{cross_validation_function} is supplied, \code{lambda} serves as an initial guess and the optimal value found by
#' cross-validation is subsequently used for the evaluation of the gain curve.
#' @param loss_function A function with formal arguments \code{x} and possibly \code{lambda} that returns
#' a scalar training loss for the supplied data. See the examples below for a sketch.
#' @param gain_function A function with formal arguments \code{x}, \code{start}, \code{end} and \code{lambda} that returns
#' a closure with argument \code{split_point}, which in turn returns the gain from splitting the segment (\code{start}, \code{end}]
#' at \code{split_point} given data \code{x} and tuning parameter \code{lambda}.
#' @param best_split_function A function with formal arguments \code{x}, \code{start}, \code{end} and \code{split_candidates} that returns
#' a list with entries \code{gain} and \code{best_split}, where \code{gain} is an array of length \code{nrow(x)} in which evaluations
#' of the gain curve (or similar) may be saved, and \code{best_split} is the element of \code{split_candidates} that best splits the interval
#' (\code{start}, \code{end}]. A commented sketch of such a function can be found below the function definition in the package source.
#' @param model_selection_function A function with formal arguments \code{x}, \code{start}, \code{split_point} and \code{end} that
#' returns a list with entries \code{statistic}, a value that measures the significance of the split at \code{split_point}, and
#' \code{is_significant}, a boolean indicating whether the value returned for \code{statistic} is significant.
#' @param cross_validation_function A function with formal arguments \code{x}, \code{start}, \code{end}, \code{lambda} and \code{folds} that returns
#' a list with entries \code{cv_loss} and \code{lambda_opt}.
#' @param control An object of class \code{hdcd_control} as generated by \link{hdcd_control}.
#' @export
#' @return A tree with the splitting structure of the binary segmentation algorithm. If some form of inner cross validation or model selection
#' was used, the estimated change points can be extracted via \link{get_change_points_from_tree}.
#'
#' @details
#' \code{hdcd} estimates change points in \code{x}. Currently available methods are \code{random_forest}, \code{glasso}, \code{elastic_net} and \code{custom}.
#' For all but the latter, log-likelihood based loss functions are used to estimate the best split in a binary segmentation fashion. For the method \code{custom}, an
#' individual loss, gain or best split function can be supplied to find change points. The best split in each step of binary segmentation is found with the chosen optimizer, which can be
#' one of \code{line_search}, \code{section_search} or \code{two_step_search}. Line search finds the maximum of the gain function by evaluating it at every
#' possible split. Section search (also known as optimistic binary search) makes use of the piecewise convex structure of the gain curve to find one of its local maxima with
#' approximately \code{log(n)} evaluations of the gain function. Two step search uses the individual log-likelihoods of predictions after a fit at a first guess to
#' obtain a second guess, which is then refined to a final best split point. For methods other than \code{random_forest} we encourage the use of line search whenever the
#' computational cost allows it, and section search otherwise. We recommend two step search for the \code{random_forest} method.
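#'
#' @examples
#' \dontrun{
#' # A minimal sketch of typical usage. The simulated data below is purely
#' # illustrative; any numeric matrix with observations in rows will do.
#' set.seed(1)
#' x <- rbind(matrix(rnorm(50 * 5), ncol = 5),
#'            matrix(rnorm(50 * 5, mean = 1), ncol = 5))
#'
#' # Graphical lasso based binary segmentation with an exhaustive line search.
#' tree <- hdcd(x, method = 'glasso', optimizer = 'line_search', delta = 0.1)
#' get_change_points_from_tree(tree)
#'
#' # A custom method where only a loss function is supplied. The squared error
#' # loss around the column means is an illustrative assumption, not a
#' # recommendation of this package.
#' squared_error_loss <- function(x, lambda) {
#'   sum(sweep(x, 2, colMeans(x))^2)
#' }
#' hdcd(x, method = 'custom', optimizer = 'line_search',
#'      loss_function = squared_error_loss)
#' }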
hdcd <- function(x, method = NULL, optimizer = NULL, delta = NULL, lambda = NULL,
loss_function = NULL, gain_function = NULL, best_split_function = NULL,
cross_validation_function = NULL, model_selection_function = NULL,
control = hdcd_control()){
if(!is.null(optimizer)){
optimizer <- match.arg(optimizer, choices = c('section_search', 'line_search', 'two_step_search'))
}
# The method comparisons below fail with an obscure error when method is NULL, so we
# validate the supplied method here.
if(is.null(method)){
stop("Supply a method to hdcd. One of 'glasso', 'random_forest', 'elastic_net' or 'custom'.")
}
method <- match.arg(method, choices = c('glasso', 'random_forest', 'elastic_net', 'custom'))
# hdcd needs the input to be a design matrix. If a data.frame / similar is supplied, it will be converted
# to a matrix.
if(!is.matrix(x)){
x <- as.matrix(x)
warning("Input data x has been coerced to matrix by hdcd.")
}
stopifnot(nrow(x) > 1)
# We set the minimal relative segment length delta to be 0.1 per default if no other value is supplied.
if(is.null(delta)){
delta <- 0.1
warning('The minimum relative segment length delta has been set to 0.1 since no value was supplied to hdcd.')
}
if(is.null(optimizer)){
if(method == 'random_forest'){
optimizer <- 'two_step_search'
warning('The optimizer was set to two_step_search since none was supplied.')
} else {
optimizer <- 'line_search'
warning('The optimizer was set to line_search since none was supplied. Try section_search for faster run times.')
}
}
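# Store the number of observations in the control object, so that the helper
# constructors below have access to the sample size.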
control$n_obs <- nrow(x)
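# For the random forest method, prebuilt gain, model selection and best split helpers are used.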
if(method == 'random_forest'){
gain_function <- get_rf_gain_function(control)
model_selection_function <- get_u_statistic_model_selection_function(control)
best_split_function <- get_best_split_function_from_gain_function(gain_function, optimizer, control)
}
if(method == 'glasso'){
# glasso requires a regularisation parameter lambda. If none is supplied, we set lambda according
# to asymptotic theory. If control$cv_inner is TRUE, an optimal lambda will be searched for
# via in-sample cross-validation, so that the exact choice of this initial lambda is not too critical.
if(is.null(lambda)){
cov_mat <- get_cov_mat(x, control$glasso_NA_method)$mat
lambda <- max(abs(cov_mat[upper.tri(cov_mat)]))
warning('Lambda for glasso set by asymptotic theory to ', lambda)
}
if(control$cv_inner){
if(!control$cv_inner_search_lambda && is.null(control$cv_inner_lambda)){
control$cv_inner_lambda <- log_space(control$cv_inner_min_grid_ratio * lambda,
lambda / control$cv_inner_min_grid_ratio,
length.out = control$cv_inner_n_lambda)
warning('The grid of lambdas for inner cross validation was set to (', paste(signif(control$cv_inner_lambda, digits = 2), collapse = ', '),
'). Supply a cv_inner_lambda to hdcd_control() or set cv_inner_search_lambda = TRUE to avoid automatic selection of the grid.')
}
cross_validation_function <- get_glasso_cross_validation_function(control)
}
gain_function <- get_glasso_gain_function(control)
best_split_function <- get_best_split_function_from_gain_function(gain_function, optimizer, control)
}
# This is still a work in progress. We assume the first column of x to be the response variable y.
if(method == 'elastic_net'){
gain_function <- get_elastic_net_gain_function(control)
best_split_function <- get_best_split_function_from_gain_function(gain_function, optimizer, control)
}
# If method is set to custom, the supplied loss / gain / best split functions are used. We always use the
# most specific function supplied, i.e. a gain_function over a loss_function and a best_split_function over a gain_function.
if(method == 'custom'){
if(is.null(gain_function) && is.null(best_split_function) && !is.null(loss_function)){
# If only a loss_function is supplied
gain_function <- get_gain_function_from_loss_function(loss_function)
best_split_function <- get_best_split_function_from_gain_function(gain_function, optimizer, control)
} else if (!is.null(gain_function) && is.null(best_split_function)){
# If a gain_function is supplied but no best_split_function is supplied
best_split_function <- get_best_split_function_from_gain_function(gain_function, optimizer, control)
if(!is.null(loss_function)){
warning('The supplied loss_function was ignored since a gain_function was supplied')
}
} else if (!is.null(best_split_function)){
if(!is.null(loss_function)){
warning('The supplied loss_function was ignored since a best_split_function was supplied')
}
if(!is.null(gain_function)){
warning('The supplied gain_function was ignored since a best_split_function was supplied')
}
} else {
stop('Supply one of loss_function, gain_function or best_split_function when setting method to custom')
}
}
tree <- binary_segmentation(x, best_split_function, delta, lambda, model_selection_function, cross_validation_function, control)
tree
}
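
# An illustrative sketch of a custom best_split_function conforming to the interface
# documented above, kept as a comment so that it is not part of the package namespace.
# The helper segment_loss is hypothetical and stands in for any user-supplied loss of
# a segment (start, end]:
#
# my_best_split_function <- function(x, start, end, split_candidates) {
#   gain <- rep(NA_real_, nrow(x))
#   for (s in split_candidates) {
#     gain[s] <- segment_loss(x, start, end) -
#       segment_loss(x, start, s) - segment_loss(x, s, end)
#   }
#   list(gain = gain, best_split = split_candidates[which.max(gain[split_candidates])])
# }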