R/distance.R

Defines functions weighted_distance edges_between_terminal_nodes depth_distance proximity_distance distance_random_forest

Documented in depth_distance distance_random_forest edges_between_terminal_nodes proximity_distance weighted_distance

#' Distance calculation based on RandomForest Proximity or Depth
#'
#' @param x a data.frame
#' @param y a second data.frame
#' @param rfObject \code{ranger} object
#' @param method distance calculation method, Proximity (Default) or Depth.
#' @param threads number of threads to use
#'
#' @return a \code{dist} or a matrix object with pairwise distance of
#' observations in x vs y (if not null)
#'
#' @examples
#' \donttest{
#' library(ranger)
#' # proximity pairwise distances
#' rf.fit <- ranger(Species ~ ., data = iris, num.trees = 500, write.forest = TRUE)
#' distance_random_forest(x = iris[, -5], rfObject = rf.fit, method = "Proximity", threads = 1)
#'
#' # depth distance for train versus test subset
#' set.seed(1234L)
#' learn <- sample(1:150, 100)
#' test <- (1:150)[-learn]
#' rf.fit <- ranger(Species ~ ., data = iris[learn, ], num.trees = 500, write.forest = TRUE)
#' distance_random_forest(x = iris[learn, -5], y = iris[test, -5], rfObject = rf.fit, method = "Depth")
#' }
#'
#' @export
distance_random_forest <- function(x, y = NULL, rfObject, method = "Proximity", threads = NULL) {
  method <- match.arg(method, c("Proximity", "Depth"))
  validate_ranger(rfObject)

  # set number of threads
  if (!is.null(threads)) {
    RcppParallel::setThreadOptions(numThreads = threads)
  }

  # Distance calculation
  if (method == "Proximity") {
    rf_dist <- proximity_distance(x = x, y = y, rfObject = rfObject)
  } else if (method == "Depth") {
    rf_dist <- depth_distance(x = x, y = y, rfObject = rfObject)
    rf_dist <- rf_dist / rfObject$num.trees
  }
  rf_dist
}



#' Get proximity matrix of an ranger object
#'
#' @param x a new dataset
#' @param y a second new dataset (Default: NULL)
#' @param rfObject \code{ranger} object
#' @param as_dist Bool, return a dist object.
#'
#' @return a \code{dist} or a matrix object with pairwise proximity of
#' observations in x vs y (if not null)
#'
#' @examples
#' \donttest{
#' library(ranger)
#' rf <- ranger(Species ~ ., data = iris, num.trees = 5, write.forest = TRUE)
#' proximity_distance(x = iris[, -5], rfObject = rf)
#'
#' set.seed(1234L)
#' learn <- sample(1:150, 100)
#' test <- (1:150)[-learn]
#' rf <- ranger(Species ~ ., data = iris[learn, ], num.trees = 500, write.forest = TRUE)
#' proximity_distance(x = iris[learn, -5], y = iris[test, -5], rfObject = rf)
#' }
#' @export
proximity_distance <- function(x, y = NULL, rfObject, as_dist = TRUE) {
  xNodes <- terminal_nodes(as.matrix(x), rfObject)
  if (is.null(y)) {
    d <- cpp_proximityMatrix(xNodes)
    n <- nrow(x)
    # convert to dist object
    rf_dist <- as_dist_object(d, n, "RFProximity")
  } else {
    yNodes <- terminal_nodes(as.matrix(y), rfObject)
    rf_dist <- cpp_proximityMatrixRangerXY(xNodes, yNodes)
  }

  # to distance
  if (as_dist) {
    rf_dist <- sqrt(max(rf_dist) - rf_dist)
  }
  rf_dist
}


#' @title Depth Distance
#'
#' @description This function returns for each observation the pairwise sum of edges
#' between the corresponding terminal nodes over each tree in the random forest.
#'
#' @param x A data.frame with the same columns as in the training data of the RandomForest model
#' @param y A data.frame with the same columns as in the training data of the RandomForest model
#' @param rfObject \code{ranger} object
#'
#' @examples
#' \donttest{
#' library(ranger)
#' rf <- ranger(Species ~ ., data = iris, num.trees = 5, write.forest = TRUE)
#' depth_distance(x = iris[, -5], rfObject = rf)
#' }
#'
#' @export
depth_distance <- function(x, y = NULL, rfObject) {
  xNodes <- terminal_nodes(as.matrix(x), rfObject)
  rfTrees <- ranger_forests_to_matrix(rfObject)
  if (is.null(y)) {
    d <- cpp_depthMatrix(xNodes, rfTrees)
    n <- nrow(x)
    # convert to dist object
    rf_dist <- as_dist_object(d, n, "RFDepth")
  } else {
    yNodes <- terminal_nodes(as.matrix(y), rfObject)
    rf_dist <- cpp_depthMatrixRangerXY(xNodes, yNodes, rfTrees)
  }
  rf_dist
}


#' @title Number of Edges between Terminal Nodes
#'
#' @description first two columns are terminal node IDs; If an ID pair do not
#' appear in a tree -1 is inserted
#'
#' @param rfObject \code{ranger} object
#'
#' @return a \code{matrix} object with pairwise terminal node edge length
#'
#' @examples
#' \donttest{
#' library(ranger)
#' rf.fit <- ranger(Species ~ ., data = iris, num.trees = 5, write.forest = TRUE)
#' edges_between_terminal_nodes(rf.fit)
#' }
#'
#' @export
edges_between_terminal_nodes <- function(rfObject) {
  validate_ranger(rfObject)
  rfTrees <- ranger_forests_to_matrix(rfObject)
  cpp_TerminalNodeDistance(rfTrees)
}


#' Weighted Distance calculation
#'
#' @param x a new dataset
#' @param y a second new dataset
#' @param weights a vector of weights
#'
#' @return a \code{dist} or \code{matrix} object
#'
#' @examples
#' \donttest{
#' library(ranger)
#' rf <- ranger(Species ~ ., data = iris, num.trees = 5, write.forest = TRUE)
#' terminal_nodes(iris[, -5], rf)
#' }
#'
#' @export
weighted_distance <- function(x, y = NULL, weights = NULL) {
  if (is.null(weights)) {
    weights <- seq(1, ncol(x))
  }
  if (is.null(y)) {
    d <- cpp_weightedDistance(x, weights)
    return(as_dist_object(d, nrow(x), "weightedDistance"))
  } else {
    return(cpp_weightedDistanceXY(x, y, weights))
  }
}

Try the CaseBasedReasoning package in your browser

Any scripts or data that you put into this service are public.

CaseBasedReasoning documentation built on Feb. 27, 2026, 9:06 a.m.