R/getThresholds.R

Defines functions getModes_all getMode getThresholds

Documented in getThresholds

#' Get thresholds to discretize variables according to their split in tree ensemble models
#' @param conditions character vector with all conditions from which to find the thresholds
#' @param data data to discretize
#' @param Kmax numeric, maximal number of categories for each variable (default: Kmax = 2).
#' @export

getThresholds <- function(conditions, data, Kmax = 2) {
  ### a little slow, may be improved

  # get all individual sub conditions per variable
  var_cond <- unlist(lapply(conditions, function(x) {
    unlist(strsplit(x, split = " & "))
  }))
  # make it to a data.frame with variable and thresholds used
  var_cond <- data.frame(
    var = as.numeric(str_extract(var_cond, pattern = "[:digit:]*(?=\\])")),
    thr = as.numeric(str_extract(var_cond, pattern = "-?[:digit:]*\\.?[:digit:]*$"))
  )

  # and now transform to a list
  var_split <- lapply(sort(unique(var_cond$var)),
    function(x, var_cond, data) {
      thr <- unlist(subset(var_cond, var == x, select = thr))
      var_v <- data[[x]]
      thr[thr < min(var_v)] <- min(var_v)
      thr[thr > max(var_v)] <- max(var_v)
      return(thr)
    },
    var_cond = var_cond, data = data
  )
  names(var_split) <- as.character(sort(unique(var_cond$var)))

  # remove non-numeric variables
  are_num <- as.character(which(sapply(data, function(x) {
    length(unique(x)) > 2
  })))
  are_num <- are_num[are_num %in% names(var_split)]
  var_split <- var_split[are_num]

  # get thresholds for discretization
  if (Kmax == 2) {
    new_thr <- lapply(var_split, getMode)
  } else {
    new_thr <- lapply(var_split, getModes_all)
    new_thr <- lapply(new_thr, function(x, Kmax) {
      x[1:min(Kmax - 1, length(x))]
    }, Kmax = Kmax)
  }

  # remove thresholds out of range
  colNb <- as.integer(names(new_thr))
  new_thr <- lapply(colNb, function(x, data, thr) {
    list(
      "var_v" = data[[x]],
      "thr" = thr[[as.character(x)]]
    )
  }, data = data, thr = new_thr)

  new_thr <- lapply(new_thr, function(x) {
    x$thr <- x$thr[x$thr >= min(x$var_v) & x$thr <= max(x$var_v)]
    if (length(x$thr) == 0) x$thr <- min(x$var_v)
    return(x)
  })
  names(new_thr) <- as.character(colNb)

  return(new_thr)
}


##########

getMode <- function(x) {
  if (length(x) == 1) {
    return(x)
  } else {
    tmp <- density(x)
    return(tmp$x[which.max(tmp$y)])
  }
}


getModes_all <- function(var) {
  # copied from the pastecs R-package:
  # https://github.com/phgrosjean/pastecs/blob/master/R/turnpoints.R
  # (I just removed the unecessary bits..)

  if (length(x) == 1) {
    return(x)
  }

  x <- as.vector(density(var)$y)
  n <- length(x)
  diffs <- c(x[1] - 1, x[1:(n - 1)]) != x

  uniques <- x[diffs]

  n2 <- length(uniques)
  poss <- (1:n)[diffs]
  exaequos <- c(poss[2:n2], n + 1) - poss - 1

  m <- n2 - 2
  ex <- matrix(uniques[1:m + rep(3:1, rep(m, 3)) - 1], m)
  peaks <- c(FALSE, apply(ex, 1, max, na.rm = TRUE) == ex[, 2], FALSE)
  tppos <- (poss + exaequos)[peaks]

  # Now, order the peaks and return the x values
  y_peaks <- x[tppos]
  x_peaks <- density(var)$x[tppos]

  peaks_order <- order(y_peaks, decreasing = TRUE)
  # y_peaks <- y_peaks[peaks_order]
  # x_peaks <- x_peaks[peaks_order]
  return(x_peaks[peaks_order])
}
aruaud/endoR documentation built on Jan. 25, 2025, 2:20 a.m.