R/sidClustering.rfsrc.R
In randomForestSRC: Fast Unified Random Forests for Survival, Regression, and Classification (RF-SRC)

Documented in sidClustering.rfsrc

sidClustering.rfsrc <- function(data,
                                method = "sid",
                                k = NULL,
                                reduce = TRUE,
                                ntree = 500,
                                ntree.reduce = function(p, vtry){100 * p / vtry},
                                fast = FALSE,
                                x.no.sid = NULL,
                                use.sid.for.x = TRUE,
                                x.only = NULL, y.only = NULL,
                                dist.sharpen = TRUE,
                                ...)
{
  ##------------------------------------------------------------------
  ##
  ## preliminary processing
  ##
  ##------------------------------------------------------------------
  ## make sure we have a data frame
  if (!is.data.frame(data)) {
    data <- data.frame(data)
  }
  ## decide which method to use
  method <- match.arg(method, c("sid",
                                "unsupv", 
                                "shi-horvath",  "sh",  "SH",  "sh1", "SH1", "sh-1", "SH-1",
                                "shi-horvath2", "sh2", "SH2", "sh-2", "SH-2"))
  if (grepl("sid", method)) {
    methodClass <- 1
  }
  else if (grepl("sh", method)) {
    methodClass <- 2
  }
  else if (grepl("un", method)) {
    methodClass <- 3
  }
  else {
    stop("method not set correctly:", method)
  }
  ## set k if user has not provided a value
  if (is.null(k)) {
    k <- 1:max(1, min(10, nrow(data) - 1))
  }
  else {
    k <- k[k<=nrow(data)]
    k <- k[k != 0]
    if (length(k) == 0) {
      k <- 1:max(1, min(10, nrow(data) - 1))
    }
  }
  ## determine the grow interface - rfsrc or rfsrc.fast?
  if (!fast) {
    rfsrc.grow <- "rfsrc"
  }
  else {
    rfsrc.grow <- "rfsrc.fast"
  }
  ##------------------------------------------------------------------
  ##
  ## run SH-1 to acquire cheap but effective VIMP for dimension reduction
  ##
  ## here's the rationale for Zcut and eps ...
  ## Let I=importance and s=standard error
  ## The left edge of the CI for a strong variable satisfies:
  ##
  ## I - s Zcut > eps' > 0  (eps' quantifies how far left CI is away from 0)
  ##
  ## With some rearrangment:
  ##
  ## I / (eps'/Zcut + s) > Zcut
  ##
  ## Let eps=eps'/Zcut, we have
  ##
  ## I / (eps + s) > Zcut
  ##
  ## As illustration, eps = .25, Zcut = 2 ---> eps'=.5
  ##
  ##------------------------------------------------------------------
  if (methodClass == 1 && reduce && nrow(data) > 2) {
    ## pull unnamed parameters to match against rfsrc formal arguments
    sv.f <- as.formula("classes ~ .")
    dots <- list(...)
    dots$importance <- TRUE
    dots$splitrule <- NULL ##needed to allow Mahalanobis splitting for sidClustering
    rfnames <- names(formals(rfsrc))
    rfnames <- rfnames[rfnames != "distance" & rfnames != "proximity"]
    ## remove any column with less than two unique values
    void.var <- sapply(data, function(x){length(unique(x, na.rm = TRUE)) < 2})
    if (sum(void.var) > 0) {
      data[, which(void.var)] <- NULL
    }
    ## ... the remaining part of the code block works if there is only 1 variable left
    ## ... there needs to be enough unique values always 
    ## run the supervised classifier and acquire VIMP
    dots.sv <- dots
    dots.sv$mtry <- NULL
    o <- do.call(holdout.vimp, c(list(formula = sv.f, data = make.sh(data, 1), ntree = ntree.reduce),
                     dots.sv[names(dots.sv) %in% rfnames]))
    ## identify significant variables
    Z <- o$importance[, 1]
    ## we need at least TWO (2) variables to pass filtering (otherwise there will be no SID)
    if (sum(Z > 0) > 1) {
      xvar.filter <- names(Z)[Z > 0]
    }
    else {
      ## there were less than two variables
      ## careful - it's possible there's only one variable
      xvar.filter <- names(Z)[order(Z, decreasing = T)[1:min(length(Z), 2)]]
    }
    ## subset the data
    data <- data[, xvar.filter, drop = FALSE]
  }
  ##-----------------------------------------------------------
  ##
  ## sid clustering method
  ##
  ##-----------------------------------------------------------
  if (methodClass == 1) {
    ## terminate if there is one case or less than two variables
    terminate <- FALSE
    ## did the user provide protected x-values - convert to data frame if so
    if (!is.null(x.no.sid) && !is.character(x.no.sid)) {
      x.no.sid <- data.frame(x.no.sid)
    }
    ## did the user provide protected x-values as a character vector?
    if (!is.null(x.no.sid) && is.character(x.no.sid)) {
      drops <- (names(data) %in% x.no.sid)
      if (sum(drops) > 0) {##check that x.no.sid is still viable - reduce (filtering) could remove it
        x.no.sid <- data[, drops, drop = FALSE]
        if (sum(!drops) > 0) {
          data <- data[ , !drops, drop = FALSE]
        }
        else {
          terminate <- TRUE
        }
      }
      else {
        x.no.sid <- NULL
      }
    }
    ## last chance to fail 
    if (terminate || ncol(data) < 2 || nrow(data) < 2) {
      rO <- list(clustering = rep(1, nrow(data)), rf = NULL, dist = NULL, sid = NULL, outcome.names = NULL)
      class(rO) <- c("rfsrc-failed", "sidClustering", method)
      return(invisible(rO))
    }
    ## sidIFY the data
    sid.o <- make.sid(data)
    rf.dat <- data.frame(sid.o$y, sid.o$x)
    if (use.sid.for.x) {
      outcome.names <- colnames(sid.o$y)
    }
    else {
      outcome.names <- colnames(sid.o$x)
    }
    ## if there are protected x-values, add them now to the data
    if (!is.null(x.no.sid) && is.data.frame(x.no.sid) && sum(names(data) %in% names(x.no.sid)) == 0) {
      rf.dat <- data.frame(rf.dat, x.no.sid)
      outcome.names <- c(outcome.names, colnames(x.no.sid))
    }
    ##remove the original data
    rm(data)
    ## make the mv formula
    mv.f <- as.formula(paste("Multivar(", paste(outcome.names, collapse = ","), paste(") ~ ."), sep = ""))
    ## pull unnamed parameters to match against rfsrc formal arguments
    ### extract the distance (oob by default)
    dots <- list(...)
    if (is.null(dots$distance)) {
      dots$distance <- "oob"
    }
    rfnames <- names(formals(rfsrc))
    ## mvRF call - in extreme cases (n=1,2 sample size) this can fail
    rf <- tryCatch({do.call(rfsrc.grow,
            c(list(formula = mv.f, data = rf.dat, ntree = ntree),
             dots[names(dots) %in% rfnames]))}, error=function(ex){NULL})
    if (is.null(rf)) {## call failed
      rO <- list(clustering = rep(1, nrow(rf.dat)), rf = NULL, dist = NULL, sid = NULL, outcome.names = NULL)
      class(rO) <- c("rfsrc-failed", "sidClustering", method)
      return(invisible(rO))
    }
    d <- rf$distance
    ## euclidean distance sharpening (depends on distance.sharpening option - this is slow!!!!)
    if (dist.sharpen) {
      d <- distance(d)
    }
    ## OOB distance may contain NA's - set these to 1
    d[is.na(d)] <- 1
  }##end RF-SID
  ##-----------------------------------------------------------
  ##
  ## Shi-Horvath (mode 1 and 2) clustering
  ## converts problem to classification using artifical class data
  ##
  ##-----------------------------------------------------------
  if (methodClass == 2) {
    ## set NULL values
    sid.o <- outcome.names <- NULL
    ## make SH synthetic class data - mode 1 or 2
    if (grepl("2", method)) {
      mode <- 2
      method <- "sh2"
    }
    else {
      mode <- 1
      method <- "sh1"
    }
    hv.dat <- make.sh(data, mode)
    ## make the supervised formula
    sv.f <- as.formula("classes ~ .")
    ## pull unnamed parameters to match against rfsrc formal arguments
    ## extract the distance (oob by default)
    dots <- list(...)
    if (is.null(dots$proximity)) {
      dots$proximity <- "oob"
    }
    rfnames <- names(formals(rfsrc))
    ## run the supervised classifier and extract the proximity distance
    rf <- do.call(rfsrc.grow, c(list(formula = sv.f, data = hv.dat, ntree = ntree),
            dots[names(dots) %in% rfnames]))
    d <- 1 - rf$proximity[rf$yvar == 1, rf$yvar == 1]
    ## OOB proximity may contain NA's - set these to 1
    d[is.na(d)] <- 1
  }
  ##-----------------------------------------------------------
  ##
  ## plain vanilla unsupervised splitting
  ## multivariate splitting x against x
  ##
  ##-----------------------------------------------------------
  if (methodClass == 3) {
    ## set NULL values
    sid.o <- outcome.names <- NULL
    ## scale the y-values
    y.dat <- scale(data.matrix(data))
    ## make mv formula 
    ## duplicate the data
    ## watch out for x.only and y.only requests
    outcome.values <- colnames(y.dat[, setdiff(colnames(y.dat), x.only), drop = FALSE])
    mv.f <- get.mv.formula(outcome.values)
    rf.data <- data.frame(y.dat, data[, setdiff(colnames(data), y.only), drop = FALSE])
    ## remove data
    rm(data)
    ## pull unnamed parameters to match against rfsrc formal arguments
    ### extract the distance (oob by default)
    dots <- list(...)
    if (is.null(dots$distance)) {
      dots$distance <- "oob"
    }
    rfnames <- names(formals(rfsrc))
    ## mvRF call - in extreme cases (n=1,2 sample size) this can fail
    rf <- tryCatch({do.call(rfsrc.grow,
                            c(list(formula = mv.f, data = rf.data, ntree = ntree), 
                              dots[names(dots) %in% rfnames]))}, error=function(ex){NULL})
    if (is.null(rf)) {## call failed
      rO <- list(clustering = rep(1, nrow(rf.data)), rf = NULL, dist = NULL)
      #class(rO) <- c("rfsrc-failed", "sgreedy.unsupv")
      return(invisible(rO))
    }
    d <- rf$distance
    ## euclidean distance sharpening (depends on distance.sharpening option - this is slow!!!!)
    if (dist.sharpen) {
      d <- distance(d)
    }
    ## OOB distance may contain NA's - set these to 1
    d[is.na(d)] <- 1
  }    
  ##-----------------------------------------------------------
  ##
  ## hierarchical clustering
  ##
  ##-----------------------------------------------------------
  clus <- cutree(hclust(as.dist(d)), k = k)
  ##-----------------------------------------------------------
  ##
  ## return the goodies
  ##
  ##-----------------------------------------------------------
  rO <- list(clustering = clus, rf = rf, dist = d, sid = sid.o, outcome.names = outcome.names)
  class(rO) <- c("rfsrc", "sidClustering", method)
  invisible(rO)
}
sidClustering <- sidClustering.rfsrc

Any scripts or data that you put into this service are public.

randomForestSRC documentation built on Nov. 5, 2025, 7:41 p.m.

rdrr.io home R language documentation Run R code online

CRAN packages Bioconductor packages R-Forge packages GitHub packages

Note that we can't provide technical support on individual packages. You should contact the package authors for that.

randomForestSRC
Fast Unified Random Forests for Survival, Regression, and Classification (RF-SRC)

R/sidClustering.rfsrc.R
In randomForestSRC: Fast Unified Random Forests for Survival, Regression, and Classification (RF-SRC)

Defines functions sidClustering.rfsrc

Documented in sidClustering.rfsrc

Try the randomForestSRC package in your browser

R Package Documentation

Browse R Packages

We want your feedback!

randomForestSRC Fast Unified Random Forests for Survival, Regression, and Classification (RF-SRC)

R/sidClustering.rfsrc.R In randomForestSRC: Fast Unified Random Forests for Survival, Regression, and Classification (RF-SRC)

Defines functions sidClustering.rfsrc

Documented in sidClustering.rfsrc

Try the randomForestSRC package in your browser

R Package Documentation

Browse R Packages

We want your feedback!

randomForestSRC
Fast Unified Random Forests for Survival, Regression, and Classification (RF-SRC)

R/sidClustering.rfsrc.R
In randomForestSRC: Fast Unified Random Forests for Survival, Regression, and Classification (RF-SRC)