R/utilities.unsupervised.R
In randomForestSRC: Fast Unified Random Forests for Survival, Regression, and Classification (RF-SRC)

Documented in sid.perf.metric

##  make SH data (modes 1 and 2)
make.sh <- function(dat, mode = 1) {
  ## extract sample size dimension
  nr <- dim(dat)[[1]]
  nc <- dim(dat)[[2]]
  if (nc == 0) {
    stop("can't make SH data ... not enough unique values\n")
  }
  ## coerce to data frame format
  if (!is.data.frame(dat)) {
    dat <- data.frame(dat)
  }
  ## mode 1
  if (mode == 1) {
    data.frame(classes = factor(c(rep(1, nr), rep(2, nr))),
      rbind(dat, data.frame(mclapply(dat, sample, replace = TRUE))))
  }
  ## mode 2
  else {
    data.frame(classes = factor(c(rep(1, nr), rep(2, nr))),
      rbind(dat, data.frame(mclapply(dat, function(x) {
        if (is.factor(x)) {
          resample(x, replace = TRUE)
        }
        else {
          runif(nr, min(x, na.rm = TRUE), max(x, na.rm = TRUE))
        }
      }))))
  }
}
##  make sid (staggered interaction data)
make.sid <- function(dat, order.by.range = TRUE, delta = NULL) {
  ## coerce to data frame format
  if (!is.data.frame(dat)) {
    dat <- data.frame(dat)
  }
  ## remove any column with less than two unique values
  void.var <- sapply(dat, function(x) {length(unique(x, na.rm = TRUE)) < 2})
  if (sum(void.var) > 0) {
    dat[, which(void.var)] <- NULL
  }
  ## there might be nothing left (small sample size issue)
  if (ncol(dat) == 0) {
    stop("can't make sid data ... not enough unique values\n")
  }
  ## order columns by range of values: noted improvement Alex 04/30/2018
  ## we do this before positivity and translation - but we could just as
  ## well have done this afterwards
  if (order.by.range) {
    order.range <- order(sapply(dat, function(x) {
    if (is.factor(x)) {
      1##changed from -Inf to 1 which is the correct value of a factor noted by Alex 06/05/2018
    }
    else {
      diff(range(x, na.rm = TRUE))
    }
  }), decreasing = TRUE)
  dat <- dat[, order.range, drop = FALSE]
  }
  ## make continuous variables positive
  posdat <- dat
  lapply(1:ncol(posdat), function(i) {
    if(!is.factor(posdat[, i]) && min(posdat[, i], na.rm = TRUE) < 0) {
      posdat[, i] <<- abs(min(posdat[, i], na.rm = TRUE)) + posdat[, i]
    }
    NULL
  })
  ## define the delta offset value (delta = 1 by default)
  ## if ordering is in effect, translate features to have the same maximum value
  ## this was observed in counter-example of theorem of revised paper 05/04/2018 
  if (is.null(delta)) {
    delta <- 1
  }
  if (order.by.range) {
    cont.feature <- sapply(posdat, function(x) {!is.factor(x)})
    ## acquire the maximum value
    if (sum(cont.feature) > 1) {
      maxV <- max(c(0, sapply(which(cont.feature), function(i) {
        max(posdat[, i], na.rm = TRUE)
      })), na.rm = TRUE)
      ## translate the features to have the same max
      lapply(which(cont.feature), function(i) {
        posdat[, i] <<- (maxV - max(posdat[, i], na.rm = TRUE)) + posdat[, i]
        NULL
      })	
    }
  }		
  ## two level factors are converted to numeric
  binary.fac <- sapply(posdat, function(x) {is.factor(x) & length(levels(x)) == 2})
  if (sum(binary.fac) > 0) {
    lapply(which(binary.fac), function(i) {
      names(posdat)[i] <<- names(posdat)[i]
      posdat[, i] <<- as.numeric(posdat[, i])
      NULL 
    })
  }
  ## the following list makes it possible to map SID back to original variable names 
  org.names <- list()
  ## make anova data for remaining factors - i.e. make everything numeric
  ## updated to make column names more appealing for binary factors (03/14/2018)
  counter <- 0
  numdat <- data.frame(lapply(1:ncol(posdat), function(j) {
    m.j <- model.matrix(~.-1, posdat[,j, drop = FALSE])
    ##SID main effect names mapped back to original names
    org.names[(counter + 1):(counter + ncol(m.j))] <<- colnames(posdat)[j]
    counter <<- counter + ncol(m.j)
    m.j
  }))
  ## stagger the positive data using delta
  ## use integer values when staggering to minimize creating double precision values
  ## the latter is accomplished using ceiling
  numvar <- dim(numdat)[2]
  staggerdat <- numdat
  staggerdat[, 1] <- staggerdat[, 1] + delta
  if (numvar > 1) {##handles pathological case of one column design matrix - maybe add this as a failure check?
    lapply(2:numvar, function(i) {
      staggerdat[, i] <<- staggerdat[, i] + ceiling(max(staggerdat[, (i-1)], na.rm = TRUE)) + delta
    })
  }
  ## make interactions
  ## -- do not create interactions WITHIN levels of the same factor -- noted by Alex M. 04/25/17
  ##    such instances are caught using unstaggered numerical data and checking x[,i]*x[,j]=constant
  ## -- interactions BETWEEN distinct factors (two or more levels) must always produce a factor
  ##    we catch this by checking the number of distinct values of each variable
  ##    all factors at this point have <= 2 unique values (whether they are dummy anova or numerical)
  intdat <- staggerdat
  if (numvar > 1) {##interactions do not apply if only one variable is supplied - maybe add this as a failure check?
    ##SID interaction names mapped back to original names
    ##we do this first inside of an lapply and not in next mclapply
    counter <- numvar
    lapply(1:(numvar-1), function(i) {
      for (j in (i + 1):numvar) {
        if (length(unique(numdat[,i] * numdat[,j])) > 1) {
          counter <<- counter + 1
          org.names[[counter]] <<- c(org.names[[i]], org.names[[j]])
        }
      }
      NULL
    })
    ## suggested by Alex M. 02/19/2019
    ## make interaction matrix after creating interactions - faster on big p
    ints <- mclapply(1:(numvar - 1), function(i) {
      d <- data.frame(rep(NA, dim(dat)[1]))
      d <- d[, -1]
      counter.ints <- 1
      for (j in (i + 1):numvar) {
        if (length(unique(numdat[,i] * numdat[,j])) > 1) {
          if (length(unique(numdat[, i])) <= 2 & length(unique(numdat[, j])) <= 2) {
            holder <- factor(staggerdat[, i] * staggerdat[, j])
            ## suggested by Alex M. 11/12/17
            levels(holder) <- c('FF','FT','TF','TT') #new factor levels, should always come out in same order
            d <- cbind(d, holder)
          }
          else {
            d <- cbind(d, staggerdat[, i] * staggerdat[, j])
          }
          names(d)[counter.ints] <- paste(names(intdat)[i], "_", names(intdat)[j], sep = "")
          counter.ints <- counter.ints + 1
        }
      }
      d
    })
    intdat <- data.frame(intdat, ints)
  }
  ## suggested by Alex M. 11/12/17
  ## final processing of numeric 0/1 variables to convert them to a binary factor with new levels
  binary.fac2 <- sapply(intdat, function(x){is.numeric(x) & length(unique(x)) == 2})
  if (sum(binary.fac2) > 0) {
    lapply(which(binary.fac2), function(i) {
      intdat[, i] <<- as.factor(intdat[, i])
      levels(intdat[, i]) <<- c('F','T') #new levels, similarly tested
      NULL
    })
  }
  ## pull the "x" and "y" features
  ## y=staggered main effects
  ## x=staggered interactions
  y <- intdat[, 1:numvar, drop = FALSE]
  names(org.names) <- colnames(intdat)
  y.names <- org.names[1:numvar]
  if (numvar > 1) {
    x <- intdat[, -(1:numvar), drop = FALSE]
    x.names <- org.names[-(1:numvar)]
  }
  else {
    x <- x.names <- NULL
  }
  ## return the goodies
  list(y = y, x = x, y.names = y.names, x.names = x.names, delta = delta)
}
#weighted gini/entropy performance metric
#mode is equal to either 'gini' or 'entropy'
sid.perf.metric <- function(truth,cluster,mode=c("entropy", "gini")){
  ## verify mode option
  mode <- match.arg(mode, c("entropy", "gini"))
  ## confusion matrix
  k=length(unique(truth))
  tab=table(truth,cluster)
  clustersizes=colSums(tab)
  clustersizesnorm=clustersizes/sum(clustersizes)
  tabprop=tab
  lapply(1:(dim(tab)[2]),function(i){
    tabprop[,i]<<-tabprop[,i]/clustersizes[i]
    NULL
  })
  if(mode=="entropy"){
    measure=0
    maxmeasure=0
    lapply(1:(dim(tabprop)[2]),function(i){
      clustermeasure=0
      maxclustermeasure=0
      lapply(1:(dim(tabprop)[1]),function(j){
        if(tabprop[j,i]!=0){
          clustermeasure<<-clustermeasure+-tabprop[j,i]*log2(tabprop[j,i])
        }
        maxclustermeasure<<-maxclustermeasure+-1/k*log2(1/k)
        NULL
      })
      measure<<-measure+clustersizesnorm[i]*clustermeasure
      maxmeasure<<-maxmeasure+clustersizesnorm[i]*maxclustermeasure
    })
  }
  if(mode=="gini"){
    measure=0
    maxmeasure=0
    lapply(1:(dim(tabprop)[2]),function(i){
      clustermeasure=1
      maxclustermeasure=1
      lapply(1:(dim(tabprop)[1]),function(j){
        if(tabprop[j,i]!=0){
          clustermeasure<<-clustermeasure+(-tabprop[j,i]^2)
        }
        maxclustermeasure<<-maxclustermeasure+(-1/k^2)
        NULL
      })
      measure<<-measure+clustersizesnorm[i]*clustermeasure
      maxmeasure<<-maxmeasure+clustersizesnorm[i]*maxclustermeasure
      NULL
    })
  }
  list(result=measure,measure=mode,normalized_measure=measure/maxmeasure)
}

Any scripts or data that you put into this service are public.

randomForestSRC documentation built on June 25, 2024, 5:08 p.m.

rdrr.io home R language documentation Run R code online

CRAN packages Bioconductor packages R-Forge packages GitHub packages

Note that we can't provide technical support on individual packages. You should contact the package authors for that.

randomForestSRC
Fast Unified Random Forests for Survival, Regression, and Classification (RF-SRC)

R/utilities.unsupervised.R
In randomForestSRC: Fast Unified Random Forests for Survival, Regression, and Classification (RF-SRC)

Defines functions sid.perf.metric make.sid make.sh

Documented in sid.perf.metric

Try the randomForestSRC package in your browser

R Package Documentation

Browse R Packages

We want your feedback!

randomForestSRC Fast Unified Random Forests for Survival, Regression, and Classification (RF-SRC)

R/utilities.unsupervised.R In randomForestSRC: Fast Unified Random Forests for Survival, Regression, and Classification (RF-SRC)

Defines functions sid.perf.metric make.sid make.sh

Documented in sid.perf.metric

Try the randomForestSRC package in your browser

R Package Documentation

Browse R Packages

We want your feedback!

randomForestSRC
Fast Unified Random Forests for Survival, Regression, and Classification (RF-SRC)

R/utilities.unsupervised.R
In randomForestSRC: Fast Unified Random Forests for Survival, Regression, and Classification (RF-SRC)