R/m00-pruner.R
In Modeler: Classes and Methods for Training and Using Binary Prediction Models

Documented in fsChisquared fsEntropy fsFisherRandomForest fsMedSplitOddsRatio fsModifiedFisher fsPearson fsSpearman fsTailRank fsTtest keepAll

# Copyright (C) Kevin R. Coombes, 2007-2016

###
### m00-pruner.R
###

# Pruner interface
# INPUT:
#	(training) data
#	(training) class-status
#	optional parameter list
# OUTPUT:
#	logical vector equal in length to number of data rows

# the keep-everything feature selection method
keepAll <- function(data, group) {
  rep(TRUE, length=nrow(data))
}

# pruning by a t-statistic
fsTtest <- function(fdr, ming=500) {
  function(data, group) {
    mtt <- MultiTtest(data, group)
    bum <- Bum(mtt@p.values)
    x <- selectSignificant(bum, alpha=fdr, by="FDR")
    if (sum(x) < ming) {
      p <- sort(mtt@p.values)[ming]
      x <- mtt@p.values <= p
    }
    x
  }
}

# feature selection or "pruning" by a modified Fisher criterion F-score
fsModifiedFisher <- function(q) {
  function(data, group) {
    isA <- group == levels(group)[1]
    isB <- !isA
    m  <- oompaBase::matrixMean(data)
    mA <- oompaBase::matrixMean(data[,isA])
    mB <- oompaBase::matrixMean(data[,isB])
    vA <- oompaBase::matrixVar(data[,isA], mA)
    vB <- oompaBase::matrixVar(data[,isB], mB)
    fval <- as.vector( ((mA - m)^2 + (mB - m)^2)/(vA + vB) )
    fval > quantile(fval, q)
  }
}

## hidden helper function
.corstat <- function(data, group, q, rho) {
  target <- matrix(scale(1 * (group == levels(group)[1])), ncol=1)
  scaled <- t(scale(t(data)))
  x <- abs(as.vector(scaled %*% target) / length(group))
  if (!is.null(q)) {
    rho <- quantile(x, q)
  }
  x > rho
}

# pruning by Pearson correlation
fsPearson <- function(q = NULL, rho) {
  function(data, group) {
    .corstat(data, group, q, rho)
  }
}

# pruning by Spearman correlation
fsSpearman <- function(q = NULL, rho) {
  function(data, group) {
    data <- t(apply(data, 1, rank))
    .corstat(data, group, q, rho)
  }
}


# pruning by median-split odds-ratio
fsMedSplitOddsRatio <- function(q = NULL, OR) {
  function(data, group) {
    meds <- apply(data, 1, median)
    x <- 1*(sweep(data, 1, meds, '-') > 0)
    A <- levels(group)[1]
    pHA <- as.vector( x %*% matrix(group == A, ncol=1) ) / sum(group == A)
    pHB <- 1 - pHA
    pLA <- as.vector((1 - x) %*% matrix(group == A, ncol=1) ) / sum(group == A)
    pLB <- 1 - pLA
    logodds <- log( (pHA/pLA) / (pHB/pLB) )
    lor <- ifelse(is.null(q), log(OR), quantile(abs(logodds), q))
    abs(logodds) > lor
  }
}

# discretize continuous data
# the 'simple' part of binning inspires the name...
disco <- function(x, k=10, type=c("width", "freq")) {
  type <- match.arg(type)
  cutpts <- switch(type,
                   width = seq(min(x), max(x), length=k+1),
                   freq = quantile(x, seq(0, 1, by=1/k)))
  cutpts <- sort(unique(cutpts)) # may be needed for frequency-based cuts
  cut(x, cutpts, include.lowest = TRUE, labels=FALSE)
}

discodata <- function(data, type=c("width", "freq")) {
  k <- ceiling(min(10, sqrt(ncol(data))))
  t(apply(data, 1, disco, k=k, type=type))
}

fsChisquared <- function(q = NULL, cutoff) {
  function(data, group) {
    dd <- discodata(data, type='freq')
    stats <- apply(dd, 1, function(x) {
      tab <- table(x, group)
        rowsums = apply(tab, 1, sum)
        colsums = apply(tab, 2, sum)
        allsum = sum(colsums)
        expectedmatrix = t(as.matrix(colsums) %*% t(as.matrix(rowsums)))/allsum
        chis = sum((tab - expectedmatrix)^2/expectedmatrix)
        if (chis == 0 || length(colsums) < 2 || length(rowsums) < 2) {
            return(0)
        } else {
          return(sqrt(chis/(allsum * min(length(colsums) - 1,
                                         length(rowsums) - 1))))
        }
    })
    if (!is.null(q)) cutoff <- quantile(stats, q)
    stats > cutoff
  }
}

entropic <- function(x) {
  y <- table(x, useNA='always')
  freqs <- y / sum(y)
  -sum( ifelse(freqs > 0, freqs*log(freqs), 0) )
}

fsEntropy <- function(q = 0.9, kind=c("information.gain",
                                         "gain.ratio",
                                         "symmetric.uncertainty")) {
  kind <- match.arg(kind)
  function(data, group) {
    dd <- discodata(data, 'width')
    itemEnt <- apply(dd, 1, entropic)
    groupEnt <- entropic(group)
    jointEnt <- apply(dd, 1, function(x) {
      entropic(data.frame(group, x))
    })
    IG <- groupEnt + itemEnt - jointEnt
    results <- switch(kind,
                      information.gain = IG,
                      gain.ratio = IG / itemEnt,
                      symmetric.uncertainty = 2 * IG / (itemEnt + groupEnt))
    results > quantile(results, q)
  }
}

fsFisherRandomForest <- function(q) {
  function(data, group) {
    temp <- fsModifiedFisher(q=2*q-1)(data, group)
    daft <- data.frame(Y=group, t(data[temp,]))
    orchard <- randomForest(Y ~ ., daft, ntree = 1000, keep.forest = FALSE, 
                            importance = TRUE)
    imp <- importance(orchard, type=1)
    rating <- rep(min(imp)-1, length(temp))
    rating[temp] <- imp
    rating > quantile(rating, q)
  }
}

fsTailRank <- function(specificity=0.9, tolerance=0.5, confidence=0.5) {
  function(data, group) {
    trt <- TailRankTest(data, group, direction="two",
                        specificity=specificity,
                        tolerance=tolerance,
                        confidence=confidence)
    as.logical(trt)
  }
}

Any scripts or data that you put into this service are public.

Modeler documentation built on May 7, 2019, 1:03 a.m.

rdrr.io home R language documentation Run R code online

CRAN packages Bioconductor packages R-Forge packages GitHub packages

Note that we can't provide technical support on individual packages. You should contact the package authors for that.

Modeler
Classes and Methods for Training and Using Binary Prediction Models

R/m00-pruner.R
In Modeler: Classes and Methods for Training and Using Binary Prediction Models

Defines functions keepAll fsTtest fsModifiedFisher .corstat fsPearson fsSpearman fsMedSplitOddsRatio disco discodata fsChisquared entropic fsEntropy fsFisherRandomForest fsTailRank

Documented in fsChisquared fsEntropy fsFisherRandomForest fsMedSplitOddsRatio fsModifiedFisher fsPearson fsSpearman fsTailRank fsTtest keepAll

Try the Modeler package in your browser

R Package Documentation

Browse R Packages

We want your feedback!

Modeler Classes and Methods for Training and Using Binary Prediction Models

R/m00-pruner.R In Modeler: Classes and Methods for Training and Using Binary Prediction Models

Defines functions keepAll fsTtest fsModifiedFisher .corstat fsPearson fsSpearman fsMedSplitOddsRatio disco discodata fsChisquared entropic fsEntropy fsFisherRandomForest fsTailRank

Documented in fsChisquared fsEntropy fsFisherRandomForest fsMedSplitOddsRatio fsModifiedFisher fsPearson fsSpearman fsTailRank fsTtest keepAll

Try the Modeler package in your browser

R Package Documentation

Browse R Packages

We want your feedback!

Modeler
Classes and Methods for Training and Using Binary Prediction Models

R/m00-pruner.R
In Modeler: Classes and Methods for Training and Using Binary Prediction Models