R/eset_lm_1.R

Defines functions lm_for_one eset_lm_shuffled

#' Nested Linear Model Analysis
#' 
#' A function for analysis of sigfinicance of factors (akin ANOVA)
#' or continuous variable (regression) using nested linear models.
#' Shuffling-based FDR estimation is available as an option.
#' 
#' @param eset eset (or most likely eset subclass) object
#' @param form.alt character formulation of alternative model
#' @param form.nul character formulation of NULL model
#' @param facs data frame with the factors in its columns.
#'          If NULL, then pData(eset) will be used.
#' @param norm.coef vector with sample-to-sample normalization 
#'          coefficients in log2 scale.
#' @param N number of shuffles for FDR estimation. Default is NULL, that is
#'          no shuffling-based FDR estimation. Warning! Will be N-times slower.
#'          This is critical for Windows, since paralellization is not impletemted.
#'          In case of enabling shuffling-based FDR estimation, 
#'          the recommended value is at least N=1000.
#' 
#' @return data.frame
#'      \describe{
#'          \item{\code{effect}}{either max contrast in case of factor 
#'                               or slope in case of continuous variable}
#'          \item{\code{F.stat}}{F statistic}
#'          \item{\code{p.value}}{p-value}
#'      }
#' @importFrom Biobase exprs pData featureNames
#' @export eset_lm
#' 
#' @examples
#' data(srm_msnset)
#' head(varLabels(msnset))
#' out <- eset_lm(msnset, "~ subject.type", "~ 1")
#' head(out)
#' # now with shuffling
#' out <- eset_lm(msnset, "~ subject.type", "~ 1", N = 100)
#' head(out)
#' head(out[order(out$p.value),])


eset_lm <- function (eset, form.alt, form.nul, facs = NULL, norm.coef = NULL, N=NULL) 
{
    if(is.null(N))
        return(eset_lm_one_pass(eset, form.alt, form.nul, facs, norm.coef))
    else
        return(eset_lm_shuffled(eset, form.alt, form.nul, facs, norm.coef, N))
}




eset_lm_one_pass <- function (eset, form.alt, form.nul, facs = NULL, norm.coef = NULL) 
{
    # Check models. They should be nested and differ only in one term.
    #stopifnot(length(setdiff(all.vars(as.formula(form.alt)),
    #                         all.vars(as.formula(form.nul)))) == 1)
    
    #
    data <- exprs(eset)
    if (is.null(facs)) 
        facs <- pData(eset)
    if (is.null(norm.coef)) 
        norm.coef <- rep(0, ncol(data)) # log2 scale
    Np <- nrow(data)
    res <- data.frame(effect = numeric(Np), 
                         F.stat = numeric(Np), 
                         p.value = numeric(Np))
    # make parallel
    if(.Platform$OS.type == "unix")
        mc.cores <- parallel::detectCores()
    else
        mc.cores <- 1
    res <- parallel::mclapply(seq_len(nrow(eset)), function(i){
        lm_for_one(data[i,], form.alt, form.nul, 
                   facs, norm.coef)}, 
        mc.cores=mc.cores)
    res <- Reduce(rbind, res)
    
    #     for (i in seq_len(Np)) 
    #         result[i, ] <- lm.for.one(data[i,], form.alt, form.nul, 
    #                                   facs, norm.coef)
    
    rownames(res) <- rownames(data)
    res <- as.data.frame(res) # coerce to specific type in case of mat/df ambig
    res$fdr <- p.adjust(res$p.value, method="fdr")
    return(res)
}



eset_lm_shuffled <- function(eset, form.alt, form.nul, 
                             facs = NULL, norm.coef = NULL, N=1000)
{
    # Check models. They should be nested and differ only in one term.
    # stopifnot(length(setdiff(all.vars(as.formula(form.alt)),
    #                         all.vars(as.formula(form.nul)))) == 1)
    
    #
    # normal test here
    res <- eset_lm(eset, form.alt, form.nul, facs, norm.coef)
    #
    sh.res.mat <- matrix(NA, ncol=N, nrow=nrow(eset))
    for(i in seq_len(N)){
        # shuffle here
        sh.idx <- sample(seq_len(ncol(eset)))
        # nested LM test
        res.sh <- eset_lm(eset[,sh.idx], form.alt, form.nul, facs, norm.coef)
        # count
        sh.res.mat[,i] <- sapply(res$p.value, function(x) sum(res.sh$p.value <= x))
    }
    #
    trufls <- sapply(res$p.value, function(x) sum(res$p.value <= x))  # T + F
    fls <- rowSums(sh.res.mat)/N  # F
    res$fdr.sh <- fls/trufls
    res$fdr.sh[res$fdr.sh > 1] <- 1
    rownames(res) <- featureNames(eset)
    return(res)
}



lm_for_one <- function(ints, form.alt, form.nul, facs, off) {
    #
    data <- data.frame(y = ints, facs, off)
    selected.rows <- rownames(model.frame(as.formula(form.alt), data=data))
    data <- data[selected.rows,]
    #
    mod.alt <- lm(as.formula(paste("y", form.alt)), offset = off, data = data)
    mod.nul <- lm(as.formula(paste("y", form.nul)), offset = off, data = data)
    anstat  <- anova(mod.alt, mod.nul, test = "F")
    p.value <- anstat[2, 'Pr(>F)']
    F.stat <- anstat[2, 'F']
    
    # extract the effect ---
    # get terms from models
    alt.terms <- attr(terms(as.formula(form.alt)), "term.labels")
    null.terms <- attr(terms(as.formula(form.nul)), "term.labels")
    dif.term <- setdiff(alt.terms, null.terms)
    
    if(length(dif.term) != 1) {
        stop(c("form.alt should have exactly 1 more term than form.nul.",
               "\nNumber of different terms: ", length(dif.term)))
    }
    
    if(length(grep(":", dif.term)) != 0) {
        stop("Interaction terms are not currently supported.")
    }
    
    if(is.null(data[[dif.term]])) {
        stop(c(dif.term, " is not a variable in the eset object."))
    }
    
    # determine if this is a multifactorial ANOVA
    if(is.factor(data[[dif.term]]) & nlevels(data[[dif.term]]) > 2){
        # basically I want to pull out maximum contrast
        col.pos <- grep(dif.term, names(coef(mod.alt)))
        effect <- coef(mod.alt)[col.pos]
        if("(Intercept)" %in% names(coef(mod.alt)))
            effect <- c(0, effect)
        effect <- diff(range(c(effect)))
    }else{
        col.pos <- grep(dif.term, names(coef(mod.alt)))
        effect <- coef(mod.alt)[col.pos]
    }
    return(c(effect=effect, F.stat=F.stat, p.value=p.value))
}
vladpetyuk/vp.misc documentation built on June 25, 2021, 6:35 a.m.