R/stats.R

Defines functions top_pathways get_pathways_summary do_limma do_pca wilcox_data_frame wilcox_test_fun wilcoxsign_test_fun calculate_wilcox_test do_wilcoxon cor_data_frame cor_test_fun do_cor anova_test_fun do_anova_test normalize_matrix normalize_data

Documented in do_pca do_wilcoxon get_pathways_summary normalize_data top_pathways

##
## stats.R
## Statistics functions
##
## Written by Marta R. Hidalgo, marta.hidalgo@outlook.es
##
## Code style by Hadley Wickham (http://r-pkgs.had.co.nz/style.html)
## https://www.bioconductor.org/developers/how-to/coding-style/
##

#' Normalize expression data from a SummarizedExperiment or matrix
#' to be used in \code{hipathia}
#'
#' Transforms the rank of the SummarizedExperiment or matrix of gene expression
#' to [0,1] in order
#' to be processed by \code{hipathia}. The transformation may be performed
#' in two different ways. If \code{percentil = FALSE}, the transformation
#' is a re-scaling of the rank of the matrix. If \code{percentil = TRUE},
#' the transformation is performed assigning to each cell its percentil in
#' the corresponding distribution. This option is recommended for
#' distributions with very long tails.
#'
#' This transformation may be applied either to the whole matrix
#' (by setting \code{by_gene = FALSE}), which we strongly recommend, or to
#' each of the rows (by setting \code{by_gene = TRUE}), allowing each gene
#' to have its own scale.
#'
#' A previous quantiles normalization may be applied by setting
#' \code{by_quantiles = TRUE}. This is recommended for noisy data.
#'
#' For distributions with extreme outlayer values, a percentil \code{p}
#' may be given to the parameter \code{truncation_percentil}. When provided,
#' values beyond percentil p are truncated to the value of percentil p, and
#' values beyond 1-p are truncated to percentil 1-p. This step is performed
#' before any other tranformation. By default no truncation is performed.
#'
#' @param data Either a SummarizedExperiment or a matrix of gene expression.
#' @param sel_assay Character or integer, indicating the assay to be normalized
#' in the SummarizedExperiment. Default is 1.
#' @param by_quantiles Boolean, whether to normalize the data by quantiles.
#' Default is FALSE.
#' @param by_gene Boolean, whether to transform the rank of each row of the
#' matrix to [0,1]. Default is FALSE.
#' @param percentil Boolean, whether to take as value the percentil of each
#' sample in the corresponding distribution.
#' @param truncation_percentil Real number p in [0,1]. When provided, values
#' beyond percentil p are truncated to the value of percentil p, and values
#' beyond 1-p are truncated to percentil 1-p. By default no truncation
#' is performed.
#'
#' @return Matrix of gene expression whose values are in [0,1].
#'
#' @examples data("brca_data")
#' trans_data <- translate_data(brca_data, "hsa")
#' exp_data <- normalize_data(trans_data)
#' exp_data <- normalize_data(trans_data, by_quantiles = TRUE,
#' truncation_percentil=0.95)
#'
#' @export
#' @import preprocessCore
#' @import SummarizedExperiment
#' @importFrom stats quantile
#' @importFrom stats ecdf
#' @importFrom methods is
#'
normalize_data <- function(data, sel_assay = 1, by_quantiles = FALSE,
                         by_gene = FALSE, percentil = FALSE,
                         truncation_percentil = NULL){

    if(is(data, "SummarizedExperiment")){
        se_flag <- TRUE
        mat <- assay(data, sel_assay)
    }else if(is(data, "matrix")){
        se_flag <- FALSE
        mat <- data
    }else{
        stop("Only SummarizedExperiment or matrix classes accepted as data")
    }
    norm_mat <- normalize_matrix(mat, by_quantiles = by_quantiles,
                                 by_gene = by_gene, percentil = percentil,
                                 truncation_percentil=truncation_percentil)
    if(se_flag == TRUE){
        norm_data <- SummarizedExperiment(list(norm = norm_mat),
                                          colData = colData(data))
    }else{
        norm_data <- norm_mat
    }
    return(norm_data)
}


#' @import preprocessCore
#' @importFrom stats quantile
#' @importFrom stats ecdf
#' @importFrom DelayedArray rowMins
#' @importFrom DelayedArray rowMaxs
normalize_matrix <- function(mat, sel_assay = 1, by_quantiles = FALSE,
                           by_gene = FALSE,
                           percentil = FALSE, truncation_percentil = NULL){

    # Normalize data matrix
    norm_data <- data.matrix(mat)
    if(!is.null(truncation_percentil)){
        if( truncation_percentil >= 0 & truncation_percentil <= 1){
            # Guarantees that truncation_percentil is in [0.5,1]
            if(truncation_percentil < (1-truncation_percentil))
                truncation_percentil <- 1-truncation_percentil
            # Truncates by the percentil
            norm_data <- t(apply(norm_data, 1, function(x){
                quan_inf <- stats::quantile(x, 1 - truncation_percentil,
                                            na.rm = TRUE)
                x[x < quan_inf] <- quan_inf
                quan_sup <- stats::quantile(x, truncation_percentil,
                                            na.rm = TRUE)
                x[x > quan_sup] <- quan_sup
                x
            }))
        }
        else{
            stop("Parameter truncation_percentil must be in [0,1]")
        }
    }
    if(by_quantiles == TRUE){
        norm_data <- preprocessCore::normalize.quantiles(norm_data)
    }
    if(by_gene == TRUE){
        if(percentil == TRUE){
            norm_data <- t(apply(norm_data, 1, function(x){stats::ecdf(x)(x)}))
        }else{
            min <- rowMins(norm_data, na.rm = TRUE)
            max <- rowMaxs(norm_data, na.rm = TRUE)
            norm_data <- (norm_data - min)/(max - min)
        }
    } else {
        if(percentil == TRUE){
            emp <- stats::ecdf(norm_data)
            norm_data <- t(apply(norm_data, 1, emp))
        }else{
            norm_data <- (norm_data - min(norm_data, na.rm = TRUE))/
                (max(norm_data, na.rm = TRUE) - min(norm_data, na.rm = TRUE))
        }
    }
    colnames(norm_data) <- colnames(mat)
    rownames(norm_data) <- rownames(mat)

    return(norm_data)
}


#'@importFrom stats p.adjust
do_anova_test <- function(data, group, adjust = TRUE){
    tests <- apply(data, 1, anova_test_fun, group = group)
    out <- do.call("rbind", lapply(tests, function(x){
        c(summary(x$fit)[[1]][[1, "Pr(>F)"]])
    }))
    out[is.na(out)] <- 1
    if(adjust == TRUE){
        out <- cbind(out, stats::p.adjust(out[,1], method = "fdr"))
    } else {
        out <- cbind(out, out[,1])
    }
    out <- as.data.frame(out, stringsAsFactors = FALSE)
    colnames(out) <- c("p.value", "adj.p.value")
    return(out)
}

#' @importFrom stats aov
#' @importFrom stats TukeyHSD
anova_test_fun <- function(values, group, verbose = FALSE){
    group <- factor(group)
    fit <- stats::aov(values ~ group)
    tuckey <- stats::TukeyHSD(fit)
    if(verbose==TRUE){
        summary(fit)
    }
    return(list(fit = fit,tuckey = tuckey))
}


#'@importFrom stats p.adjust
do_cor <- function(sel_vals, design, adjust = TRUE){

    data <- sel_vals[,design$sample]

    testData <- do.call("rbind",apply(data, 1, cor_test_fun, design$value))
    if(adjust==TRUE){
        fdrData <- stats::p.adjust(testData[,1], method = "fdr")
    } else {
        fdrData <- testData[,1]
    }
    data2 <- data.frame(testData[,seq_len(3)], fdrData,
                        stringsAsFactors = FALSE)

    colnames(data2) <- c("p.value", "UP/DOWN", "correlation", "FDRp.value")
    data2[data2$statistic>0,"UP/DOWN"] <- "UP"
    data2[data2$statistic<0,"UP/DOWN"] <- "DOWN"
    return(data2)
}


#'@importFrom stats cor.test
cor_test_fun <- function(x, values){
    r <- try(stats::cor.test(x = as.numeric(x), y = as.numeric(values)))
    result <- cor_data_frame(r)
    return(result)
}


cor_data_frame <- function(wilcox){
    if (is(wilcox, "try-error") | is.na(wilcox$p.value)){
        pvalue <- 1
        class <- "0"
        esti <- 0
    } else{
        pvalue <- wilcox$p.value
        esti <- wilcox$estimate[1]
        if (esti < 0){
            class <- "DOWN" ## regarding DISEASE
        } else if (esti > 0){
            class <- "UP" ## regarding DISEASE
        } else if (esti == 0){
            if (wilcox$conf.int[1] == 0){
                class <- "UP"
            } else if (wilcox$conf.int[2] == 0){
                class <- "DOWN"
            } else{
                class <- 0
            }
        }
    }
    result <- data.frame(pvalue, class, esti, stringsAsFactors = FALSE)
    return(result)
}


#' Apply Wilcoxon test
#'
#' Performs a Wilcoxon test for the values in \code{sel_vals} comparing
#' conditions \code{g1} and \code{g2}
#'
#' @param data Either a SummarizedExperiment object or a matrix, containing the
#' values. Columns represent samples.
#' @param group Either a character indicating the name of the column in colData
#' including the classes to compare, or a character vector with the class to
#' which each sample belongs.
#' Samples must be ordered as in \code{data}
#' @param g1 String, label of the first group to be compared
#' @param g2 String, label of the second group to be compared
#' @param paired Boolean, whether the samples to be compared are paired.
#' If TRUE, function \code{wilcoxsign_test} from package \code{coin} is
#' used. If FALSE, function \code{wilcox.test} from package \code{stats}
#' is used.
#' @param adjust Boolean, whether to adjust the p.value with
#' Benjamini-Hochberg FDR method
#' @param sel_assay Character or integer, indicating the assay to be normalized
#' in the SummarizedExperiment. Default is 1.
#' @param order Boolean, whether to order the results table by the
#' \code{FDRp.value} column. Default is FALSE.
#'
#' @return Dataframe with the result of the comparison
#'
#' @examples
#' data(path_vals)
#' data(brca_design)
#' sample_group <- brca_design[colnames(path_vals),"group"]
#' comp <- do_wilcoxon(path_vals, sample_group, g1 = "Tumor", g2 = "Normal")
#'
#' @export
#' @import SummarizedExperiment
#' @importFrom methods is
#'
do_wilcoxon <- function(data, group, g1, g2, paired = FALSE,
                        adjust = TRUE, sel_assay = 1, order = FALSE){

    if(is(data, "SummarizedExperiment")){
        se_flag <- TRUE
        if(is(group, "character") & length(group) == 1)
            if(group %in% colnames(colData(data))){
                group <- colData(data)[[group]]
            }else{
                stop("Group variable must be a column in colData(data)")
            }
        vals <- assay(data, sel_assay)
    }else if(is(data, "matrix")){
        se_flag <- FALSE
        vals <- data
    }else{
        stop("Only SummarizedExperiment or matrix classes accepted as data")
    }

    g1_indexes <- which(group == g1)
    g2_indexes <- which(group == g2)

    stat_vals <- suppressWarnings(
        calculate_wilcox_test(vals, g2_indexes, g1_indexes, paired = paired,
                              adjust = adjust))
    if(se_flag == TRUE && "feat.name" %in% colnames(rowData(data)))
        stat_vals <- cbind(name = rowData(data)[["feat.name"]], stat_vals)
    if(order == TRUE)
        stat_vals <- stat_vals[order(stat_vals$FDRp.value, decreasing = FALSE),]
    return(stat_vals)
}




#'@importFrom stats p.adjust
calculate_wilcox_test <- function(data, control, disease, paired, adjust=TRUE){
    if(paired == TRUE){
        dat <- apply(data, 1, wilcoxsign_test_fun, control, disease)
        testData <- do.call("rbind", dat)
        if(adjust == TRUE){
            fdrData <- stats::p.adjust(testData[,1], method = "fdr")
        } else {
            fdrData <- testData[,1]
        }
        data2 <- data.frame(testData[,c(2,3,1)], fdrData,
                            stringsAsFactors = FALSE)
    }else{
        testData <- do.call("rbind",
                            apply(data, 1, wilcox_test_fun,
                                  control, disease, paired))
        if(adjust == TRUE){
            fdrData <- stats::p.adjust(testData[,1], method = "fdr")
        } else {
            fdrData <- testData[,1]
        }
        # Standardize statistic
        lc <- length(control)
        ld <- length(disease)
        m <- lc*ld/2
        sigma <- sqrt(lc * ld * (lc + ld + 1)/12)
        z <- (testData[,3] - m)/sigma
        data2 <- data.frame(testData[,2], z, testData[,1], fdrData,
                            stringsAsFactors = FALSE)
        rownames(data2) <- rownames(testData)
        need0 <- which(data2$pvalue == 1 &
                           data2$class == "0" &
                           data2$fdrData == 1)
        data2[need0, "z"] <- 0
    }
    colnames(data2) <- c("UP/DOWN", "statistic", "p.value", "FDRp.value")
    data2[data2$statistic > 0,"UP/DOWN"] <- "UP"
    data2[data2$statistic < 0,"UP/DOWN"] <- "DOWN"
    return(data2)
}


wilcoxsign_test_fun <- function(x, control, disease){
    r <- try(coin::wilcoxsign_test(
        as.numeric(x[disease]) ~ as.numeric(x[control]), showWarnings = FALSE))
    if (is(r, "try-error")){
        pvalue <- 1
        class <- "0"
        stat <- 0
    } else{
        pvalue <- coin::pvalue(r)
        stat <- coin::statistic(r, "standardized")[1]
        if(stat > 0){
            class <- "UP"
        }else if (stat < 0){
            class <- "DOWN"
        }else{
            class <- "0"
        }
    }
    result <- data.frame(pvalue = pvalue,
                         class = class,
                         stat = stat,
                         stringsAsFactors = FALSE)
    return(result)
}


#'@importFrom stats wilcox.test
wilcox_test_fun <- function(x, control, disease, paired){
    r <- try(stats::wilcox.test(x = as.numeric(x[disease]),
                                y = as.numeric(x[control]),
                                conf.int = TRUE,
                                alternative = "two.sided",
                                paired = paired),
             silent = TRUE)
    result <- wilcox_data_frame(r)
    return(result)
}


wilcox_data_frame <- function(wilcox){
    if (is(wilcox, "try-error")){
        pvalue <- 1
        class <- "0"
        stat <- 0
    } else{
        pvalue <- wilcox$p.value
        esti <- wilcox$estimate
        stat <- wilcox$statistic[[1]]
        if (esti < 0){
            class <- "DOWN" ## regarding DISEASE
        } else if (esti > 0){
            class <- "UP" ## regarding DISEASE
        } else if (esti == 0){
            if (wilcox$conf.int[1] == 0){
                class <- "UP"
            } else if (wilcox$conf.int[2] == 0){
                class <- "DOWN"
            } else{
                class <- 0
            }
        }
    }
    result <- data.frame(pvalue, class, stat,stringsAsFactors=FALSE)
    return(result)
}


#'
#' Performs a Principal Components Analysis
#'
#' @param data SummarizedExperiment or matrix of values to be analyzed. Samples
#' must be represented in the columns.
#' @param sel_assay Character or integer, indicating the assay to be normalized
#' in the SummarizedExperiment. Default is 1.
#' @param cor A logical value indicating whether the calculation should use
#' the correlation matrix or the covariance matrix. (The correlation matrix
#' can only be used if there are no constant variables.)
#'
#' @return \code{do_pca} returns a list with class \code{princomp}.
#'
#' @examples
#' data(path_vals)
#' pca_model <- do_pca(path_vals[seq_len(ncol(path_vals)),])
#'
#' @export
#' @importFrom stats princomp
#' @importFrom methods is
#'
do_pca <- function(data, sel_assay = 1, cor = FALSE){
    if(is(data, "SummarizedExperiment")){
        data <- assay(data, sel_assay)
    }else if(is(data, "matrix")){
        data <- data
    }else{
        stop("Only SummarizedExperiment or matrix classes accepted as data")
    }
    fit <- stats::princomp(t(data), cor = cor)
    fit$var <- fit$sdev^2
    fit$explain_var <- fit$var/sum(fit$var)
    fit$acum_explain_var <- cumsum(fit$explain_var)
    return(fit)
}


do_limma <- function(data, groups, expdes, g2 = NULL, sel_assay = 1,
                     order = FALSE){

    if(is(data, "SummarizedExperiment")){
        se_flag <- TRUE
        if(is(groups, "character") & length(groups) == 1)
            if(groups %in% colnames(colData(data))){
                groups <- colData(data)[[groups]]
            }else{
                stop("Group variable must be a column in colData(data)")
            }
        vals <- assay(data, sel_assay)
    }else if(is(data, "matrix")){
        se_flag <- FALSE
        vals <- data
    }else{
        stop("Only SummarizedExperiment or matrix classes accepted as data")
    }

    if(!is.null(g2))
        expdes <- paste(expdes, "-", g2)

    # Make contrasts
    design <- stats::model.matrix(~0 + factor(groups))
    colnames(design) <- levels(factor(groups))
    rownames(design) <- colnames(vals)
    cont_matrix <- limma::makeContrasts(contrasts = expdes, levels = design)

    # Remove nodes with 0 variance
    vars <- apply(vals, 1, var)
    novar <- vals[-which(vars == 0),]

    # Do analysis
    fit <- limma::lmFit(novar, design)
    fit1 <- limma::contrasts.fit(fit, cont_matrix)
    fit2 <- limma::eBayes(fit1)
    tt <- limma::topTable(fit2, coef = 1, number = "all", sort.by = "none")

    # Add nodes with 0 variance to results table
    ttnovar <- data.frame(path = names(vars[vars == 0]), logFC = 0, AveExpr = 1,
                          t = -8, P.Value = 1, adj.P.Val = 1, B = -10)
    rownames(ttnovar) <- ttnovar$path
    ttnovar <- ttnovar[,-1]
    tt <- rbind(tt, ttnovar)
    tt <- tt[rownames(vals),]

    # Create results data frame
    updown <- c("UP", "DOWN", "UP")
    names(updown) <- c("1", "-1", "0")
    ud <- updown[as.character(sign(tt$logFC))]
    comp <- data.frame(ud, tt$t, tt$P.Value, tt$adj.P.Val,
                       stringsAsFactors = FALSE)
    colnames(comp) <- c("UP/DOWN", "statistic", "p.value", "FDRp.value")
    rownames(comp) <- rownames(tt)

    if(se_flag == TRUE && "feat.name" %in% colnames(rowData(data)))
        comp <- cbind(name = rowData(data)[["feat.name"]], comp)
    if(order == TRUE)
        comp <- comp[order(comp$p.value, decreasing = FALSE),]
    return(comp)
}


#' Compute pathway summary
#'
#' Computes a summary of the results, summarizing the number and proportion
#' of up- and down-regulated subpathways in each pathway.
#'
#' @param comp Comparison data frame as returned by the \code{do_wilcoxon}
#' function.
#' @param metaginfo Pathways object
#' @param conf Level of significance of the comparison for the adjusted
#' p-value. Default is 0.05.
#'
#' @return Table with the summarized information for each of the pathways.
#' Rows are the analized pathways. Columns are:
#' * \code{num_total_paths} Number of total subpathways in which each pathway
#' is decomposed.
#' * \code{num_significant_paths} Number of significant subpathways in the
#' provided comparison.
#' * \code{percent_significant_paths} Percentage of significant subpathways
#' from the total number of subpathways in a pathway.
#' * \code{num_up_paths} Number of significant up-regulated subpathways in the
#' provided comparison.
#' * \code{percent_up_paths} Percentage of significant up-regulated subpathways
#' from the total number of subpathways in a pathway.
#' * \code{num_down_paths} Number of significant down-regulated subpathways in
#' the provided comparison.
#' * \code{percent_down_paths} Percentage of significant down-regulated
#' subpathways from the total number of subpathways in a pathway.
#'
#' @examples
#' data(comp)
#' pathways <- load_pathways(species = "hsa", pathways_list = c("hsa03320",
#' "hsa04012"))
#' get_pathways_summary(comp, pathways)
#'
#' @export
#'
get_pathways_summary <- function(comp, metaginfo, conf = 0.05){
    comp$pathways <- sapply(strsplit(rownames(comp), split = "-"), "[[", 2)
    local_paths <- unique(metaginfo$all.labelids[,c("path.id", "path.name")])
    id_pathways <- local_paths[,"path.id"]
    name_pathways <- local_paths[,"path.name"]
    summ <- lapply(id_pathways, function(pathway){
        minicomp <- comp[comp$pathways == pathway,]
        num_total_paths <- nrow(minicomp)
        num_sig_paths <- sum(minicomp$FDRp.value < conf)
        percent_sig_paths <- round(num_sig_paths/nrow(minicomp)*100, digits = 2)
        is_up_path <- minicomp$FDRp.value < conf & minicomp$`UP/DOWN` == "UP"
        num_up_paths <- sum(is_up_path)
        percent_up_paths <- round(num_up_paths/nrow(minicomp) * 100, digits = 2)
        is_down_path <- minicomp$FDRp.value < conf &
            minicomp$`UP/DOWN` == "DOWN"
        num_down_paths <- sum(is_down_path)
        percent_down_paths <- round(num_down_paths/nrow(minicomp) * 100,
                                    digits = 2)
        data.frame(num_total_paths = num_total_paths,
                   num_significant_paths = num_sig_paths,
                   percent_significant_paths = percent_sig_paths,
                   num_up_paths = num_up_paths,
                   percent_up_paths = percent_up_paths,
                   num_down_paths = num_down_paths,
                   percent_down_paths = percent_down_paths)
    })
    summ <- do.call("rbind", summ)
    summ <- cbind(id_pathways, summ)
    rownames(summ) <- name_pathways
    summ <- summ[order(summ$percent_significant_paths, decreasing = TRUE),]
    return(summ)
}


#' Computes pathway significance
#'
#' Performs a test for each pathway checking if the number of significant
#' paths is significant, compared to not having any of the paths as significant.
#'
#' @param comp Comparison data frame as returned by the \code{do_wilcoxon}
#' function.
#'
#' @return
#' Table with the names of the pathways and their p-value for the Fisher test
#' comparing the proportion of significant subpaths vs. 0.
#'
#' @examples
#' data(comp)
#' top_pathways(comp)
#'
#' @export
#'
top_pathways <- function(comp){

    if("name" %in% colnames(comp)){
        path_names <- as.character(comp$name)
        comp$pathways <- sapply(strsplit(path_names, split = ":"), "[[", 1)
    }else{
        path_names <- rownames(comp)
        comp$pathways <- sapply(strsplit(path_names, split = "-"), "[[", 2)
    }
    pathways <- unique(comp$pathways)

    tests <- do.call(rbind, lapply(pathways, function(path) {
        t1 <- table(comp[comp$pathways == path,"FDRp.value"] < 0.05)
        t2 <- c(sum(t1), 0)

        ft <- fisher.test(rbind(t1, t2))
        data.frame(pathway = path, pval = ft$p.value, stringsAsFactors = FALSE)
    }))

    tests <- tests[order(tests$pval),]

    return(tests)
}
martahidalgo/hipathia documentation built on Jan. 12, 2023, 1:44 p.m.