R/DE_edgeR.R

Defines functions erssa_edger_parallel erssa_edger

Documented in erssa_edger erssa_edger_parallel

#' @title Run edgeR for computed sample combinations
#'
#' @description
#' \code{erssa_edger} function runs classic edgeR method to identify
#' differentially expressed (DE) genes for each sample combination computed by
#' \code{comb_gen} function. A gene is considered to be
#' differentially expressed by defined FDR (Default=0.05) and logFC
#' (Default=1) values. As an option, the function can also save the edgeR
#' topTags tables as csv files to the drive.
#'
#' @details
#' The main function calls edgeR functions to perform exact test for each
#' computed combinations generated by \code{comb_gen}. In all tests, the
#' pair-wise test sets the condition defined in the object "control" as the
#' control condition.
#'
#' In typical usage, after each test, the list of differentially expressed genes
#' are filtered by FDR and log2FC values and only the filtered gene
#' names are saved for further analysis. However, it is also possible
#' to save all of the generated TopTags table to the drive for additional
#' analysis that is outside the scope of this package.
#'
#' @param count_table.filtered Count table pre-filtered to remove non- to
#' low- expressing genes. Can be the output of \code{count_filter} function.
#' @param combinations List of combinations that is produced by \code{comb_gen}
#' function.
#' @param condition_table A condition table with two columns and each sample as
#' a row. Column 1 contains sample names and Column 2 contains sample condition
#' (e.g. Control, Treatment).
#' @param control One of the condition names that will serve as control.
#' @param cutoff_stat The cutoff in FDR for DE consideration. Genes with lower
#' FDR pass the cutoff. Default = 0.05.
#' @param cutoff_Abs_logFC The cutoff in abs(logFC) for differential expression
#' consideration. Genes with higher abs(logFC) pass the cutoff. Default = 1.
#' @param save_table Boolean. When set to TRUE, function will, in addition, save
#' the generated edgeR TopTags table as csv files. The files are saved on the
#' drive in the working directory in a new folder named "ERSSA_edgeR_table".
#' Tables are saved separately by the replicate level. Default = FALSE.
#' @param path Path to which the files will be saved. Default to current working
#' directory.
#'
#' @return A list of list of vectors. Top list contains elements corresponding to
#' replicate levels. Each child list contains elements corresponding to each
#' combination at the respective replicate level. The child vectors contain
#' differentially expressed gene names.
#'
#' @author Zixuan Shao, \email{Zixuanshao.zach@@gmail.com}
#'
#' @examples
#' # load example filtered count_table, condition_table and combinations
#' # generated by comb_gen function
#' # example dataset containing 1000 genes, 4 replicates and 5 comb. per rep.
#' # level
#' data(count_table.filtered.partial, package = "ERSSA")
#' data(combinations.partial, package = "ERSSA")
#' data(condition_table.partial, package = "ERSSA")
#'
#' #run erssa_edger with heart condition as control
#' deg.partial = erssa_edger(count_table.filtered.partial, combinations.partial,
#'  condition_table.partial, control='heart')
#'
#' @references
#' Robinson MD, McCarthy DJ, Smyth GK (2010). “edgeR: a Bioconductor package
#' for differential expression analysis of digital gene expression data.”
#' Bioinformatics, 26(1), 139-140.
#'
#' @export
#'
#' @importFrom utils write.csv
#' @importFrom edgeR DGEList
#' @importFrom edgeR calcNormFactors
#' @importFrom edgeR estimateCommonDisp
#' @importFrom edgeR estimateTagwiseDisp
#' @importFrom edgeR exactTest
#' @importFrom edgeR topTags

erssa_edger = function(count_table.filtered=NULL, combinations=NULL,
                       condition_table=NULL, control=NULL, cutoff_stat = 0.05,
                       cutoff_Abs_logFC = 1, save_table=FALSE, path='.'){

    # check all required arguments supplied
    if (is.null(count_table.filtered)){
        stop(paste0('Missing required count_table.filtered argument in ',
                    'erssa_edger function'))
    } else if (is.null(combinations)){
        stop("Missing required combinations argument in erssa_edger function")
    } else if (is.null(condition_table)){
        stop(paste0("Missing required condition_table argument in erssa_edger",
                    " function"))
    } else if (is.null(control)){
        stop("Missing required control argument in erssa_edger function")
    } else if (!(is.data.frame(count_table.filtered))){
        stop('count_table is not an expected data.frame object')
    } else if (length(unique(sapply(count_table.filtered, class)))!=1){
        stop(paste0('More than one data type detected in count table, please',
                    ' make sure count table contains only numbers and that the',
                    ' list of gene names is the data.frame index'))
    } else if (!(is.data.frame(condition_table))){
        stop('condition_table is not an expected data.frame object')
    }

    message(paste0('Start classic edgeR Exact Test with FDR cutoff = ',
                   cutoff_stat,
                   ', Abs(logFC) cutoff = ', cutoff_Abs_logFC,'\n'))
    message(paste0('Save topTags tables to drive: ', save_table,'\n'))

    # rename input condition table column name
    colnames(condition_table) = c('sample_name','condition')

    # check control is one of the two conditions
    if (!(control %in% condition_table$condition)){
        stop('Control name does not match one of the two sample conditions')
    }

    if (save_table==TRUE){
        # create dir to save results
        folder_path = file.path(path, 'ERSSA_edgeR_table')
        dir.create(folder_path, showWarnings = FALSE)
    }

    # loop through each replicate level
    DE_genes = lapply(names(combinations), function(rep_level) {

        comb_rl = combinations[rep_level][[1]]

        if (save_table==TRUE){
            # create dir to save results
            folder_path = file.path(path, 'ERSSA_edgeR_table', rep_level)
            dir.create(folder_path, showWarnings = FALSE)
        }

        DE_genes_rl = lapply(seq_along(comb_rl), function(index) {

            # combination string to vector
            comb_samples_i = strsplit(comb_rl[index], ';')[[1]]

            # parse out sample counts
            count_table.filtered_i = count_table.filtered[,colnames(
                count_table.filtered) %in% comb_samples_i]

            # edgeR normalization, dispersion estimation
            group = sapply(colnames(count_table.filtered_i), function(x)
                condition_table$condition[which(
                    condition_table$sample_name==x)])
            y = DGEList(counts=count_table.filtered_i, group=group)
            y = calcNormFactors (y)
            y = estimateCommonDisp (y)
            y = estimateTagwiseDisp (y)

            # DE test on all genes
            uniq_cond = unique(condition_table$condition)
            et = exactTest(y, pair=c(control, as.character(
                uniq_cond[uniq_cond != control])))
            DE_et = topTags(et, n=Inf)$table
            DE_et_cutoff = DE_et[which(DE_et$FDR < cutoff_stat &
                                       abs(DE_et$logFC) > cutoff_Abs_logFC),]

            # save entire topTags table to drive
            if (save_table==TRUE){
                write.csv(DE_et, file = file.path(folder_path,paste0(
                    'ERSSA_edgeR_',rep_level,'_comb',index,'.csv')))
            }

            message(paste0(rep_level,'; combination_',index,' | done\n'))

            # save list of DE genes to list
            return(rownames(DE_et_cutoff))
        })

        # rename list name
        names(DE_genes_rl) = paste0('comb_', seq_along(DE_genes_rl))

        # add DE gene lists to main list
        return(DE_genes_rl)

    })

    names(DE_genes) = names(combinations)


    return(DE_genes)
}




#' @title Run edgeR for computed sample combinations with parallel computing
#'
#' @description
#' \code{erssa_edger_parallel} function performs the same calculation as
#' \code{erssa_edger} except now employs BiocParallel to perform parallel
#' edgeR calculations. This function runs classic edgeR method to identify
#' differentially expressed (DE) genes for each sample combination computed by
#' \code{comb_gen} function. A gene is considered to be
#' differentially expressed by defined FDR (Default=0.05) and logFC
#' (Default=1) values. As an option, the function can also save the edgeR
#' topTags tables as csv files to the drive.
#'
#' @details
#' The main function calls edgeR functions to perform exact test for each
#' computed combinations generated by \code{comb_gen}. In all tests, the
#' pair-wise test sets the condition defined in the object "control" as the
#' control condition.
#'
#' In typical usage, after each test, the list of differentially expressed genes
#' are filtered by FDR and log2FC values and only the filtered gene
#' names are saved for further analysis. However, it is also possible
#' to save all of the generated TopTags table to the drive for additional
#' analysis that is outside the scope of this package.
#'
#' @param count_table.filtered Count table pre-filtered to remove non- to low-
#' expressing genes. Can be the output of \code{count_filter} function.
#' @param combinations List of combinations that is produced by \code{comb_gen}
#' function.
#' @param condition_table A condition table with two columns and each sample as
#' a row. Column 1 contains sample names and Column 2 contains sample condition
#' (e.g. Control, Treatment).
#' @param control One of the condition names that will serve as control.
#' @param cutoff_stat The cutoff in FDR for DE consideration. Genes with lower
#' FDR pass the cutoff. Default = 0.05.
#' @param cutoff_Abs_logFC The cutoff in abs(logFC) for differential expression
#' consideration. Genes with higher abs(logFC) pass the cutoff. Default = 1.
#' @param save_table Boolean. When set to TRUE, function will, in addition, save
#' the generated edgeR TopTags table as csv files. The files are saved on the
#' drive in the working directory in a new folder named "ERSSA_edgeR_table".
#' Tables are saved separately by the replicate level. Default = FALSE.
#' @param path Path to which the files will be saved. Default to current working
#' directory.
#' @param num_workers Number of workers for parallel computing. Default=1.
#'
#' @return A list of list of vectors. Top list contains elements corresponding to
#' replicate levels. Each child list contains elements corresponding to each
#' combination at the respective replicate level. The child vectors contain
#' differentially expressed gene names.
#'
#' @author Zixuan Shao, \email{Zixuanshao.zach@@gmail.com}
#'
#' @examples
#' # load example filtered count_table, condition_table and combinations
#' # generated by comb_gen function
#' # example dataset containing 1000 genes, 4 replicates and 5 comb. per rep.
#' # level
#' data(count_table.filtered.partial, package = "ERSSA")
#' data(combinations.partial, package = "ERSSA")
#' data(condition_table.partial, package = "ERSSA")
#'
#' # run erssa_edger_parallel with heart condition as control
#' deg.partial = erssa_edger_parallel(count_table.filtered.partial,
#' combinations.partial, condition_table.partial,
#' control='heart', num_workers=1)
#'
#' @references
#' Morgan M, Obenchain V, Lang M, Thompson R, Turaga N (2018). BiocParallel:
#' Bioconductor facilities for parallel evaluation. R package version 1.14.1,
#' https://github.com/Bioconductor/BiocParallel.
#'
#' Robinson MD, McCarthy DJ, Smyth GK (2010). “edgeR: a Bioconductor package
#' for differential expression analysis of digital gene expression data.”
#' Bioinformatics, 26(1), 139-140.
#'
#' @export
#'
#' @importFrom utils write.csv
#' @importFrom edgeR DGEList
#' @importFrom edgeR calcNormFactors
#' @importFrom edgeR estimateCommonDisp
#' @importFrom edgeR estimateTagwiseDisp
#' @importFrom edgeR exactTest
#' @importFrom edgeR topTags
#' @importFrom BiocParallel SnowParam
#' @importFrom BiocParallel bplapply

erssa_edger_parallel = function(count_table.filtered=NULL, combinations=NULL,
                                condition_table=NULL, control=NULL,
                                cutoff_stat = 0.05, cutoff_Abs_logFC = 1,
                                save_table=FALSE, path='.', num_workers=1){

    # check all required arguments supplied
    if (is.null(count_table.filtered)){
        stop(paste0('Missing required count_table.filtered argument in ',
                    'erssa_edger_parallel function'))
    } else if (is.null(combinations)){
        stop(paste0("Missing required combinations argument in ",
                    "erssa_edger_parallel function"))
    } else if (is.null(condition_table)){
        stop(paste0("Missing required condition_table argument in ",
                    "erssa_edger_parallel function"))
    } else if (is.null(control)){
        stop(paste0("Missing required control argument in erssa_edger_parallel",
                    " function"))
    } else if (!(is.data.frame(count_table.filtered))){
        stop('count_table is not an expected data.frame object')
    } else if (length(unique(sapply(count_table.filtered, class)))!=1){
        stop(paste0('More than one data type detected in count table, please ',
                    'make sure count table contains only numbers and that the ',
                    'list of gene names is the data.frame index'))
    } else if (!(is.data.frame(condition_table))){
        stop('condition_table is not an expected data.frame object')
    }

    message(paste0('Start classic edgeR Exact Test with FDR cutoff = ',
                   cutoff_stat,
                   ', Abs(logFC) cutoff = ', cutoff_Abs_logFC,'\n'))
    message(paste0('Save topTags tables to drive: ', save_table,'\n'))
    message(paste0('Run parallel edgeR with ',num_workers, ' workers\n'))

    # rename input condition table column name
    colnames(condition_table) = c('sample_name','condition')

    # check control is one of the two conditions
    if (!(control %in% condition_table$condition)){
        stop('Control name does not match one of the two sample conditions')
    }

    folder_path = file.path(path, 'ERSSA_edgeR_table')
    if (save_table==TRUE){
        # create dir to save results
        dir.create(folder_path, showWarnings = FALSE)
    }

    # loop through each replicate level
    DE_genes = lapply(names(combinations), function(rep_level) {

        comb_rl = combinations[rep_level][[1]]

        if (save_table==TRUE){
            # create dir to save results
            folder_path = file.path(path, 'ERSSA_edgeR_table', rep_level)
            dir.create(folder_path, showWarnings = FALSE)
        }

        edger_par = function(index, rep_level, comb_rl, count_table.filtered,
                             condition_table, control, cutoff_stat,
                             cutoff_Abs_logFC, folder_path, save_table){

            # combination string to vector
            comb_samples_i = strsplit(comb_rl[index], ';')[[1]]

            # parse out sample counts
            count_table.filtered_i = count_table.filtered[,colnames(
                count_table.filtered) %in% comb_samples_i]

            # edgeR normalization, dispersion estimation
            group = sapply(colnames(count_table.filtered_i), function(x)
                condition_table$condition[which(
                    condition_table$sample_name==x)])
            y = edgeR::DGEList(counts=count_table.filtered_i, group=group)
            y = edgeR::calcNormFactors (y)
            y = edgeR::estimateCommonDisp (y)
            y = edgeR::estimateTagwiseDisp (y)

            # DE test on all genes
            uniq_cond = unique(condition_table$condition)
            et = edgeR::exactTest(y, pair=c(control, as.character(
                uniq_cond[uniq_cond != control])))
            DE_et = edgeR::topTags(et, n=Inf)$table
            DE_et_cutoff = DE_et[which(DE_et$FDR < cutoff_stat &
                                       abs(DE_et$logFC) > cutoff_Abs_logFC),]

            # save entire result table to drive
            if (save_table==TRUE){
                write.csv(DE_et, file = file.path(folder_path,paste0(
                    'ERSSA_edgeR_',rep_level,'_comb',index,'.csv')))
            }

            return(rownames(DE_et_cutoff))
        }

        param = SnowParam(workers = num_workers, type = "SOCK")

        DE_genes_rl=bplapply(X=seq_along(comb_rl),
                             FUN=edger_par, BPPARAM = param,
                             rep_level, comb_rl, count_table.filtered,
                             condition_table, control, cutoff_stat,
                             cutoff_Abs_logFC, folder_path,
                             save_table)
        names(DE_genes_rl) = sapply(seq_along(DE_genes_rl),
                                    function(x) paste0('comb_',x))

        # add DE gene lists to main list
        message(paste0(rep_level,' combinations | done\n'))

        return(DE_genes_rl)
    })

    names(DE_genes) = names(combinations)

    return(DE_genes)
}
zshao1/ERSSA documentation built on July 19, 2023, 9:20 p.m.