R/05_03_quantification_featureCounts.R

Defines functions featureCounts2se featureCounts

Documented in featureCounts featureCounts2se

#' Count reads with featureCounts
#' 
#' @param sample_info Data frame of sample metadata created with the
#' functions \code{create_sample_info} and \code{infer_strandedness}.
#' @param mappingdir Directory where .bam files are stored.
#' @param gff_path Path to GFF/GTF file with annotations.
#' @param fcountsdir Directory where the matrix of gene-level read counts will
#' be stored.
#' @param threads Number of threads for featureCounts. Default: 2.
#' 
#' @return A gene expression matrix with genes in row names and samples in 
#' column names. Two .tsv files with the gene expression matrix and count stats
#' are saved to `fcountsdir`.
#' @importFrom rtracklayer import export
#' @importFrom tools file_ext
#' @importFrom Rsubread featureCounts
#' @importFrom utils write.table
#' @export
#' @rdname featureCounts
#' @examples 
#' data(sample_info)
#' mappingdir <- system.file("extdata", package="bears")
#' gff_path <- system.file("extdata", "Homo_sapiens.GRCh37.75_subset.gtf",
#'                         package="bears")
#' fcountsdir <- file.path(tempdir(), "fcountsdir")
#' if(subread_is_installed()) {
#'     counts <- featureCounts(sample_info, mappingdir, gff_path, fcountsdir)
#' }
featureCounts <- function(sample_info = NULL, 
                          mappingdir = "results/04_read_mapping",
                          gff_path = NULL,
                          fcountsdir = "results/05_quantification/featureCounts",
                          threads = 2) {
    
    if(!subread_is_installed()) { stop("Unable to find subread in PATH.") }
    if(!dir.exists(fcountsdir)) { dir.create(fcountsdir, recursive = TRUE) }
    
    bamfiles <- list.files(mappingdir, pattern = ".bam", full.names = FALSE)
    bamfiles <- gsub("Aligned.*", "", bamfiles)
    sample_meta <- sample_info[!duplicated(sample_info$BioSample), ]
    sample_meta <- sample_meta[sample_meta$BioSample %in% bamfiles, ]
    
    is_paired <- ifelse(sample_meta$Layout == "PAIRED", TRUE, FALSE)
    strandedness <- as.integer(vapply(seq_len(nrow(sample_meta)), function(x) {
        return(translate_strandedness(sample_meta[x, "Orientation"], 
                                      sample_meta[x, "Layout"])$fcounts)
    }, numeric(1)))
    file <- paste0(mappingdir, "/", 
                   sample_meta$BioSample, "Aligned.sortedByCoord.out.bam")
    
    if(tools::file_ext(gff_path) == "gff3") { 
        gff <- rtracklayer::import(gff_path) 
        gff_path <- tempfile(fileext = ".gtf")
        export <- rtracklayer::export(gff, gff_path)
    }
    fc <- Rsubread::featureCounts(
        file, annot.ext = gff_path, nthreads = threads, minMQS = 20,
        isPairedEnd = is_paired, strandSpecific = strandedness,
        isGTFAnnotationFile = TRUE
    )
    exp_matrix <- fc$counts
    colnames(exp_matrix) <- gsub("Aligned.*", "", colnames(exp_matrix))
    utils::write.table(exp_matrix, quote=FALSE, sep = "\t", row.names = TRUE,
                file = paste0(fcountsdir, "/exp_matrix.tsv"))
    write.table(fc$stat, quote=FALSE, sep="\t", row.names = FALSE, 
                file = paste0(fcountsdir, "/count_stats_per_sample.tsv"))
    return(exp_matrix)
}


#' Create a SummarizedExperiment object from featureCounts output
#'
#' @param sample_info Data frame of sample metadata created with the
#' functions \code{create_sample_info}.
#' @param fc_output Either a gene expression matrix generated by \code{fcount}, 
#' or the path to the .tsv file containing the gene expression matrix as
#' generated by \code{fcount}.
#' 
#' @return A SummarizedExperiment object.
#' @rdname featureCounts2se
#' @export
#' @importFrom SummarizedExperiment SummarizedExperiment
#' @importFrom methods is
#' @importFrom utils read.table
#' @examples 
#' data(sample_info)
#' mappingdir <- system.file("extdata", package="bears")
#' gff_path <- system.file("extdata", "Homo_sapiens.GRCh37.75_subset.gtf",
#'                         package="bears")
#' fcountsdir <- file.path(tempdir(), "fcountsdir")
#' if(subread_is_installed()) {
#'     counts <- featureCounts(sample_info, mappingdir, gff_path, fcountsdir)
#'     se <- featureCounts2se(sample_info, counts)
#' }
featureCounts2se <- function(sample_info = NULL, 
                             fc_output = NULL) {
    if(!is(fc_output, "matrix")) { fc_output <- utils::read.table(fc_output) }
    
    sample_metadata <- sample_info[!duplicated(sample_info$BioSample), ]
    samples <- colnames(fc_output)
    coldata <- sample_metadata[sample_metadata$BioSample %in% samples, ]
    rownames(coldata) <- coldata$BioSample
    coldata$BioSample <- NULL
    
    se <- SummarizedExperiment::SummarizedExperiment(
        assays = list(fcounts = fc_output), colData = coldata
    )
    return(se)
}
almeidasilvaf/GEAtlas documentation built on April 14, 2023, 6:58 p.m.