R/kallistoRes.R

Defines functions collectKallistoCounts collectKallistoCountsTXItoGene addKallistoResToTable consolidateData

Documented in addKallistoResToTable collectKallistoCounts collectKallistoCountsTXItoGene

#' Collects raw counts of gene (or transcript) data generated by kallisto.
#' @description Extracts raw counts from each of the "abundance.tsv" files generated by kallisto (\url{https://pachterlab.github.io/kallisto/about}) expression analysis. The counts are put into a matrix (genes by sample), and rounded. This step collects the raw count data ready for further expression analysis, or for differential expression, using \code{\link{DESeq2}}.
#' @param baseDir A character string of the directory where kallisto results are stored. For example, baseDir = "kallistoResults" will search "kallistoResults/*/abundance.tsv".
#' @return matrix. A matrix of (rounded) raw counts of genes (or transcripts) for each sample. 
#' @seealso kallisto (\url{https://pachterlab.github.io/kallisto/about}) 
#' @export
collectKallistoCounts <- function(baseDir = "./kallistoResults"){
        sample_id <- dir(baseDir)        
        consolData <- lapply(sample_id, consolidateData, baseDir) 
        kallistoRes <- do.call(cbind, consolData)
        return(as.matrix(kallistoRes))
}

#' Collects raw counts of genes, based on transcript data generated by kallisto.
#' @description Extracts raw counts of transcripts from each of the "abundance.tsv" files generated by kallisto (\url{https://pachterlab.github.io/kallisto/about}) expression analysis and converts this to gene counts. The counts are put into a matrix (genes by sample). This step collects the raw count data ready for further expression analysis, or for differential expression, using \code{\link{DESeq2}}.
#' @param baseDir A character string of the directory where kallisto results are stored. For example, baseDir = "kallistoResults" will search "kallistoResults/*/abundance.tsv".
#' @return matrix. A matrix of raw counts of genes for each sample. 
#' @import  tximport 
#' @import readr
#' @import tximportData
#' @export
collectKallistoCountsTXItoGene <- function(baseDir = "./kallistoResults"){

        importDir <- system.file("extdata", package="RNASeqAnalysis")
        tx2gene <- read.csv(file.path(importDir, "tx2gene.csv"))
        
        sample_id <- dir(baseDir)
        files <- file.path(baseDir, sample_id, "abundance.tsv")
        names(files) <- sample_id
        txi <- tximport(files, type="kallisto", tx2gene=tx2gene, reader=read_tsv)
        return(txi)
}

#' Adds the output kallisto directory to the original datafile.
#' @description GOES HERE
#' @param dataFile MORE
#' @param countDat MORE
#' @return dataframe MORE
#' @seealso \code{\link{collectKallistoCounts}}
#' @export
addKallistoResToTable <- function(dataFile, countDat){
        sample_id <- colnames(countDat)
        
        datKal <- lapply(sample_id, function(x) {
                datKallisto <- dataFile[grepl(paste("^", x, "(_1.fastq.gz|_2.fastq.gz|.fastq.gz)$", sep=""), dataFile$FILTEREDFILE), ]
                datKallisto$kallistoRes <- x
                return(datKallisto)
        } 
        )
        datKal <- do.call(rbind, datKal)
        datKal <- datKal[!duplicated(datKal$kallistoRes), ]
}

# private function
consolidateData <- function(sample_id, baseDir){
        baseDir <- "./kallistoResults"
        abundanceFile <- paste(baseDir, sample_id, "abundance.tsv", sep="/")
        abundance <- read.csv(abundanceFile, sep = "\t", row.names = "target_id")
        abundance <- round(abundance["est_counts"])
        colnames(abundance) <- sample_id
        return (abundance)
}
nixstix/RNASeqAnalysis documentation built on May 23, 2019, 7:06 p.m.