ballgown: Flexible, isoform-level differential expression analysis

Documented in ballgown

### constructor function for ballgown objects

### helper functions:
.readIntron <- function(file, meas){
    cc = c('integer', 'character', 'factor', 'integer', 'integer', 'integer',
        'integer', 'numeric')
    if(!identical(meas, 'all')){
        dummy = c(rep(meas[1], 5), 'rcount', 'ucount', 'mrcount')
        cc[which(!(dummy %in% meas))] = 'NULL'
    }
    intron = read.table(file, header=TRUE, sep="\t", colClasses=cc, quote="")
    intron = intron[order(intron$i_id), ]
    rownames(intron) = 1:nrow(intron)
    return(intron)
}

.readExon <- function(file, meas) {
    cc = c('integer', 'character', 'factor', 'integer', 'integer',
        'integer', 'integer', rep('numeric', 5))
    if(!identical(meas, 'all')){
        dummy = c(rep(meas[1], 5), 'rcount', 'ucount', 'mrcount', 'cov',
            'cov_sd', 'mcov', 'mcov_sd')
        cc[which(!(dummy %in% meas))] = 'NULL'
    }
    exon = read.table(file, header=TRUE, sep="\t", colClasses=cc, quote="")
    exon = exon[order(exon$e_id), ]
    rownames(exon) = 1:nrow(exon)
    return(exon)
}

.readTranscript <- function(file, meas) {
    cc = c('integer', 'character', 'factor', 'integer', 'integer',
        'character', 'integer', 'integer', 'character', 'character',
        'numeric', 'numeric')
    if(!identical(meas, 'all')){
        if(!('cov' %in% meas)) cc[11] = 'NULL'
        if(!('FPKM' %in% meas)) cc[12] = 'NULL'
    }
    transcript = read.table(file, header=TRUE, sep="\t",
        colClasses=cc, quote="")
    transcript = transcript[order(transcript$t_id), ]
    rownames(transcript) = 1:nrow(transcript)
    return(transcript)
}

# make it so I can write warning/messages and whatnot that respect the 80char
# limit in my script, but still look pretty when displayed.
.makepretty = function(x){
    msg = gsub('\n', ' ', x)
    msg = gsub('    ', '', msg)
    msg
}

#' constructor function for ballgown objects
#'
#' @name ballgown-constructor
#'
#' @param samples vector of file paths to folders containing sample-specific
#'   ballgown data (generated by \code{tablemaker}).  If \code{samples} is
#'   provided, \code{dataDir} and \code{samplePattern} are not used.
#' @param dataDir file path to top-level directory containing sample-specific
#'   folders with ballgown data in them.  Only used if \code{samples} is NULL.
#' @param samplePattern regular expression identifying the subdirectories of\
#'   \code{dataDir} containing data to be loaded into the ballgown object (and
#'   only those subdirectories).  Only used if \code{samples} is NULL.
#' @param bamfiles optional vector of file paths to read alignment files for
#'    each sample.  If provided, make sure to sort properly (e.g., in the same
#'    order as \code{samples}).  Default NULL.
#' @param pData optional \code{data.frame} with rows corresponding to samples and
#'    columns corresponding to phenotypic variables.
#' @param verbose if \code{TRUE}, print status messages and timing information
#'    as the object is constructed.
#' @param meas character vector containing either "all" or one or more of:
#'   "rcount", "ucount", "mrcount", "cov", "cov_sd", "mcov", "mcov_sd", or
#'   "FPKM". The resulting ballgown object will only contain the specified
#'   expression measurements, for the appropriate features. See vignette for
#'   which expression measurements are available for which features. "all"
#'   creates the full object.
#'
#' @details Because experimental data is recorded so variably, it is the user's
#'   responsibility to format \code{pData} correctly.  In particular, it's
#'   really important that the rows of \code{pData} (corresponding to samples)
#'   are ordered the same way as \code{samples} or the
#'   \code{dataDir}/\code{samplePattern} combo. You can run
#'   \code{list.files(path = dataDir, pattern = samplePattern)} to see the sample
#'   order if \code{samples} was not used.
#'
#' If you are creating a ballgown object for a large experiment, this function
#' may run slowly and use a large amount of RAM. We recommend running this
#' constructor as a batch job and saving the resulting ballgown object as an rda
#' file.  The rda file usually has reasonable size on disk, and the object in it
#' shouldn't take up too much RAM when loaded, so the time and memory use in
#' creating the object is a one-time cost.
#'
#' @return an object of class \code{ballgown}
#'
#' @author Leonardo Collado-Torres, Alyssa Frazee
#'
#' @rdname ballgown-constructor
#'
#' @seealso \code{\link{ballgownrsem}}, for loading RSEM output into a ballgown
#'   object
#' @export
#' @examples
#' bg = ballgown(dataDir=system.file('extdata', package='ballgown'),
#'     samplePattern='sample')
#' pData(bg) = data.frame(id=sampleNames(bg), group=rep(c(1,0), each=10))
ballgown = function(samples=NULL, dataDir=NULL, samplePattern=NULL,
    bamfiles = NULL, pData = NULL, verbose=TRUE, meas="all"){
    if(verbose) message(date())

    if(all(c(is.null(samples),is.null(dataDir),is.null(samplePattern)))){
        stop(.makepretty('must provide either "samples" or both "dataDir" and
            "samplePattern"'))
    }

    if(length(setdiff(meas, c("rcount", "ucount", "mrcount", "cov", 
        "cov_sd", "mcov", "mcov_sd", "FPKM", "all"))) != 0){
        msg = 'meas can be either "all" or one or more of "rcount", 
            "ucount", "cov", "cov_sd", "mcov", "mcov_sd", or "FPKM". See
            vignette for details.'
        stop(.makepretty(msg))
    }
    if('all' %in% meas & length(meas) > 1){
        stop(.makepretty('when meas is "all", all types of measurements are
            included by default.'))
    }

    ## Determine where data is located
    if(is.null(samples)){
        if(is.null(samplePattern)|is.null(dataDir)){
            stop(.makepretty('must provide both "dataDir" and "samplePattern"
                if "samples" is NULL.'))
        }
        samples = list.files(path=dataDir, 
            pattern=samplePattern, full.names=TRUE)
        names(samples) = list.files(path=dataDir, pattern=samplePattern)
    }else{
        names(samples) = sapply(samples, function(x){
            tail(strsplit(x, split="/")[[1]],n=1)
        }, USE.NAMES=FALSE)
    }

    ## check to see if you actually have ballgown data:
    n = length(samples)
    subfiles = list.files(samples)
    ctabs = subfiles[subfiles %in% 
        c('e_data.ctab', 'i_data.ctab', 't_data.ctab', 'e2t.ctab', 'i2t.ctab')]
    if(length(ctabs) != 5*n){
        msg = 'Something is wrong: are you missing .ctab files? Do extra
        files/folders (other than tablemaker output folders) match your
        samples/dataDir/samplePattern argument(s)?'
        stop(.makepretty(msg))
    }

    ## Read tables linking exons/introns to transcripts
    if(verbose) message(paste0(date(), ": Reading linking tables"))
    e2t = read.table(list.files(samples[1], "e2t.ctab", full.names=TRUE), 
        header=TRUE, sep="\t", colClasses=c("integer", "integer"))
    i2t = read.table(list.files(samples[1], "i2t.ctab", full.names=TRUE), 
        header=TRUE, sep="\t", colClasses=c("integer", "integer"))

    ## Order by transcript id
    e2t = e2t[order(e2t$t_id), ]
    i2t = i2t[order(i2t$t_id), ]
    rownames(e2t) = 1:nrow(e2t)
    rownames(i2t) = 1:nrow(i2t)

    ## Read counts for all introns 
    if(verbose) message(paste0(date(), ": Reading intron data files"))

    # if intron data is requested:
    if( any(c('all', 'rcount', 'ucount', 'mrcount') %in% meas)){
        intronFiles = sapply(samples, list.files, pattern="i_data.ctab", 
            full.names=TRUE)
        intronAll = NULL
        for(f in intronFiles){
            ind = which(intronFiles == f)
            intronAll[[ind]] = .readIntron(f, meas)
        }

        ## Merge the intron results
        if(verbose) message(paste0(date(), ": Merging intron data"))
        #[ensure ctab files all contain same introns]
        sumdiff = sapply(intronAll, function(x){
            sum(x$i_id != intronAll[[1]]$i_id) 
        })
        if(!(all(sumdiff==0))){
            msg = 'intron ids were either not the same or not in the same order
                across samples. double check i_data.ctab for each sample.' 
            stop(.makepretty(msg))
        }

        idataOnly = lapply(intronAll, function(x) x[,-c(1:5)] )

        intron = data.frame(intronAll[[1]][,1:5], as.data.frame(idataOnly))
        if(identical(meas, 'all')){
            colnames(intron) = c('i_id', 'chr', 'strand', 'start', 'end', 
                paste(c('rcount', 'ucount', 'mrcount'), 
                    rep(names(samples), each=3), sep="."))
        }else{
            meas_avail = intersect(c('rcount', 'ucount', 'mrcount'), meas) 
            #^order important! meas has to go after rcount/ucount/mrcount 
            #^because of file order. intersect() keeps in order of first thing.
            colnames(intron) = c('i_id', 'chr', 'strand', 'start', 'end',
                paste(meas_avail, rep(names(samples), each=length(meas_avail)), 
                    sep="."))
        }
    } else { # no intron data requested:
        f1 = list.files(samples[1], pattern='i_data.ctab', full.names=TRUE)
        intron = .readIntron(f1, meas)
        stopifnot(ncol(intron) == 5)
    }

    ## Make intron data into GRanges object
    #A. fix strand information for compatibility w/ GRanges
    intron$strand = as.character(intron$strand)
    intron$strand[intron$strand=="."] <- "*"
    #B. get names of transcripts each intron belongs to
    tnamesin = split(i2t$t_id, i2t$i_id)
    tnamesin.ord = as.character(tnamesin)[match(intron$i_id, names(tnamesin))]
    #C. make the GRanges object
    introngr = GRanges(seqnames=Rle(intron$chr), 
        ranges=IRanges(start=intron$start, end=intron$end),
        strand = Rle(intron$strand), 
        id=intron$i_id, 
        transcripts = tnamesin.ord)

    ## Read exon data
    if(verbose) message(paste0(date(), ": Reading exon data files"))

    # if any exon data is requested:
    emeas = c('all', 'rcount', 'ucount', 'mrcount', 'cov', 'cov_sd', 'mcov', 
        'mcov_sd')
    if( any(emeas %in% meas)){
        exonFiles = sapply(samples, list.files, pattern="e_data.ctab", 
            full.names=TRUE)
        exonAll = NULL
        for(f in exonFiles){
            ind = which(exonFiles == f)
            exonAll[[ind]] = .readExon(f, meas)
        }

        ## Merge the exon results
        if(verbose) message(paste0(date(), ": Merging exon data"))
        #[ensure ctab files all contain same exons]
        sumdiffex = sapply(exonAll, function(x){
            sum(x$e_id != exonAll[[1]]$e_id)
        })
        if(!(all(sumdiffex==0))){
            stop(.makepretty('exon ids were either not the same or not in the
                same order across samples. double check e_data.ctab for each
                sample.'))
        }

        edataOnly = lapply(exonAll, function(x) x[,-c(1:5)])
        exon = data.frame(exonAll[[1]][,1:5], as.data.frame(edataOnly))
        if(identical(meas, 'all')){
            colnames(exon) = c('e_id', 'chr', 'strand', 'start', 'end', 
                paste(emeas[-1], rep(names(samples), each=7), sep="."))
        }else{
            meas_avail = intersect(emeas[-1], meas) 
            #^order important! 
            colnames(exon) = c('e_id', 'chr', 'strand', 'start', 'end',
                paste(meas_avail, rep(names(samples), each=length(meas_avail)), 
                    sep="."))
        }
    } else { # no exon data requested:
        f1 = list.files(samples[1], pattern='e_data.ctab', full.names=TRUE)
        exon = .readExon(f1, meas)
        stopifnot(ncol(exon) == 5)
    }

    ## Make exon data into GRanges object
    #A. fix strand information for compatibility w/ GRanges
    exon$strand = as.character(exon$strand)
    exon$strand[exon$strand=="."] <- "*"
    #B. get names of transcripts each exon belongs to
    tnamesex = split(e2t$t_id, e2t$e_id)
    #C. make the GRanges object
    tnamesex.ord = as.character(tnamesex)[match(exon$e_id, names(tnamesex))]
    exongr = GRanges(seqnames=Rle(exon$chr), 
        ranges=IRanges(start=exon$start, end=exon$end), 
        strand=Rle(exon$strand), 
        id=exon$e_id, 
        transcripts=tnamesex.ord)

    ## Read transcript data
    if(verbose) message(paste0(date(), ": Reading transcript data files"))

    # if any transcript data requested:
    if( any(c('all', 'cov', 'FPKM') %in% meas)){
        tFiles = sapply(samples, list.files, pattern="t_data.ctab", 
            full.names=TRUE)
        tAll = NULL
        for(f in tFiles){
            ind = which(tFiles == f)
            tAll[[ind]] = .readTranscript(f, meas)
        }

        ## Merge the transcript results
        if(verbose) message(paste0(date(), ": Merging transcript data"))
        #[ensure ctab files all contain same transcripts]
        sumdifft = sapply(tAll, function(x) sum(x$t_id != tAll[[1]]$t_id))
        if(!(all(sumdifft==0))){
            msg = 'Transcript ids were either not the same or not in the same
                order across samples. Double check t_data.ctab for each sample.'
            stop(.makepretty(msg))
        }

        tdataOnly = lapply(tAll, function(x) x[,-c(1:10)])
        transcript = data.frame(tAll[[1]][,1:10], as.data.frame(tdataOnly))
        if(identical(meas, 'all')){
            colnames(transcript) = c('t_id', 'chr', 'strand', 'start', 'end', 
                't_name', 'num_exons', 'length', 'gene_id', 'gene_name', 
                paste(c('cov', 'FPKM'), rep(names(samples), each=2), sep="."))
        }else{
            meas_avail = intersect(c('cov', 'FPKM'), meas) 
            #^order important!
            colnames(transcript) = c('t_id', 'chr', 'strand', 'start', 'end', 
                't_name','num_exons', 'length', 'gene_id', 'gene_name',
                paste(meas_avail, rep(names(samples), each=length(meas_avail)),
                    sep="."))
        }
    } else { # no transcript data requested:
        f1 = list.files(samples[1], pattern='t_data.ctab', full.names=TRUE)
        transcript = .readTranscript(f1, meas)
        stopifnot(ncol(transcript) == 10)
    }

    ## Make transcripts into a GRanges list object
    mm = match(e2t$e_id, mcols(exongr)$id)
    if(any(is.na(mm))){
        warning(paste('the following exon(s) did not appear in e_data.ctab:',
        paste(e2t$e_id[which(is.na(mm))], collapse=", ")))
    }
    tgrl = split(exongr[mm[!is.na(mm)]], e2t$t_id[!is.na(mm)])

    ## Connect transcripts to genes:
    t2g = data.frame(t_id = transcript$t_id, g_id = transcript$gene_id)

    ## Read phenotype table, if given:
    stopifnot(is.null(pData) | class(pData) == 'data.frame')
    if(!is.null(pData)){
        if(!all(pData[,1] == names(samples))){
            msg = 'Rows of pData did not seem to be in the same order as the
                columns of the expression data. Attempting to rearrange
                pData...'
            warning(.makepretty(msg))
            tmp = try(pData <- pData[,match(names(samples), pData[,1])], 
                silent=TRUE)
            if(class(tmp) == "try-error"){
                msg = 'first column of pData does not match the names of the
                    folders containing the ballgown data.'
                stop(.makepretty(msg))
            }else{
                message('successfully rearranged!')
            }
        }
    }

    if(verbose) message('Wrapping up the results')
    result = new('ballgown', 
        expr=list(intron=intron, exon=exon, trans=transcript), 
        indexes=list(e2t=e2t, i2t=i2t, t2g=t2g, bamfiles=bamfiles, pData=pData),
        structure=list(intron=introngr, exon=exongr, trans=tgrl), 
        dirs=samples, mergedDate=date(), meas=meas, RSEM=FALSE)

    if(verbose) message(date())
    return(result)
}