R/simulate_experiment.R

.makepretty = function(x){
    msg = gsub('\n', ' ', x)
    msg = gsub('    ', '', msg)
    msg
}

#' simulate RNA-seq experiment using negative binomial model
#'
#' create FASTA files containing RNA-seq reads simulated from provided 
#'   transcripts, with optional differential expression between two groups
#' @param fasta path to FASTA file containing transcripts from which to simulate
#'   reads. See details.
#' @param gtf path to GTF file containing transcript structures from which reads
#'   should be simulated. See details.
#' @param seqpath path to folder containing one FASTA file (\code{.fa} 
#'   extension) for each chromosome in \code{gtf}. See details. 
#' @param  num_reps How many biological replicates should be in each group? If
#'   \code{num_reps} is a single integer, \code{num_reps} replicates will be 
#'   simulated in each group. Otherwise, \code{num_reps} can be a length-2 
#'   vector, where \code{num_reps[1]} and \code{num_reps[2]} replicates will be
#'   simulated in each of the two groups.
#' @param  fraglen Mean RNA fragment length. Sequences will be read off the 
#'   end(s) of these fragments.
#' @param fragsd Standard deviation of fragment lengths. 
#' @param readlen Read length.
#' @param error_rate Sequencing error rate. Must be between 0 and 1. A uniform 
#'   error model is assumed. 
#' @param  paired If \code{TRUE}, paired-end reads are simulated; else 
#'   single-end reads are simulated.
#' @param reads_per_transcript baseline mean number of reads to simulate 
#'   from each transcript. Can be an integer, in which case this many reads
#'   are simulated from each transcript, or an integer vector whose length
#'   matches the number of transcripts in \code{fasta}. 
#' @param  fold_changes Vector of multiplicative fold changes between groups,
#'   one entry per transcript in \code{fasta}. A fold change > 1 means the 
#'   transcript is overexpressed in the first \code{num_reps} (or 
#'   \code{num_reps[1]}) samples. Fold change < 1 means transcript is 
#'   overexpressed in the last \code{num_reps} (or \code{num_reps[2]}) samples.
#'   The change is in the mean number of reads generated from the transcript, 
#'   between groups.
#' @param size the negative binomial \code{size} parameter (see 
#'   \code{\link{NegBinomial}}) for the number of reads drawn per transcript. 
#'   If left blank, defaults to \code{reads_per_transcript / 3}. Negative 
#'   binomial variance is mean + mean^2 / size. Can either be left at default,
#'   a vector of the same length as number of transcripts in \code{fasta},
#'   if the two groups should have the same size parameters, or a list with 2
#'   elements, each of which is a vector with length equal to the number of 
#'   transcripts in \code{fasta}, which represent the size parameters for each
#'   transcript in groups 1 and 2, respectively.
#' @param outdir character, path to folder where simulated reads should be 
#'   written, with *no* slash at the end. By default, reads are 
#'   written to current working directory.
#' @param write_info If \code{TRUE}, write a file matching transcript IDs to 
#'   differential expression status into the file \code{outdir/sim_info.txt}.
#' @param transcriptid optional vector of transcript IDs to be written into 
#'   \code{sim_info.txt} and used as transcript identifiers in the fasta files.
#'   Defaults to \code{names(readDNAStringSet(fasta))}. This option is useful
#'   if default names are very long or contain special characters.
#' @param seed Optional seed to set before simulating reads, for 
#'   reproducibility.
#' @param ... additional arguments to pass to \code{seq_gtf} if using 
#'   \code{gtf} and \code{seqpath}
#' @return No return, but simulated reads and a simulation info file are written
#'   to \code{outdir}.
#' @export
#' @details Reads can either be simulated from a FASTA file of transcripts 
#'   (provided with the \code{fasta} argument) or from a GTF file plus DNA 
#'   sequences (provided with the \code{gtf} and \code{seqpath} arguments). 
#'   Simulating from a GTF file and DNA sequences may be a bit slower: it took 
#'   about 6 minutes to parse the GTF/sequence files for chromosomes 1-22, X, 
#'   and Y in hg19.
#' @examples \donttest{
#'   ## simulate a few reads from chromosome 22
#' 
#'   fastapath = system.file("extdata", "chr22.fa", package="polyester")
#'   numtx = count_transcripts(fastapath)
#'   set.seed(4)
#'   fold_changes = sample(c(0.5, 1, 2), size=numtx, 
#'      prob=c(0.05, 0.9, 0.05), replace=TRUE)
#'   library(Biostrings)
#'   # remove quotes from transcript IDs:
#'   tNames = gsub("'", "", names(readDNAStringSet(fastapath))) 
#' 
#'   simulate_experiment(fastapath, reads_per_transcript=10, 
#'      fold_changes=fold_changes, outdir='simulated_reads', 
#'      transcriptid=tNames, seed=12)
#'}
simulate_experiment = function(fasta=NULL, gtf=NULL, seqpath=NULL, num_reps=10, 
    fraglen=250, fragsd=25, readlen=100, error_rate=0.005, paired=TRUE, 
    reads_per_transcript=300, fold_changes, size=NULL, outdir=".", 
    write_info=TRUE, transcriptid=NULL, seed=NULL, ...){

    if(!is.null(fasta) & is.null(gtf) & is.null(seqpath)){
        transcripts = readDNAStringSet(fasta)
    }else if(is.null(fasta) & !is.null(gtf) & !is.null(seqpath)){
        message('parsing gtf and sequences...')
        transcripts = seq_gtf(gtf, seqpath, ...)
        message('done parsing')
    }else{
        stop('must provide either fasta or both gtf and seqpath')
    }

    # check argument lengths:
    ntx = length(transcripts)
    if(length(fold_changes)!=ntx){
        stop(.makepretty('fold_changes must be a vector with one entry per
            transcript in fasta or gtf; use count_transcripts to find out how
            many transcripts are in fasta or gtf.'))
    }
    if(length(reads_per_transcript)!=1 & length(reads_per_transcript)!=ntx){
        stop(.makepretty('reads_per_transcript must be a single number or a
            vector with one entry per transcript in fasta or gtf; use
            count_transcripts to find out how many transcripts are in fasta or
            gtf.'))
    }
    if(length(size)!=0 & length(size)!=1 & length(size)!=ntx){
        stop(.makepretty('size must be a single number or a vector with one
            entry per transcript in fasta or gtf; use count_transcripts to find
            out how many transcripts are in fasta or gtf.'))
    }
    if(length(transcriptid)!=0 & length(transcriptid)!=ntx){
        stop(.makepretty('transcriptid must be a character vector with one entry
            per transcript in fasta or gtf; use count_transcripts to find out
            how many transcripts are in fasta or gtf.'))
    }

    L = width(transcripts)
    if(!is.null(transcriptid)){
        names(transcripts) = transcriptid
    }
        
    if(!is.null(seed)) set.seed(seed)

    ## get number of reps per group
    stopifnot(length(num_reps)==1 | length(num_reps)==2)
    if(length(num_reps)==2){
        n1 = num_reps[1]
        n2 = num_reps[2]
    }else{
        n1 = n2 = num_reps
    }

    ## get baseline means for each group from fold changes:
    if(exp(mean(log(fold_changes))) != 1){
        warning(.makepretty('provided fold changes mean that one group will
            have more overall expression than the other, so some of the DE
            signal might be lost due to library size adjustment.'))
    }
    basemeans1 = ifelse(fold_changes > 1, 
        reads_per_transcript*fold_changes, reads_per_transcript)
    basemeans1 = round(basemeans1)
    basemeans2 = ifelse(fold_changes < 1,
        reads_per_transcript/fold_changes, reads_per_transcript)
    basemeans2 = round(basemeans2)

    if (is.null(size)) {
        nbdp1 = basemeans1/3
        nbdp2 = basemeans2/3
    }
    else if(class(size)=='list'){
        stopifnot(length(size)==2)
        stopifnot(length(size[[1]]) == length(basemeans1))
        stopifnot(length(size[[2]]) == length(basemeans2))
        nbdp1 = size[[1]]
        nbdp2 = size[[2]]
    }else{
        if(length(size) == 1 | length(size) == length(basemeans1)){
            nbdp1 = nbdp2 = size
        }else{
            stop(.makepretty('size must either be NULL, length 1,
                or a vector with the same number of elements as
                transcripts you have.'))
        }
    }

    numreadsList = vector("list", n1+n2)
    for(i in 1:n1){
        numreadsList[[i]] = NB(basemeans1, nbdp1)
    }
    for(i in (1:n2)+n1){
        numreadsList[[i]] = NB(basemeans2, nbdp2)
    }


    ## simulate reads for each sample:
    #######################################
    sysoutdir = gsub(' ', '\\\\ ', outdir)
    if(.Platform$OS.type == 'windows'){
        shell(paste('mkdir', sysoutdir))
    }else{
        system(paste('mkdir -p', sysoutdir))    
    }
    for(i in 1:(n1+n2)){
        
        tObj = rep(transcripts, times=numreadsList[[i]])
        
        #get fragments
        tFrags = generate_fragments(tObj, fraglen=fraglen, fragsd=fragsd)

        #reverse_complement some of those fragments
        rctFrags = reverse_complement(tFrags)

        #add sequencing error
        errFrags = add_error(rctFrags, error_rate)

        #get reads from fragments
        reads = get_reads(errFrags, readlen, paired)

        #write read pairs
        write_reads(reads, readlen=readlen, 
            fname=paste0(outdir, '/sample_', sprintf('%02d', i)), paired=paired)
    }

    ## write out simulation information, if asked for:
    if(write_info){
        if(is.null(transcriptid)){
            transcriptid = names(transcripts)
        }
        sim_info = data.frame(transcriptid=transcriptid, 
            foldchange=fold_changes, DEstatus=fold_changes!=1)
        write.table(sim_info, row.names=FALSE, quote=FALSE, sep="\t", 
            file=paste0(outdir, '/sim_info.txt'))
    }
}
alyssafrazee/polyester-release documentation built on May 12, 2019, 2:32 a.m.