R/simulate_experiment_countmat.R

#' Simulate RNA-seq experiment 
#'
#' create FASTA files containing RNA-seq reads simulated from provided 
#'   transcripts, with optional differential expression between two groups
#'   (designated via read count matrix)
#' @param fasta path to FASTA file containing transcripts from which to simulate
#'    reads. See details.
#' @param gtf path to GTF file containing transcript structures from which reads
#'   should be simulated. See details.
#' @param seqpath path to folder containing one FASTA file (\code{.fa} 
#'   extension) for each chromosome in \code{gtf}. See details. 
#' @param readmat matrix with rows representing transcripts and columns 
#'   representing samples. Entry i,j specifies how many reads to simulate from
#'   transcript i for sample j.
#' @param outdir character, path to folder where simulated reads should be 
#'   written, without a slash at the end of the folder name. By default, reads
#'   written to the working directory.
#' @param fraglen Mean RNA fragment length. Sequences will be read off the 
#'   end(s) of these fragments.
#' @param fragsd Standard deviation of fragment lengths. 
#' @param readlen Read length
#' @param error_rate Sequencing error rate. Must be between 0 and 1. A uniform 
#'   error model is assumed. 
#' @param paired If \code{TRUE}, paired-end reads are simulated; else single-end
#'   reads are simulated.
#' @param seed Optional seed to set before simulating reads, for 
#'   reproducibility.
#' @param ... Further arguments to pass to \code{seq_gtf}, if \code{gtf} is not
#'   \code{NULL}.
#' @return No return, but simulated reads are written to \code{outdir}.
#' @export
#' @details Reads can either be simulated from a FASTA file of transcripts
#'   (provided with the \code{fasta} argument) or from a GTF file plus DNA
#'   sequences (provided with the \code{gtf} and \code{seqpath} arguments). 
#'   Simulating from a GTF file and DNA sequences may be a bit slower: it took
#'   about 6 minutes to parse the GTF/sequence files for chromosomes 1-22, 
#'   X, and Y in hg19.
#' @examples \donttest{
#'   fastapath = system.file("extdata", "chr22.fa", package="polyester")
#'   numtx = count_transcripts(fastapath)
#'   readmat = matrix(20, ncol=10, nrow=numtx)
#'   readmat[1:30, 1:5] = 40
#' 
#'   simulate_experiment_countmat(fasta=fastapath, 
#'     readmat=readmat, outdir='simulated_reads_2', seed=5)
#'}
simulate_experiment_countmat = function(fasta=NULL, gtf=NULL, seqpath=NULL, 
    readmat, outdir=".", fraglen=250, fragsd=25, readlen=100, error_rate=0.005,
    paired=TRUE, seed=NULL, ...){

    if(!is.null(seed)) set.seed(seed)
    
    if(!is.null(fasta) & is.null(gtf) & is.null(seqpath)){
        transcripts = readDNAStringSet(fasta)
    }else if(is.null(fasta) & !is.null(gtf) & !is.null(seqpath)){
        message('parsing gtf and sequences...')
        transcripts = seq_gtf(gtf, seqpath, ...)
        message('done parsing')
    }else{
        stop('must provide either fasta or both gtf and seqpath')
    }

    stopifnot(class(readmat) == 'matrix')
    stopifnot(nrow(readmat) == length(transcripts))

    sysoutdir = gsub(' ', '\\\\ ', outdir)
    if(.Platform$OS.type == 'windows'){
        shell(paste('mkdir', sysoutdir))
    }else{
        system(paste('mkdir -p', sysoutdir))    
    }

    for(i in 1:ncol(readmat)){
        tObj = rep(transcripts, times=readmat[,i])
  
        #get fragments
        tFrags = generate_fragments(tObj, fraglen=fraglen, fragsd=fragsd)

        #reverse_complement some of those fragments
        rctFrags = reverse_complement(tFrags)

        #add sequencing error
        errFrags = add_error(rctFrags, error_rate)

        #get reads from fragments
        reads = get_reads(errFrags, readlen, paired)

        #write read pairs
        write_reads(reads, readlen=readlen, 
            fname=paste0(outdir, '/sample_', sprintf('%02d', i)), 
            paired=paired)
    }
}
alyssafrazee/polyester-release documentation built on May 12, 2019, 2:32 a.m.