rnaseq: Process, Analyze and Visualize RNA-seq Data

Documented in STAR_run

#' Map reads to genome
#'
#' Map reads to genome using STAR. Most parameter descriptions are from STAR manual.
#' @param readFilesIn Character - files with sequences to map (if paired-end data, the R1 files)
#' @param genomeDir String - Path to directory where genome indices were generated by STAR
#' @param starPath String - Path to directory with STAR executable
#' @param outDest String - Directory where output files should be saved
#' @param outSuffix String - will be appended to original filename
#' @param outTrimString String - will be trimmed from output filenames - don't include input file extension, as this is trimmed from basename(readFilesIn)
#' @param runThreadN Numeric - How many cores to use
#' @param readFilesInR2 Character - If data are paired, the R2 files
#' @param settings String - Use ENCODE settings for long or short RNA ("ENCODE_long" or "ENCODE_short")
#' @param settingsOverride String - Additional inputs to STAR; anything here that is also in ENCODE settings will override the ENCODE value
#' @details Take a list of fastq files containing reads, and get alignments with STAR.  STAR's defaults are the defaults here.  Override by
#' defining "settings" as "ENCODE_long" or "ENCODE_short", to use ENCODE settings for long or short RNA.  Override specific parameter values in the
#' ENCODE settings, and set any other parameters you want, with settingsOverride.  This argument is a string that will be tacked on to the command issued
#' to the command line, as-is.
#' If an input file doesn't exist, you'll get an error.  If an output file (at least a Log.out file) does exist, it'll just skip the corresponding input file.
#' For paired-end data, readFilesIn are the R1 files and readFilesIn2 are the R2 files.  They need to have R1 / R2 or r1 / r2 in the filenames, which based
#' on looking online is the norm.
#' TIME:
#' ~10m per 5G fastq.  3-6 hours for an entire total-RNA dataset (~25-30 samples).  Nuc-seq took only ~30m.
#' COMMAND LINE EXAMPLE, using ENCODE settings for long RNA
#' (if you want to play with the parameters while looking at just one file, this might be easiest):
#' /opt/STAR/bin/MacOSX_x86_64/STAR --genomeDir $genomeDir --readFilesIn $pathTrimmed$sn$trimmedSuffix --outFilterType BySJout --outFilterMultimapNmax 20 --alignSJoverhangMin 8 --alignSJDBoverhangMin 1 --outFilterMismatchNmax 999 --outFilterMismatchNoverLmax 0.04 --alignIntronMin 20 --alignIntronMax 1000000 --alignMatesGapMax 1000000 --outSAMtype 'BAM SortedByCoordinate' --outFileNamePrefix $pathMapped$sn$mappedSuffix#'
#' @examples
#' Example using paired-end data.
#' gdir = '/Volumes/CodingCLub1/STAR_stuff/indexes/refGene_gtf_maxLen75/'
#' fastqs=dir(paste(dataPath,'trimmed',sep=''), pattern='fastq', full.names=TRUE)
#' r1=fastqs[which(regexpr("R1",fastqs)>0)]
#' r2=fastqs[which(regexpr("R2",fastqs)>0)]
#' Use ENCODE settings for short RNA, except for outFilterMatchNmin
#' STAR_run(r1,genomeDir=gdir,outDest='sandbox/packtest/', outTrimString='_R1_001_trimmed', readFilesInR2=r2, runThreadN=8, settings="ENCODE_short", settingsOverride=c("--outFilterMatchNmin", "8") )
#' @author Emma Myers
#' @export


STAR_run = function(readFilesIn, genomeDir, starPath="/opt/STAR/bin/MacOSX_x86_64/", outDest="./", outSuffix="", outTrimString="", runThreadN=1, readFilesInR2=NULL,
                    settings=NULL, settingsOverride=NULL, quantMode=NULL) {


    # Check arguments
    if ( !all(file.exists(readFilesIn)) ) {
        writeLines("Missing input files:")
        writeLines(readFilesIn[which(!file.exists(readFilesIn))])
        stop("Missing input file(s).  See above.")
    }
    if (! genomeDir == "") { genomeDir = dir_check(genomeDir) }
    if (! starPath == "") { starPath = dir_check(starPath) }
    outDest = dir_check(outDest)

    # Check if settings requested
    paramValues = vector(mode = "character")
    if ( !is.null(settings) ) {
        if (settings == "ENCODE_long") {
            paramValues = c(paramValues, "--outFilterType", "BySJout", "--outFilterMultimapNmax", "20", "--alignSJoverhangMin", "8", "--alignSJDBoverhangMin", "1",
                          "--outFilterMismatchNmax", "999", "--outFilterMismatchNoverLmax", "0.04", "--alignIntronMin", "20", "--alignIntronMax", "1000000",
                          "--alignMatesGapMax", "1000000")
        } else if (settings == "ENCODE_short") {
            paramValues = c(paramValues, "--outFilterMismatchNoverLmax", "0.03", "--outFilterMultimapNmax", "20", "--alignIntronMax", "1", "--outFilterMatchNmin", "16",
                          "--outFilterScoreMinOverLread", "0", "--outFilterMatchNminOverLread", "0")
        } else { stop("Invalid value for settings parameter") }
    }

    # If any arguments in settingsOverride are also in paramVals, use the value in settingsOverride
    instanceCount = function(x, y) { length(which(y==x)) }
    nameIdx = which( substr(paramValues, 1, 2)=="--")
    rmNameIdx = intersect(nameIdx, which(sapply(paramValues, instanceCount, settingsOverride)>0))
    paramValues = paramValues[ -c(rmNameIdx, rmNameIdx+1) ]

    # Check if paired-end data
    if ( !is.null(readFilesInR2) ) {
        if ( !all(file.exists(readFilesInR2)) ) {
            writeLines("Missing R2 files:")
            writeLines(readFilesIn[which(!file.exists(readFilesInR2))])
            stop("Missing R2 file(s).  See above.")
        }
        if ( !(length(readFilesIn)==length(readFilesInR2)) ) {
            stop("Different number of R1 and R2 files.")
        }
        # Try taking out R1/R2 and make sure filenames match
        # I don't actually know if small letters are ever used but let's be safe and convert everything to upper-case
        # This isn't perfect but it should catch most mismatches
        if ( !( all(gsub("R1", "", toupper(basename(readFilesIn)) )==gsub("R2", "", toupper(basename(readFilesInR2)) )) ) ) {
            stop("Mismatched filenames in R1 and R2 files (removing R1 and R2 from filenames does not yield pairs of identical filenames).")
        }
    }


    counter = 1

    for (f in readFilesIn) {

        writeLines(paste("\nProcessing file:", f))

        fOut = gsub(outTrimString, "", basename(f))
        fOut = paste(outDest, gsub(paste(".", tools::file_ext(f), sep=""), "", fOut), outSuffix, "_", sep="")

        # Check if there's already a log file for this file
        if ( file_checks(paste(fOut, "Log.out", sep=""), shouldExist=FALSE, verbose=TRUE) ) {

            # Define arguments to the STAR command
            arguments = c("--readFilesIn", f)
            # if paired-end data, include R2 filename
            if ( !is.null(readFilesInR2) ) { arguments = c(arguments, readFilesInR2[counter]) }
            # everything else
            arguments = c(arguments, "--genomeDir", genomeDir, "--outFileNamePrefix", fOut, "--runThreadN", runThreadN, paramValues)
            if ( !is.null(settingsOverride) ) { arguments = c(arguments, settingsOverride) }

            # Do the mapping (STAR displays time taken, so don't bother with that)
            system2( paste(starPath, "STAR", sep=""), args = arguments )

        } else {  writeLines("Skipping file.") }

        counter = counter + 1

    }

}