#' Map reads to genome
#'
#' Map reads to genome using STAR. Most parameter descriptions are from STAR manual.
#' @param readFilesIn Character - files with sequences to map (if paired-end data, the R1 files)
#' @param genomeDir String - Path to directory where genome indices were generated by STAR
#' @param starPath String - Path to directory with STAR executable
#' @param outDest String - Directory where output files should be saved
#' @param outSuffix String - will be appended to original filename
#' @param outTrimString String - will be trimmed from output filenames - don't include input file extension, as this is trimmed from basename(readFilesIn)
#' @param runThreadN Numeric - How many cores to use
#' @param readFilesInR2 Character - If data are paired, the R2 files
#' @param settings String - Use ENCODE settings for long or short RNA ("ENCODE_long" or "ENCODE_short")
#' @param settingsOverride String - Additional inputs to STAR; anything here that is also in ENCODE settings will override the ENCODE value
#' @details Take a list of fastq files containing reads, and get alignments with STAR. STAR's defaults are the defaults here. Override by
#' defining "settings" as "ENCODE_long" or "ENCODE_short", to use ENCODE settings for long or short RNA. Override specific parameter values in the
#' ENCODE settings, and set any other parameters you want, with settingsOverride. This argument is a string that will be tacked on to the command issued
#' to the command line, as-is.
#' If an input file doesn't exist, you'll get an error. If an output file (at least a Log.out file) does exist, it'll just skip the corresponding input file.
#' For paired-end data, readFilesIn are the R1 files and readFilesIn2 are the R2 files. They need to have R1 / R2 or r1 / r2 in the filenames, which based
#' on looking online is the norm.
#' TIME:
#' ~10m per 5G fastq. 3-6 hours for an entire total-RNA dataset (~25-30 samples). Nuc-seq took only ~30m.
#' COMMAND LINE EXAMPLE, using ENCODE settings for long RNA
#' (if you want to play with the parameters while looking at just one file, this might be easiest):
#' /opt/STAR/bin/MacOSX_x86_64/STAR --genomeDir $genomeDir --readFilesIn $pathTrimmed$sn$trimmedSuffix --outFilterType BySJout --outFilterMultimapNmax 20 --alignSJoverhangMin 8 --alignSJDBoverhangMin 1 --outFilterMismatchNmax 999 --outFilterMismatchNoverLmax 0.04 --alignIntronMin 20 --alignIntronMax 1000000 --alignMatesGapMax 1000000 --outSAMtype 'BAM SortedByCoordinate' --outFileNamePrefix $pathMapped$sn$mappedSuffix#'
#' @examples
#' Example using paired-end data.
#' gdir = '/Volumes/CodingCLub1/STAR_stuff/indexes/refGene_gtf_maxLen75/'
#' fastqs=dir(paste(dataPath,'trimmed',sep=''), pattern='fastq', full.names=TRUE)
#' r1=fastqs[which(regexpr("R1",fastqs)>0)]
#' r2=fastqs[which(regexpr("R2",fastqs)>0)]
#' Use ENCODE settings for short RNA, except for outFilterMatchNmin
#' STAR_run(r1,genomeDir=gdir,outDest='sandbox/packtest/', outTrimString='_R1_001_trimmed', readFilesInR2=r2, runThreadN=8, settings="ENCODE_short", settingsOverride=c("--outFilterMatchNmin", "8") )
#' @author Emma Myers
#' @export
STAR_run = function(readFilesIn, genomeDir, starPath="/opt/STAR/bin/MacOSX_x86_64/", outDest="./", outSuffix="", outTrimString="", runThreadN=1, readFilesInR2=NULL,
settings=NULL, settingsOverride=NULL, quantMode=NULL) {
# Check arguments
if ( !all(file.exists(readFilesIn)) ) {
writeLines("Missing input files:")
writeLines(readFilesIn[which(!file.exists(readFilesIn))])
stop("Missing input file(s). See above.")
}
if (! genomeDir == "") { genomeDir = dir_check(genomeDir) }
if (! starPath == "") { starPath = dir_check(starPath) }
outDest = dir_check(outDest)
# Check if settings requested
paramValues = vector(mode = "character")
if ( !is.null(settings) ) {
if (settings == "ENCODE_long") {
paramValues = c(paramValues, "--outFilterType", "BySJout", "--outFilterMultimapNmax", "20", "--alignSJoverhangMin", "8", "--alignSJDBoverhangMin", "1",
"--outFilterMismatchNmax", "999", "--outFilterMismatchNoverLmax", "0.04", "--alignIntronMin", "20", "--alignIntronMax", "1000000",
"--alignMatesGapMax", "1000000")
} else if (settings == "ENCODE_short") {
paramValues = c(paramValues, "--outFilterMismatchNoverLmax", "0.03", "--outFilterMultimapNmax", "20", "--alignIntronMax", "1", "--outFilterMatchNmin", "16",
"--outFilterScoreMinOverLread", "0", "--outFilterMatchNminOverLread", "0")
} else { stop("Invalid value for settings parameter") }
}
# If any arguments in settingsOverride are also in paramVals, use the value in settingsOverride
instanceCount = function(x, y) { length(which(y==x)) }
nameIdx = which( substr(paramValues, 1, 2)=="--")
rmNameIdx = intersect(nameIdx, which(sapply(paramValues, instanceCount, settingsOverride)>0))
paramValues = paramValues[ -c(rmNameIdx, rmNameIdx+1) ]
# Check if paired-end data
if ( !is.null(readFilesInR2) ) {
if ( !all(file.exists(readFilesInR2)) ) {
writeLines("Missing R2 files:")
writeLines(readFilesIn[which(!file.exists(readFilesInR2))])
stop("Missing R2 file(s). See above.")
}
if ( !(length(readFilesIn)==length(readFilesInR2)) ) {
stop("Different number of R1 and R2 files.")
}
# Try taking out R1/R2 and make sure filenames match
# I don't actually know if small letters are ever used but let's be safe and convert everything to upper-case
# This isn't perfect but it should catch most mismatches
if ( !( all(gsub("R1", "", toupper(basename(readFilesIn)) )==gsub("R2", "", toupper(basename(readFilesInR2)) )) ) ) {
stop("Mismatched filenames in R1 and R2 files (removing R1 and R2 from filenames does not yield pairs of identical filenames).")
}
}
counter = 1
for (f in readFilesIn) {
writeLines(paste("\nProcessing file:", f))
fOut = gsub(outTrimString, "", basename(f))
fOut = paste(outDest, gsub(paste(".", tools::file_ext(f), sep=""), "", fOut), outSuffix, "_", sep="")
# Check if there's already a log file for this file
if ( file_checks(paste(fOut, "Log.out", sep=""), shouldExist=FALSE, verbose=TRUE) ) {
# Define arguments to the STAR command
arguments = c("--readFilesIn", f)
# if paired-end data, include R2 filename
if ( !is.null(readFilesInR2) ) { arguments = c(arguments, readFilesInR2[counter]) }
# everything else
arguments = c(arguments, "--genomeDir", genomeDir, "--outFileNamePrefix", fOut, "--runThreadN", runThreadN, paramValues)
if ( !is.null(settingsOverride) ) { arguments = c(arguments, settingsOverride) }
# Do the mapping (STAR displays time taken, so don't bother with that)
system2( paste(starPath, "STAR", sep=""), args = arguments )
} else { writeLines("Skipping file.") }
counter = counter + 1
}
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.