#' Filter SAM file
#'
#' Filter unknown, mitochondrial, and "random" chromosomes from a SAM file
#' @param fileIn String -
#' @param fileOut String -
#' @param verbose Logical -
#' @param deleteTemps Logical
#' @details Remove mitochondrial, unmapped, and "random" chromosome reads from one SAM file, and save under
#' a specified filename (ideally ending in '.filtered.sam')
#' In the command line, all three steps can be accomplished for one file with one line:
#' sed '/chrM/d;/random/d;/chrUn/d' fileIn.sam > fileOut.sam
#' In order to do this in R, this function calls the system2() function, which issues commands
#' to the command line. However, bash interprets the above line of code as three commands
#' one call to sed per input (inputs are separated by semicolons). Therefore, system2() is called three times.
#' The first call filters out lines with 'chrM' and writes the results to a temporary file. The second call
#' filters out lines with 'chrUn' from that temporary file, and writes the results to another temporary file.
#' The final call to system2() takes that file, filters out lines with 'random', and writes the results to
#' the output file. Temporary files are deleted by default (see "deleteTemps" parameter).
#' This function operates on SAM files generated by mapping with STAR.
#' Not BAM, because Erin found that "random" chromosome reads weren't getting removed from bams along with the mitochondrial and unknown ones.
#' Might be worth trying to figure that out, and make this handle BAMs as well.
#' Time:
# cb_p036_1_mapped_Aligned.out.filtered, ~18-18.5 million lines. ~4m on yasuimac.
#' @examples
#' Example at the command line:
#' system2('/opt/UCSC/wigToBigWig', args=c('samplename.gencov.bedgraph', '/path/to/genomeSizeFile.txt', samplename.bigwig))
#' @author Emma Myers
#' @export
sam_filter = function(fileIn, fileOut, verbose=FALSE, deleteTemps=TRUE) {
# If fileOut already exists, don't do anything
if (!file_checks(fileOut, shouldExist = FALSE, verbose=TRUE) ) {
return()
}
t1All = proc.time()[3] # time in seconds
# Pass between fileIn and fileTmp til going to fileOut; otherwise end up with empty file for some reason
fileTmp1 = paste(fileIn, '.tmp1', sep='')
fileTmp2 = paste(fileIn, '.tmp2', sep='')
# Filter mitochondrial reads (lines with string "chrM")
if (verbose) { writeLines(paste('Filtering mitochondrial reads from', fileIn)); flush.console() }
t1M = proc.time()[3]
system2('sed', args = '/chrM/d', stdin = fileIn, stdout = fileTmp1)
t2M = proc.time()[3] - t1M
if (verbose) {writeLines(paste('\tFinished with mitochondrial reads. Time:', round(t2M/60, digits=2), 'm')); flush.console()}
# Filter unmapped reads (lines with string "chrUn")
if (verbose) { writeLines(paste('Filtering unmapped reads from', fileTmp1)); flush.console() }
t1U = proc.time()[3]
system2('sed', args = '/chrUn/d', stdin = fileTmp1, stdout = fileTmp2)
t2U = proc.time()[3] - t1U
if (verbose) {writeLines(paste('\tFinished with unmapped reads. Time:', round(t2U/60, digits=2), 'm')); flush.console()}
# Remove temporary file
if (verbose & deleteTemps) { writeLines('\tDeleting fileTmp1.'); flush.console() }
if (deleteTemps) { file.remove(fileTmp1) }
# Filter "random" reads (lines with string "random")s
if (verbose) { writeLines(paste('Filtering "random" reads from', fileOut)); flush.console() }
t1R = proc.time()[3]
system2('sed', args = '/random/d', stdin = fileTmp2, stdout = fileOut)
t2R = proc.time()[3] - t1R
if (verbose) {writeLines(paste('\tFinished with "random" reads. Time:', round(t2R/60, digits=2), 'm')); flush.console()}
# Remove temporary file
if (verbose & deleteTemps) { writeLines('\tDeleting fileTmp2.'); flush.console() }
if (deleteTemps) { file.remove(fileTmp2) }
t2All = proc.time()[3] - t1All
if (verbose) {writeLines(paste('\tFinished filtering file. Total time:', round(t2All/60, digits=2), 'm')); flush.console()}
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.