R/sam_filter.R

Defines functions sam_filter

Documented in sam_filter

#' Filter SAM file
#'
#' Filter unknown, mitochondrial, and "random" chromosomes from a SAM file
#' @param fileIn String -
#' @param fileOut String -
#' @param verbose Logical -
#' @param deleteTemps Logical
#' @details Remove mitochondrial, unmapped, and "random" chromosome reads from one SAM file, and save under
#' a specified filename (ideally ending in '.filtered.sam')
#' In the command line, all three steps can be accomplished for one file with one line:
#' sed '/chrM/d;/random/d;/chrUn/d' fileIn.sam > fileOut.sam
#' In order to do this in R, this function calls the system2() function, which issues commands
#' to the command line.  However, bash interprets the above line of code as three commands
#' one call to sed per input (inputs are separated by semicolons).  Therefore, system2() is called three times.
#' The first call filters out lines with 'chrM' and writes the results to a temporary file.  The second call
#' filters out lines with 'chrUn' from that temporary file, and writes the results to another temporary file.
#' The final call to system2() takes that file, filters out lines with 'random', and writes the results to
#' the output file.  Temporary files are deleted by default (see "deleteTemps" parameter).
#' This function operates on SAM files generated by mapping with STAR.
#' Not BAM, because Erin found that "random" chromosome reads weren't getting removed from bams along with the mitochondrial and unknown ones.
#' Might be worth trying to figure that out, and make this handle BAMs as well.
#' Time:
# cb_p036_1_mapped_Aligned.out.filtered, ~18-18.5 million lines. ~4m on yasuimac.
#' @examples
#' Example at the command line:
#' system2('/opt/UCSC/wigToBigWig', args=c('samplename.gencov.bedgraph', '/path/to/genomeSizeFile.txt', samplename.bigwig))
#' @author Emma Myers
#' @export

sam_filter = function(fileIn, fileOut, verbose=FALSE, deleteTemps=TRUE) {

    # If fileOut already exists, don't do anything
    if (!file_checks(fileOut, shouldExist = FALSE, verbose=TRUE) ) {
        return()
    }

    t1All = proc.time()[3] # time in seconds

    # Pass between fileIn and fileTmp til going to fileOut; otherwise end up with empty file for some reason
    fileTmp1 = paste(fileIn, '.tmp1', sep='')
    fileTmp2 = paste(fileIn, '.tmp2', sep='')

    # Filter mitochondrial reads (lines with string "chrM")
    if (verbose) { writeLines(paste('Filtering mitochondrial reads from', fileIn)); flush.console() }
    t1M = proc.time()[3]
    system2('sed', args = '/chrM/d', stdin = fileIn, stdout = fileTmp1)
    t2M = proc.time()[3] - t1M
    if (verbose) {writeLines(paste('\tFinished with mitochondrial reads.  Time:', round(t2M/60, digits=2), 'm')); flush.console()}

    # Filter unmapped reads (lines with string "chrUn")
    if (verbose) { writeLines(paste('Filtering unmapped reads from', fileTmp1)); flush.console() }
    t1U = proc.time()[3]
    system2('sed', args = '/chrUn/d', stdin = fileTmp1, stdout = fileTmp2)
    t2U = proc.time()[3] - t1U
    if (verbose) {writeLines(paste('\tFinished with unmapped reads.  Time:', round(t2U/60, digits=2), 'm')); flush.console()}

    # Remove temporary file
    if (verbose & deleteTemps) { writeLines('\tDeleting fileTmp1.'); flush.console() }
    if (deleteTemps) { file.remove(fileTmp1) }

    # Filter "random" reads (lines with string "random")s
    if (verbose) { writeLines(paste('Filtering "random" reads from', fileOut)); flush.console() }
    t1R = proc.time()[3]
    system2('sed', args = '/random/d', stdin = fileTmp2, stdout = fileOut)
    t2R = proc.time()[3] - t1R
    if (verbose) {writeLines(paste('\tFinished with "random" reads.  Time:', round(t2R/60, digits=2), 'm')); flush.console()}

    # Remove temporary file
    if (verbose & deleteTemps) { writeLines('\tDeleting fileTmp2.'); flush.console() }
    if (deleteTemps) { file.remove(fileTmp2) }

    t2All = proc.time()[3] - t1All
    if (verbose) {writeLines(paste('\tFinished filtering file.  Total time:', round(t2All/60, digits=2), 'm')); flush.console()}


}
e-myers/rnaseq documentation built on May 20, 2019, 9:14 p.m.