R/sam_to_bam.R

Defines functions sam_to_bam

Documented in sam_to_bam

#' SAM files filtered and converted to BAM format
#'
#' Take a list of SAM files, unknown, mitochondrial, and "random" chromosomes, and save each to BAM
#' Needs Rsamtools.
#' @param filenames Character - List of SAM files
#' @param fileFilteredSuffix String -
#' @param fileBamSuffix String -
#' @param outDest String -
#' @param filterVerbose Logical -
#' @details # Filter out mitochondrial, unmapped, and "random" chromosome reads from sam files generated by mapping with STAR.
#' Convert the sam file to bam, sort the bam, and create an index file for it..
#' This script operates on sam files generated by mapping with STAR.
#' Not bam, because Erin found that "random" chromosome reads weren't getting removed from bams along with the mito and unknown ones.
#' But maybe I should try that again now that we're doing three separate commands; maybe that was the problem somehow.
#' TIME:
#' On yasuimac/Smintheus:
#'  3.5 G sam file - 4m/5.5m filtering, 5.5m/6.6m asBam()
#'   5 G sam file - 5.8m filtering, 7m asBam()
#'   18 G sam file - Xm/26.6m filtering, Xm/33.5m asBam()
#' Here are the equivalent commands at the command line:
#' Filter sam file (took maybe 3-4m for a 4.6G file, and cut it down to 3.6G):
#' sed '/chrM/d;/random/d;/chrUn/d' samplename_mapped_Aligned.out.sam > samplename_mapped_Aligned.out.filter.sam
#' Convert to bam:
#' samtools view -b samplename_mapped_Aligned.out.filter.sam > samplename_mapped_Aligned.out.filter.bam
#' Sort the bam file (it will create a bunch of temporary files while it's working, and then merge them):
#' samtools sort samplename_mapped_Aligned.out.filter.bam samplename_mapped_Aligned.out.filter.sorted -@ 4
#' Index the bam file:
#' samtools index samplename_mapped_Aligned.out.filter.sorted.bam
#' @examples
#' fn = list.files(pattern='.sam')
#' sam_to_bam(fn, outDest='/path/to/filtered/sams')
#' @author Emma Myers
#' @export


sam_to_bam = function(filenames, fileFilteredSuffix=".filtered.sam", fileBamSuffix=".filtered",
                      outDest="./", filterVerbose=FALSE) {

    # Check that destination directory exists
    outDest = dir_check(outDest)

    for (f in filenames) {

        writeLines("\n")

        # Make sure file exists and has extension .sam
        if ( ! file_checks(f, extension = ".sam", verbose = TRUE) ) { next }
        flush.console()

        writeLines(paste("Processing file:", f))
        flush.console()


        ###########################################################################
        # Remove mitochondrial, unmapped, and random chromosomes with sam_filter().
        ###########################################################################
        # Define name of filtered file and check that it doesn't already exist
        fileFiltered = sub(".sam", fileFilteredSuffix, basename(f))
        fileFilteredFull = paste(outDest, fileFiltered, sep="")
        if ( file_checks(fileFilteredFull, shouldExist=FALSE, verbose=TRUE) ){
            writeLines("Filtering...", sep="")
            flush.console()
            # Announce output file name
            writeLines(paste("file will be saved as ", fileFilteredFull), sep="")
            flush.console()
            # Call and time sam_filter()
            tStartFilter = proc.time()[3]
            sam_filter(f, fileFilteredFull, verbose = filterVerbose)
            tElapsedFilter = proc.time()[3] - tStartFilter
            # Announce you're done
            writeLines(paste("...done (", round(tElapsedFilter/60, digits=2), "m).", sep=""))
            flush.console()
        } else { writeLines("Not filtering.") }


        ###############################################################################
        # Convert to sam to bam, sort, and generate index file.  All donw with asBam().
        ###############################################################################
        # Define name of bam file and check that it doesn't already exist
        # asBam tacks ".bam" onto the end, so our "full" filename doesn't actually include it
        fileBamFull = paste(outDest, sub(fileFilteredSuffix, fileBamSuffix, basename(fileFiltered)), sep="")
        if ( file_checks(paste(fileBamFull, ".bam", sep=""), shouldExist=FALSE, verbose=TRUE) ) {
            writeLines("Converting to bam, sorting, and generating index file...", sep="")
            flush.console()
            # Announce output file names
            flush.console(); writeLines(paste("file will be saved as ", fileBamFull, sep=""), sep="")
            flush.console()
            # Call and time asBam()
            tStartBam = proc.time()[3]
            # asBam() automatically adds .bam to the end of the file, so get rid of that if it's there
            Rsamtools::asBam(fileFilteredFull, destination = fileBamFull)
            tElapsedBam = proc.time()[3] - tStartBam
            # Announce you're done
            writeLines(paste("...done (", round(tElapsedBam/60, digits=2), "m).", sep=""))
            flush.console()
        } else { writeLines("Not converting to bam, sorting, and generating index file.") }

        writeLines("Done with file. Don\'t be concerned by any warning messages concerning merging files; asBam() does that and it\'s fine.")

    }

}
e-myers/rnaseq documentation built on May 20, 2019, 9:14 p.m.