R/BulkAlignment.R

Defines functions BulkAlignment

Documented in BulkAlignment

#' Bulk alignment function
#'
#' @param lalista list of samples
#' @param nodes logic cores
#' @param readsPath sample folders
#' @param GenomeIndex genome index
#' @param outBam output folder
#' @param threads processes
#' @param outFormat BAM or SAM
#' @param phredScore quality score
#' @param maxExtractedSubreads number of subreads
#' @param consensusVote consensus
#' @param mismatchMax mismatch
#' @param uniqueOnly no multimapping
#' @param maxMultiMapped multimapping
#' @param indelLength indel
#' @param fragmentMinLength fragment minumum length 
#' @param fragmentMaxLength fragment maximum length
#' @param matesOrientation mate orientation
#' @param readOrderConserved read order
#' @param coordinatesSorting sorting
#' @param allJunctions junctions
#' @param tempfolder temporary folder
BulkAlignment <- function(lalista, nodes,
                          readsPath,
                          GenomeIndex,
                          outBam,
                          threads,
                          outFormat,
                          phredScore,
                          maxExtractedSubreads,
                          consensusVote,
                          mismatchMax,
                          uniqueOnly,
                          maxMultiMapped,
                          indelLength,
                          fragmentMinLength,
                          fragmentMaxLength,
                          matesOrientation,
                          readOrderConserved,
                          coordinatesSorting,
                          allJunctions,
                          tempfolder
) {
    ## nested function
    ## unmapped extraction
    ## filtering BAM from unmapped
    UnmappedExtraction <- function(
        temporarySource, 
        outputBam, 
        prefix,
        sampleBasename,
        subsetting = 1000000
    ) {
        fromBamToFastq <- function(alignFile, Setting, OutPut) {
            SubsetImported <- Rsamtools::scanBam(
                file = alignFile,
                param = Setting
            )
            SubsetShortReads <- ShortRead::ShortReadQ(sread = SubsetImported[[1]]$seq,
                                                      quality = SubsetImported[[1]]$qual,
                                                      id = Biostrings::BStringSet(SubsetImported[[1]]$qname))
            ShortRead::writeFastq(object = SubsetShortReads,
                                  file = OutPut,
                                  mode = "a", compress = TRUE)
        }
        ##### R1 unmapped, mate unmapped
        ## parameters
        fieldR1unmap2 <- Rsamtools::ScanBamParam(
            flag =  Rsamtools::scanBamFlag(
                isFirstMateRead = TRUE,
                isUnmappedQuery = TRUE,
                hasUnmappedMate = TRUE),
            reverseComplement = TRUE,
            what = c("qname", "seq", "qual")
        )
        ## PROCESSING
        #1. setting chunk number of BAM
        docBam <- open(Rsamtools::BamFile(
            file.path(temporarySource,
                      paste(prefix, sampleBasename, "_supplementary", sep = "")),
            yieldSize = subsetting))
        cycleBAM <- 0
        while(nrec <- length(Rsamtools::scanBam(docBam)[[1]][[1]])) {
            cat("records:", nrec, "\n")
            cycleBAM <- cycleBAM+1
            print(cycleBAM)
        }
        close(docBam)
        #2. inizialization
        file.create(
            file.path(
                temporarySource, paste(
                    prefix, "FastqR1unmap2_", sampleBasename, "_Unmapped_R1.fq.gz", sep = "")
            )
        )
        #3. iterate,import,class,write
        docBam <- open(Rsamtools::BamFile(
            file.path(temporarySource,
                      paste(prefix, sampleBasename, "_supplementary", sep = "")),
            yieldSize = subsetting))
        R1up <- 1
        while (R1up<=cycleBAM) {
            print(R1up)
            fromBamToFastq(alignFile = docBam,
                           Setting = fieldR1unmap2,
                           OutPut = file.path(
                               temporarySource, paste(
                                   prefix, "FastqR1unmap2_", sampleBasename, "_Unmapped_R1.fq.gz", sep = "")
                           ))
            R1up <- R1up+1
        }
        close(docBam)
        ## category 2: unmapped R2 (mate unmapped)
        ## PARAMETERS
        fieldR2unmap2 <- Rsamtools::ScanBamParam(
            flag =  Rsamtools::scanBamFlag(
                isSecondMateRead = TRUE,
                isUnmappedQuery = TRUE,
                hasUnmappedMate = TRUE),
            reverseComplement = TRUE,
            what = c("qname", "seq", "qual")
        )
        ## PROCESSing
        #2.inizialization
        file.create(
            file.path(
                temporarySource, paste(
                    prefix, "FastqR2unmap2_", sampleBasename, "_Unmapped_R2.fq.gz", sep = "")
            )
        )
        
        #3.iterate,import,class,write
        docBam <- open(Rsamtools::BamFile(
            file.path(temporarySource,
                      paste(prefix, sampleBasename, "_supplementary", sep = "")), 
            yieldSize = subsetting))
        R2up <- 1
        while (R2up<=cycleBAM) {
            print(R2up)
            fromBamToFastq(alignFile = docBam,
                           Setting = fieldR2unmap2,
                           OutPut = file.path(
                               temporarySource, paste(
                                   prefix, "FastqR2unmap2_", sampleBasename, "_Unmapped_R2.fq.gz", sep = "")
                           )
            )
            R2up <- R2up+1
        }
        close(docBam)
        ## "SPURIE": R1 unmapped e R2 mapped
        ## unmapped R1
        ## PARAMETers
        fieldR1mateUnmapped2 <- Rsamtools::ScanBamParam(
            flag = Rsamtools::scanBamFlag(
                isUnmappedQuery = TRUE,
                isFirstMateRead = TRUE,
                hasUnmappedMate = FALSE
            ), reverseComplement = TRUE, 
            what = c("qname", "seq", "qual")
        )
        ## PROCESSING
        #2.inizialization
        file.create(
            file.path(
                temporarySource, paste(
                    prefix, "FastqR1mateUnmapped2_", sampleBasename, "_Unmapped_R1.fq.gz", sep = "")
            )
        )
        #3.iterate,import,class,write
        docBam <- open(Rsamtools::BamFile(
            file.path(temporarySource,
                      paste(prefix, sampleBasename, "_supplementary", sep = "")), 
            yieldSize = subsetting))
        R1mu <- 1
        while (R1mu<=cycleBAM) {
            print(R1mu)
            fromBamToFastq(alignFile = docBam,
                           Setting = fieldR1mateUnmapped2,
                           OutPut = file.path(
                               temporarySource, paste(
                                   prefix, "FastqR1mateUnmapped2_", sampleBasename, "_Unmapped_R1.fq.gz", sep = "")
                           ))
            R1mu <- R1mu+1
        }
        ## mapped R2, mate R1 unmapped
        ## PARAMETers
        fieldR2mateMapped2 <- Rsamtools::ScanBamParam(
            flag = Rsamtools::scanBamFlag(
                isUnmappedQuery = FALSE,
                isSecondMateRead = TRUE,
                hasUnmappedMate = TRUE
            ), reverseComplement = TRUE,
            what = c("qname", "seq", "qual")
        )
        ## PROCESSing
        #2.inizialization
        file.create(
            file.path(
                temporarySource, paste(
                    prefix, "FastqR2mateMapped2_", sampleBasename, "_Unmapped_R2.fq.gz", sep = "")
            )
        )
        #3.iterate,import,class,write
        docBam <- open(Rsamtools::BamFile(
            file.path(temporarySource,
                      paste(prefix, sampleBasename, "_supplementary", sep = "")), 
            yieldSize = subsetting))
        R2mm <- 1
        while (R2mm<=cycleBAM) {
            print(R2mm)
            fromBamToFastq(alignFile = docBam,
                           Setting = fieldR2mateMapped2,
                           OutPut = file.path(
                               temporarySource, paste(
                                   prefix, "FastqR2mateMapped2_", sampleBasename, "_Unmapped_R2.fq.gz", sep = "")
                           ))
            R2mm <- R2mm+1
        }
        ## "SPURIE": R1 mapped e R2 unmapped
        ## mapped R1
        fieldR1mateMapped2 <- Rsamtools::ScanBamParam(
            flag = Rsamtools::scanBamFlag(
                isUnmappedQuery = FALSE,
                isFirstMateRead = TRUE,
                hasUnmappedMate = TRUE
            ), reverseComplement = TRUE,
            what = c("qname", "seq", "qual")
        )
        ## PROCESSing
        #2.inizialization
        file.create(
            file.path(
                temporarySource, paste(
                    prefix, "FastqR1mateMapped2_", sampleBasename, "_Unmapped_R1.fq.gz", sep = "")
            )
        )
        #3.iterate,import,class,write
        docBam <- open(Rsamtools::BamFile(
            file.path(temporarySource,
                      paste(prefix, sampleBasename, "_supplementary", sep = "")), 
            yieldSize = subsetting))
        R1mm <- 1
        while (R1mm<=cycleBAM) {
            print(R1mm)
            fromBamToFastq(alignFile = docBam,
                           Setting = fieldR1mateMapped2,
                           OutPut = file.path(
                               temporarySource, paste(
                                   prefix, "FastqR1mateMapped2_", sampleBasename, "_Unmapped_R1.fq.gz", sep = "")
                           ))
            R1mm <- R1mm+1
        }
        ## unmapped R2, mate R1 mapped
        fieldR2mateUnmapped2 <- Rsamtools::ScanBamParam(
            flag = Rsamtools::scanBamFlag(
                isUnmappedQuery = TRUE,
                isSecondMateRead = TRUE,
                hasUnmappedMate = FALSE
            ), reverseComplement = TRUE,
            what = c("qname", "seq", "qual")
        )
        ## PROCESSing
        #2.inizialization
        file.create(
            file.path(                                                                                        
                temporarySource, paste(
                    prefix, "FastqR2mateUnmapped2_", sampleBasename, "_Unmapped_R2.fq.gz", sep = "")
            )
        )
        #3.iterate,import,class,write
        docBam <- open(Rsamtools::BamFile(
            file.path(temporarySource,
                      paste(prefix, sampleBasename, "_supplementary", sep = "")), 
            yieldSize = subsetting))
        R2mu <- 1
        while (R2mu<=cycleBAM) {
            print(R2mu)
            fromBamToFastq(alignFile = docBam,
                           Setting = fieldR1mateMapped2,
                           OutPut = file.path(
                               temporarySource, paste(
                                   prefix, "FastqR2mateUnmapped2_", sampleBasename, "_Unmapped_R2.fq.gz", sep = "")
                           ))
            R2mu <- R2mu+1
        }
        ##### UNION
        ## UNION R1
        erreUno <- c(
            file.path(
                temporarySource, paste(
                    prefix, "FastqR1unmap2_", sampleBasename, "_Unmapped_R1.fq.gz", sep = "")),
            file.path(
                temporarySource, paste(
                    prefix, "FastqR1mateUnmapped2_", sampleBasename, "_Unmapped_R1.fq.gz", sep = "")),
            file.path(
                temporarySource, paste(
                    prefix, "FastqR1mateMapped2_", sampleBasename, "_Unmapped_R1.fq.gz", sep = ""))
        )
        Rfastp::catfastq(
            output = 
                file.path(
                    outputBam, paste(
                        prefix, "R1_", sampleBasename, "_unmapped.fastq.gz", sep = "")),
            inputFiles = erreUno,
            append = FALSE,
            paired = FALSE
        )
        ## UNION R2
        erreDue <- c(
            file.path(
                temporarySource, paste(
                    prefix, "FastqR2unmap2_", sampleBasename, "_Unmapped_R2.fq.gz", sep = "")),
            file.path(
                temporarySource, paste(
                    prefix, "FastqR2mateMapped2_", sampleBasename, "_Unmapped_R2.fq.gz", sep = "")),
            file.path(
                temporarySource, paste(
                    prefix, "FastqR2mateUnmapped2_", sampleBasename, "_Unmapped_R2.fq.gz", sep = ""))
        )
        Rfastp::catfastq(
            output = file.path(
                outputBam, paste(
                    prefix, "R2_", sampleBasename, "_unmapped.fastq.gz", sep = "")),
            inputFiles = erreDue,
            append = FALSE,
            paired = FALSE)
        ## FILTER BAM FROM UNMAPPED 
        ## filter parameters
        parameters <- Rsamtools::ScanBamParam(
            flag = Rsamtools::scanBamFlag(
                isPaired = TRUE,
                isUnmappedQuery = FALSE,
                hasUnmappedMate = FALSE
            ), 
            what = Rsamtools::scanBamWhat())
        ## sorting bam
        Rsamtools::sortBam(
            file = file.path(temporarySource,
                             paste(prefix, sampleBasename, "_supplementary", sep = "")),
            destination = file.path(temporarySource, paste(
                prefix, sampleBasename, "_sorted", sep = "")))
        ## index bam
        Rsamtools::indexBam(files = file.path(temporarySource, paste(
            prefix, sampleBasename, "_sorted", ".bam", sep = "")))
        ## filter e write bam
        Rsamtools::filterBam(
            file = file.path(temporarySource, paste(
                prefix, sampleBasename, "_sorted", ".bam", sep = "")),
            destination = 
                file.path(outputBam, paste(
                    prefix, sampleBasename, ".bam", sep = "")),
            param = parameters,
            indexDestination = FALSE)
    }
    ## parallelization
    parallel::parLapply(cl = clust <- parallel::makeCluster(spec = nodes, type = "PSOCK"), 
                        X = lalista,
                        fun = function(x,
                                       readsPath,
                                       GenomeIndex,
                                       outBam,
                                       threads,
                                       outFormat,
                                       phredScore,
                                       maxExtractedSubreads,
                                       consensusVote,
                                       mismatchMax,
                                       uniqueOnly,
                                       maxMultiMapped,
                                       indelLength,
                                       fragmentMinLength,
                                       fragmentMaxLength,
                                       matesOrientation,
                                       readOrderConserved,
                                       coordinatesSorting,
                                       allJunctions,
                                       tempfolder
                        ) {
                            ## first alignment
                            Rsubread::subjunc(
                                readfile1 = file.path(readsPath,
                                                      list.files(
                                                          path = readsPath,
                                                          pattern =
                                                              paste(x, "_1.fastq$|_1.fq$|_1.fastq.gz$|_1.fq.gz$", 
                                                                    sep = "")
                                                      )
                                ),
                                readfile2 = file.path(readsPath,
                                                      list.files(
                                                          path = readsPath,
                                                          pattern =
                                                              paste(x, "_2.fastq$|_2.fq$|_2.fastq.gz$|_2.fq.gz$", 
                                                                    sep = "")
                                                      )
                                ),
                                index = GenomeIndex, 
                                output_file = file.path(tempfolder,
                                                        paste("Alignment", x, "_supplementary", sep = "")
                                ),
                                nthreads = threads,
                                output_format = outFormat,
                                phredOffset = phredScore,
                                nsubreads = maxExtractedSubreads,
                                TH1 = consensusVote,
                                TH2 = consensusVote,
                                maxMismatches = mismatchMax,
                                unique = uniqueOnly,
                                nBestLocations = maxMultiMapped,
                                indels = indelLength,
                                maxFragLength = fragmentMinLength,
                                minFragLength = fragmentMaxLength,
                                PE_orientation = matesOrientation,
                                keepReadOrder = readOrderConserved,
                                sortReadsByCoordinates = coordinatesSorting,
                                reportAllJunctions = allJunctions,
                            )## end of subjunc1
                            ## extracting unmapped
                            UnmappedExtraction(temporarySource = tempfolder,
                                               outputBam = outBam,
                                               prefix = "Alignment",
                                               sampleBasename = x)
                            ## rescue indel.vcf
                            fileRescue <- c(".indel.vcf", ".junction.bed", ".summary")
                            file.rename(
                                from = file.path(tempfolder, 
                                                 paste("Alignment", x, "_supplementary", fileRescue, sep = "")), 
                                to = file.path(outBam, 
                                               paste("Alignment", x, fileRescue, sep = ""))
                            )
                            ## eliciting cleaning RAM memory
                            base::gc(reset = TRUE)
                        }, ## end of the function body
                        readsPath,
                        GenomeIndex,
                        outBam,
                        threads,
                        outFormat,
                        phredScore,
                        maxExtractedSubreads,
                        consensusVote,
                        mismatchMax,
                        uniqueOnly,
                        maxMultiMapped,
                        indelLength,
                        fragmentMinLength,
                        fragmentMaxLength,
                        matesOrientation,
                        readOrderConserved,
                        coordinatesSorting,
                        allJunctions,
                        tempfolder
    ) ## end of parlapply
    parallel::stopCluster(cl = clust) ## end of node communications
    unlink(tempfolder, recursive = TRUE)
}

Try the inDAGO package in your browser

Any scripts or data that you put into this service are public.

inDAGO documentation built on Aug. 8, 2025, 7:47 p.m.