R/bamio.R

Defines functions getBamLocally getBamMultiMatching .ezMergeLeftRightAlignments ezReadPairedAlignments ezReadGappedAlignments ezBam2bigwig ezScanBam ezSortIndexBam ezBamSeqLengths ezBamSeqNames

Documented in ezBam2bigwig ezBamSeqLengths ezBamSeqNames .ezMergeLeftRightAlignments ezReadGappedAlignments ezReadPairedAlignments ezScanBam ezSortIndexBam getBamMultiMatching

# Functional Genomics Center Zurich
# This code is distributed under the terms of the GNU General
# Public License Version 3, June 2007.
# The terms are available here: http://www.gnu.org/licenses/gpl.html
# www.fgcz.ch


##' @title Gets sequence names
##' @description Gets sequence names from a bam file.
##' @template bamFile-template
##' @param sizeSorting a character. if equal to "decreasing" the names will be sorted decreasingly.
##' @template roxygen-template
##' @return Returns a character vector of sequence names.
##' @examples
##' bamFile <- system.file("extdata", "ex1.bam", package="Rsamtools", mustWork=TRUE)
##' ezBamSeqLengths(bamFile)
##' ezBamSeqNames(bamFile)
ezBamSeqNames <- function(bamFile, sizeSorting = "decreasing") {
  seqLengths <- ezBamSeqLengths(bamFile)
  if (sizeSorting == "decreasing") {
    seqLengths <- sort(seqLengths, decreasing = TRUE)
  }
  return(names(seqLengths))
}

##' @describeIn ezBamSeqNames Gets the sequence lengths from a bam file.
ezBamSeqLengths <- function(bamFile) {
  require(Rsamtools)
  return(scanBamHeader(bamFile)[[1]]$targets)
}

ezSortIndexBam <- function(inBam, bam, ram = 2, removeBam = TRUE, cores = 2,
                           method = c("samtools", "picard")) {
  method <- match.arg(method)
  maxMem <- str_c(floor(ram * 0.7 / cores * 1000), "M")
  ## use only 70% --> 30% safety margin before crash

  if (method == "samtools") {
    cmd <- str_c(
      method, "sort", "-l 9", "-m", maxMem, "-@", cores, inBam,
      "-o", bam,
      sep = " "
    )
    ezSystem(cmd)
    cmd <- str_c(method, "index", bam, sep = " ")
    ezSystem(cmd)
  } else {
    cmd <- str_c(
      prepareJavaTools("picard"), "SortSam",
      paste0("I=", inBam),
      paste0("O=", bam),
      "SORT_ORDER=coordinate",
      sep = " "
    )
    ezSystem(cmd)
    cmd <- paste(
      prepareJavaTools("picard"), "BuildBamIndex",
      paste0("I=", bam),
      paste0("OUTPUT=", bam, ".bai"),
      sep = " "
    )
    ezSystem(cmd)
  }
  if (removeBam) {
    file.remove(inBam)
  }
}

##' @title Scans a bam file
##' @description Scans a bam file with the option to select only a part of the output.
##' @template bamFile-template
##' @param seqname an optional character vector to keep only the specified sequence names in the returned reads.
##' @param start an optional integer vector to limit the start position of the range of returned reads.
##' @param end an optional integer vector to limit the start position of the range of returned reads.
##' @param strand a character specifying which strand to use (+, - or *).
##' @param tag passed further to \code{ScanBamParam()}.
##' @param what passed further to \code{ScanBamParam()}.
##' @param isFirstMateRead passed further to \code{scanBamFlag()}.
##' @param isSecondMateRead passed further to \code{scanBamFlag()}.
##' @param isUnmappedQuery passed further to \code{scanBamFlag()}.
##' @param isProperPair passed further to \code{scanBamFlag()}.
##' @param countOnly a logical specifying whether only counts should be returned or the whole scan of the bam file.
##' @template roxygen-template
##' @seealso \code{\link[Rsamtools]{scanBam}}
##' @seealso \code{\link[Rsamtools]{ScanBamParam}}
##' @return Returns a list representing a scanned bam file.
##' @examples
##' bamFile <- system.file("extdata", "ex1.bam", package="Rsamtools", mustWork=TRUE)
##' bam = ezScanBam(bamFile)
ezScanBam <- function(bamFile, seqname = NULL, start = NULL, end = NULL, strand = "*",
                      tag = character(0), what = scanBamWhat(),
                      isFirstMateRead = NA, isSecondMateRead = NA, isUnmappedQuery = FALSE, isProperPair = NA,
                      isSecondaryAlignment = NA, isDuplicate = NA, countOnly = FALSE) {
  ## initialize the parameters for scanBam
  require(Rsamtools)
  param <- ScanBamParam(what = what, tag = tag)

  ### build and set the flag filter
  isMinusStrand <- switch(strand, "-" = TRUE, "+" = FALSE, NA)
  bamFlag(param) <- scanBamFlag(
    isMinusStrand = isMinusStrand, isFirstMateRead = isFirstMateRead, isSecondMateRead = isSecondMateRead,
    isUnmappedQuery = isUnmappedQuery, isProperPair = isProperPair, isSecondaryAlignment = isSecondaryAlignment, isDuplicate = isDuplicate
  )

  ### limit the returned reads by chromosome and start/end if these are provided
  if (!is.null(seqname)) {
    seqLengths <- ezBamSeqLengths(bamFile)
    if (!seqname %in% names(seqLengths)) {
      return(list(error = paste("Specified seqname: ", seqname, " not in chromosome list of ", bamFile, ":", paste(names(seqLengths), collapse = " "))))
    }
    if (is.null(start)) start <- 1
    if (is.null(end)) end <- seqLengths[seqname]
    bamWhich(param) <- GRanges(seqnames = seqname, ranges = IRanges(start = start, end = end))
  }
  if (countOnly) {
    return(countBam(bamFile, param = param))
  } else {
    return(scanBam(bamFile, param = param)[[1]])
  }
}

##' @title Converts from bam to bigwig
##' @description Converts a bam file into a bigwig file.
##' @template bamFile-template
##' @param bigwigPrefix a character representing a prefix to add to the bigwig file names.
##' @param param a list of parameter to extract the \code{strandMode} from.
##' @param paired a logical indicating whether the samples are paired.
##' @template roxygen-template
##' @seealso \code{\link[Rsamtools]{scanBamHeader}}
##' @examples
##' bamFile <- system.file("extdata", "ex1.bam", package="Rsamtools", mustWork=TRUE)
##' ezBam2bigwig(bamFile, bigwigPrefix="bg", param=ezParam(), paired=TRUE)
ezBam2bigwig <- function(bamFile, bigwigPrefix, param = NULL, paired = NULL) {

  ## from bam to wig
  require(Rsamtools)
  sh <- scanBamHeader(bamFile)
  seqLengths <- sh[[1]]$targets
  chromSize <- data.frame(seqLengths, row.names = names(seqLengths))
  chromSizeFile <- paste0("chromSize-", Sys.getpid(), ".txt")
  write.table(chromSize, sep = "\t", file = chromSizeFile, col.names = FALSE, quote = FALSE)
  outputWig <- paste0("outputWig-", Sys.getpid())
  strand_rule <- NULL
  if (paired) {
    strand_rule <- switch(param$strandMode,
      both = "",
      sense = "--strand='1++,1--,2+-,2-+'",
      antisense = "--strand='1+-,1-+,2++,2--'"
    )
  } else {
    strand_rule <- switch(param$strandMode,
      both = "",
      sense = "--strand='++,--'",
      antisense = "--strand='+-,-+'"
    )
  }
  # bam2wig.py is in RSeQC module
  cmd <- paste("bam2wig.py", "-i", bamFile, "-s", chromSizeFile, "-o", outputWig, strand_rule)
  ezSystem(cmd)

  ## from wig to bigwig
  wigFiles <- list.files(path = ".", pattern = paste0(outputWig, ".*\\.wig$"))
  bigwigFiles <- gsub(outputWig, bigwigPrefix, wigFiles)
  bigwigFiles <- gsub(".wig$", ".bw", bigwigFiles)
  for (i in 1:length(wigFiles)) {
    cmd <- paste("wigToBigWig", wigFiles[i], chromSizeFile, bigwigFiles[i])
    try(ezSystem(cmd))
  }

  file.remove(wigFiles)
  file.remove(chromSizeFile)
}

##' @title Reads gapped alignments from bam
##' @description Reads gapped alignments from a bam file.
##' @template bamFile-template
##' @param seqname an optional character vector to keep only the specified sequence names in the returned reads.
##' @param start an optional integer vector to limit the start position of the range of returned reads.
##' @param end an optional integer vector to limit the start position of the range of returned reads.
##' @param strand a character specifying which strand to use (+, - or *).
##' @param tag passed further to \code{ScanBamParam()}.
##' @param what passed further to \code{ScanBamParam()}.
##' @param use.names passed further to \code{readGAlignments()}.
##' @param isFirstMateRead passed further to \code{scanBamFlag()}.
##' @param isSecondMateRead passed further to \code{scanBamFlag()}.
##' @param isUnmappedQuery passed further to \code{scanBamFlag()}.
##' @param isProperPair passed further to \code{scanBamFlag()}.
##' @param minMapQuality an integer specifying the minimal mapping quality to use.
##' @param keepMultiHits a logical specifying whether to keep multiple hits.
##' @template roxygen-template
##' @seealso \code{\link[Rsamtools]{scanBam}}
##' @seealso \code{\link[Rsamtools]{ScanBamParam}}
##' @seealso \code{\link[GenomicAlignments]{readGAlignments}}
##' @return Returns an object of the class GAlignments.
##' @examples
##' bamFile <- system.file("extdata", "ex1.bam", package="Rsamtools", mustWork=TRUE)
##' ga = ezReadGappedAlignments(bamFile)
ezReadGappedAlignments <- function(bamFile, seqname = NULL, start = NULL, end = NULL, strand = "*",
                                   tag = character(0), what = character(0), use.names = TRUE,
                                   isFirstMateRead = NA, isSecondMateRead = NA,
                                   isUnmappedQuery = FALSE, isProperPair = NA,
                                   isSecondaryAlignment = NA,
                                   minMapQuality = 0,
                                   tagFilter = list(),
                                   keepMultiHits = TRUE) {
  ## initialize the parameters for scanBam
  require(Rsamtools)
  require(GenomicAlignments)
  if (minMapQuality > 0) {
    what <- union(what, "mapq")
  }
  if (!keepMultiHits) {
    tag <- union(tag, c("NH", "IH"))
  }
  param <- ScanBamParam(what = what, tag = tag, tagFilter = tagFilter)

  ### build and set the flag filter
  isMinusStrand <- switch(strand, "-" = TRUE, "+" = FALSE, NA)
  bamFlag(param) <- scanBamFlag(
    isMinusStrand = isMinusStrand,
    isFirstMateRead = isFirstMateRead,
    isSecondMateRead = isSecondMateRead,
    isUnmappedQuery = isUnmappedQuery,
    isProperPair = isProperPair,
    isSecondaryAlignment = isSecondaryAlignment
  )

  ### limit the returned reads by chromosome and start/end if these are provided
  if (!is.null(seqname)) {
    seqLengths <- ezBamSeqLengths(bamFile[1])
    if (!seqname %in% names(seqLengths)) {
      return(list(error = paste("Specified seqname: ", seqname, " not in chromosome list of ", bamFile, ":", paste(names(seqLengths), collapse = " "))))
    }
    if (is.null(start)) start <- 1
    if (is.null(end)) end <- seqLengths[seqname]
    bamWhich(param) <- GRanges(seqnames = seqname, ranges = IRanges(start = start, end = end))
  }

  if (length(bamFile) > 1) {
    ga <- Reduce(c, lapply(bamFile, readGAlignments, use.names = use.names, param = param))
  } else {
    ga <- readGAlignments(bamFile, use.names = use.names, param = param)
  }
  if (minMapQuality > 0) {
    mc <- mcols(ga)
    use <- mc$mapq >= minMapQuality
    use[is.na(use)] <- TRUE ## accept also mapping quality of 255
    ga <- ga[use]
  }
  if (!keepMultiHits) {
    mc <- mcols(ga)
    numberOfHits <- NULL
    if (all(!is.na(mc$NH))) {
      numberOfHits <- mc$NH
    } else {
      if (all(!is.na(mc$IH))) {
        numberOfHits <- mc$IH
      }
    }
    if (is.null(numberOfHits)) {
      stop(paste("multi-hit information not available in bam file:", bamFile))
    }
    ga <- ga[numberOfHits == 1]
  }
  return(ga)
}

##' @title Reads paired alignments from bam
##' @description Reads paired alignments from a bam file.
##' @template bamFile-template
##' @param seqname an optional character vector to keep only the specified sequence names.
##' @param start passed further to \code{ezReadGappedAlignments()}.
##' @param end passed further to \code{ezReadGappedAlignments()}.
##' @param strand a character specifying which strand to use (+, - or *).
##' @param tag passed further to \code{ezReadGappedAlignments()}.
##' @param keepUnpaired a character indicating which aligned but unpaired reads to keep. Possible options: "none", "first", "second", "both".
##' @param fillGap a character to fill into the gaps produced by \code{ezMergeLeftRightAlignments()}.
##' @param minMapQuality passed further to \code{ezReadGappedAlignments()}.
##' @param keepMultiHits passed further to \code{ezReadGappedAlignments()}.
##' @param gaLeft the left gapped alignments.
##' @param gaRight the right gapped alignments.
##' @template roxygen-template
##' @seealso \code{\link{ezReadGappedAlignments}}
##' @return Returns an object of the class GAlignments.
##' @examples
##' bamFile <- system.file("extdata", "ex1.bam", package="Rsamtools", mustWork=TRUE)
##' ezReadPairedAlignments(bamFile)
ezReadPairedAlignments <- function(bamFile, seqname = NULL, start = NULL, end = NULL, strand = "*",
                                   tag = c("NH"), keepUnpaired = "both", fillGap = "N", minMapQuality = 0, keepMultiHits = TRUE) {
  require(Rsamtools)
  require(GenomicAlignments)
  .loadPairedSingleChrom <- function(chrom) {
    return(ezReadPairedAlignments(bamFile,
      seqname = chrom, start = start, end = end, strand = strand,
      tag = tag, keepUnpaired = keepUnpaired, minMapQuality = minMapQuality, keepMultiHits = keepMultiHits
    ))
  }

  if (is.null(seqname)) {
    ## if we load the entire genome we run it multi-threaded and combine then the different
    ## GappedAlignment objects
    job <- ezJobStart(paste("readPaired", bamFile))
    seqNames <- ezBamSeqNames(bamFile)
    gaList <- ezMclapply(seqNames, .loadPairedSingleChrom, mc.preschedule = FALSE)
    names(gaList) <- seqNames
    ezWriteElapsed(job, "chromosomes loaded")
    ga <- GAlignments()
    for (sn in seqNames) {
      ga <- c(ga, gaList[[sn]])
      gaList[[sn]] <- NULL
      gc()
    }
    ezWriteElapsed(job, "chromosomes merged")
    return(ga)
  }

  seqLengths <- ezBamSeqLengths(bamFile)
  gaAll <- GAlignments(seqnames = Rle(factor(levels = names(seqLengths))), seqlengths = seqLengths)
  if (strand %in% c("+", "*")) {
    gaLeft <- ezReadGappedAlignments(bamFile,
      seqname = seqname, start = start, end = end, strand = "+",
      isFirstMateRead = TRUE, isSecondMateRead = FALSE, isProperPair = TRUE, what = c("mpos"), tag = tag,
      minMapQuality = minMapQuality, keepMultiHits = keepMultiHits
    )
    # ezWriteElapsed(job, "loaded paired gaLeft for positive strand")
    gaRight <- ezReadGappedAlignments(bamFile,
      seqname = seqname, start = start, end = end, strand = "-",
      isFirstMateRead = FALSE, isSecondMateRead = TRUE, isProperPair = TRUE, what = c("mpos"), tag = tag,
      minMapQuality = minMapQuality, keepMultiHits = keepMultiHits
    )
    # ezWriteElapsed(job, "loaded paired gaRight for positive strand")
    ## strand of gaRight will be wrong but we will never use it!!
    gaPositive <- .ezMergeLeftRightAlignments(gaLeft, gaRight, fillGap = fillGap)
    rm(gaLeft, gaRight)
    gc()
    gaAll <- c(gaAll, gaPositive)
    rm(gaPositive)
    gc()
  }
  if (strand %in% c("-", "*")) {
    gaLeft <- ezReadGappedAlignments(bamFile,
      seqname = seqname, start = start, end = end, strand = "+",
      isFirstMateRead = FALSE, isSecondMateRead = TRUE, isProperPair = TRUE, what = c("mpos"), tag = tag,
      minMapQuality = minMapQuality, keepMultiHits = keepMultiHits
    )
    # ezWriteElapsed(job, "loaded paired gaLeft for negative strand")
    gaRight <- ezReadGappedAlignments(bamFile,
      seqname = seqname, start = start, end = end, strand = "-",
      isFirstMateRead = TRUE, isSecondMateRead = FALSE, isProperPair = TRUE, what = c("mpos"), tag = tag,
      minMapQuality = minMapQuality, keepMultiHits = keepMultiHits
    )
    # ezWriteElapsed(job, "loaded paired gaRight for negative strand")
    gaNegative <- .ezMergeLeftRightAlignments(gaLeft, gaRight)
    if (!is.null(gaNegative)) {
      strand(gaNegative) <- "-"
    }
    rm(gaLeft, gaRight)
    gc()
    gaAll <- c(gaAll, gaNegative)
    rm(gaNegative)
    gc()
  }
  ## by default we load also read that were aligned but not paired with their mate
  stopifnot(keepUnpaired %in% c("none", "first", "second", "both"))
  if (keepUnpaired != "none") {
    if (keepUnpaired == "both") {
      isFirstMateRead <- NA
      isSecondMateRead <- NA
    }
    if (keepUnpaired == "first") {
      isFirstMateRead <- TRUE
      isSecondMateRead <- FALSE
    }
    if (keepUnpaired == "second") {
      isFirstMateRead <- FALSE
      isSecondMateRead <- TRUE
    }
    gaSingle <- ezReadGappedAlignments(bamFile,
      seqname = seqname, start = start, end = end, strand = strand,
      isProperPair = FALSE,
      isFirstMateRead = isFirstMateRead, isSecondMateRead = isSecondMateRead,
      what = c("flag"), tag = tag,
      minMapQuality = minMapQuality, keepMultiHits = keepMultiHits
    )
    isAlsoPaired <- names(gaSingle) %in% names(gaAll)
    if (any(isAlsoPaired)) {
      gaSingle <- gaSingle[!isAlsoPaired]
    }
    doFlipStrand <- bamFlagTest(values(gaSingle)$flag, "isSecondMateRead")
    if (any(doFlipStrand)) {
      myStrand <- strand(gaSingle)
      setToPos <- myStrand == "-" & doFlipStrand
      setToNeg <- myStrand == "+" & doFlipStrand
      if (any(setToPos)) {
        myStrand[setToPos] <- "+"
      }
      if (any(setToNeg)) {
        myStrand[setToNeg] <- "-"
      }
      strand(gaSingle) <- myStrand
    }
    values(gaSingle)$flag <- NULL
    gaAll <- c(gaAll, gaSingle)
    rm(gaSingle)
    gc()
  }
  return(gaAll)
}

##' @describeIn ezReadPairedAlignments Merges left and right alignments and fills gaps with \code{fillGap}.
.ezMergeLeftRightAlignments <- function(gaLeft, gaRight, fillGap = "N") {
  ## include the multi-paired-hits
  idx <- match(
    paste(names(gaLeft), values(gaLeft)[["mpos"]]),
    paste(names(gaRight), start(gaRight))
  )
  isNaIdx <- is.na(idx)
  gaAll <- gaLeft[isNaIdx]
  values(gaAll)$mpos <- NULL
  if (any(isNaIdx)) {
    message(paste("found invalid pairs:", sum(isNaIdx), "/", length(isNaIdx)))
    gaLeft <- gaLeft[!isNaIdx]
    gaRight <- gaRight[idx[!isNaIdx]]
  } else {
    gaRight <- gaRight[idx]
  }
  values(gaRight)$mpos <- NULL
  values(gaLeft)$mpos <- NULL


  dist <- start(gaRight) - end(gaLeft) - 1
  ## some useful index
  # gaRight is totally to the right of gaLeft
  hasGap <- dist >= 0
  # has overhang
  hasOverhang <- end(gaRight) > end(gaLeft)
  ## check if there are insertions or deletions
  hasIndel <- ezGrepl("D|I", cigar(gaRight))

  ## situation one: gap or zero distance between gaLeft and gaRight
  if (any(hasGap)) {
    gapString <- sub("^0N$", "", paste0(format(dist[hasGap], scientific = FALSE, trim = TRUE), fillGap))
    mergedCigar <- paste0(cigar(gaLeft)[hasGap], gapString, cigar(gaRight)[hasGap])
    gaGap <- GAlignments(
      seqnames = seqnames(gaLeft)[hasGap], pos = start(gaLeft)[hasGap], cigar = mergedCigar, strand = strand(gaLeft)[hasGap],
      names = names(gaLeft)[hasGap]
    )
    values(gaGap) <- values(gaLeft)[hasGap, , drop = FALSE]
    gaAll <- c(gaGap, gaAll)
  }

  ## situation two: overlap and overhang no indel problems
  useNarrow <- !hasGap & hasOverhang & !hasIndel
  if (any(useNarrow)) {
    gaRightNarrowed <- narrow(gaRight[useNarrow], -dist[useNarrow] + 1)
    distNarrowed <- start(gaRightNarrowed) - end(gaLeft[useNarrow]) - 1
    gapString <- sub("^0N$", "", paste0(distNarrowed, "N"))
    mergedCigar <- paste0(cigar(gaLeft)[useNarrow], gapString, cigar(gaRightNarrowed))
    gaNew <- GAlignments(
      seqnames = seqnames(gaLeft)[useNarrow], pos = start(gaLeft)[useNarrow],
      cigar = mergedCigar, strand = strand(gaLeft)[useNarrow],
      names = names(gaLeft)[useNarrow]
    )
    values(gaNew) <- values(gaLeft)[useNarrow, , drop = FALSE]
    gaAll <- c(gaNew, gaAll)
  }
  useNarrowIndel <- !hasGap & hasOverhang & hasIndel
  if (any(useNarrowIndel)) {
    ## TODO: do the narrowing of the right in the presence of indel in gaRight
    ## Current: uses only the left as a workaround!!!!
    gaAll <- c(gaLeft[useNarrowIndel], gaAll)
  }

  ## situation three: overlap but no overhang --> use left reads
  useLeft <- !hasGap & !hasOverhang
  if (any(useLeft)) {
    gaAll <- c(gaLeft[useLeft], gaAll)
  }
  return(gaAll)
}


##' @title Gets bam multi matching
##' @description Gets bam multi matching and returns the result as an integer vector.
##' @param param a list of parameters:
##' \itemize{
##'   \item{paired}{ a logical indicating whether the samples are paired.}
##' }
##' @template bamFile-template
##' @param nReads an integer specifying the number of reads. These additional reads will be counted in "0"
##' @template roxygen-template
##' @return Returns a integer vector specifying the amount of matching reads.
##' @examples
##' param = ezParam()
##' bamFile <- system.file("extdata", "ex1.bam", package="Rsamtools", mustWork=TRUE)
##' getBamMultiMatching(param, bamFile, nReads=10000)
getBamMultiMatching <- function(param, bamFile, nReads = NULL) {
  require(data.table)
  require(Rsamtools)

  # This data.table based implementation is the fastest and most elegant!
  # test file: /srv/gstore/projects/p2438/STAR_18564_2017-06-12--13-46-30/26EV_d3_A.bam
  ## Shell based is ugly and slow. Writing temps to disks. 1534.172 seconds
  ## table(table()) method is more tidy but slow. 1521.440 seconds + reading qname
  ## plyr::count is slow: 1770.432 seconds + reading qname
  ## rle(sort()) is slow: 1781.552 seconds + reading qname
  ## data.table: 4.036 seconds + reading qname

  #  job = ezJobStart(paste("bam multimatch", basename(bamFile)))
  paramBam <- ScanBamParam(what = "qname")
  bamFlag(paramBam) <- scanBamFlag(isUnmappedQuery = FALSE)
  if (isTRUE(param$paired)) {
    #     bamFlag(param) = switch(param$paired,
    #                       paired=scanBamFlag(isFirstMateRead=TRUE, isProperPair=TRUE, isUnmappedQuery=FALSE),
    #                       first=scanBamFlag(isFirstMateRead=TRUE, isUnmappedQuery=FALSE),
    #                       second=scanBamFlag(isSecondMateRead=TRUE, isUnmappedQuery=FALSE))
    bamFlag(paramBam) <- scanBamFlag(
      isFirstMateRead = TRUE, isProperPair = param$keepProperPairsOnly,
      isUnmappedQuery = FALSE
    )
  }
  bamReads <- scanBam(bamFile, param = paramBam)[[1]]$qname
  bamReadsDT <- as.data.table(bamReads)
  bamReadsDTCount <- bamReadsDT[, .N, by = bamReads]
  colnames(bamReadsDTCount)[2] <- "qnameN"
  bamReadsDTCount2 <- bamReadsDTCount[, .N, by = qnameN]
  bamReadsDTCount2 <- bamReadsDTCount2[order(qnameN)]
  result <- set_names(
    bamReadsDTCount2$N,
    bamReadsDTCount2$qnameN
  )
  # result = table(table(bamReads))
  # if (param$paired){
  #  flagOption = "-f 3 -F 132"
  #     switch(param$pairedMode,
  #                         paired="-f 3 -F 132",
  #                         first="-F 132",
  #                         second="-F 68")
  # } else {
  #  flagOption = "-F 4"
  # }
  # countFile = paste0(Sys.getpid(), "-BamMultiMatch.txt")
  # cmd = paste("samtools", "view", flagOption, bamFile, "| cut -f1 | sort | uniq -c | cut -c 1-7 | sort | uniq -c >", countFile)
  ## direct usage of regular expression, slow
  # cmd = paste("samtools", "view", flagOption, bamFile, "| cut -f1 | sort | uniq -c | grep -o "[[:blank:]]*[[:digit:]]\+[ ]" | sort | uniq -c >", countFile)
  ## trim the leading spaces first and then cut the first field, faster
  ## set the temp directory to be the current one, because /tmp may be too small
  # cmd = paste("samtools", "view", flagOption, bamFile, "| cut -f1 | sort --temporary-directory=. | uniq -c | sed -e \"s/^[ \t]*//\" | cut -f1 -d\" \" |sort | uniq -c >", countFile)

  # ezSystem(cmd)
  # temp = read.table(countFile)
  # tempOrdered = temp[order(temp[,2]), ]
  # result = tempOrdered[ ,1]
  # names(result) = tempOrdered[, 2]
  # file.remove(countFile)

  if (!is.null(nReads)) {
    nReads <- as.integer(nReads)
    if (!is.na(nReads)) {
      nReadsUnmapped <- nReads - sum(result)
      stopifnot(nReadsUnmapped >= 0)
      result <- c("0" = nReadsUnmapped, result)
    }
  }
  return(result)
}

getBamLocally <- function(src, toSam = FALSE) {
  if (normalizePath(dirname(src)) %in% c(".", normalizePath(getwd()))) {
    return(src)
  }
  target <- basename(src)
  names(target) <- names(src)
  stopifnot(target != src)
  if (toSam) {
    target <- sub(".bam$", ".sam", target)
    if (file.exists(target)) file.remove(target)
    cmd <- paste("samtools", "view -h", "-o", target, src)
    ezSystem(cmd)
  } else {
    file.copy(from = src, to = target)
    file.copy(
      from = paste0(src, ".bai"),
      to = paste0(target, ".bai")
    )
  }
  return(target)
}
uzh/ezRun documentation built on April 24, 2024, 4:01 p.m.