R/shiftGAlignmentsList.R

Defines functions shiftGAlignmentsList

Documented in shiftGAlignmentsList

#' @title shift 5' ends
#' @description shift the GAlignmentsLists by 5' ends. 
#' All reads aligning to the positive strand will be offset by +4bp, 
#' and all reads aligning to the negative strand will be offset -5bp by default.
#' @param gal An object of \link[GenomicAlignments:GAlignmentsList-class]{GAlignmentsList}.
#' @param positive integer(1). the size to be shift for positive strand
#' @param negative integer(1). the size to be shift for negative strand
#' @param outbam file path to save shift reads. If missing, no file will be write.
#' @return An object of \link[GenomicAlignments:GAlignments-class]{GAlignments} with 5' end 
#' shifted reads.
#' @author Jianhong Ou
#' @export
#' @import S4Vectors
#' @import GenomicRanges
#' @importFrom Rsamtools mergeBam
#' @importFrom rtracklayer export
#' @examples
#' bamfile <- system.file("extdata", "GL1.bam", package="ATACseqQC")
#' tags <- c("AS", "XN", "XM", "XO", "XG", "NM", "MD", "YS", "YT")
#' library(BSgenome.Hsapiens.UCSC.hg19)
#' which <- as(seqinfo(Hsapiens)["chr1"], "GRanges")
#' gal <- readBamFile(bamfile, tag=tags, which=which, asMates=TRUE)
#' objs <- shiftGAlignmentsList(gal)
#' export(objs, "shift.bam")
shiftGAlignmentsList <- function(gal, positive=4L, negative=5L, outbam){
    stopifnot(is.integer(positive))
    stopifnot(is.integer(negative))
    stopifnot(is(gal, "GAlignmentsList"))
    if(length(gal)==0){
      ## big file mode
      meta <- metadata(gal)
      if(!all(c("file", "param") %in% names(meta))){
        stop("length of gal could not be 0.")
      }
      ow <- getOption("warn")
      on.exit(options(warn = ow))
      options(warn=-1)
      chunk <- 100000
      index <- ifelse(length(meta$index)>0, meta$index, meta$file)
      bamfile <- BamFile(meta$file, index=index, yieldSize=chunk, asMates = meta$asMates)
      outfile <- NULL
      mpos <- NULL
      open(bamfile)
      while (length(chunk0 <- readGAlignmentsList(bamfile, param=meta$param))) {
        gal1 <- shiftGAlignmentsList(chunk0, positive = positive, negative = negative)
        this.mpos <- mcols(gal1)$mpos
        if(length(this.mpos)!=length(gal1)){
          stop("Can not get mpos info from the reads.")
        }
        names(this.mpos) <- paste(mcols(gal1)$qname, start(gal1))
        mpos <- c(mpos, this.mpos)
        outfile <- c(tempfile(fileext = ".bam"), outfile)
        if(length(meta$header)>0) metadata(gal1)$header <- meta$header
        exportBamFile(gal1, outfile[1])
        rm(gal1)
      }
      close(bamfile)
      if(length(outfile)>1){
        mergedfile <- mergeBam(outfile, 
                               destination=tempfile(fileext = ".bam"), 
                               indexDestination=TRUE,
                               header=meta$file)
        unlink(outfile)
        unlink(paste0(outfile, ".bai"))
      }else{
        if(length(outfile)==1){
          mergedfile <- outfile
        }else{
          stop("Can not get any proper mapped reads from  your inputs.")
        }
      }
      if(!missing(outbam)){
        file.copy(from=mergedfile, to=outbam)
        file.copy(from=paste0(mergedfile, ".bai"), 
                  to=paste0(outbam, ".bai"))
        gal1 <- GAlignments()
        meta$file <- outbam
        meta$index <- outbam
        meta$asMates <- FALSE
        meta$mpos <- mpos
        metadata(gal1) <- meta
      }else{
        gal1 <- readGAlignments(mergedfile, param = meta$param)
        mcols(gal1)$MD <- NULL
        names(gal1) <- mcols(gal1)$qname
        gal1 <- gal1[order(names(gal1))]
        mcols(gal1)$mpos <- mpos[paste(mcols(gal1)$qname, start(gal1))]
        unlink(mergedfile)
        unlink(paste0(mergedfile, ".bai"))
      }
      return(gal1)
    }
    stopifnot(length(gal)>0)
    stopifnot(all(elementNROWS(gal)<3))
    ## move the 5'end
    ## first reads is 5'end
    gal1 <- unlist(gal)
    gp <- rep(seq_along(gal), elementNROWS(gal))
    k <- width(gal1) <= max(c(positive, negative))
    if(any(k)){
      gal <- gal[-gp[k]]
      gal1 <- unlist(gal)
    }
    gp <- rep(1, length(gal1))
    gp1 <- rep(seq_along(gal), elementNROWS(gal))
    gp[duplicated(gp1)] <- 2
    mcols(gal1)$MD <- NULL
    gal1 <- shiftReads(gal1, 
                       positive=positive,
                       negative=negative)
    names(gal1) <- mcols(gal1)$qname
    id2 <- which(gp==2)
    id1 <- id2-1
    mcols(gal1)$mpos[gp==2] <- start(gal1)[id1]
    mcols(gal1)$mpos[id1] <- start(gal1)[id2]
    if(length(mcols(gal1)$MC)>0){
      mcols(gal1)$MC[gp==2] <- cigar(gal1[id1])
      mcols(gal1)$MC[id1] <- cigar(gal1[id2])
    }
    gal1 <- gal1[!is.na(mcols(gal1)$mpos)] ## remove the single ends
    if(length(gal1)<1){
      message("Only single end reads")
      return(gal1)
    }
    ## till now, gal1 must have mrnm, mpos, names and flag
    stopifnot(length(mcols(gal1)$mrnm)>0)
    stopifnot(length(mcols(gal1)$mpos)>0)
    stopifnot(length(mcols(gal1)$flag)>0)
    stopifnot(length(names(gal1))>0)
    if(!(missing(outbam))){
      tryCatch(exportBamFile(gal1, outbam),
               error=function(e){
                 message(e)
               })
    }
    return(gal1)
}

Try the ATACseqQC package in your browser

Any scripts or data that you put into this service are public.

ATACseqQC documentation built on Nov. 8, 2020, 11 p.m.