R/pileupMT.R

Defines functions .sanCheck .indelToMVR .puToMvr .reverseCigar .getRefSeq .getRefSyn .refSeqLengths .puToGR .byPos pileupMT

Documented in pileupMT

#' Pileup the mitochondrial reads in a BAM, for variant calling or CN purposes. 
#'
#' If a BAM filename is given, but no ScanBamParam, scanMT(bam) will be called. 
#' Human mitochondrial genomes (GRCh37+ and hg38+) are fully supported, mouse
#' mitochondrial genomes (C57BL/6J aka NC_005089) are only somewhat supported.
#' 
#' @param bam         BAM (must be indexed!) file name or object with @bam slot
#' @param sbp         optional ScanBamParam object (autogenerated if missing) 
#' @param pup         optional PileupParam object (autogenerated if missing)
#' @param callIndels  call indels? (This can reduce compute and RAM if FALSE.)
#' @param ref         aligned reference mitogenome (default is rCRS/GRCh37+)
#' @param ...         additional args to pass on to the pileup() function
#'
#' @return            an MVRanges object, constructed from pileup results 
#' 
#' @import GenomicAlignments
#' @import GenomeInfoDb
#' @import Rsamtools
#' 
#' @examples
#' library(MTseekerData)
#' BAMdir <- system.file("extdata", "BAMs", package="MTseekerData")
#' patientBAMs <- list.files(BAMdir, pattern="^pt.*.bam$")
#' (bam <- file.path(BAMdir, patientBAMs[1]))
#' (sbp <- scanMT(bam))
#' (mvr <- pileupMT(bam, sbp=sbp, callIndels=TRUE, ref="rCRS"))
#'
#' @export
pileupMT <- function(bam, sbp=NULL, pup=NULL, callIndels=TRUE, ref=c("rCRS","GRCh37","GRCh38","hg38", "GRCm38","C57BL/6J","NC_005089","mm10"), ...) { 
  
  # List of reference genomes and their lengths
  refSeqLengths <- .refSeqLengths() # synonyms 
  
  # Stop if the user forgets to specify a reference genome (FIXME: autodetect)
  if (length(ref) > 1) {
    stop("You forgot to set a reference genome! MTseeker currently supports: ", paste(names(refSeqLengths), collapse= ", "))
  }
  
  # scant support for other mitogenomes
  if (!ref %in% names(refSeqLengths)) {
    stop("Only the ", paste(names(refSeqLengths), collapse=" and "),
         " mitochondrial genome", 
         ifelse(length(refSeqLengths) > 1, "s are", " is"),
         " supported by pileupMT() at present.")
  }
  
  # Determine which reference genome to use
  # Currently supports mouse and human
  ref <- .getRefSyn(match.arg(ref)) # matching 
  
  # If you are working with multiple files
  if (length(bam) > 1) {
    mvrl <- MVRangesList(lapply(bam, pileupMT, ref=ref, sbp=sbp))
    sampNames <- lapply(bam, 
                        function(x) 
                          base::sub(paste0(".", ref), "", 
                                    base::sub(".bam", "",  basename(x))))
    names(mvrl) <- sampNames
    return(mvrl)
  }
  
  # can support multiple bams if we have sample names
  # perhaps it is worthwhile to autoextract them now
  
  if (is.null(sbp)) sbp <- scanMT(bam, mapqFilter=20) 
  bamWhat(sbp) <- "strand"

  if (is.null(pup)) pup <- PileupParam(distinguish_strands=TRUE, 
                                       distinguish_nucleotides=TRUE, 
                                       ignore_query_Ns=TRUE, 
                                       include_deletions=callIndels, 
                                       include_insertions=callIndels) 
  
  pu <- pileup(file=bam, scanBamParam=sbp, pileupParam=pup, ...)
  
  # may be handy for editing 
  refSeqDNA <- .getRefSeq(ref)
  
  # Need the total number of reads in order to calculate coverage
  readsSBP <- sbp
  bamWhat(readsSBP) <- "seq"
  reads <- readGAlignments(file=bam, param=readsSBP)
  numReads <- length(reads)
  readsWidth <- median(qwidth(reads)) - 1
  
  # Name of the bam/sample
  sampNames <- base::sub(paste0(".", ref), "", base::sub(".bam", "",  basename(bam)))

  # number of reads * length of reads / length of ref sequence
  covg <- round(numReads * readsWidth / width(refSeqDNA))
  if (is.na(covg)) covg <- 0


  if (callIndels == FALSE) { 
    indels <- subset(pu, FALSE) # brute force 
  } else {   
    # will need to handle '-' and '+' separately 
    indels <- subset(pu, nucleotide %in% c('-', '+'))
    
    if (nrow(indels) > 0) {
      # for obtaining the supporting reads and CIGARs from the BAM: 
      indels$start <- indels$pos
      indels$end <- indels$pos 
      indelSBP <- sbp
      bamWhich(indelSBP) <- as(indels, "GRanges")
      bamWhat(indelSBP) <- "seq"
      indelReads <- readGAlignments(file=bam, param=indelSBP)
      
      # Get the reads the physically contain the insertion and deletions
      indelIndex <- grep("D|I", cigar(indelReads))

      # Every other read will support the reference
      everything <- seq(1, length(indelReads), 1)
      refIndex <- which(is.na(match(everything, indelIndex)))
      refSupport <- indelReads[refIndex]
      
      # Now this contains just indel reads
      # Hopefully this will speed things up with less iteration
      indelReads <- indelReads[indelIndex]
      
      # Initialize to get the alternative sequences
      mcols(indelReads)$indelStart <- NA_integer_
      mcols(indelReads)$indelEnd <- NA_integer_
      
      mcols(indelReads)$ref <- NA_character_
      mcols(indelReads)$alt <- NA_character_

      # This is to help with debugging
      mcols(indelReads)$bam <- sampNames
      #mcols(indelReads)$potentialHaplo <- FALSE
      
      message("Tallying indels for: ", sampNames)

      if (length(indelIndex) > 500) {
        message("This may take a while, determining ", length(indelIndex), 
                " reads for indels from ", sampNames)
      }
      
      # Empty GAlignment that will have new indels appended to it
      newIndelReads <- indelReads[0]
      
      # Returns the ref and alt read for each variant
      #lapply(indelReads, .reverseCigar, ref=ref, reference=refSeqDNA)
      for (i in 1:length(indelReads)) {
        newIndelReads <- append(newIndelReads, 
                                .reverseCigar(indelReads[i], ref, refSeqDNA, i))
      }
       
      # Converts to a MVRanges
      mvrIndel <- .indelToMVR(newIndelReads,refSupport, ref, bam, covg) 

      # Copied and pasted from below
      mvrIndel$VAF <- altDepth(mvrIndel)/totalDepth(mvrIndel)
      metadata(mvrIndel)$refseq <- refSeqDNA
      
      metadata(mvrIndel)$bam <- basename(bam)
      metadata(mvrIndel)$sbp <- indelSBP
      metadata(mvrIndel)$pup <- pup
      mvrIndel$bam <- basename(bam)
      genome(mvrIndel) <- ref
      
    }
    # data(fpFilter_Triska, package="MTseeker") # for when they are... 

  }

  message("Tallying SNVs for: ", sampNames)

  # this may belong in a separate helper function...
  pu <- subset(pu, nucleotide %in% c('A','C','G','T'))
  pu$which_label <- NULL # confusing here 
  pu$ref <-  factor(strsplit(as.character(refSeqDNA), '')[[1]],
                    levels=levels(pu$nucleotide))[pu$pos]
  
  pu$alt <- pu$nucleotide 
  pu$totalDepth <- .byPos(pu, "count")
  is.na(pu$alt) <- (pu$nucleotide == pu$ref)

  # Only want to keep variants that differ from reference
  keep <- which(!is.na(pu$alt))
  pu <- pu[keep,]
  
  # If there exists SNPs, turn the pu into MVRanges
  if (nrow(pu) != 0) {
  
    strandMvr <- .puToMvr(pu, refSeqDNA, ref, bam, covg)
    
    # Because strandedness (heavy and light strands) matters sometimes
    # Some of the variants appear twice, despite having the same consequences
    # All I wants to do is add together their alt depths and set all the strands to +
    # They are all represented as through they are on the + strand anyways
    heavyMvr <- strandMvr[which(strand(strandMvr) == "+")]
    lightMvr <- strandMvr[which(strand(strandMvr) == "*")]
    
    matchedLight <- match(names(lightMvr), names(heavyMvr))
    
    # These are light stranded variants that do not have duplicated heavy stranded variants
    # However they are still representing what the variant would look like on the heavy strand
    uniqueLight <- which(is.na(matchedLight))
    uniqueLightMvr <- lightMvr[uniqueLight]
    
    # These are overlapping light strands
    matchedLightIndex <- which(!is.na(matchedLight))
    matchedLightMvr <- lightMvr[matchedLightIndex]
    
    # Keep track of variants found on both strands for debugging purposes
    if (length(heavyMvr) > 0) heavyMvr$bothStrands <- FALSE
    if (length(uniqueLightMvr) > 0) uniqueLightMvr$bothStrands <- FALSE
    
    # Only have to add the altDepths since the refDepths will be the same 
    # for 'duplicated' variants
    if (length(heavyMvr) > 0) {
      altDepth(heavyMvr[matchedLight[matchedLightIndex]]) <- 
        altDepth(heavyMvr[matchedLight[matchedLightIndex]]) + 
        altDepth(matchedLightMvr)
      heavyMvr[matchedLight[matchedLightIndex]]$bothStrands <- TRUE
    }

    # Merge together the light and heavy strand variants
    mvr <- MVRanges(c(heavyMvr, uniqueLightMvr), coverage=covg)
    
    # Recalculate the totalDepth
    totalDepth(mvr) <- altDepth(mvr) + refDepth(mvr)
    mvr <- sort(mvr, ignore.strand=T)
    
    metadata(mvr)$bam <- basename(bam)
    metadata(mvr)$sbp <- sbp
    metadata(mvr)$pup <- pup
    names(mvr) <- MTHGVS(mvr)
  }

  # There are no indels and no SNPs
  # Create an empty MVRanges
  if (nrow(pu) == 0 && nrow(indels) == 0) {
    mvr <- MVRanges(GRanges(c(seqnames=NULL,ranges=NULL,strand=NULL)),
                    coverage = covg)
    return(mvr)
  }
  
  # If there are no SNPs but there are indels
  else if (nrow(pu) == 0 && nrow(indels) > 0 ) {
    mvr <- mvrIndel
  }
  
  # Merge indels and SNPs together if both exist
  else if (nrow(indels) > 0 && nrow(pu) != 0) {
    
    mvr <- MVRanges(c(mvr, mvrIndel), coverage=covg)
    mvr <- sort(mvr, ignore.strand=T)
    names(mvr) <- MTHGVS(mvr)
  }
  
  metadata(mvr)$bam <- basename(bam)
  metadata(mvr)$sbp <- sbp
  metadata(mvr)$pup <- pup
  metadata(mvr)$coverageRle <- coverage(mvr)
  
  # A double check to make sure these two values get tallied
  totalDepth(mvr) <- altDepth(mvr) + refDepth(mvr)
  mvr$VAF <- altDepth(mvr) / totalDepth(mvr)
  
  return(mvr)
}

# helper fn
.byPos <- function(x, y) { 
  rowsum(as.numeric(x[,y]), as.character(x$pos), na.rm=1)[as.character(x$pos),]
}


# helper fn
.puToGR <- function(pu) { 
  #makeGRangesFromDataFrame(pu, start.field="pos", end.field="pos", strand="*",
  #                         keep.extra.columns=TRUE)
  
  makeGRangesFromDataFrame(pu, start.field="pos", end.field="pos",
                           keep.extra.columns=TRUE)
}


# helper fn
.refSeqLengths <- function() { 
  
  refSeqLengths <- c() 
  data(rCRSeq, package="MTseeker") # human
  data(NC_005089seq, package="MTseeker") # mouse 
  # data(mtRefs, package="MTseeker") # both
  refSeqLengths["rCRS"] <- width(rCRSeq)
  refSeqLengths["GRCh38"] <- width(rCRSeq)
  refSeqLengths["hg38"] <- width(rCRSeq)
  refSeqLengths["GRCh37"] <- width(rCRSeq)
  refSeqLengths["NC_005089"] <- width(NC_005089seq)
  refSeqLengths["GRCm38"] <- width(NC_005089seq)
  refSeqLengths["mm10"] <- width(NC_005089seq) 
  refSeqLengths["C57BL/6J"] <- width(NC_005089seq)
  return(refSeqLengths)
  
}


# helper fn
.getRefSyn <- function(ref) {
  
  # simplify references down to the supported "mouse" or "human" mitogenomes 
  if (ref %in% c("C57BL/6J", "NC_005089", "GRCm38","mm10")) ref <- "NC_005089"
  if (ref %in% c("hg38", "GRCh37", "GRCh38")) ref <- "rCRS" 
  return(ref)
  
}


# helper fn
.getRefSeq <- function(ref) { 
  
  data(rCRSeq, package="MTseeker") # human
  data(NC_005089seq, package="MTseeker") # mouse
  refs <- DNAStringSet(c(rCRS=rCRSeq[['chrM']],
                         NC_005089=NC_005089seq[['chrM']]))
  return(refs[.getRefSyn(ref)])
  
}

# helper fn
# Will take a single read and return the reference and alternative sequence
.reverseCigar <- function(indelRead, ref, reference, index) {

  # These are reads that support the reference (no indels found in them)
  if ( (!grepl("I", cigar(indelRead))) && (!grepl("D", cigar(indelRead))) ) {
    return(indelRead[0])
  }
  
  else {
    
    # Keep track of how many base pairs are being added in the cigar string
    addPos <- 0 # Matches (M in cigar strings)
    softPos <- 0 # Soft clips (S in cigar strings)
    splicePos <- 0 # Spliced zones (N in cigar strings)
    
    splitCigar <- gsub("([[:digit:]]+[[:alpha:]])", "\\1 ", cigar(indelRead))
    splitCigar <- unlist(strsplit(splitCigar, " "))
    
    # Find which elements of the cigar string are insertion or deletion
    # Are there multiple insertions and deletions?
    indelIndex <- which((grepl("I|D", splitCigar))) 
    
    # Create a new indel read to edit
    newIndelRead <- indelRead

    # Iterate through in case there are multiple deletions and insertions
    for (i in 1:length(indelIndex)) {

      #potentialHaplo <- FALSE
      
      # Find the number of base pairs that are soft clips
      # Only want to look at the information before the currently considered indel
      softPos <- grep("S", splitCigar[1:indelIndex[i]])
      softPos <- sum(as.numeric(gsub("\\D", "", splitCigar[softPos])))
      if (length(softPos) == 0) softPos <- 0
      
      # Find the number of base pairs that are matches
      # Only want to look at the information before the currently considered indel
      addPos <- grep("M", splitCigar[1:indelIndex[i]])
      addPos <- sum(as.numeric(gsub("\\D", "", splitCigar[addPos])))
      if (length(addPos) == 0) addPos <- 0
      
      # There can be spliced reads 
      # Only have to add this onto the reference sequence
      splicePos <- grep("N", splitCigar[1:indelIndex[i]])
      splicePos <- sum(as.numeric(gsub("\\D", "", splitCigar[splicePos])))
      if (length(splicePos) == 0) splicePos <- 0
      
      # If there is a previous indel in the same read
      # Need to consider how the start position will change on the reference
      # If there was an insertion beforehand, be careful when calculation startPos
      # Because those bp are adding length that does not exist into the ref seq
      
      # Add when there is a deletion
      # Subtract when there is an insertion
      if (i != 1) {
        # Indices or previous insertions or deletions in the same read
        prevIns <- which(grepl("I", splitCigar[1: (indelIndex[i] - 1) ]))
        prevDel <- which(grepl("D", splitCigar[1: (indelIndex[i] - 1) ]))
        
        # Sums of how many bp are deleted or inserted
        prevInsSum <- sum(as.numeric(gsub("\\D", "", splitCigar[prevIns])))
        prevDelSum <- sum(as.numeric(gsub("\\D", "", splitCigar[prevDel]))) 
        
      } 

      ## Now do the actual insertions or deletions

      # Insertions
      if (grepl("I", splitCigar[indelIndex[i]])) {
        
        # Start of range for the reference
        # Always off by one 
        startPos <- start(indelRead) + addPos - 1 + splicePos
        
        # Get the start index for insertion for the raw read
        altIndexStart <- addPos + softPos 
        
        # Multiple indels
        # Must take into consideration how that will effect the start positions
        # Add to the reference with previous deletion (insertions do not exist in the reference)
        # Add to the alternate with previous insertion (deletions do not add length to the alternate)
        if (i != 1) {
          startPos <- startPos + prevDelSum
          altIndexStart <- altIndexStart + prevInsSum
        }
        
        # According to VRanges documentation, insertions have width=1
        # So start and end are the same for the reference
        endPos <- startPos
        
        # Get the reference
        refs <- extractAt(reference, IRanges(startPos, endPos))
        refs <- unname(as.character(unlist(refs)))
        
        # Length of insertion
        insLength <- as.numeric(gsub("\\D", "", splitCigar[indelIndex[i]]))
        
        # Get the alt sequence from the raw read
        altSeq <- as.character(mcols(indelRead)$seq)
        altSplit <- unlist(strsplit(altSeq, split=""))
        altIndex <- seq(altIndexStart, altIndexStart + insLength, 1)
        altInd <- altSplit[altIndex]
        alt <- paste(altInd, collapse="")
        
        # If the first element of cigar string is to do the insertion
        # We have no way of knowning the element before the insertion
        # So we have to use the reference
        if (indelIndex[i] == 1) {
          alt <- paste0(refs, alt, collapse="")
          altInd <- c(refs, altInd)
        }
        
        # Raw read has an "N" where the insertion occurs
        # Toss it out
        if (grepl("N", alt)) {
          altInd <- ""
          alt <- ""
          refs <- ""
          startPos <- NA_integer_
          endPos <- NA_integer_
        }
        
        # A simple test to make sure insertion was done correctly
        # That became less simple
        else if (altInd[1] != refs) {
          
          # If the variants falls within a haplogroup region
          # Just leave it be
          data(haplomask_whitelist)
          #haploSnps <- subsetByOverlaps(ranges(haplomask_whitelist[[1]]), IRanges(startPos, endPos))
          #haploDels <- subsetByOverlaps(ranges(haplomask_whitelist[[2]]), IRanges(startPos, endPos))
          haploIns <- subsetByOverlaps(ranges(haplomask_whitelist[[3]]), IRanges(startPos, endPos))
          
          if (length(haploIns) > 0 && ref == "rCRS") {
            #message(paste("Potential haplogroup insertion variant at position:", as.character(startPos), "for sample:", as.character(mcols(indelRead)$bam)))
            #potentialHaplo <- TRUE
            
            # Leaving this if statement because this exception is fine to keep
          }
          
          else {
            message(paste("Incorrect insertion for indel read index:", as.character(index), "for", as.character(mcols(indelRead)$bam)))
            comp <- .sanCheck(reference, startPos, indelRead, softPos, refs, alt, splitCigar)
          }
        }
        
        # Sanity check!
        #comp <- .sanCheck(reference, startPos, indelRead, softPos, refs, alt, splitCigar)

      } # I
      
      # Deletions
      else if (grepl("D", splitCigar[indelIndex[i]])) {

        # Create range for the deletion
        # This must be zero indexed or something, because it is always off by one
        startPos <- start(indelRead) + addPos - 1 + splicePos
        
        # Get the index for where the deletion takes place
        altIndexStart <- addPos + softPos 
        
        # Multiple indels
        # Must take into consideration how the start and end values will change
        if (i != 1) {
          startPos <- startPos + prevDelSum
          altIndexStart <- altIndexStart + prevInsSum
        }
        
        # Length of deletion and end position
        delLength <- as.numeric(gsub("\\D", "", splitCigar[indelIndex[i]]))
        
        # In rCRS, there is an N located at bp 3107
        if (ref == "rCRS" && startPos == 3107 && delLength == 1) {
          startPos <- startPos - 1
          altIndexStart <- altIndexStart - 1
        }
        
        # Odd 1 off error that should be deleting the N at 3107
        # Showed up in TCGA data
        if (ref == "rCRS" && startPos == 3105 && delLength == 1) {
          startPos <- startPos + 1
          altIndexStart <- altIndexStart + 1
        }
        
        endPos <- startPos + delLength
        
        # Get the reference
        refs <- extractAt(reference, IRanges(startPos, endPos))
        refs <- unname(as.character(unlist(refs)))
        refSplit <- unlist(strsplit(refs, split=""))
        
        # Get the alt sequence from the raw read
        altSeq <- as.character(mcols(indelRead)$seq)
        altSplit <- unlist(strsplit(altSeq, split=""))
        alt <- altSplit[altIndexStart]

        # If the first element of cigar string is to do the deletion
        # We have no way of knowning the element before the deletion
        # So we have to use the reference
        if (indelIndex[i] == 1) {
          alt <- refSplit[1]
        }
        
        # If the raw read has an "N" where the deletion occurs
        # Toss it out
        if (grepl("N", alt)) {
          alt <- ""
          refs <- ""
          startPos <- NA_integer_
          endPos <- NA_integer_
        }
        
        # A simple check to ensure the deletion is supposedly happening correctly
        # That became less simple
        else if ( alt != refSplit[1] ) {

          # Keeps track if the variant falls in a common haplogroup region (ancestral variation)
          data(haplomask_whitelist)
          #haploSnps <- subsetByOverlaps(ranges(haplomask_whitelist[[1]]), IRanges(startPos, endPos))
          haploDels <- subsetByOverlaps(ranges(haplomask_whitelist[[2]]), IRanges(startPos, endPos))
          #haploIns <- subsetByOverlaps(ranges(haplomask_whitelist[[3]]), IRanges(startPos, endPos))
          
          if (length(haploDels) > 0 && ref == "rCRS") {
            #message(paste("Potential haplogroup deletion variant at position:", as.character(startPos), "for sample:",as.character(mcols(indelRead)$bam)))
            #potentialHaplo <- TRUE
            
            # Leave the if statement here because we do not want to remove haplogroup variants
          }
          
          else if (startPos == 3106) {
            # Continue
            # A lot of the bp before the N at 3107 are different
          }
          
          else {
            message(paste("Incorrect deletion for indel read index:", as.character(index), "for", as.character(mcols(indelRead)$bam)))
            comp <- .sanCheck(reference, startPos, indelRead, softPos, refs, alt, splitCigar)
          }
        }
        
        # Sanity check!
        #comp <- .sanCheck(reference, startPos, indelRead, softPos, refs, alt, splitCigar)
      } # D
      
      # Store the information
      # Only have one indel
      if (i == 1) {
        mcols(newIndelRead)$indelStart <- startPos
        mcols(newIndelRead)$indelEnd <- endPos
        mcols(newIndelRead)$alt <- alt
        mcols(newIndelRead)$ref <- refs
        #mcols(newIndelRead)$potentialHaplo <- potentialHaplo
      }
      
      # Append another indel to the first one
      # Occurs when you have multiple indels in a single read
      else {
        
        appendIndelRead <- indelRead
        mcols(appendIndelRead)$indelStart <- startPos
        mcols(appendIndelRead)$indelEnd <- endPos
        mcols(appendIndelRead)$alt <- alt
        mcols(appendIndelRead)$ref <- refs
        #mcols(appendIndelRead)$potentialHaplo <- potentialHaplo
        
        newIndelRead <- append(newIndelRead, appendIndelRead)
      }
      
    } # For
    
  } # Else 

  # Get rid of any indel reads which contained bad information
  # They should have NA for their start position
  keep <- which(!is.na(mcols(newIndelRead)$indelStart))
  newIndelRead <- newIndelRead[keep]
  
  return(newIndelRead)
}

# helper fn
# Turn pileup snvs into mvranges
.puToMvr <- function(pu, refSeqDNA, ref, bam, covg) {

  pu$altDepth <- ifelse(is.na(pu$alt), NA_integer_, pu$count) 
  pu$refDepth <- pu$totalDepth - .byPos(pu, "altDepth")
  pu$isAlt <- !is.na(pu$alt)
  pu$alleles <- .byPos(pu, "isAlt") + 1 
  #pu$strand <- as.character(pu$strand)
  
  columns <- c("seqnames","pos","ref","alt","totalDepth","refDepth","altDepth", "strand")
  gr <- keepSeqlevels(.puToGR(subset(pu, isAlt|alleles==1)[,columns]),
                      unique(pu$seqnames))

  # seqnames for mouse genome is "MT" 
  seqinfo(gr) <- Seqinfo(levels(seqnames(gr)), width(refSeqDNA), 
                         isCircular=TRUE, genome=ref)
  
  vr <- makeVRangesFromGRanges(gr)
  
  # Keeping all light strands as * bc MVRanges blows up otherwise
  # VRanges are not allowed to have light strand variants apparently
  strand(vr) <- as.factor(strand(gr))
  strand(vr[which(strand(vr) == "-")]) <- "*"
  
  vr <- keepSeqlevels(vr, levels(seqnames(gr))) 
  seqinfo(vr) <- seqinfo(gr)

  mvr <- MVRanges(vr, coverage=covg)
  #strand(mvr) <- rle(as.factor(strand(gr)))
  
  sampleNames(mvr) <- base::sub(paste0(".", ref), "", 
                                base::sub(".bam", "", 
                                          basename(bam)))
  names(mvr)[which(mvr$ref != mvr$alt)] <- MTHGVS(subset(mvr, ref != alt)) 
  altDepth(mvr)[is.na(altDepth(mvr))] <- 0
  
  mvr$VAF <- altDepth(mvr)/totalDepth(mvr)
  metadata(mvr)$refseq <- refSeqDNA
  
  mvr$bam <- basename(bam)
  genome(mvr) <- ref
  
  names(mvr) <- MTHGVS(mvr)
  
  return(mvr)
}

# helper fn
# Will make turn the indel information into a MVRanges
.indelToMVR <- function(indelReads, refSupport, ref, bam, covg) {

  # This is mostly copied and pasted from the mvr created at the end of pileup
  indelSupport <- as.data.frame(indelReads)
  
  indelSupport$which_label <- NULL
  
  # Set these columns to be able to turn into a GRanges
  indelSupport$refDepth <- NA_integer_
  indelSupport$altDepth <- 1
  indelSupport$totalDepth <- 1
  indelSupport$pos <- indelSupport$indelStart
  
  ### Not sure what this line is supposed to do
  #pu$alleles <- .byPos(pu, "isAlt") + 1 

  # Turn into GRanges
  columns <- c("seqnames","pos","ref","alt","totalDepth","refDepth","altDepth", "strand")
  grIndel <- keepSeqlevels(.puToGR(indelSupport[,columns]), unique(indelSupport$seqnames))
  seqinfo(grIndel) <- Seqinfo(levels(seqnames(grIndel)), width(.getRefSeq(ref)), isCircular=TRUE, genome=ref)
  
  # Turn into VRanges
  vrIndel <- makeVRangesFromGRanges(grIndel)
  vrIndel <- keepSeqlevels(vrIndel, levels(seqnames(grIndel))) 
  seqinfo(vrIndel) <- seqinfo(grIndel)
  
  # Need to mess around with strands again
  strand(vrIndel) <- strand(grIndel)
  strand(vrIndel[strand(vrIndel) == "-"]) <- "*"
  
  # Turn into MVRanges
  mvrIndel <- MVRanges(vrIndel, coverage = covg)
  sampleNames(mvrIndel) <- base::sub(paste0(".", ref), "", base::sub(".bam", "",  basename(bam)))
  
  # Fix ranges
  indelRanges <- IRanges(as.numeric(indelSupport$indelStart), as.numeric(indelSupport$indelEnd))
  ranges(mvrIndel) <- indelRanges
  
  # Assign names
  names(mvrIndel) <- MTHGVS(mvrIndel) 

  ####################################
  strandMvr <- mvrIndel
  
  # Indels with duplicated variants
  # Need to consolidate them together
  dupHeavy <- strandMvr[which(strand(strandMvr) == "+")]
  dupLight <- strandMvr[which(strand(strandMvr) == "*")]
  
  # Now have the correct depth for each variant
  # Each variant now appears once for each strand
  uniqueHeavy <- unique(dupHeavy)
  count <- table(names(dupHeavy))
  altDepth(uniqueHeavy[names(count)]) <- unname(count)
  
  uniqueLight <- unique(dupLight)
  count <- table(names(dupLight))
  altDepth(uniqueLight[names(count)]) <- unname(count)
  
  # Find any variants that have reads on both heavy and light strands
  matchedLight <- match(names(uniqueLight), names(uniqueHeavy))
  
  # For debugging purposes, keep track of which variants came from both strands
  
  # Variants that were only found on the light strand
  lightMvr <- uniqueLight[which(is.na(matchedLight))]
  
  # These are overlapping light strands
  matchedLightIndex <- which(!is.na(matchedLight))
  matchedLightMvr <- uniqueLight[matchedLightIndex]
  
  uniqueHeavy$bothStrands <- FALSE
  lightMvr$bothStrands <- FALSE
  
  # Only have to add the altDepths since the refDepths will be the same for 'duplicated' variants
  altDepth(uniqueHeavy[matchedLight[matchedLightIndex]]) <- altDepth(uniqueHeavy[matchedLight[matchedLightIndex]]) + altDepth(matchedLightMvr)
  uniqueHeavy[matchedLight[matchedLightIndex]]$bothStrands <- TRUE
  
  # Combine them together
  mvr <- MVRanges(c(uniqueHeavy, lightMvr), coverage=covg)
  mvr <- sort(mvr, ignore.strand=T)
  
  ###############################
  
  # Only keep unique variants (based upon names that were just assigned)
  #mvrIndelunique <- sort(unique(mvrIndel), ignore.strand=T)
  
  # Assign altDepths based upon what was unique
  # Sometimes the order gets mixed up, so keep track of it
  #order <- match(names(mvrIndelunique), names(table(names(mvrIndel))))
  #count <- unname(table(names(mvrIndel))[order])
  #altDepth(mvrIndelunique) <- count
  
  # Determine the refDepth
  refDepth(mvr) <- unname(countOverlaps(mvr, refSupport))
  
  # Recalculate totalDepth
  totalDepth(mvr) <- refDepth(mvr) + altDepth(mvr)
  
  #mvr <- MVRanges(mvrIndelunique, coverage = covg)
  
  return(mvr)
  
}

# helper fn
# Usually commented out unless you run into an error
# A sanity check to compare the raw read against the reference
# Prints other information like start position, cigar string, and the calculated alt and ref
.sanCheck <- function(reference, startPos, indelRead, softPos, refs, alt, splitCigar) {

  altSeq <- as.character(mcols(indelRead)$seq)
  
  # Get the corresponding ref sequence
  start <- start(indelRead) - softPos
  if (start < 0) start <- 1
  refRange <- IRanges(start, end(indelRead))
  refSeq <- as.character(unlist(extractAt(reference, refRange)))
  
  # Compare the two strings
  # Tried doing this using positions based upon cigar string but sometimes got off by one errors
  comp <- pairwiseAlignment(refSeq, altSeq)
  show(comp)
  show(splitCigar)
  print(paste("ref: ", refs))
  print(paste("alt: ", alt))
  print(paste("start: ", as.character(startPos)))

  return(comp)
}
trichelab/MTseeker documentation built on March 8, 2021, 6:20 p.m.