R/pileupXenograft.R

Defines functions .getCov .getCoverageRle .addPuCoverage .getPuSeqLen .strpop .puToMvr .getMtPile .nonRef .addReadCounts .getReadCounts .addRefSeq .grVR .puGR pileupXenograft

Documented in pileupXenograft

#' test harness for simultaneous mouse/human MT variant calling (for tracking)
#' 
#' Note that the default pileup params here do NOT distinguish strands in PDXs.
#'
#' FIXME: add an iterator to create (human, mouse) MVRangesLists when running 
#'        pileupXenograft on thousands of individual cells (as in PDX studies).
#' 
#' @param bam         the BAM or object with a @bam slot 
#' @param sbp         optional ScanBamParam object (autogenerated if missing) 
#' @param pup         optional PileupParam object (autogenerated if missing)
#' @param callIndels  call indels? (This can reduce compute and RAM if FALSE.)
#' @param minVAF      minimum VAF to accept (default is 0.03, usual NuMT cutoff)
#' @param minDepth    minimum read depth to accept (default is 20 for sanity)
#' @param verbose     be verbose, as for profiling? (default is FALSE / quiet)
#' @param ...         additional args to pass on to the pileup() function
#' 
#' @return            a list of MVRanges (one human, one mouse)
#'
#' @examples
#'
#'   BAMdir <- system.file("extdata", "BAMs", package="MTseekerData")
#'   addPath <- function(x) file.path(BAMdir, x)
#'   pdxBAMs <- addPath(list.files(path=BAMdir, pattern="^PDX.*.bam$"))
#'   names(pdxBAMs) <- sapply(strsplit(basename(pdxBAMs), "_"), `[`, 1)
#'   mvrls <- lapply(pdxBAMs, pileupXenograft, verbose=TRUE)
#'   vapply(mvrls, names, character(2))
#' 
#' @export 
pileupXenograft <- function(bam, sbp=NULL, pup=NULL, callIndels=FALSE, minVAF=0.03, minDepth=20, verbose=FALSE, ...) { 

  t_0 <- Sys.time() 

  # we will split this:
  if (verbose) message("Assembling joint mitochondrial pileup...")
  pu <- .getMtPile(bam, sbp, pup, strand=FALSE, callIndels=callIndels)
  t_pu <- Sys.time() - t_0 
  if (verbose) message("Pileup took ", round(t_pu, 3), " ", attr(t_pu, "units"))

  # detect multiple references
  # FIXME: refactor this into a function
  if (verbose) message("Fixing references...")
  refChrs <- levels(factor(pu$seqnames))
  equivs <- c(mm10="GRCm38", hg38="GRCh38")
  names(refChrs) <- sapply(strsplit(refChrs, "_"), `[`, 1) 
  for (i in names(equivs)) names(refChrs) <- sub(i, equivs[i], names(refChrs))
  refGenome <- names(refChrs)
  names(refGenome) <- refChrs

  # split by reference
  # FIXME: generalize to arbitrary meta-MT-genomes
  if (verbose) message("Splitting pileup by species...")
  pu$seqnames <- factor(pu$seqnames) # avoid nonsense
  pus <- lapply(split(pu, refGenome[pu$seqnames]), .addRefSeq)
  pus <- lapply(pus, .addPuCoverage)
  for (reference in names(pus)) pus[[reference]][, "reference"] <- reference
  names(pus) <- sub("GRCh38", "human", sub("GRCm38", "mouse", names(pus)))
  # lapply(pus, attr, "reference") # to verify it worked... 

  # call variants 
  if (callIndels == TRUE) {
    message("Indel calling is not currently supported in xenografts; skipping")
  }
  if (verbose) message("Dropping sites with VAF < ", minDepth, "...")
  variants <- lapply(pus, .nonRef, minVAF=minVAF, minDepth=minDepth) 

  # turn into a VRangesList, add read counts, coerce to MVRanges
  # FIXME: refactor the hell out of this 
  if (verbose) message("Converting to a VRangesList...")
  variantVRL <- VRangesList(lapply(lapply(variants, .puGR), .grVR))
  variantVRL <- VRangesList(lapply(variantVRL, .addReadCounts, bam=bam))
  variantVRL <- VRangesList(lapply(variantVRL, 
                                   function(v) 
                                     new("MVRanges", v, coverage=.getCov(v))))

  # tidy up the results 
  # FIXME: refactor to function
  if (verbose) message("Tidying up...")
  mergedRef <- paste(sapply(variantVRL, 
                            function(x) unique(mcols(x)$reference)), 
                     collapse="_")
  for (i in names(variantVRL)) {
    ref <- unique(variantVRL[[i]]$reference)
    variantVRL[[i]] <- keepSeqlevels(variantVRL[[i]], 
                                     seqlevelsInUse(variantVRL[[i]]))
    seqlevels(variantVRL[[i]]) <- paste0(ref, "_MT")
    seqlengths(variantVRL[[i]]) <- width(attr(pus[[i]], "reference"))
    genome(variantVRL[[i]]) <- mergedRef
    isCircular(variantVRL[[i]]) <- TRUE 
  }

  # convert to an MVRangesList
  if (verbose) message("Converting to an MVRangesList...")
  res <- as(variantVRL, "MVRangesList")

  t_1 <- Sys.time() - t_0 
  if (verbose) message("Finished in ", round(t_1, 3), " ", attr(t_1, "units"))
  return(res) 

} 


# helper
.puGR <- function(pu) { 
  if (nrow(pu) == 0) {
    res <- GRanges()
  } else { 
    res <- makeGRangesFromDataFrame(pu, start.field="pos", end.field="pos",
                                    keep.extra.columns=TRUE)
  }
  metadata(res)$coverageRle <- attr(pu, "coverageRle")
  return(res) 
}


# helper 
.grVR <- function(gr) { 
  if (length(gr) == 0) {
    res <- VRanges()
  } else { 
    res <- makeVRangesFromGRanges(gr) 
  } 
  # is this necessary?
  metadata(res)$coverageRle <- metadata(gr)$coverageRle
  return(res) 
}


# helper
.addRefSeq <- function(pu) {
  chr <- unique(sapply(strsplit(as.character(pu$seqnames), "_"), `[`, 1))
  stopifnot(length(chr) == 1)
  attr(pu, "reference") <- MTseeker:::.getRefSeq(chr) 
  return(pu)
}


# helper
.getReadCounts <- function(x, bam, equivs=NULL) {
  if (is.null(equivs)) equivs <- c(mm10="GRCm38", hg38="GRCh38")
  mapped <- subset(idxstatsBam(bam), mapped > 0)[, c("seqnames", "mapped")]
  rownames(mapped) <- as.character(mapped[,"seqnames"])
  for (e in names(equivs)) {
    rownames(mapped) <- sub(e, equivs[e], sub("_+", "_", rownames(mapped)))
  }
  mapped[unique(as.character(seqnames(x))), "mapped"]
}


# helper
.addReadCounts <- function(mvr, bam, readLength=98) {
  metadata(mvr)$bam <- bam
  metadata(mvr)$readCount <- .getReadCounts(mvr, bam=bam)
  return(mvr) 
}


# helper
.nonRef <- function(pu, minVAF=0.05, minDepth=20) {

  coverageRle <- attr(pu, "coverageRle") 
  pu$which_label <- NULL # confusing here, although used for coverageRle
  pu$ref <- factor(strsplit(as.character(attr(pu, "reference")[[1]]), "")[[1]],
                   levels=levels(pu$nucleotide))[pu$pos]
  pu <- subset(pu, nucleotide %in% c('A','C','G','T'))
  pu$alt <- pu$nucleotide
  pu$totalDepth <- MTseeker:::.byPos(pu, "count")
  is.na(pu$alt) <- (pu$nucleotide == pu$ref)
  pu$altDepth <- ifelse(is.na(pu$alt), NA_integer_, pu$count) 
  pu$refDepth <- pu$totalDepth - MTseeker:::.byPos(pu, "altDepth")
  pu$isAlt <- !is.na(pu$alt)
  pu$alleles <- MTseeker:::.byPos(pu, "isAlt") + 1 
  res <- subset(pu, alleles > 1 & totalDepth >= minDepth)
  res$VAF <- res$count / res$totalDepth
  res$variant <- paste0(res$reference, ":m.", res$pos, res$ref, ">", res$alt)
  res <- subset(res, VAF >= minVAF & isAlt)
  attr(res, "coverageRle") <- coverageRle
  return(res) 

}


# helper 
.getMtPile <- function(bam, sbp=NULL, pup=NULL, strand=FALSE, callIndels=FALSE){

  if (is.null(sbp)) sbp <- MTseeker:::scanMT(bam, mapqFilter=20) 
  if (strand) bamWhat(sbp) <- "strand"
  if (is.null(pup)) pup <- PileupParam(min_mapq=20,
                                       min_base_quality=13,
                                       ignore_query_Ns=TRUE, 
                                       min_nucleotide_depth=1,
                                       distinguish_strands=strand, 
                                       distinguish_nucleotides=TRUE, 
                                       include_deletions=callIndels, 
                                       include_insertions=callIndels) 
  pileup(file=bam, scanBamParam=sbp, pileupParam=pup)

}


# helper 
.puToMvr <- function(pu, refSeqDNA, ref, bam, covg) {

  pu$altDepth <- ifelse(is.na(pu$alt), NA_integer_, pu$count) 
  pu$refDepth <- pu$totalDepth - MTseeker:::.byPos(pu, "altDepth")
  pu$isAlt <- !is.na(pu$alt)
  pu$alleles <- .byPos(pu, "isAlt") + 1 
  #pu$strand <- as.character(pu$strand)
  
  columns <- 
    c("seqnames","pos","ref","alt","totalDepth","refDepth","altDepth", "strand")
  gr <- keepSeqlevels(.puGR(subset(pu, isAlt|alleles==1)[,columns]),
                      unique(pu$seqnames))

  # seqnames for mouse genome is "MT" 
  seqinfo(gr) <- Seqinfo(levels(seqnames(gr)), width(refSeqDNA), 
                         isCircular=TRUE, genome=ref)
  
  vr <- makeVRangesFromGRanges(gr)
  
  vr <- keepSeqlevels(vr, levels(seqnames(gr))) 
  seqinfo(vr) <- seqinfo(gr)

  mvr <- MVRanges(vr, coverage=covg)
  #strand(mvr) <- rle(as.factor(strand(gr)))
  
  sampleNames(mvr) <- base::sub(paste0(".", ref), "", 
                                base::sub(".bam", "", 
                                          basename(bam)))
  names(mvr)[which(mvr$ref != mvr$alt)] <- MTHGVS(subset(mvr, ref != alt)) 
  altDepth(mvr)[is.na(altDepth(mvr))] <- 0
  
  mvr$VAF <- altDepth(mvr)/totalDepth(mvr)
  metadata(mvr)$refseq <- refSeqDNA
  
  mvr$bam <- basename(bam)
  genome(mvr) <- ref
  
  names(mvr) <- MTHGVS(mvr)
  
  return(mvr)
}


# helper fn
.strpop <- function(x, sep=" ", idx=NULL) {
  vec <- strsplit(x, sep)[[1]]
  if (is.null(idx)) idx <- length(vec)
  paste(vec[idx], collapse=sep)
}


# helper fn
.getPuSeqLen <- function(pu) { 
  as.integer(.strpop(levels(factor(pu$which_label)), "(:|\\-)"))
}


# helper fn
.addPuCoverage <- function(pu) { 
  covg <- rep(0, .getPuSeqLen(pu))
  covg[pu$pos] <- pu$count
  attr(pu, 'coverageRle') <- Rle(covg)
  return(pu) 
}


# helper fn
.getCoverageRle <- function(x) { 
  metadata(x)$coverageRle
}


# helper fn
.getCov <- function(x) { 
  mean(.getCoverageRle(x))
} 
trichelab/MTseeker documentation built on March 8, 2021, 6:20 p.m.