#' Produces "pileup" Output Files from Read Alignments (BAM Files)
#'
#' @inheritParams pscnseq
#'
#' @return A [aroma.seq::MPileupFileSet].
#'
#' @seealso
#' This function uses `aroma.seq:mpileup()`.
#'
#' @importFrom R.utils Arguments mprintf mstr hpaste isGzipped less
#' @importFrom R.filesets getNames getTags readDataFrame fullname getFullNames setFullNamesTranslator getPathnames
#' @importFrom aroma.core isCompatibleWith
#' @importFrom aroma.seq directoryStructure<- findSamtools BamDataSet hasIndex MPileupFileSet mpileup MPileupFileSet getSeqNames
#' @importFrom utils str
#'
#' @export
pscnseq_mpileup <- function(dataset, organism, chrs, samples, fasta, gcbase, bam_pattern = NULL, verbose = FALSE) {
assert <- NULL ## To please R CMD check
verbose <- Arguments$getVerbose(verbose)
message("* Assertions")
assert %<-% {
ver <- attr(findSamtools(), "version")
print(ver)
stopifnot(ver < "1.4")
} %label% "samtools-version"
print(assert)
## - - - - - - - - - - - - - - - - - - - - - - - - - - -
## Sample data
## - - - - - - - - - - - - - - - - - - - - - - - - - - -
samples <- read_samples(samples)
str(samples)
## - - - - - - - - - - - - - - - - - - - - - - - - - - -
## Annotation data
## - - - - - - - - - - - - - - - - - - - - - - - - - - -
message("* Loading annotation data files ...")
fa <- FastaReferenceFile(fasta)
print(fa)
stopifnot(!isGzipped(fa))
gc <- GcBaseFile(gcbase)
print(gc)
## IMPORTANT: Sequenza requires that chromosome names in GC file
## and the FASTA file (hg19.fa) need to be identical and in the
## same order.
## PS. It is ok that BAM files are in a different order.
stopifnot(isCompatibleWith(gc, fa))
stopifnot(all(getSeqNames(gc) == getSeqNames(fa)))
message("* Loading annotation data files ... DONE")
## - - - - - - - - - - - - - - - - - - - - - - - - - - -
## Sequence read data
## - - - - - - - - - - - - - - - - - - - - - - - - - - -
message("* Loading all BAM files ...")
if (is.null(bam_pattern)) bam_pattern <- ".bwa.realigned.rmDups(|.recal)(|.bam)$"
path <- file.path("bamData", dataset, organism)
bams <- BamDataSet$byPath(path, recursive=TRUE, pattern=bam_pattern)
stopifnot(length(bams) > 0)
bams <- bams[grep("old", getPathnames(bams), invert=TRUE)]
stopifnot(length(bams) > 0)
bams <- setFullNamesTranslator(bams, function(name, ...) {
name <- gsub(".bwa.realigned.rmDups.recal.bam", "", name, fixed=TRUE)
name <- gsub(".bwa.realigned.rmDups.bam", "", name, fixed=TRUE)
name <- gsub(".bam", "", name, fixed=TRUE)
name <- gsub("_merged", "", name, fixed=TRUE)
name <- gsub("_[ACGT]{8}_L[0-9]{3}", "", name)
name
})
print(bams)
directoryStructure(bams) <- list(
pattern="([^/]*)/([^/]*)/([^/]*)/([^/]*)/([^/]*)",
replacement=c(rootpath="\\1", dataset="\\2", organism="\\3", sample="\\4,\\5")
)
## Keep patients of interest
names <- getNames(bams)
str(names)
tags <- sapply(bams, FUN=function(bam) getTags(bam)[1])
str(tags)
keep <- which(paste(names, tags, sep=",") %in% paste(samples$Patient_ID, samples$A0, sep=","))
str(keep)
stopifnot(length(keep) > 0)
bams <- bams[keep]
print(bams)
stopifnot(length(bams) > 0)
## Assert that all BAMs have been indexed
stopifnot(all(vapply(bams, FUN = hasIndex, FUN.VALUE = FALSE)))
message("* Loading all BAM files ... DONE")
## - - - - - - - - - - - - - - - - - - - - - - - - - - -
## Process
## - - - - - - - - - - - - - - - - - - - - - - - - - - -
if (interactive()) readline("Press ENTER to start processing of data: ")
## - - - - - - - - - - - - - - - - - - - - - - - - - - -
## Count nucleotides at every(!) genomic position
## using 'samtools mpileup'
## - - - - - - - - - - - - - - - - - - - - - - - - - - -
chrLabels <- sprintf("chr%s", chrs)
## Note that the generated *.mpileup files are very large.
res <- mpileup(bams, fa=fa, chromosomes=chrLabels, verbose=less(verbose, 20))
print(res)
mps <- MPileupFileSet(res)
mps
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.