# functions via Rcpp
# f <- Rcpp::cppFunction('
# int fibonacci(const int x) {
# if (x < 2)
# return x;
# else
# return (fibonacci(x - 1)) + fibonacci(x - 2);
# }
# ')
#
# Expressions via Rcpp:
# Rcpp::evalCpp('std::numeric_limits<double>::max()')
#' Create the patient ID string in canonical form
#' @param patIdNbr the patient ID as numeric
#' @export
getPatStr <- function(patIdNbr){
paste0("pat",formatC(patIdNbr, width=5, flag="0"))
}
#' Get the patient ID as number from patient ID string
#' @param patIdStr the patient ID as string
#' @export
getPatNbr <- function(patIdStr){
as.numeric(substr(patIdStr, 4, 8))
}
#' Checks given files if they are valid.
#'
#' Do the files exist and are they non-empty?
#' @param f Filename(s)
#' @export
checkFile <- Vectorize(function(f){
if (is.null(f)) return(FALSE)
fsize <- file.info(f)$size
! is.na(fsize) && fsize > 512L
})
#' Count number of lines in a file
#'
#' It uses the command line tool wc.
#' @param txtFile the filename of the text file
getNumberOfLinesUsingWC <- function(txtFile){
if (! checkFile(txtFile)){
logwarn("File %s does not exist!", txtFile)
return(NULL)
}
wcOutput <- system(paste0("wc -l ", txtFile), intern=TRUE)
wcOutput.regex <- regexec("[[:digit:]]+", wcOutput)[[1]]
strtoi(substr(wcOutput, start=wcOutput.regex, stop=wcOutput.regex+attr(wcOutput.regex, "match.length")-1), base=10L)
}
# Find SV-intervals within candidate regions.
#
# look into the mapping of sufficiently covered regions
# and find intervals around SV-like points within candidate region. The candidate regions are, for instance, based on clipping proportion.
# mapping: reads that map on a covered region
# works only for single chromosome
# @return returns matrix with interval number, start and stop position (or NULL)
# @note Only used in findIntervals-function below.
###
findIntervalsInCandidateRegions <- function(mapping, chrom=CHROM){
stopifnot( is.data.frame(mapping) && NROW(mapping) >= 3L )
loginfo("Candidate region with %d reads.", NROW(mapping))
mapping$clipped <- factor(ifelse(grepl("S", mapping$cigar), yes=2, no=1), levels=1:2, labels=c("noS", "S"))
#moving averages from package caTools.
#+zoo::rollapply alternative to caTools
#plot(rollapply(as.numeric(mapping$clipped), 30, FUN=mean, na.rm=TRUE, fill=NA) ~ pos, data=mapping, main="clipped")
#plot(rollapply(mapping$mapq, 30, FUN=mad, na.rm=T, fill=NA) ~ pos, data=mapping, type="l", main="mapq")
# Take product of windowed mean clipped proportion and std. deviation of mappingQ
#SVscore <- rollapply((rollapply(as.numeric(mapping$clipped), 30, FUN=mean, na.rm=TRUE, fill=NA)+1) * (rollapply(mapping$mapq, 30, FUN=mad, na.rm=T, fill=NA)+1), width=80, FUN=mean, fill=NA)
#ZZZ window-width k=: should it depend on candidate region length
#SVscore <- runmean((runmean(as.numeric(mapping$clipped), k=51, endrule="NA")+10) * (log(runsd(mapping$mapq, k=51, endrule="NA")+1)), k=21, endrule="mean"); SVscoreType <- "clipped * mapQ" # clipped * mapQ
#SVscore <- runsd(mapping$mapq, k=51, endrule="NA"); SVscoreType <- "mapQ-sd" ## only mapQ-sd
#ZZZ neighborhood is defined about the ordering of reads in the mapping. It does not consider how many base-pairs the reads really lie apart
## For each read give the proportion of clipped reads in its neighborhood
#require(caTools)
SVscore <- runmean(as.numeric(mapping$clipped)-1, k=51, endrule="NA"); SVscoreType <- "clipped-prop"
SVscore.cutoff <- quantile(SVscore, probs=0.85, na.rm=TRUE) # what is a "high proportion of clipped reads in a neighbourhood": only 15% of reads have that enriched neighborhood
# look for reads where 80% of neighboring reads have "enriched neighborhood"
SVscore.high <- caTools::runquantile(as.numeric(SVscore > SVscore.cutoff), k=51, probs=0.2, endrule="NA")
#SVscore.high <- rollapply(as.numeric(SVscore > SVscore.cutoff), width=11, FUN=median, na.rm=TRUE, fill=NA) ##zoo
# indicating intervals of SV-candidates within covered region
SVscore.high.ind <- which(SVscore.high == 1)
nbr.HighSVReads <- length(SVscore.high.ind)
loginfo("Number of reads that high SV-score: %d", nbr.HighSVReads)
if ( nbr.HighSVReads == 0){
loginfo("Skipping candidate region because it does not have areas with high SV reads density.")
return(invisible(NULL))
}
pos.high <- mapping$pos[SVscore.high.ind]
mapping$svreg <- -1
is.na(mapping$svreg) <- is.na(mapping$pos)
mapping$svreg[SVscore.high.ind] <- cut(pos.high, breaks=unique(pos.high[c(1, which(diff(SVscore.high.ind) > 1L), length(pos.high))]), include.lowest=TRUE, labels=FALSE)
loginfo("Found ::%s:: svreg levels", unique(mapping$svreg))
plot( SVscore ~ pos, data=mapping, subset=!is.na(SVscore), pch=".", main=SVscoreType)
abline(h=SVscore.cutoff, lty=2, col="grey")
lines(max(SVscore, na.rm=T)*SVscore.high ~ pos, data=mapping, col="red", type="l", lty=3)
myMapping <<- mapping #ZZZ debug
# give coordinates of intervals
mapping_posRange <- range(mapping$pos, na.rm=TRUE)
mapping_help <- mapping[which(mapping$svreg > 0), c("rname", "pos", "svreg")]
retMat <- as.matrix(aggregate(mapping_help$pos, by=list(svreg=mapping_help$svreg), FUN=range, na.rm=TRUE))
colnames(retMat) <- c("svreg", "lower", "upper")
# how many BPs around the interval borders
EXTRA_BP <- 150L
retMat[,2] <- pmax(retMat[,2] - EXTRA_BP, mapping_posRange[1])
retMat[,3] <- pmin(retMat[,3] + EXTRA_BP, mapping_posRange[2])
nbrIntervalsInRegion0 <- NROW(retMat)
# merging intervals
retMat <- as.matrix(ranges(reduce(GenomicRanges::GRanges(seqnames=rep(chrom, NROW(retMat)),
#paste(retMat[,1], chrom, sep="_"),
ranges=IRanges(retMat[,2], retMat[,3])))))
retMat[,2] <- retMat[,1] + retMat[,2] - 1
retMat <- cbind(1:NROW(retMat), retMat)
loginfo("Found %d intervals (%d intervals before merging) which are SV-like.", NROW(retMat), nbrIntervalsInRegion0)
return(retMat)
}
#
# identifies intervals that are potential regions with SVs in CHROM
# first it looks for regions that are sufficiently covered
# Within each covered region it delineates intervals that are SV-candidate (via clipped reads)
#+ ZZZ: we are actually hunting for differential SVs (so better look into reads from CTR sample as well)
####
findIntervals <- function(tum.bam){ #, ctr.bam){
tum.bamFileInfo <- parseSimFile(tum.bam)
#ctr.bamFileInfo <- parseSimFile(ctr.bam)
# use bedtools to get coverage
stopifnot( exists("BEDTOOLS") )
tmpFile.tum <- tempfile(pattern="tum_", fileext=".cov")
#tmpFile.ctr <- tempfile(pattern="ctr_", fileext=".cov")
# -dz: only coverage >0, zero-based positions
system(paste0(BEDTOOLS, " genomecov -dz -ibam ", tum.bam," -g > ", tmpFile.tum),
intern=FALSE, wait=TRUE)
#system(paste0(BEDTOOLS, " genomecov -dz -ibam ", ctr.bam," -g > ", tmpFile.ctr),
# intern=FALSE, wait=TRUE)
# delete temporary files when function exits.
on.exit(expr=file.remove(tmpFile.tum)) #, tmpFile.ctr))
cov.tum <- read.delim(file=tmpFile.tum, header=FALSE)
#cov.ctr <- read.delim(file=tmpFile.ctr, header=FALSE)
names(cov.tum) <- c("chrom", "pos", "cov_tum")
#names(cov.ctr) <- c("chrom", "pos", "cov_ctr")
### ZZZ single chromosome!!
cov.tum <- cov.tum[cov.tum$chrom == CHROM,]
#cov.ctr <- cov.ctr[cov.ctr$chrom == CHROM,]
# look for gaps in coverage. Gaps should have minimum length [bp] (gap might also come from insertion-SV)
MIN_GAP_LENGTH <- 500L
# define intervals where there is read coverage and that are separated by a gap
#cov.ctr$interv_ctr <- cut(cov.ctr$pos, breaks=cov.ctr$pos[c(1, which(diff(cov.ctr$pos)>MIN_GAP_LENGTH), NROW(cov.ctr))], include.lowest=TRUE, labels=FALSE)
cov.tum$interv_tum <- cut(cov.tum$pos, breaks=cov.tum$pos[c(1, which(diff(cov.tum$pos)>MIN_GAP_LENGTH), NROW(cov.tum))], include.lowest=TRUE, labels=FALSE)
# filter on those intervals that have enough coverage in TUMOR sample
MIN_QUANT75_COVERAGE <- 10L # at least 30% of bases have coverage above 10
minCov.tum <- aggregate(x=cov.tum[, "cov_tum"], by=list(interv_tum=cov.tum$interv_tum), FUN=function(x){quantile(x, probs=0.7)})
names(minCov.tum)[NCOL(minCov.tum)] <- "cov_q75"
minCov.tum <- minCov.tum[minCov.tum$cov_q75 >= MIN_QUANT75_COVERAGE,]
loginfo("We have %d covered regions which fulfill minimum coverage requirements.", NROW(minCov.tum))
# cov.interv.ctr <- aggregate(x=cov.ctr[, "cov_ctr"],
# by=list(interv_ctr=cov.ctr$interv_ctr),
# FUN=function(x){quantile(x, probs=0.75)})
# names(cov.interv.ctr)[NCOL(cov.interv.ctr)] <- "cov_q75"
# cov.interv.ctr <- cov.interv.ctr[cov.interv.ctr$cov_q75 >= MIN_QUANT75_COVERAGE,]
#
# filter on those intervals where minimum coverage is reached.
cov.tum2 <- merge(cov.tum, minCov.tum)
cov.tum2 <- cov.tum2[order(cov.tum2$chrom, cov.tum2$pos),]
# cov.ctr2 <- merge(cov.ctr, cov.interv.ctr)
# cov.interv.ctr <- merge(cov.interv.ctr, aggregate(x=cov.ctr2[, "pos"], simplify=TRUE,
# by=list(interv_ctr=cov.ctr2$interv_ctr),
# FUN=min))
# names(cov.interv.ctr)[NCOL(cov.interv.ctr)] <- "interv_l"
# cov.interv.ctr <- merge(cov.interv.ctr, aggregate(x=cov.ctr2[, "pos"], simplify=TRUE,
# by=list(interv_ctr=cov.ctr2$interv_ctr),
# FUN=max))
# names(cov.interv.ctr)[NCOL(cov.interv.ctr)] <- "interv_h"
minCov.tum <- merge(minCov.tum, aggregate(x=cov.tum2[, "pos"], simplify=TRUE,
by=list(interv_tum=cov.tum2$interv_tum),
FUN=min))
names(minCov.tum)[NCOL(minCov.tum)] <- "interv_l"
minCov.tum <- merge(minCov.tum, aggregate(x=cov.tum2[, "pos"], simplify=TRUE,
by=list(interv_tum=cov.tum2$interv_tum),
FUN=max))
names(minCov.tum)[NCOL(minCov.tum)] <- "interv_h"
# Assess covered intervals:
# Look only into sufficiently covered intervals (at least in tumor sample)
# further narrow the list of candidate intervals
# and also narrow the interval borders so that the borders have a meaning/are related to SV
# check mapping for differential pattern (clipped reads and quality) with scanBam
loginfo("Start looking into the mapping of those candidate covered regions.")
# what fields to read out from BAM file
what <- setdiff(scanBamWhat(), c("seq", "qual", "qwidth"))
##, isFirstMateRead=TRUE
#ZZZ RangesList() needs to be properly used for multiple chromosomes! chrom as names
##here chr5: hard-coded because of NPM1
# myParam_ctr <- ScanBamParam(which=RangesList(chr5=IRanges(start=cov.interv.ctr$interv_l, end=cov.interv.ctr$interv_h)), flag=scanBamFlag(isPaired=TRUE, isProperPair=NA), what=what)
myParam_tum <- ScanBamParam(which=RangesList(chr5=IRanges(start=minCov.tum$interv_l, end=minCov.tum$interv_h)), flag=scanBamFlag(isPaired=TRUE, isProperPair=NA), what=what)
# have list of dataframes with the mapping per region which reaches minimum coverage in BAM file
mappings.tum <<- lapply(scanBam(file=tum.bam, param=myParam_tum), as.data.frame)
#mappings.ctr <- lapply(scanBam(file=ctr.bam, param=myParam_ctr), as.data.frame)
# # add name of covered region to data.frame ##seems not used/necessary
# for (i in 1:length(mappings.tum)){
# mappings.tum[[i]]$region <- names(mappings.tum)[i]
# }#rof
# #ZZZ Ideally, I would relate intervals betw tumor and control: find intervals in tumor and in control that cover (more or less) the same region.
# #+Could be that I need to merge two intervals in -say- tumor probe (when a big deletion occured)
# #+For now, I look only at the tumor intervals and check clipped reads and quality
#
# # distribution of percentages of clipped reads with mapQ > 0 in covered regions
# clippedProp.interv.tum <- sapply(mappings.tum, function(mapping){sum(grepl("S", mapping$cigar) & mapping$mapq > 0) / NROW(mapping)})
#
# PERC_INTERV <- ceiling(length(mappings.tum)/5)
# # select those tumor intervals with highest proportion of clipped reads
# highClippedRegions.tum <- sort(clippedProp.interv.tum, decreasing=TRUE)[1:PERC_INTERV]
# loginfo("The covered regions with highest clipped-reads proportion are %s.", names(highClippedRegions.tum))
#
# # robust variance
# qualVar.interv.tum <- sapply(mappings.tum, function(mapping){mad(mapping$mapq, na.rm=TRUE)})
# highMapQVarRegions.tum <- sort(qualVar.interv.tum, decreasing=TRUE)[1:PERC_INTERV]
# loginfo("The covered regions with highest mapQ-variability are %s.", names(highMapQVarRegions.tum))
#
# #covered regions that have relatively many clipped reads and relatively high qual variability
# candidateRegions1 <- intersect(names(highClippedRegions.tum), names(highMapQVarRegions.tum))
#
# mappings2.tum <- mappings.tum[candidateRegions1]
# loginfo("The following candidate covered regions remain %s.", candidateRegions1)
# return list of intervals that look SV-like
#return(lapply(mappings2.tum, findIntervalsInCandidateRegions))
retList <- lapply(mappings.tum, findIntervalsInCandidateRegions)
retMat <- NULL
for (i in 1:length(retList)){
retMat <- rbind(retMat, retList[[i]])
#ZZZ rownames important?: bamFile + region?
}
#return(t(sapply(retList[! sapply(retList, is.null)], rbind)))
return(retMat)
}
# wrapper to findIntervals() in different mapping BAM-files
# returns a matrix with coordinates per candidate interval
###
findIntervalsWrapper <- function(bamFiles){
stopifnot( ! is.character(bamFiles) || length(bamFiles) > 0 )
myResult <- NULL
samples <- character(0)
for (i in bamFiles){
loginfo("Find intervals for BAM file %s.", i)
myCurrentInterval <- findIntervals(i)
if ( NCOL(myCurrentInterval) > 0 ){
samples <- c(samples, rep(i, NROW(myCurrentInterval)))
myResult <- rbind(myResult, myCurrentInterval)
}#fi
}
return(list(sample=samples, interval=myResult))
}
#' Replaces random positions of a given DNA sequence with a random nucleotide.
#'
#' This function can be useful when simulating a virtual patient that has personal (germline) SNVs.
#' @author mkuhn, 201402
#' @param DNAseq the DNA sequence as \code{DNAString} object. Works also for rectangular \code{DNAStringSet} objects.
#' @param nbrSNVs number of SNVs (but could be silent, i.e. replace A with A)
#' @note ZZZ It was better when we set here a mutation rate, not the number of SNVs..
injectRandomSNVs <- function(DNAseq, nbrSNVs=1L){
seqLen <- if ( inherits(DNAseq, "DNAString") ) length(DNAseq) else Biostrings::width(DNAseq)
# nothing to do when empty sequence
if ( all(seqLen == 0L) ) return(DNAseq)
if ( is.numeric(nbrSNVs) && nbrSNVs > 0L ){
nbrSNVs <- round(nbrSNVs)
if ( inherits(DNAseq, "DNAString") && length(seqLen) == 1L ){
myAt <- sample(seqLen, nbrSNVs)
myLetter <- Biostrings::DNAString(paste(sample(Biostrings::DNA_BASES, nbrSNVs, replace=TRUE), collapse=""))
} else {
stopifnot( inherits(DNAseq, "DNAStringSet") && length(table(Biostrings::width(DNAseq))) == 1L )
seqWidth <- Biostrings::width(DNAseq)[1]
h <- matrix(data = FALSE, nrow = length(DNAseq), ncol = seqWidth)
for (i in 1:NROW(h)) h[i, sample(seqWidth, nbrSNVs, replace=FALSE)] <- rep(TRUE, nbrSNVs)
myAt <- h
myLetter <- replicate(n=length(DNAseq), paste(sample(Biostrings::DNA_BASES, nbrSNVs, replace=TRUE), collapse=""))
}
DNAseq <- Biostrings::replaceLetterAt(DNAseq, at=myAt, letter=myLetter )
}
return(DNAseq)
}
#' Function to abbreviate a long sequence as a string
#' @param seqObj a <code>XString</code> object
#' @param k is number of bases that can remain on either side
abbrevSeq <- function(seqObj, k=11L){
if (is.null(seqObj)) return(NULL)
seqObj.len <- length(seqObj)
#logdebug("Got seq-object ::%s:: of length %d.", seqObj, seqObj.len)
if ( seqObj.len <= 2*k + 3) return(seqObj)
subseq(seqObj, start=k+1, end=seqObj.len-k) <- Biostrings::DNAString("...")
return(seqObj)
}
#' Find local maxima on a function given on a discrete support (i.e. discrete set of function values).
#'
#' @param x numeric vector of given function values.
#' @param threshold value the local maximum is required to exceed. Defaults to \code{NULL}.
#' @return a vector of indices on x of the positions of local maxima. NULL if no valid x
#' @references http://stackoverflow.com/questions/6836409/finding-local-maxima-and-minima
#' @export
localMax <- function(x, threshold=NULL){
if (is.null(x) || length(x) == 0L) return(NULL)
if (! is.numeric(x)) return(NULL)
if (length(x) == 1L) return(1L)
y <- c(TRUE, diff(x) > 0L)
y <- cumsum(rle(y)$lengths)
y <- y[seq.int(1L, length(y), 2L)] # 1, 3, 5, ... starting from first, every second y-value: end of TRUE blocks
if (x[[1L]] == x[[2L]]) y <- y[-1L] # if first two values are equal we get initially for y: TRUE, FALSE, .. but x[1] is no max
if (! is.null(threshold)) y <- intersect(y, which(x >= threshold))
return(y)
}
#' Helper function to facilitate the sub-sampling of training and test data based on the status variable.
#'
#' It collects the requested number of training data with a given proportion of outliers.
#' The remaining observations are used for test data. The indices for a sample is given with the requested proportion.
#' \code{NA} can occur and will be preserved. The NA-treatment was motivated by test-data where initially the status was missing. In simulation it is known.
#'
#' Bootstrap samples are not supported right now.
#'
#' @param dat.status logical vector IN=\code{TRUE}, OUT=\code{FALSE}
#' @param train.n.out size of training data. If \code{NULL} or invalid value the maximal size for training data is assumed that guarantees the outlier proportion.
#' @param prop.outl requested proportion of outliers in the data
#' @return list of vector of indices, first for training set, then for test set with same proportion, often no cases are left for test set. NA-indices are separate category. In case of failure returns NULL.
#' @export
subsampleInd <- function(dat.status, train.n.out=NULL, prop.outl=.15){
##type=c("subsampling", "bootstrap"), type <- match.arg(type)
stopifnot( is.logical(dat.status) || (is.factor(dat.status) && identical(levels(dat.status), STATUS_LEVELS)) )
stopifnot( prop.outl > 0L, prop.outl < 1L )
# mkuhn, 20150425: transform status factor into a logical vector
if (is.factor(dat.status)) dat.status <- (dat.status == STATUS_LEVELS[1])
# mkuhn, 2015-12-28: added special NA-treatment
na.ind <- which(is.na(dat.status))
# calculation
n.in <- sum(! is.na(dat.status)) # formely (w/o special NA treatment) length(dat.status)
n.normal <- sum(dat.status, na.rm = TRUE) # nbr of TRUE cases
n.outl <- n.in - n.normal
# maximal training set
if ( is.null(train.n.out) || train.n.out <= 0L || train.n.out >= n.in ){
train.prop.outl <- n.outl / n.in
#loginfo("Give maximum size that guarantees the outlier proportion of %f.", prop.outl)
if (train.prop.outl >= prop.outl){
# too many outliers in training data
train.norm.ind <- sample(which(dat.status)) # take all normals available and permute
n.train.ref <- ceiling(length(train.norm.ind) / (1L-prop.outl))
train.outl.ind <- sample(which(!dat.status), size=min(n.outl, n.train.ref - length(train.norm.ind)))
} else {
# too many normals
train.outl.ind <- sample(which(!dat.status)) # take all outliers available
n.train.ref <- floor(length(train.outl.ind) / prop.outl)
train.norm.ind <- sample(which( dat.status), size=min(n.normal, n.train.ref - length(train.outl.ind)))
}
} else {
## only a subset of the training set is requested
if (train.n.out > n.in){
logging::logwarn("Requested more output then there is input!")
return(invisible(NULL))
}
# requested numbers
n.train.normal <- ceiling(train.n.out*(1L-prop.outl))
n.train.outl <- train.n.out - n.train.normal
if ( n.train.normal > length(which(dat.status)) ){
#logwarn("Would need %d normal cases but there are only %d normal cases in data! Sorry.", n.train.normal, n.normal)
return(invisible(NULL))
}
if (n.train.outl > length(which(!dat.status)) ){
#logwarn("Would need %d oultier cases but not enough outlier cases in data! Sorry.", n.train.outl)
return(invisible(NULL))
}
stopifnot( n.train.outl >= 1L )
train.norm.ind <- sample(which(dat.status), size = n.train.normal)
train.outl.ind <- sample(which(! dat.status), size = n.train.outl)
}#esle
train.ind <- c(train.norm.ind, train.outl.ind)
# remaining indices (with TRUE or FALSE at dat.status)
remain.norm.ind <- setdiff(which( dat.status), train.norm.ind)
remain.outl.ind <- setdiff(which(!dat.status), train.outl.ind)
# enforce outlier proportion in remaining data (that can be used as test data)
odds.outl <- prop.outl / (1L-prop.outl)
if ( isTRUE(length(remain.norm.ind) * odds.outl > length(remain.outl.ind)) ){
## nbr of outliers is limiting
#loginfo("Test data: Outlier data limiting.")
n.test <- floor(length(remain.outl.ind) / prop.outl)
test.norm.ind <- sample(remain.norm.ind, n.test - length(remain.outl.ind))
test.outl.ind <- remain.outl.ind
} else {
## normal limiting
#loginfo("Test data: Normal cases limiting.")
n.test <- ceiling(length(remain.norm.ind) / (1L-prop.outl))
test.norm.ind <- remain.norm.ind
test.outl.ind <- sample(remain.outl.ind, n.test - length(remain.norm.ind))
}
test.ind <- c(test.norm.ind, test.outl.ind)
return( list( train.ind=train.ind, test.ind=test.ind, na.ind=na.ind) )
}
#' Wrapper for read simulation utility.
#'
#' Specify either \code{fCov} or \code{rCount} but not both.
#' @param fastaFile character. Fasta file with reference sequence to simulate reads from.
#' @param ngs.prop contains the sequencing settings to be used for the simulation
#' @param rCount total number of reads/read pairs to be generated
#' @param fCov the fold of read coverage to be simulated
#' @param paired logical flag if paired sequencing is requested.
#' @return logical flag for success status
readSimulatorWrapper <- function(fastaFile, prefixRead="SIM", prefixFasta, paired=TRUE, ngs.prop=NULL, fCov=NULL, rCount=NULL){
stopifnot( ! (is.null(fCov) && is.null(rCount)), is.null(fCov) || is.null(rCount) ) # exactly one of both is not null!
artOptions <- paste("--noALN", if (paired) "--paired", "--seqSys ", SEQ_SYSTEM,
if (! is.null(fCov)) paste("--fcov", fCov), if (! is.null(rCount)) paste("--rcount", rCount))
simCommand <- sprintf("%s %s --id %s --in %s --out %s --len %d --mflen %d --sdev %d",
ART_EXE, artOptions, prefixRead, fastaFile, file.path(dirname(fastaFile), prefixFasta),
ngs.prop$read.length, ngs.prop$pe.ins.mean, ngs.prop$pe.ins.sd)
simCall <- system(simCommand, intern=FALSE)
if ( simCall == 0L ) logging::loginfo("Reads simulated") else {
logging::logerror("Read Simulation FAILED for with call %s", simCommand)
return(invisible(FALSE))
}
invisible(TRUE)
}
#' BWA-mapper function wrapped in R.
#' @param bwa_opts character string that describes a BWA mapping profile. Currently, "clippy" with less penalties for clipping. Standard means all penalties have their default values.
#' @param bwa_optsExtra character string with extra options (except for std mapping profile, threads, smart pairing). For instance, -I for insert size distribution.
#' @param nbrThreads number of threads to use for mapping
#' @param interleaved logical flag if smart pairing (interleaved paired-end reads) is to be used. If \code{TRUE} only the first input file \code{inputFile1} is used that should contain both reads for pairs.
#' @return The BAM filename upon mapping, sorting and indexing success. Otherwise FALSE invisibly.
#' @note The BAM-flag "proper pair" is only set by BWA if enough reads are in the FASTQ files. Is pair rescue done?
#' @export
mapWrapper <- function(inputFile1, inputFile2=NULL, interleaved=FALSE, outputDir=dirname(inputFile1), bamFilename=NULL,
genomeRef=REF_GEN, bwa_opts=c("clippy", "standard"), bwa_optsExtra=NULL, nbrThreads=2L){
bwa_opts <- match.arg(bwa_opts)
# handle BWA options
bwa_opts <- switch(bwa_opts,
clippy="-a -Y -L3 -U15", # -M", we do not use -M as we want to have supplementary alginments for clipped read ends!?
standard=,
"")
#-C: adds FASTA comment to SAM as extra field [when it complies like BC:Z:AGCGA]
#-a: all secondary alignments as separate entries in BAM file
#-Y: use soft clipping for supplementary alignments
#-L: penalty for clipped reads [default 5]
#-U: penalty for unpaired read-pair [default 17]
#-M: mark shorter split hits as secondary (and not as supplementary)
#-t: number of threads
BWA_OPT <- if (is.numeric(nbrThreads) && nbrThreads > 0L) paste0(bwa_opts, " -t", ceiling(nbrThreads)) else bwa_opts
if (! is.null(bwa_optsExtra) && nzchar(bwa_optsExtra)){
BWA_OPT <- paste(BWA_OPT, bwa_optsExtra)
}
if ( is.null(inputFile1) || ! is.character(inputFile1) ){
logerror("No valid input files given for 1st input FASTQ-file %s.", paste(inputFile1, collapse="::"))
return(invisible(FALSE))
}
if ( length(inputFile1) > 1L && is.null(inputFile2) ){
logging::loginfo("Map wrapper: taking two FASTQ files from argument inputFile1.")
inputFile2 <- inputFile1[2L]
inputFile1 <- inputFile1[1L]
}
if (isTRUE(interleaved)){
logging::loginfo("Interleaved-mapping mode")
if ( ! is.null(inputFile2) ) logging::logwarn("2nd input file is ignored!")
inputFile2 <- NULL
BWA_OPT <- paste(BWA_OPT, "-p")
} else if (is.null(inputFile2) || ! is.character(inputFile2)){
logging::loginfo("SE-mapping mode")
} else {
logging::loginfo("PE-mapping mode")
}
if (! checkFile(inputFile1) || (! is.null(inputFile2) && ! checkFile(inputFile2)) ){
logerror("Input files are not valid. file1: %s and file2: %s.", inputFile1, inputFile2)
return(invisible(FALSE))
}
# extract BAM filename
if (is.null(bamFilename)){
bamFilename <- inputFile1 %>%
basename %>%
stringr::str_replace(pattern = "[.](fq|subfq|fastq)$", replacement = "") %>%
stringr::str_replace(pattern = "_(R)*1$", replacement = "") %>%
paste0(".bam")
}
# BAM output file
outputFile0 <- file.path(outputDir, bamFilename)
bwaRet <- system(paste(BWA_EXE,"mem", BWA_OPT, genomeRef, inputFile1, inputFile2,
"|", SAMTOOLS_EXE,"view -@1 -Sb - > ", outputFile0),
intern=FALSE)
if ( bwaRet != 0 || ! checkFile(outputFile0) ){
logerror("BWA mem failed for input %s and %s.. Sorry.", inputFile1, inputFile2)
return(invisible(FALSE))
} else logdebug("Mapping of %s and %s done.", inputFile1, inputFile2)
## SORT BAM file
# sortRet <- system(paste(SAMTOOLS_EXE, "sort", outputFile, sub(".bam$", "", outputFile)), intern=FALSE)
# if ( sortRet != 0 && checkFile(outputFile) ){
# logerror("BAM SORT failed on mapping %s.", outputFile)
# return(invisible(FALSE))
# } else logdebug("BAM SORT done on mapping %s.", outputFile)
on.exit(try(unlink(outputFile0)))
outputFile1 <- Rsamtools::sortBam(outputFile0, destination = sub(".bam$", ".s", outputFile0))
if (! checkFile(outputFile1)){
logging::logerror("Failed to sort BAM file %s.", outputFile0)
return(invisible(FALSE))
}
logging::loginfo("Sorted the BAM file and saved as %s.", outputFile1)
## INDEX BAM files
Rsamtools::indexBam(outputFile1)
logging::loginfo("Indexed the BAM file %s.", outputFile1)
# # NORMAL probe mapping: indexing
# indexRet <- system(paste(SAMTOOLS_EXE, "index", outputFile), intern=FALSE)
#
# if (indexRet != 0 ){
# logerror("BAM INDEX failed on %s", outputFile)
# try(file.remove(outputFile), silent = TRUE)
# } else logdebug("BAM indexing done on %s.", outputFile)
return(outputFile1)
}
#' Hamming window
#' @param n length of window
#' @param normalized. boolean flag if weights should be normalized that they sum to n.
getHammingWindowWeights <- function(n, normalized=TRUE){
weights <- 0.54 + 0.46 * cos( 2 * pi * seq(from=-n/2, length.out = n) / n)
#weights <- 0.54 - 0.46 * cos( 2 * pi * seq(from=0, length.out = n) / n)
if (normalized) weights <- weights * length(weights) / sum(weights)
return(weights)
}
#' Graphical exploration of published SV-data from Database of Genome Variation.
#' @param dgvFile variants from the genome variant database, as an RDS object
exploreGenomeVariants <- function(dgvFile=file.path(BASEDIR, "refGen", "genomVar_hg19_20150723.rds")){
if ( ! file.exists(dgvFile) ){
logwarn("File %s not found. Sorry.", dgvFile)
return(invisible(NULL))
}
#dgv <- read.delim(dgvFile)
dgv <- readRDS(file=file.path(BASEDIR, "refGen", "genomVar_hg19_20150723.rds"))
dgv %<>% dplyr::mutate_(variantlength=~end - start + 1L,
hasGene=~factor(genes != '', labels = c("no", "yes")))
#ggplot(dgv, aes(x=variantsubtype, y=variantlength, fill=hasGene)) + geom_boxplot() + theme(axis.text.x=element_text(angle=45L, hjust=1L))
ggplot(dgv, aes(x=variantsubtype, y=variantlength, fill=hasGene)) + geom_boxplot() + ylim(c(0,500)) + theme(axis.text.x=element_text(angle=45L, hjust=1L))
ggplot(dgv, aes(x=variantsubtype, y=variantlength)) + geom_boxplot(varwidth = T) + ylim(c(0,500)) + theme(axis.text.x=element_text(angle=45L, hjust=1L))
ggplot(dgv, aes(x=variantsubtype, y=variantlength, fill=hasGene)) + geom_boxplot() + ylim(c(0,200)) + theme(axis.text.x=element_text(angle=45L, hjust=1L))
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.