# Copyright (C) 2018 Jochen Weile, Roth Lab
#
# This file is part of tileseqMave.
#
# tileseqMave is free software: you can redistribute it and/or modify
# it under the terms of the GNU Affero General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# tileseqMave is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU Affero General Public License for more details.
#
# You should have received a copy of the GNU Affero General Public License
# along with tileseqMave. If not, see <https://www.gnu.org/licenses/>.
#' run variant dropout troubleshooter
#'
#' @param dataDir working data directory
#' @param countDir input directory for counts, defaults to subdirectory with latest timestamp ending in _mut_count.
#' @param outDir output directory, defaults to name of input directory with _QC tag attached.
#' @param paramFile input parameter file. defaults to <dataDir>/parameters.json
#' @param srOverride single replicate override (use with caution)
#' @return NULL. Results are written to file.
#' @export
dropoutAnalysis <- function(dataDir,countDir=NA, #scoreDir=NA,
outDir=NA,
paramFile=paste0(dataDir,"parameters.json"),
srOverride=FALSE,tile=NA) {
op <- options(stringsAsFactors=FALSE)
library(yogitools)
library(hgvsParseR)
library(pbmcapply)
# library(optimization)
#make sure data exists and ends with a "/"
if (!grepl("/$",dataDir)) {
dataDir <- paste0(dataDir,"/")
}
if (!dir.exists(dataDir)) {
#we don't use the logger here, assuming that whichever script wraps our function
#catches the exception and writes to the logger (or uses the log error handler)
stop("Data folder does not exist!")
}
if (!canRead(paramFile)) {
stop("Unable to read parameter file!")
}
logInfo("Reading parameters from",normalizePath(paramFile))
params <- withCallingHandlers(
parseParameters(paramFile,srOverride=srOverride),
warning=function(w)logWarn(conditionMessage(w))
)
# if (is.na(tile)) {
# #if no tile is defined, default to first tile
# tile <- params$tiles[1,1]
# }
#find counts folder
if (is.na(countDir)) {
latest <- latestSubDir(parentDir=dataDir,pattern="_mut_call$|mut_count$")
countDir <- latest[["dir"]]
timeStamp <- latest[["timeStamp"]]
runLabel <- latest[["label"]]
} else { #if custom input dir was provided
#make sure it exists
if (!dir.exists(countDir)) {
stop("Input count folder ",countDir," does not exist!")
}
#try to extract a timestamp and label
lt <- extract.groups(countDir,"([^/]+_)?(\\d{4}-\\d{2}-\\d{2}-\\d{2}-\\d{2}-\\d{2})")[1,]
if (!any(is.na(lt))) {
runLabel <- lt[[1]]
timeStamp <- lt[[2]]
} else {
#if none can be extracted, use current time and no tag
timeStamp <- format(Sys.time(), "%Y-%m-%d-%H-%M-%S")
runLabel <- ""
}
}
#make sure it ends in "/"
if (!grepl("/$",countDir)) {
countDir <- paste0(countDir,"/")
}
# #if no score directory was provided
# if (is.na(scoreDir)) {
# #try to guess its name
# if (grepl("_mut_count/$",countDir)) {
# scoreDir <- sub("_mut_count/$","_scores/",countDir)
# } else {
# scoreDir <- sub("/$","_scores/",countDir)
# }
# if (!dir.exists(scoreDir)) {
# stop("No matching score directory found for ",countDir,"!\n",
# "(Expecting ",scoreDir,")\n",
# "Use --scores option to define explicitly."
# )
# }
# } else {
# if (!dir.exists(scoreDir)) {
# stop("Score directory ",scoreDir," does not exist!")
# }
# }
# #make sure it ends in "/"
# if (!grepl("/$",scoreDir)) {
# scoreDir <- paste0(scoreDir,"/")
# }
#if not output directory was defined
if (is.na(outDir)) {
#derive one from the input
if (grepl("_mut_count/$",countDir)) {
outDir <- sub("_mut_count/$","_QC/",countDir)
} else {
outDir <- sub("/$","_QC/",countDir)
}
}
#make sure it ends in "/"
if (!grepl("/$",outDir)) {
outDir <- paste0(outDir,"/")
}
#make sure outdir exists
dir.create(outDir,recursive=TRUE,showWarnings=FALSE)
logInfo("Using count directory",countDir,
# "score directory", scoreDir,
"and output directory",outDir
)
#create PDF tag
# pdftag <- with(params,sprintf("%s (%s): %s%s",project,template$geneName,runLabel,timeStamp))
# params$pdftagbase <- pdftag
logInfo("Reading count data")
marginalCountFile <- paste0(countDir,"/marginalCounts.csv")
if (!file.exists(marginalCountFile)) {
stop("Invalid counts directory ",countDir,"! Must contain marginalCounts.csv!")
}
marginalCounts <- read.csv(marginalCountFile,comment.char="#")
rownames(marginalCounts) <- marginalCounts$hgvsc
#filter out frameshifts and indels
toAA <- extract.groups(marginalCounts$aaChange,"\\d+(.*)$")
indelIdx <- which(toAA=="-" | nchar(toAA) > 1)
silentIdx <- which(marginalCounts$aaChange=="silent")
marginalCounts <- marginalCounts[-union(indelIdx,silentIdx),]
#filter down to selected tile
if (!is.na(tile) && is.numeric(tile)) {
positions <- as.integer(extract.groups(marginalCounts$codonChange,"(\\d+)")[,1])
tiles <- params$pos2tile(positions)
marginalCounts <- marginalCounts[which(tiles==tile),]
}
#helper function to compare dropout sets
compareSets <- function(set1,set2,title,n=nrow(marginalCounts)) {
l1 <- length(set1)
l2 <- length(set2)
common <- length(intersect(set1,set2))
nullDistr <- replicate(1000,{
length(intersect(sample(n,l1),sample(n,l2)))
})
mNull <- mean(nullDistr)
q95 <- quantile(nullDistr,c(0.025,0.975))
sprintf("%s: %.0f from %.0f/%.0f (expected %.01f [%.01f;%.01f])",title,common,l1,l2,mNull,q95[[1]],q95[[2]])
}
#iterate over conditions
for (sCond in getSelects(params)) {
#iterate over timepoints
for (tp in params$timepoints$`Time point name`) {
logInfo("Processing",sCond,"time",tp)
# nrep <- params$numReplicates[[sCond]]
#this is to make up for missing WT conditions
null2na <- function(x) if (length(x) == 0) NA else x
#pull up matching nonselect and WT controls
nCond <- getNonselectFor(sCond,params)
condQuad <- c(
select=sCond,
nonselect=nCond,
selWT=null2na(getWTControlFor(sCond,params)),
nonWT=null2na(getWTControlFor(nCond,params))
)
sRep <- params$numReplicates[[sCond]]
#if a condition is missing entirely, we give it one "pseudoreplicate with 0 scores"
repQuad <- sapply(condQuad,function(con) if(is.na(con)) 1 else params$numReplicates[[con]])
if (!all(repQuad == sRep)) {
logWarn(paste(
"Number of replicates in conditions is not balanced or WT is missing!",
" => Correlation plot may be distorted due to recycled replicates!!",
sep="\n"
))
}
#Workaround: Since R doesn't like variable names starting with numerals,
# we need to adjust any of those cases
if (any(grepl("^\\d",condQuad))) {
culprits <- which(grepl("^\\d",condQuad))
#we add the prefix "X" to the name, just like the table parser does.
condQuad[culprits] <- paste0("X",condQuad[culprits])
}
#deal with time points
tpQuad <- sapply(condQuad, function(con) {
if (is.na(con)) {
NA
} else if (tp %in% getTimepointsFor(con,params)) {
tp
} else {
#hopefully there's only one other as you are only allowed to either have
#the same timepoints everywhere or just one
getTimepointsFor(con,params)[[1]]
}
})
#replicate column name matrix
repMatrix <- do.call(rbind,lapply(names(condQuad),function(con) {
sapply(1:repQuad[[con]],
function(repi) {
if (is.na(condQuad[[con]])) {
NA #again, compensating for potentially missing WT condition
} else {
sprintf("%s.t%s.rep%d.frequency",condQuad[[con]],tpQuad[[con]],repi)
}
}
)
}))
rownames(repMatrix) <- names(condQuad)
smallestSelect <- unique(sort(na.omit(marginalCounts[,repMatrix["select",1]])))[[2]]
pseudoCount <- 10^floor(log10(smallestSelect))
# plotCol <- sapply(1+(
# log10(marginalCounts[,repMatrix["select",1]]+pseudoCount) < log10(marginalCounts[,repMatrix["nonselect",1]])/4-4.5
# ),colAlpha,0.5)
#
quickCounts <- log10(marginalCounts[,as.vector(repMatrix[1:2,])]+pseudoCount)
maxCount <- apply(quickCounts,1,max)
plotCol <- sapply(1+(
quickCounts[,repMatrix["nonselect",1]] < (maxCount-1)
),colAlpha,0.2)
outfile <- paste0(outDir,sCond,"_t",tp,"_dropoutAll.pdf")
pdf(outfile,10,10)
opar <- par(mfrow=c(2,2),mar=c(5,4,1,1))
plot(
jitter(maxCount,amount=0.2),
jitter(quickCounts[,repMatrix["nonselect",1]],amount=0.2),
pch=20,col=plotCol,
xlab="max condition",ylab="nonselect rep 1"
)
plot(
jitter(maxCount,amount=0.2),
jitter(quickCounts[,repMatrix["nonselect",2]],amount=0.2),
pch=20,col=plotCol,
xlab="max condition",ylab="nonselect rep 2"
)
plot(
jitter(maxCount,amount=0.2),
jitter(quickCounts[,repMatrix["select",1]],amount=0.2),
pch=20,col=plotCol,
xlab="max condition",ylab="select rep 1"
)
plot(
jitter(maxCount,amount=0.2),
jitter(quickCounts[,repMatrix["select",2]],amount=0.2),
pch=20,col=plotCol,
xlab="max condition",ylab="select rep 2"
)
par(opar)
invisible(dev.off())
# xs <- 10^seq(-7,-1)
# lines(xs,xs*10)
droppedS1 <- which(quickCounts[,repMatrix["select",1]] < maxCount-1)
droppedNS1 <- which(quickCounts[,repMatrix["nonselect",1]] < maxCount-1)
droppedS2 <- which(quickCounts[,repMatrix["select",2]] < maxCount-1)
droppedNS2 <- which(quickCounts[,repMatrix["nonselect",2]] < maxCount-1)
logInfo("DROP FROM ALL:")
n <- sum(maxCount > -6)
logInfo(compareSets(droppedS1,droppedNS1,"max->s1:max->ns1",n=n))
logInfo(compareSets(droppedNS1,droppedNS2,"max->ns1:max->ns2",n=n))
logInfo(compareSets(droppedS1,droppedS2,"max->s1:max->s2",n=n))
logInfo(compareSets(droppedS1,droppedNS2,"max->s1:max->ns2",n=n))
outfile <- paste0(outDir,sCond,"_t",tp,"_dropouts.pdf")
pdf(outfile,10,10)
plotCol <- sapply(1+(
log10(marginalCounts[,repMatrix["nonselect",2]]+pseudoCount) < log10(marginalCounts[,repMatrix["nonselect",1]]+pseudoCount)-1
),colAlpha,0.2)
opar <- par(mfrow=c(2,2),mar=c(5,4,1,1))
plot(
jitter(log10(marginalCounts[,repMatrix["nonselect",1]]+pseudoCount),amount=0.2),
jitter(log10(marginalCounts[,repMatrix["nonselect",2]]+pseudoCount),amount=0.2),
pch=20,col=plotCol,
xlab="nonselect rep 1",ylab="nonselect rep 2"
)
plot(
jitter(log10(marginalCounts[,repMatrix["nonselect",1]]+pseudoCount),amount=0.2),
jitter(log10(marginalCounts[,repMatrix["select",1]]+pseudoCount),amount=0.2),
pch=20,col=plotCol,
xlab="nonselect rep 1",ylab="select rep 1"
)
plot(
jitter(log10(marginalCounts[,repMatrix["select",1]]+pseudoCount),amount=0.2),
jitter(log10(marginalCounts[,repMatrix["select",2]]+pseudoCount),amount=0.2),
pch=20,col=plotCol,
xlab="select rep 1",ylab="select rep 2"
)
plot(
jitter(log10(marginalCounts[,repMatrix["nonselect",2]]+pseudoCount),amount=0.2),
jitter(log10(marginalCounts[,repMatrix["select",1]]+pseudoCount),amount=0.2),
pch=20,col=plotCol,
xlab="nonselect rep 2",ylab="select rep 1"
)
par(opar)
invisible(dev.off())
# xs <- 10^seq(-7,-1)
# lines(xs,xs*10)
logInfo("DROP RELATIVE:")
dropSel <- which(marginalCounts[,repMatrix["select",1]] < marginalCounts[,repMatrix["nonselect",1]]/10)
riseSel <- which(marginalCounts[,repMatrix["select",1]] > marginalCounts[,repMatrix["nonselect",1]]*10)
dropRep2 <- which(marginalCounts[,repMatrix["nonselect",2]] < marginalCounts[,repMatrix["nonselect",1]]/10)
riseRep2 <- which(marginalCounts[,repMatrix["nonselect",2]] > marginalCounts[,repMatrix["nonselect",1]]*10)
dropSel2 <- which(marginalCounts[,repMatrix["select",1]] < marginalCounts[,repMatrix["nonselect",2]]/10)
riseSel2 <- which(marginalCounts[,repMatrix["select",1]] > marginalCounts[,repMatrix["nonselect",2]]*10)
logInfo(compareSets(dropRep2,dropSel,"ns2<-ns1->s1"))
logInfo(compareSets(riseRep2,dropSel2,"ns1<-ns2->s1"))
logInfo(compareSets(riseSel,riseSel2,"ns1<-s1->ns2"))
dropSelMirr <- which(marginalCounts[,repMatrix["select",2]] < marginalCounts[,repMatrix["nonselect",2]]/10)
riseSelMirr <- which(marginalCounts[,repMatrix["select",2]] > marginalCounts[,repMatrix["nonselect",2]]*10)
dropRepMirr <- which(marginalCounts[,repMatrix["select",2]] < marginalCounts[,repMatrix["select",1]]/10)
riseRepMirr <- which(marginalCounts[,repMatrix["select",2]] > marginalCounts[,repMatrix["select",1]]*10)
logInfo(compareSets(dropRep2,dropRepMirr,"ns1->ns2:s1->s2"))
logInfo(compareSets(riseRep2,riseRepMirr,"ns1<-ns2:s1<-s2"))
logInfo(compareSets(dropSel,dropSelMirr,"ns1->s1:ns2->s2"))
logInfo(compareSets(riseSel,riseSelMirr,"ns1->s1:ns2->s2"))
}
}
options(op)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.