R/selectionQC.R

Defines functions drawDistributions scoreDistributions regularizationQC replicateCorrelation logPhiBias findQuads filterProgression filterBreakdown selectionQC

Documented in drawDistributions filterProgression findQuads logPhiBias regularizationQC replicateCorrelation scoreDistributions selectionQC

# Copyright (C) 2018  Jochen Weile, Roth Lab
#
# This file is part of tileseqMave.
#
# tileseqMave is free software: you can redistribute it and/or modify
# it under the terms of the GNU Affero General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# tileseqMave is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU Affero General Public License for more details.
#
# You should have received a copy of the GNU Affero General Public License
# along with tileseqMave.  If not, see <https://www.gnu.org/licenses/>.

#' run library quality control (QC)
#' 
#' Iterates over all selection conditions and their time points, producing a
#' suite of QC plots for each: score distributions, log(phi) bias, filter
#' progression/breakdown, codon agreement and synonymous-nonsense deltas.
#' When multiple replicates are available (srOverride off), replicate
#' correlations, error-regularization fits and error profiles are also drawn.
#' 
#' @param dataDir working data directory
#' @param countDir input directory for counts, defaults to subdirectory with latest timestamp ending in _mut_count.
#' @param scoreDir input directory for scores, defaults to subdirectory with latest timestamp ending in _scores.
#' @param outDir output directory, defaults to name of input directory with _QC tag attached.
#' @param paramFile input parameter file. defaults to <dataDir>/parameters.json
#' @param srOverride single-replicate override; when TRUE, all analyses that
#'   require more than one replicate are skipped.
#' @return NULL. Results are written to file.
#' @export
selectionQC <- function(dataDir,countDir=NA, scoreDir=NA, outDir=NA, 
                        paramFile=paste0(dataDir,"parameters.json"),
                        srOverride=FALSE) {

  #NOTE: the paramFile default omits a "/" separator, but still works when the
  #caller passes dataDir without a trailing slash: R evaluates default
  #arguments lazily, so paste0() runs only at first use of paramFile (below),
  #i.e. after the trailing "/" has been appended to dataDir.

  #NOTE(review): options() is changed here but never restored on exit;
  #consider on.exit(options(op)) — confirm this is intentional.
  op <- options(stringsAsFactors=FALSE)

  library(yogitools)
  library(hgvsParseR)
  library(pbmcapply)
  # library(optimization)

  #make sure data exists and ends with a "/"
  if (!grepl("/$",dataDir)) {
    dataDir <- paste0(dataDir,"/")
  }
  if (!dir.exists(dataDir)) {
    #we don't use the logger here, assuming that whichever script wraps our function
    #catches the exception and writes to the logger (or uses the log error handler)
    stop("Data folder does not exist!")
  }
  if (!canRead(paramFile)) {
    stop("Unable to read parameter file!")
  }


  #parse the parameter sheet; any parser warnings are forwarded to the logger
  #without interrupting the parse
  logInfo("Reading parameters from",normalizePath(paramFile))
  params <- withCallingHandlers(
    parseParameters(paramFile,srOverride=srOverride),
    warning=function(w)logWarn(conditionMessage(w))
  )
  
  #find counts folder
  if (is.na(countDir)) {
    #default: the most recent *_mut_call / *_mut_count subdirectory
    latest <- latestSubDir(parentDir=dataDir,pattern="_mut_call$|mut_count$")
    countDir <- latest[["dir"]]
    timeStamp <- latest[["timeStamp"]]
    runLabel <- latest[["label"]]
  } else { #if custom input dir was provided
    #make sure it exists
    if (!dir.exists(countDir)) {
      stop("Input count folder ",countDir," does not exist!")
    }
    #try to extract a timestamp and label from the directory name itself
    lt <- extract.groups(countDir,"([^/]+_)?(\\d{4}-\\d{2}-\\d{2}-\\d{2}-\\d{2}-\\d{2})")[1,]
    if (!any(is.na(lt))) {
      runLabel <- lt[[1]]
      timeStamp <- lt[[2]]
    } else {
      #if none can be extracted, use current time and no tag
      timeStamp <- format(Sys.time(), "%Y-%m-%d-%H-%M-%S")
      runLabel <- ""
    }
  }
  #make sure it ends in "/"
  if (!grepl("/$",countDir)) {
    countDir <- paste0(countDir,"/")
  }
  
  #if no score directory was provided
  if (is.na(scoreDir)) {
    #try to guess its name by swapping the _mut_count suffix for _scores
    if (grepl("_mut_count/$",countDir)) {
      scoreDir <- sub("_mut_count/$","_scores/",countDir)
    } else {
      scoreDir <- sub("/$","_scores/",countDir)
    }
    if (!dir.exists(scoreDir)) {
      stop("No matching score directory found for ",countDir,"!\n",
           "(Expecting ",scoreDir,")\n",
           "Use --scores option to define explicitly."
      )
    }
  } else {
    if (!dir.exists(scoreDir)) {
      stop("Score directory ",scoreDir," does not exist!")
    }
  }
  #make sure it ends in "/"
  if (!grepl("/$",scoreDir)) {
    scoreDir <- paste0(scoreDir,"/")
  }
  
  #if no output directory was defined
  if (is.na(outDir)) {
    #derive one from the input
    if (grepl("_mut_count/$",countDir)) {
      outDir <- sub("_mut_count/$","_QC/",countDir)
    } else {
      outDir <- sub("/$","_QC/",countDir)
    }
  } 
  #make sure it ends in "/"
  if (!grepl("/$",outDir)) {
    outDir <- paste0(outDir,"/")
  }
  
  #make sure outdir exists
  dir.create(outDir,recursive=TRUE,showWarnings=FALSE)
  
  logInfo("Using count directory",countDir,
          "score directory", scoreDir,
          "and output directory",outDir
  )
  #create PDF tag (version/project/gene/run stamp printed on every QC page)
  tsmVersion <- sub(".9000$","",as.character(packageVersion("tileseqMave")))
  pdftag <- with(params,sprintf("tileseqPro v%s|%s (%s): %s%s",
                                tsmVersion,project,template$geneName,
                                runLabel,timeStamp
  ))
  #stash the tag on the params object so downstream plot functions can use it
  params$pdftagbase <- pdftag

  
  logInfo("Reading count data")
  marginalCountFile <- paste0(countDir,"/marginalCounts.csv")
  if (!file.exists(marginalCountFile)) {
    stop("Invalid counts directory ",countDir,"! Must contain marginalCounts.csv!")
  }
  marginalCounts <- read.csv(marginalCountFile,comment.char="#")
  rownames(marginalCounts) <- marginalCounts$hgvsc

  #filter out frameshifts and indels
  #(a "-" or multi-character AA consequence indicates a deletion or indel)
  toAA <- extract.groups(marginalCounts$aaChange,"\\d+(.*)$")
  indelIdx <- which(toAA=="-" | nchar(toAA) > 1)
  silentIdx <- which(marginalCounts$aaChange=="silent")
  if (length(union(indelIdx,silentIdx)) > 0) {
    marginalCounts <- marginalCounts[-union(indelIdx,silentIdx),]
  }

  #iterate over conditions
  for (sCond in getSelects(params)) {

    #iterate over timepoints
    for (tp in params$timepoints$`Time point name`) {

      logInfo("Processing condition",sCond, "; time",tp)

      #load score table for this condition
      scoreFile <- paste0(scoreDir,"/",sCond,"_t",tp,"_enrichment.csv")
      if (!file.exists(scoreFile)) {
        logWarn("No score file found! Skipping...")
        next
      }
      scores <- read.csv(scoreFile,comment.char="#")
      
      #if no bias-corrected enrichment (bce) column is available, fall back
      #to raw logPhi and record the override so plots can label accordingly
      params$bcOverride <- FALSE
      if (!("bce" %in% colnames(scores)) || all(is.na(scores$bce))) {
        params$bcOverride <- TRUE
        scores$bce <- scores$logPhi
        scores$bce.se <- scores$logPhi.se
      }

      #ordering should match scores
      rownames(scores) <- scores$hgvsc
      # scores <- scores[marginalCounts$hgvsc,]
      #re-index the marginal counts to align row-by-row with the score table
      marginalSubset <- marginalCounts[scores$hgvsc,]

      #Score distributions & syn/non medians
      logInfo("Plotting score distributions")
      scoreDistributions(scores,sCond,tp,outDir,params,srOverride)
      
      #plot logPhi bias vs marginal frequency
      logInfo("Plotting log(phi) bias")
      logPhiBias(scores,params,sCond,tp,outDir)

      #filter progression graph
      logInfo("Plotting filter progression")
      filterProgression(scores,sCond,tp,params,outDir)
      filterBreakdown(scores,sCond,tp,params,outDir)
      
      #examine codon agreement for same amino acids
      logInfo("Plotting codon agreement")
      codonAgreement(scores,sCond,tp,params,outDir,srOverride)
      
      #running mean of synonymous, stop and their difference
      logInfo("Plotting synonymous-nonsense deltas")
      synNonDelta(scores,sCond,tp,params,outDir,srOverride)

      #all of these analyses require more than one replicate
      # if (params$numReplicates[[sCond]] > 1) {
      if (!srOverride) {

        #run replicate correlation analysis
        logInfo("Plotting replicate correlations")
        replicateCorrelation(scores, marginalSubset, params, sCond, tp, outDir)

        #load error model file
        modelFile <- paste0(scoreDir,"/",sCond,"_t",tp,"_errorModel.csv")
        if (!file.exists(modelFile)) {
          logWarn("No error model file found. Skipping regularization QC.")
        } else {
          modelParams <- read.csv(modelFile,row.names=1)
          if (all(is.na(modelParams))) {
            logWarn("Error model failed! Skipping regularization QC.")
          } else {
            #Regularization analysis
            logInfo("Visualizing regularization model fits")
            regularizationQC(scores,modelParams,params,sCond,tp,outDir)
          }
        }
      
        #If scores could not be assigned due to synonymous-nonsense median failure
        #then we can't run an error profile analysis
        #(negative standard errors are used as a failure sentinel upstream —
        # TODO confirm against the scoring function)
        if (!all(is.na(scores$logPhi)) && !any(scores$logPhi.se < 0,na.rm=TRUE)) {
          #Error profile
          logInfo("Plotting error profile")
          errorProfile(scores,sCond,tp,outDir,params)
        } else {
          logWarn("Cannot plot error profiles: logPhi values are not available!")
        }
      }

    }
  }

}

#' Plot a stacked-bar breakdown of filter flags per region
#' 
#' For the whole construct and for each mutagenesis region, tallies how many
#' variants were removed by each filter category (or passed all filters) and
#' draws one stacked bar per region, with category labels printed inside the
#' bar segments. Writes "<sCond>_t<tp>_filtering2.pdf" to the output directory.
#' 
#' @param scores the score table (must contain 'aaChange' and 'filter' columns)
#' @param sCond the selection condition
#' @param tp the time point
#' @param params the global parameter object
#' @param outDir the output directory
#' @return NULL. Writes a PDF file to the output directory.
filterBreakdown <- function(scores,sCond,tp,params,outDir) {
  
  #region table: first row covers the entire construct
  #(starting at AA 2 — position 1 is presumably excluded as the start codon;
  # TODO confirm), followed by one row per declared region
  regSubsets <- data.frame(
    set=c("all", params$regions$`Region Number`),
    start=c(2,params$regions$`Start AA`),
    end=c(params$template$proteinLength,params$regions$`End AA`),
    len=c(params$template$proteinLength-1,params$regions$`End AA`-params$regions$`Start AA`+1)
  )
  #extract the AA position from each aaChange string (digits only)
  scorePos <- as.integer(gsub("\\D","",scores$aaChange))
  
  
  #one column of filter-category counts per region
  filterStacks <- do.call(cbind,lapply (1:nrow(regSubsets), function(ri) {
    localScores <- scores[which(scorePos >= regSubsets$start[[ri]] & 
                                  scorePos <= regSubsets$end[[ri]]),]
    
    #fix the display order of the filter categories
    ff <- factor(localScores$filter,levels=
       c("frequency","depth","bottleneck:rep","bottleneck:select","wt_excess")
    )
    #useNA="a" ("always") keeps unfiltered variants (filter == NA) in the tally
    contab <- table(ff,useNA="a")
    #the NA bucket holds the variants that passed all filters
    names(contab)[is.na(names(contab))] <- "passed"
    contab
  }))
  colnames(filterStacks) <- sprintf("Region %s",regSubsets$set)
  
  
  #prep plot
  outfile <- paste0(outDir,sCond,"_t",tp,"_filtering2.pdf")
  pdf(outfile,11,8.5)
  tagger <- pdftagger(paste(params$pdftagbase,"; selection condition:",sCond),cpp=1)
  opar <- par(oma=c(2,2,2,2),mar=c(2,4,1,1)+.1)
  
  #one color per filter category (order matches the factor levels + "passed")
  plotcols <- c("firebrick3","firebrick2","gold2","gold1","orange","chartreuse3")
  #cumulative heights per bar, used to center the in-bar text labels
  ys <- apply(filterStacks,2,cumsum)
  labelYs <- apply(rbind(rep(0,ncol(ys)),ys),2,function(vs) sapply(2:length(vs),function(j)(vs[[j-1]]+vs[[j]])/2))
  xs <- barplot(filterStacks,border=NA,col=plotcols,ylab="#variants")
  grid(NA,NULL)
  #label only the non-empty segments of each bar
  for (i in 1:length(xs)) {
    toShow <- which(filterStacks[,i] > 0)
    if (length(toShow) > 0) {
      text(xs[[i]],labelYs[toShow,i],rownames(filterStacks)[toShow])
    }
  }
  
  tagger$cycle()
  par(opar)
  invisible(dev.off())
  
  
}

#' Plot filter progression funnels
#' 
#' For the whole construct and for each mutagenesis region, compares the
#' number of possible, detected, filter-passing and high-quality variants —
#' at both the codon-change and amino-acid-change level, for all changes as
#' well as SNV-reachable ones — and draws the progression as funnel-shaped
#' polygons. Writes "<sCond>_t<tp>_filtering.pdf" to the output directory.
#' 
#' @param scores the score table
#' @param sCond the selection condition
#' @param tp the time point
#' @param params the global parameter object
#' @param outDir the output directory
#' @return NULL. Writes a PDF file to the output directory.
filterProgression <- function(scores,sCond,tp,params,outDir) {

  #region table: first row covers the entire construct
  #(starting at AA 2 — position 1 is presumably excluded as the start codon;
  # TODO confirm), followed by one row per declared region
  regSubsets <- data.frame(
    set=c("all", params$regions$`Region Number`),
    start=c(2,params$regions$`Start AA`),
    end=c(params$template$proteinLength,params$regions$`End AA`),
    len=c(params$template$proteinLength-1,params$regions$`End AA`-params$regions$`Start AA`+1)
  )
  
  
  #get reachable AA changes
  #(this includes stop, but not synonymous)
  reachable <- reachableChanges(params)
  
  #prep plot (4 funnel panels per page)
  outfile <- paste0(outDir,sCond,"_t",tp,"_filtering.pdf")
  pdf(outfile,11,8.5)
  tagger <- pdftagger(paste(params$pdftagbase,"; selection condition:",sCond),cpp=4)
  opar <- par(oma=c(2,2,2,2),mar=c(1,1,4,1)+.1,mfrow=c(2,2))
  
  for (ri in 1:nrow(regSubsets)) {
    
    #and extract reachable codon changes within this region,
    #expanding the "|"-separated mutant codon lists into w<pos>m strings
    reachableCCs <- do.call(c,with(reachable,mapply(function(w,p,ms){
      if (p >= regSubsets$start[[ri]] && p <= regSubsets$end[[ri]]) {
        paste0(w,p,ms)
      } else NULL
    },wtcodon,pos,strsplit(mutcodons,"\\|"))))
    
    #reachable AA changes restricted to this region
    reachableAAs <- reachable[with(reachable,pos >= regSubsets$start[[ri]] & pos <= regSubsets$end[[ri]]),]
    
    #make filtered subsets of the score table:
    #all scores in region -> passed filters -> high-quality (low stderr)
    scorePos <- as.integer(gsub("\\D","",scores$aaChange))
    localScores <- scores[which(scorePos >= regSubsets$start[[ri]] & 
                                  scorePos <= regSubsets$end[[ri]]),]
    filteredScores <- scores[which(is.na(scores$filter) & scorePos >= regSubsets$start[[ri]] & 
                               scorePos <= regSubsets$end[[ri]]),]
    hqScores <- filteredScores[which(filteredScores$bce.se < params$scoring$sdThreshold),]
    
    #calculate filter census: rows = progression stages, columns =
    #all/reachable codon changes and all/reachable AA changes
    census <- rbind(
      possible = c(
        AllCCs = regSubsets$len[[ri]] * (4^3-1),
        ReachCCs = length(reachableCCs),
        AllAACs = regSubsets$len[[ri]] * (19+1),
        ReachAACs = nrow(reachableAAs)
      ),
      found = c(
        AllCCs = nrow(localScores),
        ReachCCs = length(intersect(localScores$codonChange, reachableCCs)),
        AllAACs = length(unique(localScores$hgvsp[localScores$type != "synonymous"])),
        ReachAACs = length(intersect(unique(localScores$hgvsp),reachable$hgvsp))
      ),
      filtered = c(
        AllCCs = nrow(filteredScores),
        ReachCCs = length(intersect(filteredScores$codonChange, reachableCCs)),
        AllAACs = length(unique(filteredScores$hgvsp[filteredScores$type != "synonymous"])),
        ReachAACs = length(intersect(unique(filteredScores$hgvsp),reachable$hgvsp))
      ),
      hiQual = c(
        AllCCs = nrow(hqScores),
        ReachCCs = length(intersect(hqScores$codonChange, reachableCCs)),
        AllAACs = length(unique(hqScores$hgvsp[hqScores$type != "synonymous"])),
        ReachAACs = length(intersect(unique(hqScores$hgvsp),reachable$hgvsp))
      )
    )
  
    #draw plot
    # widths <- census/max(census)
    #funnel widths and percentages, each normalized to the "possible" row
    widths <- apply(census,2,function(col)col/col[[1]])
    percentages <- apply(census,2,function(xs)xs/xs[[1]])*100
    
    plotCols <- c("steelblue2","steelblue3","gold2","gold3")
    ylabels <- c("All possible","Detected","Passed filters",
      # paste("\u03c3 <",params$scoring$sdThreshold)
      paste("sd <",params$scoring$sdThreshold)
    )
    xlabels <- c("All","SNV-reachable","All","SNV-reachable")
    toplabels <- c("Codon changes","AA changes")
    title <- if (regSubsets$set[[ri]]=="all") "Entire Construct" else {
      paste0("Region #",regSubsets$set[[ri]])
    }
    cex <- 0.8
    
    #empty canvas; rows 4..1 = stages possible..hiQual
    plot(NA,type="n",xlim=c(-.5,4.5),ylim=c(1,5),xlab="",ylab="",axes=FALSE,main=title)
    abline(h=1:4,col="gray",lty="dotted")
    text(-0.5,4:1,ylabels,pos=4,cex=cex,col=c(1,1,1,"gray50"))
    text(1:4,4.4,xlabels,cex=cex)
    text(c(1.5,3.5),4.8,toplabels,cex=cex)
    invisible(lapply(1:4, function(cati) {
      # polygon(
      #   c(cati-widths[,cati]/2,rev(cati+widths[,cati]/2)),
      #   c(4:1,1:4),col=plotCols[[cati]], border=NA
      # )
      #solid funnel from "possible" down to "filtered"...
      polygon(
        c(cati-widths[-4,cati]/2,rev(cati+widths[-4,cati]/2)),
        c(4:2,2:4),col=plotCols[[cati]], border=NA
      )
      #...and a dotted outline for the final "hiQual" stage
      polygon(
        c(cati-widths[3:4,cati]/2,rev(cati+widths[3:4,cati]/2)),
        c(2:1,1:2),col=NA, border=plotCols[[cati]], lty='dotted', lwd=1.5
      )
      text(cati,4:1,sprintf("%d (%.02f%%)",census[,cati],percentages[,cati]),
        cex=cex,col=c(1,1,1,"gray50")
      )
    }))
    tagger$cycle()
  
  }
  par(opar)
  invisible(dev.off())

}


#' Helper function to find the quad of conditions and time points that belong 
#' with the given selection condition
#'
#' The quad consists of the selection condition itself, its matching nonselect
#' condition, and the WT controls of both (NA where a WT control is absent).
#'
#' @param params the parameter sheet object
#' @param sCond the current selection condition
#' @param tp the current time point of the selection condition
#'
#' @return a list containing the condition quad ('condQuad'), 
#' replicate matrix ('repMatrix') and timepoint quad ('tpQuad')
#' @export
#'
findQuads <- function(params,sCond,tp) {
  
  #substitute NA for zero-length results (covers missing WT conditions)
  orNA <- function(x) {
    if (length(x) > 0) x else NA
  }
  
  #assemble the quad: selection, its nonselect, and both WT controls
  nonCond <- getNonselectFor(sCond,params)
  condQuad <- c(
    select=sCond,
    nonselect=nonCond,
    selWT=orNA(getWTControlFor(sCond,params)),
    nonWT=orNA(getWTControlFor(nonCond,params))
  )
  
  #replicate counts per condition; a condition that is missing entirely
  #gets a single "pseudo-replicate with 0 scores"
  sRep <- params$numReplicates[[sCond]]
  repQuad <- vapply(condQuad, function(con) {
    if (is.na(con)) 1 else as.numeric(params$numReplicates[[con]])
  }, numeric(1))
  if (any(repQuad != sRep)) {
    logWarn(paste(
      "Number of replicates in conditions is not balanced or WT is missing!",
      " => Correlation plot may be distorted due to recycled replicates!!",
      sep="\n"
    ))
  }
  
  #R variable names can't begin with a digit, so mirror the table parser
  #by prepending "X" to any such condition names
  numStart <- grepl("^\\d",condQuad)
  condQuad[numStart] <- paste0("X",condQuad[numStart])
  
  #resolve which time point applies to each condition
  tpQuad <- sapply(condQuad, function(con) {
    if (is.na(con)) {
      return(NA)
    }
    availableTPs <- getTimepointsFor(con,params)
    #a condition either shares the requested time point or has exactly one
    #of its own, so falling back to the first entry is safe
    if (tp %in% availableTPs) tp else availableTPs[[1]]
  })
  
  #matrix of frequency column names: one row per condition,
  #one column per replicate
  repMatrix <- do.call(rbind,lapply(names(condQuad), function(con) {
    if (is.na(condQuad[[con]])) {
      #placeholder row for a missing WT condition
      rep(NA,repQuad[[con]])
    } else {
      sprintf("%s.t%s.rep%d.frequency",
        condQuad[[con]],tpQuad[[con]],seq_len(repQuad[[con]])
      )
    }
  }))
  rownames(repMatrix) <- names(condQuad)
  
  list(condQuad=condQuad,repMatrix=repMatrix,tpQuad=tpQuad)
}

#' Plot the logPhi bias vs the nonselect marginal frequency
#'
#' For each region, plots logPhi against the (WT-corrected, log-scaled)
#' nonselect frequency for unfiltered synonymous and nonsense variants,
#' overlaying each group's median, linear regression and running mean.
#' A systematic trend here indicates frequency-dependent score bias.
#'
#' @param scores the score table
#' @param params the parameter sheet object
#' @param sCond the active selection condition
#' @param tp the active time point
#' @param outDir the output directory
#'
#' @return nothing. writes a pdf file to the output dir
#' @export
logPhiBias <- function(scores,params,sCond,tp,outDir) {
  
  #derive AA position and region assignment for every variant
  scores$position <- as.integer(extract.groups(scores$codonChange,"(\\d+)"))
  scores$region <- params$pos2reg(scores$position)
  # scores$tile <- params$pos2tile(scores$position)
  # 
  # nonsense <- scores[scores$type=="nonsense",]
  # nonsenseF <- scores[scores$type=="nonsense" & is.na(scores$filter),]
  # zns <- with(nonsenseF, lm(logPhi~log10(nonselect.mean)) )
  # trns <- with(nonsenseF, yogitools::runningFunction(log10(nonselect.mean),logPhi,nbins=30))
  # 
  # synonymous <- scores[scores$type=="synonymous",]
  # synonymousF <- scores[scores$type=="synonymous" & is.na(scores$filter),]
  # zsyn <- with(synonymousF, lm(logPhi~log10(nonselect.mean)) )
  # trsyn <- with(synonymousF, yogitools::runningFunction(log10(nonselect.mean),logPhi,nbins=30))
  # 
  outfile <- paste0(outDir,sCond,"_t",tp,"_logPhiBias.pdf")
  pdf(outfile,8.5,11)
  tagger <- pdftagger(paste(params$pdftagbase,"; selection condition:",sCond),cpp=6)
  opar <- par(oma=c(15,2,2,2),mfrow=c(3,2))
  
  for (currRegion in params$regions[,1]) {
    
    #no variants in this region at all => placeholder panel
    if (!any(scores$region == currRegion)) {
      plot.new()
      rect(0,0,1,1,col="gray80",border="gray30",lty="dotted")
      text(0.5,0.5,"no data")
      mtext(paste0("Region ",currRegion),side=3)
      tagger$cycle()
      next
    }
    
    #pseudo-count: order of magnitude of the second-smallest nonselect mean
    #NOTE(review): assumes at least two distinct nonselect.mean values exist
    pseudoCount <- 10^floor(log10(sort(unique(scores$nonselect.mean))[[2]]))
    #clamp values from below so the subsequent log10 stays finite
    floor0 <- function(xs,bot=pseudoCount) sapply(xs, max, bot)
    
    #unfiltered nonsense variants in this region; restricted to logPhi < 0
    #(i.e. depleted, as nonsense variants are expected to be)
    nonsenseF <- scores[with(scores,type=="nonsense" & is.na(filter) & logPhi < 0 & region==currRegion),]
    #x-axis: log10 of the WT-corrected nonselect frequency
    nonsenseF$lognsCorr <- with(nonsenseF,log10(floor0(nonselect.mean-nonWT.mean)))
    synonymousF <- scores[with(scores,type=="synonymous" & is.na(filter) & region==currRegion),]
    synonymousF$lognsCorr <- with(synonymousF,log10(floor0(nonselect.mean-nonWT.mean)))
    
    #need at least one variant of each type to fit the bias trends
    if (nrow(synonymousF) == 0 || nrow(nonsenseF) == 0) {
      plot.new()
      rect(0,0,1,1,col="gray80",border="gray30",lty="dotted")
      text(0.5,0.5,"insufficient data")
      mtext(paste0("Region ",currRegion),side=3)
      tagger$cycle()
      next
    }
    
    #linear bias trends: logPhi as a function of corrected log frequency
    zns <- with(nonsenseF, lm(logPhi~lognsCorr) )
    zsyn <- with(synonymousF, lm(logPhi~lognsCorr) )
    
    #group medians (horizontal reference lines)
    nsmed <- with(nonsenseF, median(logPhi,na.rm=TRUE))
    synmed <- with(synonymousF, median(logPhi,na.rm=TRUE))
    
    #non-parametric running means for comparison with the linear fits
    trns <- with(nonsenseF, yogitools::runningFunction(lognsCorr,logPhi,nbins=20))
    trsyn <- with(synonymousF, yogitools::runningFunction(lognsCorr,logPhi,nbins=20))
    #empty canvas sized to cover both groups
    with(rbind(nonsenseF,synonymousF), plot(
      lognsCorr,logPhi, type="n", main=paste("Region",currRegion),
      xlab="log10(nonselect-wt)"
    ))
    #nonsense: red points with error bars, dashed regression, dotted median
    with(nonsenseF, {
      points(lognsCorr,logPhi,col="firebrick3",pch=20)
      yogitools::errorBars(lognsCorr,logPhi,logPhi.se,col="firebrick3")
    })
    if (!any(is.na(zns$coefficients))) {
      abline(zns,col="firebrick2",lty="dashed")
    }
    abline(h=nsmed,col="firebrick2",lty="dotted")
    # abline(a=thetaNon[[1]],b=thetaNon[[2]],col="firebrick2")
    lines(trns,col="firebrick2",lwd=2)
    
    #synonymous: green points with error bars, dashed regression, dotted median
    with(synonymousF, {
      points(lognsCorr,logPhi,col="chartreuse3",pch=20)
      yogitools::errorBars(lognsCorr,logPhi,logPhi.se,col="chartreuse3")
    })
    if (!any(is.na(zsyn$coefficients))) {
      abline(zsyn,col="chartreuse2",lty="dashed")
    }
    abline(h=synmed,col="chartreuse2",lty="dotted")
    # abline(a=thetaSyn[[1]],b=thetaSyn[[2]],col="chartreuse2")
    lines(trsyn,col="chartreuse2",lwd=2)
    
    
    # plotcols <- sapply(c("gray","firebrick3"),yogitools::colAlpha,0.5)
    # with(nonsense, plot(
    #   log10(nonselect.mean),logPhi,
    #   col=plotcols[1+is.na(filter)],
    #   pch=20
    # ))
    # # abline(h=0:1,lty="dashed",col=2:3)
    # abline(zns,col="firebrick2")
    # lines(trns,col="firebrick2",lwd=2)
    # plotcols <- sapply(c("gray","chartreuse3"),yogitools::colAlpha,0.5)
    # with(synonymous, points(
    #   log10(nonselect.mean),logPhi,
    #   col=plotcols[1+is.na(filter)],
    #   pch=20
    # ))
    # # abline(h=0:1,lty="dashed",col=2:3)
    # abline(zsyn,col="chartreuse2")
    # lines(trsyn,col="chartreuse2",lwd=2)
    # plotcols <- sapply(c("chartreuse3","firebrick3","gray"),yogitools::colAlpha,0.5)
    # legend("bottomleft",c("synonymous","nonsense","filtered out"),col=plotcols,pch=20)
    tagger$cycle()
  }
  
  par(opar)
  invisible(dev.off())
  
}

#' Draw replicate correlation plots
#' 
#' Computes WT-corrected select/nonselect frequencies and log(phi) enrichment
#' ratios for each replicate and visualizes their between-replicate agreement.
#' With exactly two replicates, two scatterplots are drawn side by side; with
#' more, scatterplot matrices ('pairs') are written to separate files.
#' 
#' @param scores the score table
#' @param marginalCounts the marginal counts table; rows must correspond 1:1
#'   to the rows of 'scores'
#' @param params the global parameter object
#' @param sCond the condition for which to draw the plot
#' @param tp the time point
#' @param outDir the output directory
#' @return NULL
replicateCorrelation <- function(scores, marginalCounts, params, sCond, tp, outDir) {
  
  sRep <- params$numReplicates[[sCond]]
  #resolve the matching nonselect/WT conditions and their replicate columns
  quads <- findQuads(params,sCond,tp)
  repMatrix <- quads$repMatrix
  
  #check that labels match between tables
  #(isTRUE() also catches NA comparison results, which would otherwise
  # crash the if() instead of producing the informative error message)
  if (!isTRUE(all(scores$hgvsp == marginalCounts$hgvsp))) {
    stop("scores and marginal count files mismatch.")
  }
  
  #extract per-replicate frequencies and log-ratios for this condition
  repValues <- lapply(1:sRep, function(repi) {
    #WT-corrected select frequency, floored at zero;
    #a missing WT control contributes zero
    selRaw <- marginalCounts[,repMatrix["select",repi]]
    selWTraw <- if (!is.na(repMatrix["selWT",repi])) {
      marginalCounts[,repMatrix["selWT",repi]]
    } else 0
    selFreq <- floor0(selRaw - selWTraw)
    
    #WT-corrected nonselect frequency
    nonRaw <- marginalCounts[,repMatrix["nonselect",repi]]
    nonWTraw <- if (!is.na(repMatrix["nonWT",repi])) {
      marginalCounts[,repMatrix["nonWT",repi]]
    } else 0
    nonFreq <- floor0(nonRaw - nonWTraw)
    
    #pseudo-count: order of magnitude of the second-smallest raw select value
    #NOTE(review): assumes at least two distinct non-NA select values exist
    smallestSelect <- unique(sort(na.omit(selRaw)))[[2]]
    pseudoCount <- 10^floor(log10(smallestSelect))
    
    logphi <- log10((selFreq+pseudoCount) / nonFreq)
    data.frame(select=selFreq,nonselect=nonFreq,logphi=logphi)
  })
  names(repValues) <- as.character(1:sRep)
  repValues <- do.call(cbind,repValues)
  rownames(repValues) <- rownames(scores)
  #apply filter from scoring function
  repValues <- repValues[is.na(scores$filter),]

  if (sRep == 2) {
    #simple case: a single page with two side-by-side scatterplots
    outfile <- paste0(outDir,sCond,"_t",tp,"_replicates.pdf")
    pdf(outfile,11,8.5)
    tagger <- pdftagger(paste(params$pdftagbase,"; selection condition:",sCond),cpp=2)
    layout(cbind(1,2))
    opar <- par(oma=c(2,2,2,2),mar=c(15,4,4,1)+.1)
    plot(
      repValues[,"1.nonselect"],repValues[,"2.nonselect"],
      xlab="Frequency Replicate 1", ylab="Frequency Replicate 2",
      main=sprintf(
        "non-select frequencies R = %.03f",
        cor(fin(log10(repValues[,sprintf("%d.nonselect",1:2)])))[1,2]
      ),
      pch=".",
      log="xy"
    )
    abline(0,1,col="gray",lty="dashed")
    tagger$cycle()
    plot(
      repValues[,"1.logphi"],repValues[,"2.logphi"],
      xlab="log(phi) Replicate 1", ylab="log(phi) Replicate 2",
      main=sprintf(
        "select / nonselect log-ratio R = %.03f",
        cor(fin(repValues[,sprintf("%d.logphi",1:2)]))[1,2]
      ),pch="."
    )
    abline(0,1,col="gray",lty="dashed")
    #complete the tag cycle for the second plot (cpp=2), so the page
    #actually receives its tag; this call was previously missing
    tagger$cycle()
    par(opar)
    invisible(dev.off())
  } else {
    #lower-triangle panel that prints the correlation coefficient
    panel.cor <- function(x, y,...){
      usr <- par("usr"); on.exit(par(usr)); par(usr=c(0,1,0,1))
      r <- cor(fin(cbind(x,y)))[1,2]
      txt <- sprintf("R = %.02f",r)
      text(0.5, 0.5, txt)
    }
    labels <- paste0("rep.",1:sRep) 

    #nonselect frequency scatterplot matrix
    outfile <- paste0(outDir,sCond,"_t",tp,"_ns_replicates.pdf")
    pdf(outfile,8.5,11)
    pairs(
      repValues[,sprintf("%d.nonselect",1:sRep)],
      lower.panel=panel.cor,pch=".",labels=labels,
      main="non-select frequencies"
    )
    op <- par(oma=c(2,2,2,2))
    mtext(paste(params$pdftagbase,"; selection condition:",sCond),side=1,outer=TRUE,line=0,cex=0.5)
    par(op)
    invisible(dev.off())

    #log(phi) scatterplot matrix
    #(bug fix: this previously re-plotted the nonselect columns, so the
    # "phi_replicates" file never showed the log-ratio correlations)
    outfile <- paste0(outDir,sCond,"_t",tp,"_phi_replicates.pdf")
    pdf(outfile,8.5,11)
    pairs(
      repValues[,sprintf("%d.logphi",1:sRep)],
      lower.panel=panel.cor,pch=".",labels=labels,
      main="select / nonselect log-ratios"
    )
    op <- par(oma=c(2,2,2,2))
    mtext(paste(params$pdftagbase,"; selection condition:",sCond),side=1,outer=TRUE,line=0,cex=0.5)
    par(op)
    invisible(dev.off())
  }

  return(NULL)
}

#' Draw error regularization model QC plots
#' 
#' For each tile, plots the observed coefficient of variation (CV) against
#' read count for nonselect and select samples, overlaid with the Poisson
#' expectation (1/sqrt(count)), a running mean, and the fitted regularization
#' model. Writes "<sCond>_t<tp>_errorModel.pdf" to the output directory.
#' 
#' @param scores the score table
#' @param modelParams the fitted error-model parameter table (one row per
#'   tile; static/additive/multiplicative columns per sample type)
#' @param params the global parameter object
#' @param sCond the condition for which to draw the plot
#' @param tp the time point
#' @param outDir the output directory
#' @return NULL
regularizationQC <- function(scores,modelParams,params,sCond,tp,outDir) {

  #placeholder panel for tiles without usable data
  plotNoData <- function(tile) {
    plot.new()
    rect(0,0,1,1,col="gray80",border="gray30",lty="dotted")
    text(0.5,0.5,"no data")
    mtext(paste0("Tile ",tile),side=3)
  }
  #calculate tile assignments
  # tileStarts <- params$tiles[,"Start AA"]
  positions <- as.integer(extract.groups(scores$codonChange,"(\\d+)")[,1])
  # tiles <- sapply(positions,function(pos) max(which(tileStarts <= pos)))
  tiles <- params$pos2tile(positions)
  
  outfile <- paste0(outDir,sCond,"_t",tp,"_errorModel.pdf")
  pdf(outfile,8.5,11)
  tagger <- pdftagger(paste(params$pdftagbase,"; selection condition:",sCond),cpp=6)
  opar <- par(mfrow=c(3,2),oma=c(2,2,2,2))
  for (tile in params$tiles[,"Tile Number"]) {
    # model.fit <- tryCatch({
    #   fit.cv.model(scores[which(tiles==tile),])
    # },error=function(e) {
    #   NULL
    # })
    if (!(tile %in% tiles)) {
      plotNoData(tile)
      tagger$cycle()
      next
    }

    #exclude variants that were filtered out for insufficient depth
    #(unfiltered variants, filter == NA, are kept)
    deepEnough <- sapply(1:nrow(scores), function(i) {
      is.na(scores$filter[[i]]) || scores$filter[[i]]!="depth"
    })
    
    with(scores[which(tiles==tile & deepEnough),],{

      #nonselect panel
      if (all(is.na(nonselect.count)) || all(nonselect.count == 0) || 
          all(is.na(nonselect.cv)) || all(nonselect.cv == 0)) {
        plotNoData(tile)
        tagger$cycle()
      } else {
        #fitted model parameters for this tile: the model is a lower-bounded
        #linear function of the Poisson CV in log space
        theta <- modelParams[as.character(tile),paste0("nonselect.",c("static","additive","multiplicative"))]
        cv.model <- function(count) {
          10^sapply(log10(1/sqrt(count)),function(x) max(theta[[1]], theta[[2]] + theta[[3]]*x))
        }
  
        plot(nonselect.count,nonselect.cv,log="xy",main=paste("Tile",tile,"non-select"))
        runningMean <- runningFunction(
          nonselect.count,nonselect.cv,nbins=20,logScale=TRUE
        )
        nsSamples <- seq(1,max(nonselect.count,na.rm=TRUE),length.out=100)
        #green dashed: Poisson expectation; red: empirical running mean
        lines(nsSamples,1/sqrt(nsSamples),col="chartreuse3",lty="dashed",lwd=2)
        lines(runningMean,col="firebrick3",lwd=2)
        
        #blue: fitted regularization model
        lines(nsSamples,cv.model(nsSamples),col="blue",lwd=2)
        mtext(sprintf(
          "stat.=%.02f; add.=%.02f; mult.=%.02f",
          theta[[1]],theta[[2]],theta[[3]]
        ))
        tagger$cycle()
      }
      
      #select panel (same layout as the nonselect panel above)
      if (all(is.na(select.count)) || all(select.count == 0) ||
          all(is.na(select.cv)) || all(select.cv == 0)) {
        plotNoData(tile)
        tagger$cycle()
      } else {
        theta <- modelParams[as.character(tile),paste0("select.",c("static","additive","multiplicative"))]
        cv.model <- function(count) {
          10^sapply(log10(1/sqrt(count)),function(x) max(theta[[1]], theta[[2]] + theta[[3]]*x))
        }
  
        plot(select.count,select.cv,log="xy",main=paste("Tile",tile,"select"))
        runningMean <- runningFunction(
          select.count,select.cv,nbins=20,logScale=TRUE
        )
        sSamples <- seq(1,max(select.count,na.rm=TRUE),length.out=100)
        lines(sSamples,1/sqrt(sSamples),col="chartreuse3",lty="dashed",lwd=2)
        lines(runningMean,col="firebrick3",lwd=2)
        
        lines(sSamples,cv.model(sSamples),col="blue",lwd=2)
        mtext(sprintf(
          "stat.=%.02f; add.=%.02f; mult.=%.02f",
          theta[[1]],theta[[2]],theta[[3]]
        ))
        tagger$cycle()
      }
      # nonselect.sd.poisson <- 1/sqrt(nonselect.count)*nonselect.mean
      # nonselect.sd.poisson[which(nonselect.mean==0)] <- 0
      # plot(nonselect.sd.poisson, nonselect.sd,log="xy")
      # runningMean <- runningFunction(
      #   nonselect.sd.poisson, nonselect.sd,nbins=20,logScale=TRUE
      # )
      # lines(runningMean,col="firebrick3",lwd=2)
      # abline(0,1,col="chartreuse3",lty="dashed",lwd=2)
      # depth <- mean(nonselect.count/nonselect.mean,na.rm=TRUE)
      # lines(
      #   sqrt(nsSamples)/depth,
      #   cv.model(nsSamples)*(nsSamples/depth),
      #   col="blue",lwd=2
      # )
    })
  }
  par(opar)
  invisible(dev.off())
}


#' Draw score distribution plots
#' 
#' @param scores the score table
#' @param sCond the condition for which to draw the plot
#' @param tp the time point
#' @param outDir the output directory
#' @return NULL
scoreDistributions <- function(scores,sCond,tp,outDir,params,srOverride) {
  # Collapses variant scores to amino-acid level and draws score distribution
  # histograms per region and overall, both unfiltered and stderr-filtered.
  
  #enact filters and last functional AA cutoff
  # NOTE(review): if spos were ever NA the logical index would contain NA and
  # inject NA rows — presumably aaChange always contains digits; confirm.
  spos <- as.integer(gsub("\\D+","",scores$aaChange))
  filteredScores <- scores[is.na(scores$filter) & spos <= params$scoring$lastFuncPos,]
  
  if (!srOverride) {
    # #residual error isn't really reliable yet, so this will just have to be total error for now.
    # resErr <- residualError(filteredScores$bce,filteredScores$bce.sd,mirror=TRUE,wtX=1)
    # nerr <- resErr - min(resErr,na.rm=TRUE) + min(abs(resErr),na.rm=TRUE)
    #weighting error: currently just the bias-corrected enrichment stderr
    nerr <- filteredScores$bce.se
  }
  
  #collapse by amino acid consequence and associate with regions
  aaScores <- as.df(with(filteredScores,tapply(1:length(hgvsp),hgvsp, function(is) {
    if (!srOverride && !any(is.na(bce.se[is]))) {
      #inverse-error weighted combination of all codon variants that share
      #the same protein-level consequence
      joint <- join.datapoints(
        ms=bce[is],
        sds=bce.se[is],
        # dfs=rep(params$numReplicates[[sCond]],length(is)),
        dfs=df[is],
        ws=(1/nerr[is])/sum(1/nerr[is])
      )
    } else {
      #this is the case if srOverride is turned on and only
      #one replicate was available
      joint <- c(
        mj=mean(bce[is],na.rm=TRUE),sj=NA,
        dfj=params$numReplicates[[sCond]]*length(is)
      )
    }
    #extract AA position and derive region assignment
    p <- unique(as.integer(extract.groups(aaChange[is],"(\\d+)")[,1]))
    mutregion <- params$pos2reg(p)
    list(
      hgvsp=unique(hgvsp[is]),
      bce=joint[["mj"]],
      # sd=joint[["sj"]],
      # NOTE(review): sj appears to be treated as the joint standard error
      # here and is scaled back up to an SD via sqrt(dfj) — confirm against
      # the contract of join.datapoints
      sd=joint[["sj"]]*sqrt(joint[["dfj"]]),
      df=joint[["dfj"]],
      # se=joint[["sj"]]/sqrt(joint[["dfj"]]),
      se=joint[["sj"]],
      pos=p,
      region=mutregion
    )
  })))
  
  sdCutoff <- params$scoring$sdThreshold

  #subdivide data into regions
  # mutpos <- as.integer(extract.groups(scores$aaChange,"(\\d+)")[,1])
  # mutregion <- with(as.data.frame(params$regions),sapply(mutpos,function(p) {
  #   which(p >= `Start AA` & p <= `End AA`)
  # }))

  outfile <- paste0(outDir,sCond,"_t",tp,"_logPhiDistribution.pdf")
  pdf(outfile,11,8.5)
  opar <- par(oma=c(2,2,2,2))
  tagger <- pdftagger(paste(params$pdftagbase,"; selection condition:",sCond),cpp=2)
  #four stacked panels per page: unfiltered and filtered, each consisting of
  #a syn/stop histogram pair on top and a mirrored missense histogram below
  layout(rbind(1,2,3,4),heights=c(1.2,1,1.2,1))
  invisible(tapply(1:nrow(aaScores),aaScores$region, function(is) {
    reg <- unique(aaScores$region[is])
    #unfiltered distribution for this region
    drawDistributions(aaScores[is,],Inf,reg,params$bcOverride)
    tagger$cycle()
    if (!srOverride && !any(is.na(aaScores[is,"se"])) && sdCutoff < 1) {
      #stderr-filtered distribution for this region
      drawDistributions(aaScores[is,],sdCutoff,reg,params$bcOverride)
      tagger$cycle()
    } else {
      #placeholder panels when sigma filtering is disabled or impossible
      plot.new()
      rect(0,0,1,1,col="gray80",border="gray30",lty="dotted")
      text(0.5,0.5,"sigma filtering\ndisabled or unavailable")
      plot.new()
      rect(0,0,1,1,col="gray80",border="gray30",lty="dotted")
      tagger$cycle()
    }
    return(NULL)
  }))
  #overall (all-region) distributions
  drawDistributions(aaScores,Inf,"all",params$bcOverride)
  tagger$cycle()
  # NOTE(review): unlike the per-region case above, this check does not test
  # srOverride or sdCutoff < 1 — confirm whether the asymmetry is intended
  if (!any(is.na(aaScores[,"se"]))) {
    drawDistributions(aaScores,sdCutoff,"all",params$bcOverride)
    tagger$cycle()
  }
  par(opar)
  invisible(dev.off())
}

#' Delegation function to draw score distribution plots with a given filter setting
#' 
#' @param aaScores the score table
#' @param seCutoff the stderr cutoff to apply (Inf disables filtering)
#' @param reg the region label to display in the plot title
#' @param bcOverride whether bias correction was overridden (affects the axis label)
#' @return NULL
drawDistributions <- function(aaScores,seCutoff=Inf,reg=NA,bcOverride=TRUE) {
  # Draws one pair of panels onto the current device: a syn/stop density
  # histogram on top (with Cohen's d in the title) and a mirrored missense
  # density histogram below, annotated with medians and the 10th percentile.

  if (seCutoff < Inf) {
    #check that the cutoff leaves at least 10 nonsense variants to work with
    #otherwise, make it more permissive
    numNsSurvive <- with(aaScores,sum(se < seCutoff & grepl("Ter$",hgvsp),na.rm=TRUE))
    if (numNsSurvive < 10) {
      #restrict to finite SE values up front: sort() silently drops NAs, so
      #indexing by the raw vector length could otherwise run out of bounds
      nonsenseSDs <- fin(with(aaScores,se[grepl("Ter$",hgvsp)]))
      if (length(nonsenseSDs) > 0) {
        r10Threshold <- sort(nonsenseSDs)[[min(10,length(nonsenseSDs))]]
        logWarn(sprintf("sdThreshold %.03f is too restrictive! Using se < %.03f instead.",seCutoff,r10Threshold))
        seCutoff <- r10Threshold
      } else {
        #no usable nonsense SEs at all => filtering is impossible; disable it
        logWarn(sprintf("sdThreshold %.03f cannot be applied: no finite nonsense SEs available. Disabling filter.",seCutoff))
        seCutoff <- Inf
      }
    }
  }

  #extract filtered scores (fall back to unfiltered when no SEs exist at all)
  if (!all(is.na(aaScores$se))) {
    synScores <- fin(with(aaScores,bce[grepl("=$",hgvsp) & se < seCutoff ]))
    stopScores <- fin(with(aaScores,bce[grepl("Ter$",hgvsp) & se < seCutoff]))
    misScores <- fin(with(aaScores,bce[!grepl("Ter$|=$",hgvsp) & se < seCutoff]))
  } else {
    synScores <- fin(with(aaScores,bce[grepl("=$",hgvsp)]))
    stopScores <- fin(with(aaScores,bce[grepl("Ter$",hgvsp)]))
    misScores <- fin(with(aaScores,bce[!grepl("Ter$|=$",hgvsp)]))
  }
  allScores <- fin(c(synScores,stopScores,misScores))

  #calculate plot ranges to nearest integers
  left <- floor(quantile(allScores,0.01,na.rm=TRUE))
  right <- ceiling(quantile(allScores,0.99,na.rm=TRUE))
  farleft <- floor(min(c(0,allScores),na.rm=TRUE))
  farright <- ceiling(max(c(1,allScores),na.rm=TRUE))
  #set histogram bins based on range (with extra space on the right)
  breaks <- seq(farleft,farright+0.1,0.1)
  #fill bins
  synHist <- hist(synScores,breaks=breaks,plot=FALSE)
  stopHist <- hist(stopScores,breaks=breaks,plot=FALSE)
  misHist <- hist(misScores,breaks=breaks,plot=FALSE)

  # Cohen's d effect size between two groups using the average of the two
  # group variances (Welch-style) in the denominator.
  calc_dScore <- function(syn, stop){
    syn_mean <- mean(syn)
    stop_mean <- mean(stop)
    syn_var <- var(syn)
    stop_var <- var(stop)
    return(
      (syn_mean - stop_mean)/
        sqrt((syn_var + stop_var)/2)
    )
  }
  
  # Cohen's d for Welch test (NA/NaN if either group is empty or a singleton)
  dScore <- calc_dScore(synScores, stopScores)
  
  # Round d score to 2 digits for display in the plot title
  Rounded_d <- round(dScore, digits = 2)
  
  
  #draw top plot for syn/stop
  op <- par(mar=c(2,4,2,1)+.1)
  xs <- barplot(
    rbind(synHist$density,stopHist$density),
    beside=TRUE,col=c("darkolivegreen3","firebrick3"),
    border=NA,ylab="density",space=c(0,0),
    main=if (is.infinite(seCutoff)) {
      bquote("Region"~.(reg)~"; Any"~sigma~"; Cohen's d: "~.(Rounded_d))
    } else {
      bquote("Region"~.(reg)~";"~sigma < .(seCutoff)~"; Cohen's d: "~.(Rounded_d))
    }
  )
  grid(NA,NULL)
  #add x-axis pips at integer score values
  pips <- left:right
  pipx <- colMeans(xs[,which(breaks %in% pips)])
  axis(1,at=pipx,labels=pips)
  if (bcOverride) {
    mtext(expression(log(phi)),side=1,line=1,at=xs[1,1],adj=0,cex=0.8)
  } else {
    mtext("bce",side=1,line=1,at=xs[1,1],adj=0,cex=0.8)
  }
  #draw legend
  legend("left",
    c("nonsense","synonymous","missense"),
    fill=c("firebrick3","darkolivegreen3","gray"),
    border=NA,bty="n"
  )
  #draw bottom plot (for missense), mirrored downward
  par(mar=c(1,4,0,1)+.1)
  xs <- barplot(
    -misHist$density,
    border=NA,ylab="density",space=0
  )
  grid(NA,NULL)
  #establish user coordinate function on barplot (maps score -> bar x-position)
  x0 <- xs[which(breaks==0),1]
  x1 <- xs[which(breaks==1),1]
  uCoord <- function(x)  x0 + x*(x1-x0)
  #draw medians and percentile
  abline(v=uCoord(median(synScores,na.rm=TRUE)),col="darkolivegreen4",lwd=2,lty="dotted")
  abline(v=uCoord(median(stopScores,na.rm=TRUE)),col="firebrick3",lwd=2,lty="dotted")
  abline(v=uCoord(quantile(misScores,0.1,na.rm=TRUE)),col="gray30",lwd=2,lty="dotted")
  text(
    uCoord(median(synScores,na.rm=TRUE)),
    -max(misHist$density)/3,
    sprintf("Synonymous median\n%.03f",median(synScores)),
    col="darkolivegreen4"
  )
  text(
    uCoord(median(stopScores,na.rm=TRUE)),
    -2*max(misHist$density)/3,
    sprintf("Nonsense median\n%.03f",median(stopScores)),
    col="firebrick3"
  )
  text(
    uCoord(quantile(misScores,0.1,na.rm=TRUE)),
    -max(misHist$density)/2,
    sprintf("10th percentile\n%.03f",quantile(misScores,0.1)),
    col="gray30"
  )
  par(op)

}

#' Draw error profile for the dataset
#' 
#' @param scores the score table
#' @param sCond the condition for which to draw the plot
#' @param tp the time point
#' @param outDir the output directory
#' @param params the parameter sheet object
#' @return NULL
errorProfile <- function(scores,sCond,tp,outDir,params) {
  # Draws an "error profile" page per score type: a scatter of each variant's
  # standard error (log scale) against its score, flanked by marginal
  # histograms, with a fitted expected-error curve overlaid when fitting works.
  
  #helper: renders one scatter-plus-marginals panel for the given score
  #column (varname) and its matching "<varname>.se" column
  plotEP <- function(scores,varname,xlab) {
    
    sdname <- paste0(varname,".se")
    #axis ranges over unfiltered variants, finite values only
    sdRange <- range(log10(scores[is.na(scores$filter),sdname]),finite=TRUE)
    lphiRange <- range(scores[is.na(scores$filter),varname],finite=TRUE)
    # runningMedian <- with(scores[is.na(scores$filter),],yogitools::runningFunction(
    #   get(varname),log10(get(sdname)),fun=median,nbins=20
    # ))
    # runningMedian[,2] <- 10^runningMedian[,2]
    tryCatch({
      #the assignment runs in plotEP's frame, so errModel persists past the
      #tryCatch on success; on error it is simply never created
      errModel <- with(scores[is.na(scores$filter),],
        expectedError(get(varname),get(sdname),
          mirror=(varname=="bce"),wtX=as.numeric(varname=="bce")
        )
      )
    }, error=function(e) {
      logWarn("Error profile modeling failed!\n",conditionMessage(e))
    })
    
    #main scatter panel (log-scaled y axis)
    par(mar=c(5,4,0,0)+.1)
    with(scores[is.na(scores$filter),],plot(
      get(varname),get(sdname),log="y",pch=".",
      xlab=xlab,ylab=expression(sigma)
    ))
    # NOTE(review): exists() also searches enclosing environments, so an
    # unrelated "errModel" object further up could make this TRUE even after
    # a fitting failure — consider exists("errModel",inherits=FALSE)
    if (exists("errModel")) {
      lines(errModel$running,col="steelblue3",lwd=2)
      curve(errModel$model(x),add=TRUE,col="firebrick3",lwd=2)
    }
    #top marginal: histogram of the score values
    par(mar=c(0,4,1,0)+.1) 
    breaks <- seq(lphiRange[[1]],lphiRange[[2]],length.out=50)
    lphiHist <- with(scores[is.na(scores$filter),], hist(get(varname),breaks=breaks,plot=FALSE))
    barplot(lphiHist$density,border=NA,ylab="density",space=0)
    #right marginal: histogram of log10 standard errors
    par(mar=c(5,0,0,0)+.1) 
    breaks <- seq(sdRange[[1]],sdRange[[2]],length.out=50)
    sdHist <- with(scores[is.na(scores$filter),], hist(log10(get(sdname)),breaks=breaks,plot=FALSE))
    barplot(sdHist$density,border=NA,horiz=TRUE,space=0,xlab="density")
    #tagger is resolved lexically from the enclosing errorProfile frame; it
    #is defined below but before plotEP is ever called
    tagger$cycle()
  }
  
  outfile <- paste0(outDir,sCond,"_t",tp,"_errorProfile.pdf")
  pdf(outfile,8.5,11)
  #panel layout: 1 = scatter, 2 = top marginal, 3 = right marginal
  layout(rbind(c(2,0),c(1,3)),widths=c(0.8,0.2),heights=c(0.2,0.8))
  tagger <- pdftagger(paste(params$pdftagbase,"; selection condition:",sCond),cpp=4)
  opar <- par(oma=c(24,6,2,6)) 
  plotEP(scores,"logPhi",expression(log[10](phi)))
  #bce panel only makes sense when bias correction was actually applied
  if (!params$bcOverride) {
    plotEP(scores,"bce","bias-corrected enrichment")
  }
  par(opar)

  invisible(dev.off())
  
}



#' Draw plots to analyze agreement between equivalent codons
#'
#' @param scores the score table
#' @param sCond currently active selection condition
#' @param tp currently active time point
#' @param params parameter sheet object
#' @param outDir output directory
#' @param srOverride whether the single-replicate override is active
#' @return nothing. writes plots to output directory
codonAgreement <- function(scores,sCond,tp,params,outDir,srOverride) {
  # Compares logPhi scores between synonymous codon variants that encode the
  # same amino-acid change, as a measure of internal score consistency.
  
  #group variant row indices by shared protein-level consequence
  codonGroups <- tapply(1:nrow(scores),scores$hgvsp,c)
  nCodons <- sapply(codonGroups,length)
  
  #create pairwise combinations
  #each row of pairings holds one pair of row indices from a group with >1 codon
  pairings <- t(do.call(cbind,lapply(codonGroups[nCodons > 1],combn,2)))
  pairScores <- as.df(apply(pairings,1,function(idxs) {
    #a pair is excluded if either member carries a filter flag
    filter <- any(!is.na(scores[idxs,"filter"]))
    lphi <- scores[idxs,"logPhi"]
    lpsd <- scores[idxs,"logPhi.se"]
    #c(logPhi=<length-2 vector>) auto-names the resulting columns
    #logPhi1/logPhi2 (and sd1/sd2)
    c(list(filter=filter),c(logPhi=lphi,sd=lpsd))
  }))
  
  #semi-transparent black for overplotted points/bars
  alphaCol <- colAlpha("black",0.2)
  
  outfile <- paste0(outDir,sCond,"_t",tp,"_codonCorr.pdf")
  tagger <- pdftagger(paste(params$pdftagbase,"; selection condition:",sCond),cpp=1)
  pdf(outfile,8.5,11)
  
  #three stacked panels: codon multiplicity, pairwise scatter, delta-vs-sigma
  layout(cbind(1:3),heights=c(1,2,2))
  op <- par(mar=c(5,4,.1,1),oma=c(2,15,2,15))
  barplot(
    table(nCodons),
    xlab="#equivalent codons per AA",
    ylab="Frequency", col="steelblue3",border=NA
  )
  
  with(pairScores[!pairScores$filter,],{
    
    #plot codon scores against each other
    par(mar=c(5,4,2,1))
    plot(
      logPhi1,logPhi2,type="n",
      xlab=expression("codon 1"~log(phi)),
      ylab=expression("codon 2"~log(phi))
    )
    #error bars in both directions: vertical for codon 2, horizontal for codon 1
    yogitools::errorBars(
      logPhi1,logPhi2,sd2,col=alphaCol
    )
    yogitools::errorBars(
      logPhi2,logPhi1,sd1,vertical=FALSE,col=alphaCol
    )
    #identity line: perfect agreement
    abline(0,1,col="gray",lty="dashed")
    mtext(sprintf("PCC=%.02f",cor(logPhi1,logPhi2)))
   
    #plot score difference against max(sd)
    par(mar=c(5,4,2,1))
    if (!srOverride) {
      plot(
        abs(logPhi1-logPhi2),mapply(max,sd1,sd2),log="y",
        pch=20,col=alphaCol,
        xlab=expression(Delta~log(phi)),ylab=expression(max(sigma))
      )
    } else {
      #no per-variant error estimates are available without replicates
      plot.new()
      rect(0,0,1,1,col="gray80",border="gray30",lty="dotted")
      text(0.5,0.5,"no replicate data")
    }
  })
  tagger$cycle()
  par(op)
  invisible(dev.off())
  
}

#' Finds runs of TRUE values in a boolean vector
#' 
#' @param bools a logical vector
#' @return an integer matrix with columns 'start' and 'end' indicating runs
findRuns <- function(bools) {
  # Finds contiguous runs of TRUE in a logical vector and returns them as a
  # two-column integer matrix (start, end), one row per run.
  stopifnot(inherits(bools,"logical"))
  if (length(bools) == 0) {
    return(cbind(start=integer(0),end=integer(0)))
  }
  #encode run boundaries as a difference track: +1 marks the first element of
  #a TRUE run, -1 marks the element just after its last one. The vector is
  #padded as if the positions outside the array were FALSE.
  changes <- c(
    bools[[1]]+0,#pretend the outside of the array is "FALSE"
    bools[-1]-bools[-length(bools)],
    (bools[[length(bools)]]*-1)
  )
  #use -1L so the end column stays integer like the start column
  cbind(start=which(changes>0),end=which(changes<0)-1L)
}

#' Draws a plot showing running means of synonymous and nonsense variant scores
#' across the length of the protein, as well as difference track between the two.
#'
#' @param scores data.frame of the scores
#' @param sCond the currently active selective condition
#' @param tp the currently active time point
#' @param params the parameter sheet object
#' @param outDir the output directory
#' @param srOverride whether the single-replicate override is active
#' @return nothing, writes plot to output directory
synNonDelta <- function(scores,sCond,tp,params,outDir,srOverride){
  # Plots position-wise running averages of synonymous and nonsense variant
  # scores across the protein, with region/tile annotation tracks on top.
  #AA position parsed out of the aaChange string
  scores$pos <- as.integer(gsub("\\D+","",scores$aaChange))
  #positional weighted averages of synonymous variants
  #FIXME: Use residual error as weights
  syns <- as.df(with(scores[scores$type=="synonymous" & is.na(scores$filter),],tapply(1:length(pos),pos,function(idxs){
    #without replicates there are no SEs; fall back to uniform weights
    sds <- if (!srOverride) logPhi.se[idxs] else rep(1,length(idxs))
    joint <- join.datapoints(logPhi[idxs],sds,rep(2,length(idxs)))
    p <- unique(pos[idxs])
    c(pos=p,joint)
  })))
  #positional weighted averages of nonsense (stop) variants
  stops <- as.df(with(scores[scores$type=="nonsense" & is.na(scores$filter),],tapply(1:length(pos),pos,function(idxs){
    sds <- if (!srOverride) logPhi.se[idxs] else rep(1,length(idxs))
    joint <- join.datapoints(logPhi[idxs],sds,rep(2,length(idxs)))
    p <- unique(pos[idxs])
    c(pos=p,joint)
  })))
  
  allpos <- as.character(1:params$template$proteinLength)
  #This is a hack to get around a bug in `[.dataframe` which pulls !approximate! matches!!
  #see: https://stackoverflow.com/questions/34233235/r-returning-partial-matching-of-row-names
  #pull: exact rowname lookup, yielding NA rows for positions with no data
  pull <- function(tbl,idxs) as.df(lapply(idxs, function(idx) {
    if (idx %in% rownames(tbl)) {
      tbl[idx,2:4]
    } else setNames(rep(NA,3),colnames(tbl)[2:4])
  }))
  joint <- cbind(pos=as.integer(allpos),syn=pull(syns,allpos),non=pull(stops,allpos))
  #sliding-window (+/- 4 positions) weighted averages along the protein
  runningWeights <- as.df(lapply(joint$pos,function(p) {
    idxs <- which(abs(joint$pos-p) < 5)
    synsubset <- na.omit(joint[idxs,c("syn.mj","syn.sj","syn.dfj")])
    synAv <- if (nrow(synsubset) > 0) {
      join.datapoints(synsubset[,1],synsubset[,2],synsubset[,3])
    } else {
      #no data in window: placeholder so downstream columns still line up
      c(mj=NA,sj=NA,dfj=0)
    }
    nonsubset <- na.omit(joint[idxs,c("non.mj","non.sj","non.dfj")])
    nonAv <- if (nrow(nonsubset) > 0) {
      join.datapoints(nonsubset[,1],nonsubset[,2],nonsubset[,3])
    } else {
      c(mj=NA,sj=NA,dfj=0)
    }
    c(pos=p,synAv=synAv[["mj"]],synSD=synAv[["sj"]],nonAv=nonAv[["mj"]],nonSD=nonAv[["sj"]])
  }))
  #difference track between the two running averages (error propagated in
  #quadrature)
  runningWeights$delta <- with(runningWeights,synAv - nonAv)
  runningWeights$deltaSD <- with(runningWeights,sqrt(synSD^2 + nonSD^2))

  #contiguous stretches with data, so lines/polygons don't bridge gaps
  synRuns <- as.data.frame(findRuns(!is.na(runningWeights$synAv)))
  nonRuns <- as.data.frame(findRuns(!is.na(runningWeights$nonAv)))
  
  outfile <- paste0(outDir,sCond,"_t",tp,"_synNonDiff.pdf")
  tagger <- pdftagger(paste(params$pdftagbase,"; selection condition:",sCond),cpp=1)
  pdf(outfile,11,8.5)
  #top panel: region/tile annotation track; bottom panel: running averages
  layout(rbind(1,2),heights=c(0.3,1))
  op <- par(mar=c(0,4,1,1),oma=c(12,2,12,2))
  with(runningWeights,{
    plot(NA,type="n",xlim=range(pos),ylim=c(0,1),xlab="",ylab="",axes=FALSE)
    #regions in the lower half, tiles in the upper half of the track
    rect(params$regions[,"Start AA"],0,params$regions[,"End AA"],0.49,col="gray80",border=NA)
    text(rowMeans(params$regions[,c("Start AA","End AA")]),0.25,params$regions[,"Region Number"])
    rect(params$tiles[,"Start AA"],0.51,params$tiles[,"End AA"],1,col="gray90",border=NA)
    text(rowMeans(params$tiles[,c("Start AA","End AA"), drop=FALSE]),0.75,params$tiles[,"Tile Number"])
    par(mar=c(5,4,0,1))
    plot(NA,type="n",xlim=range(pos),ylim=range(c(synAv,nonAv),na.rm=TRUE),
         xlab="AA position",ylab=expression("running average"~log(phi))
    )
  })
  #shaded ribbons (+/- half SD) and center lines, drawn per contiguous run
  rowApply(synRuns,function(start,end,...) with(runningWeights[start:end,],{
    polygon(c(pos,rev(pos)),c(synAv+synSD/2,rev(synAv-synSD/2)),col=colAlpha("chartreuse3",0.2),border=NA)
    lines(pos,synAv,col="chartreuse3")
  }))
  rowApply(nonRuns, function(start,end) with(runningWeights[start:end,],{
    polygon(c(pos,rev(pos)),c(nonAv+nonSD/2,rev(nonAv-nonSD/2)),col=colAlpha("firebrick3",0.2),border=NA)
    lines(pos,nonAv,col="firebrick3")
  }))
  #tile and region boundary guides
  abline(v=params$tiles[-1,"Start AA"],col="gray90",lty="dashed")
  abline(v=params$regions[-1,"Start AA"],col="gray80",lty="dashed")

  tagger$cycle()
  par(op)
  invisible(dev.off())

}
# jweile/tileseqMave documentation built on April 5, 2024, 4:51 p.m.