R/legacy.R

# Copyright (C) 2018  Jochen Weile, Roth Lab
#
# This file is part of tileseqMave.
#
# tileseqMave is free software: you can redistribute it and/or modify
# it under the terms of the GNU Affero General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# tileseqMave is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU Affero General Public License for more details.
#
# You should have received a copy of the GNU Affero General Public License
# along with tileseqMave.  If not, see <https://www.gnu.org/licenses/>.

#' converts legacy tileseq count data into the new count file format
#'
#' @param dataDir path to the existing data main directory
#' @param outdir path to the desired output directory
#' @param mut2funcFile path to the mut2func file. Defaults to <dataDir>/mut2func_info.csv
#' @param countFolder path to the mutationCallfile folder. Defaults to <dataDir>/mutationCallfile/
#' @param logger An optional yogilogger object. Defaults to NULL, which just prints to stdout.
#' @return NULL. Results are written to file. 
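#' @examples
#' \dontrun{
#' # Hypothetical invocation; the paths below are placeholders:
#' legacyCountConverter(
#'   dataDir="~/myTileseqProject/",
#'   outdir="~/myTileseqProject/counts/"
#' )
#' }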
#' @export
legacyCountConverter <- function(dataDir, outdir, 
  mut2funcFile=paste0(dataDir,"mut2func_info.csv"), 
  countFolder=paste0(dataDir,"mutationCallfile/"),
  logger=NULL) {
  
  library(hgvsParseR)
  # library(yogilog)
  library(yogitools)

  op <- options(stringsAsFactors=FALSE)

  if (!is.null(logger)) {
    stopifnot(inherits(logger,"yogilogger"))
  }

  logInfo <- function(...) {
    if (!is.null(logger)) {
      logger$info(...)
    } else {
      do.call(cat,c(list(...),"\n"))
    }
  }
  logWarn <- function(...) {
    if (!is.null(logger)) {
      logger$warning(...)
    } else {
      do.call(cat,c("Warning:",list(...),"\n"))
    }
  }
  logErr <- function(...) {
    if (!is.null(logger)) {
      logger$error(...)
    } else {
      do.call(cat,c("ERROR:",list(...),"\n"))
    }
  }


  canRead <- function(filename) file.access(filename,mode=4) == 0
  
  #make sure the data and output directories exist and end with a "/"
  if (!grepl("/$",dataDir)) {
    dataDir <- paste0(dataDir,"/")
  }
  if (!grepl("/$",outdir)) {
    outdir <- paste0(outdir,"/")
  }
  if (!dir.exists(dataDir)) {
    #we don't use the logger here, assuming that whichever script wraps our function
    #catches the exception and writes to the logger (or uses the log error handler)
    stop("Data folder does not exist!")
  }
  if (!dir.exists(outdir)) {
    logWarn("Output folder does not exist, creating it now.")
    dir.create(outdir,recursive=TRUE)
  }

  # mut2funcFile <- paste0(dataDir,"mut2func_info.csv")
  if (!canRead(mut2funcFile)) {
    stop("Unable to find or read 'mut2func_info.csv' file!")
  }
  # seqDepthFile <- paste0(dataDir,"resultfile/sequencingDepth.csv")
  # if (!canRead(seqDepthFile)) {
  #   stop("Unable to find or read 'sequencingDepth.csv' file!")
  # }

  parseReport <- function(reportFile) {
    lines <- scan(reportFile,what="character",sep="\n",quiet=TRUE)
    mreads <- as.numeric(extract.groups(lines[[7]],"is: (.+) million")[1,1])
    return(mreads*1e6)
  }
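  #Note: this assumes that line 7 of each report states the sequencing depth in
  # the form "... is: <N> million" (hypothetical example: "Depth is: 12.5 million")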


  #inverse lookup of sample IDs
  mut2func <- read.csv(mut2funcFile)
  tileRanges <- apply(extract.groups(colnames(mut2func)[-1],"X(\\d+)\\.(\\d+)"),2,as.integer)
  sampleIDs <- unique(sort(do.call(c,mut2func[,-1])))
  condRep <- extract.groups(mut2func[,1],"^(\\w+)(\\d+)$")
  # rownames(condRep) <- mut2func[,1]
  sampleMetadata <- as.df(lapply(sampleIDs,function(sid) {
    idx <- which(mut2func==sid,arr.ind=TRUE)
    reportFile <- paste0(countFolder,sid,"report.txt")
    depth <- parseReport(reportFile)
    list(
      sample=sid, tile=unique(idx[,2]), 
      condition=paste(unique(condRep[idx[,1],1]),collapse=","),
      replicate=paste(unique(condRep[idx[,1],2]),collapse=","),
      timepoint=1,depth=depth
    )
  }))
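
  #The legacy count files are headerless, tab-delimited tables with the columns
  # wt_aa, pos, mut_aa, wt_codon, mut_codon, cpm (cf. libraryQCLegacy below);
  # multi-mutant rows carry pipe-separated values, e.g. "5|10" in the pos column.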

  processCountFile <- function(countFile,depth) {
    counts <- read.delim(countFile,header=FALSE)

    cbuilder <- new.hgvs.builder.c()
    hgvsc <- apply(counts,1,function(row) {
      wt <- strsplit(row[[4]],"\\|")[[1]]
      pos <- as.integer(strsplit(row[[2]],"\\|")[[1]])
      mut <- strsplit(row[[5]],"\\|")[[1]]
      #codon start indices
      cstart <- pos*3-2
      hgvscs <- mapply(function(wt,pos,mut,cstart) {
        #calculate differences between codons
        diffs <- sapply(1:3,function(i)substr(wt,i,i)!=substr(mut,i,i))
        ndiff <- sum(diffs)
        if (ndiff == 1) { #one difference is a SNV
          offset <- which(diffs)
          wtbase <- substr(wt,offset,offset)
          mutbase <- substr(mut,offset,offset)
          snvpos <- cstart+offset-1
          cbuilder$substitution(snvpos,wtbase,mutbase)
        } else if (ndiff > 1) { #multiple differences is a delIns
          cbuilder$delins(cstart,cstart+2,mut)
        } else {
          stop("mutation must differ from wt!")
        }
      },wt,pos,mut,cstart)
      if (length(hgvscs) > 1) {
        do.call(cbuilder$cis,as.list(hgvscs))
      } else {
        hgvscs
      }
    })
    data.frame(HGVS=hgvsc,count=round(counts[,6]*depth/1e6))
  }


  # #Sample: 14
  # #Tile: 3
  # #Condition: nonselect
  # #Replicate: 1
  # #Timepoint: 1
  # #Read-depth: 30156
  # HGVS,count
  # c.21A>C,34
  # c.[13_15delinsTTG;29G>C],10
  # c.[8T>G;18_19del],8
  # c.123_125delinsCCG,42
  # ...

  invisible(apply(sampleMetadata,1,function(row) {
    sid <- as.integer(row[1])

    multi <- processCountFile(paste0(countFolder,sid,"MultipleMut.txt"),as.integer(row["depth"]))
    single <- processCountFile(paste0(countFolder,sid,"AAchange.txt"),as.integer(row["depth"]))
    header <- mapply(
      function(name,value) sprintf("#%s: %s",name,value), 
      name=c("Sample","Tile","Condition","Replicate","Timepoint","Read-depth"),
      value=row
    )

    con <- file(paste0(outdir,"counts_sample",sid,".csv"),open="w")
    writeLines(header,con)
    write.csv(rbind(multi,single),con,row.names=FALSE,quote=FALSE)
    close(con)

  }))

  options(op)

  return(NULL)
}

#' analyze tileseq counts from legacy pipeline
#'
#' This analysis function performs the following steps for each mutagenesis region:
#' 1. Construction of HGVS variant descriptor strings.
#' 2. Collapsing equivalent codons into amino acid change counts.
#' 3. Error regularization at the level of pre- and post-selection counts.
#' 4. Quality-based filtering based on "Song's rule".
#' 5. Fitness score calculation and error propagation.
#' 6. Secondary error regularization at the level of fitness scores.
#' 7. Determination of synonymous and nonsense medians and re-scaling of fitness scores.
#' 8. Flooring of negative scores and adjustment of associated error.
#' 9. Output in MaveDB format.
#' 
#' @param countfile the path to the "rawData.txt" file produced by the legacy pipeline.
#' @param regionfile the path to a tab-delimited file describing the mutagenesis regions. Must contain the columns 
#'  'region', 'start', 'end', 'syn', 'stop', i.e. the region id, the start position, the end position,
#'  and optional overrides for the synonymous and stop medians.
#' @param outdir path to desired output directory
#' @param logger a yogilogger object to be used for logging (or NULL for simple printing)
#' @param inverseAssay a boolean flag to indicate that the experiment was done with an inverse assay
#'       i.e. protein function leading to decreased fitness. Defaults to FALSE.
#' @param pseudoObservations The number of pseudoObservations to use for the Baldi&Long regularization.
#'       Defaults to 2.
#' @param conservativeMode Boolean flag. When turned on, pseudoObservations are not counted towards 
#'       standard error and the first round of regularization uses pessimistic error estimates.
#' @return Nothing. Output is written to various files in the output directory.
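#' @examples
#' \dontrun{
#' # Hypothetical invocation; the file names below are placeholders:
#' analyzeLegacyTileseqCounts(
#'   countfile="rawData.txt",
#'   regionfile="regions.txt",
#'   outdir="results/"
#' )
#' }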
#' @export
analyzeLegacyTileseqCounts <- function(countfile,regionfile,outdir,logger=NULL,
  inverseAssay=FALSE,pseudoObservations=2,conservativeMode=TRUE) {

  library(hgvsParseR)
  # library(yogilog)
  library(yogitools)

  options(stringsAsFactors=FALSE)

  if (!is.null(logger)) {
    stopifnot(inherits(logger,"yogilogger"))
  }

  logInfo <- function(...) {
    if (!is.null(logger)) {
      logger$info(...)
    } else {
      do.call(cat,c(list(...),"\n"))
    }
  }
  logWarn <- function(...) {
    if (!is.null(logger)) {
      logger$warning(...)
    } else {
      do.call(cat,c("Warning:",list(...),"\n"))
    }
  }
  logErr <- function(...) {
    if (!is.null(logger)) {
      logger$error(...)
    } else {
      do.call(cat,c("ERROR:",list(...),"\n"))
    }
  }

  ##############
  # Read and validate input data
  ##############

  canRead <- function(filename) file.access(filename,mode=4) == 0
  stopifnot(
    canRead(countfile),
    canRead(regionfile)
  )

  #parse the input files
  rawCounts <- read.delim(countfile)
  regions <- read.delim(regionfile)

  #check that all required columns exist and contain the correct data types.
  stopifnot(
    c(
      "wt_codon","pos","mut_codon","wt_aa","mut_aa",
      "nonselect1","nonselect2","select1","select2",
      "controlNS1","controlNS2","controlS1","controlS2"
    ) %in% colnames(rawCounts),
    all(apply(regions[,1:3],2,class)=="integer"),
    all(apply(regions[,4:5],2,class) %in% c("integer","numeric","logical")),
    c("region","start","end","syn","stop") %in% colnames(regions)
  )

  #make sure output directory path (outdir) ends with a "/"
  if (!grepl("/$",outdir)) {
    outdir <- paste0(outdir,"/")
  }
  #and if it doesn't exist, create it
  if (!dir.exists(outdir)) {
    dir.create(outdir,recursive=TRUE)
  }


  ##################
  # Build HGVS variant descriptor strings
  ##################

  #for nucleotide level descriptors
  cbuilder <- new.hgvs.builder.c()
  #for protein-level descriptors
  pbuilder <- new.hgvs.builder.p(aacode=3)

  #for nucleotide level descriptors
  hgvsc <- apply(rawCounts,1,function(row) {
    pos <- as.numeric(row["pos"])
    #codon start indices
    cstart <- pos*3-2
    wt <- row["wt_codon"]
    mut <- row["mut_codon"]
    #calculate differences between codons
    diffs <- sapply(1:3,function(i)substr(wt,i,i)!=substr(mut,i,i))
    ndiff <- sum(diffs)
    if (ndiff == 1) { #one difference is a SNV
      offset <- which(diffs)
      wtbase <- substr(wt,offset,offset)
      mutbase <- substr(mut,offset,offset)
      snvpos <- cstart+offset-1
      return(cbuilder$substitution(snvpos,wtbase,mutbase))
    } else if (ndiff > 1) { #multiple differences is a delIns
      return(cbuilder$delins(cstart,cstart+2,mut))
    } else {
      stop("mutation must differ from wt!")
    }
  })
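  #e.g. a codon change GCT>GAT at AA position 5 yields c.14C>A, whereas
  # GCT>AAT (two differing bases) yields c.13_15delinsAAT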

  hgvsp <- apply(rawCounts,1,function(row) {
    pos <- as.numeric(row["pos"])
    wt <- row["wt_aa"]
    mut <- row["mut_aa"]
    if (mut=="_") mut <- "*" #correct stop character
    if (wt == mut) {
      return(pbuilder$synonymous(pos,wt))
    } else {
      return(pbuilder$substitution(pos,wt,mut))
    }
  })

  logInfo(sprintf(
    "Parsed data for %d codon variants (%d protein variants),\n %d (%d) of which are missense.",
    length(hgvsc),length(unique(hgvsp)), 
    sum(rawCounts$annotation == "NONSYN"), length(unique(hgvsp[!grepl("(Ter|=)$",hgvsp)]))
  ))

  #####################
  # Pre-processing and formatting input
  ####################

  #extract and examine condition names and replicates
  conditions <- colnames(rawCounts)[-c(1:6)]
  condStruc <- extract.groups(conditions,"^(\\w+)(\\d+)$")
  condNames <- unique(condStruc[,1])
  repNames <- unique(condStruc[,2])
  condMatrix <- do.call(rbind,lapply(condNames,paste0,repNames))
  dimnames(condMatrix) <- list(condNames,repNames)
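  #e.g. with conditions select/nonselect/controlS/controlNS and two replicates,
  # condMatrix["nonselect",] is c("nonselect1","nonselect2")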

  #TODO: Throw error if conditions are missing

  logInfo(sprintf(
    "Detected %d replicates for %d conditions: %s",
    length(repNames), length(unique(condNames)), paste(condNames,collapse=", ")
  ))


  #apply pre-filter on variants
  rawmsd <- do.call(cbind,lapply(condNames,function(cond) {
    msd <- t(apply(rawCounts[,condMatrix[cond,]],1,function(xs){
      c(mean(xs,na.rm=TRUE), sd(xs,na.rm=TRUE))
    }))
    colnames(msd) <- paste0(cond,c(".mean",".sd"))
    msd
  }))
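  #flagged1 applies "Song's rule" (see the post-regularization filter below) to
  # the raw counts: a variant whose nonselect count is within three SDs of the WT
  # control is likely sequencing error. flagged2 applies the analogous test to
  # the select counts to catch bottlenecking.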
  flagged1 <- with(as.data.frame(rawmsd), {
    controlNS.mean + 3*controlNS.sd >= nonselect.mean
  })
  flagged2 <- with(as.data.frame(rawmsd), {
    controlS.mean + 3*controlS.sd >= select.mean
  })

  logInfo(sprintf(
"Filtering out %d variants (=%.02f%%):
%d (=%.02f%%) due to likely sequencing error.
%d (=%.02f%%) due to likely bottlenecking.",
    sum(flagged1|flagged2), 100*sum(flagged1|flagged2)/length(flagged1),
    sum(flagged1), 100*sum(flagged1)/length(flagged1),
    sum(flagged2 & !flagged1), 100*sum(flagged2 & !flagged1)/length(flagged2)
  ))
  rawCountsFiltered <- rawCounts[!(flagged1|flagged2),]
  hgvsc <- hgvsc[!(flagged1|flagged2)]
  hgvsp <- hgvsp[!(flagged1|flagged2)]

  logInfo(sprintf(
    "Data remains for %d codon variants (%d protein variants),\n %d (%d) of which are missense.",
    length(hgvsc),length(unique(hgvsp)), 
    sum(rawCountsFiltered$annotation == "NONSYN"), length(unique(hgvsp[!grepl("(Ter|=)$",hgvsp)]))
  ))

  # logInfo(sprintf(
  #   "Data remains for for %d variants covering %d amino acid changes",
  #   length(hgvsc),length(unique(hgvsp))
  # ))

  

  #collapse codons into unique AA changes
  logInfo("Collapsing variants by outcome...")
  combiCounts <- as.df(tapply(1:nrow(rawCountsFiltered),hgvsp,function(is) {
    # mut <- unique(hgvsp[is])
    hgvsp <- hgvsp[is[[1]]]
    hgvsc <- paste(unique(hgvsc[is]),collapse=" ")
    c(list(hgvsp=hgvsp,hgvsc=hgvsc),colSums(rawCountsFiltered[is,conditions],na.rm=TRUE))
  },simplify=FALSE))

  logInfo("Parsing variant strings...")
  combiCountMuts <- parseHGVS(combiCounts$hgvsp)
  combiCountMuts$type[which(combiCountMuts$variant=="Ter")] <- "nonsense"
  logInfo("Parsing complete.")



  ###############################
  # Plot replicate correlations
  ###############################

  logInfo("Plotting replicate correlations...")

  panel.cor <- function(x, y,...){
    usr <- par("usr"); on.exit(par(usr)); par(usr=c(0,1,0,1))
    r <- cor(fin(cbind(x,y)))[1,2]
    txt <- sprintf("R = %.02f",r)
    # cex.cor <- 0.8/strwidth(txt)
    text(0.5, 0.5, txt)
  }

  labels <- paste0("rep.",1:ncol(condMatrix)) 
  imgSize <- max(4,ncol(condMatrix))

  pdfFile <- paste0(outdir,"replicateCorrelations_nonselect.pdf")
  pdf(pdfFile,imgSize,imgSize)
  # Plot non-select
  if (ncol(condMatrix) > 2) {
    pairs(
      combiCounts[,condMatrix["nonselect",]],
      lower.panel=panel.cor,pch=".",labels=labels,
      main="Non-select counts"
    )
  } else {
    plotTitle <- sprintf(
      "Non-select counts R = %.03f",
      cor(fin(combiCounts[,condMatrix["nonselect",]]))[1,2]
    )
    plot(
      combiCounts[,condMatrix["nonselect",]],
      main=plotTitle
    )
  }
  invisible(dev.off())

  # Plot phi after collapsing codons
  lfcRepsCombi <- log10((combiCounts[,condMatrix["select",]] - combiCounts[,condMatrix["controlS",]]) /
    (combiCounts[,condMatrix["nonselect",]] - combiCounts[,condMatrix["controlNS",]]))

  pdfFile <- paste0(outdir,"replicateCorrelations_logPhi.pdf")
  pdf(pdfFile,imgSize,imgSize)
  if (ncol(condMatrix) > 2) {
    pairs(
      lfcRepsCombi,pch=".",lower.panel=panel.cor,labels=labels,
      main="Select/Non-select Log-ratios"
    )
  } else {
    plotTitle <- sprintf(
      "Select/Non-select Log-ratios R = %.03f",
      cor(fin(lfcRepsCombi))[1,2]
    )
    plot(
      lfcRepsCombi, main=plotTitle,
      xlab=expression(log(phi[1])), ylab=expression(log(phi[2]))
    )
  }
  
  invisible(dev.off())

  ##############
  # Iterate over regions
  ##############
  for (region.i in 1:nrow(regions)) {

    regionStart <- regions[region.i,"start"]
    regionEnd <- regions[region.i,"end"]
    region.rows <- with(combiCountMuts,which(start >= regionStart & start <= regionEnd))
    region.syn <- regions[region.i,"syn"]
    region.stop <- regions[region.i,"stop"]

    if (length(region.rows) < 1) {
      logWarn(sprintf("\n\nNo variants found for region %d! Skipping...",region.i))
      next
    } else {
      logInfo(sprintf("\n\nProcessing region %d. (pos. %d-%d)",region.i,regionStart,regionEnd))
    }

    localCombiCounts <- combiCounts[region.rows,]
    localCombiCountMuts <- combiCountMuts[region.rows,]

    #########
    # Regularize SD of individual counts
    #########

    #Baldi & Long's formula
    bnl <- function(pseudo.n,n,model.sd,empiric.sd) {
      sqrt((pseudo.n * model.sd^2 + (n - 1) * empiric.sd^2)/(pseudo.n + n - 2))
    }
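    #e.g. bnl(2, 2, 0.1, 0.3) = sqrt((2*0.1^2 + 1*0.3^2)/2) ~ 0.23, i.e. the
    # empirical SD of 0.3 is pulled towards the model SD of 0.1, weighted by
    # the number of pseudo-observations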

    #a helper function to construct an appropriately sized useful pseudocount 
    # pseudocount <- function(xs) if (all(xs==0)) 1e-4 else min(xs[xs!=0],na.rm=TRUE)/10
    ps <- 1e-4

    #calculate replicate means and coefficient of variation (CV) for each condition
    # (we use CV here instead of stdev, because that's what anti-correlates with read depth
    #  therefore, this will be the subject of our regression below)
    mcv <- do.call(cbind,lapply(condNames,function(cond) {
      mcv <- t(apply(localCombiCounts[,condMatrix[cond,]],1,function(xs){
        m <- mean(xs,na.rm=TRUE)
        # ps <- pseudocount(m)
        m <- m+ps
        c(m, (ps+sd(xs,na.rm=TRUE))/m)
      }))
      colnames(mcv) <- paste0(cond,c(".mean",".cv"))
      mcv
    }))

    #draw scatterplots for means vs CV
    pdfFile <- paste0(outdir,"region",region.i,"_regularizationInput.pdf")
    pdf(pdfFile,6,9)
    op <- par(mfrow=c(3,2))
    with(as.data.frame(mcv),{
      hist(log10(select.mean),breaks=100,col="gray",border=NA,main="")
      hist(log10(nonselect.mean),breaks=100,col="gray",border=NA,main="")
      plot(nonselect.mean,nonselect.cv,log="x",pch=".")
      plot(nonselect.mean,select.cv,log="x",pch=".")
      plot(nonselect.mean,controlNS.cv,log="x",pch=".")
      plot(controlNS.mean,controlNS.cv,log="x",pch=".")
    })
    par(op)
    invisible(dev.off())

    exportCoefficients <- function(z) {
      coef <- coefficients(z)
      coef <- coef[order(abs(coef),decreasing=TRUE)]
      paste(mapply(function(x,y)sprintf("%s = %.02f",x,y),x=names(coef),y=coef),collapse="; ")
    }

    #build regression input matrix by transforming to log space (pseudocounts were already added above)
    model.in <- log10(mcv)
    model.in <- as.data.frame(cbind(
      model.in,
      log.ratio=model.in[,"select.mean"]-model.in[,"nonselect.mean"]
    ))

    #for each condition, perform the regularization
    regul <- do.call(cbind,lapply(condNames,function(cond) {
      #construct the regression formula
      input.column <- paste0(cond,".cv")
      mean.column <- paste0(cond,".mean")
      #extract empirical cv
      empiric.cv <- mcv[,input.column]
      input.column.id <- which(colnames(model.in)==input.column)
      regr.formula <- as.formula(paste0(input.column,"~."))
      #identify duplicated input columns, so they can be excluded
      dupli <- setdiff(which(apply(model.in,2,function(x) {
        all(x == model.in[,input.column])
      })),input.column.id)
      .model.in <- if (length(dupli) > 0) model.in[,-dupli] else model.in
      #perform regression
      z <- lm(regr.formula,data=.model.in)
      logInfo(paste("Regression coefficients for",cond,":",exportCoefficients(z)))
      #calculate prior (log(cv)) from regressor
      prior.lcv <- predict(z)
      logInfo(sprintf("Prior PCC=%.02f",cor(10^prior.lcv,empiric.cv)))
      #calculate bayesian regularization
      observations <- length(repNames)
      bayes.cv <- bnl(pseudoObservations,observations,10^prior.lcv,empiric.cv)
      #calculate pessimistic regularization
      pessim.cv <- mapply(max,10^prior.lcv,empiric.cv)
      #calculate the 1% quantile of the CV (excluding pure-pseudocount entries) as a minimum floor
      minCut <- quantile(empiric.cv[empiric.cv > 1e-5],0.01)
      #apply that minimum flooring to the pessimistic value
      pessim.cv <- sapply(pessim.cv,function(x) if (x < minCut) minCut else x)
      #return the result
      means <- mcv[,mean.column]
      out <- cbind(
        10^prior.lcv, (10^prior.lcv)*means,
        bayes.cv, bayes.cv*means,
        pessim.cv, pessim.cv*means
      )
      colnames(out) <- paste0(cond,c(
        ".prior.cv",".prior.sd",
        ".bayes.cv",".bayes.sd",
        ".pessim.cv",".pessim.sd"
      ))
      return(out)
    }))

    combiCountsReg <- cbind(localCombiCounts,mcv,regul)


    ###################
    #Plot regularization results
    ###################
    pdfFile <- paste0(outdir,"region",region.i,"_regularizationRound1.pdf")
    pdf(pdfFile,6,12)
    op <- par(mfrow=c(4,2))
    for (cond in condNames) {
      topoScatter(
        log10(mcv[,paste0(cond,".cv")]),
        log10(regul[,paste0(cond,".prior.cv")]),
        resolution=80,
        xlab=expression("Empiric"~log[10](sigma/mu)),
        ylab=expression("Prior"~log[10](sigma/mu)),
        main=cond
      )
      abline(0,1,col="gray",lty="dashed")
      topoScatter(
        log10(mcv[,paste0(cond,".cv")]),
        log10(regul[,paste0(cond,".bayes.cv")]),
        resolution=80,
        xlab=expression("Empiric"~log[10](sigma/mu)),
        ylab=expression("Regularized"~log[10](sigma/mu))
      )
      abline(0,1,col="gray",lty="dashed")
    }
    par(op)
    invisible(dev.off())

    logInfo("First round of regularization complete.")


    #####################
    # Quality checks
    ####################

    #TODO: plot spatial distribution and histogram of sequencing error
    #


    #############
    #Filter based on high sequencing error
  # In this version, we just try to catch any stragglers that weren't identified before regularization
    ############
    #Song's rule: Filter out anything where the nonselect count is smaller than the WT control plus three SDs.
    #(That is, where the nonselect count could be explained by sequencing error)
    if (conservativeMode) {
      flagged <- with(combiCountsReg, controlNS.mean + 3*controlNS.pessim.sd >= nonselect.mean)
    } else {
      flagged <- with(combiCountsReg, controlNS.mean + 3*controlNS.bayes.sd >= nonselect.mean)
    }
    logInfo(sprintf(
      "Post-regularization: Filtering out an additional %d variants (=%.02f%%) due to likely sequencing error.",
      sum(flagged), 100*sum(flagged)/nrow(combiCountsReg)
    ))
    combiCountsFiltered <- combiCountsReg[!flagged,]


    ############
    #Calculate raw scores
    ############

    #Helper function: Taylor approximation for variance of ratio
    approx.ratio.var <- function(mr,ms,vr,vs,covrs) {
      (mr^2)/(ms^2) * (vr/(mr^2) - 2*(covrs/(mr*ms)) + (vs/ms^2))
    }
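    #This is the first-order (delta-method) Taylor expansion of the variance of a
    # ratio around its means; with noisy denominators it can turn out negative,
    # which is why the result is floored to phivar.ps below.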

    #pseudocounts
    c.ps <- 1e-4
    phivar.ps <- 1e-4

    rawScores <- as.df(lapply(1:nrow(combiCountsFiltered),function(i) {
      #mean numerator
      mnum <- combiCountsFiltered[i,"select.mean"]-combiCountsFiltered[i,"controlS.mean"]
      if (mnum < c.ps) mnum <- c.ps #apply flooring, so the wt control can't result in negative counts
      #mean denominator
      mden <- combiCountsFiltered[i,"nonselect.mean"]-combiCountsFiltered[i,"controlNS.mean"]
      #variances
      if (conservativeMode) {
        #variance numerator
        vnum <- combiCountsFiltered[i,"select.pessim.sd"]^2+combiCountsFiltered[i,"controlS.pessim.sd"]^2
        #variance denominator
        vden <- combiCountsFiltered[i,"nonselect.pessim.sd"]^2+combiCountsFiltered[i,"controlNS.pessim.sd"]^2
      } else {
        vnum <- combiCountsFiltered[i,"select.bayes.sd"]^2+combiCountsFiltered[i,"controlS.bayes.sd"]^2
        vden <- combiCountsFiltered[i,"nonselect.bayes.sd"]^2+combiCountsFiltered[i,"controlNS.bayes.sd"]^2
      }
      #covariance of numerator and denominator
      covnumden <- cov(
        unlist(combiCountsFiltered[i,condMatrix["select",]])
        -unlist(combiCountsFiltered[i,condMatrix["controlS",]]),
        unlist(combiCountsFiltered[i,condMatrix["nonselect",]])
        -unlist(combiCountsFiltered[i,condMatrix["controlNS",]])
      )
      #Use helper function to estimate variance for phi
      phivar <- approx.ratio.var(mnum,mden,vnum,vden,covnumden)
      #As this is based on a Taylor approximation, we can sometimes get freaky negative values
      if (phivar < phivar.ps) phivar <- phivar.ps

      phi <- mnum/mden
      phisd <- sqrt(phivar)
      list(
        hgvsp=combiCountsFiltered[i,"hgvsp"],
        hgvsc=combiCountsFiltered[i,"hgvsc"],
        phiPrime=min(combiCountsFiltered[i,condMatrix["nonselect",]]),
        mean.c=mnum,
        mean.phi=phi,
        sd.phi=phisd,
        mean.lphi=log10(phi),
        sd.lphi=abs(phisd/(log(10)*phi))
      )
    }))

    #Filter out variants affected by selection bottlenecking
    flagged <- rawScores$mean.c <= c.ps
    logInfo(sprintf(
      "Filtering out %d variants (=%.02f%%) due to likely selection bottlenecking.",
      sum(flagged), 100*sum(flagged)/nrow(rawScores)
    ))
    rawScores <- rawScores[!flagged,]


    logInfo(sprintf(
      "Data remains for %d protein-level variants, %d of which are missense.",
      nrow(rawScores),sum(!grepl("(Ter|=)$",rawScores$hgvsp))
    ))


    ##############
    # 2nd round of regularization: This time for SD of scores
    ##############

    # #regression input matrix
    # splinemat <- with(rawScores,data.frame(
    #   logsd=log10(sd.lphi),
    #   logminbc=log10(phiPrime+1),
    #   lphi=mean.lphi
    # ))
    # #run regression
    # z <- lm(logsd ~.,data=splinemat)
    #calculate prior
    # priorSD <- 10^predict(z)

    logInfo("Starting second round of regularization.")

    #regression input matrix
    splinemat <- with(rawScores,data.frame(
      logcv=log10(sd.phi/mean.phi),
      logminbc=log10(phiPrime+.1)#,
      # lphi=mean.lphi
    ))
    #run regression
    z <- lm(logcv ~.,data=splinemat)
    #calculate prior
    priorSD <- 10^predict(z)*rawScores$mean.phi

    #apply regularization
    observations <- length(repNames)
    bayesSD <- bnl(pseudoObservations,observations,priorSD,rawScores$sd.phi)
    rawScores$bsd.phi=bayesSD
    rawScores$bsd.lphi=with(rawScores,abs(bsd.phi/(log(10)*mean.phi)))
    rawScores$df=if(conservativeMode) observations else observations+pseudoObservations

    #####################
    # Filter out broken values if any
    #####################
    broken <- which(is.na(rawScores$mean.lphi) | is.infinite(rawScores$mean.lphi))
    if (length(broken) > 0) {
      logWarn(sprintf("Values for %d variants could not be calculated and were removed!",
        length(broken)
      ))
      rawScores <- rawScores[-broken,]
    }

    ################
    # Plot results of regularization
    ################
    pdfFile <- paste0(outdir,"region",region.i,"_regularizationRound2.pdf")
    pdf(pdfFile,7,7)
    op <- par(mfrow=c(2,2))
    #Plot phiPrime vs SD
    with(rawScores[rawScores$sd.lphi < 1,],topoScatter(phiPrime+1,sd.lphi,log="x",maxFreq=35,thresh=3,
      resolution=40, xlab="Non-select count (per M.)", ylab=expression(sigma)
    ))
    #Plot score vs SD
    with(rawScores,topoScatter(mean.lphi,sd.lphi,log="y",pch=20,resolution=40, 
      xlab="Fitness score",ylab=expression(sigma),maxFreq=35,thresh=3
    ))
    #Plot phiPrime vs regSD
    if (sum(rawScores$bsd.lphi < 1) > 1) {
      with(rawScores[rawScores$bsd.lphi < 1,],topoScatter(phiPrime+1,bsd.lphi,log="x",maxFreq=35,thresh=3,
        resolution=40, xlab="Non-select count (per M.)", 
        ylab=expression("Bayesian Regularized"~sigma)
      ))
    }
    # abline(0,1,col="gray",lty="dashed")
    #Plot Empiric vs regularized
    with(rawScores,topoScatter(sd.lphi,bsd.lphi,resolution=60,maxFreq=30,log="xy",
      xlab=expression("Empiric"~sigma),ylab=expression("Bayesian Regularized"~sigma)
    ))
    abline(0,1,col="gray",lty="dashed")
    par(op)
    invisible(dev.off())


    #################
    # If the functional assay is based on inverse fitness, invert the scores
    #################
    if (inverseAssay) {
      rawScores$mean.lphi <- -rawScores$mean.lphi
      #stdev remains unchanged as sqrt(((-1)^2)) = 1
    }

    #################
    # Estimate Modes of synonymous and stop
    #################

    sdCutoff <- 0.3

    ####################
    #TODO: Require minimum amount of filter passes rather than just 1
    #TODO: Try gaussian mixture models with two underlying distributions?
    #########################
    #if we can't find any syn/stop below the cutoff, increase the cutoff to 1
    if (with(rawScores,!any(grepl("Ter$",hgvsp) & bsd.lphi < sdCutoff) ||
      !any(grepl("=$",hgvsp) & bsd.lphi < sdCutoff) )) {
      sdCutoff <- 1
      #if we still can't find any below the new cutoff, get rid of it altogether
      if (with(rawScores,!any(grepl("Ter$",hgvsp) & bsd.lphi < sdCutoff) ||
        !any(grepl("=$",hgvsp) & bsd.lphi < sdCutoff) )) {
        sdCutoff <- Inf
      }
    }

    modes <- with(rawScores,{ 
      stops <- mean.lphi[which(grepl("Ter$",hgvsp) & bsd.lphi < sdCutoff)]
      syns <- mean.lphi[which(grepl("=$",hgvsp) & bsd.lphi < sdCutoff)]
      c(stop=median(stops,na.rm=TRUE),syn=median(syns,na.rm=TRUE))
    })


    #################
    #Plot Syn vs stop
    #################

    plotStopSyn <- function(data,title,modes) {
      with(data,{
        stop.is <- which(grepl("Ter$",hgvsp))
        syn.is <- which(grepl("=$",hgvsp))
        miss.is <- setdiff(1:nrow(data),c(stop.is,syn.is))
        br <- seq(floor(min(mean.lphi)),ceiling(max(mean.lphi)),.1)

        hist(mean.lphi[miss.is],col="gray",breaks=br,xlab=expression(log(phi)),border=NA,main=title)
        hist(mean.lphi[stop.is],add=TRUE,col=colAlpha("firebrick3",.5),breaks=br)
        hist(mean.lphi[syn.is],add=TRUE,col=colAlpha("darkolivegreen3",.5),breaks=br)
        abline(v=modes,col=c("firebrick3","darkolivegreen3"),lty="dashed")
      })
    }

    pdfFile <- paste0(outdir,"scalingQC_region",region.i,".pdf")
    pdf(pdfFile)
    op <- par(mfrow=c(2,1))
    plotStopSyn(rawScores,"Unfiltered",modes)
    legend("topright",c("missense","synonymous","stop"),fill=c("gray","darkolivegreen3","firebrick3"))
    if (any(rawScores$bsd.lphi < sdCutoff)) {
      plotStopSyn(rawScores[rawScores$bsd.lphi < sdCutoff,],
        bquote(sigma["regularized"] < .(sdCutoff)),
        modes
      )
    }
    par(op)
    invisible(dev.off())


    #################
    # Use syn/stop modes to scale scores
    #################

    logInfo(sprintf(
      "Scaling to synonymous (log(phi)=%.02f) and nonsense (log(phi)=%.02f) medians.",modes[["syn"]],modes[["stop"]]
    ))

    #if manual overrides for the synonymous and stop modes were provided, use them
    if (!is.na(region.syn)) {
      logInfo(sprintf(
        "Using manual override (=%.02f) for synonymous mode instead of automatically determined value (=%.02f).",region.syn,modes[["syn"]]
      ))
      modes[["syn"]] <- region.syn
    }
    if (!is.na(region.stop)) {
      logInfo(sprintf(
        "Using manual override (=%.02f) for stop mode instead of automatically determined value (=%.02f).",region.stop,modes[["stop"]]
      ))
      modes[["stop"]] <- region.stop
    }

    if (modes[["stop"]] >= modes[["syn"]]) {
      logErr("Stop mode is not below synonymous mode! Cannot normalize scores!")
      next
    }

    #apply the scaling: nonsense-like scores map to 0 and synonymous-like scores to 1
    denom <- modes[["syn"]]-modes[["stop"]]
    scoreMat <- with(rawScores,{
      sd <- bsd.lphi/denom
      score <- (mean.lphi - modes[["stop"]])/denom
      cbind(
        score=score,sd=sd,se=sd/sqrt(df)
      )
    })
    scores <- cbind(rawScores,scoreMat)


    ##################
    # Floor negatives and fix their excessive variances
    ##################

    if (!inverseAssay) {
      #the target null-like score towards which we will shift these values
      targetScore <- 0
      #the quantile for which we want to keep the p-value fixed
      quantile <- 1
      #the row numbers containing the cases to be fixed
      toFix <- which(scores$score < targetScore)
    } else {
      #if we're dealing with an inverse assay, we have to apply a ceiling instead of flooring
      #the target functional (but dead) score towards which we will shift these values
      targetScore <- 1
      #the quantile for which we want to keep the p-value fixed
      quantile <- 0
      #the row numbers containing the cases to be fixed
      toFix <- which(scores$score > targetScore)
    }
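    #To keep the tail probability at the `quantile` point fixed while shifting
    # the mean to targetScore, we preserve its z-score:
    # (quantile-score)/sd = (quantile-targetScore)/sd' , which solves to
    # sd' = sd*(quantile-targetScore)/(quantile-score)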
    #the equivalent sds of a normal distribution with the target mean based on the above area
    equivalent.sds <- with(scores[toFix,], sd*(quantile-targetScore)/(quantile-score))
    #apply the fixed values to the table
    scores$score.unfloored <- scores$score
    scores$sd.unfloored <- scores$sd
    scores$se.unfloored <- scores$se
    scores$score[toFix] <- targetScore
    scores$sd[toFix] <- equivalent.sds
    scores$se[toFix] <- equivalent.sds/sqrt(scores$df[toFix])
    
    logInfo(sprintf(
      "Flooring adjusted the values of %d variant scores",length(toFix)
    ))


    ##############################
    # Draw a histogram of standard errors in the dataset
    #############################
    pdfFile <- paste0(outdir,"errorProfile_region",region.i,".pdf")
    pdf(pdfFile,5,5)
    hist(
      log10(scores$se),col="gray",border=NA,breaks=50,
      main="Standard Error distribution",
      xlab=expression(log[10](sigma[bar(x)]))
    )
    dev.off()

    
    #################
    # Write output to file
    #################

    #detailed output of all intermediate results
    outfile <- paste0(outdir,"detailed_scores_region",region.i,".csv")
    write.csv(scores,outfile,row.names=FALSE)

    #protein-level MaveDB output
    mavedbProtScores <- scores[,c("hgvsp","score","sd","se")]
    colnames(mavedbProtScores) <- c("hgvs_pro","score","sd","se")
    outfile <- paste0(outdir,"mavedb_scores_perAA_region",region.i,".csv")
    write.csv(mavedbProtScores,outfile,row.names=FALSE)

    logInfo(sprintf(
      "Exported data for %d protein-level variants, %d of which are missense.",
      nrow(scores),sum(!grepl("(Ter|=)$",scores$hgvsp))
    ))

    #nucleotide-level MaveDB output
    mavedbNuclScores <- do.call(rbind,lapply(1:nrow(scores),function(i) {
      hgvsc <- strsplit(scores[i,"hgvsc"]," ")[[1]]
      data.frame(
        hgvs_nt=hgvsc,
        hgvs_pro = scores[i,"hgvsp"],
        score = scores[i,"score"],
        sd = scores[i,"sd"],
        se = scores[i,"se"]
      )
    }))
    outfile <- paste0(outdir,"mavedb_scores_perNt_region",region.i,".csv")
    write.csv(mavedbNuclScores,outfile,row.names=FALSE)

    logInfo(sprintf(
      "Exported data for %d codon-level variants, %d of which are missense.",
      nrow(mavedbNuclScores),sum(!grepl("(Ter|=)$",mavedbNuclScores$hgvs_pro))
    ))


  }

  ###################
  # Join the per-region results into single files
  ###################

  joinFiles <- function(filenameBase) {
    joined <- do.call(rbind,lapply(
      paste0(outdir,filenameBase,"_region",regions$region,".csv"),
      function(rfile) {
        if (file.exists(rfile)) {
          return(read.csv(rfile))
        } else {
          return(NULL)
        }
      }
    ))
    outfile <- paste0(outdir,filenameBase,".csv")
    write.csv(joined,outfile,row.names=FALSE)
  }

  joinFiles("detailed_scores")
  joinFiles("mavedb_scores_perNt")
  joinFiles("mavedb_scores_perAA")

}



#' QC plots for library coverage
#' @param dataDir path to the existing data main directory
#' @param outdir path to the output directory
#' @param mut2funcFile path to the mut2func file. Defaults to <dataDir>/mut2func_info.csv
#' @param logger optional yogilogger object. Defaults to NULL, in which case messages will go to stdout.
#' @param mc.cores number of CPU cores to use in parallel (currently unused, as the parallel code path is commented out)
#' @return NULL. Results are written to file.
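#' @examples
#' \dontrun{
#' # Hypothetical invocation; the paths below are placeholders:
#' libraryQCLegacy(
#'   dataDir="~/myTileseqProject/",
#'   outdir="~/myTileseqProject/libraryQC/"
#' )
#' }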
#' @export
libraryQCLegacy <- function(dataDir,outdir,
  mut2funcFile=paste0(dataDir,"mut2func_info.csv"),
  logger=NULL,mc.cores=8) {

  # library(hgvsParseR)
  # library(yogilog)
  library(yogitools)
  library(hash)
  library(pbmcapply)

  options(stringsAsFactors=FALSE)

  if (!is.null(logger)) {
    stopifnot(inherits(logger,"yogilogger"))
  }

  logInfo <- function(...) {
    if (!is.null(logger)) {
      logger$info(...)
    } else {
      do.call(cat,c(list(...),"\n"))
    }
  }
  logWarn <- function(...) {
    if (!is.null(logger)) {
      logger$warning(...)
    } else {
      do.call(cat,c("Warning:",list(...),"\n"))
    }
  }

  ##############
  # Read and validate input data
  ##############

  canRead <- function(filename) file.access(filename,mode=4) == 0
  
  #make sure the data and output directories exist and end with a "/"
  if (!grepl("/$",dataDir)) {
    dataDir <- paste0(dataDir,"/")
  }
  if (!grepl("/$",outdir)) {
    outdir <- paste0(outdir,"/")
  }
  if (!dir.exists(dataDir)) {
    #we don't use the logger here, assuming that whichever script wraps our function
    #catches the exception and writes to the logger (or uses the log error handler)
    stop("Data folder does not exist!")
  }
  if (!dir.exists(outdir)) {
    logWarn("Output folder does not exist, creating it now.")
    dir.create(outdir,recursive=TRUE)
  }

  # mut2funcFile <- paste0(dataDir,"mut2func_info.csv")
  if (!canRead(mut2funcFile)) {
    stop("Unable to find or read 'mut2func_info.csv' file!")
  }
  # seqDepthFile <- paste0(dataDir,"resultfile/sequencingDepth.csv")
  # if (!canRead(seqDepthFile)) {
  #   stop("Unable to find or read 'sequencingDepth.csv' file!")
  # }


  mut2func <- read.csv(mut2funcFile)
  tileRanges <- apply(extract.groups(colnames(mut2func)[-1],"X(\\d+)\\.(\\d+)"),2,as.integer)


  #Interpret sequencing depth file and obtain sample information
  # seqDepthTable <- read.csv(seqDepthFile)
  # rxGroups <- extract.groups(seqDepthTable$SampleID,"SampleID(\\d+)_(\\w+)(\\d+)")
  # seqDepthTable$sample <- as.integer(rxGroups[,1])
  # seqDepthTable$condition <- rxGroups[,2]
  # seqDepthTable$replicate <- as.integer(rxGroups[,3])

  # #Isolate first replicate of nonselect samples
  # ns1.rows <- with(seqDepthTable,which(condition=="NS" & replicate == 1))
  # ns1.depth <- seqDepthTable[ns1.rows,]

  nsSamples <- unlist(mut2func[which(mut2func$tiles=="nonselect1"),-1])
  ns1.depth <- as.df(lapply(nsSamples, function(sid) {
    reportFile <- paste0(dataDir,"mutationCallfile/",sid,"report.txt")
    reportLines <- scan(reportFile,what="character",sep="\n",quiet=TRUE)
    depth <- as.numeric(extract.groups(reportLines[grep("sequencing depth",reportLines)],"is: (.+) million")[,1])
    list(sample=sid,depth=depth)
  }))


  logInfo("Compiling census...")

  #iterate over tiles/samples
  # censuses <- pbmclapply(1:nrow(ns1.depth), function(depth.row) {
  censuses <- lapply(1:nrow(ns1.depth), function(depth.row) {
    sample.id <- ns1.depth[depth.row,"sample"]
    sample.depth <- ns1.depth[depth.row,2]


    #read the file for multi-mutant CPMs (counts per million reads)
    multiCounts <- read.delim(
      paste0(dataDir,"mutationCallfile/",sample.id,"MultipleMut.txt"),
      header=FALSE
    )
    colnames(multiCounts) <- c("wtaa","pos","mutaa","wtcodon","mutcodon","cpm")

    wtaas <- strsplit(multiCounts$wtaa,"\\|")
    mutaas <- strsplit(multiCounts$mutaa,"\\|")
    positions <- strsplit(multiCounts$pos,"\\|")
    #create ids for multimutants
    multiVars <- mapply(function(w,p,m) paste0(w,p,m), wtaas, positions, mutaas, SIMPLIFY=FALSE)
    #number of aa-changes per multimutant
    nmuts <- sapply(multiVars,length)
    maxMuts <- max(nmuts)

    if (min(nmuts) < 2) {
      logWarn("MultiMuts contains single mutations!")
    }

    #create a census of the number of co-occurring mutations
    multiCensus <- tapply(multiCounts$cpm,nmuts,sum)
    #make sure no number is skipped!
    multiCensus <- multiCensus[as.character(2:maxMuts)]
    names(multiCensus) <- as.character(2:maxMuts)
    if (any(is.na(multiCensus))) {
      multiCensus[which(is.na(multiCensus))] <- 0
    }

    #infer expected single-mutant CPMS from multimutant CPMs
    multiTotals <- hash()
    #also infer complexity underpinning each aa-change (i.e. the number of unique multimutants that contain it)
    varComplexity <- hash()
    for (i in 1:length(multiVars)) {
      for (variant in multiVars[[i]]) {
        if (hash::has.key(variant,multiTotals)) {
          multiTotals[[variant]] <- multiTotals[[variant]] + multiCounts[i,"cpm"]
          varComplexity[[variant]] <- varComplexity[[variant]] + 1
        } else {
          multiTotals[[variant]] <- multiCounts[i,"cpm"]
          varComplexity[[variant]] <- 1
        }
      }
    }

    #Next, read the AAchange file. Note that these are not true single-mutants, but rather
    #the marginal counts for each amino acid change
    marginalCounts <- read.delim(
      paste0(dataDir,"mutationCallfile/",sample.id,"AAchange.txt"),
      header=FALSE
    )
    colnames(marginalCounts) <- c("wtaa","pos","mutaa","wtcodon","mutcodon","cpm")

    #collapse by aa-change
    marginalVariants <- with(marginalCounts,mapply(function(w,p,m) paste0(w,p,m), wtaa, pos, mutaa))
    marginalTotals <- hash()
    for (i in 1:length(marginalVariants)) {
      v <- marginalVariants[[i]]
      if (hash::has.key(v,marginalTotals)) {
        marginalTotals[[v]] <- marginalTotals[[v]] + marginalCounts[[i,"cpm"]]
      } else {
        marginalTotals[[v]] <- marginalCounts[[i,"cpm"]]
      }
    }

    #list of unique aa changes
    uniqueVars <- keys(marginalTotals)

    #calculate the true single mutant CPMs by subtracting the multi-mutant marginal CPMs
    # from the total marginal CPMs
    singletonCounts <- sapply(uniqueVars, function(uvar) {
      if (has.key(uvar,multiTotals)) {
        marginalTotals[[uvar]] - multiTotals[[uvar]]
      } else {
        marginalTotals[[uvar]] 
      }
    })


    #Finally, read the deletion and insertion CPMs
    delFile <- paste0(dataDir,"mutationCallfile/",sample.id,"deletion.txt")
    if (file.size(delFile) > 0) {
      delCounts <- read.delim(delFile,header=FALSE)
    } else delCounts <- t(rep(0,4))
    insFile <- paste0(dataDir,"mutationCallfile/",sample.id,"insertion.txt")
    if (file.size(insFile) > 0) {
      insCounts <- read.delim(insFile,header=FALSE)
    } else insCounts <- t(rep(0,4))
    indelCPM <- sum(delCounts[,2])+sum(insCounts[,4])


    #Now, having compiled the required numbers, we can combine them into a census table.
    #Unfortunately, we have to treat the indels separately, as we have no information on the 
    # cross-talk between indels and missense/nonsense variants.
    census <- c(
      indel = indelCPM,
      # WT = 1e6 - (indelCPM + sum(marginalCounts$cpm)),
      WT = 1e6 - (sum(marginalCounts$cpm)),
      single = sum(singletonCounts),
      multiCensus
    ) * 100 / 1e6


    #Fit a poisson-distribution to the census
    lambdas <- seq(0.01,4,0.01)
    dists <- sapply(lambdas, function(lambda) {
      sum(abs(census[-1] - dpois(0:maxMuts,lambda)*100))
    })
    # cat(min(dists),"\n")
    best.lambda <- lambdas[[which.min(dists)]]
    

    #Also generate a table that compares variant CPMs against variant complexity
    countVComplexity <- data.frame(
      cpm=singletonCounts[hash::keys(varComplexity)],
      complexity=hash::values(varComplexity)
    )

    return(list(census=census,lambda=best.lambda,complexity=countVComplexity))

  # },mc.cores=mc.cores)
  })


  censusRows <- lapply(censuses,`[[`,"census")
  N <- max(sapply(censusRows,length))
  censusTable <- do.call(rbind,lapply(censusRows,function(cr) c(cr,rep(0,N-length(cr)))))
  colnames(censusTable)[-(1:3)] <- 2:(N-2)

  lambdas <- sapply(censuses,`[[`,"lambda")

  complexities <- do.call(rbind,lapply(censuses,`[[`,"complexity"))

  ###########################
  # Plot read counts vs complexities
  #############################

  outfile <- paste0(outdir,"complexities.pdf")
  pdf(outfile,5,5)
  layout(rbind(c(2,4),c(1,3)),heights=c(2,8),widths=c(8,2))
  op <- par(mar=c(5,4,0,0)+.1)
  plot(complexities,pch=".",log="xy",ylab="unique variants per AA-change")
  grid()
  par(mar=c(0,4,1,0)+.1)
  hist(
    log10(complexities$cpm),breaks=50,main="",axes=FALSE,
    col="gray",border=NA
  )
  # axis(1,c(-15,-11,-7,-3,1))
  axis(2)
  par(mar=c(5,0,0,1)+.1)
  yhist <- hist(log10(complexities$complexity),breaks=50,plot=FALSE)
  with(yhist,plot(
    NA,xlim=c(0,max(counts)),ylim=range(breaks),
    xlab="Frequency",ylab="",main="",axes=FALSE
  ))
  axis(1)
  # axis(2,c(0,1,2))
  with(yhist,rect(0,breaks[-length(breaks)],counts,breaks[-1],col="gray",border=NA))
  par(op)
  invisible(dev.off())


  ############################
  # Plot each tile's census
  #############################

  #function to find a good plot layout for a given number of tiles
  layoutFactors <- function(x) {
    sx <- sqrt(x)
    a <- ceiling(sx)
    b <- if (floor(sx)*a < x) a else floor(sx)
    c(a,b)
  }
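  #e.g. layoutFactors(6) yields c(3,2) and layoutFactors(7) yields c(3,3)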

  cr <- layoutFactors(nrow(censusTable))

  plotCensus <- function(i) {
    depth <- ns1.depth[i,2]
    barplot(censusTable[i,],
      ylim=c(0,100),border=NA,ylab="% reads",xlab="mutant classes",
      col=c("firebrick3","gray",rep("darkolivegreen3",N-2)),
      main=sprintf("Tile #%d: %.02fM reads",i,depth)
    )
    grid(NA,NULL)
    midx <- mean(par("usr")[1:2])
    text(midx,60,bquote(lambda == .(lambdas[[i]])))
    text(midx,40,sprintf("%.02f%% indels",censusTable[i,"indel"]))
  }

  outfile <- paste0(outdir,"tileCensus.pdf")
  if (all(cr==c(2,1))) {
    pdf(outfile,5,10)
  } else {
    pdf(outfile,2*cr[[1]],2.5*cr[[2]])
  }
  layout(t(matrix(1:(cr[[1]]*cr[[2]]),ncol=cr[[1]])))
  op <- par(las=3,mar=c(6,4,3,1)+.1)
  invisible(lapply(1:nrow(censusTable),plotCensus))
  par(op)
  invisible(dev.off())


  ###########################
  # Extrapolate overall variant census
  #############################


  #extrapolate overall indel rate
  # = 1 minus the probability of zero indels across all tiles
  # overallIndel <- 100*(1-dbinom(0,nrow(censusTable),mean(censusTable[,"indel"])/100))
  overallIndel <- 100*(1-(1-mean(censusTable[,"indel"])/100)^nrow(censusTable))
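  #(e.g. ten tiles with 0.5% indels each extrapolate to 1-0.995^10 ~ 4.9%)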
  #and overall #mut per clone
  # = mean of lambdas times number of tiles = sum of lambdas
  overallLambda <- sum(lambdas)

  overallCensus <- c(indel=overallIndel,setNames(dpois(0:8,overallLambda),0:8)*(100-overallIndel))

  outfile <- paste0(outdir,"extrapolatedCensus.pdf")
  pdf(outfile,5,5)
  op <- par(las=3,mar=c(6,4,3,1)+.1)
  barplot(overallCensus,
    ylim=c(0,100),border=NA,ylab="% reads",xlab="mutant classes",
    col=c("firebrick3","gray",rep("darkolivegreen3",length(overallCensus)-2)),
    main="Extrapolation for whole CDS"
  )
  grid(NA,NULL)
  midx <- mean(par("usr")[1:2])
  text(midx,60,bquote(lambda == .(overallLambda)))
  text(midx,40,sprintf("%.02f%% indels",overallCensus[["indel"]]))
  par(op)
  invisible(dev.off())

  ## Plot distribution of lambdas
  # plot(function(x)dnorm(x,))
  # abline(v=jitter(lambdas),col="gray")


  ###################
  # Coverage heatmap
  ###################

  #parse all variant CPMs of all tiles and combine in one table
  allSingleCPMs <- do.call(rbind,lapply(1:nrow(ns1.depth), function(depth.row) {
    sample.id <- ns1.depth[depth.row,"sample"]
    sample.depth <- ns1.depth[depth.row,2]

    singleCounts <- read.delim(
      paste0(dataDir,"mutationCallfile/",sample.id,"AAchange.txt"),
      header=FALSE
    )
    colnames(singleCounts) <- c("wtaa","pos","mutaa","wtcodon","mutcodon","cpm")
    singleCounts
  }))

  #condense the table to unique amino acid changes
  mutcpms <- tapply(allSingleCPMs$cpm, with(allSingleCPMs,paste0(wtaa,pos,mutaa)), sum)
  allCPMs <- data.frame(
    mutname=names(mutcpms),
    wtaa=substr(names(mutcpms),1,1),
    pos=as.integer(substr(names(mutcpms),2,nchar(names(mutcpms))-1)),
    mutaa=substr(names(mutcpms),nchar(names(mutcpms)),nchar(names(mutcpms))),
    cpm=mutcpms
  )

  #calculate divider positions between tiles
  ntiles <- nrow(tileRanges)
  tileDividers <- tileRanges[-ntiles,2]


  #protein range covered (lowest and highest AA positions found)
  start <- min(allCPMs$pos)
  end <- max(allCPMs$pos)
  #prepare list of all possible amino acids
  aas <- toChars("AVILMFYWRHKDESTNQCGP_")

  #Prepare a color mapping function for CPM values
  # - find highest CPM value (to map to darkest color)
  maxPower <- ceiling(log10(max(allCPMs$cpm)))
  cmap <- yogitools::colmap(c(-1,2,maxPower),c("white","orange","firebrick3"))

  #Prepare heatmap coordinates and associated colors
  x <- allCPMs$pos
  y <- sapply(allCPMs$mutaa,function(aa) 22-which(aas==aa))
  colvals <- cmap(log10(allCPMs$cpm))

  #Determine plot size
  mapwidth <- end-start+2
  plotwidth <- mapwidth/10 + 3
  
  #Draw the heatmap plot
  outfile <- paste0(outdir,"coverageHeatmap.pdf")
  pdf(outfile,plotwidth*2/3,4)
  # layout(cbind(1,2),widths=c(mapwidth/10+1,3))
  layout(
    rbind(0:ntiles+3,c(rep(1,ntiles),2)),
    widths=c(rep(mapwidth/ntiles/10+1,ntiles),3),
    heights=c(1,1)
  )
  op <- par(xaxs="i",yaxs="i",mar=c(5,4,0,0)+.1)
  plot(
    NA,xlim=c(start-1,end+1),ylim=c(0,22),
    xlab="AA position",ylab="AA residue",
    axes=FALSE
  )
  axis(1)
  axis(2,at=21:1,aas,las=2,cex.axis=0.7)
  rect(start-0.5,0.5,end+.5,21.5,col="gray",border=NA)
  rect(x-0.5,y-0.5,x+0.5,y+0.5,col=colvals,border=NA)
  abline(v=tileDividers+.5)
  par(op)
  #legend
  op <- par(mar=c(5,0,0,6))
  plot(
    NA,xlim=c(0,1),ylim=c(-2.5,maxPower+.5),
    axes=FALSE,xlab="",ylab=""
  )
  rect(0,(-2:maxPower)-0.5,1,(-2:maxPower)+0.5,border=NA,
    col=cmap(c(NA,(-1:maxPower)))
  )
  axis(4,at=-2:maxPower,c("0","< 0.1","1",10^(1:(maxPower-1)),paste(">",10^maxPower)),las=2)
  mtext("Rel. reads (per mil.)",side=4,line=4)
  par(op)
  #census plots
  op <- par(las=3,mar=c(6,6,3,3)+.1)
  invisible(lapply(1:nrow(censusTable),plotCensus))
  par(op)
  invisible(dev.off())


  logInfo("libraryQC function completed successfully.")

}


#legacy converter for mut2func_info
convertLegacyMut2Func <- function(infile,outfile) {

  library(yogitools)

  op <- options(stringsAsFactors=FALSE); on.exit(options(op))

  m2f <- read.csv(infile,header=FALSE)

  tiles <- cbind(1:(ncol(m2f)-1),do.call(rbind,strsplit(unlist(m2f[1,-1]),"-")))
  colnames(tiles) <- c("id","start","end")

  condRep <- extract.groups(m2f[-1,1],"^(\\w+)(\\d+)$")

  samples <- do.call(rbind,lapply(2:nrow(m2f), function(i) {
    cbind(id=unlist(m2f[i,-1]),tile=1:(ncol(m2f)-1),cond=condRep[i-1,1],tp=1,rep=condRep[i-1,2])
  }))

  con <- file(outfile,open="w")
  write.table(tiles,con,row.names=FALSE,quote=FALSE,sep="\t")
  writeLines("",con)
  write.table(samples,con,row.names=FALSE,quote=FALSE,sep="\t")
  close(con)

}
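
# Hypothetical usage sketch; the file names below are placeholders:
# convertLegacyMut2Func("mut2func_info.csv", "parameters.tsv")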