R/csvParam2Json.R

Defines functions latestSubDir getTimepointsFor getWTControlFor getNonselectFor getNonselects getSelects reachableChanges tilesInRegions parseParameters csvParam2Json checkCSV charProfile validateParameters

Documented in charProfile checkCSV csvParam2Json getNonselectFor getNonselects getSelects getTimepointsFor getWTControlFor latestSubDir parseParameters reachableChanges tilesInRegions validateParameters

# Copyright (C) 2018  Jochen Weile, Roth Lab
#
# This file is part of tileseqMave.
#
# tileseqMave is free software: you can redistribute it and/or modify
# it under the terms of the GNU Affero General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# tileseqMave is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU Affero General Public License for more details.
#
# You should have received a copy of the GNU Affero General Public License
# along with tileseqMave.  If not, see <https://www.gnu.org/licenses/>.

#' run validation checks on parameters object
#'
#' @param params parameter object to validate
#' @param srOverride single replicate override - allows single replicates
#' @return TRUE if everything checks out, otherwise it throws errors
#' @export
validateParameters <- function(params,srOverride=FALSE) {

  if (srOverride) {
    logWarn("SINGLE REPLICATE OVERRIDE HAS BEEN ENABLED. This means:
 * No quality filtering can be performed!
 * The scoring function will be unable to generate error estimates!
 * The selectionQC function will be unable to create correlation plots!
 * Imputation and refinement will not be possible!
 * Downstream scripts and functions may be unable to process the data!
")
  }

  #Validate template sequence
  if (is.na(params$template$cds_start) || is.na(params$template$cds_end)) {
    stop("CDS start and end must be integer numbers!")
  }
  if (params$template$cds_start < 1 || params$template$cds_start > nchar(params$template$seq)) {
    stop("CDS start is out of bounds!")
  }
  if (params$template$cds_end < 1 || params$template$cds_end > nchar(params$template$seq)) {
    stop("CDS end is out of bounds!")
  }
  if (!grepl("^[ACGT]{3,}$",params$template$seq)) {
    stop("Template sequence is not a valid DNA sequence!")
  }
  cdsLength <- params$template$cds_end - params$template$cds_start + 1
  if (cdsLength %% 3 != 0) {
    stop("CDS end is not in-frame with start position!")
  }
  startCodon <- substr(params$template$seq,params$template$cds_start,params$template$cds_start+2)
  if (startCodon != "ATG") {
    stop("CDS start position is not a start codon (ATG)")
  }

  #Validate Uniprot Accession via Regex
  uniprotRX <- "[OPQ][0-9][A-Z0-9]{3}[0-9]|[A-NR-Z][0-9]([A-Z][A-Z0-9]{2}[0-9]){1,2}"
  if (!grepl(uniprotRX,params$template$uniprot)) {
    stop("Invalid Uniprot Accession!")
  }

  #Check assay parameters
  if (!params$assay[["selection"]] %in% c("Positive","Negative")) {
    stop("Assay Selection must be either 'Positive' or 'Negative'!")
  }

  #check that at least one condition is defined
  if (length(params$conditions$names) < 1) {
    stop("Must define at least one condition!")
  }
  
  #check that at all condition names are unique
  if (any(duplicated(params$conditions$names))) {
    stop("All condition names must be unique!")
  }

  if (!all(grepl("^[A-Za-z][A-Za-z0-9]*$",params$conditions$names))) {
    stop("Condition names must be strictly alpha-numeric and start with a letter.")
  }

  #check condition table for validity
  #for QC runs, no relationships are declared, but in any other case, the following rules must apply:
  if (nrow(params$conditions$definitions) > 0) {
    #check that definitions use valid condition names
    defconds <- unique(c(
      params$conditions$definitions[,"Condition 1"],
      params$conditions$definitions[,"Condition 2"]
    ))
    if (!all(defconds %in% params$conditions$names)) {
      culprits <- defconds[which(!(defconds %in% params$conditions$names))]
      stop("The following condition(s) listed in the relationship definitions was(were) not declared in the list of conditions: ",
        paste(culprits,collapse=", ")
      )
    }
    #and that they use valid relationship names
    relationships <- c("is_selection_for","is_wt_control_for")
    if (!all(params$conditions$definitions[,"Relationship"] %in% relationships)) {
      rels <- params$conditions$definitions[,"Relationship"]
      culprits <- unique(rels[which(!(rels %in% relationships))])
      stop("Invalid relationship(s) in condition definitions: ",paste(culprits,collapse=", "))
    }
    #check that each condition has at least one relationship
    used <- sapply(params$conditions$names, function(cname) {
      cname %in% params$conditions$definitions[,"Condition 1"] || 
      cname %in% params$conditions$definitions[,"Condition 2"]
    })
    if (!all(used)) {
      stop(
        "All conditions must have at least one defined relationship! Missing: ",
        paste(params$conditions$names[!used],collapse=", ")
      )
    }
    #at least one selection condition must be declared:
    if (!("is_selection_for" %in% params$conditions$definitions[,"Relationship"])) {
      logWarn("No select-nonselect relationship was defined! Is this a QC run?")
    } else {
      #make sure that no selection condition has more than one nonselect partner
      numNS <- sapply(getSelects(params), function(sCond) length(getNonselectFor(sCond,params)))
      if (any(numNS > 1)) {
        culprits <- getSelects(params)[which(numNS > 1)]
        stop("Each selection conditon may only have one corresponding nonselect condition! Violations: ",culprits)
      }
    }
    #Check that each sel/nonsel condition has a WT control
    mainConds <- c(getSelects(params),getNonselects(params))
    hasWT <- sapply(mainConds, function(cond) {
      length(getWTControlFor(cond,params)) > 0
    })
    if (!all(hasWT)) {
      logWarn("No WT control defined for: ",paste(mainConds[!hasWT],collapse=", "))
    }
  } else {
    logWarn("No condition definitions detected! Is this a QC run?")
  }
  

  if (any(is.na(params$numReplicates))) {
    stop("Number of replicates must be integer numbers for each condition!")
  }
  minRep <- if (!srOverride && nrow(params$conditions$definitions) > 0 && length(getSelects(params)) > 0) 2 else 1
  if (any(params$numReplicates < minRep)) {
    stop("Number of replicates must be at least ",minRep,"!")
  }

  if (any(is.na(params$numTimepoints))) {
    stop("Number of time points must be integer numbers for each condition!")
  }
  if (any(params$numTimepoints < 1)) {
    stop("Each condition must have at least 1 timepoint!")
  }
  # if (!all(params$numTimepoints == params$numTimepoints[[1]])) {
  #   logWarn("Differing numbers of timepoints per condition are not yet supported for scoring!")
  # }
  for (sel in getSelects(params)) {
    selTP <- params$numTimepoints[[sel]]
    nsel <- getNonselectFor(sel,params)
    nselTP <- params$numTimepoints[[nsel]]
    if (nselTP > 1 && nselTP != selTP) {
      stop("nonselection condition \'",nsel,"\' must either have only one time point, ",
           "or as many as its paired selection condition \'",sel,"\' (",selTP,")"
      )
    }
  }
  for (cond in c(getSelects(params),getNonselects(params))) {
    condTP <- params$numTimepoints[[cond]]
    wt <- getWTControlFor(cond,params)
    if (length(wt) > 0) {
      wtTP <- params$numTimepoints[[wt]]
      if (wtTP > 1 && wtTP != condTP) {
        stop("WT control condition \'",wt,"\' must either have only one time point, ",
             "or as many as its paired primary condition \'",cond,"\' (",condTP,")"
        )
      }
    }
  }

  #validate tiles
  if (nrow(params$tiles) < 1) {
    stop("Must define at least one tile!")
  }
  if (!all(params$tiles%%1==0)) {
    stop("All Tile IDs must be integer numbers, as well as their starts and ends positions!")
  }
  if (any(duplicated(params$tiles[,1]))){
    stop("Tile IDs must be unique!")
  }
  if (any(params$tiles[,"Start AA"] < 2)) {
    offender <- params$tiles[which(params$tiles[,"Start AA"] < 2),1]
    logWarn("Tile",paste(offender,collapse=", "),"includes start codon!!")
  }
  if (nrow(params$tiles) > 0) {
    for (i in 1:nrow(params$tiles)) {
      tl <- params$tiles[i,"End AA"]-params$tiles[i,"Start AA"]+1
      if (tl < 0) {
        stop(
          "Tile ",params$tiles[i,1],"'s end position is smaller than start position!"
        )
      }
      if (tl < 2) {
        logWarn(
          "Tile ",params$tiles[i,1],"is less than 2AA long! Please double-check!"
        )
      }
    }
  }
  if (nrow(params$tiles) > 1) {
    for (i in 2:nrow(params$tiles)) {
      for (j in 1:(i-1)) {
        ji <- params$tiles[i,"Start AA"] > params$tiles[j,"End AA"]
        ij <- params$tiles[j,"Start AA"] > params$tiles[i,"End AA"]
        if (!(ji || ij)) {
          stop(
            "Tiles ",params$tiles[i,1]," and ",params$tiles[j,1]," are overlapping!\n",
            "Tiles must not overlap!"
          )
        }
      }
    }
  }
  
  #validate regions
  if (nrow(params$regions) < 1) {
    stop("Must define at least one region!")
  }
  if (!all(params$regions%%1==0)) {
    stop("Region IDs must be integer numbers, as well as their starts and ends!")
  }
  if (any(duplicated(params$regions[,1]))){
    stop("Region IDs must be unique!")
  }
  if (any(params$regions[,"Start AA"] < 2)) {
    offender <- params$regions[which(params$regions[,"Start AA"] < 2),1]
    logWarn("Region",paste(offender,collapse=", "),"includes start codon!!")
  }
  
  if (nrow(params$regions) > 0) {
    for (i in 1:nrow(params$regions)) {
      tl <- params$regions[i,"End AA"]-params$regions[i,"Start AA"]+1
      if (tl < 0) {
        stop(
          "Region ",params$regions[i,1],"'s end position is smaller than start position!"
        )
      }
      if (tl < 2) {
        logWarn(
          "Region ",params$regions[i,1],"is less than 2AA long! Please double-check!"
        )
      }
    }
  }
  if (nrow(params$regions) > 1) {
    for (i in 2:nrow(params$regions)) {
      for (j in 1:(i-1)) {
        ji <- params$regions[i,"Start AA"] > params$regions[j,"End AA"]
        ij <- params$regions[j,"Start AA"] > params$regions[i,"End AA"]
        if (!(ji || ij)) {
          stop(
            "Regions ",params$regions[i,1]," and ",params$regions[j,1]," are overlapping!\n",
            "Regions must not overlap!"
          )
        }
      }
    }
  }
  
  #validate time points
  if (any(duplicated(params$timepoints[,"Time point name"]))) {
    stop("Time point names must be unique!")
  }
  if (any(is.na(params$timepoints[,"Time"]))) {
    stop("Time point values must be numeric!")
  }
  #TODO: Validate time units (w,d,h,m,s,ms)

  #validate the sample sheet
  if (!all(params$samples[,"Tile ID"] %in% params$tiles[,"Tile Number"])) {
    tid <- params$samples[,"Tile ID"]
    culprits <- unique(tid[which(!(tid %in% params$tiles[,"Tile Number"]))])
    stop("Undeclared tiles found in sample sheet:",
      paste(culprits,collapse=", "),
      "\nPlease declare them in the Tile table!"
    )
  }
  if (!all(grepl("^[A-Za-z0-9-]+$",params$samples[,"Sample ID"]))) {
    sid <- params$samples[,"Sample ID"]
    culprits <- sid[which(!grepl("^[A-Za-z0-9-]+$",sid))]
    stop("Sample IDs must not contain special characters except minus signs!\n",
      "Violations: ",paste(culprits,collapse=", ")
    )
  }
  if (!all(params$samples[,"Condition"] %in% params$conditions$names)) {
    cid <- params$samples[,"Condition"]
    culprits <- unique(cid[which(!(cid %in% params$conditions$names))])
    stop("Undeclared conditions found in sample sheet:",
      paste(culprits,collapse=", "),
      "\nPlease declare them in the list of conditions!"
    )
  }
  if (!all(params$samples[,"Time point"] %in% params$timepoints[,"Time point name"])) {
    tid <- params$samples[,"Time point"]
    culprits <- unique(tid[which(!(tid %in% params$timepoints[,"Time point name"]))])
    stop("Undeclared time points found in sample sheet:",
      paste(culprits,collapse=", "),
      "\nPlease declare them in the time point definition section!"
    )
  }
  #check for duplicated sample ids
  if (any(duplicated(params$samples$`Sample ID`))) {
    culprits <- params$samples[which(duplicated(params$samples$`Sample ID`)),"Sample ID"]
    logWarn(
      "Duplicated Sample IDs!!\n",
      "The following samples are used multiple times: ",
      paste(culprits,collapse = ", ")
   )
  }
  
  #check for duplicated sample semantics
  semStr <- with(params$samples, paste(`Tile ID`,Condition,`Time point`,Replicate,sep="."))
  if (any(duplicated(semStr))) {
    culprits <- params$sample$`Sample ID`[which(duplicated(semStr))]
    stop(
      "Duplicated Sample definitions!!\n",
      "The following samples have identical definitions:",
      paste(culprits,collapse=", ")
    )
  }

  #check if conditions completely cover all tiles that are used in experiment
  for (tp in params$timepoints[,1])
  for (cond in params$conditions$names) {
    for (repi in 1:(params$numReplicates[[cond]])) {
      srows <- with(params$samples,which(`Time point`==tp & Condition==cond & Replicate==repi))
      tilesFound <- params$samples[srows,"Tile ID"]
      missing <- setdiff(params$samples[,"Tile ID"], tilesFound)
      if (length(missing) > 0) {
        stop("Missing samples for tiles ",paste(missing,collapse=", "),
          " in condition ",cond," timepoint ",tp," replicate ",repi,".\n",
          "Please make sure that all defined tiles have samples for all conditions!"
        )
      }
    }
  }

  repCombos <- apply(params$samples[,c("Tile ID","Condition","Time point")],1,paste,collapse="\t")
  repPerCombo <- tapply(params$samples[,"Replicate"],repCombos,function(x)length(unique(x)))
  comboCondition <- do.call(rbind,strsplit(names(repPerCombo),"\\t"))[,2]
  if (any(repPerCombo != params$numReplicates[comboCondition])) {
    culprits <- which(repPerCombo != params$numReplicates[comboCondition])
    stop("The following tile-condition-time combination(s) do(es) not have ",
         "the correct set of replicates:\n",paste(names(repPerCombo)[culprits],collapse="\n"))
  }
  
  tpCombos <- apply(params$samples[,c("Tile ID","Condition","Replicate")],1,paste,collapse="\t")
  tpPerCombo <- tapply(params$samples[,"Time point"],tpCombos,function(x)length(unique(x)))
  comboCondition <- do.call(rbind,strsplit(names(tpPerCombo),"\\t"))[,2]
  if (any(tpPerCombo != params$numTimepoints[comboCondition])) {
    culprits <- which(tpPerCombo != params$numTimepoints[comboCondition])
    stop("The following tile-condition-replicate combination(s) do(es) not have ",
         "the correct set of time points:\n",paste(names(tpPerCombo)[culprits],collapse="\n"))
  }
  
  #validate metaparameters
  expected <- c("posteriorThreshold","minCover","mutRate","maxDrop")
  if (!all(expected %in% names(params$varcaller))) {
    stop("Parameter sheet JSON file is out of date with software version!\n",
         "Please re-run csv2json.R !")
  }
  if (is.na(params$varcaller$posteriorThreshold)) {
    stop("Posterior threshold must be numeric!")
  }
  if (params$varcaller$posteriorThreshold < 0.5 || params$varcaller$posteriorThreshold >= 1) {
    stop("Posterior threshold must be within greater or equal to 0.5 and less than 1!")
  }
  if (is.na(params$varcaller$minCover)) {
    stop("Minimum cover parameter must be numeric!")
  }
  if (params$varcaller$minCover < 0 || params$varcaller$minCover > 1) {
    stop("Minimum cover must be between 0 and 1!")
  }
  if (is.na(params$varcaller$mutRate)) {
    stop("Mutation rate must be numeric!")
  }
  if (params$varcaller$mutRate < 0 || params$varcaller$mutRate > 1) {
    stop("Mutation rate must be between 0 and 1!")
  }
  if (params$varcaller$mutRate > 0.1) {
    logWarn("Mutation rate parameter is set unusually high! Are you sure about this?")
  }
  if (params$varcaller$maxDrop < 0 || params$varcaller$maxDrop > 1) {
    stop("Maximum depth drop must be between 0 and 1!")
  }
  if (params$varcaller$maxDrop < 0.05) {
    logWarn("Maximum depth drop is set unusually low! Are you sure about this?")
  }
  if (params$varcaller$maxDrop > 0.5) {
    logWarn("Maximum depth drop is set unusually high! Are you sure about this?")
  }
  
  expected <- c(
    "countThreshold","pseudo.n","sdThreshold","wtQuantile",
    "cvDeviation","lastFuncPos","lpdCutoff"
  )
  if (!all(expected %in% names(params$scoring))) {
    stop("Parameter sheet JSON file is out of date with software version!\n",
         "Please re-run csv2json.R !")
  }
  if (is.na(params$scoring$countThreshold)) {
    stop("Minimum read count must be an integer!")
  }
  if (params$scoring$countThreshold < 1) {
    stop("Minimum read count must be at least 1!")
  }
  if (is.na(params$scoring$pseudo.n)) {
    stop("Pseudo-replicates must be an integer!")
  }
  if (params$scoring$pseudo.n < 1) {
    stop("Pseudo-replicates must be at least 1!")
  }
  if (is.na(params$scoring$sdThreshold)) {
    stop("SD threshold must be numeric!")
  }
  if (params$scoring$sdThreshold <= 0) {
    stop("SD threshold must be greater than 0!")
  }
  if (is.na(params$scoring$wtQuantile)) {
    stop("WT filter quantile must be numeric!")
  }
  if (params$scoring$wtQuantile < 0.5 || params$scoring$wtQuantile >= 1) {
    stop("WT filter quantile only allows values from interval [0.5;1[ ")
  }
  if (is.na(params$scoring$cvDeviation)) {
    stop("Replicate disagreement factor (cvDeviation) must be numeric!")
  }
  if (params$scoring$cvDeviation < 1) {
    stop("Replicate disagreement factor (cvDevation) must be greater than 1!")
  }
  if (is.na(params$scoring$lastFuncPos)) {
    stop ("'Last functional AA position' must be numeric or 'Inf'!")
  }
  if (params$scoring$lastFuncPos < 1 ) {
    stop("'Last functional AA position' must be a positive number!")
  }
  #protein length
  plen <- (params$template$cds_end-params$template$cds_start+1)/3
  if (params$scoring$lastFuncPos < plen/2) {
    logWarn("'Last functional AA position' is set very low! Are you sure?")
  } else if (is.finite(params$scoring$lastFuncPos) && params$scoring$lastFuncPos > plen) {
    logWarn("'Last functional AA position' is beyond protein length! Did you mean 'Inf'?")
  }
  if (is.na(params$scoring$lpdCutoff)) {
    stop("Maxium logPhi SD parameter must be numeric!")
  }
  if (params$scoring$lpdCutoff <= 0) {
    stop("Maxium logPhi SD parameter must be greater than 0!")
  }

  #validate pivots table
  if (length(params$pivots) > 0 && nrow(params$pivots) > 0) {
    if (!all(params$pivots$Condition %in% getSelects(params))) {
      stop("Score scaling pivots can only be defined for selection conditions.")
    }
    if (!all(params$pivots$`Time point` %in% params$timepoints$`Time point name`)) {
      stop("Invalid time-point in pivots table!")
    }
    if (!all(params$pivots$Region %in% params$regions$`Region Number`)) {
      stop("Invalid region(s) in pivots table!")
    }
    if (!all(params$pivots$Type %in% c("synonymous","nonsense"))) {
      stop("Invalid entries in pivots table: Type must be 'synonymous' or 'nonsense'!")
    }
    if (any(is.na(params$pivots$Value))) {
      stop("Invalid entries in pivots table: Value must be numeric!")
    }
  }

  #TODO: Validate that the entire CDS in covered with tiles and regions

  return(TRUE)
}


#' Get a contingency table of characters in a file
#'
#' @param filename the file to analyze
#'
#' @return a contingency table of the characters in the file
#' @export
charProfile <- function(filename) {
  chars <- yogitools::toChars(readChar(filename,file.info(filename)$size))
  sort(table(chars),decreasing=TRUE)
}

#' Checks if a file is a CSV file or throws errors
#'
#' @param filename the file to check
#'
#' @return nothing
#' @export
checkCSV <- function(filename) {
  if (!grepl("\\.csv$",filename)) {
    stop(filename," is not a CSV file!")
  }
  tryCatch({
    chars <- charProfile(filename)
  },error=function(e) {
    stop(filename, " is not a CSV file!\n",
         "(It may have been corrupted or accidentally saved in a binary format.) \n",
         "Details:",conditionMessage(e))
  })
  if (!("," %in% head(names(chars))) || chars[["\n"]] < 10) {
    stop(filename, " is not following proper CSV format specifications!")
  }
}


#' convert CSV input parameter file to JSON format
#'
#' @param infile the input CSV file
#' @param outfile the output JSON file. Defaults to parameters.json in the same directory
#' @return NULL. Results are written to file.
#' @export
csvParam2Json <- function(infile,outfile=sub("[^/]+$","parameters.json",infile),srOverride=FALSE) {

  op <- options(stringsAsFactors=FALSE)

  #for writing JSON output
  library(RJSONIO)
  #for helper functions
  library(yogitools)


  #check that the file is indeed a csv file and can be read
  if (!canRead(infile)) {
    stop("Input file cannot be read!")
  }
  checkCSV(infile)

  #read the file into a list of lists
  csv <- strsplit(scan(infile,what="character",sep="\n",quiet=TRUE),",")
  #remove any potential quotes introduced by Excel
  csv <- lapply(csv,function(row) {
    sapply(row,function(cell) {
      trimws(gsub("\"","",cell))
    })
  })
  #Extract the first column
  col1 <- sapply(csv,`[[`,1)

  #Helper function to check if row exists
  hasRow <- function(rowname) {
    any(col1==rowname)
  }

  #helper function to locate a named row
  getRow <- function(rowname) {
    if (!hasRow(rowname)) {
      stop("Parameter sheet is missing mandatory entry: ",rowname)
    }
    i <- which (col1==rowname)
    if (is.null(i))  stop("Missing field: ",rowname)
    return(i)
  }

  extractTable <- function(firstField,nextSection) {
    iHead <- getRow(firstField)
    #table header
    headers <- csv[[iHead]]
    headers <- headers[!(headers == "" | is.na(headers))]
    #find the end of the table
    iEnd <- iHead
    while (iEnd < length(col1) && !(col1[[iEnd+1]] %in% c("",nextSection))) iEnd <- iEnd+1
    #if the table is empty...
    if (iEnd == iHead) {
      return(data.frame())
      # return(matrix(ncol=length(headers),nrow=0,dimnames=list(NULL,headers)))
    } else {
      #extract the table data and apply formatting
      rawTable <- do.call(rbind,csv[(iHead+1):iEnd])
      # formTable <- apply(rawTable[,1:length(headers)],c(1,2),as.integer)
      formTable <- rawTable[,1:length(headers),drop=FALSE]
      colnames(formTable) <- headers
      return(formTable)
    }
  }

  #prepare output data structure
  output <- list()

  #extract project name
  output$project <- csv[[getRow("Project name:")]][[2]]

  #extract template sequence
  output$template <- list()
  output$template$geneName <- csv[[getRow("Gene name:")]][[2]]
  output$template$seq <- toupper(csv[[getRow("Sequence:")]][[2]])
  output$template$cds_start <- as.integer(csv[[getRow("CDS start:")]][[2]])
  output$template$cds_end <- as.integer(csv[[getRow("CDS end:")]][[2]])
  output$template$uniprot <- csv[[getRow("Uniprot Accession:")]][[2]]

  #extract assay parameters
  output$assay <- list()
  output$assay$name <- csv[[getRow("Assay Name:")]][[2]]
  # output$assay$description <- csv[[getRow("Assay Description:")]][[2]]
  output$assay$selection <- switch(csv[[getRow("Negative selection?")]][[2]],
    Yes="Negative", No="Positive", stop("Negative selection must be 'Yes' or 'No'!")
  )
  

  #extract conditions, replicates and timepoints
  conditions <- csv[[getRow("Condition IDs:")]][-1]
  conditions <- conditions[!(conditions=="" | is.na(conditions))]
  output$conditions <- list()
  output$conditions$names <- as.vector(conditions)
  reps <- as.integer(csv[[getRow("Number of Replicates:")]][-1])
  if (length(reps) < length(conditions)) {
    stop("Replicate numbers missing for some conditions!")
  }
  reps <- setNames(reps[1:length(conditions)],conditions)
  output$numReplicates <- reps
  tps <- as.integer(csv[[getRow("Number of time points:")]][-1])
  if (length(tps) < length(conditions)) {
    stop("Replicate numbers missing for some conditions!")
  }
  tps <- setNames(tps[1:length(conditions)],conditions)
  output$numTimepoints <- tps

  #extract regions table
  regionTable <- as.data.frame(extractTable(firstField="Region Number",nextSection="Sequencing Tiles"))
  # regionTable <- apply(regionTable,2,as.integer)
  regionTable$`Region Number` <- as.integer(regionTable$`Region Number`)
  regionTable$`Start AA` <- as.integer(regionTable$`Start AA`)
  regionTable$`End AA` <- as.integer(regionTable$`End AA`)
  output$regions <- regionTable

  #extract tile table
  tileTable <- extractTable(firstField="Tile Number",nextSection="Condition definitions")
  tileTable <- apply(tileTable,2,as.integer)
  if (!inherits(tileTable,"matrix")) {
    tileTable <- rbind(tileTable)
    rownames(tileTable) <- NULL
  }
  output$tiles <- tileTable

  #extract condition definitions
  if (!hasRow("Condition 1") || !hasRow("Condition definitions")) {
    stop("Parameter sheet is missing the mandatory table 'Condition definitions'!")
  }
  conditionTable <- extractTable(firstField="Condition 1",nextSection="Time point definitions")
  output$conditions$definitions <- conditionTable

  #Time point definitions
  timeTable <- as.data.frame(extractTable(firstField="Time point name",nextSection="Sequencing samples"))
  timeTable$Time <- as.numeric(timeTable$Time)
  output$timepoints <- timeTable


  #Extract sample sheet
  sampleTable <- as.data.frame(extractTable(firstField="Sample ID",nextSection=""))
  sampleTable$`Tile ID` <- as.integer(sampleTable$`Tile ID`)
  sampleTable$Replicate <- as.integer(sampleTable$Replicate)
  output$samples <- sampleTable

  #Extract meta-parameters
  output$varcaller <- list()
  if (hasRow("Posterior threshold:")) {
    output$varcaller$posteriorThreshold <- as.numeric(csv[[getRow("Posterior threshold:")]][[2]])
  } else {
    logWarn("No posterior threshold specified. Defaulting to 0.5")
    output$varcaller$posteriorThreshold <- 0.5
  }
  if (hasRow("Minimum coverage:")) {
    output$varcaller$minCover <- as.numeric(csv[[getRow("Minimum coverage:")]][[2]])
  } else {
    logWarn("No minimum coverage specified. Defaulting to 0.6")
    output$varcaller$minCover <- 0.6
  }
  if (hasRow("Per-base mutation rate:")) {
    output$varcaller$mutRate <- as.numeric(csv[[getRow("Per-base mutation rate:")]][[2]])
  } else {
    logWarn("No mutation rate specified. Defaulting to 0.0025")
    output$varcaller$mutRate <- 0.0025
  }
  if (hasRow("Maximum depth drop:")) {
    output$varcaller$maxDrop <- as.numeric(csv[[getRow("Maximum depth drop:")]][[2]])
  } else {
    logWarn("No maximum depth drop specified. Defaulting to 0.2")
    output$varcaller$maxDrop <- 0.2
  }
  output$scoring <- list()
  if (hasRow("Minimum read count:")) {
    output$scoring$countThreshold <- as.integer(csv[[getRow("Minimum read count:")]][[2]])
  } else {
    logWarn("No minimum read count specified. Defaulting to 1")
    output$scoring$countThreshold <- 1L
  }
  if (hasRow("Pseudo-replicates:")) {
    output$scoring$pseudo.n <- as.integer(csv[[getRow("Pseudo-replicates:")]][[2]])
  } else {
    logWarn("No pseudo-replicates specified. Defaulting to 8")
    output$scoring$pseudo.n <- 8L
  }
  if (hasRow("SD threshold:")) {
    output$scoring$sdThreshold <- as.numeric(csv[[getRow("SD threshold:")]][[2]])
  } else {
    logWarn("No SD threshold specified. Defaulting to 0.3")
    output$scoring$sdThreshold <- 0.3
  }
  if (hasRow("WT filter quantile:")) {
    output$scoring$wtQuantile <- as.numeric(csv[[getRow("WT filter quantile:")]][[2]])
  } else {
    logWarn("No WT filter quantile specified. Defaulting to 0.95")
    output$scoring$wtQuantile <- 0.95
  }
  if (hasRow("Replicate disagreement factor:")) {
    output$scoring$cvDeviation <- as.numeric(csv[[getRow("Replicate disagreement factor:")]][[2]])
  } else {
    logWarn("No replicate disagreement factor specified. Defaulting to 10.")
    output$scoring$cvDeviation <- 10
  }
  if (hasRow("Last functional AA position:")) {
    output$scoring$lastFuncPos <- as.numeric(csv[[getRow("Last functional AA position:")]][[2]])
  } else {
    output$scoring$lastFuncPos <- Inf
  }
  if (hasRow("Maximum logPhi SD:")) {
    output$scoring$lpdCutoff <- as.numeric(csv[[getRow("Maximum logPhi SD:")]][[2]])
  } else {
    output$scoring$lpdCutoff <- 1
  }
  
  #Extract scale pivots
  output$pivots <- list()
  #Pivots table used to be called "Score normalization override", so we provide backwards compatibility here
  if ((hasRow("Score normalization overrides") || hasRow("Score scaling pivots")) && hasRow("Condition")) {
    overrideTable <- as.data.frame(extractTable(firstField="Condition",nextSection=""))
    overrideTable$`Region` <- as.numeric(overrideTable$`Region`)
    overrideTable$`Value` <- as.numeric(overrideTable$`Value`)
    output$pivots <- overrideTable
  }


  #Run validation on all parameters
  withCallingHandlers(
    validateParameters(output,srOverride=srOverride),
    warning=function(w)logWarn(conditionMessage(w))
  )

  #convert output to JSON and write to file
  logInfo("Writing output to",outfile,"\n")
  
  con <- file(outfile,open="w")
  #workaround for RJSONIO's inability to deal with infinity values
  output$scoring$lastFuncPos <- as.character(output$scoring$lastFuncPos)
  writeLines(toJSON(output),con)
  close(con)

  options(op)

  logInfo("Conversion successful!\n")
  invisible(return(NULL))
}

#' parse JSON parameter file
#'
#' @param filename the input CSV file
#' @return the parameter object, as a list of lists
#' @export
parseParameters <- function(filename,srOverride=FALSE) { 

  op <- options(stringsAsFactors=FALSE)

  #for writing JSON output
  library(RJSONIO)
  #for helper functions
  library(yogitools)

  #check that the file is indeed a json file and can be read
  stopifnot(grepl("\\.json$",filename), canRead(filename))

  #parse JSON to list of lists
  params <- fromJSON(filename,nullValue=NA)

  #rebuild tables and dataframes from lists
  params$conditions$definitions <- do.call(rbind,params$conditions$definitions)
  if (is.null(params$conditions$definitions)) {
    params$conditions$definitions <- matrix(nrow=0,ncol=3,dimnames=list(NULL,c("Condition 1","Relationship","Condition 2")))
  }
  # if (!inherits(params$regions,"list")) {
  #   params$regions <- list(params$regions)
  # }
  # params$regions <- do.call(rbind,params$regions)
  # if (!inherits(params$tiles,"list")) {
  #   params$tiles <- list(params$tiles)
  # }
  regCols <- names(params$regions)
  params$regions <- as.data.frame(as.list(params$regions))
  colnames(params$regions) <- regCols

  params$tiles <- do.call(rbind,params$tiles)

  timeCols <- names(params$timepoints)
  params$timepoints <- as.data.frame(params$timepoints)
  colnames(params$timepoints) <- timeCols

  sampleCols <- names(params$samples)
  params$samples <- as.data.frame(params$samples)
  colnames(params$samples) <- sampleCols

  if (length(params$pivots) > 0 && length(params$pivots[[1]]) > 0) {
    pCols <- names(params$pivots)
    params$pivots <- as.data.frame(params$pivots)
    colnames(params$pivots) <- pCols
  } else {
    params$pivots <- NULL
  }

  #make sure lists remain lists
  params$varcaller <- as.list(params$varcaller)
  params$scoring <- as.list(params$scoring)
  #workaround for bug in RJSONIO (dealing with 'Inf' values)
  params$scoring$lastFuncPos <- as.numeric(params$scoring$lastFuncPos)

  #run full validation of the parameter object
  validateParameters(params,srOverride=srOverride)

  #calculate CDS and protein sequence
  cdsLength <- params$template$cds_end - params$template$cds_start + 1
  cdsSeq <- substr(params$template$seq,params$template$cds_start,params$template$cds_end)
  #calculate individual codons
  codons <- sapply(seq(1,cdsLength,3),function(i) substr(cdsSeq,i,i+2))
  
  #translate to protein sequence
  data(trtable)
  # proteinSeq <- sapply(seq(1,cdsLength,3),function(pos) trtable[[substr(cdsSeq,pos,pos+2)]])
  proteinSeq <- sapply(codons,function(cd) trtable[[cd]])
  proteinLength <- length(proteinSeq)
  if (proteinSeq[[proteinLength]] == "*") {
    proteinSeq <- proteinSeq[-proteinLength]  
    proteinLength <- proteinLength-1
  }

  params$template$cdsSeq <- cdsSeq
  params$template$cdsLength <- cdsLength
  params$template$cdsCodons <- codons
  params$template$proteinSeq <- paste(proteinSeq,collapse="")
  params$template$proteinLength <- proteinLength


  #convert region and tile positions to nucleotide positions
  params$regions <- cbind(params$regions, 
    `Start NC in CDS` = params$regions[,"Start AA"]*3-2,
    `End NC in CDS` = params$regions[,"End AA"]*3
  )
  params$regions <- cbind(params$regions, 
    `Start NC in Template` = params$regions[,"Start NC in CDS"]+ params$template$cds_start - 1,
    `End NC in Template` = params$regions[,"End NC in CDS"]+ params$template$cds_start - 1
  )

  params$tiles <- cbind(params$tiles, 
    `Start NC in CDS` = params$tiles[,"Start AA"]*3-2,
    `End NC in CDS` = params$tiles[,"End AA"]*3
  )
  params$tiles <- cbind(params$tiles, 
    `Start NC in Template` = params$tiles[,"Start NC in CDS"]+ params$template$cds_start - 1,
    `End NC in Template` = params$tiles[,"End NC in CDS"]+ params$template$cds_start - 1
  )


  #Process the condition definitions to annotate the sample table
  sampleTable <- params$samples
  #Assign correct nonselect sample for each selection sample
  sampleTable$nonselectRef <- sapply(1:nrow(sampleTable), function(i)with(sampleTable[i,], {
    nsCond <- with(as.data.frame(params$conditions$definitions),{
      `Condition 2`[which(Relationship == "is_selection_for" & `Condition 1` == Condition)]
    })
    if (length(nsCond) == 0) return(NA)
    rows <- which(
      sampleTable$`Tile ID` == `Tile ID` & 
      sampleTable$Replicate == Replicate & 
      sampleTable$`Time point` == `Time point` & 
      sampleTable$Condition == nsCond
    )
    if (length(rows) == 0) {
      return(NA)
    } else if (length(rows) > 1) {
      logWarn("More than one nonselect match found for sample",`Sample ID`)
      paste(sampleTable$`Sample ID`[rows],collapse=",")
    } else {
      sampleTable$`Sample ID`[rows]
    }
  }))
  #Assign correct WT control sample for each main sample
  sampleTable$wtCtrlRef <- sapply(1:nrow(sampleTable), function(i)with(sampleTable[i,], {
    wtCond <- with(as.data.frame(params$conditions$definitions),{
      `Condition 1`[which(Relationship == "is_wt_control_for" & `Condition 2` == Condition)]
    })
    if (length(wtCond) == 0) return(NA)
    rows <- which(
      sampleTable$`Tile ID` == `Tile ID` & 
      sampleTable$Replicate == Replicate & 
      sampleTable$`Time point` == `Time point` & 
      sampleTable$Condition == wtCond
    )
    if (length(rows) == 0) {
      return(NA)
    } else if (length(rows) > 1) {
      logWarn("More than one WT control match found for sample",`Sample ID`)
      paste(sampleTable$`Sample ID`[rows],collapse=",")
    } else {
      sampleTable$`Sample ID`[rows]
    }
  }))
  params$samples <- sampleTable

  #convenience functions to retrieve tile and region information by name
  params$regi <- function(region) {
    params$regions[which(params$regions[,"Region Number"] %in% region),]
  }
  params$tili <- function(tile) {
    params$tiles[which(params$tiles[,"Tile Number"] %in% tile),,drop=FALSE]
  }
  params$pos2tile <- function(pos) {
    rows <- sapply(pos,function(pos) {
      i <- which(params$tiles[,"Start AA"] <= pos & params$tiles[,"End AA"] >= pos)
      if (length(i)==0) NA else i
    }) 
    params$tiles[rows,"Tile Number"]
  }
  params$pos2reg <- function(pos) {
    rows <- sapply(pos,function(pos) {
      i <- which(params$regions[,"Start AA"] <= pos & params$regions[,"End AA"] >= pos)
      if (length(i)==0) NA else i
    })
    params$regions[rows,"Region Number"]
  }

  options(op)

  return(params)
}

#' Convenience function to find the tiles that belong to each region.
#'
#' Given the parameter object returns a list of vectors containing the tile IDs for each region ID
#'
#' @param param the parameter object
#' @return a  list of vectors containing the tile IDs for each region ID
#' @export
tilesInRegions <- function(params) {
  setNames(lapply(params$regions$`Region Number`, function(ri) {
    # rs <- params$regions[ri,"Start AA"]
    rs <- params$regi(ri)[["Start AA"]]
    # re <- params$regions[ri,"End AA"]
    re <- params$regi(ri)[["End AA"]]
    tileRows <- which(sapply(params$tiles[,"Start AA"], function(ts){
      ts >=rs && ts < re
    }))
    params$tiles[tileRows,"Tile Number"]
  }),params$regions$`Region Number`)
}

#' Convenience function to calculate the list of SNV-reachable AA changes
#'
#' Given a parameter object containing a coding sequence, this function will calculate a table
#' of all SNV-reachable amino acid changes, detailing which codon changes correspond to each.
#'
#' @param param the parameter object
#' @return a data.frame containing wt and mutant codons as well as wt and mutant AAs for all possible SNVs
#' @export
reachableChanges <- function(params) {
  data(trtable)
  library(hgvsParseR)
  hgvsp <- new.hgvs.builder.p(aacode=3)
  codons <- sapply(
    seq(1,params$template$cdsLength,3),
    function(s) substr(params$template$cdsSeq,s,s+2)
  )
  changes <- expand.grid(i=1:3,base=toChars("ACGT"),stringsAsFactors=FALSE)
  reachable <- do.call(rbind,lapply(1:length(codons), function(pos) {
    wtcodon <- codons[[pos]]
    wtaa <- trtable[[wtcodon]]
    muts <- as.df(lapply(1:nrow(changes),function(k) with(changes[k,],{
      mutcodon <- wtcodon
      substr(mutcodon,changes[k,"i"],changes[k,"i"]) <- changes[k,"base"]
      mutaa <- trtable[[mutcodon]]
      return(list(
        wtcodon=wtcodon,pos=pos,mutcodon=mutcodon,
        wtaa=wtaa,mutaa=mutaa
      ))
    })))
    muts <- muts[with(muts,wtaa!=mutaa),]
    as.df(with(muts,tapply(1:nrow(muts),mutaa,function(is) {
      list(
        wtcodon=unique(wtcodon[is]),pos=unique(pos[is]),
        mutcodons=paste0(mutcodon[is],collapse="|"),
        wtaa=unique(wtaa[is]),mutaa=unique(mutaa[is])
      )
    })))
  }))
  reachable$aachange <- with(reachable,paste0(wtaa,pos,mutaa))
  reachable$hgvsp <- sapply(1:nrow(reachable),function(i) with(reachable[i,],hgvsp$substitution(pos,wtaa,mutaa)))

  return(reachable)
}

#' Convenience function get all selective conditions
#'
#' @param param the parameter object
#' @return the list of selective conditions
#' @export
getSelects <- function(params) {
  if (nrow(params$conditions$definitions) > 0) {
    unique(with(as.data.frame(params$conditions$definitions),{
      `Condition 1`[which(Relationship == "is_selection_for")]
    }))
  } else character(0)
}

#' Convenience function get all non-selective conditions
#'
#' @param param the parameter object
#' @return the list of non-selective conditions
#' @export
getNonselects <- function(params) {
  if (nrow(params$conditions$definitions) > 0) {
    unique(with(as.data.frame(params$conditions$definitions),{
      `Condition 2`[which(Relationship == "is_selection_for")]
    }))
  } else character(0)
}

#' Convenience function get the matching non-selective condition for the given input condition
#'
#' @param cond the input condition
#' @param param the parameter object
#' @return the list of matching non-selective conditions
#' @export
getNonselectFor <- function(cond,params) {
  if (nrow(params$conditions$definitions) > 0) {
    unique(with(as.data.frame(params$conditions$definitions),{
      `Condition 2`[which(Relationship == "is_selection_for" & `Condition 1` == cond)]
    }))
  } else character(0)
}

#' Convenience function get the matching WT control condition for the given input condition
#'
#' @param cond the input condition
#' @param param the parameter object
#' @return the list of matching WT control conditions
#' @export
getWTControlFor <- function(cond,params) {
  if (nrow(params$conditions$definitions) > 0) {
    unique(with(as.data.frame(params$conditions$definitions),{
      `Condition 1`[which(Relationship == "is_wt_control_for" & `Condition 2` == cond)]
    }))
  } else character(0)
}

#' Convenience function get the available time points for the given input condition
#'
#' @param cond the input condition
#' @param param the parameter object
#' @return the list of available time points
#' @export
getTimepointsFor <- function(cond,params) {
  with(params$samples,tapply(`Time point`,Condition,unique))[[cond]]
}

#' Convenience function to find a subdirectory matching the given pattern.
#' 
#' The subfolders should follow the scheme [<label>_]<timestamp>_<outputType>/
#' 
#' @param parentDir the parent folder in which to look for matching subfolders
#' @param pattern a regex pattern used to identify the correct subfolder
#' @return a vector containing the path to the subfolder, its timestamp and any possible label. 
#' @export
latestSubDir <- function(parentDir,pattern="_mut_call$|mut_count$") {
  subDirs <- list.dirs(parentDir,recursive=FALSE)
  subDirs <- subDirs[grepl(pattern,subDirs)]
  if (length(subDirs) == 0) {
    stop("No applicable input folder found!")
  }
  #extract time stamp
  labelsAndTimes <- extract.groups(subDirs,"([^/]+_)?(\\d{4}-\\d{2}-\\d{2}-\\d{2}-\\d{2}-\\d{2})")
  #select the newest dataset
  latestIdx <- order(labelsAndTimes[,2])[[nrow(labelsAndTimes)]]
  return(c(
    dir=subDirs[[latestIdx]],
    timeStamp=labelsAndTimes[latestIdx,2],
    label=labelsAndTimes[latestIdx,1]
  ))
}
jweile/tileseqMave documentation built on April 5, 2024, 4:51 p.m.