Nothing
# Mutation profiling
#' @include Shazam.R
#' @include Core.R
NULL
#### Clonal consensus building functions ####
#' Constructs effective clonal sequences for all clones
#'
#' \code{collapseClones} creates effective input and germline sequences for each clonal
#' group and appends columns containing the consensus sequences to the input
#' \code{data.frame}.
#'
#' @param db \code{data.frame} containing sequence data. Required.
#' @param cloneColumn \code{character} name of the column containing clonal
#' identifiers. Required.
#' @param sequenceColumn \code{character} name of the column containing input
#' sequences. Required. The length of each input sequence should
#' match that of its corresponding germline sequence.
#' @param germlineColumn \code{character} name of the column containing germline
#' sequences. Required. The length of each germline sequence
#' should match that of its corresponding input sequence.
#' @param muFreqColumn \code{character} name of the column containing mutation
#' frequency. Optional. Applicable to the \code{"mostMutated"}
#' and \code{"leastMutated"} methods. If not supplied, mutation
#' frequency is computed by calling \code{observedMutations}.
#' Default is \code{NULL}. See Cautions for note on usage.
#' @param regionDefinition \link{RegionDefinition} object defining the regions
#' and boundaries of the Ig sequences. Optional. Default is
#' \code{NULL}.
#' @param method method for calculating input consensus sequence. Required.
#' One of \code{"thresholdedFreq"}, \code{"mostCommon"},
#' \code{"catchAll"}, \code{"mostMutated"}, or
#' \code{"leastMutated"}. See "Methods" for details.
#' @param minimumFrequency frequency threshold for calculating input consensus sequence.
#' Applicable to and required for the \code{"thresholdedFreq"}
#' method. A canonical choice is 0.6. Default is \code{NULL}.
#' @param includeAmbiguous whether to use ambiguous characters to represent positions
#' at which there are multiple characters with frequencies that
#' are at least \code{minimumFrequency} or that are maximal
#' (i.e. ties). Applicable to and required for the
#' \code{"thresholdedFreq"} and \code{"mostCommon"} methods.
#' Default is \code{FALSE}. See "Choosing ambiguous characters"
#' for rules on choosing ambiguous characters.
#' @param breakTiesStochastic In case of ties, whether to randomly pick a sequence from
#' sequences that fulfill the criteria as consensus. Applicable
#' to and required for all methods except for \code{"catchAll"}.
#' Default is \code{FALSE}. See "Methods" for details.
#' @param breakTiesByColumns A list of the form
#' \code{list(c(col_1, col_2, ...), c(fun_1, fun_2, ...))},
#' where \code{col_i} is a \code{character} name of a column
#' in \code{db}, and \code{fun_i} is a function to be applied
#' on that column. Currently, only \code{max} and \code{min}
#' are supported. Note that the two \code{c()}'s in \code{list()}
#' are essential (i.e. if there is only 1 column, the list should
#' be of the form \code{list(c(col_1), c(func_1))}. Applicable
#' to and optional for the \code{"mostMutated"} and
#' \code{"leastMutated"} methods. If supplied, \code{fun_i}'s
#' are applied on \code{col_i}'s to help break ties. Default
#' is \code{NULL}. See "Methods" for details.
#' @param expandedDb \code{logical} indicating whether or not to return the
#' expanded \code{db}, containing all the sequences (as opposed
#' to returning just one sequence per clone).
#' @param nproc Number of cores to distribute the operation over. If the
#' \code{cluster} has already been set earlier, then pass the
#' \code{cluster}. This will ensure that it is not reset.
#' @param juncLengthColumn \code{character} name of the column containing the junction length.
#' Needed when \code{regionDefinition} includes CDR3 and FWR4.
#' @param fields additional fields used for grouping. Use sample_id, to
#' avoid combining sequences with the same clone_id
#' that belong to different sample_id.
#'
#' @return A modified \code{db} with the following additional columns:
#' \itemize{
#' \item \code{clonal_sequence}: effective sequence for the clone.
#' \item \code{clonal_germline}: germline sequence for the clone.
#' \item \code{clonal_sequence_mufreq}: mutation frequency of
#' \code{clonal_sequence}; only added for the \code{"mostMutated"}
#' and \code{"leastMutated"} methods.
#' }
#'
#' \code{clonal_sequence} is generated with the method of choice indicated
#' by \code{method}, and \code{clonal_germline} is generated with the
#' \code{"mostCommon"} method, along with, where applicable, user-defined
#' parameters such as \code{minimumFrequency}, \code{includeAmbiguous},
#' \code{breakTiesStochastic}, and \code{breakTiesByColumns}.
#'
#'
#' @section Consensus lengths: For each clone, \code{clonal_sequence} and
#' \code{clonal_germline} have the same length.
#'
#' \itemize{
#' \item For the \code{"thresholdedFreq"}, \code{"mostCommon"}, and
#' \code{"catchAll"} methods:
#'
#' The length of the consensus sequences is determined by the longest possible
#' consensus sequence (baesd on \code{inputSeq} and \code{germlineSeq}) and
#' \code{regionDefinition@seqLength} (if supplied), whichever is shorter.
#'
#' Given a set of sequences of potentially varying lengths, the longest possible
#' length of their consensus sequence is taken to be the longest length along
#' which there is information contained at every nucleotide position across
#' majority of the sequences. Majority is defined to be greater than
#' \code{floor(n/2)}, where \code{n} is the number of sequences. If the longest
#' possible consensus length is 0, there will be a warning and an empty string
#' (\code{""}) will be returned.
#'
#' If a length limit is defined by supplying a \code{regionDefinition} via
#' \code{regionDefinition@seqLength}, the consensus length will be further
#' restricted to the shorter of the longest possible length and
#' \code{regionDefinition@seqLength}.
#'
#' \item For the \code{"mostMutated"} and \code{"leastMutated"} methods:
#'
#' The length of the consensus sequences depends on that of the most/least
#' mutated input sequence, and, if supplied, the length limit defined by
#' \code{regionDefinition@seqLength}, whichever is shorter. If the germline
#' consensus computed using the \code{"mostCommon"} method is longer than
#' the most/least mutated input sequence, the germline consensus is trimmed
#' to be of the same length as the input consensus.
#'
#' }
#'
#' @section Methods: The descriptions below use "sequences" as a generalization of input
#' sequences and germline sequences.
#'
#' \itemize{
#'
#' \item \code{method="thresholdedFreq"}
#'
#' A threshold must be supplied to the argument \code{minimumFrequency}. At
#' each position along the length of the consensus sequence, the frequency
#' of each nucleotide/character across sequences is tabulated. The
#' nucleotide/character whose frequency is at least (i.e. \code{>=})
#' \code{minimumFrequency} becomes the consensus; if there is none, the
#' consensus nucleotide will be \code{"N"}.
#'
#' When there are ties (frequencies of multiple nucleotides/characters
#' are at least \code{minimumFrequency}), this method can be deterministic
#' or stochastic, depending on additional parameters.
#'
#' \itemize{
#' \item With \code{includeAmbiguous=TRUE}, ties are resolved
#' deterministically by representing ties using ambiguous
#' characters. See "Choosing ambiguous characters" for how
#' ambiguous characters are chosen.
#' \item With \code{breakTiesStochastic=TRUE}, ties are resolved
#' stochastically by randomly picking a character amongst the
#' ties.
#' \item When both \code{TRUE}, \code{includeAmbiguous} takes
#' precedence over \code{breakTiesStochastic}.
#' \item When both \code{FALSE}, the first character from the ties is
#' taken to be the consensus following the order of \code{"A"},
#' \code{"T"}, \code{"G"}, \code{"C"}, \code{"N"}, \code{"."},
#' and \code{"-"}.
#' }
#'
#' Below are some examples looking at a single position based on 5
#' sequences with \code{minimumFrequency=0.6},
#' \code{includeAmbiguous=FALSE}, and \code{breakTiesStochastic=FALSE}:
#'
#' \itemize{
#' \item If the sequences have \code{"A"}, \code{"A"}, \code{"A"},
#' \code{"T"}, \code{"C"}, the consensus will be \code{"A"},
#' because \code{"A"} has frequency 0.6, which is at least
#' \code{minimumFrequency}.
#' \item If the sequences have \code{"A"}, \code{"A"}, \code{"T"},
#' \code{"T"}, \code{"C"}, the consensus will be \code{"N"},
#' because none of \code{"A"}, \code{"T"}, or \code{"C"} has
#' frequency that is at least \code{minimumFrequency}.
#' }
#'
#' \item \code{method="mostCommon"}
#'
#' The most frequent nucleotide/character across sequences at each
#' position along the length of the consensus sequence makes up the consensus.
#'
#' When there are ties (multiple nucleotides/characters with equally
#' maximal frequencies), this method can be deterministic or stochastic,
#' depending on additional parameters. The same rules for breaking ties
#' for \code{method="thresholdedFreq"} apply.
#'
#' Below are some examples looking at a single position based on 5
#' sequences with \code{includeAmbiguous=FALSE}, and
#' \code{breakTiesStochastic=FALSE}:
#'
#' \itemize{
#' \item If the sequences have \code{"A"}, \code{"A"}, \code{"T"},
#' \code{"A"}, \code{"C"}, the consensus will be \code{"A"}.
#' \item If the sequences have \code{"T"}, \code{"T"}, \code{"C"},
#' \code{"C"}, \code{"G"}, the consensus will be \code{"T"},
#' because \code{"T"} is before \code{"C"} in the order of
#' \code{"A"}, \code{"T"}, \code{"G"}, \code{"C"}, \code{"N"},
#' \code{"."}, and \code{"-"}.
#' }
#'
#'
#' \item \code{method="catchAll"}
#'
#' This method returns a consensus sequence capturing most of the
#' information contained in the sequences. Ambiguous characters are
#' used where applicable. See "Choosing ambiguous characters" for how
#' ambiguous characters are chosen. This method is deterministic and
#' does not involve breaking ties.
#'
#' Below are some examples for \code{method="catchAll"} looking at a
#' single position based on 5 sequences:
#'
#' \itemize{
#' \item If the sequences have \code{"N"}, \code{"N"}, \code{"N"},
#' \code{"N"}, \code{"N"}, the consensus will be \code{"N"}.
#' \item If the sequences have \code{"N"}, \code{"A"}, \code{"A"},
#' \code{"A"}, \code{"A"}, the consensus will be \code{"A"}.
#' \item If the sequences have \code{"N"}, \code{"A"}, \code{"G"},
#' \code{"A"}, \code{"A"}, the consensus will be \code{"R"}.
#' \item If the sequences have \code{"-"}, \code{"-"}, \code{"."},
#' \code{"."}, \code{"."}, the consensus will be \code{"-"}.
#' \item If the sequences have \code{"-"}, \code{"-"}, \code{"-"},
#' \code{"-"}, \code{"-"}, the consensus will be \code{"-"}.
#' \item If the sequences have \code{"."}, \code{"."}, \code{"."},
#' \code{"."}, \code{"."}, the consensus will be \code{"."}.
#' }
#'
#' \item \code{method="mostMutated"} and \code{method="leastMutated"}
#'
#' These methods return the most/least mutated sequence as the consensus
#' sequence.
#'
#' When there are ties (multple sequences have the maximal/minimal mutation
#' frequency), this method can be deterministic or stochastic, depending on
#' additional parameters.
#'
#' \itemize{
#' \item With \code{breakTiesStochastic=TRUE}, ties are resolved
#' stochastically by randomly picking a sequence out of
#' sequences with the maximal/minimal mutation frequency.
#' \item When \code{breakTiesByColumns} is supplied, ties are resolved
#' deterministically. Column by column, a function is applied on
#' the column and sequences with column value matching the
#' functional value are retained, until ties are resolved or
#' columns run out. In the latter case, the first remaining
#' sequence is taken as the consensus.
#' \item When \code{breakTiesStochastic=TRUE} and
#' \code{breakTiesByColumns} is also supplied,
#' \code{breakTiesStochastic} takes precedence over
#' \code{breakTiesByColumns}.
#' \item When \code{breakTiesStochastic=FALSE} and
#' \code{breakTiesByColumns} is not supplied (i.e. \code{NULL}),
#' the sequence that appears first amongst the ties is taken
#' as the consensus.
#' }
#'
#' }
#'
#'
#' @section Choosing ambiguous characters:
#'
#' Ambiguous characters may be present in the returned consensuses when using the
#' \code{"catchAll"} method and when using the \code{"thresholdedFreq"} or
#' \code{"mostCommon"} methods with \code{includeAmbiguous=TRUE}.
#'
#' The rules on choosing ambiguous characters are as follows:
#'
#' \itemize{
#' \item If a position contains only \code{"N"} across sequences, the consensus
#' at that position is \code{"N"}.
#' \item If a position contains one or more of \code{"A"}, \code{"T"},
#' \code{"G"}, or \code{"C"}, the consensus will be an IUPAC character
#' representing all of the characters present, regardless of whether
#' \code{"N"}, \code{"-"}, or \code{"."} is present.
#' \item If a position contains only \code{"-"} and \code{"."} across sequences,
#' the consensus at that position is taken to be \code{"-"}.
#' \item If a position contains only one of \code{"-"} or \code{"."} across
#' sequences, the consensus at that position is taken to be the character
#' present.
#' }
#'
#' @section Cautions:
#'
#' \itemize{
#' \item Note that this function does not perform multiple sequence alignment.
#' As a prerequisite, it is assumed that the sequences in
#' \code{sequenceColumn} and \code{germlineColumn} have been aligned
#' somehow. In the case of immunoglobulin repertoire analysis, this
#' usually means that the sequences are IMGT-gapped.
#' \item When using the \code{"mostMutated"} and \code{"leastMutated"} methods,
#' if you supply both \code{muFreqColumn} and \code{regionDefinition},
#' it is your responsibility to ensure that the mutation frequency in
#' \code{muFreqColumn} was calculated with sequence lengths restricted
#' to the \strong{same} \code{regionDefinition} you are supplying.
#' Otherwise, the "most/least mutated" sequence you obtain might not
#' be the most/least mutated given the \code{regionDefinition} supplied,
#' because your mutation frequency was based on a
#' \code{regionDefinition} different from the one supplied.
#' \item If you intend to run \code{collapseClones} before
#' building a 5-mer targeting model, you \strong{must} choose
#' parameters such that your collapsed clonal consensuses do
#' \strong{not} include ambiguous characters. This is because the
#' targeting model functions do NOT support ambiguous characters
#' in their inputs.
#' }
#'
#' @seealso
#' See \link{IMGT_SCHEMES} for a set of predefined \link{RegionDefinition} objects.
#'
#' @examples
#' # Subset example data
#' data(ExampleDb, package="alakazam")
#' db <- subset(ExampleDb, c_call %in% c("IGHA", "IGHG") & sample_id == "+7d" &
#' clone_id %in% c("3100", "3141", "3184"))
#'
#' # thresholdedFreq method, resolving ties deterministically without using ambiguous characters
#' clones <- collapseClones(db, cloneColumn="clone_id", sequenceColumn="sequence_alignment",
#' germlineColumn="germline_alignment_d_mask",
#' method="thresholdedFreq", minimumFrequency=0.6,
#' includeAmbiguous=FALSE, breakTiesStochastic=FALSE)
#'
#' # mostCommon method, resolving ties deterministically using ambiguous characters
#' clones <- collapseClones(db, cloneColumn="clone_id", sequenceColumn="sequence_alignment",
#' germlineColumn="germline_alignment_d_mask",
#' method="mostCommon",
#' includeAmbiguous=TRUE, breakTiesStochastic=FALSE)
#'
#' # Make a copy of db that has a mutation frequency column
#' db2 <- observedMutations(db, frequency=TRUE, combine=TRUE)
#'
#' # mostMutated method, resolving ties stochastically
#' clones <- collapseClones(db2, cloneColumn="clone_id", sequenceColumn="sequence_alignment",
#' germlineColumn="germline_alignment_d_mask",
#' method="mostMutated", muFreqColumn="mu_freq",
#' breakTiesStochastic=TRUE, breakTiesByColumns=NULL)
#'
#' # mostMutated method, resolving ties deterministically using additional columns
#' clones <- collapseClones(db2, cloneColumn="clone_id", sequenceColumn="sequence_alignment",
#' germlineColumn="germline_alignment_d_mask",
#' method="mostMutated", muFreqColumn="mu_freq",
#' breakTiesStochastic=FALSE,
#' breakTiesByColumns=list(c("duplicate_count"), c(max)))
#'
#' # Build consensus for V segment only
#' # Capture all nucleotide variations using ambiguous characters
#' clones <- collapseClones(db, cloneColumn="clone_id", sequenceColumn="sequence_alignment",
#' germlineColumn="germline_alignment_d_mask",
#' method="catchAll", regionDefinition=IMGT_V)
#'
#' # Return the same number of rows as the input
#' clones <- collapseClones(db, cloneColumn="clone_id", sequenceColumn="sequence_alignment",
#' germlineColumn="germline_alignment_d_mask",
#' method="mostCommon", expandedDb=TRUE)
#'
#' @export
collapseClones <- function(db, cloneColumn = "clone_id",
sequenceColumn = "sequence_alignment",
germlineColumn = "germline_alignment_d_mask",
muFreqColumn = NULL,
regionDefinition=NULL,
method = c("mostCommon","thresholdedFreq","catchAll","mostMutated","leastMutated"),
minimumFrequency = NULL,
includeAmbiguous = FALSE,
breakTiesStochastic = FALSE, breakTiesByColumns = NULL,
expandedDb = FALSE, nproc = 1,
juncLengthColumn="junction_length",
fields=NULL) {
# Hack for visibility of foreach index variables
idx <- NULL
## DEBUG
# cloneColumn="CLONE"; sequenceColumn="sequence_alignment"; germlineColumn="germline_alignment_d_mask"
# expandedDb=FALSE; regionDefinition=NULL; method="mostCommon"; nproc=1
#### parameter checks
method <- match.arg(method)
# check minimumFrequency for thresholdedFreq method
if (method=="thresholdedFreq") {
if (!is.numeric(minimumFrequency)) {
stop("minimumFrequency must be a numeric value.")
} else {
if ( minimumFrequency<0 | minimumFrequency>1 ) {
stop("minimumFrequency must be between 0 and 1.")
}
}
}
# check includeAmbiguous & breakTiesStochastic for methods other than catchAll
if (method %in% c("thresholdedFreq", "mostCommon", "mostMutated", "leastMutated")) {
if (!is(includeAmbiguous, "logical")) {
stop ("includeAmbiguous must be TRUE or FALSE.")
}
if (!is(breakTiesStochastic, "logical")) {
stop ("breakTiesStochastic must be TRUE or FALSE.")
}
}
# check breakTiesByColumns and muFreqColumn for methods most/leastMutated
if (method %in% c("mostMutated", "leastMutated")) {
if (!is.null(breakTiesByColumns)) {
if (!is(breakTiesByColumns, "list")) {
stop ("breakTiesByColumns must be a list.")
}
if (length(breakTiesByColumns) != 2) {
stop ("breakTiesByColumns must be a nested list of length 2.")
}
if (length(breakTiesByColumns[[1]]) != length(breakTiesByColumns[[2]])) {
stop ("Nested vectors in breakTiesByColumns must have the same lengths.")
}
if (!all(is.character(breakTiesByColumns[[1]]))) {
stop ("The first vector in breakTiesByColumns must contain column names.")
}
if (!all( unlist( lapply(breakTiesByColumns[[2]], is.function)))) {
stop ("The second vector in breakTiesByColumns must contain functions.")
}
if (!all(breakTiesByColumns[[1]] %in% colnames(db))) {
stop ("All column named included in breakTiesByColumns must be present in db.")
}
}
if ( (!is.null(muFreqColumn)) && (!muFreqColumn %in% colnames(db)) ) {
stop ("If specified, muFreqColumn must be a column present in db.")
}
}
# check mutual exclusivitiy
if (method %in% c("thresholdedFreq", "mostCommon")){
if (includeAmbiguous & breakTiesStochastic) {
message("includeAmbiguous and breakTiesStochastic are mutually exclusive. When both TRUE, includeAmbiguous will take precedence.")
}
#if ( (!includeAmbiguous) & (!breakTiesStochastic) ) {
# message("When both includeAmbiguous and breakTiesStochastic are FALSE, ties are broken in the order of 'A', 'T', 'G', 'C', 'N', '.', and '-'.")
#}
if (!is.null(breakTiesByColumns)) {
message("breakTiesByColumns is ignored when method is thresholdedFreq or mostCommon.")
}
}
if (method %in% c("mostMutated", "leastMutated")){
if (breakTiesStochastic & !is.null(breakTiesByColumns)) {
message("breakTiesStochastic and breakTiesByColumns are mutually exclusive. When both set, breakTiesStochastic will take precedence.")
}
#if ( (!breakTiesStochastic) & is.null(breakTiesByColumns) ) {
# message("When breakTiesStochastic is FALSE and breakTiesByColumns is NULL, ties are broken by taking the sequence that appears earlier in the data.frame.")
#}
if (includeAmbiguous) {
message("includeAmbiguous is ignored when method is mostMutated or leastMutated.")
}
}
# Check for valid columns
check <- checkColumns(db, c(cloneColumn, sequenceColumn, germlineColumn, fields))
if (check != TRUE) { stop(check) }
# Check for NAs
na_rows <- which(is.na(db[,c(cloneColumn, sequenceColumn, germlineColumn)] ), arr.ind=T)
if (nrow(na_rows)>0) {
na_cols <- c(cloneColumn, sequenceColumn, germlineColumn)[unique(na_rows[,2])]
stop("NA values found in column(s): ", paste(na_cols, collapse=", "),". ",length(unique(na_rows[,1])), " sequence(s) affected.")
}
# Check region definition
if (!is.null(regionDefinition) & !is(regionDefinition, "RegionDefinition")) {
stop(deparse(substitute(regionDefinition)), " is not a valid RegionDefinition object")
}
### Convert sequence columns to uppercase
db <- toupperColumns(db, c(sequenceColumn, germlineColumn))
# If the user has previously set the cluster and does not wish to reset it
if(!is.numeric(nproc)){
cluster <- nproc
nproc <- 0
}
if (!is(expandedDb, "logical")) {
stop ("expandedDb must be TRUE or FALSE.")
}
# Convert clone identifiers to strings
db[[cloneColumn]] <- as.character(db[[cloneColumn]])
db$tmp_colclones_row_id <- 1:nrow(db)
# use `fields` information to id clones
db$fields_clone_id <- db %>%
group_by(!!!rlang::syms(c(fields, cloneColumn))) %>%
dplyr::group_indices()
db$fields_clone_id <- as.character(db$fields_clone_id)
# get row indices in db for each unique clone
uniqueClones <- unique(db[["fields_clone_id"]])
# crucial to have simplify=FALSE (otherwise won't return a list if uniqueClones has length 1)
uniqueClonesIdx <- sapply(uniqueClones, function(i){which(db[["fields_clone_id"]]==i)}, simplify=FALSE)
regionDefinitionName <- ""
if (!is.null(regionDefinition)) {
regionDefinitionName <- regionDefinition@name
}
# if method is most/leastMutated and muFreqColumn not specified,
# first calculate mutation frequency ($mu_freq)
# IMPORTANT: do this OUTSIDE foreach loop for calcClonalConsensus
# otherwise will get an error saying muFreqColumn not found in db
# (something to do with parallelization/foreach)
if ( (method %in% c("mostMutated", "leastMutated")) & is.null(muFreqColumn) ) {
message("Calculating observed mutation frequency...")
db <- observedMutations(db=db, sequenceColumn=sequenceColumn,
germlineColumn=germlineColumn,
regionDefinition=regionDefinition,
frequency=TRUE, combine=TRUE,
cloneColumn = "fields_clone_id",
mutationDefinition=NULL, nproc=nproc)
muFreqColumn <- "mu_freq"
}
# Ensure that the nproc does not exceed the number of cores/CPUs available
nproc <- min(nproc, cpuCount())
# If user wants to paralellize this function and specifies nproc > 1, then
# initialize and register slave R processes/clusters &
# export all nesseary environment variables, functions and packages.
if (nproc == 1) {
# If needed to run on a single core/cpu then, regsiter DoSEQ
# (needed for 'foreach' in non-parallel mode)
registerDoSEQ()
} else {
if (nproc != 0) {
#cluster <- makeCluster(nproc, type="SOCK")
cluster <- parallel::makeCluster(nproc, type= "PSOCK")
}
parallel::clusterExport(cluster,
list('db', 'sequenceColumn', 'germlineColumn', 'muFreqColumn','juncLengthColumn',
'regionDefinition', 'regionDefinitionName', 'method', 'minimumFrequency','includeAmbiguous',
'breakTiesStochastic', 'breakTiesByColumns',
'calcClonalConsensus', 'consensusSequence', 'breakTiesHelper',
'chars2Ambiguous', 'nucs2IUPAC', 'IUPAC_DNA_2', 'NUCLEOTIDES_AMBIGUOUS',
'uniqueClonesIdx', 'c2s', 's2c','getCloneRegion'),
envir=environment() )
registerDoParallel(cluster,cores=nproc)
}
# Printing status to console
#cat("Collapsing clonal sequences...\n")
# avoid .combine="cbind"!
# if there is only 1 unique clone, .combind="cbind" will result in a vector (as opposed to
# a matrix) being returned, which will subsequently result a failure in
# cons_db$clonal_sequence <- cons_mat[, 1]
cons_mat <- foreach(idx=1:length(uniqueClonesIdx),
.verbose=FALSE, .errorhandling='stop') %dopar% {
cloneIdx <- uniqueClonesIdx[[idx]]
cloneDb <- db[cloneIdx, ]
clone_num <- unique(cloneDb[['fields_clone_id']])
# Verify the assumption that all sequences in the clone have the same
# junction length.
if (length(unique(cloneDb[[juncLengthColumn]])) > 1 ) {
stop("Expecting all sequences in the same clone with the same junction lenght.")
}
cloneRegionDefinition <- regionDefinition
if (regionDefinitionName %in% c("IMGT_VDJ_BY_REGIONS","IMGT_VDJ")) {
cloneRegionDefinition <- getCloneRegion(clone_num=clone_num,
db=cloneDb,
seq_col=sequenceColumn,
juncLengthColumn=juncLengthColumn,
clone_col='fields_clone_id',
regionDefinition=regionDefinition)
}
# collapse clone
calcClonalConsensus(db=cloneDb,
sequenceColumn=sequenceColumn,
germlineColumn=germlineColumn,
muFreqColumn=muFreqColumn,
regionDefinition=cloneRegionDefinition,
method=method,
minimumFrequency=minimumFrequency,
includeAmbiguous=includeAmbiguous,
breakTiesStochastic=breakTiesStochastic,
breakTiesByColumns=breakTiesByColumns)
}
# using cbind below will give a matrix with columns being clones
# use rbind to have rows be clones
# cols: inputCons, germlineCons, inputMuFreq
cons_mat <- do.call(rbind, cons_mat)
# Stop cluster
if(nproc > 1) { parallel::stopCluster(cluster) }
# Build return data.frame
if (expandedDb) {
# Fill all rows with the consensus sequence
clone_index <- match(db[["fields_clone_id"]], uniqueClones)
cons_db <- db
cons_db$clonal_sequence <- unlist(cons_mat[, 1])[clone_index]
cons_db$clonal_germline <- unlist(cons_mat[, 2])[clone_index]
# assign mutation frequency corresponding to consensus into clonal_sequence_mufreq
if (method %in% c("mostMutated", "leastMutated")) {
cons_db$clonal_sequence_mufreq <- unlist(cons_mat[, 3])[clone_index]
}
} else {
# Return only the first row of each clone
clone_index <- match(uniqueClones, db[["fields_clone_id"]])
cons_db <- db[clone_index, ]
cons_db$clonal_sequence <- unlist(cons_mat[, 1])
cons_db$clonal_germline <- unlist(cons_mat[, 2])
# assign mutation frequency corresponding to consensus into clonal_sequence_mufreq
if (method %in% c("mostMutated", "leastMutated")) {
cons_db$clonal_sequence_mufreq <- unlist(cons_mat[, 3])
}
}
cons_db %>%
arrange(!!rlang::sym("tmp_colclones_row_id")) %>%
select(-!!rlang::sym("tmp_colclones_row_id"), -!!rlang::sym("fields_clone_id"))
}
# Break ties given additional columns in db and functions to compute on them
#
# @param idx vector of indices.
# @param cols character vector of colnames. Currently, only columns containing
# numeric values are supported/expected.
# @param funs list of functions. Currently, only \code{max} and \code{min} are
# supported/expected.
# @param db \code{data.frame} containing columns named after \code{cols} with
# corresponding rows for \code{idx}.
#
# @return a single value from \code{idx}.
#
# @details Column by column, \code{breakTiesHelper} calls the corresponding function
# from \code{funs} on a column in \code{db} and finds the index/indices in
# \code{idx} that match(es) the returned value from the function. This stops
# when only a single matching index is obtained, or columns run out. In the
# latter case, the first remaining index is returned.
#
# testing
# expect index 18
# test.idx = c(2,4,18,37,102,76)
# test.db = data.frame(cbind(DUPCOUNT= c(3,5,5,4,5,1),
# CONSCOUNT=c(6,6,6,2,3,4),
# ERR=c(0.9, 0.14, 0.12, 0.07, 0.3, 0.5)))
# test.cols = c("DUPCOUNT", "CONSCOUNT", "ERR")
# test.funs = c(max, max, min)
# stopifnot( breakTiesHelper(test.idx, test.cols, test.funs, test.db)==18 )
# # make index 4 and 18 tie for ERR
# # index 4 is expected because it appears before 18
# test.db[3,"ERR"] = 0.14
# stopifnot( breakTiesHelper(test.idx, test.cols, test.funs, test.db)==4 )
#
breakTiesHelper <- function(idx, cols, funs, db) {
# debug
# idx=test.idx; cols=test.cols; funs=test.funs; db=test.db
counter <- 1
while (length(idx)>1 & counter<=length(cols)) {
cur.col <- cols[counter]
cur.fun <- funs[[counter]]
cur.db <- db[[cur.col]]
target <- cur.fun(cur.db)
tol <- 1e-5 # tolerance
target.idx <- which( abs(cur.db-target)<=tol ) # wrt idx & db
idx <- idx[target.idx]
db <- db[target.idx, ]
counter <- counter+1
}
if (length(idx)==1) {
return(idx)
} else if (length(idx)>1) {
#print("Failed to resolve ties.") # for testing/debugging
return(idx[1])
} else {
stop("breakTieHelper failed unexpectedly.")
}
}
#' Construct a consensus sequence
#'
#' @param sequences character vector of sequences.
#' @param db \code{data.frame} containing sequence data for a single clone.
#' Applicable to and required for the \code{"mostMutated"} and
#' \code{"leastMutated"} methods. Default is \code{NULL}.
#' @param method method to calculate consensus sequence. One of
#' \code{"thresholdedFreq"}, \code{"mostCommon"}, \code{"catchAll"},
#' \code{"mostMutated"}, or \code{"leastMutated"}. See "Methods" under
#' \link{collapseClones} for details.
#' @param minFreq frequency threshold for calculating input consensus sequence.
#' Applicable to and required for the \code{"thresholdedFreq"} method.
#' A canonical choice is 0.6. Default is \code{NULL}.
#' @param muFreqColumn \code{character} name of the column in db containing mutation
#' frequency. Applicable to and required for the \code{"mostMutated"}
#' and \code{"leastMutated"} methods. Default is \code{NULL}.
#' @param lenLimit limit on consensus length. if \code{NULL} then no length limit is set.
#' @param includeAmbiguous whether to use ambiguous characters to represent positions at
#' which there are multiple characters with frequencies that are at least
#' \code{minimumFrequency} or that are maximal (i.e. ties). Applicable to
#' and required for the \code{"thresholdedFreq"} and \code{"mostCommon"}
#' methods. Default is \code{FALSE}. See "Choosing ambiguous characters"
#' under \link{collapseClones} for rules on choosing ambiguous characters.
#' Note: this argument refers to the use of ambiguous nucleotides in the
#' output consensus sequence. Ambiguous nucleotides in the input sequences
#' are allowed for methods catchAll, mostMutated and leastMutated.
#' @param breakTiesStochastic In case of ties, whether to randomly pick a sequence from sequences that
#' fulfill the criteria as consensus. Applicable to and required for all methods
#' except for \code{"catchAll"}. Default is \code{FALSE}. See "Methods"
#' under \link{collapseClones} for details.
#' @param breakTiesByColumns A list of the form \code{list(c(col_1, col_2, ...), c(fun_1, fun_2, ...))},
#' where \code{col_i} is a \code{character} name of a column in \code{db},
#' and \code{fun_i} is a function to be applied on that column. Currently,
#' only \code{max} and \code{min} are supported. Note that the two \code{c()}'s
#' in \code{list()} are essential (i.e. if there is only 1 column, the list
#' should be of the form \code{list(c(col_1), c(func_1))}. Applicable to and
#' optional for the \code{"mostMutated"} and \code{"leastMutated"} methods.
#' If supplied, \code{fun_i}'s are applied on \code{col_i}'s to help break
#' ties. Default is \code{NULL}. See "Methods" under \link{collapseClones}
#' for details.
#'
#' @return A list containing \code{cons}, which is a character string that is the consensus sequence
#' for \code{sequences}; and \code{muFreq}, which is the maximal/minimal mutation frequency of
#' the consensus sequence for the \code{"mostMutated"} and \code{"leastMutated"} methods, or
#' \code{NULL} for all other methods.
#'
#' @details See \link{collapseClones} for detailed documentation on methods and additional parameters.
#'
#' @examples
#' # Subset example data
#' data(ExampleDb, package="alakazam")
#' db <- subset(ExampleDb, c_call %in% c("IGHA", "IGHG") & sample_id == "+7d")
#' clone <- subset(db, clone_id == "3192")
#'
#' # First compute mutation frequency for most/leastMutated methods
#' clone <- observedMutations(clone, frequency=TRUE, combine=TRUE)
#'
#' # Manually create a tie
#' clone <- rbind(clone, clone[which.max(clone$mu_freq), ])
#'
#' # ThresholdedFreq method.
#' # Resolve ties deterministically without using ambiguous characters
#' cons1 <- consensusSequence(clone$sequence_alignment,
#' method="thresholdedFreq", minFreq=0.3,
#' includeAmbiguous=FALSE,
#' breakTiesStochastic=FALSE)
#' cons1$cons
#'
#' @export
## DEBUG
# thresholdedFreq method, resolve ties deterministically using ambiguous characters
# consInput2 <- consensusSequence(clone$sequence_alignment,
# muFreqColumn=NULL, lenLimit=NULL,
# method="thresholdedFreq", minFreq=0.3,
# includeAmbiguous=TRUE,
# breakTiesStochastic=FALSE,
# breakTiesByColumns=NULL, db=NULL)$cons
# thresholdedFreq method, resolve ties stochastically
# consInput3 <- consensusSequence(clone$sequence_alignment,
# muFreqColumn=NULL, lenLimit=NULL,
# method="thresholdedFreq", minFreq=0.3,
# includeAmbiguous=FALSE,
# breakTiesStochastic=TRUE,
# breakTiesByColumns=NULL, db=NULL)$cons
# mostCommon method, resolve ties deterministically without using ambiguous characters
# consInput4 <- consensusSequence(clone$sequence_alignment,
# muFreqColumn=NULL, lenLimit=NULL,
# method="mostCommon", minFreq=NULL,
# includeAmbiguous=FALSE,
# breakTiesStochastic=FALSE,
# breakTiesByColumns=NULL, db=NULL)$cons
# mostCommon method, resolve ties deterministically using ambiguous characters
# consInput5 <- consensusSequence(clone$sequence_alignment,
# muFreqColumn=NULL, lenLimit=NULL,
# method="mostCommon", minFreq=NULL,
# includeAmbiguous=TRUE,
# breakTiesStochastic=FALSE,
# breakTiesByColumns=NULL, db=NULL)$cons
# mostCommon method, resolve ties stochastically
# consInput6 <- consensusSequence(clone$sequence_alignment,
# muFreqColumn=NULL, lenLimit=NULL,
# method="mostCommon", minFreq=NULL,
# includeAmbiguous=FALSE,
# breakTiesStochastic=TRUE,
# breakTiesByColumns=NULL, db=NULL)$cons
# catchAll method
# consInput7 <- consensusSequence(clone$sequence_alignment,
# muFreqColumn=NULL, lenLimit=NULL,
# method="catchAll", minFreq=NULL,
# includeAmbiguous=FALSE,
# breakTiesStochastic=FALSE,
# breakTiesByColumns=NULL, db=NULL)$cons
# mostMutated method, resolve ties stochastically
# consInput8 <- consensusSequence(clone$sequence_alignment,
# muFreqColumn="mu_freq", lenLimit=NULL,
# method="mostMutated", minFreq=NULL,
# includeAmbiguous=FALSE,
# breakTiesStochastic=TRUE,
# breakTiesByColumns=NULL, db=clone)$cons
# mostMutated method, resolve ties deterministically using additional columns
# consInput9 <- consensusSequence(clone$sequence_alignment,
# muFreqColumn="mu_freq", lenLimit=NULL,
# method="mostMutated", minFreq=NULL,
# includeAmbiguous=FALSE,
# breakTiesStochastic=FALSE,
# breakTiesByColumns=list(c("junction_length","duplicate_count"), c(max, max)),
# db=clone)$cons
# consInput10 <- consensusSequence(clone$sequence_alignment,
# muFreqColumn="mu_freq", lenLimit=NULL,
# method="mostMutated", minFreq=NULL,
# includeAmbiguous=FALSE,
# breakTiesStochastic=FALSE,
# breakTiesByColumns=list(c("duplicate_count"), c(max)),
# db=clone)$cons
# mostMutated method, resolve ties deterministically withou using additional columns
# consInput11 <- consensusSequence(clone$sequence_alignment,
# muFreqColumn="mu_freq", lenLimit=NULL,
# method="mostMutated", minFreq=NULL,
# includeAmbiguous=FALSE,
# breakTiesStochastic=FALSE,
# breakTiesByColumns=NULL, db=clone)$cons
consensusSequence <- function(sequences, db=NULL,
method=c("mostCommon", "thresholdedFreq", "catchAll", "mostMutated", "leastMutated"),
minFreq=NULL, muFreqColumn=NULL, lenLimit=NULL,includeAmbiguous=FALSE,
breakTiesStochastic=FALSE, breakTiesByColumns=NULL) {
# Check arguments
method <- match.arg(method)
# check muFreqColumn and get muFreq for most/leastMutated
if (method %in% c("mostMutated", "leastMutated")) {
if ( is.null(muFreqColumn) ) {
stop ("muFreqColumn must be specified when method is most/leastMutated.")
}
if ( is.null(db) ) {
stop ("db containing muFreqColumn must be supplied when method is most/leastMutated.")
}
if (!muFreqColumn %in% colnames(db)) {
print(c("Helper", muFreqColumn))
print(c("Helper", colnames(db)))
stop ("muFreqColumn must be a column present in db.")
}
# get muFreq
muFreq <- db[[muFreqColumn]]
}
numSeqs <- length(sequences)
##### if only one sequence in clone, return it
if (numSeqs==1) {
# restrict length if there is a lenLimit
if (!is.null(lenLimit)) {
consensus <- substr(sequences, 1, min(lenLimit, stri_length(sequences)))
} else {
# otherwise, return as is
consensus <- sequences
}
# return with mutation frequency (if applicable)
if (method %in% c("mostMutated", "leastMutated")) {
return(list(cons=consensus, muFreq=db[[muFreqColumn]]))
} else {
return(list(cons=consensus, muFreq=NULL))
}
}
##### if all sequences are the same, return now
if (length(unique(sequences))==1) {
# restrict length if there is a lenLimit
if (!is.null(lenLimit)) {
consensus <- substr(sequences[1], 1, min(lenLimit, stri_length(sequences)))
} else {
# otherwise, return as is
consensus <- sequences[1]
}
# return with mutation frequency (if applicable)
if (method %in% c("mostMutated", "leastMutated")) {
return(list(cons=consensus, muFreq=db[[muFreqColumn]][1]))
} else {
return(list(cons=consensus, muFreq=NULL))
}
}
##### length of longest sequence in sequences
lenSeqs <- stri_length(sequences)
lenMax <- max(lenSeqs, na.rm=T)
##### methods = thresholdedFreq, mostCommon, catchAll
if (method %in% c("thresholdedFreq", "mostCommon", "catchAll")) {
##### convert sequences to a matrix
# if there's no more nucleotide when a seq ends, fill position with NA
seqsMtx <- matrix(NA, nrow=numSeqs, ncol=lenMax)
for (i in 1:numSeqs) {
seqsMtx[i, 1:lenSeqs[i]] <- s2c(sequences[i])
}
##### tabulation matrix
# col: nucleotide position
# row: A,T,G,C,N,.,-,na (to distinguish from NA)
if (method != "catchAll") {
tabMtxRownames <- c("A","T","G","C","N",".","-","na")
} else {
# Allow for input ambiguous characters
tabMtxRownames <- c(NUCLEOTIDES_AMBIGUOUS,"na")
}
tabMtx <- matrix(0, ncol=lenMax, nrow=length(tabMtxRownames),
dimnames=list(tabMtxRownames, NULL))
## across sequences, at each nuc position, how many A, T, G, C, N, ., -?
# this does not capture NA
# for (j in 1:ncol(seqsMtx)) {
# tab <- table(seqsMtx[, j])
# r <- match(names(tab), tabMtxRownames)
# if (any(is.na(r))) {
# stop("Ambiguous nucleotides or unexpected characters found in `sequences`.")
# }
# tabMtx[r, j] <- tab
# }
# This is faster:
if (!all(na.omit(as.vector(seqsMtx)) %in% tabMtxRownames)) {
stop("Ambiguous nucleotides or unexpected characters found in `sequences`.")
}
tabMtx <- apply(seqsMtx, 2, function(j) {
sapply(tabMtxRownames, function(nt){
sum(j == nt, na.rm=T)
})
})
## across sequences, at each nuc position, how many NAs?
numNAs <- colSums(is.na(seqsMtx))
tabMtx["na", ] <- numNAs
# sanity check: counts at each nuc pos (colSum) should sum up to number of sequences
stopifnot( sum( colSums(tabMtx)==numSeqs ) == ncol(tabMtx) )
##### only keep positions at which majority of sequences contain information
### if there are odd number of n sequences, keep position if it has > floor(n/2) non-NAs
# e.g. 5 input sequences, >2 non-NA; 2=floor(5/2)
### if there are even number of n sequences, keep position if it has > n/2 non-NAs
# e.g. 6 input sequences, >3 non-NA; 3=6/2=floor(6/2)
numNonNAs <- numSeqs - numNAs
nonNA.keep <- numNonNAs > floor(numSeqs/2)
# length of longest possible consensus seq
lenConsensus <- sum(nonNA.keep)
if (lenConsensus==0) {
warning("Consensus cannot be produced. Empty string returned.")
return("")
}
##### if there is a lenLimit, restrict consensus length to
# the shorter of longest possible length and lenLimit
if (!is.null(lenLimit)) {
lenConsensus <- min(lenConsensus, lenLimit)
}
# drop=FALSE so that it works even with lenConsensus of 1
tabMtx <- tabMtx[, 1:lenConsensus, drop=FALSE]
### convert absolute count to fraction
tabMtx <- tabMtx/numSeqs
# remove "na" row
# drop=FALSE so that it works even with lenConsensus of 1
tabMtx <- tabMtx[-which(rownames(tabMtx)=="na"), , drop=FALSE]
if (method=="thresholdedFreq") {
#print(method) # for testing
# use as.matrix so that apply won't break with ncol(tabMtx)=1
consensus <- apply(as.matrix(tabMtx), 2, function(x){
idx <- which(x >= minFreq)
# if no character >= the threshold, assign an N
if (length(idx)==0) {
return("N")
# if there is no tie
} else if (length(idx)==1){
return(names(x)[idx])
# if there are ties (multiple characters >= the threhold)
} else if (length(idx)>1) {
# ambiguous character allowed
if (includeAmbiguous) {
return(chars2Ambiguous(tabMtxRownames[idx]))
# ambiguous characters not allowed
} else {
# stochastic
if (breakTiesStochastic) {
return(names(x)[sample(x=idx, size=1)])
# first one is returned
# the order is built-in from tabMtxRownames
} else {
return(names(x)[idx[1]])
}
}
}
})
} else if (method=="mostCommon") {
#print(method) # for testing
# use as.matrix so that apply won't break with ncol(tabMtx)=1
consensus <- apply(as.matrix(tabMtx), 2, function(x){
max.freq <- max(x)
tol <- 1e-5 # tolerance
max.idx <- which( abs(x-max.freq)<=tol )
# if there is no tie
if (length(max.idx)==1){
return(names(x)[max.idx])
# if there are ties (multiple characters with maximal frequency)
} else if (length(max.idx)>1) {
# ambiguous character allowed
if (includeAmbiguous) {
return(chars2Ambiguous(tabMtxRownames[max.idx]))
# ambiguous characters not allowed
} else {
# stochastic
if (breakTiesStochastic) {
return(names(x)[sample(x=max.idx, size=1)])
# first one is returned
# the order is built-in from tabMtxRownames
} else {
return(names(x)[max.idx[1]])
}
}
}
})
} else if (method=="catchAll") {
#print(method) # for testing
# use as.matrix so that apply won't break with ncol(tabMtx)=1
consensus <- apply(as.matrix(tabMtx), 2, function(x){
# all characters that appear at a position across sequences
nonZeroNucs <- rownames(tabMtx)[x>0]
# Disambiguate, except N
nonZeroNucs <- unique(unlist(c(IUPAC_DNA[names(IUPAC_DNA)!="N"],"."=".","-"="-","N"="N")[nonZeroNucs]))
# convert characters to (ambiguous) characters
return(chars2Ambiguous(nonZeroNucs))
})
}
# check there is no ambiguous characters if includeAmbiguous if F
if ( (method=="thresholdedFreq" | method=="mostCommon") & !includeAmbiguous ) {
ambiguous <- NUCLEOTIDES_AMBIGUOUS[!NUCLEOTIDES_AMBIGUOUS %in%
c("A","C","G","T","N","-",".")]
stopifnot( !any(consensus %in% ambiguous) )
}
# convert from character vector to string
consensus <- c2s(consensus)
# sanity check
stopifnot( stri_length(consensus)==lenConsensus )
}
##### methods = mostMutated, leastMutated
if (method %in% c("mostMutated", "leastMutated")) {
# if there's a lenLimit
# if a seq is longer than lenLimit, trim it; otherwise, leave it as is
if (!is.null(lenLimit)) {
idxLong <- which(lenSeqs > lenLimit)
sequences[idxLong] <- substr(sequences[idxLong], 1, lenLimit)
}
##### get index of sequences that fulfill the criterion
# muFreq should have been calculated being on sequences with restricted lengths as defined by
# regionDefinition (which gives rise to lenLimit)
if (method=="mostMutated") {
#print(method) # for testing
targetMuFreq <- max(muFreq)
} else if (method=="leastMutated") {
#print(method) # for testing
targetMuFreq <- min(muFreq)
}
tol <- 1e-5 # tolerance
idx <- which( abs(muFreq-targetMuFreq)<=tol )
##### if there are no ties
if (length(idx)==1) {
consensus <- sequences[idx]
##### if there are ties
} else if (length(idx)>1) {
### stochastic: randomly pick one from idx
if (breakTiesStochastic) {
consensus <- sequences[sample(x=idx, size=1)]
### deterministic: pick one from idx based on breakTiesByColumns
} else if (!is.null(breakTiesByColumns)) {
idx <- breakTiesHelper(idx=idx, cols=breakTiesByColumns[[1]],
funs=breakTiesByColumns[[2]], db=db[idx, ])
consensus <- sequences[idx]
### deterministic: pick first one from idx
} else {
consensus <- sequences[idx[1]]
}
}
}
# check length
if (!is.null(lenLimit)) {
stopifnot(stri_length(consensus) <= lenLimit)
}
if (method %in% c("mostMutated", "leastMutated")) {
return(list(cons=consensus, muFreq=targetMuFreq))
} else {
return(list(cons=consensus, muFreq=NULL))
}
}
# Calculate clonal consensus for a single clone
#
# Given an aligned set of input/observed sequences and an aligned set of germline sequences,
# generate an input/observed consensus and a germline consensus.
#
# @param db \code{data.frame} containing sequence data for a single clone.
# Required.
# @param sequenceColumn \code{character} name of the column containing input
# sequences. Required. The length of each input sequence should
# match that of its corresponding germline sequence.
# @param germlineColumn \code{character} name of the column containing germline
# sequences. Required. The length of each germline sequence should
# match that of its corresponding input sequence.
# @param muFreqColumn \code{character} name of the column containing mutation
# frequency. Applicable to and required for the \code{"mostMutated"}
# and \code{"leastMutated"} methods. Default is \code{NULL}. See
# "Details" for a note of caution.
# @param regionDefinition \link{RegionDefinition} object defining the regions and boundaries
# of the Ig sequences. Optional. Default is \code{NULL}.
# @param method method for calculating input consensus sequence. Required. One of
# \code{"thresholdedFreq"}, \code{"mostCommon"}, \code{"catchAll"},
# \code{"mostMutated"}, or \code{"leastMutated"}. See "Methods" under
# \link{collapseClones} for details.
# @param minimumFrequency frequency threshold for calculating input consensus sequence.
# Applicable to and required for the \code{"thresholdedFreq"} method.
# A canonical choice is 0.6. Default is \code{NULL}.
# @param includeAmbiguous whether to use ambiguous characters to represent positions at
# which there are multiple characters with frequencies that are at least
# \code{minimumFrequency} or that are maximal (i.e. ties). Applicable to
# and required for the \code{"thresholdedFreq"} and \code{"mostCommon"}
# methods. Default is \code{FALSE}. See "Choosing ambiguous characters"
# under \link{collapseClones} for rules on choosing ambiguous characters.
# @param breakTiesStochastic In case of ties, whether to randomly pick a sequence from sequences that
# fulfill the criteria as consensus. Applicable to and required for all methods
# except for \code{"catchAll"}. Default is \code{FALSE}. See "Methods"
# under \link{collapseClones} for details.
# @param breakTiesByColumns A list of the form \code{list(c(col_1, col_2, ...), c(fun_1, fun_2, ...))},
# where \code{col_i} is a \code{character} name of a column in \code{db},
# and \code{fun_i} is a function to be applied on that column. Currently,
# only \code{max} and \code{min} are supported. Applicable to and optional for
# the \code{"mostMutated"} and \code{"leastMutated"} methods. If supplied,
# \code{fun_i}'s are applied on \code{col_i}'s to help break ties. Default is
# \code{NULL}. See "Methods" under \link{collapseClones} for details.
#
# @return A named list of length 3. "inputCons" and "germlineCons" are the consensus sequences.
# The input and germline consensus sequences have the same length. "inputMuFreq" is the
# maximal/minimal mutation frequency for the input consensus for the \code{"mostMutated"}
# and \code{"leastMutated"} methods, and \code{NULL} for all other methods.
#
# @details See \link{collapseClones} for detailed documention on methods and additional parameters.
#
# Caution: when using the \code{"mostMutated"} and \code{"leastMutated"} methods, if you
# supply a \code{regionDefinition}, it is your responsibility to ensure that the mutation
# frequency in\code{muFreqColumn} was calculated with sequence lengths restricted to the
# \strong{same} \code{regionDefinition} you are supplying. Otherwise, the
# "most/least mutated" sequence you obtain might not be the most/least mutated given the
# \code{regionDefinition} supplied, because your mutation frequency was based on a
# \code{regionDefinition} different from the one supplied.
#
# @seealso
# See \link{collapseClones} for constructing consensus for all clones.
#
# @examples
# # Subset example data
# data(ExampleDb, package="alakazam")
# db <- subset(ExampleDb, c_call %in% c("IGHA", "IGHG") & sample_id == "+7d")
#
# # Data corresponding to a single clone
# clone <- db[db[["clone_id"]] == "3192", ]
# # Number of sequences in this clone
# nrow(clone)
# # compute mutation frequency for most/leastMutated methods
# clone <- observedMutations(db=clone, frequency=TRUE, combine=TRUE)
# # manually create a tie
# clone <- rbind(clone, clone[which.max(clone$mu_freq), ])
#
# # Get consensus input and germline sequences
# # thresholdedFreq method, resolve ties deterministically without using ambiguous characters
# cons1 <- calcClonalConsensus(db=clone,
# muFreqColumn=NULL, regionDefinition=NULL,
# method="thresholdedFreq",
# minimumFrequency=0.3, includeAmbiguous=FALSE,
# breakTiesStochastic=FALSE, breakTiesByColumns=NULL)
# # thresholdedFreq method, resolve ties deterministically using ambiguous characters
# cons2 <- calcClonalConsensus(db=clone,
# muFreqColumn=NULL, regionDefinition=NULL,
# method="thresholdedFreq",
# minimumFrequency=0.3, includeAmbiguous=TRUE,
# breakTiesStochastic=FALSE, breakTiesByColumns=NULL)
# # thresholdedFreq method, resolve ties stochastically
# cons3 <- calcClonalConsensus(db=clone,
# muFreqColumn=NULL, regionDefinition=NULL,
# method="thresholdedFreq",
# minimumFrequency=0.3, includeAmbiguous=FALSE,
# breakTiesStochastic=TRUE, breakTiesByColumns=NULL)
# # mostCommon method, resolve ties deterministically without using ambiguous characters
# cons4 <- calcClonalConsensus(db=clone,
# muFreqColumn=NULL, regionDefinition=NULL,
# method="mostCommon",
# minimumFrequency=NULL, includeAmbiguous=FALSE,
# breakTiesStochastic=FALSE, breakTiesByColumns=NULL)
# # mostCommon method, resolve ties deterministically using ambiguous characters
# cons5 <- calcClonalConsensus(db=clone,
# muFreqColumn=NULL, regionDefinition=NULL,
# method="mostCommon",
# minimumFrequency=NULL, includeAmbiguous=TRUE,
# breakTiesStochastic=FALSE, breakTiesByColumns=NULL)
# # mostCommon method, resolve ties stochastically
# cons6 <- calcClonalConsensus(db=clone,
# muFreqColumn=NULL, regionDefinition=NULL,
# method="mostCommon",
# minimumFrequency=NULL, includeAmbiguous=FALSE,
# breakTiesStochastic=TRUE, breakTiesByColumns=NULL)
# # catchAll method
# cons7 <- calcClonalConsensus(db=clone,
# muFreqColumn=NULL, regionDefinition=NULL,
# method="catchAll",
# minimumFrequency=NULL, includeAmbiguous=FALSE,
# breakTiesStochastic=FALSE, breakTiesByColumns=NULL)
# # mostMutated method, resolve ties stochastically
# cons8 <- calcClonalConsensus(db=clone,
# muFreqColumn="mu_freq", regionDefinition=NULL,
# method="mostMutated",
# minimumFrequency=NULL, includeAmbiguous=FALSE,
# breakTiesStochastic=TRUE, breakTiesByColumns=NULL)
# # mostMutated method, resolve ties deterministically using additional columns
# cons9 <- calcClonalConsensus(db=clone,
# muFreqColumn="mu_freq", regionDefinition=NULL,
# method="mostMutated",
# minimumFrequency=NULL, includeAmbiguous=FALSE,
# breakTiesStochastic=FALSE,
# breakTiesByColumns=list(c("junction_length", "duplicate_count"), c(max, max)))
# cons10 <- calcClonalConsensus(db=clone,
# muFreqColumn="mu_freq", regionDefinition=NULL,
# method="mostMutated",
# minimumFrequency=NULL, includeAmbiguous=FALSE,
# breakTiesStochastic=FALSE,
# breakTiesByColumns=list(c("duplicate_count"), c(max)))
# # mostMutated method, resolve ties deterministically without using additional columns
# cons11 <- calcClonalConsensus(db=clone,
# muFreqColumn="mu_freq", regionDefinition=NULL,
# method="mostMutated",
# minimumFrequency=NULL, includeAmbiguous=FALSE,
# breakTiesStochastic=FALSE, breakTiesByColumns=NULL)
# @export
calcClonalConsensus <- function(db,
sequenceColumn="sequence_alignment",
germlineColumn="germline_alignment_d_mask",
muFreqColumn=NULL,
regionDefinition=NULL,
method=c("mostCommon", "thresholdedFreq", "catchAll", "mostMutated", "leastMutated"),
minimumFrequency=NULL, includeAmbiguous=FALSE,
breakTiesStochastic=FALSE, breakTiesByColumns=NULL) {
method <- match.arg(method)
inputSeq <- db[[sequenceColumn]]
germlineSeq <- db[[germlineColumn]]
# length of seqs in inputSeq and those in germlineSeq should match
if ( sum(stri_length(inputSeq)==stri_length(germlineSeq)) != length(inputSeq) ) {
stop("Sequences in inputSeq and germlineSeq have different lengths.")
}
# Check region definition
if (!is.null(regionDefinition) & !is(regionDefinition, "RegionDefinition")) {
stop(deparse(substitute(regionDefinition)), " is not a valid RegionDefinition object")
}
# length limit from regionDefinition
if (!is.null(regionDefinition)) {
lenRegion <- regionDefinition@seqLength
} else {
lenRegion <- NULL
}
##### get consensus germline sequence (most common)
# NULL for minFreq and muFreqColumn b/c mostCommon definitely doesn't need them
germCons <- consensusSequence(germlineSeq, minFreq=NULL, lenLimit=lenRegion,
method="mostCommon",
includeAmbiguous=includeAmbiguous,
breakTiesStochastic=breakTiesStochastic,
breakTiesByColumns=NULL,
muFreqColumn=NULL, db=NULL)$cons
##### get consensus observed sequence
inputConsMuFreq <- consensusSequence(inputSeq, minFreq=minimumFrequency, lenLimit=lenRegion,
method=method,
includeAmbiguous=includeAmbiguous,
breakTiesStochastic=breakTiesStochastic,
breakTiesByColumns=breakTiesByColumns,
muFreqColumn=muFreqColumn, db=db)
inputCons <- inputConsMuFreq$cons
inputMuFreq <- inputConsMuFreq$muFreq
if (method %in% c("mostMutated", "leastMutated")) {
# possible to have inputCons and germCons of varying lengths
# germCons (mostCommon) length is "longest possible length" for mostCommon
# inputCons length is min of length of most/least mutated and lenLimit
# if different, trim the two to same length
lenInput <- stri_length(inputCons)
lenGerm <- stri_length(germCons)
if (lenInput != lenGerm) {
minLen <- min(lenInput, lenGerm)
inputCons <- substr(inputCons, 1, minLen)
germCons <- substr(germCons, 1, minLen)
}
}
# sanity check: length of germCons and inputCons should be the same
# all methods other than most/leastMutated should expect same lengths of inputCons & germCons
stopifnot( stri_length(germCons)==stri_length(inputCons) )
return(list("inputCons"=inputCons, "germlineCons"=germCons, "inputMuFreq"=inputMuFreq))
}
#### Mutation counting functions ####
#' Calculate observed numbers of mutations
#'
#' \code{observedMutations} calculates the observed number of mutations for each
#' sequence in the input \code{data.frame}.
#'
#' @param db \code{data.frame} containing sequence data.
#' @param sequenceColumn \code{character} name of the column containing input
#' sequences. IUPAC ambiguous characters for DNA are
#' supported.
#' @param germlineColumn \code{character} name of the column containing
#' the germline or reference sequence. IUPAC ambiguous
#' characters for DNA are supported.
#' @param regionDefinition \link{RegionDefinition} object defining the regions
#' and boundaries of the Ig sequences. If NULL, mutations
#' are counted for entire sequence. To use regions definitions,
#' sequences in \code{sequenceColum} and \code{germlineColumn}
#' must be aligned, following the IMGT schema.
#' @param mutationDefinition \link{MutationDefinition} object defining replacement
#' and silent mutation criteria. If \code{NULL} then
#' replacement and silent are determined by exact
#' amino acid identity.
#' @param ambiguousMode whether to consider ambiguous characters as
#' \code{"either or"} or \code{"and"} when determining and
#' counting the type(s) of mutations. Applicable only if
#' \code{sequenceColumn} and/or \code{germlineColumn}
#' contain(s) ambiguous characters. One of
#' \code{c("eitherOr", "and")}. Default is \code{"eitherOr"}.
#' @param frequency \code{logical} indicating whether or not to calculate
#' mutation frequencies. Default is \code{FALSE}.
#' @param combine \code{logical} indicating whether for each sequence should
#' the mutation counts for the different regions (CDR, FWR) and
#' mutation types be combined and return one value of
#' count/frequency per sequence instead of
#' multiple values. Default is \code{FALSE}.
#' @param nproc number of cores to distribute the operation over. If the
#' cluster has already been set the call function with
#' \code{nproc} = 0 to not reset or reinitialize. Default is
#' \code{nproc} = 1.
#' @param cloneColumn clone id column name in \code{db}
#' @param juncLengthColumn junction length column name in \code{db}
#'
#' @return A modified \code{db} \code{data.frame} with observed mutation counts for each
#' sequence listed. The columns names are dynamically created based on the
#' regions in the \code{regionDefinition}. For example, when using the
#' \link{IMGT_V} definition, which defines positions for CDR and
#' FWR, the following columns are added:
#' \itemize{
#' \item \code{mu_count_cdr_r}: number of replacement mutations in CDR1 and
#' CDR2 of the V-segment.
#' \item \code{mu_count_cdr_s}: number of silent mutations in CDR1 and CDR2
#' of the V-segment.
#' \item \code{mu_count_fwr_r}: number of replacement mutations in FWR1,
#' FWR2 and FWR3 of the V-segment.
#' \item \code{mu_count_fwr_s}: number of silent mutations in FWR1, FWR2 and
#' FWR3 of the V-segment.
#' }
#' If \code{frequency=TRUE}, R and S mutation frequencies are
#' calculated over the number of non-N positions in the specified regions.
#' \itemize{
#' \item \code{mu_freq_cdr_r}: frequency of replacement mutations in CDR1 and
#' CDR2 of the V-segment.
#' \item \code{mu_freq_cdr_s}: frequency of silent mutations in CDR1 and CDR2
#' of the V-segment.
#' \item \code{mu_freq_fwr_r}: frequency of replacement mutations in FWR1,
#' FWR2 and FWR3 of the V-segment.
#' \item \code{mu_freq_fwr_s}: frequency of silent mutations in FWR1, FWR2 and
#' FWR3 of the V-segment.
#' }
#' If \code{frequency=TRUE} and \code{combine=TRUE}, the mutations and non-N positions
#' are aggregated and a single \code{mu_freq} value is returned
#' \itemize{
#' \item \code{mu_freq}: frequency of replacement and silent mutations in the
#' specified region
#' }
#'
#' @details
#' Mutation counts are determined by comparing a reference sequence to the input sequences in the
#' column specified by \code{sequenceColumn}. See \link{calcObservedMutations} for more technical details,
#' \strong{including criteria for which sequence differences are included in the mutation
#' counts and which are not}.
#'
#' The mutations are binned as either replacement (R) or silent (S) across the different
#' regions of the sequences as defined by \code{regionDefinition}. Typically, this would
#' be the framework (FWR) and complementarity determining (CDR) regions of IMGT-gapped
#' nucleotide sequences. Mutation counts are appended to the input \code{db} as
#' additional columns.
#'
#' If \code{db} includes lineage information, such as the \code{parent_sequence} column created by
#' \link{makeGraphDf}, the reference sequence can be set to use that field as reference sequence
#' using the \code{germlineColumn} argument.
#'
#' @seealso
#' \link{calcObservedMutations} is called by this function to get the number of mutations
#' in each sequence grouped by the \link{RegionDefinition}.
#' See \link{IMGT_SCHEMES} for a set of predefined \link{RegionDefinition} objects.
#' See \link{expectedMutations} for calculating expected mutation frequencies.
#' See \link{makeGraphDf} for creating the field \code{parent_sequence}.
#'
#' @examples
#' # Subset example data
#' data(ExampleDb, package="alakazam")
#' db <- ExampleDb[1:10, ]
#'
#' # Calculate mutation frequency over the entire sequence
#' db_obs <- observedMutations(db, sequenceColumn="sequence_alignment",
#' germlineColumn="germline_alignment_d_mask",
#' frequency=TRUE,
#' nproc=1)
#'
#' # Count of V-region mutations split by FWR and CDR
#' # With mutations only considered replacement if charge changes
#' db_obs <- observedMutations(db, sequenceColumn="sequence_alignment",
#' germlineColumn="germline_alignment_d_mask",
#' regionDefinition=IMGT_V,
#' mutationDefinition=CHARGE_MUTATIONS,
#' nproc=1)
#'
#' # Count of VDJ-region mutations, split by FWR and CDR
#' db_obs <- observedMutations(db, sequenceColumn="sequence_alignment",
#' germlineColumn="germline_alignment_d_mask",
#' regionDefinition=IMGT_VDJ,
#' nproc=1)
#'
#' # Extend data with lineage information
#' data(ExampleTrees, package="alakazam")
#' graph <- ExampleTrees[[17]]
#' clone <- alakazam::makeChangeoClone(subset(ExampleDb, clone_id == graph$clone))
#' gdf <- makeGraphDf(graph, clone)
#'
#' # Count of mutations between observed sequence and immediate ancenstor
#' db_obs <- observedMutations(gdf, sequenceColumn="sequence",
#' germlineColumn="parent_sequence",
#' regionDefinition=IMGT_VDJ,
#' nproc=1)
#'
#' @export
observedMutations <- function(db,sequenceColumn = "sequence_alignment",
germlineColumn = "germline_alignment_d_mask",
regionDefinition=NULL, mutationDefinition = NULL,
ambiguousMode = c("eitherOr", "and"),
frequency = FALSE, combine = FALSE, nproc = 1,
cloneColumn = "clone_id",
juncLengthColumn = "junction_length") {
ambiguousMode <- match.arg(ambiguousMode)
check <- checkColumns(db, c(sequenceColumn, germlineColumn))
if (check != TRUE) { stop(check) }
regionDefinitionName <- ""
if (!is.null(regionDefinition)) {
regionDefinitionName <- regionDefinition@name
# Message if sequences don't have gaps or Ns (because makeChangeo clone
# masks IMGT gaps) as a proxy to detect not IMGT aligned sequences
if (all(!grepl("[\\.Nn]",db[[sequenceColumn]]))) {
warning("No IMGT gaps detected in ",sequenceColumn,".\nSequences in ",
sequenceColumn," and ", germlineColumn,
" should be aligned, with gaps (.,N or n) following the IMGT numbering scheme.")
}
if (all(!grepl("[\\.Nn]",db[[germlineColumn]]))) {
warning("No IMGT gaps detected in ",germlineColumn,
".\nSequences in ", sequenceColumn," and ", germlineColumn,
" should be aligned, with gaps (., N or n) following the IMGT numbering scheme.")
}
not_na <- !is.na(db[[germlineColumn]])
if (!all.equal(nchar(db[[sequenceColumn]][not_na]), nchar(db[[germlineColumn]][not_na]))) {
warning("Pairs of ", sequenceColumn, " and ", germlineColumn, " sequences with different lengths found.")
stop("Expecting IMGT aligned, same length sequences in ", sequenceColumn, " and ", germlineColumn,".")
}
}
# Check region definition
if (!is.null(regionDefinition) & !is(regionDefinition, "RegionDefinition")) {
stop(deparse(substitute(regionDefinition)), " is not a valid RegionDefinition object")
}
# Check if mutation count/freq columns already exist
# and throw overwritting warning
if (!is.null(regionDefinition)) {
labels <- regionDefinition@labels
} else {
labels <- makeNullRegionDefinition()@labels
}
if (frequency == TRUE) {
if (combine) {
labels <- "mu_freq"
} else {
labels <- paste("mu_freq_", labels, sep="")
}
} else {
if (combine) {
labels <- "mu_count"
} else {
labels <- paste("mu_count_", labels, sep="")
}
}
label_exists <- labels[labels %in% colnames(db)]
if (length(label_exists)>0) {
warning(paste0("Columns ",
paste(label_exists, collapse=", "),
" exist and will be overwritten")
)
db[,label_exists] <- NULL
}
# Check mutation definition
if (!is.null(mutationDefinition) & !is(mutationDefinition, "MutationDefinition")) {
stop(deparse(substitute(mutationDefinition)), " is not a valid MutationDefinition object")
}
# Convert sequence columns to uppercase
db <- toupperColumns(db, c(sequenceColumn, germlineColumn))
db$tmp_obsmu_row_id <- 1:nrow(db)
# If the user has previously set the cluster and does not wish to reset it
if(!is.numeric(nproc)){
cluster <- nproc
nproc <- 0
}
# Ensure that the nproc does not exceed the number of cores/CPUs available
nproc <- min(nproc, cpuCount())
# If user wants to paralellize this function and specifies nproc > 1, then
# initialize and register slave R processes/clusters &
# export all nesseary environment variables, functions and packages.
if (nproc > 1) {
cluster <- parallel::makeCluster(nproc, type = "PSOCK")
parallel::clusterExport(cluster, list('db', 'sequenceColumn', 'germlineColumn',
'regionDefinition', 'regionDefinitionName',
'frequency', 'combine',
'ambiguousMode',
'calcObservedMutations','s2c','c2s','NUCLEOTIDES',
'NUCLEOTIDES_AMBIGUOUS', 'IUPAC2nucs',
'EXPANDED_AMBIGUOUS_CODONS',
'makeNullRegionDefinition', 'mutationDefinition',
'getCodonPos','getContextInCodon','mutationType',
'AMINO_ACIDS',
'binMutationsByRegion', 'countNonNByRegion','setRegionBoundaries','IMGT_V_BY_REGIONS'),
envir=environment())
registerDoParallel(cluster,cores=nproc)
} else if (nproc == 1) {
# If needed to run on a single core/cpu then, regsiter DoSEQ
# (needed for 'foreach' in non-parallel mode)
registerDoSEQ()
}
# Printing status to console
#cat("Calculating observed number of mutations...\n")
# Identify all the mutations in the sequences
numbOfSeqs <- nrow(db)
observedMutations_list <-
foreach(idx=iterators::icount(numbOfSeqs)) %dopar% {
rd <- regionDefinition
if (regionDefinitionName %in% c("IMGT_VDJ_BY_REGIONS","IMGT_VDJ")) {
rd <- setRegionBoundaries(juncLength = db[[juncLengthColumn]][idx],
sequenceImgt = db[[sequenceColumn]][idx],
regionDefinition=regionDefinition)
}
oM <- calcObservedMutations(db[[sequenceColumn]][idx],
db[[germlineColumn]][idx],
frequency=frequency & !combine,
regionDefinition=rd,
mutationDefinition=mutationDefinition,
returnRaw=combine,
ambiguousMode=ambiguousMode)
this_row_id <- db[['tmp_obsmu_row_id']][idx]
if (combine) {
num_mutations <- 0
if (!all(is.na(oM$pos))) {
num_mutations <- sum(oM$pos$r, oM$pos$s)
}
if (!frequency) {
c("mu_count"=num_mutations, "tmp_obsmu_row_id"=this_row_id)
} else {
num_nonN <- sum(oM$nonN)
mu_freq <- num_mutations/num_nonN
c("mu_freq"=mu_freq, "tmp_obsmu_row_id"=this_row_id)
}
} else {
oM['tmp_obsmu_row_id'] <- this_row_id
oM
}
}
# Convert list of mutations to data.frame
if (combine) {
labels_length <- 2 # mutation count and tmp_obsmu_row_id
} else if (!is.null(regionDefinition)) {
labels_length <- length(regionDefinition@labels) + 1 # +1 for tmp_obsmu_row_id
} else{
#labels_length=1
labels_length <- length(makeNullRegionDefinition()@labels) +1 # +1 for tmp_obsmu_row_id
}
# Convert mutation vector list to a matrix
observed_mutations <- as.data.frame(do.call(rbind, lapply(observedMutations_list, function(x) {
length(x) <- labels_length
return(x) })), stringsAsFactors=F)
#observed_mutations <- t(sapply(observedMutations_list, c))
sep <- "_"
if (ncol(observed_mutations) > 2) sep <- "_"
observed_mutations[is.na(observed_mutations)] <- 0
col_names <- colnames(observed_mutations)
mu_col_names <- col_names != "tmp_obsmu_row_id"
if (frequency == TRUE) {
idx <- which(colnames(observed_mutations)[mu_col_names] != "mu_freq")
if (length(idx)>0){
colnames(observed_mutations)[mu_col_names][idx] <- gsub("_$","",paste("mu_freq", col_names[mu_col_names][idx], sep=sep))
}
} else {
idx <- which(colnames(observed_mutations)[mu_col_names] != "mu_count")
if (length(idx)>0) {
colnames(observed_mutations)[mu_col_names] <- gsub("_$","",paste("mu_count", col_names[mu_col_names][idx], sep=sep))
}
}
# Properly shutting down the cluster
if (nproc > 1) { parallel::stopCluster(cluster) }
# Bind the observed mutations to db
db_new <- db %>%
ungroup() %>%
left_join(observed_mutations, by="tmp_obsmu_row_id") %>%
arrange(!!rlang::sym("tmp_obsmu_row_id")) %>%
select(-!!rlang::sym("tmp_obsmu_row_id"))
return(db_new)
}
#' Count the number of observed mutations in a sequence.
#'
#' \code{calcObservedMutations} determines all the mutations in a given input sequence
#' compared to its germline sequence.
#'
#' @param inputSeq input sequence. IUPAC ambiguous characters for DNA are
#' supported.
#' @param germlineSeq germline sequence. IUPAC ambiguous characters for DNA
#' are supported.
#' @param regionDefinition \link{RegionDefinition} object defining the regions
#' and boundaries of the Ig sequences. Note, only the part of
#' sequences defined in \code{regionDefinition} are analyzed.
#' If NULL, mutations are counted for entire sequence.
#' @param mutationDefinition \link{MutationDefinition} object defining replacement
#' and silent mutation criteria. If \code{NULL} then
#' replacement and silent are determined by exact
#' amino acid identity.
#' @param ambiguousMode whether to consider ambiguous characters as
#' \code{"either or"} or \code{"and"} when determining and
#' counting the type(s) of mutations. Applicable only if
#' \code{inputSeq} and/or \code{germlineSeq}
#' contain(s) ambiguous characters. One of
#' \code{c("eitherOr", "and")}. Default is \code{"eitherOr"}.
#' @param returnRaw return the positions of point mutations and their
#' corresponding mutation types, as opposed to counts of
#' mutations across positions. Also returns the number of
#' bases used as the denominator when calculating frequency.
#' Default is \code{FALSE}.
#' @param frequency \code{logical} indicating whether or not to calculate
#' mutation frequencies. The denominator used is the number
#' of bases that are not one of "N", "-", or "." in either
#' the input or the germline sequences. If set, this
#' overwrites \code{returnRaw}. Default is \code{FALSE}.
#'
#' @return For \code{returnRaw=FALSE}, an \code{array} with the numbers of replacement (R)
#' and silent (S) mutations.
#'
#' For \code{returnRaw=TRUE}, a list containing
#' \itemize{
#' \item \code{$pos}: A data frame whose columns (\code{position}, \code{r},
#' \code{s}, and \code{region}) indicate, respecitively, the nucleotide
#' position, the number of R mutations at that position, the number of S
#' mutations at that position, and the region in which that nucleotide
#' is in.
#' \item \code{$nonN}: A vector indicating the number of bases in regions
#' defined by \code{regionDefinition} (excluding non-triplet overhang,
#' if any) that are not one of "N", "-", or "." in either the
#' \code{inputSeq} or \code{germlineSeq}.
#' }
#'
#' For \code{frequency=TRUE}, regardless of \code{returnRaw}, an \code{array}
#' with the frequencies of replacement (R) and silent (S) mutations.
#'
#' @details
#' \strong{Each mutation is considered independently in the germline context}. For illustration,
#' consider the case where the germline is \code{TGG} and the observed is \code{TAC}.
#' When determining the mutation type at position 2, which sees a change from \code{G} to
#' \code{A}, we compare the codon \code{TGG} (germline) to \code{TAG} (mutation at position
#' 2 independent of other mutations in the germline context). Similarly, when determining
#' the mutation type at position 3, which sees a change from \code{G} to \code{C}, we
#' compare the codon \code{TGG} (germline) to \code{TGC} (mutation at position 3 independent
#' of other mutations in the germline context).
#'
#' If specified, only the part of \code{inputSeq} defined in \code{regionDefinition} is
#' analyzed. For example, when using the default \link{IMGT_V} definition, then mutations
#' in positions beyond 312 will be ignored. Additionally, non-triplet overhang at the
#' sequence end is ignored.
#'
#' Only replacement (R) and silent (S) mutations are included in the results. \strong{Excluded}
#' are:
#' \itemize{
#' \item Stop mutations
#'
#' E.g.: the case where \code{TAGTGG} is observed for the germline \code{TGGTGG}.
#'
#' \item Mutations occurring in codons where one or both of the observed and the
#' germline involve(s) one or more of "N", "-", or ".".
#'
#' E.g.: the case where \code{TTG} is observed for the germline being any one of
#' \code{TNG}, \code{.TG}, or \code{-TG}. Similarly, the case where any one of
#' \code{TTN}, \code{TT.}, or \code{TT-} is observed for the germline \code{TTG}.
#'
#' }
#' In other words, a result that is \code{NA} or zero indicates absence of R and S mutations,
#' not necessarily all types of mutations, such as the excluded ones mentioned above.
#'
#' \code{NA} is also returned if \code{inputSeq} or \code{germlineSeq} is shorter than 3
#' nucleotides.
#'
#' @section Ambiguous characters:
#' When there are ambiguous characters present, the user could choose how mutations involving
#' ambiguous characters are counted through \code{ambiguousMode}. The two available modes
#' are \code{"eitherOr"} and \code{"and"}.
#' \itemize{
#' \item With \code{"eitherOr"}, ambiguous characters are each expanded but only
#' 1 mutation is recorded. When determining the type of mutation, the
#' priority for different types of mutations, in decreasing order, is as follows:
#' no mutation, replacement mutation, silent mutation, and stop mutation.
#'
#' When counting the number of non-N, non-dash, and non-dot positions, each
#' position is counted only once, regardless of the presence of ambiguous
#' characters.
#'
#' As an example, consider the case where \code{germlineSeq} is \code{"TST"} and
#' \code{inputSeq} is \code{"THT"}. Expanding \code{"H"} at position 2 in
#' \code{inputSeq} into \code{"A"}, \code{"C"}, and \code{"T"}, as well as
#' expanding \code{"S"} at position 2 in \code{germlineSeq} into \code{"C"} and
#' \code{"G"}, one gets:
#'
#' \itemize{
#' \item \code{"TCT"} (germline) to \code{"TAT"} (observed): replacement
#' \item \code{"TCT"} (germline) to \code{"TCT"} (observed): no mutation
#' \item \code{"TCT"} (germline) to \code{"TTT"} (observed): replacement
#' \item \code{"TGT"} (germline) to \code{"TAT"} (observed): replacement
#' \item \code{"TGT"} (germline) to \code{"TCT"} (observed): replacement
#' \item \code{"TGT"} (germline) to \code{"TTT"} (observed): replacement
#' }
#'
#' Because "no mutation" takes priority over replacement mutation, the final
#' mutation count returned for this example is \code{NA} (recall that only R and
#' S mutations are returned). The number of non-N, non-dash, and non-dot
#' positions is 3.
#'
#' \item With \code{"and"}, ambiguous characters are each expanded and mutation(s)
#' from all expansions are recorded.
#'
#' When counting the number of non-N, non-dash, and non-dot positions, if a
#' position contains ambiguous character(s) in \code{inputSeq} and/or
#' \code{germlineSeq}, the count at that position is taken to be the total
#' number of combinations of germline and observed codons after expansion.
#'
#' Using the same example from above, the final result returned for this example
#' is that there are 5 R mutations at position 2. The number of non-N, non-dash,
#' and non-dot positions is 8, since there are 6 combinations stemming from
#' position 2 after expanding the germline codon (\code{"TST"}) and the observed
#' codon (\code{"THT"}).
#' }
#'
#' @seealso See \link{observedMutations} for counting the number of observed mutations
#' in a \code{data.frame}.
#'
#' @examples
#' # Use an entry in the example data for input and germline sequence
#' data(ExampleDb, package="alakazam")
#' in_seq <- ExampleDb[["sequence_alignment"]][100]
#' germ_seq <- ExampleDb[["germline_alignment_d_mask"]][100]
#'
#' # Identify all mutations in the sequence
#' ex1_raw <- calcObservedMutations(in_seq, germ_seq, returnRaw=TRUE)
#' # Count all mutations in the sequence
#' ex1_count <- calcObservedMutations(in_seq, germ_seq, returnRaw=FALSE)
#' ex1_freq <- calcObservedMutations(in_seq, germ_seq, returnRaw=FALSE, frequency=TRUE)
#' # Compare this with ex1_count
#' table(ex1_raw$pos$region, ex1_raw$pos$r)[, "1"]
#' table(ex1_raw$pos$region, ex1_raw$pos$s)[, "1"]
#' # Compare this with ex1_freq
#' table(ex1_raw$pos$region, ex1_raw$pos$r)[, "1"]/ex1_raw$nonN
#' table(ex1_raw$pos$region, ex1_raw$pos$s)[, "1"]/ex1_raw$nonN
#'
#' # Identify only mutations the V segment minus CDR3
#' ex2_raw <- calcObservedMutations(in_seq, germ_seq,
#' regionDefinition=IMGT_V, returnRaw=TRUE)
#' # Count only mutations the V segment minus CDR3
#' ex2_count <- calcObservedMutations(in_seq, germ_seq,
#' regionDefinition=IMGT_V, returnRaw=FALSE)
#' ex2_freq <- calcObservedMutations(in_seq, germ_seq,
#' regionDefinition=IMGT_V, returnRaw=FALSE,
#' frequency=TRUE)
#' # Compare this with ex2_count
#' table(ex2_raw$pos$region, ex2_raw$pos$r)[, "1"]
#' table(ex2_raw$pos$region, ex2_raw$pos$s)[, "1"]
#' # Compare this with ex2_freq
#' table(ex2_raw$pos$region, ex2_raw$pos$r)[, "1"]/ex2_raw$nonN
#' table(ex2_raw$pos$region, ex2_raw$pos$s)[, "1"]/ex2_raw$nonN
#'
#' # Identify mutations by change in hydropathy class
#' ex3_raw <- calcObservedMutations(in_seq, germ_seq, regionDefinition=IMGT_V,
#' mutationDefinition=HYDROPATHY_MUTATIONS,
#' returnRaw=TRUE)
#' # Count mutations by change in hydropathy class
#' ex3_count <- calcObservedMutations(in_seq, germ_seq, regionDefinition=IMGT_V,
#' mutationDefinition=HYDROPATHY_MUTATIONS,
#' returnRaw=FALSE)
#' ex3_freq <- calcObservedMutations(in_seq, germ_seq, regionDefinition=IMGT_V,
#' mutationDefinition=HYDROPATHY_MUTATIONS,
#' returnRaw=FALSE, frequency=TRUE)
#' # Compre this with ex3_count
#' table(ex3_raw$pos$region, ex3_raw$pos$r)[, "1"]
#' table(ex3_raw$pos$region, ex3_raw$pos$s)[, "1"]
#' # Compare this with ex3_freq
#' table(ex3_raw$pos$region, ex3_raw$pos$r)[, "1"]/ex3_raw$nonN
#' table(ex3_raw$pos$region, ex3_raw$pos$s)[, "1"]/ex3_raw$nonN
#'
#' @export
calcObservedMutations <- function(inputSeq, germlineSeq,
regionDefinition=NULL, mutationDefinition=NULL,
ambiguousMode=c("eitherOr", "and"),
returnRaw=FALSE, frequency=FALSE) {
ambiguousMode <- match.arg(ambiguousMode)
if (is.na(inputSeq)) {
inputSeq <- ""
}
if (is.na(germlineSeq)) {
inputSeq <- ""
}
# Check region definition
if (!is.null(regionDefinition) & !is(regionDefinition, "RegionDefinition")) {
stop(deparse(substitute(regionDefinition)), " is not a valid RegionDefinition object")
}
# Check mutation definition
if (!is.null(mutationDefinition) & !is(mutationDefinition, "MutationDefinition")) {
stop(deparse(substitute(mutationDefinition)), " is not a valid MutationDefinition object")
}
# IMPORTANT: convert to uppercase
# NUCLEOTIDES, NUCLEOTIDES_AMBIGUOUS are in uppercases only
inputSeq <- toupper(inputSeq)
germlineSeq <- toupper(germlineSeq)
# Assign mutation definition
aminoAcidClasses <- if (is.null(mutationDefinition)) { NULL } else { mutationDefinition@classes }
# Removing IMGT gaps (they should come in threes)
# After converting ... to ZZZ any other . is not an IMGT gap & will be treated like N
germlineSeq <- gsub("\\.\\.\\.", "ZZZ", germlineSeq)
#If there is a single gap left convert it to an N
germlineSeq <- gsub("\\.", "N", germlineSeq)
# Re-assigning s_germlineSeq (now has all "." that are not IMGT gaps converted to Ns)
germlineSeq <- gsub("ZZZ", "...", germlineSeq)
# Removing IMGT gaps (they should come in threes)
# After converting ... to ZZZ any other . is not an IMGT gap & will be treated like N
inputSeq <- gsub("\\.\\.\\.", "ZZZ", inputSeq)
#If there is a single gap left convert it to an N
inputSeq <- gsub("\\.", "N", inputSeq)
# Re-assigning s_germlineSeq (now has all "." that are not IMGT gaps converted to Ns)
inputSeq <- gsub("ZZZ", "...", inputSeq)
# Trim the input and germline sequence to the shortest
len_inputSeq <- stri_length(inputSeq)
len_germlineSeq <- stri_length(germlineSeq)
# If a regionDefinition is passed,
# then only analyze till the end of the defined length
if(!is.null(regionDefinition)) {
rdLength <- regionDefinition@seqLength
} else {
rdLength <- max(len_inputSeq, len_germlineSeq, na.rm=TRUE)
# Create full sequence RegionDefinition object
regionDefinition <- makeNullRegionDefinition(rdLength)
}
len_shortest <- min(c(len_inputSeq, len_germlineSeq, rdLength), na.rm=TRUE)
c_inputSeq <- s2c(inputSeq)[1:len_shortest]
c_germlineSeq <- s2c(germlineSeq)[1:len_shortest]
# If the sequence and germline (which now should be the same length) is shorter
# than the rdLength, pad it with Ns
if(len_shortest<rdLength){
fillWithNs <- array("N",rdLength-len_shortest)
c_inputSeq <- c( c_inputSeq, fillWithNs)
c_germlineSeq <- c( c_germlineSeq, fillWithNs)
}
# length of c_inputSeq and c_germlineSeq should be multiples of 3; if not, trim
# at this point, c_inputSeq and c_germlineSeq have the same length
# this is NECESSARY because otherwise the example below could happen:
# inputSeq 400..402 (codon 134) is "G " (no info at 401 and 402);
# c_inputSeq_codons 400..402 will end up being "G" NA NA,
# which will be turned by the code strsplit(gsub... into
# "GNA" "NA" (2 codons!)
stopifnot(length(c_inputSeq)==length(c_germlineSeq))
seqLen <- length(c_inputSeq)
# return NA if seqLen shorter than one complete codon
# consistent with policy that non-triplet overhang is ignored
if (seqLen<3) {
tooShort <- TRUE
} else {
tooShort <- FALSE
# if there's non-triplet overhang, trim/ignore
if ( (seqLen%%3)!=0 ) {
c_inputSeq <- c_inputSeq[ 1:(seqLen-(seqLen%%3)) ]
c_germlineSeq <- c_germlineSeq[ 1:(seqLen-(seqLen%%3)) ]
}
stopifnot( (length(c_inputSeq)%%3)==0 )
stopifnot( (length(c_germlineSeq)%%3)==0 )
}
mutations_array_raw <- NA
mutations_array <- setNames(object=rep(NA, length(regionDefinition@labels)),
nm=regionDefinition@labels)
if (!tooShort) {
# locate mutations
# germline is one of ATGCN and IUPAC ambiguous characters
# input is one of ATGCN and IUPAC ambiguous characters
# character mismatch between germline & input (captures both cases like A vs. T, and W vs. T)
mutations = ( (c_germlineSeq != c_inputSeq) &
(c_germlineSeq %in% NUCLEOTIDES_AMBIGUOUS[1:14]) &
(c_inputSeq %in% NUCLEOTIDES_AMBIGUOUS[1:14]) )
#print(sum(mutations))
if (sum(mutations) > 0) {
# The nucleotide positions of the mutations
mutations_pos <- which(mutations==TRUE)
# For every mutations_pos, extract the entire codon from germline
mutations_pos_codons <- array(sapply(mutations_pos, getCodonPos))
c_germlineSeq_codons <- c_germlineSeq[mutations_pos_codons]
# For every mutations_pos, extract the codon from input (without other mutations
# at the same codon, if any).
c_inputSeq_codons <- array(sapply(mutations_pos, function(x) {
seqP <- c_germlineSeq[getCodonPos(x)]
seqP[getContextInCodon(x)] <- c_inputSeq[x]
return(seqP) }))
# split the string of codons into vector of codons
# [[:alnum:]]{3} will fail to capture non-ATGC (such as "-CC")
# to include a literal -, place it first or last
c_germlineSeq_codons <- strsplit(gsub("([A-Z\\.-]{3})", "\\1 ", c2s(c_germlineSeq_codons)), " ")[[1]]
c_inputSeq_codons <- strsplit(gsub("([A-Z\\.-]{3})", "\\1 ", c2s(c_inputSeq_codons)), " ")[[1]]
# Determine whether the mutations are R or S
# a table where rows are r/s/stop/na, cols are codon positions
# Count ambiguous characters as "eithe-or" or "and" based on user setting
# Makes use of the fact that c_germlineSeq_codons and c_inputSeqCodons have
# the same length
mutations_array_raw <- sapply(1:length(c_germlineSeq_codons),
function(i){
mutationType(codonFrom=c_germlineSeq_codons[i],
codonTo=c_inputSeq_codons[i],
ambiguousMode=ambiguousMode,
aminoAcidClasses)
})
# check dimension before assigning nucleotide positions to colnames
stopifnot(ncol(mutations_array_raw)==length(mutations_pos))
colnames(mutations_array_raw) <- mutations_pos
# keep only columns in which there are R or S mutations; and keep only R and S rows
# use drop=FALSE so that matrix won't be collapsed into a vector if there is only 1 TRUE in keep.idx
keep.idx <- apply(mutations_array_raw, 2, function(x) { any(x[c("r", "s")]>0) } )
keep.pos <- colnames(mutations_array_raw)[keep.idx]
mutations_array_raw <- mutations_array_raw[c("r", "s"), keep.idx, drop=FALSE]
colnames(mutations_array_raw) <- keep.pos
# if none of columns have R or S > 1, dim will be 2x0
if ( ncol(mutations_array_raw)==0 ) {
# NA if mutations_array_raw contains all NAs and they have all been removed
mutations_array_raw <- NA
mutations_array <- setNames(object=rep(NA, length(regionDefinition@labels)),
nm=regionDefinition@labels)
} else {
# count each mutation type by region
mutations_array <- binMutationsByRegion(mutations_array_raw, regionDefinition)
}
}
}
# frequency=TRUE overrides returnRaw=FALSE/TRUE
if (frequency) {
# avoid is.na(mutations_array_raw) to avoid warning in case mutations_array_raw is a vector
if (length(mutations_array_raw) == sum(is.na(mutations_array_raw))) {
return(mutations_array)
} else {
# Freq = numb of mutations / numb of non N bases (in both seq and gl)
denoms <- countNonNByRegion(regDef=regionDefinition, ambiMode=ambiguousMode,
inputChars=c_inputSeq, germChars=c_germlineSeq,
inputCodons=c_inputSeq_codons,
germCodons=c_germlineSeq_codons,
mutPos=mutations_pos)
mutations_array <- mutations_array/rep(denoms, each=2)
return(mutations_array)
}
}
# return positions of point mutations and their mutation types ("raw")
if (returnRaw){
if (length(mutations_array_raw) == sum(is.na(mutations_array_raw))) {
# if mutations_array_raw is NA, or
# if mutations_array_raw is empty due to all mutations being "stop" and hence removed
# avoid is.na(mutations_array_raw) to avoid warning in case mutations_array_raw is a vector
if (!tooShort) {
# when input and germline are >=3 nucleotides but there's no mutation
# c_inputSeq_codons, c_germlineSeq_codons, and mutations_pos won't exist
# this won't be a problem if ambiguousMode="eitherOr", but would for "and"
# set inputCodons, germCodons, and mutPos to NULL to work around that
nonN.denoms <- countNonNByRegion(regDef=regionDefinition, ambiMode=ambiguousMode,
inputChars=c_inputSeq, germChars=c_germlineSeq,
inputCodons=NULL,
germCodons=NULL,
mutPos=NULL)
} else {
nonN.denoms <- setNames(object=rep(NA, length(regionDefinition@regions)),
nm=regionDefinition@regions)
}
return(list(pos=mutations_array_raw, nonN=nonN.denoms))
} else {
nonN.denoms <- countNonNByRegion(regDef=regionDefinition, ambiMode=ambiguousMode,
inputChars=c_inputSeq, germChars=c_germlineSeq,
inputCodons=c_inputSeq_codons,
germCodons=c_germlineSeq_codons,
mutPos=mutations_pos)
# df indicating position, mutation type (R or S), and region of each mutation
rawDf <- data.frame(as.numeric(colnames(mutations_array_raw)))
rawDf <- cbind(rawDf,
mutations_array_raw["r", ],
mutations_array_raw["s", ],
as.character(regionDefinition@boundaries[as.numeric(colnames(mutations_array_raw))]),
stringsAsFactors=F)
colnames(rawDf) <- c("position", "r", "s", "region")
return(list(pos=rawDf, nonN=nonN.denoms))
}
} else {
# return counts of each mutation type
return(mutations_array)
}
}
# Aggregate mutations by region
#
# \code{binMutationsByRegion} takes an array of observed mutations (e.g. from
# \code{calcObservedMutations}) and bins them by the different regions defined in the
# \code{regionDefinition}.
#
# @param mutationsArray \code{array} containing the number of R and S mutations
# at the nucleotide positions where there are mutations.
# @param regionDefinition \link{RegionDefinition} object defining the regions
# and boundaries of the Ig sequences.
#
# @return An \code{array} of R/S mutations binned across all the unique regions defined
# by \code{regionDefinition}.
#
# @details
# Note, only the part of sequences defined in \code{regionDefinition} are analyzed.
# For example, if the default \link{IMGT_V} definition is used, then mutations
# in positions beyond 312 will be ignored.
#
# @seealso
# See \link{observedMutations} for identifying and counting the
# number of observed mutations.
# This function is also used in \link{calcObservedMutations}.
#
# @examples
# # Generate a random mutation array
# numbOfMutPos <- sample(3:10, 1)
# posOfMutations <- sort(sample(330, numbOfMutPos))
# mutations_array <- matrix(0, nrow=2, ncol=numbOfMutPos, dimnames=list(c("R", "S"), posOfMutations))
# mutations_array["r", ] = sample(x=0:10, size=numbOfMutPos, replace=TRUE)
# mutations_array["s", ] = sample(x=0:10, size=numbOfMutPos, replace=TRUE)
# # Random mutations
# binMutationsByRegion(mutations_array, regionDefinition=NULL)
# binMutationsByRegion(mutations_array, regionDefinition=IMGT_V)
binMutationsByRegion <- function(mutationsArray,
regionDefinition=NULL) {
# Check region definition
if (!is.null(regionDefinition) & !is(regionDefinition, "RegionDefinition")) {
stop(deparse(substitute(regionDefinition)), " is not a valid RegionDefinition object")
}
# Create full sequence RegionDefinition object
# The seqLength will be the largest index of a mutation
if (is.null(regionDefinition)) {
regionDefinition <- makeNullRegionDefinition(max(as.numeric(colnames(mutationsArray))))
}
# get 2 vectors, 1 for R, 1 for S, along length of 1:regionDefinition@seqLength
# each vector records the number of R/S at each position
mutatedPositions <- as.numeric(colnames(mutationsArray))
mutations_R <- array(NA, dim=regionDefinition@seqLength)
mutations_S <- array(NA, dim=regionDefinition@seqLength)
mutations_R[mutatedPositions] <- mutationsArray["r", ]
mutations_S[mutatedPositions] <- mutationsArray["s", ]
mutations_R <- mutations_R[1:regionDefinition@seqLength]
mutations_S <- mutations_S[1:regionDefinition@seqLength]
# count number of R/S in each region
mutations_region_counts <- rep(0, length(regionDefinition@labels))
names(mutations_region_counts) <- regionDefinition@labels
for (reg in regionDefinition@regions) {
mutations_region_counts[paste0(reg, "_r")] <- sum(mutations_R[regionDefinition@boundaries==reg], na.rm=T)
mutations_region_counts[paste0(reg, "_s")] <- sum(mutations_S[regionDefinition@boundaries==reg], na.rm=T)
}
return(mutations_region_counts)
}
# Count the number of non-N, non-dash, and non-dot positions
#
# @param regDef regionDefinition
# @param ambiMode ambiguousMode
# @param inputChars c_inputSeq
# @param germChars c_germlineSeq
# @param inputCodons c_inputSeq_codons
# @param germCodons c_germlineSeq_codons
# @param mutPos mutations_pos
#
# @return The number of non-N, non-dash, and non-dot characters. Calculation method
# differs depending on ambiMode being "eitherOr" or "and". By design, when
# there is no ambiguous character in the input or germline, the result should be
# the same regardless of ambiMode.
#
# @details This is a helper function for calcObservedMutations() and is not intended to
# be called directly. All input arguments are, by design, expected to be
# generated as intermediate products during a call to calcObservedMutations().
#
countNonNByRegion <- function(regDef, ambiMode, inputChars, germChars,
inputCodons, germCodons, mutPos) {
regionNames <- unique(sapply(regDef@labels,
function(x) { substr(x, 1, stri_length(x)-2) }))
if (ambiMode=="eitherOr") {
# Subset boundaries to only non-N & non-dash & non-dot bases (in both seq and gl)
# "which" in next line is ESSENTIAL; otherwise @boundaries won't be truncated
# e.g. (1:6)[c(T,T,T)] returns 1:6, not 1:3
boundaries <- regDef@boundaries[
which(inputChars %in% NUCLEOTIDES_AMBIGUOUS[1:14] &
germChars %in% NUCLEOTIDES_AMBIGUOUS[1:14])]
# number of non-N & non-dash & non-dot bases (in both seq and gl)
nonN <- sapply(regionNames, function(x) { sum(boundaries==x) })
} else if (ambiMode=="and") {
### positions where there's no mutation:
# simply count the positions where both input and germline are
# non-N, non-dash, and non-dot
boundaries.1 <- regDef@boundaries[
which(inputChars %in% NUCLEOTIDES_AMBIGUOUS[1:14] &
germChars %in% NUCLEOTIDES_AMBIGUOUS[1:14] &
(germChars == inputChars))]
nonN.1 <- sapply(regionNames, function(x) { sum(boundaries.1==x) })
### positions where there's mutation:
if ( (!is.null(inputCodons)) & (!is.null(germCodons)) & (!is.null(mutPos)) ) {
# expand codon with ambiguous character(s) into codons with unambiguous characters
# calculate the number of possible combinations between input and germline codons
# this makes use of the important fact that each mutation is considered
# independently in the germline context
inputNumExpanded <- sapply(inputCodons,
function(codon){
length(EXPANDED_AMBIGUOUS_CODONS[[codon]])
})
germlineNumExpanded <- sapply(germCodons,
function(codon){
length(EXPANDED_AMBIGUOUS_CODONS[[codon]])
})
totalNumExpanded <- inputNumExpanded * germlineNumExpanded
# use mutations_pos to capture positions at which r/s is absent (stop or na instead)
# such positions would have been omitted from mutations_array_raw or mutations_array
boundaries.2 <- regDef@boundaries[mutPos]
# makes use of the fact that inputCodons, germCodons, and
# mutPos align exactly
nonN.2 <- sapply(regionNames, function(x){ sum(totalNumExpanded[boundaries.2==x]) })
} else {
nonN.2 <- setNames(object=rep(0, length(regionNames)), nm=regionNames)
}
nonN <- nonN.1 + nonN.2
}
return(nonN)
}
#### Sliding window approach ####
#' Sliding window approach towards filtering a single sequence
#'
#' \code{slideWindowSeq} determines whether an input sequence contains equal to or more than
#' a given number of mutations in a given length of consecutive nucleotides (a "window")
#' when compared to a germline sequence.
#'
#' @param inputSeq input sequence.
#' @param germlineSeq germline sequence.
#' @param mutThresh threshold on the number of mutations in \code{windowSize}
#' consecutive nucleotides. Must be between 1 and \code{windowSize}
#' inclusive.
#' @param windowSize length of consecutive nucleotides. Must be at least 2.
#'
#' @return \code{TRUE} if there are equal to or more than \code{mutThresh} number of mutations
#' in any window of \code{windowSize} consecutive nucleotides (i.e. the sequence should
#' be filtered); \code{FALSE} if otherwise.
#'
#' @seealso \link{calcObservedMutations} is called by \code{slideWindowSeq} to identify observed
#' mutations. See \link{slideWindowDb} for applying the sliding window approach on a
#' \code{data.frame}. See \link{slideWindowTune} for parameter tuning for \code{mutThresh}
#' and \code{windowSize}.
#'
#' @examples
#' # Use an entry in the example data for input and germline sequence
#' data(ExampleDb, package="alakazam")
#' in_seq <- ExampleDb[["sequence_alignment"]][100]
#' germ_seq <- ExampleDb[["germline_alignment_d_mask"]][100]
#'
#' # Determine if in_seq has 6 or more mutations in 10 consecutive nucleotides
#' slideWindowSeq(inputSeq=in_seq, germlineSeq=germ_seq, mutThresh=6, windowSize=10)
#' slideWindowSeq(inputSeq="TCGTCGAAAA", germlineSeq="AAAAAAAAAA", mutThresh=6, windowSize=10)
#' @export
slideWindowSeq <- function(inputSeq, germlineSeq, mutThresh, windowSize){
# identify all R and S mutations in input sequence
inputMut <- calcObservedMutations(inputSeq=inputSeq, germlineSeq=germlineSeq, returnRaw=T)
# call helper
return(slideWindowSeqHelper(mutPos=inputMut$pos, mutThresh=mutThresh, windowSize=windowSize))
}
# NOTE: DO NOT MERGE slideWindowSeqHelper with slideWindowSeq (very different input formats)
# slideWindowTune needs to call slideWindowSeqHelper directly for efficiency
# Helper for sliding window approach towards filtering sequences
#
# @param mutPos a \code{data.frame} containing positions and types of point
# mutations as returned in \code{$pos} by
# \code{calcObserverdMutations()} with \code{returnRaw=TRUE}.
# Can be \code{NA}, in which case the returned value will be
# \code{FALSE}.
# @param mutThresh threshold on the number of mutations in \code{windowSize}
# consecutive nucleotides. Must be between 1 and \code{windowSize}
# inclusive.
# @param windowSize length of consecutive nucleotides. Must be at least 2.
#
# @return \code{TRUE} if there are equal to or more than \code{mutThresh} number of mutations
# in any window of \code{windowSize} consecutive nucleotides; \code{FALSE} if otherwise.
slideWindowSeqHelper <- function(mutPos, mutThresh, windowSize){
# check preconditions
stopifnot(mutThresh >= 1 & mutThresh <= windowSize & windowSize>=2)
if (length(mutPos) == 1 && is.na(mutPos)) {
# use && instead of & to short-circuit in case length(mutPos)!=1 (otherwise warning)
return(FALSE)
} else {
# general idea:
# only need to check windows containing mutations (as opposed to every possible window)
for (i in 1:nrow(mutPos)){
# get window limits
lower <- mutPos$position[i]
upper <- lower + windowSize - 1
# how many mutations fall within current window
windowCount <- sum(mutPos[mutPos$position>=lower & mutPos$position<=upper, c("r","s")])
# return as soon as a window has >= mutThresh mutations
if (windowCount >= mutThresh) { return(TRUE) }
}
return(FALSE)
}
}
#' Sliding window approach towards filtering sequences in a \code{data.frame}
#'
#' \code{slideWindowDb} determines whether each input sequence in a \code{data.frame}
#' contains equal to or more than a given number of mutations in a given length of
#' consecutive nucleotides (a "window") when compared to their respective germline
#' sequence.
#'
#' @param db \code{data.frame} containing sequence data.
#' @param sequenceColumn name of the column containing IMGT-gapped sample sequences.
#' @param germlineColumn name of the column containing IMGT-gapped germline sequences.
#' @param mutThresh threshold on the number of mutations in \code{windowSize}
#' consecutive nucleotides. Must be between 1 and \code{windowSize}
#' inclusive.
#' @param windowSize length of consecutive nucleotides. Must be at least 2.
#' @param nproc Number of cores to distribute the operation over. If the
#' \code{cluster} has already been set earlier, then pass the
#' \code{cluster}. This will ensure that it is not reset.
#'
#' @return a logical vector. The length of the vector matches the number of input sequences in
#' \code{db}. Each entry in the vector indicates whether the corresponding input sequence
#' should be filtered based on the given parameters.
#'
#' @seealso See \link{slideWindowSeq} for applying the sliding window approach on a single sequence.
#' See \link{slideWindowTune} for parameter tuning for \code{mutThresh} and \code{windowSize}.
#'
#' @examples
#' # Use an entry in the example data for input and germline sequence
#' data(ExampleDb, package="alakazam")
#'
#' # Apply the sliding window approach on a subset of ExampleDb
#' slideWindowDb(db=ExampleDb[1:10, ], sequenceColumn="sequence_alignment",
#' germlineColumn="germline_alignment_d_mask",
#' mutThresh=6, windowSize=10, nproc=1)
#'
#' @export
slideWindowDb <- function(db, sequenceColumn="sequence_alignment",
germlineColumn="germline_alignment_d_mask",
mutThresh=6, windowSize=10, nproc=1){
# Hack for visibility of foreach index variables
i <- NULL
# Check input
check <- checkColumns(db, c(sequenceColumn, germlineColumn))
if (check != TRUE) { stop(check) }
db <- db[,c(sequenceColumn, germlineColumn)]
# If the user has previously set the cluster and does not wish to reset it
if(!is.numeric(nproc)){
stop_cluster <- FALSE
cluster <- nproc
nproc <- 0
} else {
stop_cluster <- TRUE
}
# Ensure that the nproc does not exceed the number of cores/CPUs available
nproc <- min(nproc, cpuCount())
# If user wants to paralellize this function and specifies nproc > 1, then
# initialize and register slave R processes/clusters &
# export all nesseary environment variables, functions and packages.
if (nproc == 1) {
# If needed to run on a single core/cpu then, register DoSEQ
# (needed for 'foreach' in non-parallel mode)
registerDoSEQ()
} else {
cluster_type <- "FORK"
if (.Platform$OS.type == "windows") {
cluster_type <- "PSOCK"
}
if (nproc != 0) {
#cluster <- makeCluster(nproc, type="SOCK")
cluster <- parallel::makeCluster(nproc, type= cluster_type)
}
parallel::clusterExport(cluster,
list('db', 'sequenceColumn', 'germlineColumn',
'mutThresh', 'windowSize','slideWindowSeq'),
envir=environment() )
registerDoParallel(cluster,cores=nproc)
}
filter <- unlist(foreach(i=1:nrow(db),
.verbose=FALSE, .errorhandling='stop') %dopar% {
slideWindowSeq(inputSeq = db[i, sequenceColumn],
germlineSeq = db[i, germlineColumn],
mutThresh = mutThresh,
windowSize = windowSize)
})
if (stop_cluster & !is.numeric(nproc)) {
parallel::stopCluster(cluster)
}
filter
}
#' Parameter tuning for sliding window approach
#'
#' Apply \link{slideWindowDb} over a search grid made of combinations of \code{mutThresh} and
#' \code{windowSize} to help with picking a pair of values for these parameters. Parameter
#' tuning can be performed by choosing a combination that gives a reasonable number of
#' filtered/remaining sequences.
#'
#' @param db \code{data.frame} containing sequence data.
#' @param sequenceColumn name of the column containing IMGT-gapped sample sequences.
#' @param germlineColumn name of the column containing IMGT-gapped germline sequences.
#' @param dbMutList if supplied, this should be a list consisting of \code{data.frame}s
#' returned as \code{$pos} in the nested list produced by
#' \link{calcObservedMutations} with \code{returnRaw=TRUE}; otherwise,
#' \link{calcObservedMutations} is called on columns \code{sequenceColumn}
#' and \code{germlineColumn} of \code{db}. Default is \code{NULL}.
#' @param mutThreshRange range of threshold on the number of mutations in \code{windowSize}
#' consecutive nucleotides to try. Must be between 1 and
#' maximum \code{windowSizeRange} inclusive.
#' @param windowSizeRange range of length of consecutive nucleotides to try. The lower end
#' must be at least 2.
#' @param verbose whether to print out messages indicating current progress. Default
#' is \code{TRUE}.
#' @param nproc Number of cores to distribute the operation over. If the
#' \code{cluster} has already been set earlier, then pass the
#' \code{cluster}. This will ensure that it is not reset.
#' @return a list of logical matrices. Each matrix corresponds to a \code{windowSize} in
#' \code{windowSizeRange}. Each column in a matrix corresponds to a \code{mutThresh} in
#' \code{mutThreshRange}. Each row corresponds to a sequence. \code{TRUE} values
#' mean the sequences has at least the number of mutations specified in the column name,
#' for that \code{windowSize}.
#'
#' @details If, in a given combination of \code{mutThresh} and \code{windowSize}, \code{mutThresh}
#' is greater than \code{windowSize}, \code{NA}s will be returned for that particular
#' combination. A message indicating that the combination has been "skipped" will be
#' printed if \code{verbose=TRUE}.
#'
#' If \link{calcObservedMutations} was previously run on \code{db} and saved, supplying
#' \code{$pos} from the saved result as \code{dbMutList} could save time by skipping a
#' second call of \link{calcObservedMutations}. This could be helpful especially when
#' \code{db} is large.
#'
#' @seealso \link{slideWindowDb} is called on \code{db} for tuning. See \link{slideWindowTunePlot}
#' for visualization. See \link{calcObservedMutations} for generating \code{dbMutList}.
#'
#' @examples
#' # Load and subset example data
#' data(ExampleDb, package="alakazam")
#' db <- ExampleDb[1:5, ]
#'
#' # Try out thresholds of 2-4 mutations in window sizes of 7-9 nucleotides.
#' # In this case, all combinations are legal.
#' slideWindowTune(db, mutThreshRange=2:4, windowSizeRange=7:9)
#'
#' # Illegal combinations are skipped, returning NAs.
#' slideWindowTune(db, mutThreshRange=2:4, windowSizeRange=2:4,
#' verbose=FALSE)
#'
#' # Run calcObservedMutations separately
#' exDbMutList <- sapply(1:5, function(i) {
#' calcObservedMutations(inputSeq=db[["sequence_alignment"]][i],
#' germlineSeq=db[["germline_alignment_d_mask"]][i],
#' returnRaw=TRUE)$pos })
#' slideWindowTune(db, dbMutList=exDbMutList,
#' mutThreshRange=2:4, windowSizeRange=2:4)
#' @export
slideWindowTune <- function(db, sequenceColumn="sequence_alignment",
germlineColumn="germline_alignment_d_mask",
dbMutList=NULL,
mutThreshRange, windowSizeRange, verbose=TRUE,
nproc=1){
# Hack for visibility of foreach index variables
i <- NULL
# check preconditions
stopifnot(!is.null(db))
stopifnot(min(mutThreshRange) >= 1 &
max(mutThreshRange) <= max(windowSizeRange) &
min(windowSizeRange) >= 2)
db <- db[,c(sequenceColumn, germlineColumn)]
# If the user has previously set the cluster and does not wish to reset it
if(!is.numeric(nproc)){
stop_cluster <- FALSE
cluster <- nproc
nproc <- 0
} else {
stop_cluster <- TRUE
}
# Ensure that the nproc does not exceed the number of cores/CPUs available
nproc <- min(nproc, cpuCount())
# If user wants to paralellize this function and specifies nproc > 1, then
# initialize and register slave R processes/clusters &
# export all nesseary environment variables, functions and packages.
if (nproc == 1) {
# If needed to run on a single core/cpu then, regsiter DoSEQ
# (needed for 'foreach' in non-parallel mode)
registerDoSEQ()
} else {
cluster_type <- "FORK"
if (.Platform$OS.type == "windows") {
cluster_type <- "PSOCK"
}
if (nproc != 0) {
#cluster <- makeCluster(nproc, type="SOCK")
cluster <- parallel::makeCluster(nproc, type= cluster_type)
}
parallel::clusterExport(cluster,
list('db', 'sequenceColumn', 'germlineColumn',
'calcObservedMutations','slideWindowSeqHelper'),
envir=environment() )
registerDoParallel(cluster,cores=nproc)
}
# get positions of R/S mutations for sequences in db
# do this here and then call slideWindowSeqHelper (so it's done only once)
# instead of calling slideWindowDb which does this every time it is called
if (is.null(dbMutList)) {
if (verbose) {cat(paste0("Identifying mutated positions\n"))}
pb <- txtProgressBar(0, nrow(db), style = 3 )
inputMutList <- foreach(i=1:nrow(db),
.verbose=FALSE, .errorhandling='stop') %dopar% {
setTxtProgressBar(pb, i)
calcObservedMutations(inputSeq=db[i, sequenceColumn],
germlineSeq=db[i, germlineColumn],
returnRaw=T)$pos
}
} else {
if (verbose) {cat("dbMutList supplied; skipped calling calcObservedMutations()\n")}
inputMutList <- dbMutList
}
# if (nproc != 1) {
# parallel::clusterExport(cluster,
# list('dbMutList'),
# envir=environment() )
# }
# Get window-threshold combinations
combs <- expand.grid(windowSizeRange, mutThreshRange)
pb2 <- txtProgressBar(0, nrow(combs), style = 3 )
if (verbose) {cat(paste0("\nAnalyzing combinations of windowSizeRange and mutThreshRange\n"))}
tmp <- foreach(i=1:nrow(combs),
.verbose=FALSE, .combine=rbind, .errorhandling='stop') %dopar% {
setTxtProgressBar(pb2, i)
size <- combs[i,1]
thresh <- combs[i,2]
if (thresh <= size){
# apply slideWindow using current pair of parameters
cur.logical <- unlist(lapply(inputMutList,
slideWindowSeqHelper,
mutThresh = thresh, windowSize = size))
} else {
if (verbose) {cat(paste0(">>> mutThresh = ", thresh, " > windowSize = ",
size, " (skipped)\n"))}
# NA if skipped
cur.logical <- rep(NA, nrow(db))
}
data.frame(list(
"windowSize"=size,
"mutThreshold"=thresh,
"cur_logical"=cur.logical,
"row_idx"=1:length(cur.logical)
))
}
cur.list <- lapply(split(tmp, f=tmp[['windowSize']]), function(x) {
x <- x[,colnames(x) != "windowSize"]
pivot_wider(x, names_from=!!rlang::sym("mutThreshold"),
values_from=!!rlang::sym("cur_logical"),
id_cols=!!rlang::sym("row_idx")) %>%
arrange(!!rlang::sym("row_idx")) %>%
select(-!!rlang::sym("row_idx")) %>%
as.matrix()
})
if (stop_cluster & !is.numeric(nproc)) {
parallel::stopCluster(cluster)
}
return(cur.list)
}
#' Visualize parameter tuning for sliding window approach
#'
#' Visualize results from \link{slideWindowTune}
#'
#' @param tuneList a list of logical matrices returned by \link{slideWindowTune}.
#' @param plotFiltered whether to plot the number of filtered ('filtered'),
#' or remaining ('remaining') sequences for each mutation threshold.
#' Use 'per_mutation' to plot the number of sequences at each mutation
#' value. Default is \code{'filtered'}.
#' @param percentage whether to plot on the y-axis the percentage of filtered sequences
#' (as opposed to the absolute number). Default is \code{FALSE}.
#' @param jitter.x whether to jitter x-axis values. Default is \code{FALSE}.
#' @param jitter.x.amt amount of jittering to be applied on x-axis values if
#' \code{jitter.x=TRUE}. Default is 0.1.
#' @param jitter.y whether to jitter y-axis values. Default is \code{FALSE}.
#' @param jitter.y.amt amount of jittering to be applied on y-axis values if
#' \code{jitter.y=TRUE}. Default is 0.1.
#' @param pchs point types to pass on to \link{plot}. Default is
#' \code{1:length(tuneList)}.
#' @param ltys line types to pass on to \link{plot}. Default is
#' \code{1:length(tuneList)}.
#' @param cols colors to pass on to \link{plot}.
#' @param plotLegend whether to plot legend. Default is \code{TRUE}.
#' @param legendPos position of legend to pass on to \link{legend}. Can be either a
#' numeric vector specifying x-y coordinates, or one of
#' \code{"topright"}, \code{"center"}, etc. Default is \code{"topright"}.
#' @param legendHoriz whether to make legend horizontal. Default is \code{FALSE}.
#' @param legendCex numeric values by which legend should be magnified relative to 1.
#' @param title plot main title. Default is NULL (no title)
#' @param returnRaw Return a data.frame with sequence counts (TRUE) or a
#' plot. Default is \code{FALSE}.
#'
#' @details For each \code{windowSize}, if \code{plotFiltered='filtered'}, the x-axis
#' represents a mutation threshold range, and the y-axis the number of
#' sequences that have at least that number of mutations. If
#' \code{plotFiltered='remaining'}, the y-axis represents the number of sequences
#' that have less mutations than the mutation threshold range. For the same
#' window size, a sequence can be included in the counts for different
#' mutation thresholds. For example, sequence "CCACCAAAA" with germline
#' "AAAAAAAAA" has 4 mutations. This sequence has at least 2 mutations
#' and at least 3 mutations, in a window of size 4. the sequence will
#' be included in the sequence count for mutation thresholds 2 and 3.
#' If \code{plotFiltered='per_mutation'}, the sequences are counted only once
#' for each window size, at their largest mutation threshold. The above
#' example sequence would be included in the sequence count for
#' mutation threshold 3.
#'
#' When plotting, a user-defined \code{amount} of jittering can be applied on values plotted
#' on either axis or both axes via adjusting \code{jitter.x}, \code{jitter.y},
#' \code{jitter.x.amt} and \code{jitter.y.amt}. This may be help with visually distinguishing
#' lines for different window sizes in case they are very close or identical to each other.
#' If plotting percentages (\code{percentage=TRUE}) and using jittering on the y-axis values
#' (\code{jitter.y=TRUE}), it is strongly recommended that \code{jitter.y.amt} be set very
#' small (e.g. 0.01).
#'
#' \code{NA} for a combination of \code{mutThresh} and \code{windowSize} where
#' \code{mutThresh} is greater than \code{windowSize} will not be plotted.
#'
#' @seealso See \link{slideWindowTune} for how to get \code{tuneList}. See \link{jitter} for
#' use of \code{amount} of jittering.
#'
#' @examples
#' # Use an entry in the example data for input and germline sequence
#' data(ExampleDb, package="alakazam")
#'
#' # Try out thresholds of 2-4 mutations in window sizes of 3-5 nucleotides
#' # on a subset of ExampleDb
#' tuneList <- slideWindowTune(db = ExampleDb[1:10, ],
#' mutThreshRange = 2:4, windowSizeRange = 3:5,
#' verbose = FALSE)
#'
#' # Visualize
#' # Plot numbers of sequences filtered without jittering y-axis values
#' plotSlideWindowTune(tuneList, pchs=1:3, ltys=1:3, cols=1:3,
#' plotFiltered='filtered', jitter.y=FALSE)
#'
#' # Notice that some of the lines overlap
#' # Jittering could help
#' plotSlideWindowTune(tuneList, pchs=1:3, ltys=1:3, cols=1:3,
#' plotFiltered='filtered', jitter.y=TRUE)
#'
#' # Plot numbers of sequences remaining instead of filtered
#' plotSlideWindowTune(tuneList, pchs=1:3, ltys=1:3, cols=1:3,
#' plotFiltered='remaining', jitter.y=TRUE,
#' legendPos="bottomright")
#'
#' # Plot percentages of sequences filtered with a tiny amount of jittering
#' plotSlideWindowTune(tuneList, pchs=1:3, ltys=1:3, cols=1:3,
#' plotFiltered='filtered', percentage=TRUE,
#' jitter.y=TRUE, jitter.y.amt=0.01)
#' @export
plotSlideWindowTune <- function(tuneList,
plotFiltered = c('filtered','remaining','per_mutation'),
percentage = FALSE,
jitter.x = FALSE, jitter.x.amt = 0.1,
jitter.y = FALSE, jitter.y.amt = 0.1,
pchs = 1:length(tuneList), ltys = 1:length(tuneList), cols = 1,
plotLegend = TRUE, legendPos = "topright",
legendHoriz = FALSE, legendCex = 1, title=NULL,
returnRaw=FALSE){
# collapse parameter if no user input then first item is selected
plotFiltered <- match.arg(plotFiltered)
if (plotFiltered == 'filtered') {
xlab <- "Threshold on number of mutations"
ylab.part.2 <- "filtered"
} else if (plotFiltered == 'remaining') {
# invert (!) tuneList if plotting retained sequences
tuneList <- lapply(tuneList, function(x){!x})
xlab <- "Threshold on number of mutations"
ylab.part.2 <- "remaining"
} else if (plotFiltered == 'per_mutation') {
xlab <- "Maximum number of mutations"
ylab.part.2 <- 'per_mutation'
} else {
warning("plotFiltered must be in [filtered, remaining, per_mutation].\n ",
plotFiltered, " received instead. \n")
}
# if number of pchs/ltys/cols provided does not match number of lines expected
# expand into vector with repeating values (otherwise legend would break)
if (length(pchs)!=length(tuneList)) {pchs <- rep(pchs, length.out=length(tuneList))}
if (length(ltys)!=length(tuneList)) {ltys <- rep(ltys, length.out=length(tuneList))}
if (length(cols)!=length(tuneList)) {cols <- rep(cols, length.out=length(tuneList))}
# tabulate tuneList (and if applicable convert to percentage)
if (plotFiltered == 'per_mutation') {
# preprocess tuneList to count each sequence once,
# considering the largest number of mutations in the window
plotList.tmp <- lapply(tuneList, function(window_df) {
# For each sequence
bind_rows(lapply(1:nrow(window_df), function(i) {
x <- window_df[i,]
# Find the mutation thresholds that are T
idx <- which(x)
# If there are none (all F or NA values) or there is only one, do nothig
# If there are more than one, keep the largest index and set the previous values to F
if (length(idx) > 1) {
idx <- max(idx)
x[1:(idx-1)] <- F
}
x
}))
})
tuneList <- plotList.tmp
}
plotList <- lapply(tuneList, colSums)
if (percentage) {plotList <- lapply(plotList, function(x){x/nrow(tuneList[[1]])})}
if (returnRaw) {
return (bind_rows(plotList, .id = "windowSize"))
}
# get x-axis values (i.e. mutThreshRange; colnames of matrix in tuneList with most columns)
#threshes = as.numeric(colnames(tuneList[[which.max(lapply(lapply(tuneList, colnames), length))]]))
threshes <- as.numeric(colnames(tuneList[[1]]))
# plot for first window size
x1 <- threshes
if (jitter.x) {x1 <- jitter(x1, amount=jitter.x.amt)}
y1 <- plotList[[1]]
if (jitter.y) {y1 <- jitter(y1, amount=jitter.y.amt)}
if (percentage) {
ylab.part.1 <- "Percentage of sequences"
# ylim
ylim.padding <- abs(diff(range(plotList, na.rm=T)))*0.01
ylims <- c(max(0, min(range(plotList, na.rm=T)) - ylim.padding),
min(1, max(range(plotList, na.rm=T)) + ylim.padding) )
} else {
ylab.part.1 <- "Number of sequences"
# ylim: non-negative lower limit; upper limit slight above max tabulated sum
ylims <- c( max(0, min(range(plotList, na.rm=T)) - max(1, jitter.y.amt) ),
max(range(plotList, na.rm=T)) + max(1, jitter.y.amt) )
}
plot(x1, # mutThreshRange on x-axis
y1, # tabulated sums in plotList on y-axis
ylim = ylims,
# xlim: +/- jitter.x.amt*2 to accommodate for amount of jittering on x-axis
xlim = c(min(threshes)-jitter.x.amt*2, max(threshes+jitter.x.amt*2)),
xaxt="n", xlab=xlab,
ylab=paste(ylab.part.1, ylab.part.2),
cex.lab=1.5, cex.axis=1.5, type="b", lwd=1.5,
pch=pchs[1], lty=ltys[1], col=cols[1])
axis(side=1, at=threshes, cex.axis=1.5)
# add title
if (!is.null(title)) {
title(main=title)
}
# plot for the rest of the window sizes
for (i in 1:length(plotList)){
if (i>=2) {
xi <- threshes
if (jitter.x) {xi <- jitter(xi, amount=jitter.x.amt)}
yi <- plotList[[i]]
if (jitter.y) {yi <- jitter(yi, amount=jitter.y.amt)}
points(xi, yi, type='b', lwd=1.5,
pch=pchs[i], lty=ltys[i], col=cols[i])
}
}
# add legend
if (plotLegend) {
# if legendPos specified as xy coordinates
if (is.numeric(legendPos) & length(legendPos)==2) {
legend(x=legendPos[1], y=legendPos[2],
legend = c("Window Size", names(tuneList)),
horiz = legendHoriz, cex = legendCex,
pch=c(NA, pchs), lty=c(NA, ltys), col=c(NA, cols))
} else {
# if legendPos specified as "center", "topright", etc.
legend(legendPos,
legend = c("Window Size", names(tuneList)),
horiz = legendHoriz, cex = legendCex,
pch=c(NA, pchs), lty=c(NA, ltys), col=c(NA, cols))
}
}
}
#### Expected frequencies calculating functions ####
#' Calculate expected mutation frequencies
#'
#' \code{expectedMutations} calculates the expected mutation frequencies for each
#' sequence in the input \code{data.frame}.
#'
#' @param db \code{data.frame} containing sequence data.
#' @param sequenceColumn \code{character} name of the column containing input
#' sequences.
#' @param germlineColumn \code{character} name of the column containing
#' the germline or reference sequence.
#' @param targetingModel \link{TargetingModel} object. Default is \link{HH_S5F}.
#' @param regionDefinition \link{RegionDefinition} object defining the regions
#' and boundaries of the Ig sequences. To use regions definitions,
#' sequences in \code{sequenceColum} and \code{germlineColumn}
#' must be aligned, following the IMGT schema.
#' @param mutationDefinition \link{MutationDefinition} object defining replacement
#' and silent mutation criteria. If \code{NULL} then
#' replacement and silent are determined by exact
#' amino acid identity.
#' @param nproc \code{numeric} number of cores to distribute the operation
#' over. If the cluster has already been set the call function with
#' \code{nproc} = 0 to not reset or reinitialize. Default is
#' \code{nproc} = 1.
#' @param cloneColumn clone id column name in \code{db}
#' @param juncLengthColumn junction length column name in \code{db}
#'
#' @return A modified \code{db} \code{data.frame} with expected mutation frequencies
#' for each region defined in \code{regionDefinition}.
#'
#' The columns names are dynamically created based on the regions in
#' \code{regionDefinition}. For example, when using the \link{IMGT_V}
#' definition, which defines positions for CDR and FWR, the following columns are
#' added:
#' \itemize{
#' \item \code{mu_expected_cdr_r}: number of replacement mutations in CDR1 and
#' CDR2 of the V-segment.
#' \item \code{mu_expected_cdr_s}: number of silent mutations in CDR1 and CDR2
#' of the V-segment.
#' \item \code{mu_expected_fwr_r}: number of replacement mutations in FWR1,
#' FWR2 and FWR3 of the V-segment.
#' \item \code{mu_expected_fwr_s}: number of silent mutations in FWR1, FWR2 and
#' FWR3 of the V-segment.
#' }
#'
#' @details
#' Only the part of the sequences defined in \code{regionDefinition} are analyzed.
#' For example, when using the \link{IMGT_V} definition, mutations in
#' positions beyond 312 will be ignored.
#'
#' @seealso
#' \link{calcExpectedMutations} is called by this function to calculate the expected
#' mutation frequencies. See \link{observedMutations} for getting observed
#' mutation counts. See \link{IMGT_SCHEMES} for a set of predefined
#' \link{RegionDefinition} objects.
#'
#' @examples
#' # Subset example data
#' data(ExampleDb, package="alakazam")
#' db <- subset(ExampleDb, c_call %in% c("IGHA", "IGHG") & sample_id == "+7d")
#' set.seed(112)
#' db <- dplyr::slice_sample(db, n=100)
#' # Calculate expected mutations over V region
#' db_exp <- expectedMutations(db,
#' sequenceColumn="sequence_alignment",
#' germlineColumn="germline_alignment_d_mask",
#' regionDefinition=IMGT_V,
#' nproc=1)
#'
#' # Calculate hydropathy expected mutations over V region
#' db_exp <- expectedMutations(db,
#' sequenceColumn="sequence_alignment",
#' germlineColumn="germline_alignment_d_mask",
#' regionDefinition=IMGT_V,
#' mutationDefinition=HYDROPATHY_MUTATIONS,
#' nproc=1)
#'
#' @export
expectedMutations <- function(db,sequenceColumn = "sequence_alignment",
germlineColumn = "germline_alignment",
targetingModel = HH_S5F,
regionDefinition=NULL, mutationDefinition = NULL,
nproc = 1,
cloneColumn = "clone_id",
juncLengthColumn = "junction_length") {
# Hack for visibility of foreach index variable
idx <- NULL
check <- checkColumns(db, c(sequenceColumn, germlineColumn))
if (check != TRUE) { stop(check) }
regionDefinitionName <- ""
if (!is.null(regionDefinition)) {
regionDefinitionName <- regionDefinition@name
# Message if sequences don't have gaps or Ns (because makeChangeo clone
# masks IMGT gaps) as a proxy to detect not IMGT aligned sequences
if (all(!grepl("[\\.Nn]",db[[sequenceColumn]]))) {
warning("No IMGT gaps detected in ",sequenceColumn,".\nSequences in ",
sequenceColumn," and ", germlineColumn,
" should be aligned, with gaps (.,N or n) following the IMGT numbering scheme.")
}
if (all(!grepl("[\\.Nn]",db[[germlineColumn]]))) {
warning("No IMGT gaps detected in ",germlineColumn,
".\nSequences in ", sequenceColumn," and ", germlineColumn,
" should be aligned, with gaps (., N or n) following the IMGT numbering scheme.")
}
not_na <- !is.na(db[[germlineColumn]])
if (!all.equal(nchar(db[[sequenceColumn]][not_na]), nchar(db[[germlineColumn]][not_na]))) {
warning("Pairs of ", sequenceColumn, " and ", germlineColumn, " sequences with different lengths found.")
stop("Expecting IMGT aligned, same length sequences in ", sequenceColumn, " and ", germlineColumn,".")
}
}
# Check region definition
if (!is.null(regionDefinition) & !is(regionDefinition, "RegionDefinition")) {
stop(deparse(substitute(regionDefinition)), " is not a valid RegionDefinition object")
}
# Check mutation definition
if (!is.null(mutationDefinition) & !is(mutationDefinition, "MutationDefinition")) {
stop(deparse(substitute(mutationDefinition)), " is not a valid MutationDefinition object")
}
# Check if mutation count/freq columns already exist
# and throw overwritting warning
if (!is.null(regionDefinition)) {
labels <- regionDefinition@labels
} else {
labels <- makeNullRegionDefinition()@labels
}
labels <- paste("mu_expected_", labels, sep="")
label_exists <- labels[labels %in% colnames(db)]
if (length(label_exists)>0) {
warning(paste0("Columns ",
paste(label_exists, collapse=", "),
" exist and will be overwritten")
)
db[,label_exists] <- NULL
}
# Check targeting model
if (!is(targetingModel, "TargetingModel")) {
stop(deparse(substitute(targetingModel)), " is not a valid TargetingModel object")
}
db$tmp_expmu_row_id <- 1:nrow(db)
# Convert sequence columns to uppercase
db <- toupperColumns(db, c(sequenceColumn, germlineColumn))
# If the user has previously set the cluster and does not wish to reset it
if(!is.numeric(nproc)){
cluster = nproc
nproc = 0
}
# Ensure that the nproc does not exceed the number of cores/CPUs available
nproc <- min(nproc, cpuCount(), na.rm=T)
# If user wants to paralellize this function and specifies nproc > 1, then
# initialize and register slave R processes/clusters &
# export all nesseary environment variables, functions and packages.
if (nproc > 1) {
cluster <- parallel::makeCluster(nproc, type = "PSOCK")
parallel::clusterExport(cluster, list('db', 'sequenceColumn', 'germlineColumn',
'regionDefinitionName', 'juncLengthColumn', 'setRegionBoundaries',
'regionDefinition','targetingModel',
'calcExpectedMutations','calculateTargeting',
's2c','c2s','NUCLEOTIDES','HH_S5F',
'calculateMutationalPaths','CODON_TABLE','IMGT_V_BY_REGIONS'),
envir=environment() )
registerDoParallel(cluster,cores=nproc)
} else if (nproc == 1) {
# If needed to run on a single core/cpu then, regsiter DoSEQ
# (needed for 'foreach' in non-parallel mode)
registerDoSEQ()
}
# Printing status to console
# cat("Calculating the expected frequencies of mutations...\n")
# Calculate targeting for each sequence (based on the germline)
# Should be a 5 x N matrix where N in the number of nucleotides defined by
# the regionDefinition
numbOfSeqs <- nrow(db)
targeting_list <-
foreach (idx=iterators::icount(numbOfSeqs)) %dopar% {
rd <- regionDefinition
if (regionDefinitionName %in% c("IMGT_VDJ_BY_REGIONS","IMGT_VDJ")) {
rd <- setRegionBoundaries(juncLength = db[[juncLengthColumn]][idx],
sequenceImgt = db[[sequenceColumn]][idx],
regionDefinition=regionDefinition)
}
eM <- calcExpectedMutations(germlineSeq=db[[germlineColumn]][idx],
inputSeq=db[[sequenceColumn]][idx],
targetingModel=targetingModel,
regionDefinition=rd,
mutationDefinition=mutationDefinition)
eM['tmp_expmu_row_id'] <- db[['tmp_expmu_row_id']][idx]
eM
}
# Convert list of expected mutation freq to data.frame
if (is.null(regionDefinition)) {
labels_length <- length(makeNullRegionDefinition()@labels) + 1 # +1 for tmp row_id
} else {
labels_length <- length(regionDefinition@labels) + 1
}
expectedMutationFrequencies <- as.data.frame(do.call(rbind, lapply(targeting_list, function(x) {
length(x) <- labels_length
return(x) })), stringsAsFactors=F)
expectedMutationFrequencies[is.na(expectedMutationFrequencies)] <- 0
col_names <- colnames(expectedMutationFrequencies)
mu_col_names <- col_names != "tmp_expmu_row_id"
colnames(expectedMutationFrequencies)[mu_col_names] <- paste0("mu_expected_", colnames(expectedMutationFrequencies)[mu_col_names])
# Properly shutting down the cluster
if(nproc>1){ parallel::stopCluster(cluster) }
# Bind the observed mutations to db
db_new <- db %>%
ungroup() %>%
left_join(expectedMutationFrequencies, by="tmp_expmu_row_id") %>%
arrange(!!rlang::sym("tmp_expmu_row_id")) %>%
select(-!!rlang::sym("tmp_expmu_row_id"))
return(db_new)
}
#' Calculate expected mutation frequencies of a sequence
#'
#' \code{calcExpectedMutations} calculates the expected mutation
#' frequencies of a given sequence. This is primarily a helper function for
#' \link{expectedMutations}.
#'
#' @param germlineSeq germline (reference) sequence.
#' @param inputSeq input (observed) sequence. If this is not \code{NULL},
#' then \code{germlineSeq} will be processed to be the same
#' same length as \code{inputSeq} and positions in
#' \code{germlineSeq} corresponding to positions with Ns in
#' \code{inputSeq} will also be assigned an N.
#' @param targetingModel \link{TargetingModel} object. Default is \link{HH_S5F}.
#' @param regionDefinition \link{RegionDefinition} object defining the regions
#' and boundaries of the Ig sequences.
#' @param mutationDefinition \link{MutationDefinition} object defining replacement
#' and silent mutation criteria. If \code{NULL} then
#' replacement and silent are determined by exact
#' amino acid identity.
#'
#' @return A \code{numeric} vector of the expected frequencies of mutations in the
#' regions in the \code{regionDefinition}. For example, when using the default
#' \link{IMGT_V} definition, which defines positions for CDR and
#' FWR, the following columns are calculated:
#' \itemize{
#' \item \code{mu_expected_cdr_r}: number of replacement mutations in CDR1 and
#' CDR2 of the V-segment.
#' \item \code{mu_expected_cdr_s}: number of silent mutations in CDR1 and CDR2
#' of the V-segment.
#' \item \code{mu_expected_fwr_r}: number of replacement mutations in FWR1,
#' FWR2 and FWR3 of the V-segment.
#' \item \code{mu_expected_fwr_s}: number of silent mutations in FWR1, FWR2 and
#' FWR3 of the V-segment.
#' }
#'
#' @details
#' \code{calcExpectedMutations} calculates the expected mutation frequencies of a
#' given sequence and its germline.
#'
#' Note, only the part of the sequences defined in \code{regionDefinition} are analyzed.
#' For example, when using the default \link{IMGT_V} definition, mutations in
#' positions beyond 312 will be ignored.
#'
#' @seealso \link{expectedMutations} calls this function.
#' To create a custom \code{targetingModel} see \link{createTargetingModel}.
#' See \link{calcObservedMutations} for getting observed mutation counts.
#'
#' @examples
#' # Load example data
#' data(ExampleDb, package="alakazam")
#'
#' # Use first entry in the exampled data for input and germline sequence
#' in_seq <- ExampleDb[["sequence_alignment"]][1]
#' germ_seq <- ExampleDb[["germline_alignment_d_mask"]][1]
#'
#' # Identify all mutations in the sequence
#' calcExpectedMutations(germ_seq,in_seq)
#'
#' # Identify only mutations the V segment minus CDR3
#' calcExpectedMutations(germ_seq, in_seq, regionDefinition=IMGT_V)
#'
#' # Define mutations based on hydropathy
#' calcExpectedMutations(germ_seq, in_seq, regionDefinition=IMGT_V,
#' mutationDefinition=HYDROPATHY_MUTATIONS)
#'
#' @export
calcExpectedMutations <- function(germlineSeq,
inputSeq=NULL,
targetingModel=HH_S5F,
regionDefinition=NULL,
mutationDefinition=NULL) {
# Check region definition
if (!is.null(regionDefinition) & !is(regionDefinition, "RegionDefinition")) {
stop(deparse(substitute(regionDefinition)), " is not a valid RegionDefinition object")
}
# Check mutation definition
if (!is.null(mutationDefinition) & !is(mutationDefinition, "MutationDefinition")) {
stop(deparse(substitute(mutationDefinition)), " is not a valid MutationDefinition object")
}
# Check targeting model
if (!is(targetingModel, "TargetingModel")) {
stop(deparse(substitute(targetingModel)), " is not a valid TargetingModel object")
}
# Mask ambiguous nucleotide characters
germlineSeq <- gsub("[MRWSYKVHDB]", "N", germlineSeq)
# Assign codon table
codonTable <- if (is.null(mutationDefinition)) { CODON_TABLE } else { mutationDefinition@codonTable }
# Get targeting
targeting <- calculateTargeting(germlineSeq=germlineSeq,
inputSeq=inputSeq,
targetingModel=targetingModel,
regionDefinition=regionDefinition)
# Determine the mutations paths (i.e. determine R and S mutation frequencies)
mutationalPaths <- calculateMutationalPaths(germlineSeq=c2s(colnames(targeting)),
regionDefinition=regionDefinition,
codonTable=codonTable)
typesOfMutations <- c("r", "s")
mutationalPaths[!(mutationalPaths %in% typesOfMutations)] <- NA
if (is.null(regionDefinition)) {
rdLength <- max(stri_length(inputSeq), stri_length(germlineSeq), na.rm=TRUE)
regionDefinition <- makeNullRegionDefinition(rdLength)
}
listExpectedMutationFrequencies <- list()
for(region in regionDefinition@regions){
for(typeOfMutation in typesOfMutations){
region_mutation <- paste(region, typeOfMutation, sep="_")
targeting_region <- targeting[1:4, regionDefinition@boundaries %in% region]
mutationalPaths_region <- mutationalPaths[, regionDefinition@boundaries[1:ncol(mutationalPaths)] %in% region]
targeting_typeOfMutation_region <- sum(targeting_region[mutationalPaths_region == typeOfMutation], na.rm=TRUE)
listExpectedMutationFrequencies[[region_mutation]] <- targeting_typeOfMutation_region
}
}
expectedMutationFrequencies <- unlist(listExpectedMutationFrequencies)
expectedMutationFrequencies[!is.finite(expectedMutationFrequencies)] <- NA
expectedMutationFrequencies <- expectedMutationFrequencies / sum(expectedMutationFrequencies, na.rm=TRUE)
return(expectedMutationFrequencies)
}
calculateTargeting <- function(germlineSeq,
inputSeq=NULL,
targetingModel=HH_S5F,
regionDefinition=NULL) {
# Check region definition
if (!is.null(regionDefinition) & !is(regionDefinition, "RegionDefinition")) {
stop(deparse(substitute(regionDefinition)), " is not a valid RegionDefinition object")
}
# Check targeting model
if (!is(targetingModel, "TargetingModel")) {
stop(deparse(substitute(targetingModel)), " is not a valid TargetingModel object")
}
# If an inputSequence is passed then process the germlineSequence
# to be the same legth, mask germlineSequence with Ns where inputSequence is also N
# If not needed then you may skip this step by passing in inputSequence=NULL
# (which is default).
if(!is.null(inputSeq)){
# Trim the input and germline sequence to the shortest
len_inputSeq <- stri_length(inputSeq)
len_germlineSeq <- stri_length(germlineSeq)
# If a regionDefinition is passed,
# then only analyze till the end of the defined length
if(!is.null(regionDefinition)){
length_regionDefinition <- regionDefinition@seqLength
} else{
length_regionDefinition <- max(len_inputSeq, len_germlineSeq, na.rm=TRUE)
}
len_shortest <- min( c(len_inputSeq,len_germlineSeq,length_regionDefinition), na.rm=TRUE)
c_inputSeq <- s2c(inputSeq)[1:len_shortest]
c_germlineSeq <- s2c(germlineSeq)[1:len_shortest]
# If the sequence and germline (which now should be the same length) is shorter
# than the length_regionDefinition, pad it with Ns
if(len_shortest < length_regionDefinition){
fillWithNs <- array("N", length_regionDefinition - len_shortest)
c_inputSeq <- c(c_inputSeq, fillWithNs)
c_germlineSeq <- c( c_germlineSeq, fillWithNs)
}
# Mask germline with Ns where input sequence has Ns
c_germlineSeq[c_inputSeq == "N" | !c_inputSeq %in% c(NUCLEOTIDES[1:5], ".") ] <- "N"
s_germlineSeq <- c2s(c_germlineSeq)
} else {
s_germlineSeq <- germlineSeq
}
# Removing IMGT gaps (they should come in threes)
# After converting ... to ZZZ any other . is not an IMGT gap & will be treated like N
gaplessSeq <- gsub("\\.\\.\\.", "ZZZ", s_germlineSeq)
#If there is a single gap left convert it to an N
gaplessSeq <- gsub("\\.", "N", gaplessSeq)
# Re-assigning s_germlineSeq (now has all "." that are not IMGT gaps converted to Ns)
gaplessSeq <- gsub("ZZZ", "...", gaplessSeq)
# Vector of seq
c_germlineSeq <- s2c(gaplessSeq)
# Matrix to hold targeting values for each position in c_germlineSeq
germlineSeqTargeting <- matrix(NA,
ncol=stri_length(gaplessSeq),
nrow=length(NUCLEOTIDES[1:5]),
dimnames=list(NUCLEOTIDES[1:5], c_germlineSeq))
# Now remove the IMGT gaps so that the correct 5mers can be made to calculate
# targeting. e.g.
# GAGAAA......TAG yields: "GAGAA" "AGAAA" "GAAAT" "AAATA" "AATAG"
# (because the IMGT gaps are NOT real gaps in sequence!!!)
gaplessSeq <- gsub("\\.\\.\\.", "", gaplessSeq)
gaplessSeqLen <- stri_length(gaplessSeq)
#Slide through 5-mers and look up targeting
gaplessSeq <- paste("NN", gaplessSeq, "NN", sep="")
gaplessSeqLen <- stri_length(gaplessSeq)
pos <- 3:(gaplessSeqLen - 2)
subSeq <- substr(rep(gaplessSeq, gaplessSeqLen - 4), (pos - 2), (pos + 2))
germlineSeqTargeting_gapless <- targetingModel@targeting[, subSeq]
# germlineSeqTargeting_gapless <- sapply(subSeq, function(x) {
# targetingModel@targeting[, x] })
germlineSeqTargeting[, c_germlineSeq != "."] <- germlineSeqTargeting_gapless
# Set self-mutating targeting values to be NA
mutatingToSelf <- colnames(germlineSeqTargeting)
mutatingToSelf[!(mutatingToSelf %in% NUCLEOTIDES[1:5])] <- "N"
# # TODO: What's with this <<- business?
# # TODO: I think this is assigning NA to all self-mutations, which are already NA
# sapply(1:ncol(germlineSeqTargeting), function(pos) { germlineSeqTargeting[mutatingToSelf[pos], pos] <<- NA })
germlineSeqTargeting[!is.finite(germlineSeqTargeting)] <- NA
return(germlineSeqTargeting)
}
calculateMutationalPaths <- function(germlineSeq,
inputSeq=NULL,
regionDefinition=NULL,
codonTable=NULL) {
# Check region definition
if (!is.null(regionDefinition) & !is(regionDefinition, "RegionDefinition")) {
stop(deparse(substitute(regionDefinition)), " is not a valid RegionDefinition object")
}
# Set codon table if required
if (is.null(codonTable)) { codonTable <- CODON_TABLE }
# If an inputSequence is passed then process the germlineSequence
# to be the same length, mask germlineSequence with Ns where inputSequence is also N
# If this function is being called after running calculateTargeting you may skip
# this step by passing in inputSequence=NULL (which is default). This way you save
# some processing time.
if(!is.null(inputSeq)){
# Trim the input and germline sequence to the shortest
len_inputSeq <- stri_length(inputSeq)
len_germlineSeq <- stri_length(germlineSeq)
# If a regionDefinition is passed,
# then only analyze till the end of the defined length
if(!is.null(regionDefinition)){
length_regionDefinition <- regionDefinition@seqLength
} else{
length_regionDefinition <- max(len_inputSeq, len_germlineSeq, na.rm=TRUE)
}
len_shortest <- min( c(len_inputSeq,len_germlineSeq,length_regionDefinition), na.rm=TRUE)
c_inputSeq <- s2c(inputSeq)[1:len_shortest]
c_germlineSeq <- s2c(germlineSeq)[1:len_shortest]
# If the sequence and germline (which now should be the same length) is shorter
# than the length_regionDefinition, pad it with Ns
if(len_shortest<length_regionDefinition){
fillWithNs <- array("N",length_regionDefinition-len_shortest)
c_inputSeq <- c(c_inputSeq, fillWithNs)
c_germlineSeq <- c(c_germlineSeq, fillWithNs)
}
# Mask germline with Ns where input sequence has Ns
c_germlineSeq[c_inputSeq == "N" | !c_inputSeq %in% c(NUCLEOTIDES[1:5], ".") ] = "N"
s_germlineSeq <- c2s(c_germlineSeq)
} else {
s_germlineSeq <- germlineSeq
c_germlineSeq <- s2c(s_germlineSeq)
}
s_germlineSeq_len <- stri_length(s_germlineSeq)
vecCodons <- sapply({1:(s_germlineSeq_len/3)}*3 - 2, function(x) { substr(s_germlineSeq, x, x + 2) })
vecCodons[!vecCodons %in% colnames(codonTable)] <- "NNN"
matMutationTypes <- matrix(codonTable[, vecCodons], nrow=4, byrow=F,
dimnames=list(NUCLEOTIDES[1:4], c_germlineSeq[ 1:(3 * length(vecCodons))]))
return(matMutationTypes)
}
#### Additional helper functions ####
# Convert one or more nucleotide characters to IUPAC code
# for incomplete nucleic acid specification
#
# @param nucs a character vector of nucleotides. One or more of
# \code{c("A", "C", "G", "T")}.
#
# @return a single character from the IUPAC ambiguous code.
#
nucs2IUPAC <- function(nucs) {
# input nucleotides must be one of the characters allowed
legal <- c("A", "C", "G", "T")
if (sum(! nucs %in% legal)>0) {
stop("Input nucleotides must be one of A, C, G, or T.")
}
# sort by alphabetical order (important)
nucs <- sort(unique(nucs))
# concatenate
nucs <- c2s(nucs)
# convert
return(IUPAC_DNA_2[nucs])
}
# Convert one or more characters including dash and dots to ambiguous characters
#
# @param chars a character vector of nucleotides. One or more of
# \code{c("A", "C", "G", "T", "N", "-", ".")}.
#
# @return a single IUPAC character or "-" or "."
#
chars2Ambiguous <- function(chars) {
# chars must all be unique
stopifnot(length(unique(chars)) == length(chars))
# input characters must be one of the characters allowed
legal <- c("A", "C", "G", "T", "N", "-", ".")
if (sum(! chars %in% legal) > 0) {
stop("Input characters must be one of A, C, G, T, N, - (dash), or . (dot)")
}
# if any of A, T, G, C, N appears
if (any(chars %in% c("A", "C", "G", "T", "N"))) {
# ignore - and .
idx.dash.dot <- which(chars == "-" | chars == ".")
if (length(idx.dash.dot)>0) {
chars <- chars[-idx.dash.dot]
}
# if only N appears
if (sum(chars=="N") == length(chars)) {
return("N")
} else {
# otherwise, if there are any of A, T, G, C
# remove N
# e.g. AGN would be treated as AG (R)
# e.g. ATGN would be treated as AGT (D)
# e.g. ATGCN would be treated as ACGT (N)
idx.N <- which(chars == "N")
if (length(idx.N) > 0) {
chars <- chars[-idx.N]
}
return(nucs2IUPAC(chars))
}
} else {
# otherwise, if only one or both of - and . appear(s)
# if both - and . appear, return -
if (sum(chars %in% c("-", ".")) == 2) {
return("-")
} else {
# if only - or . appears, return that
return(chars)
}
}
}
# Convert IUPAC incomplete nucleic acid to one or more characters
#
# @param code a single IUPAC character.
# @param excludeN if \code{TRUE}, do not translate when \code{code}
# is \code{N}. Default is \code{TRUE}.
# @return a character vector of nucleotides. One or more of
# \code{c("A", "C", "G", "T")}.
#
IUPAC2nucs <- function(code, excludeN=TRUE) {
# input character must be one of IUPAC codes
if (! code %in% names(IUPAC_DNA) ) {
stop("Input character must be one of IUPAC DNA codes.")
}
# convert
if (code == "N" & excludeN) {
return(code)
} else {
return(IUPAC_DNA[[code]])
}
}
# Given a nuclotide position, returns the codon number
# e.g. nuc 86 = codon 29
getCodonNumb <- function(nucPos){
return( ceiling(nucPos/3) )
}
# Given a codon, returns all the nuc positions that make the codon
getCodonNucs <- function(codonNumb){
getCodonPos(codonNumb*3)
}
# Given a nucleotide postions return the position in the codon
getContextInCodon <- function(nucPos){
return((nucPos - 1)%%3 + 1 )
}
# Given a nuclotide position, returns the pos of the 3 nucs that made the codon
# e.g. nuc 86 is part of nucs 85,86,87
getCodonPos <- function(nucPos) {
codonNum <- (ceiling(nucPos / 3)) * 3
return ((codonNum - 2):codonNum)
}
# Given two codons, tells you if the mutation is R or S (based on your definition)
#
# @param codonFrom starting codon. IUPAC ambiguous characters are allowed.
# @param codonTo ending codon. IUPAC ambiguous characters are allowed.
# @param ambiguousMode whether to consider ambiguous characters as "either or"
# or "and" when determining (and counting) the type(s) of
# mutations. Applicable only if \code{codonFrom} and/or
# \code{codonTo} contains ambiguous characters. One of
# \code{c("eitherOr", "and")}. Default is \code{"eitherOr"}.
# @param aminoAcidClasses vector of amino acid trait classes.
# if NULL then R or S is determined by amino acid identity
# @return A vector with entries named by mutation type, including "r" (replacement),
# "s" (silent), "stop" (stop) or "na" (input codons are identical or include NA).
# Each entry indicates the count of its corresponding type of mutation.
#
# @details When there are ambiguous characters in \code{codonFrom} and/or \code{codonTo}:
# \itemize{
# \item If \code{ambiguousMode="eitherOr"}, ambiguous characters will each
# be expanded but only 1 mutation will be recorded. The priority for
# different types of mutations, in decreasing order, is as follows:
# no mutation ("na"), replacement mutation ("r"), silent mutation ("s"),
# and stop mutation ("Stop"). The returned vector will have exactly one
# entry with a count of 1 and 0 in all other entries.
# \item If \code{ambiguousMode="and"}, ambiguous characters will each be
# expanded and mutation(s) from each expansion will be recorded.
# Each entry in the returned vector could potentially be greater than 1.
# }
#
# @examples
# # Without classes
# mutationType("TTT", "TTC")
# mutationType("TTT", "TTA")
# mutationType("TTT", "TGA")
# mutationType("TGG", "TWG")
#
# # With classes
# classes <- HYDROPATHY_MUTATIONS@classes
# mutationType("TTT", "TTC", aminoAcidClasses=classes)
# mutationType("TTT", "TTA", aminoAcidClasses=classes)
# mutationType("TTT", "TCT", aminoAcidClasses=classes)
# mutationType("TTT", "TGA", aminoAcidClasses=classes)
#
mutationType <- function(codonFrom, codonTo,
ambiguousMode=c("eitherOr", "and"),
aminoAcidClasses=NULL) {
# codonFrom="TTT"; codonTo="TTA"
# codonFrom="TTT"; codonTo="TGA"
ambiguousMode <- match.arg(ambiguousMode)
# placeholder for tabulation
tab <- setNames(object=rep(0, 4), nm=c("r", "s", "stop", "na"))
if (grepl(pattern="[-.]", x=codonFrom) | grepl(pattern="[-.]", x=codonTo)) {
# "na"
tab[4] <- 1
} else {
codonFrom.all <- EXPANDED_AMBIGUOUS_CODONS[[codonFrom]]
codonTo.all <- EXPANDED_AMBIGUOUS_CODONS[[codonTo]]
for (cur.codonFrom in codonFrom.all) {
for (cur.codonTo in codonTo.all) {
# if codons are the same, there is no mutation; count as NA
if (cur.codonFrom == cur.codonTo) {
# "na"
tab[4] <- tab[4] + 1
} else {
# Translate codons
cur.aaFrom <- AMINO_ACIDS[cur.codonFrom]
cur.aaTo <- AMINO_ACIDS[cur.codonTo]
# If any codon is NA then return NA
if (any(is.na(c(codonFrom, codonTo, cur.aaFrom, cur.aaTo)))) {
# "na"
tab[4] <- tab[4] + 1
} else if (any(c(cur.aaFrom, cur.aaTo) == "*")) {
# If any amino acid is Stop then return "stop"
tab[3] <- tab[3] + 1
} else if (is.null(aminoAcidClasses)) {
# Check for exact identity if no amino acid classes are specified
mutation <- if (cur.aaFrom == cur.aaTo) { "s" } else { "r" }
tab[mutation] <- tab[mutation]+1
} else {
# Check for amino acid class identity if classes are specified
mutation <- if (aminoAcidClasses[cur.aaFrom] ==
aminoAcidClasses[cur.aaTo]) { "s" } else { "r" }
tab[mutation] <- tab[mutation]+1
}
}
}
}
# if there's ambiguous char in observed or germline
if ((length(codonFrom.all) > 1) | (length(codonTo.all) > 1)) {
if (ambiguousMode=="eitherOr") {
if (tab[4] > 0) { # "na"
tab <- setNames(object=c(0, 0, 0, 1), nm=c("r", "s", "stop", "na"))
} else if (tab[2] > 0) { # "S"
tab <- setNames(object=c(0, 1, 0, 0), nm=c("r", "s", "stop", "na"))
} else if (tab[1] > 0) { # "R"
tab <- setNames(object=c(1, 0, 0, 0), nm=c("r", "s", "stop", "na"))
} else {
tab <- setNames(object=c(0, 0, 1, 0), nm=c("r", "s", "stop", "na"))
}
stopifnot(sum(tab) == 1)
} else {
stopifnot(sum(tab) >= 1)
}
} else {
# no need to do anything if there isn't ambiguous char in observed or germline
# there should be only 1 mutation
stopifnot(sum(tab) == 1)
}
}
return(tab)
}
# returns a boolean vector indicating whether ambiguous characters
# exist in each entry of input character vector
# input:
# - seqs: a character vector
# output:
# - a boolean vector, where a TRUE indicates presence of ambiguous
# character(s)
checkAmbiguousExist <- function(seqs) {
# ^ within brackets negates the character class
bool <- stri_detect_regex(str=seqs, pattern="[^atgcnATGCN\\-\\.]")
return(bool)
}
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.