Nothing
# Find non triplet gaps in a nucleotide sequence
hasNonImgtGaps <- function (seq) {
len <- ceiling(nchar(seq)/3)*3
codons <- substring(seq, seq(1, len-2, 3), seq(3, len, 3))
gaps_lengths <- nchar(gsub("[^\\.\\-]", "", codons))
if (any(gaps_lengths %% 3 != 0)) {
TRUE
} else {
FALSE
}
}
# Compare two IMGT gapped sequences and find AA mutations
getMutatedAA <- function(ref_imgt, novel_imgt) {
n_ref <- grepl("N", ref_imgt)
if (any(n_ref)) {
warning(sum(n_ref)," Ns found in ref_imgt")
}
n_novel <- grepl("N", novel_imgt)
if (any(n_novel)) {
warning(sum(n_novel)," Ns found in novel_imgt")
}
if (hasNonImgtGaps(ref_imgt)) {
warning("Non IMGT gaps found in ref_imgt")
}
if (hasNonImgtGaps(novel_imgt)) {
warning("Non IMGT gaps found in novel_imgt")
}
ref_imgt <- strsplit(alakazam::translateDNA(ref_imgt),"")[[1]]
novel_imgt <- strsplit(alakazam::translateDNA(novel_imgt),"")[[1]]
mutations <- c()
diff_idx <- which(ref_imgt != novel_imgt)
if (length(diff_idx)>0) {
mutations <- paste0(diff_idx, ref_imgt[diff_idx],">",
replace(novel_imgt[diff_idx], is.na(novel_imgt[diff_idx]),"-"))
}
mutations
}
#' Generate evidence
#'
#' \code{generateEvidence} builds a table of evidence metrics for the final novel V
#' allele detection and genotyping inferrences.
#'
#' @param data a \code{data.frame} containing sequence data that has been
#' passed through \link{reassignAlleles} to correct the allele
#' assignments.
#' @param novel the \code{data.frame} returned by \link{findNovelAlleles}.
#' @param genotype the \code{data.frame} of alleles generated with \link{inferGenotype}
#' denoting the genotype of the subject.
#' @param genotype_db a vector of named nucleotide germline sequences in the genotype.
#' Returned by \link{genotypeFasta}.
#' @param germline_db the original uncorrected germline database used to by
#' \link{findNovelAlleles} to identify novel alleles.
#' @param j_call name of the column in \code{data} with J allele calls.
#' Default is \code{j_call}.
#' @param junction Junction region nucleotide sequence, which includes
#' the CDR3 and the two flanking conserved codons. Default
#' is \code{junction}.
#' @param fields character vector of column names used to split the data to
#' identify novel alleles, if any. If \code{NULL} then the data is
#' not divided by grouping variables.
#'
#' @return
#' Returns the \code{genotype} input \code{data.frame} with the following additional columns
#' providing supporting evidence for each inferred allele:
#'
#' \itemize{
#' \item \code{field_id}: Data subset identifier, defined with the input paramter \code{fields}.
#' \item A variable number of columns, specified with the input parameter \code{fields}.
#' \item \code{polymorphism_call}: The novel allele call.
#' \item \code{novel_imgt}: The novel allele sequence.
#' \item \code{closest_reference}: The closest reference gene and allele in
#' the \code{germline_db} database.
#' \item \code{closest_reference_imgt}: Sequence of the closest reference gene and
#' allele in the \code{germline_db} database.
#' \item \code{germline_call}: The input (uncorrected) V call.
#' \item \code{germline_imgt}: Germline sequence for \code{germline_call}.
#' \item \code{nt_diff}: Number of nucleotides that differ between the new allele and
#' the closest reference (\code{closest_reference}) in the \code{germline_db} database.
#' \item \code{nt_substitutions}: A comma separated list of specific nucleotide
#' differences (e.g. \code{112G>A}) in the novel allele.
#' \item \code{aa_diff}: Number of amino acids that differ between the new allele and the closest
#' reference (\code{closest_reference}) in the \code{germline_db} database.
#' \item \code{aa_substitutions}: A comma separated list with specific amino acid
#' differences (e.g. \code{96A>N}) in the novel allele.
#' \item \code{sequences}: Number of sequences unambiguosly assigned to this allele.
#' \item \code{unmutated_sequences}: Number of records with the unmutated novel allele sequence.
#' \item \code{unmutated_frequency}: Proportion of records with the unmutated novel allele
#' sequence (\code{unmutated_sequences / sequences}).
#' \item \code{allelic_percentage}: Percentage at which the (unmutated) allele is observed
#' in the sequence dataset compared to other (unmutated) alleles.
#' \item \code{unique_js}: Number of unique J sequences found associated with the
#' novel allele. The sequences are those who have been unambiguously assigned
#' to the novel allelle (\code{polymorphism_call}).
#' \item \code{unique_cdr3s}: Number of unique CDR3s associated with the inferred allele.
#' The sequences are those who have been unambiguously assigned to the
#' novel allelle (polymorphism_call).
#' \item \code{mut_min}: Minimum mutation considered by the algorithm.
#' \item \code{mut_max}: Maximum mutation considered by the algorithm.
#' \item \code{pos_min}: First position of the sequence considered by the algorithm (IMGT numbering).
#' \item \code{pos_max}: Last position of the sequence considered by the algorithm (IMGT numbering).
#' \item \code{y_intercept}: The y-intercept above which positions were considered
#' potentially polymorphic.
#' \item \code{alpha}: Significance threshold to be used when constructing the
#' confidence interval for the y-intercept.
#' \item \code{min_seqs}: Input \code{min_seqs}. The minimum number of total sequences
#' (within the desired mutational range and nucleotide range) required
#' for the samples to be considered.
#' \item \code{j_max}: Input \code{j_max}. The maximum fraction of sequences perfectly
#' aligning to a potential novel allele that are allowed to utilize to a particular
#' combination of junction length and J gene.
#' \item \code{min_frac}: Input \code{min_frac}. The minimum fraction of sequences that must
#' have usable nucleotides in a given position for that position to be considered.
#' \item \code{note}: Comments regarding the novel allele inferrence.
#' }
#'
#' @seealso
#' See \link{findNovelAlleles}, \link{inferGenotype} and \link{genotypeFasta}
#' for generating the required input.
#'
#' @examples
#' \donttest{
#' # Generate input data
#' novel <- findNovelAlleles(AIRRDb, SampleGermlineIGHV,
#' v_call="v_call", j_call="j_call", junction="junction",
#' junction_length="junction_length", seq="sequence_alignment")
#' genotype <- inferGenotype(AIRRDb, find_unmutated=TRUE,
#' germline_db=SampleGermlineIGHV,
#' novel=novel,
#' v_call="v_call", seq="sequence_alignment")
#' genotype_db <- genotypeFasta(genotype, SampleGermlineIGHV, novel)
#' data_db <- reassignAlleles(AIRRDb, genotype_db,
#' v_call="v_call", seq="sequence_alignment")
#'
#' # Assemble evidence table
#' evidence <- generateEvidence(data_db, novel, genotype,
#' genotype_db, SampleGermlineIGHV,
#' j_call = "j_call",
#' junction = "junction")
#' }
#'
#' @export
generateEvidence <- function(data, novel, genotype, genotype_db,
germline_db, j_call="j_call", junction="junction",
fields=NULL) {
# Visibility hack
. <- NULL
# Define set of sequences containing genotype and uncorrected calls
germline_set <- c(germline_db[!names(germline_db) %in% names(genotype_db)],
genotype_db)
# Find closest reference
.findClosestReference <- function(seq, allele_calls, ref_germ,
exclude_self=F, multiple=F) {
# filter `ref_germ`, use only same gene segment as `seq`
diff_calls <- sub("[0-9]+$","",getFamily(names(ref_germ))) != sub("[0-9]+$","",getFamily(names(seq)))
if (sum(diff_calls) > 0) {
allele_calls <- allele_calls[allele_calls %in% names(ref_germ[diff_calls]) == FALSE]
ref_germ <- ref_germ[!diff_calls]
}
closest <- getMutCount(seq,
paste(allele_calls, collapse=","),
ref_germ)
min_dist <- min(unlist(closest))
closest_idx <- which(unlist(closest) == min_dist)
closest_names <- unique(allele_calls[closest_idx])
if (exclude_self & names(seq) %in% closest_names) {
warning("Excluding self")
closest_names <- closest_names[closest_names!=names(seq)] # not self
}
if (length(closest_names) > 1) {
warning(paste0("More than one closest reference found for ",
names(seq),": ",
paste(closest_names, collapse=",")))
# Keep the one with less mutated positions
mut_pos_count <- sapply(gsub("[^_]","",closest_names), nchar)
closest_names <- closest_names[mut_pos_count==min(mut_pos_count)]
# Pick same length
if (length(closest_names) > 1 ) {
idx <- which(
sapply(ref_germ[closest_names],nchar) == nchar(ref_germ[names(seq)])
)
if (length(idx) > 0 ) {
closest_names <- closest_names[idx]
}
}
# Pick same allele
if (length(closest_names) > 1 ) {
idx <- which(
getAllele(closest_names) == gsub("_.+", "", getAllele(names(seq)))
)
if (length(idx) > 0 ) {
closest_names <- closest_names[idx]
}
}
# Pick not duplicated
if (length(closest_names) > 1 ) {
idx <- !grepl("D\\*", closest_names)
if (any(idx)) {
closest_names <- closest_names[idx]
}
}
# If still more than one, err and TODO
if (length(closest_names) > 1 & multiple==FALSE) {
msg <- paste0("Multiple closest reference found for ",
names(seq),":\n",
paste(closest_names, collapse=","))
stop(msg)
}
warning(paste0("Use: ",
paste(closest_names, collapse=","),
" (less mutated positions, not D, same length, same allele)"))
}
closest_names
}
# Subset to novel alleles
unnest_cols <- c("alleles", "counts")
final_gt <- genotype %>%
dplyr::group_by(.data$gene) %>%
dplyr::filter(!duplicated(.data$alleles)) %>%
dplyr::ungroup() %>%
dplyr::mutate(alleles=strsplit(as.character(.data$alleles), ","),
counts=strsplit(as.character(.data$counts), ",")) %>%
tidyr::unnest(dplyr::all_of(unnest_cols)) %>%
dplyr::mutate(polymorphism_call=paste0(.data$gene, "*" , .data$alleles)) %>%
dplyr::filter(.data$polymorphism_call %in% novel$polymorphism_call) %>%
dplyr::rename(allele="alleles")
# Add info from novel
final_gt <- dplyr::inner_join(dplyr::rename(final_gt, note_gt="note"),
novel,
by=c(fields, "polymorphism_call"))
# Add message if the same novel img sequence found from
# different starting alleles, these will be novel imgt sequences
# with more than one polymorphism call
final_gt <- final_gt %>%
dplyr::group_by(.data$novel_imgt) %>%
dplyr::mutate(NUM_CALLS=length(unique(.data$polymorphism_call))) %>%
dplyr::ungroup()
idx_mult <- which(final_gt$NUM_CALLS > 1)
final_gt$NUM_CALLS <- NULL
if (length(idx_mult) > 0) {
final_gt$note_gt[idx_mult] <- paste(
final_gt$note_gt[idx_mult],
" Found multiple polymorphism calls for the same novel_imgt.",
sep="")
}
if (nrow(final_gt)>0) {
.addEvidence <- function(df, germline_set, germline_db) {
polymorphism <- df[['polymorphism_call']]
novel_imgt <- df[["novel_imgt"]]
names(novel_imgt) <- polymorphism
v_call_genotyped <- data[["v_call_genotyped"]]
sequences <- sum(v_call_genotyped == polymorphism)
df[["sequences"]] <- sequences
closest_ref_input <- .findClosestReference(novel_imgt,
names(germline_db),
germline_db,
exclude_self=F)
closest_ref <- .findClosestReference(novel_imgt,
names(germline_set),
germline_set,
exclude_self=F, multiple=T)
if (all(getGene(closest_ref_input) != getGene(closest_ref))) {
warning("closest reference gene difference")
}
if (all(closest_ref != polymorphism)) {
warning(paste0("closest reference allele (",
closest_ref
,") different from polymorphism_call allele (",
polymorphism,")"))
}
df[["closest_reference"]] <- closest_ref_input
nt_diff <- unlist(getMutatedPositions(novel_imgt, germline_set[[closest_ref_input]]))
nt_diff_string <- ""
if (nchar(novel_imgt) < nchar(germline_set)[[closest_ref_input]]) {
nt_diff <- c(nt_diff, (nchar(novel_imgt)+1):nchar(germline_set[[closest_ref_input]]))
}
if (length(nt_diff) > 0 ) {
ref_nt <- strsplit(germline_set[[closest_ref_input]],"")[[1]][nt_diff]
novel_nt <- strsplit(germline_set[[polymorphism]],"")[[1]][nt_diff]
nt_diff_string <- paste(paste(
nt_diff,
ref_nt,
">",
replace(novel_nt, is.na(novel_nt), "-"),
sep=""), collapse=",")
}
df[["nt_diff"]] <- length(nt_diff)
df[["nt_substitutions"]] <- nt_diff_string
diff_aa <- getMutatedAA(germline_set[[closest_ref_input]], germline_set[[polymorphism]])
if (length(diff_aa)>0) {
df[["aa_diff"]] <- length(diff_aa)
df[["aa_substitutions"]] <- paste(diff_aa,collapse=",")
} else {
df[["aa_diff"]] <- 0
df[["aa_substitutions"]] <- ""
}
df[["counts"]] <- as.numeric(df[["counts"]])
df[["total"]] <- as.numeric(df[["total"]])
df[["unmutated_sequences"]] <- as.numeric(df[["counts"]])
df[["unmutated_frequency"]] <- as.numeric(df[["counts"]])/sequences
df[["allelic_percentage"]] <- 100*df[["unmutated_sequences"]]/as.numeric(df[["total"]])
if (sequences > 0) {
df[["unique_js"]] <- data %>%
dplyr::filter(.data$v_call_genotyped == polymorphism) %>%
dplyr::distinct(.data[[j_call]]) %>%
nrow()
df[["unique_cdr3s"]] <- data %>%
dplyr::filter(.data$v_call_genotyped == polymorphism) %>%
dplyr::distinct(translateDNA(.data[[junction]], trim=TRUE)) %>%
nrow()
} else {
df[["unique_js"]] <- NA
df[["unique_cdr3s"]] <- NA
}
# Add closest germline
df[["closest_reference_imgt"]] <- cleanSeqs(germline_set[[closest_ref_input]])
data.frame(df, stringsAsFactors=FALSE)
}
final_gt <- final_gt %>%
dplyr::rowwise() %>%
do(.addEvidence(., germline_set=germline_set, germline_db=germline_db)) %>%
dplyr::mutate(note=trimws(paste(.data$note_gt, .data$note, sep=" "))) %>%
dplyr::select(-c("note_gt"))
}
return(final_gt)
}
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.