setGeneric(
"yassai_identifier",
function(data, V_after_C, J_before_FGxG, long=FALSE)
standardGeneric("yassai_identifier")
)
# Case of a single clonotype:
# convert the data vector to a one-line data frame and call the function again.
setMethod(
yassai_identifier,
c(data="character", V_after_C="data.frame", J_before_FGxG="data.frame", long="ANY"),
function(data, V_after_C, J_before_FGxG, long) {
if(missing(long)) long <- FALSE
yassai_identifier(
data.frame(t(data), stringsAsFactors=F),
V_after_C,
J_before_FGxG,
long)
})
# Load default data if no V_after_C and J_before_FGxG tables are specified.
# Custom data in "./inst/extdata/" has precedence.
setMethod(
yassai_identifier,
c(data="ANY", V_after_C="missing", J_before_FGxG="missing", long="ANY"),
function(data, long) {
if ( file.exists("inst/extdata/V_after_C.txt.gz") ) {
V_after_C <- read.table("inst/extdata/V_after_C.txt.gz", header=TRUE, row.names=1, stringsAsFactors=FALSE)
warning("Loading custom data from 'inst/extdata/V_after_C.txt.gz'.")
} else {
V_after_C <- read.table(system.file('extdata', 'V_after_C.txt.gz', package = "clonotypeR"), stringsAsFactors=FALSE)
}
if ( file.exists("inst/extdata/J_before_FGxG.txt.gz") ) {
J_before_FGxG <- read.table("inst/extdata/J_before_FGxG.txt.gz", header=TRUE, row.names=1, stringsAsFactors=FALSE)
warning("Loading custom data from 'inst/extdata/J_before_FGxG.txt.gz'.")
} else {
J_before_FGxG <- read.table(system.file('extdata', 'J_before_FGxG.txt.gz', package = "clonotypeR"), stringsAsFactors=FALSE)
}
if ( missing(long) ) long <- FALSE
yassai_identifier(data, V_after_C, J_before_FGxG, long)
})
# Main function.
setMethod(
yassai_identifier,
c(data="data.frame", V_after_C="data.frame", J_before_FGxG="data.frame", long="logical"),
function(data, V_after_C, J_before_FGxG, long=FALSE) {
if ( ! ( exists("codon_ids") && class(codon_ids) == "data.frame" ) )
if ( file.exists("inst/extdata/codon_ids.txt.gz") )
codon_ids <- read.table("inst/extdata/codon_ids.txt.gz", header=TRUE, row.names=1)
if ( ! ( exists("codon_ids") && class(codon_ids) == "data.frame" ) )
codon_ids <- read.table(system.file('extdata', 'codon_ids.txt.gz', package = "clonotypeR"), header=TRUE, row.names=1)
if ( ! all( c("V", "J", "dna","pep") %in% names(data) ) )
stop ("Missing V or J segment(s), or DNA or peptides sequence(s) in the data.")
strReverse <- function(x) # From ?strsplit help page.
sapply(lapply(strsplit(x, NULL), rev), paste, collapse="")
V_name <- as.character(data$V)
V <- as.character(V_after_C[V_name,])
dna <- as.character(data$dna)
J_name <- as.character(data$J)
J <- strReverse(as.character(J_before_FGxG[J_name,]))
dnarev <- strReverse(dna)
pep <- as.character(data$pep)
# True if the reference and CDR3 codons are identical.
is.germline <- function (ref,dna,pos) {
if (pos > nchar(dna)) return (FALSE)
answer <- toupper(substr(ref, 1, pos)) == toupper(substr(dna, 1, pos))
answer[is.na(answer)] <- FALSE # Replace NA per FALSE; is.germline is used in a while loop.
return(answer)
}
germline <- function (ref, dna) {
pos <- 1
while( is.germline(ref,dna,pos) )
pos <- pos + 1
return(pos)
}
# Example
#
# Germline-encoded sequence stops at codon 2 (V_germline).
# T T |
# GCA GCA AGT G
# TRAV14-1 GCA GCA TCT TAT AAC CAG GGG AAG CTT ATC TRAJ23 nchar = 30
# G AAT TAT AAC CAG GGG AAG CTT ATC
# | T T T T T T T
# Germline-encoded sequence restarts at codon 4 (J_germline).
V_germline <- ceiling( apply(cbind(V,dna), 1, function(X) germline(X[1], X[2])) / 3 ) - 1
J_germline <- ceiling( ( nchar(dna) - apply(cbind(J,dnarev), 1, function(X) germline(X[1], X[2])) + 1) / 3 ) + 1
tocodons <- function (sequences)
strsplit(sequences, "(?<=...)", perl=TRUE)
codon2id <- function (codons) {
sapply(codons, function(X) {
if (length(X) == 0) {
return('')
} else {
return(codon_ids[X,"id"])
}
})
}
###################
#Remains to do:
#
# - Convert to lowercase where applicable
# - get codon ids for remaining residues
# Return something for unproductive clonotypes
##################
# Convert to lower case and remove all but the most central germlineally encoded codons.
CDR3aa <- toupper(pep) # Just in case
substr(CDR3aa, 1, V_germline) <- tolower(substr(CDR3aa, 1, V_germline))
substr(CDR3aa, J_germline, nchar(CDR3aa)) <- tolower(substr(CDR3aa, J_germline, nchar(CDR3aa)))
# The published version of the Yassai identifier has "collisions": clonotypes with different
# DNA sequences but same identifiers. As a workaround, the "long" option skips the trimming
# of the leftmost and rightmost unmodified germline codons.
if (long == FALSE) {
CDR3aa <- substring(CDR3aa, 1, J_germline)
CDR3aa <- substring(CDR3aa, V_germline, nchar(CDR3aa))
}
# Convert the V and J names
V_name <- sub("TRAV","A",V_name)
V_name <- sub("N-", "N",V_name)
V_name <- sub("D-", "D",V_name)
V_name <- sub("/.*", "" ,V_name)
V_name <- sub("TRBV","B",V_name)
V_name <- sub("TRGV","G",V_name)
V_name <- sub("TRDV","D",V_name)
J_name <- sub("TRAJ","A",J_name)
J_name <- sub("TRBJ","B",J_name)
J_name <- sub("TRGJ","G",J_name)
J_name <- sub("TRDJ","D",J_name)
# Determine the ID for the remaining codons.
IDs <- codon2id(tocodons(substr(dna,(V_germline * 3) + 1 , (J_germline -1) * 3 )))
# Disambiguate blunt V/J recombinations where all codons are like either V or J germline.
# These kind of collisions only happen with the "long" format.
# Example:
# > data
# V J dna pep
# 1 TRAV14N-1 TRAJ56 GCAGCTACTGGAGGCAATAATAAGCTGACT AATGGNNKLT
# 2 TRAV14N-1 TRAJ56 GCAGCAACTGGAGGCAATAATAAGCTGACT AATGGNNKLT
# > yassai_identifier(data, long=T)
# [1] "aatggnnklt.1A14N1A56L10" "aatggnnklt.2A14N1A56L10"
# > yassai_identifier(data, long=F)
# [1] "aa.1A14N1A56L10" "at.2A14N1A56L10"
IDs[IDs == ''] <- V_germline[IDs == '']
## Different paste commands are needed if the input is one or multiple clonotypes.
if ( nrow(data) == 1 ) {
IDs <- paste(IDs, collapse='')
} else {
IDs <- sapply(IDs, paste, collapse='')
}
# Construct and return the CDR3 in Yassai et al's nomenclature.
return( paste(CDR3aa, ".", IDs, V_name, J_name, "L", nchar(pep), sep=""))
})
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.