R/yassai_identifier.R

setGeneric(
    "yassai_identifier",
    function(data, V_after_C, J_before_FGxG, long=FALSE)
    standardGeneric("yassai_identifier")
)

# Case of a single clonotype:
# convert the data vector to a one-line data frame and call the function again.

setMethod(
    yassai_identifier,
    c(data="character", V_after_C="data.frame", J_before_FGxG="data.frame", long="ANY"),
    function(data, V_after_C, J_before_FGxG, long) {
    if(missing(long)) long <- FALSE
    yassai_identifier(
        data.frame(t(data), stringsAsFactors=F),
        V_after_C,
        J_before_FGxG,
        long)
})

# Load default data if no V_after_C and J_before_FGxG tables are specified.
# Custom data in "./inst/extdata/" has precedence.

setMethod(
    yassai_identifier,
    c(data="ANY", V_after_C="missing", J_before_FGxG="missing", long="ANY"),
    function(data, long) {
    if ( file.exists("inst/extdata/V_after_C.txt.gz") ) {
        V_after_C <- read.table("inst/extdata/V_after_C.txt.gz", header=TRUE, row.names=1, stringsAsFactors=FALSE)
        warning("Loading custom data from 'inst/extdata/V_after_C.txt.gz'.")
    } else {
        V_after_C <- read.table(system.file('extdata', 'V_after_C.txt.gz', package = "clonotypeR"), stringsAsFactors=FALSE)
    }
    if ( file.exists("inst/extdata/J_before_FGxG.txt.gz") ) {
        J_before_FGxG <- read.table("inst/extdata/J_before_FGxG.txt.gz", header=TRUE, row.names=1, stringsAsFactors=FALSE)
        warning("Loading custom data from 'inst/extdata/J_before_FGxG.txt.gz'.")
    } else {
        J_before_FGxG <- read.table(system.file('extdata', 'J_before_FGxG.txt.gz', package = "clonotypeR"), stringsAsFactors=FALSE)
    }
    if ( missing(long) ) long <- FALSE
    yassai_identifier(data, V_after_C, J_before_FGxG, long)
})

# Main function.

setMethod(
    yassai_identifier,
    c(data="data.frame", V_after_C="data.frame", J_before_FGxG="data.frame", long="logical"),
    function(data, V_after_C, J_before_FGxG, long=FALSE) {

if ( ! ( exists("codon_ids") && class(codon_ids) == "data.frame" ) )
	if ( file.exists("inst/extdata/codon_ids.txt.gz") )
		codon_ids <- read.table("inst/extdata/codon_ids.txt.gz", header=TRUE, row.names=1)
if ( ! ( exists("codon_ids") && class(codon_ids) == "data.frame" ) )
	codon_ids <- read.table(system.file('extdata', 'codon_ids.txt.gz', package = "clonotypeR"), header=TRUE, row.names=1)

if ( ! all( c("V", "J", "dna","pep") %in% names(data) ) )
  stop ("Missing V or J segment(s), or DNA or peptides sequence(s) in the data.")

strReverse <- function(x)   # From ?strsplit help page.
	sapply(lapply(strsplit(x, NULL), rev), paste, collapse="")

V_name <- as.character(data$V)
V      <- as.character(V_after_C[V_name,])

dna    <- as.character(data$dna)

J_name <- as.character(data$J)
J      <- strReverse(as.character(J_before_FGxG[J_name,]))
dnarev <- strReverse(dna)

pep    <- as.character(data$pep)

# True if the reference and CDR3 codons are identical.
is.germline <- function (ref,dna,pos) {
	if (pos > nchar(dna)) return (FALSE)
	answer <- toupper(substr(ref, 1, pos)) == toupper(substr(dna, 1, pos))
	answer[is.na(answer)] <- FALSE  # Replace NA per FALSE; is.germline is used in a while loop.
	return(answer)
}

germline <- function (ref, dna) {
	pos <- 1
	while( is.germline(ref,dna,pos) )
		pos <- pos + 1
	return(pos)
}

# Example
# 
# Germline-encoded sequence stops at codon 2 (V_germline).
#             T   T  |
#            GCA GCA AGT G
#   TRAV14-1 GCA GCA TCT TAT AAC CAG GGG AAG CTT ATC TRAJ23 nchar = 30
#                  G AAT TAT AAC CAG GGG AAG CTT ATC
#                     |   T   T   T   T   T   T   T
# Germline-encoded sequence restarts at codon 4 (J_germline).

V_germline <- ceiling(                apply(cbind(V,dna),    1, function(X) germline(X[1], X[2]))      / 3 ) - 1
J_germline <- ceiling( ( nchar(dna) - apply(cbind(J,dnarev), 1, function(X) germline(X[1], X[2])) + 1) / 3 ) + 1

tocodons <- function (sequences)
	strsplit(sequences, "(?<=...)", perl=TRUE)

codon2id <- function (codons) {
    sapply(codons, function(X) {
        if (length(X) == 0) {
            return('')
        } else {
            return(codon_ids[X,"id"])
        }
    })
}
###################


#Remains to do: 
#
# - Convert to lowercase where applicable
# - get codon ids for remaining residues
# Return something for unproductive clonotypes

##################


# Convert to lower case and remove all but the most central germlineally encoded codons.
CDR3aa <- toupper(pep) # Just in case
substr(CDR3aa, 1, V_germline)              <- tolower(substr(CDR3aa, 1, V_germline))
substr(CDR3aa, J_germline, nchar(CDR3aa))  <- tolower(substr(CDR3aa, J_germline, nchar(CDR3aa)))

# The published version of the Yassai identifier has "collisions": clonotypes with different
# DNA sequences but same identifiers.  As a workaround, the "long" option skips the trimming
# of the leftmost and rightmost unmodified germline codons.

if (long == FALSE) {
    CDR3aa <- substring(CDR3aa, 1, J_germline)
    CDR3aa <- substring(CDR3aa, V_germline, nchar(CDR3aa))
}

# Convert the V and J names
V_name <- sub("TRAV","A",V_name)
V_name <- sub("N-",  "N",V_name)
V_name <- sub("D-",  "D",V_name)
V_name <- sub("/.*", "" ,V_name)
V_name <- sub("TRBV","B",V_name)
V_name <- sub("TRGV","G",V_name)
V_name <- sub("TRDV","D",V_name)
J_name <- sub("TRAJ","A",J_name)
J_name <- sub("TRBJ","B",J_name)
J_name <- sub("TRGJ","G",J_name)
J_name <- sub("TRDJ","D",J_name)

# Determine the ID for the remaining codons.
IDs <- codon2id(tocodons(substr(dna,(V_germline * 3) + 1 , (J_germline -1) * 3 )))

# Disambiguate blunt V/J recombinations where all codons are like either V or J germline.
# These kind of collisions only happen with the "long" format.
# Example:
# > data
#           V      J                            dna        pep
# 1 TRAV14N-1 TRAJ56 GCAGCTACTGGAGGCAATAATAAGCTGACT AATGGNNKLT
# 2 TRAV14N-1 TRAJ56 GCAGCAACTGGAGGCAATAATAAGCTGACT AATGGNNKLT
# > yassai_identifier(data, long=T)
# [1] "aatggnnklt.1A14N1A56L10" "aatggnnklt.2A14N1A56L10"
# > yassai_identifier(data, long=F)
# [1] "aa.1A14N1A56L10" "at.2A14N1A56L10"

IDs[IDs == ''] <- V_germline[IDs == '']

## Different paste commands are needed if the input is one or multiple clonotypes.

if ( nrow(data) == 1 )  {
  IDs <- paste(IDs, collapse='')
} else {
  IDs <- sapply(IDs, paste, collapse='')
}

# Construct and return the CDR3 in Yassai et al's nomenclature.
return( paste(CDR3aa, ".", IDs, V_name, J_name, "L", nchar(pep), sep=""))

})
charles-plessy/clonotypeR.old documentation built on May 13, 2019, 3:31 p.m.