#' TCR sequence clustering
#'
#' @description Find clusters of similar TCRs that are likely to recognize the
#' same epitope.
#' @param sequence_df A data.frame containing TCR sequence data. Each row must
#' describe a unique TCR sequence. Following fields are required:
#' * junction_beta - amino acid sequence of CDR3 plus the two flanking conserved residues
#' * v_beta, j_beta - V and J gene with or without allele; allele information is
#' not used for score calculation.
#'
#' If chains="AB" junction_alpha, v_alpha and j_alpha must be provided too.
#' @param id_col Name of a column with unique ids for each TCR.
#' @param chains Which chains to cluster. "B" for beta chain only, "AB" for paired
#' alpha and beta chains.
#' @param tmp_folder Path to a directory where temporary files could be stored. They are
#' deleted when clustering is finished.
#' @param scores_filename If a character string for naming a file is provided
#' BL-scores of each TCR pair will be exported to this file. Supported formats: .Rds, .csv.
#' @param threshold Clustering threshold (optional).
#' @param ncores The number of cores to use for parallel computation (default = 1).
#' Using >1 core is not supported in Windows.
#'
#' @return A data.frame containing same information as sequence_df plus the cluster ids.
#' If scores_filename is provided a file with pairwise BL-scores is created.
#' @export
#'
#' @details
#' The default clustering thresholds were defined to optimally detect clusters of
#' TCRs recognizing the same epitope. If instead of full junction only CDR3 sequence
#' witout flanking residues is provided the scores will be overestimated which may
#' lead to wrong cluster assignment.
#'
#' @examples
#' clusters <- clusterize_TCR(example_TCR_df, chains="AB", id_col="id", tmp_folder=".", ncores=2)
#'
clusterize_TCR <- function(sequence_df, chains, tmp_folder, id_col,
scores_filename=NA, threshold=NA, ncores=1){
### INPUT CHECK
# check chains argument
if (!chains %in% c("AB", "B")) {
stop('Invalid chains argument (must be "AB" or "B")')
}
# check if all the needed columns are present
beta_cols <- c("v_beta", "j_beta", "junction_beta")
missing_beta <- beta_cols[!beta_cols %in% colnames(sequence_df)]
if (length(missing_beta) > 0) {
stop(paste0("Coulumn(s) ", paste(missing_beta, collapse = ", "),
" are missing in sequence_df"))
}
if (chains == "AB") {
alpha_cols <- c("v_alpha", "j_alpha", "junction_alpha")
missing_alpha <- alpha_cols[!alpha_cols %in% colnames(sequence_df)]
if (length(missing_alpha) > 0) {
stop(paste0("Chains AB specified, but coulumn(s) ",
paste(missing_alpha, collapse = ", "),
" are missing in sequence_df"))
}
}
# convert to data.table
sequence_dt <- data.table::as.data.table(sequence_df)
# remove alleles
data.table::setnames(sequence_dt, old = c('v_beta','j_beta'), new = c('v_beta_raw','j_beta_raw'))
sequence_dt[, v_beta := sub("\\*.+$", "", v_beta_raw)]
sequence_dt[, j_beta := sub("\\*.+$", "", j_beta_raw)]
if (chains == "AB") {
data.table::setnames(sequence_dt, old = c('v_alpha','j_alpha'), new = c('v_alpha_raw','j_alpha_raw'))
sequence_dt[, v_alpha := sub("\\*.+$", "", v_alpha_raw)]
sequence_dt[, j_alpha := sub("\\*.+$", "", j_alpha_raw)]
}
# filter out sequences with non-IMGT genes
genes <- unique(c(sequence_dt$v_alpha, sequence_dt$v_beta,
sequence_dt$j_alpha, sequence_dt$j_beta))
unkn_genes <- genes[!genes %in% imgt_gene_list]
if (length(unkn_genes) > 0) {
message(paste("Unknown genes detected: ", paste(unkn_genes, collapse = ", "),
"Sequences carrying these genes will be filtered out"))
sequence_dt <- sequence_dt[!sequence_dt$v_beta %in% unkn_genes]
sequence_dt <- sequence_dt[!sequence_dt$j_beta %in% unkn_genes]
if (chains == "AB") {
sequence_dt <- sequence_dt[!sequence_dt$v_alpha %in% unkn_genes]
sequence_dt <- sequence_dt[!sequence_dt$j_alpha %in% unkn_genes]
}
}
# filter out short junction sequences
n_short_beta <- sum(nchar(sequence_dt$junction_beta) < 5)
if (n_short_beta > 0) {
message(paste0(n_short_beta, " sequences having short junction in beta chain (<5 aa) will be filtered out"))
sequence_dt <- sequence_dt[nchar(sequence_dt$junction_beta) > 4]
}
if (chains == "AB") {
n_short_alpha <- sum(nchar(sequence_dt$junction_alpha) < 5)
if (n_short_alpha > 0) {
message(paste0(n_short_alpha, " sequences having short junction in alpha chain (<5 aa) will be filtered out"))
sequence_dt <- sequence_dt[nchar(sequence_dt$junction_alpha) > 4]
}
}
# filter out sequences with stop-codons
has_stop_beta <- grep("\\*", sequence_dt$junction_beta)
if (chains == "AB") {
has_stop_alpha <- grep("\\*", sequence_dt$junction_alpha)
has_stop <- unique(has_stop_alpha, has_stop_beta)
} else {
has_stop <- has_stop_beta
}
if (length(has_stop) > 0) {
message(paste0(length(has_stop), " sequences having stop codon in junction will be filtered out"))
sequence_dt <- sequence_dt[-has_stop,]
}
# check if id_col exists
if (!id_col %in% colnames(sequence_dt)) {
stop(paste0("Column with name ", id_col, " is not found, please provide a valid column name"))
}
# check if id_col contains unique values
if (nrow(sequence_dt) != length(unique(sequence_dt[[id_col]]))) {
stop(paste0("Provided id_col (", id_col,
") contains duplicated values, please provide a column with unique ids" ))
}
sequence_dt[, receptor_id := get(id_col)]
data.table::setkey(sequence_dt, receptor_id)
# check if data table is not empty
if (nrow(sequence_dt) < 2) {
stop(paste0(nrow(sequence_dt), " sequences, nothing to cluster!"))
}
### CALCULATE SCORES
scored_rec_pairs <- calculate_scores(sequence_dt, chains, tmp_folder, scores_filename, ncores)
### DEFINE CLUSTERS
if (is.na(threshold)){
threshold <- if (chains == "AB") threshold_AB else threshold_B
}
sim_rec_pairs <- scored_rec_pairs |>
dplyr::filter(score > threshold) |>
dplyr::select(from_receptor_id, to_receptor_id, weight = score)
# node ids should be characters otherwise messed up by igraph
sim_rec_pairs$from_receptor_id <- as.character(sim_rec_pairs$from_receptor_id)
sim_rec_pairs$to_receptor_id <- as.character(sim_rec_pairs$to_receptor_id)
g <- igraph::graph_from_data_frame(sim_rec_pairs,
directed = F,
vertices = as.character(sequence_dt$receptor_id))
clusters <- igraph::clusters(g)$membership
# return columns with initial V gene names to the table
sequence_dt[, c("v_beta", "j_beta") := NULL]
data.table::setnames(sequence_dt, old = c('v_beta_raw','j_beta_raw'), new = c('v_beta','j_beta'))
if (chains == "AB") {
sequence_dt[, c("v_alpha", "j_alpha") := NULL]
data.table::setnames(sequence_dt, old = c('v_alpha_raw','j_alpha_raw'), new = c('v_alpha','j_alpha'))
}
clusters <- clusters |>
utils::stack() |>
dplyr::rename(cluster_id = values,
receptor_id = ind) |>
dplyr::mutate(receptor_id = as.integer(as.character(receptor_id))) |>
# merge to the full data.table to add all the info
merge(sequence_dt, by = "receptor_id") |>
dplyr::select(-receptor_id)
return(clusters)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.