Nothing
# Standard column names for different sources of TCR data
col_ref <- list(
adaptive = c("v_resolved", "d_resolved", "j_resolved"),
adaptivev2 = c("vMaxResolved", "dMaxResolved", "jMaxResolved"),
imgt = c("v_gene", "d_gene", "j_gene", "c_gene"),
tenx = c("v_gene", "d_gene", "j_gene", "c_gene")
)
#' Choose lookup table
#'
#' `choose_lookup()` determines which CSV lookup table to use based on the the
#' input format (`frm`) and returns the path to that file.
#'
#' @param frm A string, the input format of TCR data. Must be one of
#' `"tenx"`, `"adaptive"`, `"adaptivev2"`, or `"imgt"`.
#' @param to A string, the output format of TCR data. Must be one of
#' `"tenx"`, `"adaptive"`, `"adaptivev2"`, or `"imgt"`.
#' @param species A string, the species. Optional; defaults to `"human"`.
#' @param verbose A boolean, whether to show messages. Optional; defaults to `TRUE`
#'
#' @return A string, the path to correct lookup table.
#' @importFrom rappdirs user_data_dir
#' @export
#' @keywords internal
#' @examples
#' choose_lookup("imgt", "adaptive")
choose_lookup <- function(frm, to, species = "human", verbose = TRUE) {
# Determine where to find lookup tables
if (species %in% c("human", "mouse", "rhesus")) {
lookup_dir <- system.file("extdata", package = "TCRconvertR")
} else {
lookup_dir <- rappdirs::user_data_dir("TCRconvertR", "Emmma Bishop")
}
data_path <- file.path(lookup_dir, species)
if (frm == "tenx") {
lookup_f <- file.path(data_path, "lookup_from_tenx.csv")
if (verbose) {
message("Converting from 10X. Using *01 as allele for all genes.")
}
} else if (frm %in% c("adaptive", "adaptivev2")) {
lookup_f <- file.path(data_path, "lookup_from_adaptive.csv")
if (to == "imgt" && verbose) {
message("Converting from Adaptive to IMGT. Using *01 for genes lacking alleles.")
}
} else {
lookup_f <- file.path(data_path, "lookup.csv")
}
if (file.exists(lookup_f)) {
return(lookup_f)
} else {
stop("Lookup table not found, please run build_lookup_from_fastas().")
}
}
#' Determine input columns to use
#'
#' `which_frm_cols()` determines the columns that are expected to hold gene
#' name information in the input file based on the input format (`frm`). It
#' returns a vector of those column names.
#'
#' @param df Dataframe containing TCR gene names.
#' @param frm A string, the input format of TCR data. Must be one of
#' `"tenx"`, `"adaptive"`, `"adaptivev2"`, or `"imgt"`.
#' @param frm_cols A character vector, the custom column names to use.
#' @param verbose A boolean, whether to show messages. Optional; defaults to `TRUE`
#'
#' @return A character vector, column names to use.
#' @export
#' @keywords internal
#' @examples
#' tcr_file <- get_example_path("tenx.csv")
#' df <- read.csv(tcr_file)
#' which_frm_cols(df, "tenx")
which_frm_cols <- function(df, frm, frm_cols = NULL, verbose = TRUE) {
if (frm == "imgt" && is.null(frm_cols)) {
cols_from <- col_ref$tenx
warning(paste("No column names for IMGT data. Using 10X columns:", paste(cols_from, collapse = ", ")))
} else if (!is.null(frm_cols)) {
missing_cols <- setdiff(frm_cols, colnames(df))
if (length(missing_cols) > 0) {
stop(paste("These columns are not in the input dataframe:", paste(missing_cols, collapse = ", ")))
} else {
cols_from <- frm_cols
if (verbose) {
message(paste("Using custom column names:", paste(cols_from, collapse = ", ")))
}
}
} else {
cols_from <- col_ref[[frm]]
}
cols_from
}
#' Convert gene names
#'
#' @description
#' `convert_gene()` converts T-cell receptor (TCR) gene names between the IMGT,
#' 10X, and Adaptive formats. It determines the columns to convert based
#' on the input format (`frm`) unless specified by the user (`frm_cols`). It
#' returns a modified version of the input data frame with converted gene names
#' while preserving row order.
#'
#' @details
#' Gene names are converted by performing a `merge` between the relevant
#' input columns and a species-specific lookup table containing IMGT reference
#' genes in all three formats.
#'
#' **Behavioral Notes**
#' - If a gene name cannot be mapped, it is replaced with `NA` and a warning is
#' raised.
#' - If `frm` is `'imgt'` and `frm_cols` is not provided, 10X column
#' names are assumed.
#' - Constant (C) genes are set to `NA` when converting to Adaptive formats,
#' as Adaptive does not capture constant regions.
#' - The input does not need to include all gene types; partial inputs
#' (e.g., only V genes) are supported.
#' - If no values in a custom column can be mapped (e.g. a CDR3 column) it is
#' skipped and a warning is raised.
#'
#' **Standard Column Names**
#'
#' If `frm_cols` is not provided, these column names will be used if present:
#' - **IMGT**: `"v_gene"`, `"d_gene"`, `"j_gene"`, `"c_gene"`
#' - **10X**: `"v_gene"`, `"d_gene"`, `"j_gene"`, `"c_gene"`
#' - **Adaptive**: `"v_resolved"`, `"d_resolved"`, `"j_resolved"`
#' - **Adaptive v2**: `"vMaxResolved"`, `"dMaxResolved"`, `"jMaxResolved"`
#'
#' @param df A dataframe containing TCR gene names.
#' @param frm A string, the input format of TCR data. Must be one of
#' `"imgt"`, `"tenx"`, `"adaptive"`, or `"adaptivev2"`.
#' @param to A string, the output format of TCR data. Must be one of
#' `"imgt"`, `"tenx"`, `"adaptive"`, or `"adaptivev2"`.
#' @param species A string,the species. Optional; defaults to `"human"`.
#' @param frm_cols A character vector of custom gene column names.
#' Optional; defaults to `NULL`.
#' @param verbose A boolean, whether to display messages. Optional; defaults to `TRUE`.
#'
#' @return A dataframe with converted TCR gene names.
#' @autoglobal
#' @export
#' @examples
#' tcr_file <- get_example_path("tenx.csv")
#' df <- read.csv(tcr_file)[c("barcode", "v_gene", "j_gene", "cdr3")]
#' df
#' convert_gene(df, "tenx", "adaptive", verbose = FALSE)
convert_gene <- function(df, frm, to, species = "human", frm_cols = NULL, verbose = TRUE) {
if (frm == to) {
stop('"frm" and "to" formats should be different.')
}
if (!is.data.frame(df)) {
stop("Input is not a data.frame.")
}
if (nrow(df) == 0) {
stop("Input data is empty.")
}
if (to %in% c("adaptive", "adaptivev2")) {
warning("Adaptive only captures VDJ genes; C genes will be NA.")
}
# Load lookup table and determine input columns
lookup_f <- choose_lookup(frm, to, species, verbose)
lookup <- utils::read.csv(lookup_f, stringsAsFactors = FALSE)
cols_from <- which_frm_cols(df, frm, frm_cols, verbose)
if (sum(cols_from %in% colnames(df)) == 0) {
stop(
"None of the expected columns are in the dataframe:\n",
paste(cols_from, collapse = ", ")
)
}
# Add column of row numbers so we can keep order straight
df$id <- seq_len(nrow(df))
# Loop over gene columns, doing a merge to get converted gene names
new_genes <- list()
bad_genes_all <- c()
for (col in cols_from) {
if (col %in% colnames(df)) {
merged <- merge(df[, c(col, "id"), drop = FALSE], lookup, by.x = col, by.y = frm, all.x = TRUE)
merged <- merged[order(merged$id), ]
good_genes <- merged[, to]
# Note genes where the merge produced an NA on the 'to' format side
new_bad_genes <- merged[is.na(merged[, to]), col]
# We don't expect the entire column of genes to be empty.
if (length(new_bad_genes) < length(good_genes)) {
new_genes[[col]] <- good_genes
bad_genes_all <- c(bad_genes_all, new_bad_genes)
} else {
warning(paste("Column", col, "has no valid genes and was skipped."))
}
}
}
# Display genes we couldn't convert
if (!all(is.na(bad_genes_all))) {
bad_genes <- unique(bad_genes_all)
warning(paste(
"These genes are not in IMGT for this species and will be replaced with NA:\n",
paste(unique(bad_genes), collapse = ", ")
))
}
# Build output
df_out <- df
for (col in names(new_genes)) {
df_out[, col] <- new_genes[[col]]
df_out[[col]][df_out[[col]] == "NoData"] <- NA_character_
}
df_out <- subset(df_out, select = -c(id)) # Remove row number column
df_out
}
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.