Nothing
if (getRversion() >= "2.15.1") {
utils::globalVariables(c(
"v_call", "d_call", "j_call", "junction", "junction_aa",
"CDR3.nt", "D.end", "D.name", "D.start", "DJ.ins", "J.name",
"J.start", "V.end", "V.name", "VD.ins", "VJ.ins", "barcode",
"chain", "contig_id", "raw_clonotype_id", "raw_consensus_id"
))
}
##### Main IO functions #####
#' Load immune repertoire files into the R workspace
#'
#' @concept io
#'
#' @importFrom readr read_delim read_tsv read_csv col_integer col_character col_double col_logical col_guess cols write_lines
#' @importFrom jsonlite read_json
#' @importFrom stringr str_split str_detect str_replace_all str_trim
#' @importFrom methods as
#' @importFrom dplyr contains first select_ group_by_at one_of row_number
#' @importFrom utils read.table
#' @importFrom data.table setDF
#'
#' @description The \code{repLoad} function loads repertoire files
#' into R workspace in the immunarch format where you can immediately use them for
#' the analysis. \code{repLoad} automatically detects the right format for
#' your files, so all you need is simply provide the path to your files.
#'
#' See "Details" for more information on supported formats. See "Examples" for
#' diving right into it.
#'
#' @param .path A character string specifying the path to the input data.
#' Input data can be one of the following:
#'
#' - a single repertoire file.
#' In this case \code{repLoad} returns an R \link{data.frame};
#'
#' - a vector of paths to repertoire files.
#' Same as in the case with no metadata file presented in the next section below;
#'
#' - a path to the folder with repertoire files and, if available, metadata file "metadata.txt".
#' If the metadata file if presented, then the \code{repLoad} returns a list with two elements "data" and "meta".
#' "data" is an another list with repertoire R \link{data.frame}s. "meta" is a data frame with the metadata.
#' If the metadata file "metadata.txt" is not presented, then the \code{repLoad} creates a dummy metadata file with
#' sample names and returns a list with two elements "data" and "meta".
#' If input data has multiple chains or cell types stored in the same file
#' (for example, like in 10xGenomics repertoire files), such repertoire files will be splitted to different
#' R data frames with only one type of chain and cell presented. The metadata file will have additional columns specifying
#' cell and chain types for different samples.
#'
#' @param .mode Either "single" for single chain data or "paired" for paired chain data.
#'
#' Currently "single" works for every format, and "paired" works only for 10X Genomics data.
#'
#' By default, 10X Genomics data will be loaded as paired chain data, and other files will be loaded as single chain data.
#'
#' @param .coding A logical value. Set TRUE to get coding-only clonotypes (by defaul). Set FALSE to get all clonotypes.
#'
#' @param ... Extra arguments for parsing functions
#'
#' @details
#' The metadata has to be a tab delimited file with first column named "Sample".
#' It can have any number of additional columns with arbitrary names.
#' The first column should contain base names of files without extensions in
#' your folder. Example:
#' \tabular{llll}{
#' Sample \tab Sex \tab Age \tab Status\cr
#' immunoseq_1 \tab M \tab 1 \tab C\cr
#' immunoseq_2 \tab M \tab 2 \tab C\cr
#' immunoseq_3 \tab FALSE \tab 3 \tab A
#' }
#'
#' Currently, Immunarch support the following formats:
#'
#' - "immunoseq" - ImmunoSEQ of any version. http://www.adaptivebiotech.com/immunoseq
#'
#' - "mitcr" - MiTCR. https://github.com/milaboratory/mitcr
#'
#' - "mixcr" - MiXCR (the "all" files) of any version. https://github.com/milaboratory/mixcr
#'
#' - "migec" - MiGEC. http://migec.readthedocs.io/en/latest/
#'
#' - "migmap" - For parsing IgBLAST results postprocessed with MigMap. https://github.com/mikessh/migmap
#'
#' - "tcr" - tcR, our previous package. https://imminfo.github.io/tcr/
#'
#' - "vdjtools" - VDJtools of any version. http://vdjtools-doc.readthedocs.io/en/latest/
#'
#' - "imgt" - IMGT HighV-QUEST. http://www.imgt.org/HighV-QUEST/
#'
#' - "airr" - adaptive immune receptor repertoire (AIRR) data format. http://docs.airr-community.org/en/latest/datarep/overview.html
#'
#' - "10x" - 10XGenomics clonotype annotations tables. https://support.10xgenomics.com/single-cell-vdj/software/pipelines/latest/output/annotation
#'
#' - "archer" - ArcherDX clonotype tables. https://archerdx.com/
#'
#' @return
#' A list with two named elements:
#'
#' - "data" is a list of input samples;
#'
#' - "meta" is a data frame with sample metadata.
#'
#' @seealso \link{immunr_data_format} for immunarch data format; \link{repSave} for file saving;
#' \link{repOverlap}, \link{geneUsage} and \link{repDiversity} for starting with immune repertoires basic statistics.
#'
#' @examples
#' # To load the data from a single file (note that you don't need to specify the data format):
#' file_path <- paste0(system.file(package = "immunarch"), "/extdata/io/Sample1.tsv.gz")
#' immdata <- repLoad(file_path)
#'
#' # Suppose you have a following structure in your folder:
#' # >_ ls
#' # immunoseq1.txt
#' # immunoseq2.txt
#' # immunoseq3.txt
#' # metadata.txt
#'
#' # To load the whole folder with every file in it type:
#' file_path <- paste0(system.file(package = "immunarch"), "/extdata/io/")
#' immdata <- repLoad(file_path)
#' print(names(immdata))
#'
#' # We recommend creating a metadata file named "metadata.txt" in the folder.
#'
#' # In that case, when you load your data you will see:
#' # > immdata <- repLoad("path/to/your/folder/")
#' # > names(immdata)
#' # [1] "data" "meta"
#'
#' # If you do not have "metadata.txt", you will see the same output,
#' # but your metadata will be almost empty:
#' # > immdata <- repLoad("path/to/your/folder/")
#' # > names(immdata)
#' # [1] "data" "meta"
#' @export repLoad
repLoad <- function(.path, .mode = "paired", .coding = TRUE, ...) {
exclude_extensions <- c(
"so", "exe", "bam", "fasta", "fai", "fastq", "bed", "rds", "report", "vdjca"
)
# Process a repertoire file: detect format and load the data
# Return: a named list with a repertoire data frame and it's name
.read_repertoire <- function(.path, .mode, .coding, ...) {
parse_res <- list()
cur_format <- .detect_format(.path)
# Parse the file
if (is.na(cur_format)) {
message("unsupported format, skipping")
} else {
message(cur_format)
parse_fun <- switch(cur_format,
mitcr = parse_mitcr,
migec = parse_migec,
migmap = parse_migmap,
mixcr = parse_mixcr,
tcr = parse_tcr,
vdjtools = parse_vdjtools,
imgt = parse_imgt,
immunoseq = parse_immunoseq,
airr = parse_airr,
`10x (consensus)` = parse_10x_consensus,
`10x (filt.contigs)` = parse_10x_filt_contigs,
archer = parse_archer,
immunarch = parse_immunarch,
catt = parse_catt,
rtcr = parse_rtcr,
imseq = parse_imseq,
vidjil = parse_vidjil,
NA
)
if (suppressWarnings(is.na(parse_fun))) {
message("unknown format, skipping")
} else {
parse_res <- parse_fun(.path, .mode, ...)
if (is.null(parse_res)) {
message(" [!] Warning: zero clonotypes found, skipping")
parse_res <- list()
} else {
if (.coding) {
parse_res <- coding(parse_res)
}
if (!has_class(parse_res, "list")) {
parse_res <- list(parse_res)
names(parse_res) <- .remove.ext(.path)
}
}
}
}
parse_res
}
# Process a vector with filepaths to repertoire files, metadata and barcodes:
# just load all repertoire files.
# Do NOT (!) create a dummy metadata, return en empty data frame instead
# Return: list with data, metadata and barcodes (if necessary)
.process_batch <- function(.files, .mode, .coding) {
parsed_batch <- list()
metadata <- tibble()
for (.filepath in .files) {
message(" -- [", match(.filepath, .files), "/",
length(.files), "] Parsing \"",
.filepath, "\" -- ",
appendLF = FALSE
)
if (!file.exists(.filepath)) {
message('Can\'t find\t"', .filepath, '", skipping')
} else {
# Check for the type: repertoire, metadata or barcodes
if (stringr::str_detect(.filepath, "metadata")) {
message("metadata")
metadata <- readr::read_tsv(.filepath, trim_ws = TRUE, col_types = cols())
# The most basic check
if (!("Sample" %in% colnames(metadata))) {
stop(
"No \"Sample\" column found in the metadata file.\n",
"The \"Sample\" column with the names of samples without extensions ",
"(e.g., \".txt\", \".tsv\") is required.\n",
"Please provide it and run the parsing again."
)
}
} else if (stringr::str_detect(.filepath, "barcode")) {
# TODO: add the barcode processing subroutine to split by samples
} else {
repertoire <- .read_repertoire(.filepath, .mode, .coding, ...)
if (length(repertoire) != 0) {
parsed_batch <- c(parsed_batch, repertoire)
}
}
}
}
list(data = parsed_batch, meta = metadata)
}
# Recursively find all subdirectories and create
# a list of batches with repertoire files to process
.split_to_batches <- function(.paths, .root_folder) {
# split .paths to files and folders
folders <- c()
files <- c()
for (path in .paths) {
if (!file.exists(path)) {
stop("Input file or directory not found: ", path)
}
if (file.info(path)$isdir) {
folders <- c(folders, path)
} else if (!any(endsWith(path, exclude_extensions))) {
files <- c(files, path)
}
}
batches <- list(files)
names(batches) <- .root_folder
# process each subdirectory to batches
for (folder_path in folders) {
new_batch <- .split_to_batches(dir(folder_path, full.names = TRUE, recursive = FALSE), folder_path)
batches <- c(batches, new_batch)
}
batches
}
.check_metadata <- function(.metadata, .rep_names) {
error_flag <- FALSE
# Check for zero .metadata
if (nrow(.metadata) == 0) {
message(" -- Metadata file not found; creating a dummy metadata...")
.metadata <- tibble(Sample = .rep_names)
error_flag <- TRUE
}
# Check for repertoire names
missed_in_folders <- setdiff(.rep_names, .metadata$Sample)
missed_in_metadata <- setdiff(.metadata$Sample, .rep_names)
if (length(missed_in_folders) || length(missed_in_metadata)) {
if (length(missed_in_metadata)) {
message(" -- Samples found in the metadata, but not in the folder:\n ", toString(missed_in_metadata))
message(" Did you correctly specify all the sample names in the metadata file?")
error_flag <- TRUE
}
if (length(missed_in_folders)) {
message(" -- Samples found in the folder, but not in the metadata:\n ", toString(missed_in_folders))
message(" Did you add all the necessary samples to the metadata file with correct names?")
message(" Creating dummy sample records in the metadata for now...")
.metadata <- merge(.metadata, as_tibble(data.frame(Sample = missed_in_folders, stringsAsFactors = FALSE)), all = TRUE)
error_flag <- TRUE
}
}
# Drop incorrect columns
to_drop <- c()
for (col_name in names(.metadata)) {
if (sum(is.na(.metadata[[col_name]])) == nrow(.metadata)) {
to_drop <- c(to_drop, col_name)
}
}
if (length(to_drop)) {
.metadata <- .metadata[-match(to_drop, names(.metadata))]
message("Dropping ", length(to_drop), "column(s) from .metadata.txt. Do you have spaces or tabs after the name of the last column? Remove them to ensure everything works correctly.")
error_flag <- TRUE
}
if (!error_flag) {
message(" -- Everything is OK!")
}
.metadata
}
#
# Step 1: load repertoire files
#
message("\n== Step 1/3: loading repertoire files... ==\n")
# Split .path to files and folders
batches <- .split_to_batches(.path, "<initial>")
# Parse repertoires from each batch
parsed_batches <- list()
for (batch_i in seq_along(batches)) {
if (length(batches[[batch_i]])) {
message('Processing "', names(batches)[batch_i], '" ...')
parsed_batches[[names(batches)[batch_i]]] <- .process_batch(batches[[batch_i]], .mode, .coding)
}
}
#
# Step 2: check metadata files
#
message("\n== Step 2/3: checking metadata files and merging files... ==\n")
for (batch_i in seq_along(parsed_batches)) {
message('Processing "', names(parsed_batches)[batch_i], '" ...')
parsed_batches[[batch_i]]$meta <- .check_metadata(parsed_batches[[batch_i]]$meta, names(parsed_batches[[batch_i]]$data))
}
names(parsed_batches) <- NA
parsed_batches <- list(data = do.call(c, lapply(parsed_batches, "[[", "data")), meta = Reduce(function(x, y) merge(x, y, all = TRUE), lapply(parsed_batches, "[[", "meta")))
names(parsed_batches$data) <- substr(names(parsed_batches$data), 4, nchar(names(parsed_batches$data)))
if (any(duplicated(names(parsed_batches$data)))) {
stop("Found non-unique sample names: ", paste0(names(parsed_batches$data)[duplicated(names(parsed_batches$data))], collapse = " "), " \n Please rename your samples so they all have unique names.")
}
#
# Step 3: split by chains and barcodes
#
message("\n== Step 3/3: processing paired chain data... ==\n")
# Check for mixed chains repertoire files and split them if necessary
rep_names <- names(parsed_batches$data)
metadata <- parsed_batches$meta
for (source_name in rep_names) {
if ("Chain" %in% colnames(metadata)) {
if (!is.na(metadata[which(metadata$Sample == source_name), "Chain"])) {
next
}
}
if ("chain" %in% colnames(parsed_batches$data[[source_name]])) {
#
# If mode is "single" then split paired chain data to two chain-specific immune repertoires
#
if (.mode == "single") {
df_split <- split(parsed_batches$data[[source_name]], parsed_batches$data[[source_name]]$chain)
if (length(df_split) > 1) {
chain_col <- "Chain"
source_col <- "Source"
if (!(chain_col %in% colnames(metadata))) {
message("Splitting repertoires by their chain type. Columns ", chain_col, " and ", source_col, " added to the metadata.")
message(" -- Column ", chain_col, " specifies chain types: TRA, TRB, etc.")
message(" -- Column ", source_col, " specifies the initial sample name for a repertoire after splitting by chain types")
}
for (i in seq_along(df_split)) {
sample_name <- paste0(source_name, "_", names(df_split)[i])
parsed_batches$data[[sample_name]] <- df_split[[i]]
new_record <- metadata[which(metadata$Sample == source_name), ]
new_record$Sample <- sample_name
new_record[[chain_col]] <- names(df_split)[i]
new_record[[source_col]] <- source_name
metadata <- merge(metadata, new_record, all = TRUE)
}
parsed_batches$data[[source_name]] <- NULL
metadata <- metadata[-(which(metadata$Sample == source_name)), ]
}
}
#
# If mode is "paired" then nothing to do here
#
else {
# pass
}
}
}
message("Done!\n")
parsed_batches$meta <- metadata
parsed_batches
}
#' Save immune repertoires to the disk
#'
#' @concept io
#'
#' @importFrom utils packageVersion
#' @importFrom plyr mapvalues
#' @importFrom purrr map
#'
#' @description The \code{repSave} function is deigned to save your data to the disk
#' in desirable format. Currently supports "immunarch" and "vdjtools" file formats.
#'
#' @param .data An R dataframe, a list of R dataframes or a list with \code{data} and
#' \code{meta} where first element is a list of dataframes and the latter is a dataframe
#' with metadata.
#' @param .path A string with the path to the output directory. It should include file
#' name if a single dataframe is provided to .data argument.
#' @param .format A string with desirable format specification. Current options are
#' "immunarch" and "vdjtools".
#' @param .compress A boolean value. Defines whether the output will be compressed or not.
#'
#' @details It is not necessary to create directories beforehand. If the provided directory
#' does not exist it will be created automatically.
#'
#' @return No return value.
#'
#' @examples
#' data(immdata)
#' # Reduce data to save time on examples
#' immdata$data <- purrr::map(immdata$data, ~ .x %>% head(10))
#' dirpath <- tempdir()
#' # Save the list of repertoires
#' repSave(immdata, dirpath)
#' # Load it and check if it is the same
#' new_immdata <- repLoad(dirpath)
#' # sum(immdata$data[[1]] != new_immdata$data[[1]], na.rm = TRUE)
#' # sum(immdata$data[[2]] != new_immdata$data[[2]], na.rm = TRUE)
#' # sum(immdata$meta != new_immdata$meta, na.rm = TRUE)
#' @export repSave
repSave <- function(.data, .path, .format = c("immunarch", "vdjtools"),
.compress = TRUE) {
.format <- .format[1]
if (!.format %in% c("immunarch", "vdjtools")) {
stop("Unknown format. Please provide either 'immunarch' or 'vdjtools'")
}
if (!has_class(.data, "list")) {
success <- switch(.format,
immunarch = save_immunarch(.data, .path, .compress),
vdjtools = save_vdjtools(.data, .path, .compress),
NA
)
} else {
if ("meta" %in% names(.data)) {
ifelse(!dir.exists(.path), dir.create(.path), FALSE)
readr::write_tsv(.data$meta, path = paste0(.path, "/metadata.txt"))
for (name in names(.data$data)) {
success <- switch(.format,
immunarch = save_immunarch(
.data$data[[name]],
paste0(.path, "/", name),
.compress
),
vdjtools = save_vdjtools(
.data$data[[name]],
paste0(.path, "/", name),
.compress
),
NA
)
}
} else {
ifelse(!dir.exists(.path), dir.create(.path), FALSE)
for (name in names(.data)) {
success <- switch(.format,
immunarch = save_immunarch(
.data[[name]],
paste0(.path, "/", name),
.compress
),
vdjtools = save_vdjtools(
.data[[name]],
paste0(.path, "/", name),
.compress
),
NA
)
}
}
}
}
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.