#' Analyzes and processes the repertoire sequencing data from cellranger vdj. This provides information on the single-cell level for each clone.
#' @param VDJ.directory Character vector with each element containing the path to the output of cellranger vdj runs. To integrate the repertoire with transcriptome data GEX data can by provided inside the corresponding folder. The name of the GEX folder must contain "gex" in his name (case insensitive). This pipeline assumes that the output file names have not been changed from the default 10x settings in the /outs/ folder.
#' @param filtered.prod Logical indicating if the contigs file should be filtered by productive cells. Default set to TRUE
#' @param filtered.confidence Logical indicating if the contigs file should be filtered by contigs with high_confidence. Default set to TRUE
#' @param filtered.is.cell Logical indicating if the contigs file should be filtered by sequences that only come from cells. Default set to TRUE
#' @param red.method Method of dimensionality reduction used. Options are "pca", "tsne", "umap".
#' @param n.dims Number of dimension to be used in the chosen reduction. Default is 20.
#' @return Returns a list of dataframes containing information on the cell or clonotype level.
#' @export
#' @examples
#' \dontrun{
#' check_load_data <- load_data(VDJ.directory = "path/to/cellranger/outs/", clonotype.level = FALSE)
#' }
#'
load_data <- function(VDJ.directory,
clonotype.level = FALSE,
filter.prod = TRUE,
filter.confidence = TRUE,
filter.is.cell = TRUE,
red.method = "pca",
n.dims = 1:10) {
require(tidyverse)
require(Rsamtools)
require(RcppSimdJson)
require(Seurat)
require(seqinr)
require(Biostrings)
if (!all(dir.exists(VDJ.directory))) print("Some directory supplied are not supported")
VDJ.directory[dir.exists(VDJ.directory)]
# Reading input files
if (all(str_detect(VDJ.directory, "mouse"))) sample_number <- as.numeric(substr(sub(".*mouse", "", VDJ.directory), 1, 1))
else sample_number <- seq(1, length(VDJ.directory))
print("Reading input files")
contig.list <- lapply(paste0(VDJ.directory, "/filtered_contig_annotations.csv"), read.csv, stringsAsFactors=F, sep=",", header=T)
fasta.list <- lapply(paste0(VDJ.directory, "/filtered_contig.fasta"), read.fasta, as.string=T, seqonly=F, forceDNAtolower=F)
annotations.list <- lapply(paste0(VDJ.directory, "/all_contig_annotations.json"), fload)
clonotype.list <- lapply(paste0(VDJ.directory, "/clonotypes.csv"), read.table, stringsAsFactors = F, sep=",", header=T)
# Filter clones that contain exactly one heavy/beta and one light/alpha chain
print("Filtering clones that have both HC and LC")
clonotype.list <- map(clonotype.list, ~filter(., str_count(cdr3s_aa, "IGH:") == 1 & str_count(cdr3s_aa, "IGL:|IGK:") == 1))
clonotype.list <- map2(clonotype.list, sample_number, ~mutate(., sample = .y))
# Filtering contigs by specified parameters
contig.list <- map(contig.list, ~filter(., case_when(filter.is.cell ~ is_cell %>% str_detect("(?i)true")),
case_when(filter.confidence ~ high_confidence %>% str_detect("(?i)true")),
case_when(filter.prod ~productive %>% str_detect("(?i)true"))) %>%
dplyr::rename(clonotype_id = raw_clonotype_id) %>%
dplyr::select(-c(is_cell, full_length, productive, reads, high_confidence)))
# Filtering annotations by specified parameters
annotations.list <- map(annotations.list, ~filter(., case_when(filter.is.cell ~ is_cell == T),
case_when(filter.confidence ~ high_confidence == T),
case_when(filter.prod ~ productive == T)) %>%
dplyr::select(-c(is_cell, productive, high_confidence)))
# Adding gene expression information # -------------------------------------------------------------------------
print("Processing gene expression information")
GEX.list <- map_if(VDJ.directory,
~any(dir.exists(unique(str_extract(list.files(., recursive = T, "features.tsv.gz|barcodes.tsv.gz|matrix.mtx.gz", full.names = T), ".*/")))),
~Read10X(unique(str_extract(list.files(., recursive = T, "features.tsv.gz|barcodes.tsv.gz|matrix.mtx.gz", full.names = T), ".*/"))) %>%
CreateSeuratObject)
antibody_gene_names <- c("IGHA", "IGHG", "IGHM", "IGHD", "IGHE", "IGHJ", "IGK", "IGHV", "JCHAIN", "IGL", "TRAV", "TRAC", "TRBC",
"TRGC", "TRDC", "TRBD", "TRBJ", "TRGV", "TRGJ", "TRGJ", "TRDV", "TRDD", "TRDJ", "TRBV", "TRAJ")
GEX.list <- lapply(GEX.list, function(x) x[!toupper(row.names(x)) %in% antibody_gene_names])
GEX.list <- lapply(GEX.list, function(x) x$barcode <- {as.character(gsub(colnames(x), pattern = "-1", replacement = "")); return(x)})
GEX.list <- lapply(GEX.list, function(x) {if (typeof(x) == "S4") x$percent.mt <- PercentageFeatureSet(x, pattern="^mt-"); return(x)})
GEX.list <- lapply(GEX.list, function(x) {if (typeof(x) == "S4")
RunPCA(ScaleData(FindVariableFeatures(NormalizeData(subset(x, percent.mt<20 & nFeature_RNA>100 & nFeature_RNA<2500), verbose=F), verbose=F), verbose=F), npcs = 20, verbose=F)})
GEX.list <- lapply(GEX.list, function(x)
if (!is.null(x)) switch(red.method, tsne = {RunTSNE(x, dims = 1:n.dims, verbose=F)},
umap = {RunUMAP(x, dims = 1:n.dims, verbose=F)},
pca = {x}))
red_data <- lapply(GEX.list, function(x) {if (typeof(x) == "S4")
rownames_to_column(as.data.frame(x@reductions[[red.method]]@cell.embeddings), var = "barcode")})
red_data <- map2(red_data, contig.list, ~{if (!is.null(.x)) inner_join(.x, dplyr::select(.y, c(barcode, clonotype_id)), by = "barcode")})
# -------------------------------------------------------------------------
# Adding cigar string for position of mutations and counting number of mutations
print("Processing somatic hypermutation data")
shm <- map_if(VDJ.directory, ~file.exists(paste0(., "/concat_ref.bam")), ~as.data.frame(scanBam(paste0(., "/concat_ref.bam"))) %>%
dplyr::rename(contig_id = qname) %>%
mutate(mutation_count = str_extract_all(cigar, "[0-9]+(?=[X])")) %>%
dplyr::select(contig_id, mutation_count, cigar) %>% mutate(mutation_count = sapply(.$mutation_count, function(x) sum(as.numeric(x)))))
if (clonotype.level == TRUE) {
# Counting the most common gene for the V,D,J segments for each clonotype and chain
print("Extracting gene usage information")
genes <- map(contig.list, ~group_by(., clonotype_id, chain, barcode) %>%
summarize_at(vars(ends_with("gene")), ~names(which.max(table(.)))) %>%
inner_join(filter(., chain == "IGH"), filter(., str_detect(chain, "IGK|IGL")), by = "clonotype_id", suffix = c("_HC", "_LC")) %>%
distinct(clonotype_id, .keep_all = T) %>% dplyr::select(-c(barcode_LC, chain_HC, chain_LC, barcode_HC)))
clonotype.list <- clonotype.list <- map2(clonotype.list, genes, ~left_join(.x, .y, by = "clonotype_id", suffix = c("", "")))
# Adding mean of reduced components
clonotype.list <- map2(clonotype.list, red_data, ~{if (!is.null(.y))
left_join(.x, group_by(.y, clonotype_id) %>% dplyr::select(-barcode) %>% summarise_each(funs(mean(., na.rm = T))), by = "clonotype_id")
else .x})
return(clonotype.list)
}
if (clonotype.level == FALSE) {
# Fusing HC and LC contigs from same cell in one row
vdj.per.cell <- map(contig.list,
~left_join(filter(., chain %in% "IGH"),
filter(dplyr::select(., barcode, cdr3, cdr3_nt, chain, contig_id, umis, ends_with("gene")), chain %in% c("IGK", "IGL")), by = "barcode", suffix = c("_HC", "_LC")))
# Pasted CDR3
vdj.per.cell <- map(vdj.per.cell, ~unite(., cdr3_HC, "chain_HC", "cdr3_HC", sep = ":", remove = F) %>%
unite(., cdr3_LC, "chain_LC", "cdr3_LC", sep = ":", remove = F) %>%
unite(., cdr3s_aa, "cdr3_HC", "cdr3_LC", sep = ";", remove = F))
# Pasted CDR3 nt
vdj.per.cell <- map(vdj.per.cell, ~unite(., cdr3_HC_nt, "chain_HC", "cdr3_nt_HC", sep = ":") %>%
unite(., cdr3_LC_nt, "chain_LC", "cdr3_nt_LC", sep = ":") %>%
unite(., cdr3s_nt, "cdr3_HC_nt", "cdr3_LC_nt", sep = ";", remove = F))
# Adding SHM
vdj.per.cell <- map2(vdj.per.cell, shm, ~{if (is.data.frame(.y)) left_join(.x, .y, by = c("contig_id_HC" = "contig_id"), suffix = c("_HC", "_LC")) else .x})
vdj.per.cell <- map2(vdj.per.cell, shm, ~{if (is.data.frame(.y)) left_join(.x, .y, by = c("contig_id_LC" = "contig_id"), suffix = c("_HC", "_LC")) else .x})
vdj.per.cell <- mapply(cbind, vdj.per.cell, "sample" = sample_number, SIMPLIFY = F)
# Adding aa sequence
vdj.per.cell <- map2(vdj.per.cell, annotations.list, ~left_join(.x, dplyr::select(.y, aa_sequence, contig_name), by = c("contig_id_HC" = "contig_name"), suffix=c("_HC", "_LC")))
vdj.per.cell <- map2(vdj.per.cell, annotations.list, ~left_join(.x, dplyr::select(.y, aa_sequence, contig_name), by = c("contig_id_LC" = "contig_name"), suffix=c("_HC", "_LC")))
# Adding nt sequence
trimmed <- map(annotations.list, ~filter(., !is.na(start_codon_pos)) %>%
mutate(., sequence_nt = str_sub(sequence, start_codon_pos + 1)) %>% dplyr::select(sequence, contig_name))
vdj.per.cell <- map2(vdj.per.cell, trimmed, ~left_join(.x, .y, by = c("contig_id_HC" = "contig_name"), suffix = c("_HC", "_LC")))
vdj.per.cell <- map2(vdj.per.cell, trimmed, ~left_join(.x, .y, by = c("contig_id_LC" = "contig_name"), suffix = c("_HC", "_LC")))
# Adding pca components
vdj.per.cell <- map2(vdj.per.cell, red_data, ~{if (!is.null(.y)) left_join(.x, dplyr::select(.y, -clonotype_id), by = "barcode") else .x})
# Removing contig_id and separated cdr3_nt HC and LC
vdj.per.cell <- map(vdj.per.cell, ~dplyr::select(., -c(contig_id_HC, contig_id_LC, raw_consensus_id, cdr3_LC_nt, cdr3_HC_nt, cdr3_HC, cdr3_LC)))
return(vdj.per.cell)
}
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.