#' Examining the clonal overlap between groups or samples
#'
#' This functions allows for the calculation and visualizations of
#' various overlap metrics for clones. The methods include overlap
#' coefficient (\strong{overlap}), Morisita's overlap index
#' (\strong{morisita}), Jaccard index (\strong{jaccard}), cosine
#' similarity (\strong{cosine}) or the exact number of clonal
#' overlap (\strong{raw}).
#'
#' @details
#' The formulas for the indices are as follows:
#'
#' \strong{Overlap Coefficient:}
#' \deqn{overlap = \frac{\sum \min(a, b)}{\min(\sum a, \sum b)}}
#'
#' \strong{Raw Count Overlap:}
#' \deqn{raw = \sum \min(a, b)}
#'
#' \strong{Morisita Index:}
#' \deqn{morisita = \frac{\sum a b}{(\sum a)(\sum b)}}
#'
#' \strong{Jaccard Index:}
#' \deqn{jaccard = \frac{\sum \min(a, b)}{\sum a + \sum b - \sum \min(a, b)}}
#'
#' \strong{Cosine Similarity:}
#' \deqn{cosine = \frac{\sum a b}{\sqrt{(\sum a^2)(\sum b^2)}}}
#'
#' Where:
#' \itemize{
#' \item{\eqn{a} and \eqn{b} are the abundances of species \eqn{i} in groups A and B, respectively.}
#' }
#' @examples
#' #Making combined contig data
#' combined <- combineTCR(contig_list,
#' samples = c("P17B", "P17L", "P18B", "P18L",
#' "P19B","P19L", "P20B", "P20L"))
#'
#' clonalOverlap(combined,
#' cloneCall = "aa",
#' method = "jaccard")
#'
#' @param input.data The product of \code{\link{combineTCR}},
#' \code{\link{combineBCR}}, or \code{\link{combineExpression}}.
#' @param cloneCall How to call the clone - VDJC gene (\strong{gene}),
#' CDR3 nucleotide (\strong{nt}), CDR3 amino acid (\strong{aa}),
#' VDJC gene + CDR3 nucleotide (\strong{strict}) or a custom variable
#' in the data.
#' @param chain indicate if both or a specific chain should be used -
#' e.g. "both", "TRA", "TRG", "IGH", "IGL"
#' @param method The method to calculate the "overlap", "morisita",
#' "jaccard", "cosine" indices or "raw" for the base numbers.
#' @param group.by The variable to use for grouping.
#' @param exportTable Returns the data frame used for forming the graph.
#' @param palette Colors to use in visualization - input any
#' \link[grDevices]{hcl.pals}.
#' @importFrom stringr str_sort str_to_title
#' @importFrom reshape2 melt
#' @importFrom stats quantile
#' @export
#' @concept Visualizing_Clones
#' @return ggplot of the overlap of clones by group
clonalOverlap <- function(input.data,
cloneCall = "strict",
method = NULL,
chain = "both",
group.by = NULL,
exportTable = FALSE,
palette = "inferno"){
if(method == "morisita") {
return_type <- "freq"
} else {
return_type <- "unique"
}
input.data <- .data.wrangle(input.data,
group.by,
.theCall(input.data, cloneCall, check.df = FALSE),
chain)
cloneCall <- .theCall(input.data, cloneCall)
num_samples <- length(input.data[])
names_samples <- names(input.data)
length <- seq_len(num_samples)
#Selecting Index Function
indexFunc <- switch(method,
"morisita" = .morisitaCalc,
"jaccard" = .jaccardCalc,
"raw" = .rawCalc,
"overlap" = .overlapCalc,
"cosine" = .cosineCalc,
stop("Invalid method provided"))
#Calculating Index
coef_matrix <- data.frame(matrix(NA, num_samples, num_samples))
coef_matrix <- .calculateIndex(input.data,
length,
cloneCall,
coef_matrix,
indexFunc,
return_type)
#Data manipulation
colnames(coef_matrix) <- names_samples
rownames(coef_matrix) <- names_samples
if (exportTable == TRUE) {
return(coef_matrix)
}
mat_melt <- suppressMessages(melt(as.matrix(coef_matrix)))
mean_value <- mean(na.omit(mat_melt[,"value"]))
plot <- ggplot(mat_melt, aes(x=Var1, y=Var2, fill=value)) +
geom_tile() +
geom_tile(data = mat_melt[!is.na(mat_melt[,"value"]),],
fill = NA,
lwd= 0.25,
color = "black") +
labs(fill = str_to_title(method)) +
geom_text(aes(label = round(value, digits = 3),
color = ifelse(value <= mean_value,
"white", "black")),
na.rm = TRUE) +
scale_fill_gradientn(colors = .colorizer(palette, 7), na.value = "white") +
scale_color_identity() +
theme_classic() +
theme(axis.title = element_blank())
return(plot)
}
# Helper function to prepare data
.prepareDataFrame <- function(df,
cloneCall,
return_type = "unique") {
if (return_type == "unique") {
return(unique(df[, cloneCall]))
} else if (return_type == "freq") {
temp_df <- data.frame(table(df[, cloneCall]))
colnames(temp_df) <- c(cloneCall, 'Count')
temp_df[, 2] <- as.numeric(temp_df[, 2])
return(temp_df)
}
}
# Helper function for common loop and conditional structure
.calculateIndex <- function(df,
length,
cloneCall,
coef_matrix,
indexFunc,
return_type = "unique") {
for (i in seq_along(length)) {
df_i <- .prepareDataFrame(df[[i]], cloneCall, return_type)
for (j in seq_along(length)) {
if (i >= j) { next }
df_j <- .prepareDataFrame(df[[j]], cloneCall, return_type)
coef_matrix[i, j] <- indexFunc(df_i, df_j)
}
}
return(coef_matrix)
}
# Morisita Index calculation function
.morisitaCalc <- function(df_i, df_j) {
merged <- merge(df_i, df_j, by = names(df_i)[1], all = TRUE)
merged[is.na(merged)] <- 0
X <- sum(merged[, 2])
Y <- sum(merged[, 3])
num <- 2 * sum(merged[, 2] * merged[, 3])
den <- ((sum(df_i[, 2]^2) / (X^2)) + (sum(df_j[, 2]^2) / (Y^2))) * X * Y
return(num / den)
}
# Jaccard Index calculation function
.jaccardCalc <- function(df_i, df_j) {
overlap <- length(intersect(df_i, df_j))
return(overlap / (length(df_i) + length(df_j) - overlap))
}
# Raw Index calculation function
.rawCalc <- function(df_i, df_j) {
return(length(intersect(df_i, df_j)))
}
# Overlap Index calculation function
.overlapCalc <- function(df_i, df_j) {
overlap <- length(intersect(df_i, df_j))
return(overlap / min(length(df_i), length(df_j)))
}
# Overlap Index calculation function
.cosineCalc <- function(df_i, df_j) {
all_species <- unique(c(df_i, df_j))
vector_location1 <- as.integer(all_species %in% df_i)
vector_location2 <- as.integer(all_species %in% df_j)
return(sum(vector_location1 * vector_location2) /
(sqrt(sum(vector_location1^2)) * sqrt(sum(vector_location2^2))))
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.