#' Bhattacharyya, Similarity, Sorensen, or PSI matrix
#'
#' Calculates the Bhattacharyya coefficient, Similarity score, Sorensen Index, or
#' Percent Similarity Index of all pairwise comparison from a list of data frames.
#'
#' @param productive_table A tibble of productive sequences generated
#' by the LymphoSeq function productiveSeq. "duplicate_frequency" and "junction_aa"
#' are a required columns.
#' @param mode The mode to use for calculating pairwise similarity. Can take the values
#' "Bhattacharyya", "Similarity", "Sorensen", or "PSI". Default is "Bhattacharyya".
#' @return A data frame of Bhattacharyya coefficients, Similarity scores, Sorensen Index, or
#' Percent Similarity Index calculated from all pairwise comparisons from a list of
#' repertoire_id data frames. Both metrics measure the amount of overlap between two samples.
#' The value ranges from 0 to 1 where 1 indicates the sequence frequencies are
#' identical in the two samples and 0 indicates no shared frequencies.
#' @examples
#' file_path <- system.file("extdata", "TCRB_sequencing", package = "LymphoSeq2
#' stable <- readImmunoSeq(path = file_path)
#' atable <- productiveSeq(stable, aggregate = "junction_aa")
#' bhattacharyya_matrix <- scoringMatrix(productive_table = atable,
#' mode = "Bhattacharyya")
#' similarity_matrix <- scoringMatrix(productive_table = atable,
#' mode = "Similarity")
#' sorensen_matrix <- scoringMatrix(productive_table = atable,
#' mode = "Sorensen")
#' psi_matrix <- scoringMatrix(productive_table = atable,
#' mode = "PSI")
#' @seealso \code{\link{pairwisePlot}} for plotting results as a heat map.
#' @export
#' @import tidyverse tictoc
scoringMatrix <- function(productive_table, mode="Bhattacharyya") {
sample_list <- productive_table %>%
dplyr::select(repertoire_id, junction_aa, duplicate_frequency, duplicate_count) %>%
dplyr::group_by(repertoire_id) %>%
dplyr::group_split()
if (mode == "Bhattacharyya") {
scoring_matrix <- list(sample_list, sample_list) %>%
purrr::cross() %>%
purrr::map(bhattacharyyaCoefficient) %>%
dplyr::bind_rows() %>%
tidyr::pivot_wider(id_cols=sample1,
names_from=sample2,
values_from=bhattacharyya_coefficient)
} else if (mode == "Similarity") {
scoring_matrix <- list(sample_list, sample_list) %>%
purrr::cross() %>%
purrr::map(similarityScore) %>%
dplyr::bind_rows() %>%
tidyr::pivot_wider(id_cols=sample1,
names_from=sample2,
values_from=similarityScore)
} else if (mode == "Sorensen") {
scoring_matrix <- list(sample_list, sample_list) %>%
purrr::cross() %>%
purrr::map(sorensenIndex) %>%
dplyr::bind_rows() %>%
tidyr::pivot_wider(id_cols=sample1,
names_from=sample2,
values_from=sorensenIndex)
} else if (mode == "PSI") {
scoring_matrix <- list(sample_list, sample_list) %>%
purrr::cross() %>%
purrr::map(percentSI) %>%
dplyr::bind_rows() %>%
tidyr::pivot_wider(id_cols=sample1,
names_from=sample2,
values_from=percentSI)
}
row_names <- scoring_matrix$sample1
scoring_matrix <- scoring_matrix %>%
dplyr::select(-sample1) %>%
as.matrix()
rownames(scoring_matrix) <- row_names
return(scoring_matrix)
}
#' Bhattacharyya coefficient
#'
#' Calculates the Bhattacharyya coefficient of two samples.
#'
#' @param sample_list A list of two tibble corresponding derived from the productiveSeq
#' function in LymphoSeq2. `duplicate_frequency`, `junction_aa`, and `repertoire_id` columns are necessary
#' for the calcualtion of the Bhattacharyya coefficient.
#'
#' @return A tibble with one row and three columns sample1, sample2, bhattacharyyaCoefficient
#'
#' @examples
#' file_path <- system.file("extdata", "TCRB_sequencing", package = "LymphoSeq2
#'
#' stable <- readImmunoSeq(path = file_path)
#'
#' atable <- productiveSeq(stable, aggregate = "junction_aa")
#'
#' @seealso \code{\link{scoringMatrix}}
#' @import tidyverse
#' @export
bhattacharyyaCoefficient <- function(sample_list) {
sample1 <- sample_list[[1]]
sample2 <- sample_list[[2]]
sample_merged <- dplyr::full_join(sample1, sample2,
by="junction_aa",
suffix = c("_p", "_q")) %>%
dplyr::mutate(duplicate_frequency_p = tidyr::replace_na(duplicate_frequency_p, 0),
duplicate_frequency_q = tidyr::replace_na(duplicate_frequency_q, 0))
s <- sample_merged$duplicate_frequency_p * sample_merged$duplicate_frequency_q
bc <- base::sum(base::sqrt(s))
bhattacharyya_coefficient <- tibble::tibble(sample1=sample1$repertoire_id[1],
sample2=sample2$repertoire_id[1],
bhattacharyya_coefficient=bc)
return(bhattacharyya_coefficient)
}
#' Similarity score
#'
#' Calculates the similarity score of two samples.
#'
#' @param sample1 A data frame consisting of frequencies of antigen receptor
#' sequences. "junction_aa" and "duplicate_count" are a required columns.
#' @param sample2 A data frame consisting of frequencies of antigen receptor
#' sequences. "junction_aa" and "duplicate_count" are a required columns.
#' @return Returns the similarity score, a measure of the amount of
#' overlap between two samples. The value ranges from 0 to 1 where 1 indicates
#' the sequence frequencies are identical in the two samples and 0
#' indicates no shared frequencies.
#' @examples
#' file_path <- system.file("extdata", "TCRB_sequencing", package = "LymphoSeq2
#'
#' stable <- readImmunoSeq(path = file_path)
#'
#' atable <- productiveSeq(stable, aggregate = "junction_aa")
#'
#' @seealso \code{\link{scoringMatrix}}
#' @import tidyverse
#' @export
similarityScore <- function(sample_list) {
sample1 <- sample_list[[1]]
sample2 <- sample_list[[2]]
s1 <- sample1 %>%
dplyr::filter(junction_aa %in% sample2$junction_aa) %>%
dplyr::summarise(total = sum(duplicate_count)) %>%
base::as.integer()
s2 <- sample2 %>%
dplyr::filter(junction_aa %in% sample1$junction_aa) %>%
dplyr::summarise(total = sum(duplicate_count)) %>%
base::as.integer()
score <- (s1 + s2)/ (base::sum(sample1$duplicate_count) + base::sum(sample2$duplicate_count))
similarity_score <- tibble::tibble(sample1=sample1$repertoire_id[1],
sample2=sample2$repertoire_id[1],
similarityScore=score)
return(similarity_score)
}
#' Sorensen index
#'
#' Calculates the Sorensen index between two groups of repertoires. Similar to
#' a Jaccard index, Sorensen index gives a greater weight to shared sequences
#' over unique sequences.
#'
#' @param sample1 A data frame consisting of frequencies of antigen receptor
#' sequences. "junction_aa" and "duplicate_count" are a required columns.
#' @param sample2 A data frame consisting of frequencies of antigen receptor
#' sequences. "junction_aa" and "duplicate_count" are a required columns.
#' @return Returns the similarity score, a measure of the amount of
#' overlap between two samples. The value ranges from 0 to 1 where 1 indicates
#' the sequence frequencies are identical in the two samples and 0
#' indicates no shared frequencies.
#' @examples
#' file_path <- system.file("extdata", "TCRB_sequencing", package = "LymphoSeq2
#'
#' stable <- readImmunoSeq(path = file_path)
#'
#' atable <- productiveSeq(stable, aggregate = "junction_aa")
#'
#' @seealso \code{\link{scoringMatrix}}
#' @import tidyverse
#' @export
sorensenIndex <- function(sample_list) {
sample1 <- sample_list[[1]]
sample2 <- sample_list[[2]]
intersection <- dplyr::inner_join(sample1, sample2, by = "junction_aa", suffix = c("_1", "_2"))
unique_sample1 <- dplyr::anti_join(sample1, sample2, by = "junction_aa")
unique_sample2 <- dplyr::anti_join(sample2, sample1, by = "junction_aa")
a <- intersection %>%
dplyr::pull(junction_aa) %>%
unique() %>%
length()
b <- unique_sample1 %>%
dplyr::pull(junction_aa) %>%
unique() %>%
length()
c <- unique_sample2 %>%
dplyr::pull(junction_aa) %>%
unique() %>%
length()
sorensen_index <- (2 * a)/ ((2*a) + b + c)
sorensen_score <- tibble::tibble(sample1=sample1$repertoire_id[1],
sample2=sample2$repertoire_id[1],
sorensenIndex=sorensen_index)
return(sorensen_score)
}
#' Percent similarity index
#'
#' Calculates the Percent similarity index between two groups of repertoires. Percent
#' similarity index, not only compares the number of similar and dissimilar species
#' present between two sites, but also incorporate abundance.
#'
#' @param sample1 A data frame consisting of frequencies of antigen receptor
#' sequences. "junction_aa" and "duplicate_count" are a required columns.
#' @param sample2 A data frame consisting of frequencies of antigen receptor
#' sequences. "junction_aa" and "duplicate_count" are a required columns.
#' @return Returns the similarity score, a measure of the amount of
#' overlap between two samples. The value ranges from 0 to 1 where 1 indicates
#' the sequence frequencies are identical in the two samples and 0
#' indicates no shared frequencies.
#' @examples
#' file_path <- system.file("extdata", "TCRB_sequencing", package = "LymphoSeq2
#'
#' stable <- readImmunoSeq(path = file_path)
#'
#' atable <- productiveSeq(stable, aggregate = "junction_aa")
#'
#' @seealso \code{\link{scoringMatrix}}
#' @import tidyverse
#' @export
percentSI <- function(sample_list) {
sample1 <- sample_list[[1]]
sample2 <- sample_list[[2]]
combined <- dplyr::full_join(sample1, sample2, by = "junction_aa", suffix = c("_1", "_2")) %>%
dplyr::select(junction_aa, duplicate_frequency_1, duplicate_frequency_2) %>%
dplyr::mutate(duplicate_frequency_1 = tidyr::replace_na(duplicate_frequency_1, 0),
duplicate_frequency_2 = tidyr::replace_na(duplicate_frequency_2, 0)) %>%
dplyr::group_by(junction_aa, duplicate_frequency_1, duplicate_frequency_2) %>%
dplyr::summarize(min_count = min(duplicate_frequency_1, duplicate_frequency_2)) %>%
ungroup()
min_sum <- combined %>% dplyr::pull(min_count) %>% sum()
sum_sample1 <- combined %>% dplyr::pull(duplicate_frequency_1) %>% sum()
sum_sample2 <- combined %>% dplyr::pull(duplicate_frequency_2) %>% sum()
percent_si <- 200 * sum(min_sum) / (sum_sample1 + sum_sample2)
psi_score <- tibble::tibble(sample1=sample1$repertoire_id[1],
sample2=sample2$repertoire_id[1],
percentSI=percent_si)
return(psi_score)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.