Nothing
utils::globalVariables(c("id1", "id2", "id1_l", "id2_l", "i", "sim"))
#' Calculate a melted similarity matrix.
#'
#' \code{sim_calculate} calculates a melted similarity matrix.
#'
#' @param population data.frame with annotations (a.k.a. metadata) and
#' observation variables.
#' @param annotation_prefix optional character string specifying prefix
#' for annotation columns.
#' @param strata optional character vector specifying stratification columns.
#' @param method optional character string specifying method for
#' to calculate similarity. This must be one of the
#' strings \code{"pearson"} (default), \code{"kendall"}, \code{"spearman"},
#' \code{"euclidean"}, \code{"cosine"}.
#' @param lazy optional boolean specifying whether to lazily evaluate
#' similarity.
#' @param all_same_cols_rep_or_group optional character vector specifying
#' columns.
#' @param all_same_cols_ref optional character vector specifying columns.
#' @param all_same_cols_rep_ref optional character vector specifying columns.
#' @param reference optional character string specifying reference.
#' @param ... arguments passed downstream for parallel processing.
#'
#' @return \code{metric_sim} object, with similarity matrix and related metadata
#'
#' @examples
#' suppressMessages(suppressWarnings(library(magrittr)))
#' population <- tibble::tribble(
#' ~Metadata_group1, ~Metadata_group2, ~x, ~y, ~z,
#' 1, 1, -1, 5, -5,
#' 1, 2, -1.2, 5.1, -5.2,
#' 2, 1, 0, 6, -4,
#' 2, 2, 0.3, 6.2, -4.4,
#' 3, 1, 7, -4, 3,
#' 3, 2, 7.2, -4.1, 3.7
#' )
#' sim_pearson <- matric::sim_calculate(population, method = "pearson")
#' sim_cosine <- matric::sim_calculate(population, method = "cosine")
#' sim_euclidean <- matric::sim_calculate(population, method = "euclidean")
#'
#' sim_pearson %>%
#' dplyr::inner_join(sim_cosine,
#' by = c("id1", "id2"),
#' suffix = c("_pearson", "_cosine")
#' ) %>%
#' dplyr::inner_join(sim_euclidean %>% dplyr::rename(sim_euclidean = sim),
#' by = c("id1", "id2")
#' )
#'
#' sim_cosine <-
#' matric::sim_calculate(population,
#' strata = "Metadata_group1",
#' method = "cosine",
#' lazy = TRUE
#' )
#'
#' matric::sim_calculate(population,
#' method = "cosine",
#' lazy = TRUE,
#' all_same_cols_rep_or_group = c("Metadata_group2")
#' )
#'
#' matric::sim_calculate(population,
#' method = "cosine",
#' lazy = TRUE,
#' all_same_cols_rep_or_group = c("Metadata_group2"),
#' all_same_cols_ref = c("Metadata_group1"),
#' reference = data.frame(Metadata_group2 = 2)
#' )
#'
#' matric::sim_calculate(population,
#' method = "cosine",
#' lazy = TRUE,
#' all_same_cols_rep_or_group = c("Metadata_group2"),
#' all_same_cols_ref = c("Metadata_group1"),
#' all_same_cols_rep_ref = c("Metadata_group2"),
#' reference = data.frame(Metadata_group2 = 2)
#' )
#' @export
sim_calculate <-
function(population,
annotation_prefix = "Metadata_",
strata = NULL,
method = "pearson",
lazy = FALSE,
all_same_cols_rep_or_group = NULL,
all_same_cols_ref = NULL,
all_same_cols_rep_ref = NULL,
reference = NULL,
...) {
# ---- Checks ----
stopifnot(is.null(strata) | is.null(all_same_cols_rep_or_group))
if (!is.null(strata)) {
all_same_cols_rep_or_group <- strata
}
distances <- c("euclidean")
correlations <- c("pearson", "kendall", "spearman")
similarities <- c("cosine")
stopifnot(is.data.frame(population))
stopifnot(method %in% c(distances, correlations, similarities))
# ---- Setup ----
calculate_optimized <-
!is.null(all_same_cols_rep_or_group) |
!is.null(all_same_cols_ref) |
!is.null(all_same_cols_rep_ref) |
!is.null(reference)
sim_calculate_cases <- function(X) {
if (method %in% distances) {
S <-
as.matrix(stats::dist(
X,
method = method,
diag = TRUE,
upper = TRUE
))
} else if (method %in% correlations) {
S <-
stats::cor(t(X), method = method, use = "pairwise.complete.obs")
} else if (method %in% similarities) {
if (method == "cosine") {
S <- tcrossprod(X / sqrt(rowSums(X * X)))
}
}
S
}
# ---- Preprocess ----
population <- preprocess_data(population)
# ---- Calculate similarity ----
if (!calculate_optimized) {
# ---- * All pairs ----
n_rows <- nrow(population)
sim_df <-
expand.grid(
id1 = seq(n_rows),
id2 = seq(n_rows),
KEEP.OUT.ATTRS = FALSE
)
if (!lazy) {
X <-
drop_annotation(population, annotation_prefix) %>%
as.matrix()
S <- sim_calculate_cases(X)
sim_df <-
sim_df %>%
dplyr::mutate(sim = as.vector(S)) %>%
dplyr::filter(id1 != id2)
}
} else {
# ---- * Some pairs ----
# ---- ** Setup ----
metadata <- get_annotation(population, annotation_prefix)
mapper <- function(strata, reductor, metadata_subset) {
metadata %>%
dplyr::select(all_of(strata)) %>%
dplyr::group_by(across(all_of(strata))) %>%
dplyr::reframe(reductor(dplyr::cur_group(), metadata_subset)) %>%
dplyr::select(id1, id2)
}
reduct_all_same_cols_rep_or_group <-
function(partition, metadata_subset) {
id_partition <-
dplyr::inner_join(metadata_subset,
partition,
by = names(partition)
) %>%
purrr::pluck("id")
expand.grid(
id1 = id_partition,
id2 = id_partition,
KEEP.OUT.ATTRS = FALSE
)
}
reduct_all_same_cols_ref <-
function(partition, metadata_subset) {
metadata_partition <-
dplyr::inner_join(metadata_subset,
partition,
by = names(partition)
)
id_partition <-
metadata_partition %>%
purrr::pluck("id")
id_reference <-
dplyr::inner_join(metadata_partition,
reference,
by = names(reference)
) %>%
purrr::pluck("id")
expand.grid(
id1 = id_partition,
id2 = id_reference,
KEEP.OUT.ATTRS = FALSE
)
}
reduct_all_same_cols_rep_ref <-
function(partition, metadata_subset) {
id_reference <-
dplyr::inner_join(metadata_subset,
partition,
by = names(partition)
) %>%
purrr::pluck("id")
expand.grid(
id1 = id_reference,
id2 = id_reference,
KEEP.OUT.ATTRS = FALSE
)
}
# ---- ** Calculate ----
sim_df <- data.frame()
if (!is.null(all_same_cols_rep_or_group)) {
if (!is.null(reference)) {
metadata_subset <-
metadata %>%
dplyr::anti_join(reference, by = names(reference))
} else {
metadata_subset <- metadata
}
sim_df <-
dplyr::bind_rows(
sim_df,
mapper(
all_same_cols_rep_or_group,
reduct_all_same_cols_rep_or_group,
metadata_subset
)
)
}
if (!is.null(reference)) {
metadata_subset <-
metadata
sim_df <-
dplyr::bind_rows(
sim_df,
mapper(
all_same_cols_ref,
reduct_all_same_cols_ref,
metadata_subset
)
)
}
if (!is.null(all_same_cols_rep_ref) && !is.null(reference)) {
metadata_subset <-
metadata %>%
dplyr::inner_join(reference,
by = names(reference),
multiple = "all"
)
sim_df <-
dplyr::bind_rows(
sim_df,
mapper(
all_same_cols_rep_ref,
reduct_all_same_cols_rep_ref,
metadata_subset
)
)
}
sim_df <-
dplyr::distinct(sim_df)
if (!lazy) {
sim_df <- sim_calculate_ij(population, sim_df, method = method, annotation_prefix = annotation_prefix, ...)
}
sim_df
}
# ---- Package results ----
# get metadata
row_metadata <-
get_annotation(population, annotation_prefix)
# construct object
sim_df <-
sim_validate(sim_new(sim_df, row_metadata, list(method = method)))
sim_df
}
#' Calculate similarities given pairs of rows
#'
#' \code{sim_calculate_ij} calculates similarities given pairs of rows.
#'
#' @param population data.frame with annotations (a.k.a. metadata) and
#' observation variables.
#' @param index data.frame with at least two columns \code{id1} and
#' \code{id2} specifying rows of \code{population}, and an optional attribute
#' \code{metric_metadata$method}, which is a character string specifying
# method for to calculate similarity. Currently only \code{"cosine"} is
#' implemented. Preserve the diagonal entries when constructing \code{index}
# to allow for optimizations. \code{sim_calculate_ij} filters out the
# diagonal in the result.
#' @param method optional character string specifying method for
#' to calculate similarity. \code{method}, if specified, overrides
#' \code{attr(index, "metric_metadata")$method}.
#' @param annotation_prefix optional character string specifying prefix
#' for annotation columns.
#' @param ... arguments passed downstream for parallel processing.
#'
#' @return data.frame which is the same as \code{index}, but with a new
#' column `sim` containing similarities, and with the diagonals filtered out.
#'
#' @importFrom foreach %dopar%
#'
#' @examples
#' suppressMessages(suppressWarnings(library(magrittr)))
#' population <- tibble::tribble(
#' ~Metadata_group, ~x, ~y, ~z,
#' 1, -1, 5, -5,
#' 2, 0, 6, -4,
#' 3, 7, -4, 3,
#' 4, 14, -8, 6
#' )
#'
#' n <- nrow(population)
#'
#' index <-
#' expand.grid(id1 = seq(n), id2 = seq(n), KEEP.OUT.ATTRS = FALSE)
#'
#' matric::sim_calculate_ij(population, index, method = "cosine")
#'
#' attr(index, "metric_metadata") <- list(method = "cosine")
#'
#' matric::sim_calculate_ij(population, index)
#' @export
sim_calculate_ij <-
function(population,
index,
method = NULL,
annotation_prefix = "Metadata_",
...) {
correlations <- c("pearson")
similarities <- c("cosine")
stopifnot(is.data.frame(index))
stopifnot(!is.null(attr(index, "metric_metadata")) |
!is.null(method))
stopifnot(!is.null(attr(index, "metric_metadata")$method) |
!is.null(method))
if (is.null(method)) {
method <- attr(index, "metric_metadata")$method
} else {
attr(index, "metric_metadata")$method <- method
}
stopifnot(all(c("id1", "id2") %in% names(index)))
stopifnot(is.data.frame(population))
stopifnot(method %in% c(correlations, similarities))
if ("sim" %in% names(index)) {
index <- index %>% dplyr::select(-sim)
}
# remove duplicates because it will be inner joined back later
index_distinct <-
index %>%
dplyr::select(id1, id2) %>%
dplyr::distinct()
population <- preprocess_data(population)
# get data matrix
X <-
drop_annotation(population, annotation_prefix) %>%
as.matrix()
if (method %in% c(correlations, similarities)) {
if (method == "cosine") {
sim_df <- cosine_sparse(X, index_distinct$id1, index_distinct$id2, ...)
}
if (method == "pearson") {
sim_df <- pearson_sparse(X, index_distinct$id1, index_distinct$id2, ...)
}
}
# filter out diagonal here
sim_df <- sim_df %>%
dplyr::filter(id1 != id2)
index <-
index %>%
dplyr::inner_join(sim_df, by = c("id1", "id2"))
index
}
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.