R/compute_similarity.R

#' Compute similarity of matrices
#'
#' @param expr_mat single-cell expression matrix
#' @param ref_mat reference expression matrix
#' @param cluster_ids vector of cluster ids for each cell
#' @param compute_method method(s) for computing similarity scores
#' @param pseudobulk_method method used for summarizing clusters, options are mean (default), median, truncate (10% truncated mean), trimean, max, or min
#' @param per_cell run per cell?
#' @param rm0 consider 0 as missing data, recommended for per_cell
#' @param if_log whether the input data is natural-log transformed; if so,
#' averaging will be done on the unlogged data
#' @param low_threshold option to remove clusters with too few cells
#' @param ... additional parameters not used yet
#' @return matrix of numeric values, clusters from expr_mat as row names,
#'  cell types from ref_mat as column names
get_similarity <- function(
  expr_mat,
  ref_mat,
  cluster_ids,
  compute_method,
  pseudobulk_method = "mean",
  per_cell = FALSE,
  rm0 = FALSE,
  if_log = TRUE,
  low_threshold = 0,
  ...
) {
  if (nrow(expr_mat) == 0) {
    stop(
      "after subsetting to shared genes",
      "query expression matrix has 0 rows",
      call. = FALSE
    )
  }

  if (ncol(expr_mat) == 0) {
    stop("query expression matrix has 0 cols", call. = FALSE)
  }

  if (nrow(ref_mat) == 0) {
    stop(
      "after subsetting to shared genes",
      "reference expression matrix has 0 rows",
      call. = FALSE
    )
  }

  if (ncol(ref_mat) == 0) {
    stop("reference expression matrix has 0 cols", call. = FALSE)
  }

  ref_clust <- colnames(ref_mat)
  if (ncol(expr_mat) != length(cluster_ids)) {
    stop(
      "number of cells in expression matrix not equal",
      "to metadata/cluster_col",
      call. = FALSE
    )
  }

  if (sum(is.na(cluster_ids)) > 0) {
    message("reassigning NAs to unknown")
    cluster_ids <- factor(cluster_ids)
    cluster_ids <-
      factor(
        cluster_ids,
        levels = c(levels(cluster_ids), NA),
        labels = c(levels(cluster_ids), "unknown"),
        exclude = NULL
      )
    cluster_ids <- as.character(cluster_ids)
  }

  if (!per_cell) {
    sc_clust <- sort(unique(cluster_ids))
    clust_avg <- average_clusters(
      expr_mat,
      cluster_ids,
      if_log = if_log,
      low_threshold = low_threshold,
      method = pseudobulk_method
    )
  } else {
    sc_clust <- cluster_ids
    clust_avg <- expr_mat
  }

  assigned_score <- calc_similarity(
    clust_avg,
    ref_mat,
    compute_method,
    rm0 = rm0,
    ...
  )

  if (low_threshold == 0) {
    rownames(assigned_score) <- sc_clust
    colnames(assigned_score) <- ref_clust
  }

  return(assigned_score)
}
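
# A minimal usage sketch for get_similarity(), assuming small toy matrices
# (illustrative values only, not package data). Guarded by `if (FALSE)` so it
# never runs at package load; paste the body into a session to try it.
if (FALSE) {
  set.seed(42)
  genes <- paste0("gene", seq_len(50))
  # toy query: 50 genes x 20 cells of log-normalized expression
  expr <- matrix(abs(rnorm(50 * 20)), nrow = 50,
                 dimnames = list(genes, paste0("cell", seq_len(20))))
  # toy reference: 50 genes x 3 cell types
  ref <- matrix(abs(rnorm(50 * 3)), nrow = 50,
                dimnames = list(genes, paste0("type", seq_len(3))))
  clusters <- sample(c("c1", "c2"), 20, replace = TRUE)
  # Spearman correlation of cluster pseudobulks against reference cell types
  get_similarity(expr, ref, cluster_ids = clusters, compute_method = "spearman")
}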

#' Compute a p-value for similarity using permutation
#'
#' @description Permute cluster labels to calculate empirical p-value
#'
#' @param expr_mat single-cell expression matrix
#' @param cluster_ids clustering info of single-cell data; assumes that
#'  genes have ALREADY BEEN filtered
#' @param ref_mat reference expression matrix
#' @param n_perm number of permutations
#' @param per_cell run per cell?
#' @param compute_method method(s) for computing similarity scores
#' @param pseudobulk_method method used for summarizing clusters, options are mean (default), median, truncate (10% truncated mean), trimean, max, or min
#' @param rm0 consider 0 as missing data, recommended for per_cell
#' @param ... additional parameters
#' @return matrix of numeric values
permute_similarity <- function(
  expr_mat,
  ref_mat,
  cluster_ids,
  n_perm,
  per_cell = FALSE,
  compute_method,
  pseudobulk_method = "mean",
  rm0 = FALSE,
  ...
) {
  ref_clust <- colnames(ref_mat)

  if (!per_cell) {
    sc_clust <- sort(unique(cluster_ids))
    clust_avg <- average_clusters(
      expr_mat,
      cluster_ids,
      method = pseudobulk_method
    )
  } else {
    sc_clust <- colnames(expr_mat)
    clust_avg <- expr_mat
  }

  assigned_score <- calc_similarity(
    clust_avg,
    ref_mat,
    compute_method,
    rm0 = rm0,
    ...
  )

  # perform permutation
  sig_counts <-
    matrix(0L, nrow = length(sc_clust), ncol = length(ref_clust))

  for (i in seq_len(n_perm)) {
    resampled <- sample(cluster_ids, length(cluster_ids), replace = FALSE)

    if (!per_cell) {
      permuted_avg <- average_clusters(
        expr_mat,
        resampled,
        method = pseudobulk_method
      )
    } else {
      permuted_avg <- expr_mat[, resampled, drop = FALSE]
    }

    # compute similarity for the permuted assignments
    new_score <- calc_similarity(
      permuted_avg,
      ref_mat,
      compute_method,
      rm0 = rm0,
      ...
    )
    # element-wise count of permutations scoring higher than the observed score
    sig_counts <-
      sig_counts + as.numeric(new_score > assigned_score)
  }

  rownames(assigned_score) <- sc_clust
  colnames(assigned_score) <- ref_clust
  rownames(sig_counts) <- sc_clust
  colnames(sig_counts) <- ref_clust

  return(list(
    score = assigned_score,
    p_val = sig_counts / n_perm
  ))
}
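
# A minimal permutation sketch on toy data (the matrices and cluster ids below
# are illustrative assumptions, not package data); guarded so it is not run at
# package load.
if (FALSE) {
  set.seed(42)
  genes <- paste0("gene", seq_len(50))
  expr <- matrix(abs(rnorm(50 * 20)), nrow = 50,
                 dimnames = list(genes, paste0("cell", seq_len(20))))
  ref <- matrix(abs(rnorm(50 * 3)), nrow = 50,
                dimnames = list(genes, paste0("type", seq_len(3))))
  clusters <- sample(c("c1", "c2"), 20, replace = TRUE)
  res <- permute_similarity(expr, ref, clusters, n_perm = 100,
                            compute_method = "pearson")
  res$score  # observed cluster-by-cell-type similarity
  res$p_val  # fraction of permutations scoring above the observed value
}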

#' Compute similarity between query and reference matrices
#' @param query_mat query data matrix
#' @param ref_mat reference data matrix
#' @param compute_method method(s) for computing similarity scores
#' @param rm0 consider 0 as missing data, recommended for per_cell
#' @param ...  additional parameters
#' @return matrix of numeric values
calc_similarity <- function(
  query_mat,
  ref_mat,
  compute_method,
  rm0 = FALSE,
  ...
) {
  # treat zeros as missing data?
  if (rm0) {
    message("considering 0 as missing data")
    query_mat[query_mat == 0] <- NA
    similarity_score <- suppressWarnings(stats::cor(
      as.matrix(query_mat),
      ref_mat,
      method = compute_method,
      use = "pairwise.complete.obs"
    ))
    return(similarity_score)
  } else {
    if (
      any(
        compute_method %in%
          c(
            "pearson",
            "spearman",
            "kendall"
          )
      )
    ) {
      similarity_score <- suppressWarnings(
        stats::cor(as.matrix(query_mat), ref_mat, method = compute_method)
      )
      return(similarity_score)
    }
    if (compute_method == "cosine") {
      res <- proxy::simil(
        as.matrix(query_mat),
        ref_mat,
        method = "cosine",
        by_rows = FALSE
      )
      similarity_score <- matrix(res, nrow = nrow(res))
      return(similarity_score)
    }
  }

  sc_clust <- colnames(query_mat)
  ref_clust <- colnames(ref_mat)
  features <- intersect(rownames(query_mat), rownames(ref_mat))
  query_mat <- query_mat[features, , drop = FALSE]
  ref_mat <- ref_mat[features, , drop = FALSE]
  similarity_score <- matrix(
    NA,
    nrow = length(sc_clust),
    ncol = length(ref_clust)
  )
  for (i in seq_along(sc_clust)) {
    for (j in seq_along(ref_clust)) {
      similarity_score[i, j] <- vector_similarity(
        query_mat[, sc_clust[i]],
        ref_mat[, ref_clust[j]],
        compute_method,
        ...
      )
    }
  }
  return(similarity_score)
}
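
# A small sketch of calc_similarity() on pseudobulk-style input (toy matrices
# are illustrative assumptions); guarded so it is not run at package load.
if (FALSE) {
  set.seed(1)
  genes <- paste0("gene", seq_len(30))
  query <- matrix(runif(30 * 2), nrow = 30,
                  dimnames = list(genes, c("clusterA", "clusterB")))
  ref <- matrix(runif(30 * 3), nrow = 30,
                dimnames = list(genes, paste0("type", seq_len(3))))
  calc_similarity(query, ref, compute_method = "pearson")  # stats::cor branch
  calc_similarity(query, ref, compute_method = "cosine")   # proxy::simil branch
}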

#' Compute similarity between two vectors
#'
#' @description Compute the similarity score between two vectors using a
#' customized scoring function.
#' The two vectors may come from either scRNA-seq or bulk RNA-seq data.
#' vec1 and vec2 must be the same length and ordered by the same genes.
#' Both vectors should be provided to this function after pre-processing,
#' feature selection and dimension reduction.
#'
#' @param vec1 test vector
#' @param vec2 reference vector
#' @param compute_method method to run, either "cosine" or "kl_divergence"
#' @param ... arguments to pass to compute_method function
#' @return numeric value of desired correlation or distance measurement
vector_similarity <- function(vec1, vec2, compute_method, ...) {
  # examine whether two vectors are of the same size
  if (
    !is.numeric(vec1) ||
      !is.numeric(vec2) ||
      length(vec1) != length(vec2)
  ) {
    stop(
      "compute_similarity: two input vectors",
      " are not numeric or of different sizes.",
      call. = FALSE
    )
  }

  if (!(compute_method %in% c("cosine", "kl_divergence"))) {
    stop(compute_method, " not implemented", call. = FALSE)
  }

  if (compute_method == "kl_divergence") {
    res <- kl_divergence(vec1, vec2, ...)
  } else if (compute_method == "cosine") {
    res <- cosine(vec1, vec2, ...)
  }
  # return the similarity score
  return(res)
}
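
# A brief sketch of vector_similarity() on two random vectors (values are
# illustrative assumptions); guarded so it is not run at package load.
if (FALSE) {
  set.seed(7)
  v1 <- runif(25)
  v2 <- runif(25)
  vector_similarity(v1, v2, compute_method = "cosine")
  # extra arguments are passed through to kl_divergence()
  vector_similarity(v1, v2, compute_method = "kl_divergence", total_reads = 1000)
}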

#' Cosine similarity
#' @param vec1 test vector
#' @param vec2 reference vector
#' @return numeric value of cosine similarity between the vectors
cosine <- function(vec1, vec2) {
  sum(vec1 * vec2) / sqrt(sum(vec1^2) * sum(vec2^2))
}

#' KL divergence
#'
#' @description Use package entropy to compute Kullback-Leibler divergence.
#' The function first converts each vector's reads to pseudo-number of
#' transcripts by normalizing the total reads to total_reads.
#' The normalized read for each gene is then rounded to serve as the
#' pseudo-number of transcripts.
#' Function entropy::KL.shrink is called to compute the KL-divergence between
#' the two vectors, and the maximal allowed divergence is set to max_KL.
#' Finally, a linear transform is performed to convert the KL divergence,
#' which is between 0 and max_KL, to a similarity score between -1 and 1.
#'
#' @param vec1 Test vector
#' @param vec2 Reference vector
#' @param if_log Whether the vectors are log-transformed. If so, raw
#' counts are recovered (via expm1) before computing KL-divergence.
#' @param total_reads Pseudo-library size
#' @param max_KL Maximal allowed value of KL-divergence.
#' @return numeric value, with additional attributes, of kl divergence
#'  between the vectors
kl_divergence <- function(
  vec1,
  vec2,
  if_log = FALSE,
  total_reads = 1000,
  max_KL = 1
) {
  if (if_log) {
    vec1 <- expm1(vec1)
    vec2 <- expm1(vec2)
  }
  count1 <- round(vec1 * total_reads / sum(vec1))
  count2 <- round(vec2 * total_reads / sum(vec2))
  est_KL <- entropy::KL.shrink(count1, count2, unit = "log", verbose = FALSE)
  return((max_KL - est_KL) / max_KL * 2 - 1)
}
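
# A short worked sketch of the KL-based score (toy count vectors are
# illustrative assumptions); guarded so it is not run at package load.
if (FALSE) {
  v1 <- c(5, 0, 3, 2, 10)
  v2 <- c(9, 2, 3, 1, 4)
  # identical vectors give an estimated KL near 0, i.e. a score near 1
  kl_divergence(v1, v1)
  # larger divergence lowers the score, down to -1 when est_KL reaches max_KL
  kl_divergence(v1, v2, max_KL = 1)
}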