R/js_divergence.R

Defines functions generate2dJsDivergenceDataFrame .generate2dMaps calculate2dJsDivergence .calculateMatrixKde calculateJsDivergence

Documented in calculate2dJsDivergence calculateJsDivergence generate2dJsDivergenceDataFrame

#' Calculate the Jensen-Shannon divergence.
#'
#' Given two probability distributions, calculate the Jensen-Shannon (JS)
#' divergence between them.
#'
#' @param p,q Numerical vectors representing the probability distributions.
#' @return A numeric for the JS divergence between the vectors.
#' @seealso \code{\link{generate2dJsDivergenceDataFrame}}
#' @export
calculateJsDivergence <- function(p, q) {
  # Verify that p and q are probability distributions.
  if (!isTRUE(all.equal(sum(p), 1)) ||
      !isTRUE(all.equal(sum(q), 1)) ||
      any(p < 0) || any(q < 0)) {
    stop("p and q should be probability distributions")
  }

  # Calculate the JS divergence.
  m <- 0.5 * (p + q)
  0.5 * (sum(p * log(p / m)) + sum(q * log(q / m)))
}

#' Calculate the probability distribution of a matrix.
#'
#' Calculate the two-dimensional density of a matrix using kernel density
#' estimation (KDE), convert it into a vector, and normalize it to sum 1.
#'
#' @param mtx Numeric matrix for which to calculate the probability
#' distribution.
#' @param n Number of grid points for KDE.
#' @param lims The limits of the rectangle covered by the KDE.
#' @return A one-dimensional numeric vector for the probability density.
#' @importFrom MASS kde2d
.calculateMatrixKde <- function(mtx, n = 25, lims = NULL) {
  if (is.null(lims)) {
    stop("Kernel density estimation limits parameter is missing")
  }

  density <- kde2d(mtx[, 1], mtx[, 2], n = n, lims = lims)
  density_vector <- as.vector(density$z)
  density_vector / sum(density_vector)
}

#' Jensen-Shannon divergence between two-dimensional maps.
#'
#' Calculate the Jensen-Shannon (JS) divergence between two two-dimensional maps.
#' These maps may be generated by a dimensionality reduction (DR) method; however
#' this is not a requirement.
#'
#' @param x,y Two-dimensional numeric matrices to compare.
#' @inheritParams .calculateMatrixKde
#' @return A numeric for the JS divergence between the maps.
#' @seealso \code{\link{calculateJsDivergence}}
#' @export
calculate2dJsDivergence <- function(x, y, n = 2 ^ 8) {
  # Use range over combined data as KDE limits.
  lims <- c(range(c(x[, 1], y[, 1])), range(c(x[, 2], y[, 2])))
  # Convert matrices to probability distributions.
  x_prob <- .calculateMatrixKde(x, n, lims)
  y_prob <- .calculateMatrixKde(y, n, lims)

  calculateJsDivergence(x_prob, y_prob)
}

#' Two-dimensional maps for Jensen-Shannon divergence calculation.
#'
#' Create a list with keys pointing to filenames and values pointing to 2-D
#' matrices that were generated by extracting columns of interest from the files.
#' @inheritParams generate2dJsDivergenceDataFrame
#' @import flowCore
.generate2dMaps <- function(source_filepaths, two_d_map_column_names) {
  two_d_maps <- list()
  for(file_path in source_filepaths) {
    flow_frame <- flowCore::read.FCS(file_path)
    map <- flow_frame@exprs[, two_d_map_column_names]
    filename <- basename(file_path)
    two_d_maps[[filename]] <- map
  }

  two_d_maps
}

#' Jensen-Shannon divergence between multiple two-dimensional maps.
#'
#' Calculate the Jensen-Shannon (JS) divergence between two or more two-dimensional 
#' maps and generate a data frame displaying the divergence value for each possible 
#' pair of two-dimensional maps. The two-dimensional maps are generated from 
#' the source files and columns of interest (two_d_map_column_names). The maps 
#' may be generated by a dimensionality reduction (DR) method; however this is 
#' not a requirement.
#'
#' @param source_filepaths A vector containing the file paths for the data of interest.
#' @param two_d_map_column_names A vector containing the column names of interest. Defaults to Cytobank's default tSNE column names.
#' @return A data frame that includes a numeric for the JS divergence between each 
#' possible combination of generated 2-D maps.
#' @seealso \code{\link{calculateJsDivergence}}
#' @inheritParams .calculateMatrixKde
#' @export
generate2dJsDivergenceDataFrame <- function(source_filepaths, 
                                            two_d_map_column_names = c("tSNE1", "tSNE2"), 
                                            n = 2 ^ 8) {
  if (length(two_d_map_column_names) != 2) {
    stop("We must generate two-dimensional numeric matrices with two columns.")
  }

  two_d_maps <- .generate2dMaps(source_filepaths, two_d_map_column_names)

  filenames <- names(two_d_maps)
  filename_pairs <- combn(filenames, 2)
  num_pairs <- length(filename_pairs) / 2

  pair_divergences <- data.frame(matrix(ncol=3, nrow=0))
  colnames(pair_divergences) <- c('file1', 'file2', 'divergence_value')
  for (num in 1:num_pairs) {
    current_pair = filename_pairs[, num]
    filename_1 = current_pair[1]
    filename_2 = current_pair[2]
    df_1 = as.data.frame(two_d_maps[filename_1])
    df_2 = as.data.frame(two_d_maps[filename_2])

    divergence_value <- calculate2dJsDivergence(df_1, df_2, n)
    new_row <- data.frame(file1 = filename_1, 
                          file2 = filename_2, 
                          divergence_value = divergence_value) 
    pair_divergences <- rbind(pair_divergences, new_row)
  }

  pair_divergences
}
dtelad11/cytutils documentation built on Sept. 1, 2022, 2:45 p.m.