MolEndoMatch: Molecular Endophenotype Patient Matchmaking

Documented in mle.getPatientSimilarity

#' Patient similarity using mutual information MLE metric of patients' most modular, perturbed subsets.
#'
#' This function calculates the universal distance between patients, using a mutual information metric, where self-information comes from the minimum encoding length of each patient's encoded modular perturbations in the background knowledge graph.
#' @param ig - An igraph object of the background knowledge graph.
#' @param path - The filepath to a directory of the stored patient-specific RData files.
#' @param data.pvals - The matrix that gives the perturbation strenght significance for all variables (columns) for each patient (rows)
#' @param patient1 - The row name in data.pvals corresponding to the first patient in the pair of patients being compared.
#' @param patient2 - The row name in data.pvals corresponding to the second patient in the pair of patients being compared.
#' @export mle.getPatientSimilarity
#' @examples
#' mle.getPatientSimilarity(ig, path, data.pvals, patient1, patient2)
mle.getPatientSimilarity <- function(ig, path, data.pvals, patient1, patient2) {
  results <- mle.getEncodingLength(ig, path, data.pvals, patient1)
  results <- results[order(results[,"d.score"], decreasing = TRUE),]
  sizeSubset = results[1,"subsetSize"]

  load(sprintf("%s/%s-Bitstrings.RData", path, patient1))
  p1.optBS <- as.numeric(unlist(strsplit(optimalBitString[[sizeSubset]], split="")))
  ind <- which(sapply(p1.optBS, function(i) i==1))
  p1.optBS <- p1.optBS[1:ind[length(ind)]]
  p1.sig.nodes <- names(sort(data.pvals[patient1,], decreasing = FALSE))[1:sizeSubset]

  load(sprintf("%s/%s-Bitstrings.RData", path, patient2))
  p2.optBS <- as.numeric(unlist(strsplit(optimalBitString[[sizeSubset]], split="")))
  ind <- which(sapply(p2.optBS, function(i) i==1))
  p2.optBS <- p2.optBS[1:ind[length(ind)]]
  p2.sig.nodes <- names(sort(data.pvals[patient2,], decreasing = FALSE))[1:sizeSubset]

  # Get optimal bitstring for encoding of patient1's union patient2's subsets
  p12.sig.nodes <- unique(c(p1.sig.nodes, p2.sig.nodes))
  permutationByStartNode <- mle.getPermutations(p12.sig.nodes, 0, ig)
  bitStrings.pt <- list()
  for (sn in 1:length(p12.sig.nodes)) {
    startNode <- p12.sig.nodes[sn]
    current_node_set <- permutationByStartNode[[startNode]]
    bitStrings.pt[[sn]] <- as.numeric(current_node_set %in% p12.sig.nodes)
    names(bitStrings.pt[[sn]]) <- current_node_set
  }
  tmp <- unlist(lapply(bitStrings.pt, function(i) sum(i)))
  bitStrings.pt <- bitStrings.pt[which(tmp==max(tmp))]
  p12.optBS <- bitStrings.pt[[which.min(unlist(lapply(bitStrings.pt, function(i) sum(which(i==1)))))]]
  ind <- which(sapply(p12.optBS, function(i) i==1))
  p12.optBS <- p12.optBS[1:ind[length(ind)]]

  # Normalize all bitstrings for patient1, patient2, and patient1 UNION patient2 based on the max length
  normSize <- max(c(length(p1.optBS), length(p2.optBS), length(p12.optBS)))
  p1.optBS <- c(p1.optBS, rep(0, normSize-length(p1.optBS)))
  p2.optBS <- c(p2.optBS, rep(0, normSize-length(p2.optBS)))
  p12.optBS <- c(p12.optBS, rep(0, normSize-length(p12.optBS)))

  # Do entropy metric for patient1
  if (length(p1.optBS)==1) {
    p1.k <- sum(p1.optBS)
    p1.e = log2(length(igraphObjectG)) + iteratedLogarithm(length(p1.optBS)) + log2(choose(length(igraphObjectG)-p1.k, length(p1.sig.nodes)-p1.k))
  } else {
    p1.k <- sum(p1.optBS)
    p1.e = log2(length(igraphObjectG)) + iteratedLogarithm(length(p1.optBS)) + 1 + (length(p1.optBS)-1)*entropyFunction(p1.optBS[2:length(p1.optBS)]) + log2(choose(length(igraphObjectG)-p1.k, length(p1.sig.nodes)-p1.k))
  }

  # Do entropy Metric for patient2
  if (length(p2.optBS)==1) {
    p2.k <- sum(p2.optBS)
    p2.e = log2(length(igraphObjectG)) + iteratedLogarithm(length(p2.optBS)) + log2(choose(length(igraphObjectG)-p2.k, length(p2.sig.nodes)-p2.k))
  } else {
    p2.k <- sum(p2.optBS)
    p2.e = log2(length(igraphObjectG)) + iteratedLogarithm(length(p2.optBS)) + 1 + (length(p2.optBS)-1)*entropyFunction(p2.optBS[2:length(p2.optBS)]) + log2(choose(length(igraphObjectG)-p2.k, length(p2.sig.nodes)-p2.k))
  }

  # Do entropy metric for union of patient1 and patient2
  if (length(p12.optBS)==1) {
    p12.k <- sum(p12.optBS)
    p12.e = log2(length(igraphObjectG)) + iteratedLogarithm(length(p12.optBS)) + log2(choose(length(igraphObjectG)-p12.k, length(p12.sig.nodes)-p12.k))
  } else {
    p12.k <- sum(p12.optBS)
    p12.e = log2(length(igraphObjectG)) + iteratedLogarithm(length(p12.optBS)) + 1 + (length(p12.optBS)-1)*entropyFunction(p12.optBS[2:length(p12.optBS)]) + log2(choose(length(igraphObjectG)-p12.k, length(p12.sig.nodes)-p12.k))
  }

  # Universal distance metric.
  return(1-((p1.e+p2.e-p12.e)/p12.e))
}