R/calcSimilarity.R

Defines functions heatmapSimilarity NthMin ClustFunction PermutedVar ModelPvalue SimulatedVarPValue SimulatedPValueAbs

########
 # _7: Each model is compared with permutations of random data
 # Two new functions ClustFunction and ModelPValue
 # Assigns clusters based on p values
 # should work with all mean, min, z score type assignments as p values are related
 # A separate funtion to change the assignment type. Change ClustFunction

heatmapSimilarity = function(data.reference, data.simulation, cluster.cut = NULL, n.clusters = 3,
                             p.value=0.05, permuted.var, permutations = 1000,
                             cor.method = "spearman", cluster.method = "ward.D2", method = "pvalue", buffer = 0.0001, permutation.method = "simulation", return.data = FALSE) {
  #' Calculates the similarity between two gene expression data
  #'
  #'  Comparison is done across columns, i.e., how similar are the columns in the two dataset.
  #' For gene expression data, format data so that gene names are in rows and samples in columns.
  #' @param data.reference The reference data matrix, for example, the experimental gene expression values
  #' @param data.simulation The data matrix to be compared.
  #' @param n.clusters (optional) The number of clusters in which the reference data should be clustered for comparison.
  #' Not needed if cluster.cut is provided
  #' @param p.value (optional) p-value to consider two gene expression sets as belonging to same cluster.
  #' Ward's method with spearman correlation is used to determine if a model belongs to a specific cluster.
  #' @param permuted.var (optional) Similarity scores computed after permutations.
  #' @param cluster.cut (optional) Clsuter numbers assigned to reference data.
  #'  If cluster.cut is missing, hierarchical clustering using /code{ward.D2}
  #'  and /code{distance  = (1-cor(x, method = "spear"))/2} will be used to cluster the reference data.
  #'  @param cluster.method (optional) Clustering method to be used to cluster the experimental data
  #'  @param cor.method (optional) Correlation method. Default method is "spearman". For single cell data, use "kendall"
  #'  @param permutation.method "sample" or "reference"
  #' @return A list containing the KL distance of new cluster distribution from reference data and
  #' the probability of each cluster in the reference and simulated data.
  #'
  #'


  message("Calculating the similarity index")
  #  n.clusters = 3
  n.models <- dim(data.reference)[2]
  n.models.KO <- dim(data.simulation)[2]

  if (missing(permutations)) {
    permutations = 1000
  }

  if (missing(cor.method)) {
    cor.method <- "spearman"
  }

  ref.cor <- cor((data.reference), method = cor.method)

  if (missing(cluster.cut)) {
    if(missing(n.clusters)){
      stop("Please specify the number of clusters using n.clusters or
           cluster assignments using cluster.cut")
    }

    # cluster the reference data if the clutering assignments has not been provided.
    distance <- as.dist((1-ref.cor)/2)
    clusters <- hclust(distance, method = cluster.method)
    #plot(clusters)
    cluster.cut <- cutree(clusters, n.clusters)

    } else {
      if(!missing(n.clusters)){
        warnings("Neglecting n.clusters. The number of clusters will be determined from cluster.cut.")
      }
      n.clusters <- length(unique(cluster.cut))
    }

  # find the variance within each cluster
  #TO DO Will standard deviation be better? shouldn't be with ward method.

  ref.cluster.var <- c(rep(0,n.clusters))
  for(j in 1:n.clusters)
  {
  #  print(j)
    temp.cluster.var <- (((1 - ref.cor[which(cluster.cut==j), which(cluster.cut==j)])/2)^2)
    ref.cluster.var[j] <- ClustFunction(temp.cluster.var[upper.tri(temp.cluster.var, diag = FALSE)])
    temp.cluster.var <- NULL
  }


  #  cluster.cut <- cluster.cut[1:10]
  #  data.reference <- data.reference[,1:10]
  simulated.ref.cor <- t(cor(data.reference, data.simulation, method = cor.method))

  #clusterFreq <- table(CLUSTERCUT)/n_models

  if (sum(is.na(simulated.ref.cor)) > 0) {
    message("Error in correlation. Please verify the data")
  }

  simulated.cluster.var <- matrix(0, nrow=n.models.KO, ncol = n.clusters)

  for(i in 1:n.models.KO){
    for(j in 1:n.clusters)
    {
      temp.cluster.var <- ((1 - simulated.ref.cor[i, which(cluster.cut==j)])/2)^2
      simulated.cluster.var[i,j] <- ClustFunction(temp.cluster.var )
      temp.cluster.var <- NULL
    }
  }

  # testing clustering robustness
  # ref.cluster.var <- matrix(0, nrow = n.models, ncol =  n.clusters)
  # for(i in 1:n.models)
  # for(j in 1:n.clusters)
  # {
  #   ref.cluster.var[i,j] <- mean(((1 - ref.cor[i, which(cluster.cut==j)])/2)^2)
  # }

  if (method == "variance") {
    simulated.cluster <- matrix(0, nrow =  n.models.KO, ncol = 2)
    simulated.cluster[, 2] <- apply(simulated.cluster.var,1,min)
    # simulated.cluster.allowed <- simulated.cluster.var < ref.cluster.var
    simulated.cluster[, 1] <- apply(simulated.cluster.var,1,which.min)
    simulated.cluster[which(3*ref.cluster.var[simulated.cluster[,1]] < simulated.cluster[, 2]), 1] <- 0
    simulated.cluster <- simulated.cluster[,-2]

  }
  #  permutations = 1000
  if(missing(method)) {
    method = "pvalue"
  }
  if (method == "pvalue" ) {
    message("pvalue method")
    if(missing(permuted.var )) {
      if(permutation.method == "reference"){
      permuted.var <- PermutedVar(simulated.ref.cor, cluster.cut, permutations, ref.cluster.var)
      simulated.var.P.value <- SimulatedVarPValue(permuted.var, p.value)
      #rowSums(simulated.cluster.allowed)
      #simulated.cluster.var.sorted <- sort(simulated.cluster.var, index.return = TRUE )
      # simulated.cluster.allowed <- simulated.cluster.var < simulated.var.P.value
      simulated.cluster <- matrix(0, nrow =  n.models.KO, ncol = 2)
      simulated.cluster[, 2] <- apply(simulated.cluster.var,1,min)
      simulated.cluster[, 1] <- apply(simulated.cluster.var,1,which.min)
      simulated.cluster[which(simulated.var.P.value[simulated.cluster[,1]] < simulated.cluster[, 2]), 1] <- 0
      simulated.cluster <- simulated.cluster[,-2]

      } else {
        message("simulation permutation")

        p.value.mat <- ModelPvalue(data.simulation, data.reference, cluster.cut, permutations,
                                   ref.cluster.var, cor.method, simulated.cluster.var)
        simulated.cluster <- matrix(0, nrow =  n.models.KO, ncol = 2)
        simulated.cluster[, 2] <- apply(p.value.mat,1,min)
        simulated.cluster[, 1] <- apply(p.value.mat,1,which.min)
        simulated.cluster[which(simulated.cluster[,2] > p.value), 1] <- 0
        simulated.cluster <- simulated.cluster[,-2]

        }
    }


  }

  similarity <- list()
  cluster.names <- unique(cluster.cut)
  #print(c("Original Clusters", cluster.names))
  cluster.names <- c(0, cluster.names) #test

  bufferEnteriesPerCluster <- max(1,as.integer(buffer*n.models))
  cluster.cut.adjusted <- c(cluster.cut, rep(0,bufferEnteriesPerCluster))

  simulated.cluster.names <- unique(simulated.cluster)
 # print(c("Simulated Clusters", simulated.cluster.names))
  missing.ref.clusters <- setdiff(cluster.names, simulated.cluster.names)
  #print(c("Missing Clusters", missing.ref.clusters))
  bufferEnteriesPerCluster <- max(1,as.integer(buffer*n.models.KO))
  missing.ref.clusters.add <- numeric() #c(rep(0,bufferEnteriesPerCluster*length(missing.ref.clusters)))
  if (length(missing.ref.clusters) > 0) {
  for(i in 1:length(missing.ref.clusters))
  {
    missing.ref.clusters.add <- c(missing.ref.clusters.add, rep(missing.ref.clusters[i],bufferEnteriesPerCluster))
  }
  }
  simulated.cluster.adjusted <- c(simulated.cluster, missing.ref.clusters.add)




  ref.cluster.freq <- table(cluster.cut.adjusted)/(length(cluster.cut.adjusted))
  # similarity$ref.cluster.freq <- table(cluster.cut)/n.models
  similarity$ref.cluster.freq <- ref.cluster.freq

  simulated.cluster.freq <- table(simulated.cluster.adjusted)/length(simulated.cluster.adjusted)

  #similarity$simulated.cluster.freq <- table(simulated.cluster)/n.models.KO
  similarity$simulated.cluster.freq <- simulated.cluster.freq

  similarity$cluster.similarity <- simulated.cluster.freq*log(simulated.cluster.freq/ref.cluster.freq)
  similarity$KL <- sum(similarity$cluster.similarity )

  if(return.data){
  similarity$data.reference <- data.reference
  colnames(similarity$data.reference) <- cluster.cut
  similarity$data.reference <- similarity$data.reference[,order(colnames(similarity$data.reference))]


  similarity$data.simulation <- data.simulation[,which(simulated.cluster>0)]
  colnames(similarity$data.simulation) <- simulated.cluster[which(simulated.cluster>0)]
  similarity$data.simulation <- similarity$data.simulation[,order(colnames(similarity$data.simulation))]
  ref.sim.cor <- numeric()
  previous.cluster.size <- 0
  ref.sim.cor.ref <- numeric()
  previous.cluster.size.ref <- 0

  for(i in 1:(length(unique(colnames(similarity$data.simulation)))))
  {
    temp.ref <- similarity$data.reference[,which(colnames(similarity$data.reference)==i)]
    temp.sim <- similarity$data.simulation[,which(colnames(similarity$data.simulation)==i)]

    temp.ref.sim.cor <- cor(temp.ref,temp.sim, method = cor.method)
    ref.sim.cor <- c(ref.sim.cor,previous.cluster.size +
                       sort(colMeans(temp.ref.sim.cor), decreasing = T, index.return = T)$ix)
    previous.cluster.size <- previous.cluster.size + dim(temp.sim)[2]

    ref.sim.cor.ref <- c(ref.sim.cor.ref, previous.cluster.size.ref +
                       sort(rowMeans(temp.ref.sim.cor), decreasing = T, index.return = T)$ix)
    previous.cluster.size.ref <- previous.cluster.size.ref + dim(temp.ref)[2]


  }

  similarity$data.simulation <- similarity$data.simulation[,ref.sim.cor]
  tmp <- data.simulation[,which(simulated.cluster == 0)]
  colnames(tmp) <- rep(0, dim(tmp)[2])

  similarity$data.simulation <- cbind(similarity$data.simulation[,ref.sim.cor], tmp)

  similarity$data.reference <- similarity$data.reference[,ref.sim.cor.ref]

  #TO DO : This invovlves repeat calculation of cor--can be optimized
  similarity$simulated.ref.cor <- t(cor(similarity$data.reference, similarity$data.simulation, method = cor.method))
}
  #image(similarity$simulated.ref.cor, col = plot_color)
  return(similarity)
}

#########################################################
# Helper functions
#########################################################

NthMin <- function(x,index) {
  #' Find nth minimum value from a vector
  #' @param x the given unsorted vector
  #' @param index
  #' @return the nth minimum element of the vector
  #'
  return (sort(x, decreasing = FALSE, partial = index)[index])

}

#############################################

ClustFunction <- function(x){
  #return (mean(x))
  return (min(x))
}

#############################################

PermutedVar <- function(simulated.ref.cor, cluster.cut, permutations, ref.cluster.var){
  #' Returns the variance array after permutations
  #' @param simulated.ref.cor Correlation matrix of simulated and reference data
  #' @param cluster.cut The original cluster assignments
  #' @param permutations The number of permutations
  #' @return An array of dimension n.models by n.clusters by permutations
  #'
  n.clusters <- length(unique(cluster.cut))
  n.models.KO <- dim(simulated.ref.cor)[1]
  permuted.var <- array(0, c(n.models.KO, n.clusters, permutations))
  for(k in 1:permutations){
    cluster.cut.permuted <- sample(cluster.cut)
    for(j in 1:n.clusters)
    {
      cor.mat <- simulated.ref.cor[,which(cluster.cut.permuted == j)]
      for(i in 1:n.models.KO){
        temp.cluster.var <- (((1-cor.mat[i, ])/2)^2)
        permuted.var[i,j,k] <- (mean(temp.cluster.var)/ (ref.cluster.var[j]))
      }
    }
  }
  return(permuted.var)
}
#############################################

ModelPvalue <- function(data.simulation, data.reference, cluster.cut, permutations,
                        ref.cluster.var, cor.method, simulated.cluster.var){
  #' Returns the variance array after permutations
  #' @param data.simulation
  #' @param data.reference
  #' @param cluster.cut The original cluster assignments
  #' @param permutations The number of permutations
  #' @param ref.cluster.var SD of the clusters
  #' @return An array of dimension n.models by n.clusters by permutations
  #'
  n.clusters <- length(unique(cluster.cut))
  n.models.KO <- dim(data.simulation)[2]
  n.gene <- dim(data.simulation)[1]
  p.value.mat <- matrix(0, nrow = n.models.KO, ncol = n.clusters)
  random.models <-  matrix(rep(seq(1:n.gene),permutations), n.gene, permutations)
  random.models <- apply(random.models,2,sample)
  #random.models <- matrix(0, nrow = n.gene, ncol = permutations)
  #random.models <- t(apply(data.simulation,1,function(x)  sample(x, replace = TRUE, size = permutations)))
  permuted.ref.cor <- matrix(0,nrow = permutations, ncol = dim(data.reference)[2])
  permuted.ref.cor <- cor(random.models, data.reference, method = cor.method)
  for(j in 1:n.clusters)
  {
    dist.mat <- ((1 - permuted.ref.cor[,which(cluster.cut == j)])/2)^2
    temp.vector <- sort(apply(dist.mat,1,ClustFunction))
    for (i in 1:n.models.KO) {
      p.value.mat[i,j] <- (which(abs(temp.vector - simulated.cluster.var[i,j])==min(abs(temp.vector - simulated.cluster.var[i,j])))[1] - 1)/permutations
      #[1] as sometimes which() might satisfy for multiple values
    }

  }

  return(p.value.mat)
}

#############################################

SimulatedVarPValue <- function(permuted.var, p.value){
  #' Finds the variance corresponding to a given value
  #' @param permuted.var An array containing the distance of clusters for each model for every permutation
  #' @param p.value
  #' @return p-values for each model
  #'
  permutations <- dim(permuted.var)[3]
  n.models.KO <- dim(permuted.var)[1]
  n.clusters <- dim(permuted.var)[2]
  selected.index = as.integer(permutations*p.value)

  if(selected.index==0) {stop("Number of permutations is not sufficient to achieve the required p.value.
                              Please increase the permutations")}
  #print(selected.index)
  simulated.var.P.value <- matrix(0, nrow=n.models.KO, ncol = n.clusters)

  for (i in 1:n.models.KO) {
    for(j in 1:n.clusters)    {
      simulated.var.P.value[i,j] <- NthMin(permuted.var[i,j,],selected.index)
    }
  }

  return(simulated.var.P.value)
  }
#############################################
 #' Returns absolute P value
SimulatedPValueAbs <- function(permuted.var, simulated.cluster.var){
  #' Finds the variance corresponding to a given value
  #' @param permuted.var An array containing the distance of clusters for each model for every permutation
  #' @param p.value
  #' @return p-values for each model
  #'
  permutations <- dim(permuted.var)[3]
  n.models.KO <- dim(permuted.var)[1]
  n.clusters <- dim(permuted.var)[2]

  simulated.var.P.value <- matrix(0, nrow=n.models.KO, ncol = n.clusters)

  for (i in 1:n.models.KO) {
    for(j in 1:n.clusters)    {
      temp.vector <- sort(permuted.var[i,j,])

      simulated.var.P.value[i,j] <- which(abs(temp.vector - simulated.cluster.var[i,j])==min(abs(temp.vector - simulated.cluster.var[i,j])))/permutations
    }
  }

  return(simulated.var.P.value)
}
TheJacksonLaboratory/GeneVyuha documentation built on May 5, 2019, 2:44 a.m.