R/correlation_clique.R

Defines functions correlation_clique perform_iterations fisher_pval calculate_correlation correlation_adjust_cutoff correlation_set_module_size construct_correlation_module create_clique_file process_clique entrez_to_index index_to_entrez get_clique_cutoffs check_significance scale_observations

Documented in correlation_adjust_cutoff correlation_clique correlation_set_module_size

#' Clique_correlation
#'
#' A clique based method to find a disease module from correlated gene expression
#'
#' @inheritParams clique_sum_exact
#' @inheritParams build_clique_db
#' @param iteration Number of iterations to be performed
#' @param fraction_of_interactions Fraction of interactions from the original network that will be used
#' in each iteration
#' @param frequency_cutoff Fraction of the number of times a gene should be present in it 
#' iterations. Default is 0.5, meaning 50 procent of all iterations 
#' @param clique_significance Cutoff for Fisher exact test for cliques
#' @param multiple_cores parallelize iterations using number of cores on system -1?
#' @param tempfolder Folder where temporary files are stored
#' @details 
#' 
#' The correlation clique is a clique-based algorithm using consensus clustering.
#' The algorithm starts with calculating a correlation score between each interaction in the PPi network.
#' The correlation score is obtained by subtracting the Pearson correlation p-value:
#' \deqn{correlation score = 1 - Pearson p-value}
#' Subsequently, the correlation score is multiplied by 
#' the correlation confidence and scaled with the scale factor to get the edge score:
#' \deqn{edge score = \sqrt(correlation score * confidence score  ) * scale_factor}
#' When the edge scores are calculated the iterative part of the algorithm commences:
#' All edge scores are compared to random variables from the uniform distribution between (0,1)
#' only interactions where the edge score is higher than the random variable are used to construct
#' a new PPi network. Then, maximal cliques are inferred from this new network.
#' The cliques are tested for significant enrichment of DEGs by Fisher's exact test and
#' the union of significant cliques is the disease module for this iteration
#' The final disease module will consist of genes that have been present in at least \code{frequency_cutoff} iterations
#' 
#' @return 
#' correlation_clique returns an object of class "MODifieR_module" with subclass "Correlation_clique". 
#' This object is a named list containing the following components:
#' \item{module_genes}{A character vector containing the genes in the final module}
#' \item{frequency_table}{A table containing the fraction of times the genes were present in an iteration module}
#' \item{settings}{A named list containing the parameters used in generating the object}
#' 
#' @author Dirk de Weerd
#' 
#' @export
correlation_clique <- function(MODifieR_input, ppi_network,
                               frequency_cutoff = .5, 
                               fraction_of_interactions = 0.4,
                               iteration = 50, clique_significance = 0.01, 
                               deg_cutoff = 0.05, multiple_cores = F, n_cores = 3, 
                               dataset_name = NULL){
  
  # Retrieve settings
  evaluated_args <- c(as.list(environment()))
  settings <- as.list(stackoverflow::match.call.defaults()[-1])
  replace_args <- names(settings)[!names(settings) %in% unevaluated_args]
  for (argument in replace_args) {
    settings[[which(names(settings) == argument)]] <- evaluated_args[[which(names(evaluated_args) == 
                                                                              argument)]]
  }
  
  #Validate the input parameters
  check_diff_genes(MODifieR_input, deg_cutoff = deg_cutoff)
  check_expression_matrix(MODifieR_input)
  ppi_network <- validate_ppi(ppi_network)
  validate_inputs(settings)
  
  if (ncol(ppi_network) == 2){
    warning("Score column in PPi network missing. Setting the score to 1000 (highest confidence ) for all interactions")
    ppi_network <- cbind(ppi_network, 1000)
  }
  if (!is.null(dataset_name)){
    settings$MODifieR_input <- dataset_name
  }
  MODifieR_input$diff_genes <- MODifieR_input$diff_genes[MODifieR_input$diff_genes$gene %in% rownames(MODifieR_input$annotated_exprs_matrix), ]
  
  #Get DEGs
  pValueMatrix <- MODifieR_input$diff_genes
  pValueMatrix <- stats::na.omit(pValueMatrix)
  #Remove genes from PPi that are not in input
  overlap <- ppi_network[,1] %in% pValueMatrix[,1] & ppi_network[,1] %in% rownames(MODifieR_input$annotated_exprs_matrix)
  ppi_network <- ppi_network[overlap,]
  overlap <- ppi_network[,2] %in% pValueMatrix[,1] & ppi_network[,2] %in% rownames(MODifieR_input$annotated_exprs_matrix)
  ppi_network <- ppi_network[overlap,]
  ppi_network <- ppi_network[!duplicated(data.frame(t(apply(ppi_network[1:2], 1, sort)), ppi_network[, 3])), ]
  
  ppi_network[,3] <- ppi_network[,3] / 1000
  if (nrow(ppi_network) == 0){
    stop("No shared genes between ppi_network, diff_genes and annotated_exprs_matrix", call. = F)
  }
  #Calculates (1- p value) for all relevant connections in the PPi network, and is stored in column 1.
  #Column 2 will contain the PPi score from the PPi network
  corrPvalues <- cbind(apply(X = ppi_network, MARGIN = 1, FUN = calculate_correlation, 
                             expression_matrix = MODifieR_input$annotated_exprs_matrix),
                       ppi_network[,3])
  #Calculates square root of p-value * PPi score. This is then scaled by probabilityScaleFactor. 
  pval_score <- apply(X = corrPvalues, MARGIN = 1, FUN = function(x){sqrt(x[1] * x[2])})
  
  pval_score <- scale_observations(sf = fraction_of_interactions, scores = pval_score)
  
  genes <- dataframe_to_vector(as.data.frame(pValueMatrix))
  
  genes <- genes[names(genes) %in% unique(unlist(ppi_network[ ,1:2]))]
  
  module_list <- list()
  if (multiple_cores){
    cl <- parallel::makeCluster(n_cores)
    doParallel::registerDoParallel(cl)
    parallel::clusterCall(cl, function(x) .libPaths(x), .libPaths())
    #module_list <- foreach(i=1:iteration, .combine = list, .export = ls(.GlobalEnv)) %do% {
    #Compare computed scores to random scores
    
    module_list <- parLapply(cl = cl, X = 1:iteration, fun = perform_iterations, pval_score, ppi_network, genes, clique_significance, deg_cutoff)
    parallel::stopCluster(cl) # stop the cluster
    
    #}
    
  }else{
    #For every iteration:
    for (i in 1:iteration){
      #Compare computed scores to random scores 
      
      
      module_list[[i]] <- perform_iterations(iteration_n = i, pval_score = pval_score, ppi_network = ppi_network, genes = genes, clique_significance = clique_significance, deg_cutoff = deg_cutoff)
    }
  }
  #Get the frequency of each gene that has been present in a module, expressed in fraction
  tabled_frequencies <- table(unlist(module_list)) / iteration
  #The resulting module is going to consist of all genes that have are seen more frequently in the iterations
  #than the frequency cutoff (default 0.5, so present in 50% of all iterations) 
  module_genes <- names(tabled_frequencies[tabled_frequencies >= frequency_cutoff])
  
  new_correlation_clique_module <- construct_correlation_module(module_genes = module_genes,
                                                                frequency_table = tabled_frequencies,
                                                                input_class = class(MODifieR_input)[3],
                                                                settings = settings)
  
  return( new_correlation_clique_module)
}


perform_iterations <- function(iteration_n, pval_score, ppi_network, genes, clique_significance, deg_cutoff){
  
  to_pass <- pval_score > runif(n = length(pval_score))
  
  clique_file_list <- create_clique_file(adjacency_matrix = ppi_network[to_pass, ], genes = genes) 
  
  module_list <- process_clique(clique_file = clique_file_list$tempfile, genes = clique_file_list$tr_genes, 
                                graphed_ppi = clique_file_list$graphed_ppi, clique_significance = clique_significance,
                                deg_cutoff = deg_cutoff, name_table = clique_file_list$name_table)
}
#Pval Helper function
fisher_pval <- function(clique_row){
  stats::fisher.test(x = matrix(data = clique_row, nrow = 2, byrow = T), alternative = "g")$p.value
}
#This function takes as the first argument a row from a PPi network, retrieves the corresponding row and column
#by row and column name and computes (1- correlation P value). 
calculate_correlation <- function(row, expression_matrix){
  row <- as.character(row)
  
  x_y <- c(which(rownames(expression_matrix) == row[1]), which(rownames(expression_matrix) == row[2]))
  
  1 - round(stats::cor.test(x = expression_matrix[x_y[1],], y = expression_matrix[x_y[2],])$p.value, 3)
}

#' correlation_adjust_cutoff
#' 
#' @inheritParams correlation_clique
#' @param correlation_module Module object that has been produced by \code{correlation_clique}  function
#' @details 
#' 
#' This function allows to adjust the frequency cutoff for a \code{correlation_clique} module object
#' @return 
#' 
#' 
#'  \code{correlation_clique} module object 
#' @seealso 
#' 
#' \code{\link{correlation_clique}}
#' 
#' @author Dirk de Weerd
#' 
#' @export
correlation_adjust_cutoff <- function(frequency_cutoff, correlation_module){
  
  frequency_table <- correlation_module$frequency_table
  
  module_genes <- names(frequency_table[frequency_table >= frequency_cutoff])
  
  correlation_module$settings$frequency_cutoff <- frequency_cutoff
  
  new_correlation_clique_module <- construct_correlation_module(module_genes = module_genes,
                                                                frequency_table = frequency_table,
                                                                input_class = class(correlation_module)[3],
                                                                settings = correlation_module$settings)
  return( new_correlation_clique_module)
  
}

#' correlation_set_module_size
#' 
#' Returns a correlation_clique module closest to \code{size}
#' 
#' @inheritParams correlation_adjust_cutoff
#' @param size Module object that has been produced by \code{correlation_clique}
#'  function
#' @details 
#' The function will find the the frequency cutoff for that will result
#' in a \code{correlation_clique} module object closest to \code{size}
#' @return 
#'  \code{correlation_clique} module object 
#' @seealso 
#' 
#' \code{\link{correlation_clique}}
#' 
#' @author Dirk de Weerd
#' 
#' @export
correlation_set_module_size <- function(size, correlation_module){
  frequency_cutoff <- as.numeric(names(which.min(abs(size - table(correlation_module$frequency_table)))))
  
  new_correlation_clique_module <- correlation_adjust_cutoff(frequency_cutoff = frequency_cutoff,
                                                             correlation_module = correlation_module)
  
  return (new_correlation_clique_module)
}

construct_correlation_module <- function(module_genes, frequency_table, input_class, settings){
  
  new_correlation_clique_module <- list("module_genes" = module_genes,
                                        "frequency_table" = frequency_table,
                                        "settings" = settings)
  
  class( new_correlation_clique_module) <- c("MODifieR_module", "Correlation_clique", input_class)
  
  return( new_correlation_clique_module)
  
}

create_clique_file <- function(adjacency_matrix, genes){
  clique_file <- tempfile()
  
  g <- igraph::simplify(igraph::graph.data.frame(adjacency_matrix, directed = F))
  
  genes <- genes[names(genes) %in% V(g)$name]
  name_table <- cbind(V(g)$name, as.character(as.numeric(V(g)-1)))
  
  names(genes) <- sapply(X = names(genes), FUN = entrez_to_index, name_table = name_table)
  
  igraph::max_cliques(graph = g, file = clique_file)
  
  return(list(tempfile = clique_file, graphed_ppi = g, name_table = name_table, tr_genes = genes))
}


process_clique <- function(clique_file, genes, graphed_ppi, clique_significance, deg_cutoff, name_table) {
  
  deg_genes <- genes[genes < deg_cutoff]
  
  non_deg_genes <- genes[genes >= deg_cutoff]
  cutoff_per_size <- vector(mode = "numeric")
  con <- file(clique_file, "r")
  significant_cliques <- NULL
  deg_names <- names(deg_genes)
  while (TRUE) {
    line <- readLines(con, n = 100000)
    if ( length(line) == 0 ) {
      break
    }
    clique_genes <- sapply(X = line, FUN = strsplit, split = " ")
    
    size_and_deg <- unname(sapply(X = clique_genes, FUN = function(x) c(length(x), sum(x %in% deg_names))))
    
    clique_sizes <- unique(size_and_deg[1,])
    
    if (sum(is.na(cutoff_per_size[clique_sizes]) > 0)){
      indici <- clique_sizes[which(is.na(cutoff_per_size[clique_sizes]))]
      cutoff_per_size[indici] <- sapply(X = indici, FUN = get_clique_cutoffs, 
                                        deg_names = deg_names, 
                                        non_deg_genes = non_deg_genes,
                                        genes = genes,
                                        clique_significance)
    }
    
    significant_vector <- apply(X = size_and_deg, MARGIN = 2, FUN = check_significance, cutoff_per_size = cutoff_per_size)
    
    significant_cliques <- unique(c(significant_cliques, unlist(clique_genes[significant_vector])))
  }
  
  
  significant_cliques <- unname(unique(unlist(sapply(X = significant_cliques, 
                                                     FUN = index_to_entrez, 
                                                     name_table = name_table))))
  
  close(con)
  
  file.remove(clique_file)
  
  return (unique(significant_cliques))
  
}

entrez_to_index <- function(entrez_id, name_table){
  name_table[which(name_table[,1] == entrez_id), 2 ]
}
index_to_entrez <- function(gene_index, name_table){
  name_table[which(name_table[,2] == gene_index), 1 ]
}

get_clique_cutoffs <- function(clique_size, deg_names, non_deg_genes, genes, clique_significance){
  
  create_fisher_tables(clique_size = clique_size, 
                       total_n_genes = length(genes), 
                       n_sig = length(deg_names), 
                       unsig = length(non_deg_genes)) %>% 
    evaluate_table(fisher_table = ., 
                   clique_significance = clique_significance, 
                   min_deg_in_clique = 1)
}

check_significance <- function(clique, cutoff_per_size){
  if (clique[2] >= cutoff_per_size[clique[1]]){
    return (TRUE)
  }else{
    return(FALSE)
  }
}

scale_observations <- function(sf, scores){
  scores * (length(scores) * sf) / sum(scores)
}
ddeweerd/MODifieRDev documentation built on Nov. 12, 2019, 7:50 a.m.