#' Clique Sum Exact
#'
#' An implementation of the clique-based disease module inference method proposed by Barrenäs et al.
#' @inheritParams clique_sum_permutation
#' @param db A clique database created by \code{\link{build_clique_db}}
#' @param clique_significance p-value for cliques to be considered significant
#' @param deg_cutoff p-value cutoff for differentialy expressed genes
#' @param min_clique_size Minimal size for cliques
#' @param min_deg_in_clique Minimum number of DEGs to be present in a clique
#' @param multiple_cores Parallel process using multiple cores?
#' @param n_cores Number of cores to use
#' @param dataset_name Optional name for the input object that will be stored in the settings object.
#' Default is the variable name of the input object
#' @return clique_sum_exact returns an object of class "MODifieR_module" with subclass "Clique_Sum_exact".
#' This object is a named list containing the following components:
#' \item{module_genes}{A character vector containing the genes in the final module}
#' \item{settings}{A named list containing the parameters used in generating the object}
#' @details
#' Clique_sum_exact finds cliques of at least size \code{min_clique_size} that are
#' significantly enriched with DEGs.
#' The union of maximal cliques with a Fisher-exact test p-value below \code{clique_significance} and at
#' least \code{min_deg_in_clique} is the final disease module.
#' @references
#' \cite{Barrenäs F, Chavali S, Alves AC, et al. Highly interconnected genes in
#' disease-specific networks are enriched for disease-associated polymorphisms.
#' Genome Biology. 2012;13(6):R46. doi:10.1186/gb-2012-13-6-r46.}
#'
#' @author Dirk de Weerd
#' @export
clique_sum_exact <- function(MODifieR_input, db, clique_significance = 0.01,
deg_cutoff = 0.05, min_clique_size = 5, min_deg_in_clique = 3,
multiple_cores = T, n_cores = 4, dataset_name = NULL){
# Retrieve settings
evaluated_args <- c(as.list(environment()))
settings <- as.list(stackoverflow::match.call.defaults()[-1])
replace_args <- names(settings)[!names(settings) %in% unevaluated_args]
for (argument in replace_args) {
settings[[which(names(settings) == argument)]] <- evaluated_args[[which(names(evaluated_args) ==
argument)]]
}
#Validate the input parameters
check_diff_genes(MODifieR_input, deg_cutoff = deg_cutoff)
validate_inputs(settings)
if (!is.null(dataset_name)){
settings$MODifieR_input <- dataset_name
}
con <- connect_db(db)
unique_genes <- unname(unlist(RSQLite::dbGetQuery(con, "SELECT * FROM unique_genes")))
genes <- MODifieR_input$diff_genes
#Convert 2 column dataframe to named vector, names will be the gene names, values will be the p value
genes <- -log10(dataframe_to_vector(as.data.frame(genes)))
genes <- genes[names(genes) %in% unique_genes]
deg_genes <- genes[genes > -log10(deg_cutoff)]
non_deg_genes <- genes[genes < -log10(deg_cutoff)]
#Fix tables
tables <- RSQLite::dbListTables(con)
chunk_table <- tables[grep(pattern = "chunk_summary", x = tables)]
tables <- tables[-grep(pattern = "chunk_summary", x = tables)]
tables <- tables[-grep(pattern = "unique_genes", x = tables)]
table_numbers <- as.numeric(gsub(pattern = "clique", replacement = "", x = tables))
tables <- tables[as.numeric(gsub(pattern = "clique", replacement = "", x = tables)) >= min_clique_size]
tables <- tables[order(as.numeric(gsub(pattern = "clique", replacement = "", x = tables)))]
table_sizes <- as.numeric(sub("clique", "", tables))
cutoff_per_size <- get_cutoffs(min_clique_size = min_clique_size,
max_table_size = max(table_sizes),
total_n_genes = length(genes),
deg_genes = deg_genes,
unsig = length(non_deg_genes),
clique_significance = clique_significance,
min_deg_in_clique = min_deg_in_clique)
chunk_table <- get_chunk_table(con = con,
deg_genes = deg_genes,
cutoff_per_size = cutoff_per_size,
min_clique_size = min_clique_size,
genes = genes,
min_deg_in_clique = min_deg_in_clique)
if (is.null(chunk_table) | length(chunk_table) == 0){
module_genes <- NULL
}else{
if (multiple_cores == T){
cl <- parallel::makeCluster(n_cores)
doParallel::registerDoParallel(cl)
parallel::clusterCall(cl, function(x) .libPaths(x), .libPaths())
module_genes <- foreach(i = 1:length(chunk_table),.combine = 'append', .packages = "MODifieRDev" ) %dopar% {
clique_sum_core_exact(chunk_row = chunk_table[[i]], db = db, deg_genes = deg_genes,
cutoff_per_size = cutoff_per_size,
genes = genes,
min_clique_size = min_clique_size)
}
module_genes <- unique(unlist(module_genes))
parallel::stopCluster(cl) # stop the cluster
}else{
#Convert list to table
chunk_table <- do.call(rbind, chunk_table)
module_genes <- unique(unlist(apply(X = chunk_table, 1, clique_sum_core_exact,
db = db, deg_genes = deg_genes, genes = genes, cutoff_per_size = cutoff_per_size, min_clique_size = min_clique_size)))
}
}
#Construct list and give it the correct class
new_clique_sum_module <- list("module_genes" = module_genes,
"settings" = settings)
class(new_clique_sum_module) <- c("MODifieR_module", "Clique_Sum_exact", class(MODifieR_input)[3])
RSQLite::dbDisconnect(conn = con)
return(new_clique_sum_module)
}
#Create a set of 4 column tables for all clique sizes.
#Column 1: Number of DEGs in clique
#Column 2: Number of DEGs not in clique
#Column 3: Number of unsignificant genes in clique
#Column 4: Number of unsignificant genes not in clique
#The tables are used to calculate the minimum number of DEGs needed
#for every clique size to achieve significance
create_fisher_tables <- function(clique_size, total_n_genes, n_sig, unsig){
min_size <- min(c(clique_size, n_sig))
fisher_table <- cbind(1:min_size, clique_size - 1:min_size, n_sig - 1:min_size, unsig - clique_size + 1:min_size)
fisher_table[which(fisher_table < 0)] <- 0
return(fisher_table)
}
#Evaluate a 'Fisher table' and return the minimum number of DEGs needed to achieve significance.
#If this is less than min_deg_in_clique, return min_deg_in_clique
evaluate_table <- function(fisher_table, clique_significance, min_deg_in_clique){
for (i in 1:nrow(fisher_table)){
if (MODifieRDev:::fisher_pval(clique_row = fisher_table[i,]) <= clique_significance)
return(max(i, min_deg_in_clique))
}
return (100000)
}
#Function that determines if there are potentially significant genes in a clique
get_chunks <- function(result_row, cutoff_per_size, deg_genes, genes, min_clique_size, min_deg_in_clique){
#Initialize variable 'cutoff' for current chunk
deg_cutoff <- NULL
#Get the row genes
row_genes <- unname(unlist(strsplit(x = as.character(result_row[4]), split = " ")))
#Get the number of genes in the chunk that are actually present in the data
n_p_genes <- sum(names(genes) %in% row_genes)
#Quantifying the number of missing genes, i.e. genes that are present in the chunk, but
#not in the data
n_missing_genes <- length(row_genes) - n_p_genes
#number of missing genes will say something about the potential clique size. For example,
#if clique size is 65, and there are 55 genes missing, that means there are potential size 10 cliques.
smallest_clique <- unlist(result_row[3] - n_missing_genes)
#If the number of missing genes is equal or more than the clique size, assumption is that there are
#potential cliques of size min_clique_size cliques DEG cutoffs should be set accordingly
if (smallest_clique <= min_clique_size){
deg_cutoff <- cutoff_per_size[[1]]
#If the number of missing genes is subtracted from the clique size and there is still a minimum clique left
#that is bigger than min_clique_size, assumption is that there is a potential clique of size
# clique_size - missing genes. DEG cutoffs should be set accordingly
}else{
deg_cutoff <- cutoff_per_size[as.character(smallest_clique)]
}
#Count the number of DEGs present in the chunk
n_deg_genes <- sum(names(deg_genes) %in% row_genes)
#If the number of DEGs in the chunk is larger than the cutoff, there are potentially significant cliques.
#Return the clique_size (that is the table in the DB), the start and stop of the chunk.
#Later this can be converted in a query. If the number of DEGs is lower than the cutoff, do nothing.
if (n_deg_genes >= deg_cutoff){
return (c(paste("clique", result_row[3], sep = ""), result_row[1], result_row[2], deg_cutoff))
}
}
get_chunk_table <- function(con, deg_genes, cutoff_per_size, min_clique_size, genes, min_deg_in_clique){
#Select all chunk summaries where minimum clique size is at least min_clique_size
query <- sprintf("SELECT * FROM chunk_summary WHERE clique_size >= %s ORDER BY clique_size", min_clique_size)
result_table <- RSQLite::dbGetQuery(con, query)
#Get a list of chunks with potentially significant genes
chunk_list <- plyr::compact(plyr::alply(result_table, 1, get_chunks,
cutoff_per_size = cutoff_per_size,
deg_genes = deg_genes,
genes = genes,
min_clique_size = min_clique_size,
min_deg_in_clique = min_deg_in_clique))
return (chunk_list)
}
#Core algorithm, recieves a 'chunk_row' vector containing a:
#clique table [1]
#start coordinate [2]
#stop coordinate [3]
#A database query is constructed with these values. Executing the query will give a set of cliques
#that will be checked for significance. Return a unique set of genes from the siginificant cliques
clique_sum_core_exact <- function(chunk_row, db, deg_genes, genes, cutoff_per_size, min_clique_size){
con <- connect_db(db)
query <- sprintf("SELECT * FROM %s WHERE rowid BETWEEN %s AND %s", chunk_row[1], chunk_row[2], chunk_row[3])
result <- RSQLite::dbGetQuery(con, query)
#Make a 2 column matrix with:
#column 1: Number of genes present in the input dataset
#column 2: Number of DEGs in these genes
length_n_degs <- cbind(apply(X = result, MARGIN = 1, FUN = function(x)sum(as.character(x) %in% names(genes))),
apply(X = result, MARGIN = 1, FUN = function(x)sum(as.character(x) %in% names(deg_genes))))
#Check for each row if the clique is significant. Return type is a logical vector
significant_rows <- apply(X = length_n_degs, MARGIN = 1, FUN = get_significant_rows,
cutoff_per_size = cutoff_per_size, min_clique_size = min_clique_size)
#Subset the result with the logical vector
module_genes <- result[significant_rows, ]
#Return the unique genes in vector format
RSQLite::dbDisconnect(conn = con)
return (unique(unlist(module_genes)))
}
get_significant_rows <- function(length_deg_row, cutoff_per_size, min_clique_size){
#If the number of genes in the clique is the same or higher than min_clique_size
if (length_deg_row[1] >= min_clique_size){
#If the number of DEGs is sufficient to make clique significant return TRUE
if (length_deg_row[2] >= cutoff_per_size[as.character(length_deg_row[1])]){
return(T)
#Else return F
}else{
return(F)
}
}else{
return(F)
}
}
#Retrieve for every possible clique size (min_clique_size to size of largest clique) the number of genes that is needed
#to get a p-value under clique_significance
get_cutoffs <- function(min_clique_size, max_table_size, total_n_genes, deg_genes, unsig, clique_significance, min_deg_in_clique){
fisher_tables <- lapply(X = min_clique_size:max_table_size, create_fisher_tables,
total_n_genes = total_n_genes,
n_sig = length(deg_genes),
unsig = unsig)
#The names of the list are the sizes of the cliques; the values are the number of DEGs needed to acieve significance
cutoff_per_size <- lapply(X = fisher_tables, FUN = evaluate_table, clique_significance = clique_significance, min_deg_in_clique)
names(cutoff_per_size) <- paste0(min_clique_size:max_table_size)
#If the size is NULL there is no number of DEGs possible to achieve significance, set to arbitrary high number instead
cutoff_per_size[sapply(cutoff_per_size, is.null)] <- 10000
return(unlist(cutoff_per_size))
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.