R/gcn_inference.R

Defines functions get_edge_list get_neighbors module_enrichment enrichment_analysis ora get_hubs_gcn plot_gene_significance gene_significance plot_module_trait_cor module_trait_cor module_stability plot_dendro_and_colors plot_eigengene_network exp2gcn_blockwise exp2gcn SFT_fit

Documented in enrichment_analysis exp2gcn exp2gcn_blockwise gene_significance get_edge_list get_hubs_gcn get_neighbors module_enrichment module_stability module_trait_cor plot_dendro_and_colors plot_eigengene_network plot_gene_significance plot_module_trait_cor SFT_fit

#' Pick power to fit network to a scale-free topology
#'
#' @param exp A gene expression data frame with genes in row names and
#' samples in column names or a `SummarizedExperiment` object.
#' @param net_type Network type. One of 'signed', 'signed hybrid' or
#' 'unsigned'. Default is signed.
#' @param rsquared R squared cutoff. Default is 0.8.
#' @param cor_method Correlation method. One of "pearson", "biweight"
#' or "spearman". Default is "spearman".
#'
#' @return A list containing: \itemize{
#'   \item{power}{Optimal power based on scale-free topology fit}
#'   \item{plot}{A ggplot object displaying main statistics of the SFT fit test}
#' }
#'
#' @author Fabricio Almeida-Silva
#' @seealso
#'  \code{\link[WGCNA]{pickSoftThreshold}}
#' @rdname SFT_fit
#' @export
#' @importFrom WGCNA pickSoftThreshold
#' @importFrom ggplot2 theme element_text geom_hline
#' @examples
#' data(filt.se)
#' sft <- SFT_fit(filt.se, cor_method = "pearson")
SFT_fit <- function(exp, net_type = "signed", rsquared = 0.8,
                    cor_method = "spearman") {
    exp <- handleSE(exp)
    texp <- t(exp)

    if(cor_method == "pearson") {
        sft <- WGCNA::pickSoftThreshold(
            texp, networkType = net_type, powerVector = 3:20,
            RsquaredCut = rsquared
        )
    } else if(cor_method == "biweight") {
        sft <- WGCNA::pickSoftThreshold(
            texp, networkType = net_type, powerVector = 3:20,
            RsquaredCut = rsquared, corFnc = bicor,
            corOptions = list(use = 'p', maxPOutliers = 0.05)
        )
    } else if (cor_method == "spearman"){
        sft <- WGCNA::pickSoftThreshold(
            texp, networkType = net_type, powerVector=3:20,
            RsquaredCut = rsquared,
            corOptions = list(use = 'p', method = "spearman")
        )
    } else {
        stop("Please, specify a correlation method (one of 'spearman', 'pearson' or 'biweight').")
    }
    wgcna_power <- sft$powerEstimate
    if(is.na(wgcna_power)){
        message("No power reached R-squared cut-off, now choosing max R-squared based power")
        wgcna_power <- sft$fitIndices$Power[which(sft$fitIndices$SFT.R.sq == max(sft$fitIndices$SFT.R.sq))]
    }

    # Create data frame with indices to plot
    sft_df <- data.frame(
        power = sft$fitIndices$Power,
        fit = -sign(sft$fitIndices$slope) * sft$fitIndices$SFT.R.sq,
        meank = sft$fitIndices$mean.k.
    )
    sft_df <- sft_df[sft_df$fit > 0, ]

    # Plot 1
    p1 <- ggplot(sft_df, aes(x = .data$power, y = .data$fit)) +
        geom_point(color = "gray10") +
        ggrepel::geom_text_repel(aes(label = .data$power)) +
        labs(
            x = "Soft threshold (power)",
            y = expression(paste("Scale-free topology fit - ", R^{2})),
            title = "Scale independence"
        ) +
        theme_bw() +
        ylim(c(0,1)) +
        geom_hline(yintercept = rsquared, color = "brown3") +
        theme(legend.position = "none")

    # Plot 2
    p2 <- ggplot(sft_df, aes(x = .data$power, y = .data$meank)) +
        geom_point(color = "gray10") +
        ggrepel::geom_text_repel(aes(label = .data$power)) +
        labs(
            x = "Soft threshold (power)",
            y = "Mean connectivity (k)",
            title = "Mean connectivity"
        ) +
        theme_bw()

    # Combined plot
    sft_plot <- patchwork::wrap_plots(p1, p2, ncol = 2)
    result <- list(power = as.numeric(wgcna_power), plot = sft_plot)
    return(result)
}

#' Infer gene coexpression network from gene expression
#'
#' @param exp Either a `SummarizedExperiment` object, or a gene expression
#' matrix/data frame with genes in row names and samples in column names.
#' @param net_type Character indicating the type of network to infer.
#' One of 'signed', 'signed hybrid' or 'unsigned'. Default: 'signed'.
#' @param module_merging_threshold Numeric indicating the minimum correlation
#' threshold to merge similar modules into a single one. Default: 0.8.
#' @param SFTpower Numeric scalar indicating the value of the \eqn{\beta}
#' power to which correlation coefficients will be raised to ensure
#' scale-free topology fit. This value can be obtained with
#' the function \code{SFT_fit()}.
#' @param cor_method Character with correlation method to use.
#' One of "pearson", "biweight" or "spearman". Default: "spearman".
#' @param TOM_type Character specifying the method to use to calculate a
#' topological overlap matrix (TOM). If NULL, TOM type will be automatically
#' inferred from network type specified in \strong{net_type}. Default: NULL.
#' @param return_cormat Logical indicating whether the correlation matrix
#' should be returned. If TRUE (default), an element named `correlation_matrix`
#' containing the correlation matrix will be included in the result list.
#' @param verbose Logical indicating whether to display progress
#' messages or not. Default: FALSE.
#'
#' @return A list containing the following elements: \itemize{
#'   \item \emph{adjacency_matrix} Numeric matrix with network adjacencies.
#'   \item \emph{MEs} Data frame of module eigengenes, with samples
#'   in rows, and module eigengenes in columns.
#'   \item \emph{genes_and_modules} Data frame with columns 'Genes' (character)
#'   and 'Modules' (character) indicating the genes and the modules to
#'   which they belong.
#'   \item \emph{kIN} Data frame of degree centrality for each gene,
#'   with columns 'kTotal' (total degree), 'kWithin' (intramodular degree),
#'   'kOut' (extra-modular degree), and 'kDiff' (difference between
#'   the intra- and extra-modular degree).
#'   \item \emph{correlation_matrix} Numeric matrix with pairwise
#'   correlation coefficients between genes.
#'   If parameter \strong{return_cormat} is FALSE, this will be NULL.
#'   \item \emph{params} List with network inference parameters passed
#'   as input.
#'   \item \emph{dendro_plot_objects} List with objects to plot the dendrogram
#'   in \code{plot_dendro_and_colors}. Elements are named 'tree' (an hclust
#'   object with gene dendrogram), 'Unmerged' (character with per-gene module
#'   assignments before merging similar modules), and 'Merged' (character
#'   with per-gene module assignments after merging similar modules).
#' }
#' @author Fabricio Almeida-Silva
#' @rdname exp2gcn
#' @export
#' @importFrom WGCNA TOMsimilarity standardColors labels2colors
#' moduleEigengenes mergeCloseModules intramodularConnectivity
#' @importFrom dynamicTreeCut cutreeDynamicTree
#' @importFrom stats as.dist median cor fisher.test hclust na.omit prcomp
#' qnorm qqplot quantile sd var
#' @importFrom grDevices colorRampPalette
#' @examples
#' data(filt.se)
#' # The SFT fit was previously calculated and the optimal power was 16
#' gcn <- exp2gcn(exp = filt.se, SFTpower = 18, cor_method = "pearson")
exp2gcn <- function(
        exp, net_type = "signed", module_merging_threshold = 0.8,
        SFTpower = NULL, cor_method = "spearman", TOM_type = NULL,
        return_cormat = TRUE, verbose = FALSE
) {

    params <- list(
        net_type = net_type,
        module_merging_threshold = module_merging_threshold,
        SFTpower = SFTpower,
        cor_method = cor_method
    )
    exp <- handleSE(exp)

    if(is.null(SFTpower)) { stop("Please, specify the SFT power.") }

    # Calculate adjacency matrix
    if(verbose) { message("Calculating adjacency matrix...") }
    cor_matrix <- NULL
    if(return_cormat) {
        cor_matrix <- exp2cor(exp, cor_method = cor_method)
        adj_matrix <- cor2adj(cor_matrix, beta = SFTpower, net_type = net_type)
    } else {
        adj_matrix <- exp2cor(exp, cor_method = cor_method) |>
            cor2adj(beta = SFTpower, net_type = net_type)
    }

    # Calculate TOM from adjacency matrix
    if(verbose) { message("Calculating topological overlap matrix (TOM)...") }
    if(is.null(TOM_type)) { TOM_type <- guess_tom(net_type) }
    TOM <- WGCNA::TOMsimilarity(adj_matrix, TOMType = TOM_type)

    # Hierarchically cluster genes
    geneTree <- hclust(as.dist(1 - TOM), method = "average")
    geneTree$height <- round(geneTree$height, 7)

    # Detect coexpression modules
    if(verbose) { message("Detecting coexpression modules...") }
    original_mods <- dynamicTreeCut::cutreeDynamicTree(
        dendro = geneTree, minModuleSize = 30, deepSplit = TRUE
    )

    nmod <- length(unique(original_mods))
    palette <- rev(WGCNA::standardColors(nmod))
    original_colors <- WGCNA::labels2colors(original_mods, colorSeq = palette)

    # Calculate module eigengenes and pairwise distances between them
    if(verbose) { message("Calculating module eigengenes (MEs)...") }
    me_list <- WGCNA::moduleEigengenes(
        t(exp), colors = original_colors, softPower = SFTpower
    )
    me <- me_list$eigengenes
    original_metree <- hclust(
        as.dist(1 - cor(me, method = "spearman")), method = "average"
    )

    # Merge very similar modules
    if(verbose) { message("Merging similar modules...") }
    merged <- merge_modules(
        exp, original_colors, me, palette,
        dissimilarity = 1 - module_merging_threshold, cor_method = cor_method
    )
    new_colors <- merged$colors
    new_mes <- merged$newMEs

    # Get data frame with genes and modules they belong to
    genes_modules <- data.frame(Genes = rownames(exp), Modules = new_colors)

    if(verbose) { message("Calculating intramodular connectivity...") }
    kwithin <- WGCNA::intramodularConnectivity(adj_matrix, new_colors)

    result_list <- list(
        adjacency_matrix = adj_matrix,
        MEs = new_mes,
        genes_and_modules = genes_modules,
        kIN = kwithin,
        correlation_matrix = cor_matrix,
        params = params,
        dendro_plot_objects = list(
            tree = geneTree, Unmerged = original_colors, Merged = new_colors
        )
    )

    return(result_list)
}

#' Infer gene coexpression network from gene expression in a blockwise manner
#'
#' @param exp Either a `SummarizedExperiment` object, or a gene expression
#' matrix/data frame with genes in row names and samples in column names.
#' @param net_type Character indicating the type of network to infer.
#' One of 'signed', 'signed hybrid' or 'unsigned'. Default: 'signed'.
#' @param module_merging_threshold Numeric indicating the minimum correlation
#' threshold to merge similar modules into a single one. Default: 0.8.
#' @param SFTpower Numeric scalar indicating the value of the \eqn{\beta}
#' power to which correlation coefficients will be raised to ensure
#' scale-free topology fit. This value can be obtained with
#' the function \code{SFT_fit()}.
#' @param cor_method Character with correlation method to use.
#' One of "pearson" or "biweight". Default: "pearson".
#' @param TOM_type Character specifying the method to use to calculate a
#' topological overlap matrix (TOM). If NULL, TOM type will be automatically
#' inferred from network type specified in \strong{net_type}. Default: NULL.
#' @param max_block_size Numeric indicating the maximum block size for module
#' detection.
#' @param ... Additional arguments to \code{WGCNA::blockwiseModules()}.
#'
#' @return A list containing the following elements: \itemize{
#'   \item \emph{MEs} Data frame of module eigengenes, with samples
#'   in rows, and module eigengenes in columns.
#'   \item \emph{genes_and_modules} Data frame with columns 'Genes' (character)
#'   and 'Modules' (character) indicating the genes and the modules to
#'   which they belong.
#'   \item \emph{params} List with network inference parameters passed
#'   as input.
#'   \item \emph{dendro_plot_objects} List with objects to plot the dendrogram
#'   in \code{plot_dendro_and_colors}. Elements are named 'tree' (an hclust
#'   object with gene dendrogram), 'Unmerged' (character with per-gene module
#'   assignments before merging similar modules), and 'Merged' (character
#'   with per-gene module assignments after merging similar modules).
#' }
#'
#' @author Fabricio Almeida-Silva
#' @rdname exp2gcn_blockwise
#' @export
#' @importFrom WGCNA blockwiseModules
#' @examples
#' data(filt.se)
#' # The SFT fit was previously calculated and the optimal power was 16
#' cor <- WGCNA::cor
#' gcn <- exp2gcn_blockwise(
#'     exp = filt.se, SFTpower = 18, cor_method = "pearson"
#' )
exp2gcn_blockwise <- function(
        exp, net_type = "signed", module_merging_threshold = 0.8,
        SFTpower = NULL, cor_method = "pearson", TOM_type = NULL,
        max_block_size = 5000, ...
) {

    params <- list(
        net_type = net_type,
        module_merging_threshold = module_merging_threshold,
        SFTpower = SFTpower,
        cor_method = cor_method
    )
    exp <- handleSE(exp)

    if(is.null(SFTpower)) { stop("Please, specify the SFT power.") }

    cortype <- "pearson"
    if(cor_method == "biweight") {
        cortype <- "bicor"
    } else if(!cor_method %in% c("pearson", "biweight")) {
        stop("Argument to `cor_method` must be one of 'pearson' or 'biweight'.")
    }

    if(is.null(TOM_type)) { TOM_type <- guess_tom(net_type) }

    # Detect modules
    cor <- WGCNA::cor
    bmod <- WGCNA::blockwiseModules(
        datExpr = as.matrix(t(exp)),
        checkMissingData = FALSE,
        maxBlockSize = max_block_size,
        corType = cortype,
        maxPOutliers = 0.1,
        power = SFTpower,
        networkType = net_type,
        TOMType = TOM_type,
        minModuleSize = 30,
        mergeCutHeight = 1 - module_merging_threshold,
        ...
    )

    # Create result list
    genes_modules <- data.frame(
        Genes = names(bmod$colors), Modules = bmod$colors, row.names = NULL
    )

    result_list <- list(
        MEs = bmod$MEs,
        genes_and_modules = genes_modules,
        params = params,
        dendro_plot_objects = list(
            tree = bmod$dendrograms[[1]],
            Unmerged = bmod$unmergedColors,
            Merged = bmod$colors
        )
    )

    return(result_list)
}



#' Plot eigengene network
#'
#' @param gcn List object returned by \code{exp2gcn}.
#' @param palette Character indicating the name of the RColorBrewer palette
#' to use. Default: "PRGn".
#'
#' @return A base plot with the eigengene network
#' @importFrom stats hclust
#' @importFrom ComplexHeatmap pheatmap
#' @export
#' @rdname plot_eigengene_network
#' @examples
#' data(filt.se)
#' gcn <- exp2gcn(filt.se, SFTpower = 18, cor_method = "pearson")
#' plot_eigengene_network(gcn)
plot_eigengene_network <- function(gcn, palette = "PRGn") {

    # Calculate pairwise correlations between eigengenes and get hclust object
    mes <- gcn$MEs[, !names(gcn$MEs) %in% "MEgrey"]
    me_correlation <- cor(mes)
    clusters <- hclust(as.dist(1 - me_correlation), method = "average")

    # Create column metadata and row metadata
    coldata <- data.frame(row.names = names(mes), Module = names(mes))
    rowdata <- data.frame(row.names = names(mes), Module = names(mes))

    colors <- list(
        Module = setNames(gsub("ME", "", names(mes)), names(mes))
    )

    # Plot heatmap with distances
    pal <- colorRampPalette(brewer.pal(n = 7, name = palette))(100)
    hm <- ComplexHeatmap::pheatmap(
        me_correlation, name = "Correlation",
        color = pal,
        cluster_cols = clusters,
        cluster_rows = clusters,
        main = "Pairwise correlations between module eigengenes",
        legend_breaks = seq(-1, 1, 0.5),
        breaks = seq(-1, 1, 0.02),
        annotation_row = rowdata,
        annotation_col = coldata,
        legend = TRUE,
        annotation_colors = colors
    )

    # Remove legend for row and column annotation
    hm@top_annotation@anno_list$Module@show_legend <- FALSE
    hm@left_annotation@anno_list$Module@show_legend <- FALSE

    return(hm)
}

#' Plot dendrogram of genes and modules
#'
#' @param gcn List object returned by \code{exp2gcn}.
#'
#' @return A base plot with the gene dendrogram and modules.
#' @importFrom ggdendro dendro_data
#' @importFrom ggplot2 geom_segment aes theme_minimal labs scale_x_continuous
#' labs theme coord_cartesian element_blank geom_tile scale_fill_identity
#' scale_y_discrete geom_line element_rect
#' @importFrom patchwork wrap_plots
#' @export
#' @rdname plot_dendro_and_colors
#' @examples
#' data(filt.se)
#' gcn <- exp2gcn(filt.se, SFTpower = 18, cor_method = "pearson")
#' plot_dendro_and_colors(gcn)
plot_dendro_and_colors <- function(gcn) {

    # Create a dendrogram with ggplot2
    ## Get coordinates as a data frame
    ddata <- ggdendro::dendro_data(
        gcn$dendro_plot_objects$tree, type = "rectangle"
    )$segments

    ## To make sure lines in the y-axis do not extend to 0 and get too long
    ddata$yend[ddata$yend < 0.05] <- ddata$y[ddata$yend < 0.1] - 0.02

    ## Plot
    p_dendro <- ggplot(ddata) +
        geom_segment(aes(
            x = .data$x, y = .data$y, xend = .data$xend, yend = .data$yend
        ), linewidth = 0.3) +
        coord_cartesian(ylim = c(0, 1)) +
        theme_minimal() +
        labs(x = "", y = "Height", title = "Dendrogram of genes and modules") +
        scale_x_continuous(expand = c(0, 0)) +
        theme(
            panel.grid = element_blank(),
            axis.text.x = element_blank()
        )

    # Create a heatmap
    ## Heatmap data frame: 3 columns named `col`, `class`, and `value`
    o <- gcn$dendro_plot_objects$tree$order

    heatmap_df <- lapply(seq_along(gcn$dendro_plot_objects)[-1], function(x) {
        modules <- gcn$dendro_plot_objects[[x]][o]
        setname <- names(gcn$dendro_plot_objects)[x]
        df <- data.frame(
            col = factor(modules, levels = unique(modules)),
            class = setname,
            value = seq_along(modules)
        )
        return(df)
    })
    heatmap_df <- Reduce(rbind, heatmap_df)

    ## Create a data with coordinates for lines between rows of the heatmap
    nrows <- length(unique(heatmap_df$class))
    ngenes <- length(unique(heatmap_df$value))
    line_df <- data.frame(
        x = c(0, ngenes) + 0.5,
        y = rep(2:nrows, each = 2) - 0.5
    )

    ## Plot
    p_hm <- ggplot(heatmap_df) +
        geom_tile(aes(x = .data$value, y = .data$class, fill = .data$col)) +
        scale_fill_identity() +
        theme_minimal() +
        labs(x = "", y = "") +
        theme(
            panel.background = element_rect(fill = "black"),
            axis.text.x = element_blank(),
            panel.grid = element_blank()
        ) +
        scale_x_continuous(expand = c(0, 0)) +
        scale_y_discrete(expand = c(0.2, 0.2)) +
        geom_line(data = line_df, aes(x = .data$x, y = .data$y, group = .data$y))

    ## Combine plots
    heights <- c(3, 1)
    if(length(gcn$dendro_plot_objects) > 3) { heights <- c(2, 1) }
    p_combined <- wrap_plots(p_dendro, p_hm, nrow = 2, heights = heights)

    return(p_combined)
}

#' Perform module stability analysis
#'
#' @param exp A gene expression data frame with genes in row names and
#' samples in column names or a `SummarizedExperiment` object.
#' @param net List object returned by \code{exp2gcn}.
#' @param nRuns Number of times to resample. Default is 20.
#'
#' @return A base plot with the module stability results.
#' @seealso
#'  \code{\link[WGCNA]{sampledBlockwiseModules}}
#' @rdname module_stability
#' @export
#' @importFrom WGCNA sampledBlockwiseModules matchLabels
#' @examples
#' data(filt.se)
#' filt <- filt.se[1:100, ] # reducing even further for testing purposes
#' # The SFT fit was previously calculated and the optimal power was 16
#' gcn <- exp2gcn(filt, SFTpower = 16, cor_method = "pearson")
#' # For simplicity, only 2 runs
#' module_stability(exp = filt, net = gcn, nRuns = 2)
module_stability <- function(exp, net, nRuns = 20) {

    norm.exp <- handleSE(exp)
    expr <- as.matrix(t(norm.exp))
    cor_method <- net$params$cor_method
    if(cor_method == "biweight") { cor_method <- "bicor" }

    # Infer modules for original and resampled data n times (n = nRuns)
    mods0 <- WGCNA::sampledBlockwiseModules(
        nRuns = nRuns,
        replace = FALSE,
        datExpr = expr,
        maxBlockSize = 5000,
        checkSoftPower = FALSE,
        corType = cor_method,
        networkType = net$params$net_type,
        TOMType = guess_tom(net$params$net_type),
        TOMDenom = "mean",
        mergeCutHeight = 1 - net$params$module_merging_threshold,
        reassignThreshold = 0,
        numericLabels = FALSE,
        checkMissingData = FALSE,
        quickCor = 0, verbose = 2
    )
    nGenes <- ncol(expr)

    # Create a matrix of labels for the original and all resampling runs
    labels <- matrix(0, nGenes, nRuns + 1)
    labels[, 1] <- mods0[[1]]$mods$colors

    ## Relabel modules in each of the resampling runs
    labs <- Reduce(cbind, lapply(2:(nRuns+1), function(x) {
        labels[, x] <- WGCNA::matchLabels(mods0[[x-1]]$mods$colors, labels[, 1])
    }))
    labels <- cbind(labels[,1], labs)
    colnames(labels) <- c("Original", paste0("Resampling ", seq_len(nRuns)))

    ## Convert matrix to list of vectors
    label_list <- as.list(as.data.frame(labels))

    # Simulate the output of `exp2gcn()` for plotting
    sim_gcn <- list(
        dendro_plot_objects = c(
            list(tree = mods0[[1]]$mods$dendrograms[[1]]), label_list
        )
    )

    # Plot dendrogram of genes and modules in each run
    p <- plot_dendro_and_colors(sim_gcn)

    return(p)
}

#' Correlate module eigengenes to trait
#'
#' @param exp A gene expression data frame with genes in row names and
#' samples in column names or a `SummarizedExperiment` object.
#' @param metadata A data frame containing sample names in row names and
#' sample annotation in the first column. Ignored if `exp` is
#' a `SummarizedExperiment` object, since the function will extract colData.
#' @param MEs Module eigengenes. It is the 2nd element of the result list
#' generated by the function \code{exp2gcn}.
#' @param metadata_cols A vector (either numeric or character) indicating
#' which columns should be extracted from column metadata if \strong{exp}
#' is a `SummarizedExperiment` object. The vector can contain column
#' indices (numeric) or column names (character). By default, all columns are
#' used.
#' @param cor_method Method to calculate correlation. One of 'pearson',
#' 'spearman' or 'kendall'. Default is 'spearman'.
#'
#' @return A data frame with correlation and correlation p-values for each pair
#' of ME and trait, with the following variables:
#' \describe{
#'   \item{ME}{Factor, module eigengene.}
#'   \item{trait}{Factor, trait name. Each trait corresponds to a variable
#'                of the sample metadata (if numeric) or levels of a variable
#'                (if categorical).}
#'   \item{cor}{Numeric, correlation.}
#'   \item{pvalue}{Numeric, correlation P-values.}
#'   \item{group}{Character, name of the metadata variable.}
#' }
#'
#' @author Fabricio Almeida-Silva
#' @rdname module_trait_cor
#' @export
#' @importFrom WGCNA corPvalueStudent
#' @importFrom reshape2 melt
#' @examples
#' data(filt.se)
#' gcn <- exp2gcn(filt.se, SFTpower = 18, cor_method = "pearson")
#' module_trait_cor(filt.se, MEs = gcn$MEs)
module_trait_cor <- function(exp, metadata, MEs, metadata_cols = NULL,
                             cor_method = "pearson") {

    if(is(exp, "SummarizedExperiment")) {
        metadata <- se2metadata(exp, coldata_cols = metadata_cols)$coldata
    }

    exp <- handleSE(exp)
    metadata <- metadata[colnames(exp), , drop = FALSE]

    # Get data frame of module-trait correlations and P-values for each variable
    cor_df <- Reduce(rbind, lapply(seq_along(metadata), function(x) {

        ## Get matrices of correlations, P-values, and P-value symbols
        trait <- get_model_matrix(metadata, column_idx = x)
        cor_matrix <- cor(as.matrix(MEs), trait, use = "p", method = cor_method)
        pvalues <- WGCNA::corPvalueStudent(cor_matrix, nSamples = ncol(exp))

        ## Reshape to long format and merge data frames into one
        v <- c("ME", "trait")
        cor_long <- reshape2::melt(cor_matrix, value.name = "cor", varnames = v)
        p_long <- reshape2::melt(pvalues, value.name = "pvalue", varnames = v)

        final_df <- merge(cor_long, p_long)
        final_df$group <- names(metadata)[x]

        return(final_df)
    }))

    return(cor_df)
}

#' Plot a heatmap of module-trait correlations
#'
#' @param corandp A data frame of module-trait correlations as returned
#' by \code{module_trait_cor()}.
#' @param palette Character indicating which RColorBrewer palette to use.
#' Default: 'RdYlBu'.
#' @param transpose Logical indicating whether to transpose the heatmap
#' or not.
#'
#' @details Significance levels:
#' 1 asterisk: significant at alpha = 0.05.
#' 2 asterisks: significant at alpha = 0.01.
#' 3 asterisks: significant at alpha = 0.001.
#' no asterisk: not significant.
#'
#' @return A `Heatmap` object created by \code{ComplexHeatmap::pheatmap()}.
#'
#' @export
#' @rdname plot_module_trait_cor
#' @importFrom reshape2 dcast
#' @importFrom RColorBrewer brewer.pal
#' @importFrom ComplexHeatmap pheatmap
#' @examples
#' data(filt.se)
#' gcn <- exp2gcn(filt.se, SFTpower = 18, cor_method = "pearson")
#' corandp <- module_trait_cor(filt.se, MEs = gcn$MEs)
#' plot_module_trait_cor(corandp)
plot_module_trait_cor <- function(corandp, palette = "RdYlBu",
                                  transpose = FALSE) {

    # Get a wide matrix of correlations
    cormat <- reshape2::dcast(corandp, ME ~ trait, value.var = "cor")
    rownames(cormat) <- cormat$ME
    cormat$ME <- NULL
    cormat <- as.matrix(cormat)
    cormat[is.na(cormat)] <- 0

    # Create a matrix of correlations and P-value symbols to display
    ## Get a matrix of P-values and convert values to symbols
    pmat <- reshape2::dcast(corandp, ME ~ trait, value.var = "pvalue")
    rownames(pmat) <- pmat$ME
    pmat$ME <- NULL
    pmat <- pval2symbol(as.matrix(pmat))
    pmat[is.na(pmat)] <- ""

    ## Get a matrices of correlations and P-values
    textmat <- paste(signif(cormat, 2), pmat, sep = "")
    dim(textmat) <- dim(cormat)

    # Get row and column annotation to decorate the heatmap
    rowdata <- data.frame(row.names = rownames(cormat), Module = rownames(cormat))
    coldata <- corandp[!duplicated(corandp$trait), ]
    coldata <- data.frame(row.names = coldata$trait, Trait = coldata$group)

    # Create a list of named vectors with colors to use in the heatmap
    cols <- list(
        Module = setNames(gsub("ME", "", rownames(cormat)), rownames(cormat)),
        Trait = setNames(
            custom_palette(1)[seq_along(unique(corandp$group))],
            unique(corandp$group)
        )
    )

    # Plot heatmap
    pal <- colorRampPalette(rev(RColorBrewer::brewer.pal(10, palette)))(100)

    hm <- ComplexHeatmap::pheatmap(
        mat = if(transpose) t(cormat) else cormat,
        name = "Correlation",
        color = pal,
        display_numbers = if(transpose) t(textmat) else textmat,
        main = "Module-trait correlations",
        legend_breaks = seq(-1, 1, 0.5),
        breaks = seq(-1, 1, 0.02),
        annotation_row = if(transpose) coldata else rowdata,
        annotation_col = if(transpose) rowdata else coldata,
        annotation_colors = cols,
        row_split = if(transpose) coldata$Trait else NULL,
        column_split = if(transpose) NULL else coldata$Trait
    )

    # Remove legend for module colors
    if(transpose) {
        hm@top_annotation@anno_list$Module@show_legend <- FALSE
    } else {
        hm@left_annotation@anno_list$Module@show_legend <- FALSE
    }

    return(hm)
}


#' Calculate gene significance for a given group of genes
#'
#' @param exp A gene expression data frame with genes in row names and
#' samples in column names or a `SummarizedExperiment` object.
#' @param metadata A data frame containing sample names in row names and
#' sample annotation in the first column. Ignored if `exp` is
#' a `SummarizedExperiment` object, since the function will extract colData.
#' @param metadata_cols A vector (either numeric or character) indicating
#' which columns should be extracted from column metadata if \strong{exp}
#' is a `SummarizedExperiment` object. The vector can contain column
#' indices (numeric) or column names (character). By default, all columns are
#' used.
#' @param genes Character vector of genes to be correlated with traits.
#' If not given, all genes in `exp` will be considered.
#' @param alpha Significance level. Default is 0.05.
#' @param cor_method Method to calculate correlation. One of 'pearson',
#' 'spearman' or 'kendall'. Default is 'spearman'.
#' @param min_cor Minimum correlation coefficient. Default is 0.2.
#' @param use_abs Logical indicating whether to filter by correlation using
#' absolute value or not. If TRUE, a \code{min_cor} of say 0.2 would keep all
#' correlations above 0.2 and below -0.2. Default is TRUE.
#'
#' @return A data frame with correlation and correlation p-values for each pair
#' of gene and trait, with the following variables:
#' \describe{
#'   \item{gene}{Factor, gene ID.}
#'   \item{trait}{Factor, trait name. Each trait corresponds to a variable
#'                of the sample metadata (if numeric) or levels of a variable
#'                (if categorical).}
#'   \item{cor}{Numeric, correlation.}
#'   \item{pvalue}{Numeric, correlation P-values.}
#'   \item{group}{Character, name of the metadata variable.}
#' }
#'
#' @author Fabricio Almeida-Silva
#' @rdname gene_significance
#' @export
#' @importFrom WGCNA corPvalueStudent
#' @importFrom reshape2 melt
#' @examples
#' data(filt.se)
#' gs <- gene_significance(filt.se)
gene_significance <- function(
        exp, metadata, metadata_cols = NULL, genes = NULL, alpha = 0.05,
        cor_method = "pearson", min_cor = 0.2, use_abs = TRUE
) {

    if(is(exp, "SummarizedExperiment")) {
        metadata <- se2metadata(exp, coldata_cols = metadata_cols)$coldata
    }

    # Make sure samples in expression matrix and sample metadata match
    exp <- handleSE(exp)
    metadata <- metadata[colnames(exp), , drop = FALSE]
    final_exp <- exp[, rownames(metadata)]

    # Use only a subset of genes?
    if(!is.null(genes)) {
        final_exp <- final_exp[genes, , drop = FALSE]
    }

    # Get data frame of gene-trait correlations and P-values for each variable
    cor_df <- Reduce(rbind, lapply(seq_along(metadata), function(x) {

        ## Get matrices of correlations, P-values, and P-value symbols
        trait <- get_model_matrix(metadata, column_idx = x)
        cor_matrix <- cor(
            as.matrix(t(exp)), trait, use = "p", method = cor_method
        )
        pvalues <- WGCNA::corPvalueStudent(cor_matrix, nSamples = ncol(exp))

        ## Reshape to long format and merge data frames into one
        v <- c("gene", "trait")
        cor_long <- reshape2::melt(cor_matrix, value.name = "cor", varnames = v)
        p_long <- reshape2::melt(pvalues, value.name = "pvalue", varnames = v)

        final_df <- merge(cor_long, p_long)
        final_df$group <- names(metadata)[x]

        return(final_df)
    }))

    # Filter by correlation and P-value
    if(use_abs) {
        cor_df <- cor_df[cor_df$pvalue < alpha & abs(cor_df$cor) >= min_cor, ]
    } else {
        cor_df <- cor_df[cor_df$pvalue < alpha & cor_df$cor >= min_cor, ]
    }

    return(cor_df)
}


#' Plot a heatmap of gene significance
#'
#' @param corandp A data frame of gene-trait correlations as returned
#' by \code{gene_significance()}.
#' @param palette Character indicating which RColorBrewer palette to use.
#' Default: 'RdYlBu'.
#' @param transpose Logical indicating whether to transpose the heatmap
#' or not.
#' @param ... Additional arguments to \code{ComplexHeatmap::pheatmap()}.
#'
#' @details Significance levels:
#' 1 asterisk: significant at alpha = 0.05.
#' 2 asterisks: significant at alpha = 0.01.
#' 3 asterisks: significant at alpha = 0.001.
#' no asterisk: not significant.
#'
#' @return A `Heatmap` object created by \code{ComplexHeatmap::pheatmap()}.
#'
#' @export
#' @rdname plot_gene_significance
#' @importFrom reshape2 dcast
#' @importFrom RColorBrewer brewer.pal
#' @importFrom ComplexHeatmap pheatmap
#' @examples
#' data(filt.se)
#' gcn <- exp2gcn(filt.se, SFTpower = 18, cor_method = "pearson")
#' corandp <- gene_significance(filt.se)
#' plot_gene_significance(corandp, show_rownames = FALSE)
plot_gene_significance <- function(
        corandp, palette = "RdYlBu", transpose = FALSE, ...
) {

    # Get a wide matrix of correlations
    cormat <- reshape2::dcast(corandp, gene ~ trait, value.var = "cor")
    rownames(cormat) <- cormat$gene
    cormat$gene <- NULL
    cormat <- as.matrix(cormat)
    cormat[is.na(cormat)] <- 0

    # Create a matrix of correlations and P-value symbols to display
    ## Get a matrix of P-values and convert values to symbols
    pmat <- reshape2::dcast(corandp, gene ~ trait, value.var = "pvalue")
    rownames(pmat) <- pmat$gene
    pmat$gene <- NULL
    pmat <- pval2symbol(as.matrix(pmat))
    pmat[is.na(pmat)] <- ""

    ## Get a matrices of correlations and P-values
    textmat <- paste(signif(cormat, 2), pmat, sep = "")
    dim(textmat) <- dim(cormat)

    # Get column annotation to decorate the heatmap
    coldata <- corandp[!duplicated(corandp$trait), ]
    coldata <- data.frame(row.names = coldata$trait, Trait = coldata$group)
    coldata <- coldata[colnames(cormat), , drop = FALSE]

    # Create a list of named vectors with colors to use in the heatmap
    cols <- list(
        Trait = setNames(
            custom_palette(1)[seq_along(unique(corandp$group))],
            unique(corandp$group)
        )
    )

    # Plot heatmap
    pal <- colorRampPalette(rev(RColorBrewer::brewer.pal(10, palette)))(100)

    hm <- ComplexHeatmap::pheatmap(
        mat = if(transpose) t(cormat) else cormat,
        name = "Correlation",
        color = pal,
        display_numbers = FALSE,
        main = "Gene-trait correlations",
        legend_breaks = seq(-1, 1, 0.5),
        breaks = seq(-1, 1, 0.02),
        annotation_row = if(transpose) coldata else NA,
        annotation_col = if(transpose) NA else coldata,
        annotation_colors = cols,
        row_split = if(transpose) coldata$Trait else NULL,
        column_split = if(transpose) NULL else coldata$Trait,
        ...
    )

    return(hm)
}


#' Get GCN hubs
#'
#' @param exp A gene expression data frame with genes in row names and
#' samples in column names or a `SummarizedExperiment` object.
#' @param net List object returned by \code{exp2gcn}.
#'
#' @return Data frame containing gene IDs, modules and
#' intramodular connectivity of all hubs.
#' @author Fabricio Almeida-Silva
#' @seealso
#'  \code{\link[WGCNA]{signedKME}}
#' @rdname get_hubs_gcn
#' @export
#' @importFrom WGCNA signedKME
#' @examples
#' data(filt.se)
#' gcn <- exp2gcn(filt.se, SFTpower = 18, cor_method = "pearson")
#' hubs <- get_hubs_gcn(filt.se, gcn)
get_hubs_gcn <- function(exp, net) {

    exp <- handleSE(exp)
    cor_method <- net$params$cor_method
    genes_modules <- net$genes_and_modules
    MEs <- net$MEs
    kIN <- net$kIN[, 2, drop=FALSE]

    # Add kIN info
    genes_modulesk <- merge(
        genes_modules, kIN, by.x = "Genes", by.y = "row.names"
    )

    # Calculate kME
    if(cor_method == "spearman") {
        MM <- WGCNA::signedKME(
            t(exp), MEs, outputColumnName = "",
            corOptions = "use = 'p', method = 'spearman'"
        )
    } else if(cor_method == "pearson") {
        MM <- WGCNA::signedKME(t(exp), MEs, outputColumnName = "")
    } else if(cor_method == "biweight") {
        MM <- WGCNA::signedKME(
            t(exp), MEs, corFnc = "bicor", outputColumnName = "",
            corOptions = "maxPOutliers = 0.1"
        )
    }

    # Add kME info
    kme_info <- merge(genes_modulesk, MM, by.x="Genes", by.y="row.names")

    # Keep top 10% genes with the highest degree for each module
    genes_mod_list <- split(kme_info, kme_info[,2])
    genes_mod_list$grey <- NULL
    top_10 <- lapply(genes_mod_list, function(x) {
        indices <- round(nrow(x) * 0.1)
        return(x[order(x$kWithin, decreasing=TRUE), ][seq_len(indices), ])
    })

    # Pick genes from the top 10% degree with kME above 0.8
    hubs <- Reduce(rbind, lapply(top_10, function(x) {
        cols <- c("Genes", "Modules", "kWithin", unique(x$Modules))
        h <- x[, cols]
        h <- h[h[,4] > 0.8, c(1,2,3)]
    }))
    rownames(hubs) <- seq_len(nrow(hubs))
    colnames(hubs) <- c("Gene", "Module", "kWithin")
    return(hubs)
}


#' Perform overrepresentation analysis for a set of genes
#'
#' @param genes Character vector containing genes for which
#' overrepresentation will be tested.
#' @param genesets Named list of character vectors, where list names
#' represent functional categories (e.g., GO, pathway, etc.), and vectors
#' represent their associated genes.
#' @param universe Character vector of genes to be used as universe.
#' @param adj Character indicating the multiple testing correction method
#' as in \code{stats::p.adjust()}.
#'
#' @return A data frame of overrepresentation results with the following
#' variables:
#' \describe{
#'   \item{term}{character, functional term ID/name.}
#'   \item{genes}{numeric, intersection length between input genes and genes
#'                in a particular functional term.}
#'   \item{all}{numeric, number of all genes in a particular functional term.}
#'   \item{pval}{numeric, P-value for the hypergeometric test.}
#'   \item{padj}{numeric, P-value adjusted for multiple comparisons using
#'               the method specified in parameter \strong{adj}.}
#' }
#'
#' @importFrom stats p.adjust phyper
#' @noRd
ora <- function(
        genes, genesets, universe, adj = "BH"
) {

    # Remove genes that are not in `universe`
    genes <- intersect(genes, universe)
    gene_sets <- lapply(genesets, function(x) intersect(x, universe))

    # Define universe and input genes
    n_universe <- length(universe)
    n_genes <- length(genes)

    # Define variables of the hypergeometric test
    x <- vapply(gene_sets, function(x) length(intersect(x, genes)), numeric(1))
    m <- vapply(gene_sets, length, numeric(1))
    n <- n_universe - m
    k <- n_genes

    # Get P-values from a hypergeometric test
    pvals <- phyper(x - 1, m, n, k, lower.tail = FALSE)

    # Store results in a data frame
    results <- data.frame(
        term = names(pvals),
        genes = x,
        all = m,
        pval = pvals,
        row.names = NULL
    )

    results$padj <- p.adjust(results$pval, method = adj)

    return(results)
}


#' Perform overrepresentation analysis for a set of genes
#'
#' @param genes Character vector containing genes for overrepresentation
#' analysis.
#' @param background_genes Character vector of genes to be used as background
#' for the overrepresentation analysis.
#' @param annotation Annotation data frame with genes in the first column and
#' functional annotation in the other columns. This data frame can be exported
#' from Biomart or similar databases.
#' @param column Column or columns of \strong{annotation} to be used for
#' enrichment.
#' Both character or numeric values with column indices can be used.
#' If users want to supply more than one column, input a character or
#' numeric vector. Default: all columns from \strong{annotation}.
#' @param correction Multiple testing correction method. One of "holm",
#' "hochberg", "hommel", "bonferroni", "BH", "BY", "fdr" or "none".
#' Default is "BH".
#' @param p P-value threshold. P-values below this threshold will be
#' considered significant. Default: 0.05.
#' @param min_setsize Numeric indicating the minimum gene set size to be
#' considered. Gene sets correspond to levels of each variable
#' in \strong{annotation}). Default: 10.
#' @param max_setsize Numeric indicating the maximum gene set size to be
#' considered. Gene sets correspond to levels of each variable
#' in \strong{annotation}). Default: 500.
#' @param bp_param BiocParallel back-end to be used.
#' Default: BiocParallel::SerialParam()
#'
#' @return A data frame of overrepresentation results with the following
#' variables:
#' \describe{
#'   \item{term}{character, functional term ID/name.}
#'   \item{genes}{numeric, intersection length between input genes and genes
#'                in a particular functional term.}
#'   \item{all}{numeric, number of all genes in a particular functional term.}
#'   \item{pval}{numeric, P-value for the hypergeometric test.}
#'   \item{padj}{numeric, P-value adjusted for multiple comparisons using
#'               the method specified in parameter \strong{adj}.}
#'   \item{category}{character, name of the grouping variable (i.e., column
#'                   name of \strong{annotation}).}
#' }
#'
#' @author Fabricio Almeida-Silva
#' @rdname enrichment_analysis
#' @export
#' @importFrom BiocParallel bplapply
#' @examples
#' \donttest{
#' data(filt.se)
#' data(zma.interpro)
#' genes <- rownames(filt.se)[1:50]
#' background_genes <- rownames(filt.se)
#' annotation <- zma.interpro
#' # Using p = 1 to show all results
#' enrich <- enrichment_analysis(genes, background_genes, annotation, p = 1)
#' }
enrichment_analysis <- function(
        genes, background_genes, annotation, column = NULL,
        correction = "BH", p = 0.05, min_setsize = 10, max_setsize = 500,
        bp_param = BiocParallel::SerialParam()
) {

    names(annotation)[1] <- "Gene"

    # Filtered `annotation` data frame to keep only specified columns
    col <- names(annotation)
    if(!is.null(column)) {
        col <- if(is.numeric(column)) c(1, column) else c("Gene", column)
    }
    fannot <- annotation[, col]

    # Get a data frame of enrichment results for each column
    enrichment <- bplapply(seq_along(fannot)[-1], function(x) {

        ## Remove missing values (unannotated genes for a particular category)
        fannot[, x][is.na(fannot[, x])] <- "unannotated"

        ## Split variable into a named list
        gene_sets <- split(fannot[, 1], fannot[, x])
        gene_sets <- gene_sets[!duplicated(gene_sets)]
        gene_sets <- gene_sets[names(gene_sets) != "unannotated"]

        ## Keep only element with `min_setsize` <= n <= `max_setsize` elements
        set_sizes <- as.numeric(lengths(gene_sets))
        keep <- which(set_sizes >= min_setsize & set_sizes <= max_setsize)
        gene_sets <- gene_sets[keep]

        ## Get ORA results and add `category` column
        ora_df <- ora(genes, gene_sets, background_genes, correction)

        if(nrow(ora_df) > 0) {
            ora_df$category <- names(fannot)[x]

            ## Filter non-significant observations out
            ora_df <- ora_df[ora_df$padj <= p, ]
        }

    }, BPPARAM = bp_param)

    enrichment <- Reduce(rbind, enrichment)

    return(enrichment)
}

#' Perform enrichment analysis for coexpression network modules
#'
#' @param net List object returned by \code{exp2gcn}.
#' @param background_genes Character vector of genes to be used as background
#' for the Fisher's Exact Test.
#' @param annotation Annotation data frame with genes in the first column
#' and functional annotation in the other columns. This data frame can
#' be exported from Biomart or similar databases.
#' @param column Column or columns of \code{annotation} to be used for enrichment.
#' Both character or numeric values with column indices can be used.
#' If users want to supply more than one column, input a character or
#' numeric vector. Default: all columns from \code{annotation}.
#' @param correction Multiple testing correction method. One of "holm",
#' "hochberg", "hommel", "bonferroni", "BH", "BY", "fdr" or "none".
#' Default is "BH".
#' @param p P-value threshold. P-values below this threshold will be considered
#' significant. Default is 0.05.
#' @param min_setsize Numeric indicating the minimum gene set size to be
#' considered. Gene sets correspond to levels of each variable
#' in \strong{annotation}). Default: 10.
#' @param max_setsize Numeric indicating the maximum gene set size to be
#' considered. Gene sets correspond to levels of each variable
#' in \strong{annotation}). Default: 500.
#' @param bp_param BiocParallel back-end to be used.
#' Default: BiocParallel::SerialParam()
#'
#' @return A data frame of overrepresentation results with the following
#' variables:
#' \describe{
#'   \item{term}{character, functional term ID/name.}
#'   \item{genes}{numeric, intersection length between input genes and genes
#'                in a particular functional term.}
#'   \item{all}{numeric, number of all genes in a particular functional term.}
#'   \item{pval}{numeric, P-value for the hypergeometric test.}
#'   \item{padj}{numeric, P-value adjusted for multiple comparisons using
#'               the method specified in parameter \strong{adj}.}
#'   \item{category}{character, name of the grouping variable (i.e., column
#'                   name of \strong{annotation}).}
#'   \item{module}{character, module name.}
#' }
#'
#' @author Fabricio Almeida-Silva
#'
#' @rdname module_enrichment
#' @importFrom BiocParallel bplapply SerialParam
#' @export
#' @examples
#' \donttest{
#' data(filt.se)
#' data(zma.interpro)
#' background <- rownames(filt.se)
#' gcn <- exp2gcn(filt.se, SFTpower = 18, cor_method = "pearson")
#' mod_enrich <- module_enrichment(gcn, background, zma.interpro, p=1)
#' }
module_enrichment <- function(
        net = NULL, background_genes, annotation, column = NULL,
        correction = "BH", p = 0.05, min_setsize = 10, max_setsize = 500,
        bp_param = BiocParallel::SerialParam()
) {

    # Create a list of data frames with columns `Genes` and `Modules`
    genes.modules <- net$genes_and_modules
    list.gmodules <- split(genes.modules, genes.modules$Modules)
    list.gmodules <- list.gmodules[names(list.gmodules) != "grey"]

    enrichment_all <- bplapply(seq_along(list.gmodules), function(x) {

        module <- names(list.gmodules)[x]
        message("Enrichment analysis for module ", module, "...")

        # Perform enrichment analysis for {module}
        l <- enrichment_analysis(
            genes = as.character(list.gmodules[[x]][, 1]),
            background_genes = background_genes,
            annotation = annotation,
            column = column,
            correction = correction, p = p,
            min_setsize = min_setsize, max_setsize = max_setsize
        )

        # Either add a column `module` or set `l` to NULL
        if(!is.null(l)) {
            if(nrow(l) > 0) {
                l$module <- module
            } else {
                l <- NULL
            }
        }

        return(l)
    }, BPPARAM = bp_param)

    enrichment_all <- Reduce(rbind, enrichment_all)

    return(enrichment_all)
}


#' Get 1st-order neighbors of a given gene or group of genes
#'
#' @param genes Character vector containing genes from which
#' direct neighbors will be extracted.
#' @param net List object returned by \code{exp2gcn}.
#' @param cor_threshold Correlation threshold to filter connections.
#' As a weighted network is a fully connected graph, a cutoff must be selected.
#' Default is 0.7.
#'
#' @return List containing 1st-order neighbors for each input gene.
#' @seealso \code{exp2gcn} \code{SFT_fit}
#' @author Fabricio Almeida-Silva
#' @export
#' @rdname get_neighbors
#' @examples
#' data(filt.se)
#' genes <- rownames(filt.se)[1:10]
#' gcn <- exp2gcn(filt.se, SFTpower = 18, cor_method = "pearson")
#' neighbors <- get_neighbors(genes, gcn)
get_neighbors <- function(genes, net, cor_threshold = 0.7) {

    net_type <- net$params$net_type
    power <- net$params$SFTpower
    edges <- net$correlation_matrix

    edges <- cormat_to_edgelist(edges)
    colnames(edges) <- c("Gene1", "Gene2", "Weight")

    filt_edges <- edges[edges$Gene1 %in% genes | edges$Gene2 %in% genes, ]
    if(net_type == "signed") {
        filt_edges <- filt_edges[abs(filt_edges$Weight) >= cor_threshold, ]
    } else {
        filt_edges <- filt_edges[filt_edges$Weight >= cor_threshold, ]
    }

    result_list <- lapply(genes, function(x) {
        y <- filt_edges[rowSums(filt_edges == x) > 0, ]
        y <- c(as.character(y$Gene1), as.character(y$Gene2))
        y <- unique(y[y != x])
        return(y)
    })
    names(result_list) <- genes
    return(result_list)
}


#' Get edge list from an adjacency matrix for a group of genes
#'
#' @param net List object returned by \code{exp2gcn}.
#' @param genes Character vector containing a subset of genes from which
#' edges will be extracted. It can be ignored if the user wants to extract
#' an edge list for a given module instead of individual genes.
#' @param module Character with module name from which edges will be extracted.
#' To include 2 or more modules, input the names in a character vector.
#' @param filter Logical indicating whether to filter the edge list or not.
#' @param method Method to filter spurious correlations. One of "Zscore",
#' "optimalSFT", "pvalue" or "min_cor". See details for more information
#' on the methods. Default: 'optimalSFT'
#' @param r_optimal_test Numeric vector with the correlation thresholds
#' to be tested for optimal scale-free topology fit. Only valid
#' if \code{method} equals "optimalSFT". Default: seq(0.4, 0.9, by = 0.1)
#' @param Zcutoff Minimum Z-score threshold. Only valid if
#'  \code{method} equals "Zscore". Default: 1.96
#' @param pvalue_cutoff Maximum P-value threshold. Only valid
#' if \code{method} equals "pvalue". Default: 0.05
#' @param rcutoff Minimum correlation threshold. Only valid
#' if \code{method} equals "min_cor". Default: 0.7
#' @param nSamples Number of samples in the data set from which
#' the correlation matrix was calculated. Only required
#' if \code{method} equals "pvalue".
#' @param check_SFT Logical indicating whether to test if the resulting network
#' is close to a scale-free topology or not. Default: FALSE.
#' @param bp_param BiocParallel back-end to be used.
#' Default: BiocParallel::SerialParam()
#'
#' @return Data frame with edge list for the input genes.
#' @details The default method ("optimalSFT") will create several different
#' edge lists by filtering the original correlation matrix by the thresholds
#' specified in \code{r_optimal_test}. Then, it will calculate a scale-free
#' topology fit index for each of the possible networks and return the network
#' that best fits the scale-free topology.
#' The method "Zscore" will apply a Fisher Z-transformation for the correlation
#' coefficients and remove the Z-scores below the threshold specified
#' in \code{Zcutoff}.
#' The method "pvalue" will calculate Student asymptotic p-value for the
#' correlations and remove correlations whose p-values are above the threshold
#' specified in \code{pvalue_cutoff}.
#' The method "min_cor" will remove correlations below the minimum correlation
#' threshold specified in \code{rcutoff}.
#' @seealso
#'  \code{\link[WGCNA]{scaleFreeFitIndex}}
#' @seealso \code{SFT_fit}
#' @seealso \code{exp2gcn}.
#' @author Fabricio Almeida-Silva
#' @rdname get_edge_list
#' @export
#' @importFrom WGCNA scaleFreeFitIndex corPvalueStudent
#' @importFrom ggplot2 theme geom_point geom_line theme_bw
#' @importFrom BiocParallel bplapply SerialParam
#' @examples
#' data(filt.se)
#' gcn <- exp2gcn(filt.se, SFTpower = 18, cor_method = "pearson")
#' genes <- rownames(filt.se)[1:50]
#' edges <- get_edge_list(gcn, genes=genes, filter = FALSE)
get_edge_list <- function(
        net, genes = NULL, module = NULL, filter = FALSE,
        method = "optimalSFT", r_optimal_test = seq(0.4, 0.9, by = 0.1),
        Zcutoff = 1.96, pvalue_cutoff = 0.05, rcutoff = 0.7, nSamples = NULL,
        check_SFT = FALSE, bp_param = BiocParallel::SerialParam()
) {

    # Define objects containing correlation matrix and data frame of genes and modules
    cor_matrix <- net$correlation_matrix

    # Should we extract genes in a module?
    if(!is.null(module)) {
        genes_modules <- net$genes_and_modules
        keep <- genes_modules[genes_modules$Modules %in% module, 1]
        cor_matrix <- cor_matrix[keep, keep]
    }

    # Should we extract a user-defined gene set?
    if(!is.null(genes)) {
        cor_matrix <- cor_matrix[genes, genes]
    }

    # Should we filter the matrix?
    if(filter) {
        # Create edge list from correlation matrix
        edges <- cormat_to_edgelist(cor_matrix)
        colnames(edges) <- c("Gene1", "Gene2", "Weight")

        if(method == "Zscore") {
            # Apply Fisher-Z transformation to correlation values
            edgesZ <- edges
            edgesZ$Weight <- 0.5 * log((1+edges$Weight) / (1-edges$Weight))

            edgelist <- edgesZ[edgesZ$Weight >= Zcutoff, ]
        } else if(method == "optimalSFT") {
            cutoff <- r_optimal_test

            # Create list of 2 elements: edge lists, each with a different correlation threshold, and degree
            list_mat <- replicate(length(cutoff), cor_matrix, simplify = FALSE)

            list_cormat_filtered <- BiocParallel::bplapply(seq_along(list_mat), function(x) {
                # Convert r values below threshold to NA and set diagonals to 0
                matrix <- list_mat[[x]]
                matrix[matrix < cutoff[x] ] <- NA
                diag(matrix) <- 0

                # Calculate degree
                degree <- rowSums(matrix, na.rm=TRUE)

                # Convert lower triangle and diagonals to NA
                matrix[lower.tri(matrix, diag=TRUE)] <- NA

                # Convert symmetrical matrix to edge list (Gene1, Gene2, Weight)
                matrix <- na.omit(data.frame(as.table(matrix)))
                result <- list(matrix = matrix, degree = degree)
                return(result)
            }, BPPARAM = bp_param)

            degree_list <- lapply(list_cormat_filtered, function(x) return(x[[2]]))

            # Calculate scale-free topology
            sft.rsquared <- unlist(lapply(degree_list, function(x) WGCNA::scaleFreeFitIndex(x)$Rsquared.SFT))
            max.index <- which.max(sft.rsquared)

            # Plot scale-free topology fit for r values
            plot.data <- data.frame(x = cutoff, y = sft.rsquared)

            plot <- ggplot(plot.data, aes(x = .data$x, y = .data$y, group = 1)) +
                geom_point(color = "firebrick", size = 4) +
                geom_line(color = "firebrick", linewidth = 2) +
                labs(
                    x = "Correlation (r) values",
                    y = expression(paste("Scale-free topology fit - ", R^{2})),
                    title = "Scale-free topology fit for given r values"
                ) +
                theme_bw()

            plot
            optimalr <- cutoff[max.index]
            message("The correlation threshold that best fits the scale-free topology is ", optimalr)
            edgelist <- list_cormat_filtered[[max.index]][[1]]
        } else if(method == "pvalue") {
            if(is.null(nSamples)) {
                stop("Please, specify the number of samples used to calculate the correlation matrix")
            }

            # Calculate Student asymptotic p-value for given correlations
            cor.pvalue <- WGCNA::corPvalueStudent(edges$Weight, nSamples)

            # Create a data frame of correlations and p-values
            corandp <- edges
            corandp$pvalue <- cor.pvalue

            # Create a final edge list containing only significant correlations
            edgelist <- corandp[corandp$pvalue < pvalue_cutoff, c(1,2,3)]

        } else if(method == "min_cor") {
            edgelist <- edges[edges$Weight >= rcutoff, ]

        } else{
            stop("Please, specify a valid filtering method.")
        }

    } else {
        # Create edge list from correlation matrix without filtering
        edgelist <- cormat_to_edgelist(cor_matrix)
        colnames(edgelist) <- c("Gene1", "Gene2", "Weight")
    }

    # Check scale-free topology fit
    if(check_SFT) {
        test <- check_SFT(edgelist, net_type="gcn")
    }

    return(edgelist)
}
almeidasilvaf/BioNERO documentation built on March 25, 2024, 9:14 p.m.