MoDentify: Phenotype-driven module identification

Documented in identifyModules

#' Module Identification
#'
#'
#' @param graph an \code{\link[igraph]{igraph}} object, which can be generated with \code{\link{generateNetwork}}.
#' The ID of the nodes must correspond to the name of the variables.
#' @param data either a matrix, where the columns correspond to the variables and the rows to the observations.
#' Or a \code{\link[data.table]{data.table}} with three columns: name, sampleID and value.
#' @param phenotype a vector with the values for a phenotype of interest.
#' It must have the same number of observations as data.
#' @param covars a \code{\link[data.table]{data.table}} containing the covariates
#' to correct for.
#' The rows for the samples must be in the same order as in the phenotype vector.
#' @param annotations a \code{\link[data.table]{data.table}} containing annotations for the variables.
#' The columns correspond to the different annotations, the rows to the variables.
#' @param merge.overlapping if \code{TRUE}, overlapping modules will be merged.
#' @param better.than.components if \code{TRUE}, modules will only be enlarged and accepted,
#' if they are better than all of their components.
#' @param alpha significance level for accepting the modules.
#' @param level Must be set to the name of the column to be used, if modules should be calculated for pathways.
#' @param pathway.column
#' @param representative.method the method, that is used for the calculation of the module representation.
#' Currently implemented: "eigenmetabolite" and "average"
#' @param correction.method the method that used for multiple testing correction ("bonferroni", "BH", "BY", "fdr", "holm", "hochberg", "hommel", "none").
#' Default is set to bonferroni. See \code{\link[stats]{p.adjust}}.
#' @param BPPARAM An instance of the
#' \code{\link[BiocParallel]{BiocParallelParam-class}} that determines how to
#' parallelisation of the functions will be evaluated.
#' @param scoringFunction a scoring function accepting parameters 
#' moduleRepresentatives, phenotype and covars. See \code{\link[MoDentify]{linearScoring}}
#'
#' @references
#' \insertRef{Do2017}{MoDentify}
#' @references
#' \insertRef{Chuang2007}{MoDentify}
#' @import data.table
#' @import igraph
#' @import BiocParallel
#' @export identifyModules
#' @usage identifyModules(graph, data, phenotype, covars = NULL, annotations,
#' merge.overlapping=FALSE, better.than.components= TRUE, alpha=0.05,
#' level=NULL, representative.method="average", correction.method="bonferroni",
#' BPPARAM = SerialParam(progressbar = TRUE))
#' @return a list consisting of four elements.
#' @examples
#' data(qmdiab.data)
#' data(qmdiab.annos)
#' data(qmdiab.phenos)
#' 
#' data <- qmdiab.data[, 1:75]
#' annotations <- qmdiab.annos[1:75]
#' 
#' net.graph <- generateNetwork(data = data, annotations = annotations)
#' mods <- identifyModules(
#'   graph = net.graph, data = data, annotations =
#'     annotations, phenotype = qmdiab.phenos$T2D, alpha = 0.05
#' )
#' 
#' pathway.graph <- generatePathwaysNetwork(data = data, annotations = annotations)
#' 
#' pathway.modules <- identifyModules(
#'   graph = pathway.graph$network, data = data,
#'   phenotype = qmdiab.phenos$T2D, level = pathway.graph$level, annotations = annotations,
#'   alpha = 0.05
#' )
identifyModules <- function(graph, data, phenotype, covars = NULL,
                            annotations,
                            merge.overlapping = FALSE,
                            better.than.components = TRUE,
                            alpha = 0.05,
                            level = NULL,
                            representative.method = "average",
                            correction.method = "bonferroni",
                            caching = TRUE,
                            BPPARAM = SerialParam(progressbar = TRUE),
                            scoringFunction=linearScoring) {
    
    
    # for incomplete data only average approach possible
    if (sum(is.na(data)) > 0) {
        if (representative.method == "eigenmetabolite") {
            stop("Data matrix contains missing values.\n Module identification with eigenmetabolite approach not possible.\n")
        } else {
            warning("Data matrix contains missing values.\n Only complete cases were used for module representatives (average).\n")
        }
    }
    
    
    if (!is.data.table(data)) {
        if (is.data.frame(data)) {
            data <- as.data.table(data)
            data <- cbind(sampleID = paste0("sample", seq_len(dim(data)[1])), data)
        } else if (is.matrix(data)) {
            data <- data.table(sampleID = paste0("sample", seq_len(dim(data)[1])), data)
        }
        data <- melt(data = data, id.vars = "sampleID", variable.name = "name")
    }
    
    annotations <- as.data.table(annotations)
    
    # Calculate the z-scores for variables
    data[, z.score := scale(value), by = .(name)]
    if (!is.null(level)) {
        if (!"Fluid" %in% colnames(annotations)) {
            annotations[, s.id := annotations[, get(level)]]
        } else {
            annotations[, s.id := paste(annotations[, get(level)], Fluid)]
        }
        
        data <- data[annotations[, .(name, s.id)], on = "name"]
        data[, met.name := name]
        data[, name := s.id]
        data[, s.id := NULL]
    }
    
    
    if (!is.null(covars)) {
        if (dim(covars)[1] != length(phenotype)) {
            stop("Covars and Phenotype have a different amount of samples")
        }
        if (dim(covars)[1] != length(unique(data$sampleID))) {
            stop("Variables and covariates have a different number of observations.")
        }
    } else {
        if (length(phenotype) != length(unique(data$sampleID))) {
            stop("Variables and covariates have a different number of observations.")
        }
    }
    
    
    if (!all(as.character(unique(data$name)) %in% vertex_attr(graph, "name"))) {
        warning("Not all of your variables are represented in the graph.")
    }
    if (!all(vertex_attr(graph, "name") %in% as.character(unique(data$name)))) {
        stop("Not all of your nodes are represented in the data data.table")
    }
    
    message("The function identifyModules could take a few minutes.")
    
    modules <- data.table(
        moduleID = integer(), module.score = numeric(),
        module.beta = numeric(), adjusted.score = numeric()
    )
    nodes <- data.table(
        moduleID = integer(), nodeID = integer(), name = character(), label = character(),
        order.added = integer(), score.after.adding = numeric()
    )
    
    # create temporary folder for the cache
    if(caching){
        cacheFolder <- tempfile()
        dir.create(cacheFolder)
    }else{
        cacheFolder <- NULL
    }
    
    
    seed.scores <- c()
    seed.betas <- c()
    
    message("Identifiying functional modules:")
    l<-bplapply(V(graph), greedyModuleSelection, graph=graph, data=data,
                phenotype=phenotype, covars=covars,  alpha=alpha, cacheFolder = cacheFolder,
                better.than.components=better.than.components,
                representative.method = representative.method, 
                scoringFunction=scoringFunction,
                BPPARAM = BPPARAM)
    
    for (module in l) {
        moduleCache<-module$moduleCache
        seed.scores <- c(seed.scores, module$seed.score)
        seed.betas <- c(seed.betas, unname(module$seed.beta))
        
        adjusted.score <- p.adjust(p = module$module.score, n = vcount(graph), method = correction.method)
        if (length(module$module) > 1 & adjusted.score < alpha) {
            newID <- dim(modules)[1] + 1
            
            modules <- rbind(modules, data.table(
                moduleID = newID, module.score = module$module.score,
                module.beta = module$beta, adjusted.score = adjusted.score
            ))
            
            nodes <- rbind(nodes, data.table(
                moduleID = newID, nodeID = module$module,
                name = vertex_attr(graph, "name", module$module),
                label = vertex_attr(graph, "label", module$module),
                order.added = seq_len(length(module$module)),
                score.after.adding = module$score.sequence
            ))
        }
    }
    
    
    seed_scores_DT <- data.table(nodeID = V(graph), seed.score = seed.scores, seed.beta = seed.betas)
    
    if (dim(modules)[1] > 0) {
        message("\nDeleting duplicate modules.")
        tmp_DT <- deleteDuplicates(nodes, modules, graph)
        modules <- unique(tmp_DT[, .(moduleID, module.score, module.beta, adjusted.score)])
        tmp_DT[, module.score := NULL]
        nodes <- tmp_DT
        
        if (better.than.components) {
            message("Deleting less significant contained modules.")
            tmp_DT <- deleteSupergraphs(nodes, modules, graph)
            modules <- unique(tmp_DT[, .(moduleID, module.score, module.beta, adjusted.score)])
            tmp_DT[, module.score := NULL]
            nodes <- tmp_DT
        }
        
        if (merge.overlapping) {
            message("Merging overlapping modules.")
            toMerge <- getToMerge(nodes)
            nodes <- toMerge[nodes, on = .(moduleID)]
            nodes[, moduleID := NULL]
            nodes[, module.beta := NULL]
            nodes[, adjusted.score := NULL]
            nodes[, adjusted.pval := NULL]
            setnames(nodes, c("moduleID", "nodeID", "name", "label", "order.added", "score.after.adding"))
            modules <- getMergedModules(graph, data, phenotype, covars, nodes, 
                                        scoringFunction=scoringFunction)
            modules$adjusted.score <- p.adjust(p = modules$module.score, n = vcount(graph), method = correction.method)
        }
        
        nodes <- nodes[seed_scores_DT, on = .(nodeID)]
        nodes[, name := vertex_attr(graph, "name", nodeID)]
        nodes[, label := vertex_attr(graph, "label", nodeID)]
        message(paste0("Number of modules found: ", dim(modules)[1]))
    } else {
        (message("No modules found."))
    }
    return(list(modules = modules, nodes = nodes, seeds = seed_scores_DT))
}