R/netboost.R

Defines functions nb_moduleEigengenes nb_plot_dendro nb_filter nb_transfer nb_summary nb_clust cut_trees cut_dendro tree_dendro tree_search nb_mcupgma nb_dist calculate_adjacency netboost

Documented in calculate_adjacency cut_dendro cut_trees nb_clust nb_dist nb_filter nb_mcupgma nb_moduleEigengenes nb_plot_dendro nb_summary nb_transfer netboost tree_dendro tree_search

## Formatting changed by RStudio-Autoformatting (*mostly* compatible to BioC
## preferences)

## Load WGCNA, try to hide the welcome-message (only a try as it is printed...)
## Workaround using environment to force WGCNA skipping it's welcome-message.
Sys.setenv(ALLOW_WGCNA_THREADS = 1)
suppressPackageStartupMessages(require(WGCNA))
Sys.unsetenv("ALLOW_WGCNA_THREADS")

## require(colorspace)
## require(parallel)

#' Netboost clustering.
#'
#' The Netboost clustering is performed in three subsequent steps. First, a
#' filter of important edges in the network is calculated. Next, pairwise
#' distances are calculated. Last, clustering is performed. For details see
#' Schlosser et al. doi...
#'
#' @name netboost
#' @param datan    Data frame were rows correspond to samples and columns to
#'   features.
#' @param stepno    Integer amount of boosting steps applied in the filtering
#'   step
#' @param filter_method    The following filtering methods are supported:
#'   "boosting" (non-zero coefficients in likelihood based boosting),
#'   "skip" (no filter), "kendall" (stats::cor.test),
#'   "spearman" (stats::cor.test), "pearson" (stats::cor.test)
#' @param until    Stop at index/column (if 0: iterate through all columns).
#'   For testing purposes in large datasets.
#' @param progress    Integer. If > 0, print progress after every X steps (
#' Progress might not be reported completely accurate due to parallel execution)
#' @param mode    Integer. Mode (0: x86, 1: FMA, 2: AVX). Features are only
#'   available if compiled accordingly and available on the hardware.
#' @param soft_power    Integer. Exponent of the transformation. Set
#'    automatically based on the scale free topology criterion if unspecified.
#' @param max_singleton    Integer. The maximal singleton in the clustering.
#'   Usually equals the number of features.
#' @param qc_plot    Logical. Should plots be created?
#' @param min_cluster_size  Integer. The minimum number of features in one
#' module.
#' @param ME_diss_thres    Numeric. Module Eigengene Dissimilarity Threshold for
#'   merging close modules.
#' @param cores    Integer. Amount of CPU cores used (<=1 : sequential)
#' @param scale    Logical. Should data be scaled and centered?
#' @param method    A character string specifying the method to be used for
#'   correlation coefficients.
#' @param verbose    Additional diagnostic messages.
#' @param n_pc        Number of principal components and variance explained
#'   entries to be calculated. The number of returned variance explained
#'   entries is currently ‘min(n_pc,10)’. If given ‘n_pc’ is 
#'   greater than 10, a warning is issued.
#' @param robust_PCs	Should PCA be calculated on ranked data (Spearman PCA)?
#'   Rotations will not correspond to original data if this is applied.
#' @param nb_min_varExpl    Minimum proportion of variance explained for
#'   returned module eigengenes. The number of PCs is capped at n_pc.
#' @return dendros    A list of dendrograms. For each fully separate part of the
#'   network an individual dendrogram.
#' @return names    A vector of feature names.
#' @return colors    A vector of numeric color coding in matching order of names
#'   and module eigengene names (color = 3 -> variable in ME3).
#' @return MEs    Aggregated module measures (Module eigengenes).
#' @return var_explained    Proportion of variance explained per module
#'   eigengene
#'   per principal component (max n_pc principal components are listed).
#' @return rotation    Matrix of variable loadings divided by their singular
#'   values. datan %*% rotation = MEs (with datan potentially scaled)
#' @return filter    Filter-Matrix as generated by the nb_filter function.
#'
#' @examples
#' data('tcga_aml_meth_rna_chr18',  package='netboost')
#' results <- netboost(datan=tcga_aml_meth_rna_chr18, stepno=20L,
#'    soft_power=3L, min_cluster_size=10L, n_pc=2, scale=TRUE,
#'    ME_diss_thres=0.25, qc_plot=TRUE)
#'
#' @export
netboost <-
    function(datan = NULL,
             stepno = 20L,
             filter_method = c("boosting","skip",
                "kendall", "spearman", "pearson"),
             until = 0L,
             progress = 1000L,
             mode = 2L,
             soft_power = NULL,
             max_singleton = ncol(datan),
             qc_plot = TRUE,
             min_cluster_size = 2L,
             ME_diss_thres = 0.25,
             n_pc = 1,
             robust_PCs = FALSE,
             nb_min_varExpl = 0.5,
             cores = as.integer(getOption("mc.cores", 2)),
             scale = TRUE,
             method = c("pearson", "kendall", "spearman"),
             verbose = getOption("verbose")) {
        # Initialize parallelization of WGCNA package.
        if (cores > 1)
            WGCNA::allowWGCNAThreads(nThreads = as.integer(cores))

        if (is.null(datan) || !is.data.frame(datan))
            stop("netboost: Error: datan must be a data.frame.")

        if (is.null(stepno) || !is.integer(stepno))
            stop("netboost: Error: stepno must be an integer.")

        if (is.null(min_cluster_size) || !is.integer(min_cluster_size))
            stop("netboost: Error: min_cluster_size must be an integer.")

        if (is.null(ME_diss_thres) || !is.numeric(ME_diss_thres))
            stop("netboost: Error: ME_diss_thres must be numeric.")

        verbose <- as.logical(verbose)

        if (ncol(datan) > 5e+06) {
            stop(
                "A bug in sparse UPGMA currently prevents analyses",
                " with more than 5 million features."
            )
        }

        if (scale) {
            if (verbose) message("Netboost: Scaling and centering data.")
            
            datan <-
                as.data.frame(scale(datan, center = TRUE, scale = TRUE))
        }

        if (verbose) message("Netboost: Initialising filter step.")

        filter <-
            nb_filter(
                datan = datan,
                stepno = stepno,
                until = until,
                filter_method = filter_method,
                progress = progress,
                cores = cores,
                mode = mode,
                verbose = verbose
            )
        
        if (verbose) message("Netboost: Finished filter step.")

        if (is.null(soft_power)) {
            # Random subset out of allocation
            random_features <-
                sample(ncol(datan), min(c(10000, ncol(datan))))
            # Call the network topology analysis function
            sft <- WGCNA::pickSoftThreshold(datan[, random_features])
            soft_power <- sft[["powerEstimate"]]
            if (verbose) {
                message(
                    paste0(
                        "Netboost: soft_power was set to ",
                        soft_power,
                        " based on the scale free topology criterion."
                    )
                )}
        }
        
        if (verbose) message("Netboost: Initialising distance calculation (",
        method[1],").")
        dist <-
            nb_dist(
                datan = datan,
                filter = filter,
                soft_power = soft_power,
                cores = cores,
                method = method
            )

        if (verbose) message("Netboost: Finished distance calculation.")

        if (verbose) message("Netboost: Initialising clustering step.")

        results <-
            nb_clust(
                datan = datan,
                filter = filter,
                dist = dist,
                min_cluster_size = min_cluster_size,
                ME_diss_thres = ME_diss_thres,
                n_pc = n_pc,
                robust_PCs = robust_PCs,
                nb_min_varExpl = nb_min_varExpl,
                max_singleton = max_singleton,
                cores = cores,
                method = method,
                qc_plot = qc_plot
            )

        results$filter <- filter
        
        if (verbose) {
            message("Netboost: Finished clustering step.")
            message("Netboost: Finished Netboost.")
        }

        invisible(results)
    }

#' Calculate network adjacencies for filter
#'
#' @name calculate_adjacency
#' @param datan    Data frame were rows correspond to samples and columns to
#'   features.
#' @param filter    Filter-Matrix as generated by the nb_filter function.
#' @param soft_power Integer. Exponent of the transformation. Set automatically
#'   based on the scale free topology criterion if unspecified.
#' @param method    A character string specifying the method to be used for
#'   correlation coefficients.
#' @return Vector with adjacencies for the filter
calculate_adjacency <-
    function(datan,
             filter,
             soft_power = 2,
             method = c("pearson", "kendall", "spearman")) {
        return(vapply(X=seq(
            from = 1,
            to = nrow(filter),
            by = 1
        ),
        FUN=function(i) {
            abs(WGCNA::cor(datan[, filter[i, 1]],
                           datan[, filter[i, 2]], method = method)) ^ soft_power
        },
        FUN.VALUE=1))
    }

#' Calculate distance (external wrapper for internal C++ function)
#' Parallelisation inside C++ program with RcppParallel.
#'
#' @name nb_dist
#' @param filter    Filter-Matrix as generated by the nb_filter function.
#' @param datan    Data frame were rows correspond to samples and columns to
#'   features.
#' @param soft_power    Integer. Exponent of the transformation. Set 
#'  automatically based on the scale free topology criterion if unspecified.
#' @param cores    Integer. Amount of CPU cores used (<=1 : sequential).
#' @param verbose    Additional diagnostic messages.
#' @param method    A character string specifying the method to be used for
#'   correlation coefficients.
#' @return Vector with distances.
#'
#' @examples
#'  data('tcga_aml_meth_rna_chr18', package='netboost')
#'  cores <- as.integer(getOption('mc.cores', 2))
#'  datan <- as.data.frame(scale(tcga_aml_meth_rna_chr18, center=TRUE,
#'  scale=TRUE))
#'  filter <- nb_filter(datan=datan, stepno=20L, until=0L, progress=1000L,
#'  cores=cores,mode=2L)
#'  dist <- nb_dist(datan=datan, filter=filter, soft_power=3L, cores=cores)
#'  summary(dist)
#'
#' @export
nb_dist <-
    function(filter,
             datan,
             soft_power = 2,
             cores = getOption("mc.cores", 2L),
             verbose = getOption("verbose"),
             method = c("pearson", "kendall", "spearman")) {
        # if (is.null(filter) || is.null(adjacency)) stop('Both filter and
        # adjacency must be provided')
        if (!(is.matrix(filter) &&
              (nrow(filter) > 0) && (ncol(filter) > 0)))
            stop("filter must be matrix with nrow > 0 and ncol >0")

        # if (!(is.vector(adjacency) && (length(adjacency) > 0)))
        # if (is.null(filter) || is.null(adjacency)) stop('Both filter and
        # adjacency must be provided')
        if (!(is.matrix(filter) &&
              (nrow(filter) > 0) && (ncol(filter) > 0)))
            stop("filter must be matrix with nrow > 0 and ncol >0")
        
        # if (!(is.vector(adjacency) && (length(adjacency) > 0)))
        # stop('adjacency is required a vector with length > 0')
        cores <- max(as.integer(cores), 1)
        
        ## RcppParallel amount of threads to be started
        setThreadOptions(numThreads = cores)
        
        return(cpp_dist_tom(
            filter,
            calculate_adjacency(
                datan = datan,
                filter = filter,
                soft_power = soft_power,
                method = method)
        ))
    }

#' Calculate dendrogram for a sparse distance matrix
#' (external wrapper MC-UPGMA clustering package Loewenstein et al.
#'
#' @name nb_mcupgma
#' @param filter    Filter-Matrix as generated by the nb_filter function.
#' @param dist    Distance-Matrix as generated by the nb_dist function.
#' @param max_singleton    Integer The maximal singleton in the clustering.
#'                      Usually equals the number of features.
#' @param cores    Integer Amount of CPU cores used (<=1 : sequential)
#' @param verbose    Logical Additional diagnostic messages.
#' @return Raw dendrogram to be processed by tree_search and tree_dendro.
#'
#' @examples
#'    data('tcga_aml_meth_rna_chr18', package='netboost')
#'    cores <- as.integer(getOption('mc.cores', 2))
#'    datan <- as.data.frame(scale(tcga_aml_meth_rna_chr18,
#'    center=TRUE, scale=TRUE))
#'    filter <- nb_filter(datan=datan, stepno=20L, until=0L,
#'                        progress=1000L, cores=cores, mode=2L)
#'    dist <- nb_dist(datan=datan, filter=filter, soft_power=3L, cores=cores)
#'    max_singleton = dim(tcga_aml_meth_rna_chr18)[2]
#'    forest <- nb_mcupgma(filter=filter, dist=dist,
#'                         max_singleton=max_singleton, cores=cores)
#'  head(forest)
#'
#' @export
nb_mcupgma <-
    function(filter,
             dist,
             max_singleton,
             cores = getOption("mc.cores", 2L),
             verbose = getOption("verbose")) {
        # Deletes all files under netboostTmpPath(), esp. clustering/iteration
        netboostTmpCleanup()
        
        if (max_singleton > 5e+06) {
            stop("A bug in sparse UPGMA currently prevents analyses",
                 " with more than 5 million features.")
        }
        
        if (!dir.create(file.path(netboostTmpPath(), "clustering")))
            stop("Unable to create: ",
                 file.path(netboostTmpPath(), "clustering"))
        
        file_dist_edges <-
            file.path(netboostTmpPath(), "clustering", "dist.edges")
        file_dist_tree <-
            file.path(netboostTmpPath(), "clustering", "dist.mcupgma_tree")
        
        # write.table(file='clustering/dist.edges',
        write.table(
            file = file_dist_edges,
            cbind(
                format(filter, scientific = FALSE,
                       trim = TRUE),
                format(dist, scientific = FALSE, trim = TRUE)
            ),
            row.names = FALSE,
            col.names = FALSE,
            sep = "\t",
            quote = FALSE
        )
        
        # Compress edges file (inplace)
        file_dist_edges_gz <- paste(file_dist_edges, ".gz", sep="")
        ret <- R.utils::gzip(file_dist_edges, destname = file_dist_edges_gz,
                             overwrite = TRUE)
        
        if (attr(ret, "nbrOfBytes") <= 0 || !R.utils::isGzipped(file_dist_edges_gz))
            warning(paste("Gzip maybe failed on:", file_dist_edges,
                          "Return:", as.character(ret),
                          " Bytes: ", attr(ret, "nbrOfBytes")))
        
        ret <-
            mcupgma_exec(
                exec = "cluster.pl",
                "-max_distance",
                1,
                "-max_singleton",
                max_singleton,
                "-iterations 1000 -heap_size 10000000 -num_hash_buckets 40",
                "-jobs",
                cores,
                "-retries 1",
                "-output_tree_file",
                file_dist_tree,
                "-split_unmodified_edges",
                max(cores,2L),
                file_dist_edges_gz,
                console = FALSE
            )
        
        if (verbose>1)
            message(ret)
        
        if (!file.exists(file_dist_tree) ||
            file.info(file_dist_tree)[["size"]] == 0)
            stop("No output file created. mcupgma error.")
        
        return(as.matrix(
            read.table(
                file = file_dist_tree,
                row.names = NULL,
                col.names = c("cluster_id1",
                              "cluster_id2", "distance", "cluster_id3")
            )
        ))
    }

#' Extracts independent trees from nb_mcupgma results (external wrapper for
#' internal C++ function)
#'
#' @name tree_search
#' @param forest    Raw dendrogram-matrix as generated by the nb_mcupgma
#'   function.
#' @return List
#'   
#' @examples
#'    data('tcga_aml_meth_rna_chr18',  package='netboost')
#'    cores <- as.integer(getOption('mc.cores', 2))
#'    datan <- as.data.frame(scale(tcga_aml_meth_rna_chr18, center=TRUE,
#'                                 scale=TRUE))
#'    filter <- nb_filter(datan=datan, stepno=20L, until=0L, progress=1000L,
#'                        cores=cores,mode=2L)
#'    dist <- nb_dist(datan=datan, filter=filter, soft_power=3L, cores=cores)
#'    max_singleton = dim(tcga_aml_meth_rna_chr18)[2]
#'    forest <- nb_mcupgma(filter=filter, dist=dist,
#'                         max_singleton=max_singleton, cores=cores)
#'    trees <- tree_search(forest)
#'    str(trees[[length(trees)]])
#'
#' @export
tree_search <- function(forest = NULL) {
    if (is.null(forest) || !is.matrix(forest))
        stop("forest must be provided (as matrix)")
    
    return(cpp_tree_search(forest))
}


#' Calculate the dendrogram for an individual tree
#'
#' @name tree_dendro
#' @param tree    A list with two elements. ids, which is an integer vector of
#'   feature identifiers and rows, which is an integer vector of selected rows
#'   in the corresponding forest
#' @param datan    Data frame were rows correspond to samples and columns to
#'   features.
#' @param forest    Raw dendrogram-matrix as generated by the nb_mcupgma 
#'   function.
#' @return List of tree specific objects including dendrogram, tree data and
#'   features.
tree_dendro <- function(tree,
                        datan,
                        forest) {
    index.features <- tree[["ids"]][tree[["ids"]] <= dim(datan)[2]]
    data_tree <- datan[, index.features]
    
    colnames_tree <- colnames(datan)[index.features]
    tree_cluster <- forest[tree[["rows"]], , drop = FALSE]
    
    all_ids <- rev(sort(unique(c(forest[, c(1, 2, 4)]))))
    none_tree_ids <- all_ids[!(all_ids %in% tree[["ids"]])]
    for (i in none_tree_ids) {
        for (j in c(1, 2, 4)) {
            tree_cluster[tree_cluster[, j] > i, j] <-
                tree_cluster[tree_cluster[,
                                          j] > i, j] - 1
        }
    }
    
    feature_names <- colnames(data_tree)
    
    cutpoint <- dim(data_tree)[2]
    dendro <- list()
    dendro[["merge"]] <- tree_cluster[, c(1, 2), drop = FALSE]
    dendro[["merge"]][dendro[["merge"]] <= cutpoint] <-
        -dendro[["merge"]][dendro[["merge"]] <= cutpoint]
    dendro[["merge"]][dendro[["merge"]] > 0] <-
        dendro[["merge"]][dendro[["merge"]] > 0] - cutpoint
    dendro[["merge"]] <- apply(dendro[["merge"]], c(1, 2), function(x) {
        (as.integer(x))
    })
    dendro[["height"]] <- tree_cluster[, 3]
    dendro[["order"]] <- seq(from = 1, to = cutpoint, by = 1)
    dendro[["labels"]] <- colnames_tree
    class(dendro) <- "hclust"
    b <- as.dendrogram(dendro)
    b.order <- order.dendrogram(b)
    dendro[["order"]] <- b.order
    
    return(list(
        dendro = dendro,
        data = data_tree,
        names = feature_names
    ))
}

#' Module detection for an individual tree
#'
#' @name cut_dendro
#' @param tree_dendro    List of tree specific objects including dendrogram, 
#'   tree
#'   data and features originating from the tree_dendro function.
#' @param min_cluster_size    Integer. The minimum number of features in one 
#'   module.
#' @param datan    Data frame were rows correspond to samples and columns to
#'   features.
#' @param ME_diss_thres    Numeric. Module Eigengene Dissimilarity Threshold for
#'   merging close modules.
#' @param name_of_tree    String. Annotating plots and messages.
#' @param qc_plot    Logical. Should plots be created?
#' @param n_pc    Number of principal components and variance explained
#'   entries to be calculated. The number of returned variance explained entries
#'   is currently ‘min(n_pc,10)’. If given ‘n_pc’ is greater than 10, a warning 
#'   is issued.
#' @param robust_PCs	Should PCA be calculated on ranked data (Spearman PCA)?
#'   Rotations will not correspond to original data if this is applied.
#' @param nb_min_varExpl    Minimum proportion of variance explained for
#'   returned module eigengenes. The number of PCs is capped at n_pc.
#' @param method    A character string specifying the method to be used for
#'   correlation coefficients.
#' @return List
cut_dendro <-
    function(tree_dendro,
             min_cluster_size = 2L,
             datan,
             ME_diss_thres,
             name_of_tree = "",
             qc_plot = TRUE,
             n_pc = 1,
             robust_PCs = FALSE,
             nb_min_varExpl = 0.5,
             method = c("pearson", "kendall", "spearman")) {
        dynamicMods <-
            dynamicTreeCut::cutreeDynamic(
                dendro = tree_dendro[["dendro"]],
                method = "tree",
                deepSplit = TRUE,
                minClusterSize = min_cluster_size
            )
        ### Merging of Dynamic Modules ### Calculate eigengenes
        MEList <-
            netboost::nb_moduleEigengenes(
                expr = tree_dendro[["data"]],
                colors = dynamicMods,
                n_pc = n_pc,
                robust = robust_PCs,
                nb_min_varExpl = nb_min_varExpl
            )
        MEs <- MEList[["nb_eigengenes"]]
        # Calculate dissimilarity of module eigengenes
        MEDiss <- 1 - abs(WGCNA::cor(MEs,method = method))
        # Cluster module eigengenes
        if (length(MEDiss) > 1) {
            METree <- hclust(as.dist(MEDiss), method = "average")
            if (qc_plot == TRUE) {
                graphics::plot(
                    METree,
                    main = paste0(name_of_tree,
                                  "Clustering of module eigengenes"),
                    xlab = "",
                    sub = ""
                )
                graphics::abline(h = ME_diss_thres, col = "red")
            }
            
            
            save_verbose <- getOption("verbose")
            options("verbose" = FALSE)

#             colors_old <- dynamicMods
#             colors_new <- c()
#             while(length(unique(colors_old))>length(unique(colors_new))){
#             	if(length(colors_new)>0){
#             	colors_old <- colors_new            	
#             	}
#         		MEcor <- abs(WGCNA::cor(netboost::nb_moduleEigengenes(
#                 	expr = tree_dendro[["data"]],
#                 	colors = colors_old,
#                 	n_pc = 1,
#                 	robust = robust_PCs)[["nb_eigengenes"]],method = method))
#                 MEcor <- MEcor[rownames(MEcor)!="ME0_pc1",colnames(MEcor)!="ME0_pc1"]
#                 colors_match <- gsub(pattern="_pc1",replacement="",x=gsub(pattern="ME",replacement="",x=colnames(MEcor)))
#                 MEcor <- MEcor - diag(ncol(MEcor))
#                 colors_match <- cbind(colors_match[which(MEcor>ME_diss_thres,arr.ind=TRUE)[,"row"]],colors_match[which(MEcor>ME_diss_thres,arr.ind=TRUE)[,"col"]])
#                 colors_match <- colors_match[colors_match[,2] < colors_match[,1],,drop=FALSE]
#                 colors_new <- colors_old
#                 if(nrow(colors_match)>0){
#                     for(i in 1:nrow(colors_match)){
#                         colors_new[colors_new==colors_match[i,1]] <- colors_match[i,2]
#                     }
#                 }
#             }
#             mergedColors <- colors_new
#  

            colors_old <- dynamicMods
            colors_new <- c()
            while(length(unique(colors_old))>length(unique(colors_new))){
            	if(length(colors_new)>0){
            		colors_old <- colors_new            	
            	}
        		MEs <- netboost::nb_moduleEigengenes(
                	expr = tree_dendro[["data"]],
                	colors = colors_old,
                	n_pc = 1,
                	robust = robust_PCs)[["nb_eigengenes"]]
				colors_new <- colors_old            	
        		MEDiss <- 1 - abs(WGCNA::cor(MEs,method = method))
       			if (length(MEDiss) > 1) {
            		METree <- hclust(as.dist(MEDiss), method = "average")
            		clust <- WGCNA::cutreeStatic(METree, cutHeight = ME_diss_thres, minSize = 2)
            		for(i in clust[clust!=0]){
            			colors_match <- METree$labels[clust==i]
            			colors_match <- colors_match[colors_match!="ME0_pc1"]
            			if(length(colors_match)>1){
							colors_match <- gsub(pattern="_pc1",replacement="",x=gsub(pattern="ME",replacement="",x=colors_match))
							colors_new[colors_new%in%colors_match] <- as.character(min(as.integer(colors_match)))	
            			}
        			}
				}
			}
            mergedColors <- colors_new

            
#             merged <-
#                 WGCNA::mergeCloseModules(
#                     exprData = tree_dendro[["data"]],
#                     dynamicMods,
#                     cutHeight = ME_diss_thres,
#                     corOptions = list(method = method, use = "p"),
#                     verbose = 3
#                 )
#             mergedColors <- merged[["colors"]]
            # Calculate eigengenes

           options("verbose" = save_verbose)
            MEList <-
                netboost::nb_moduleEigengenes(
                    expr = tree_dendro[["data"]],
                    colors = mergedColors,
                    n_pc = n_pc,
                    robust = robust_PCs,
                    nb_min_varExpl = nb_min_varExpl
                )
            MEs <- MEList[["nb_eigengenes"]]
            MEDiss <- 1 - abs(WGCNA::cor(MEs,method = method))
            if (length(MEDiss) > 1) {
                METree <- hclust(as.dist(MEDiss), method = "average")
                if (qc_plot == TRUE &
                    length(tree_dendro[["dendro"]][["labels"]]) > 2) {
                    graphics::plot(
                        METree,
                        main = paste0(
                            name_of_tree,
                            "Clustering of merged module eigengenes"
                        ),
                        xlab = "",
                        sub = ""
                    )
                    WGCNA::plotDendroAndColors(
                        dendro = tree_dendro[["dendro"]],
                        colors = mergedColors,
                        "Merged Dynamic",
                        dendroLabels = FALSE,
                        hang = 0.01,
                        addGuide = TRUE,
                        guideHang = 0.05,
                        main = paste0(name_of_tree, "Cluster Dendrogram")
                    )
                }
            }
        } else {
            message("\nOnly one module in ", name_of_tree, ".\n")
            mergedColors <- dynamicMods
            if (length(tree_dendro[["dendro"]][["labels"]]) > 2) {
                if (qc_plot == TRUE) {
                    graphics::plot(
                        tree_dendro[["dendro"]],
                        main = paste0(
                            name_of_tree,
                            "Cluster Dendrogram ",
                            "(Tree maximaly consists out of one module.)"
                        )
                    )
                }
            } else {
                message(
                    "\nOnly two elements in the one module in ",
                    name_of_tree,
                    " (no plot generated).\n"
                )
            }
        }
        message(
            "\nNetboost extracted ",
            length(table(mergedColors)),
            " modules (including background) with an average size of ",
            mean(table(mergedColors)[-1]),
            " (excluding background) from ",
            substr(name_of_tree,
                   start = 1, stop = (nchar(name_of_tree) - 1)),
            ".\n"
        )
        return(
            list(
                colors = mergedColors,
                MEs = MEs,
                var_explained = MEList[["var_explained"]],
                rotation = MEList[["rotation"]]
            )
        )
        # svd_PCs = MEList[["eigengenes"]],
    }

#' Module detection for the results from a nb_mcupgma call
#'
#' @name cut_trees
#' @param trees    List of trees, where one tree is a list of ids and rows
#' @param datan    Data frame were rows correspond to samples and columns to
#'   features.
#' @param forest    Raw dendrogram-matrix as generated by the nb_mcupgma 
#'   function.
#' @param min_cluster_size    Integer. The minimum number of features in one
#'   module.
#' @param ME_diss_thres    Numeric. Module Eigengene Dissimilarity Threshold for
#'   merging close modules.
#' @param qc_plot    Logical. Should plots be created?
#' @param n_pc    Number of principal components and variance explained
#'   entries to be calculated. The number of returned variance explained entries
#'   is currently ‘min(n_pc,10)’. If given ‘n_pc’ is greater than 10, a warning 
#'   is issued.
#' @param robust_PCs	Should PCA be calculated on ranked data (Spearman PCA)?
#'   Rotations will not correspond to original data if this is applied.
#' @param nb_min_varExpl    Minimum proportion of variance explained for
#'   returned module eigengenes. The number of PCs is capped at n_pc.
#' @param method    A character string specifying the method to be used for
#'   correlation coefficients.
#' @return List
#'
#' @examples
#' data('tcga_aml_meth_rna_chr18',  package='netboost')
#'  cores <- as.integer(getOption('mc.cores', 2))
#'  datan <- as.data.frame(scale(tcga_aml_meth_rna_chr18, center=TRUE,
#'  scale=TRUE))
#'  filter <- nb_filter(datan=datan, stepno=20L, until=0L, progress=1000L,
#'  cores=cores,mode=2L)
#'  dist <- nb_dist(datan=datan, filter=filter, soft_power=3L, cores=cores)
#'  max_singleton = dim(tcga_aml_meth_rna_chr18)[2]
#'  forest <- nb_mcupgma(filter=filter, dist=dist, max_singleton=max_singleton,
#'  cores=cores)
#'  trees <- tree_search(forest)
#'  results <- cut_trees(trees=trees,datan=datan, forest=forest,
#'  min_cluster_size=10L, ME_diss_thres=0.25, qc_plot=TRUE)
#'
#' @export
cut_trees <-
    function(trees,
             datan,
             forest,
             min_cluster_size = 2L,
             ME_diss_thres,
             qc_plot = TRUE,
             n_pc = 1,
             robust_PCs = FALSE,
             nb_min_varExpl = 0.5,
             method = c("pearson", "kendall", "spearman")) {
        res <- list()
        i <- 1L
        
        for (tree in trees) {
            tree_dendro_res <-
                tree_dendro(tree = tree,
                            datan = datan,
                            forest = forest)
            res[[i]] <- list()
            res[[i]][["dendro"]] <- tree_dendro_res[["dendro"]]
            res[[i]][["data"]] <- tree_dendro_res[["data"]]
            res[[i]][["names"]] <- tree_dendro_res[["names"]]
            cut_dendro_res <-
                cut_dendro(
                    tree_dendro = tree_dendro_res,
                    min_cluster_size = min_cluster_size,
                    datan = datan,
                    ME_diss_thres = ME_diss_thres,
                    name_of_tree = paste0("Tree ",
                                          i, ":"),
                    qc_plot = qc_plot,
                    n_pc = n_pc,
                    robust_PCs = robust_PCs,
                    nb_min_varExpl = nb_min_varExpl,
                    method = method
                )
            res[[i]][["colors"]] <- cut_dendro_res[["colors"]]
            res[[i]][["MEs"]] <- cut_dendro_res[["MEs"]]
            # res[[i]][['svd_PCs']] <- cut_dendro_res[["svd_PCs"]]
            res[[i]][["var_explained"]] <-
                cut_dendro_res[["var_explained"]]
            res[[i]][["rotation"]] <- cut_dendro_res[["rotation"]]
            i <- i + 1
        }
        
        return(res)
    }

#' Netboost clustering step
#'
#' @name nb_clust
#' @param filter    Filter-Matrix as generated by the nb_filter function.
#' @param dist    Distance-Matrix as generated by the nb_dist function.
#' @param datan    Data frame were rows correspond to samples and columns to
#'   features.
#' @param max_singleton    Integer. The maximal singleton in the clustering.
#'   Usually equals the number of features.
#' @param min_cluster_size    Integer. The minimum number of features in one 
#'   module.
#' @param ME_diss_thres    Numeric. Module Eigengene Dissimilarity Threshold for
#'   merging close modules.
#' @param cores    Integer. Amount of CPU cores used (<=1 : sequential)
#' @param qc_plot Logical. Create plot.
#' @param n_pc    Number of principal components and variance explained
#'   entries to be calculated. The number of returned variance explained entries
#'   is currently ‘min(n_pc,10)’. If given ‘n_pc’ is greater than 10, a warning 
#'   is issued.
#' @param robust_PCs	Should PCA be calculated on ranked data (Spearman PCA)?
#'   Rotations will not correspond to original data if this is applied.
#' @param nb_min_varExpl    Minimum proportion of variance explained for
#'   returned module eigengenes. The number of PCs is capped at n_pc.
#' @param method    A character string specifying the method to be used for
#'   correlation coefficients.
#' @return List
#'
#' @examples
#'  data('tcga_aml_meth_rna_chr18',  package='netboost')
#'  cores <- as.integer(getOption('mc.cores', 2))
#'  datan <- as.data.frame(scale(tcga_aml_meth_rna_chr18, center=TRUE, 
#'  scale=TRUE))
#'  filter <- nb_filter(datan=datan, stepno=20L, until=0L, progress=1000L,
#'  cores=cores,mode=2L)
#'  dist <- nb_dist(datan=datan, filter=filter, soft_power=3L, cores=cores)
#'  max_singleton = dim(tcga_aml_meth_rna_chr18)[2]
#'  pdf("test.pdf",width=30)
#'  sum_res <- nb_clust(filter=filter, dist=dist, datan=datan,
#'  max_singleton=max_singleton, min_cluster_size=10L, ME_diss_thres=0.25,
#'  cores=cores, qc_plot=TRUE, n_pc=2L, nb_min_varExpl=0.5)
#'  dev.off()
#' @export
nb_clust <-
    function(filter,
             dist,
             datan,
             max_singleton = dim(datan)[2],
             min_cluster_size = 2L,
             ME_diss_thres = 0.25,
             cores = getOption("mc.cores", 2L),
             qc_plot = TRUE,
             n_pc = 1,
             robust_PCs = FALSE,
             nb_min_varExpl = 0.5,
             method = c("pearson", "kendall", "spearman")) {
        forest <-
            nb_mcupgma(
                filter = filter,
                dist = dist,
                max_singleton = max_singleton,
                cores = cores
            )
        trees <- tree_search(forest)
        results <-
            cut_trees(
                trees = trees,
                datan = datan,
                forest = forest,
                min_cluster_size = min_cluster_size,
                ME_diss_thres = ME_diss_thres,
                qc_plot = qc_plot,
                n_pc = n_pc,
                robust_PCs = robust_PCs,
                nb_min_varExpl = nb_min_varExpl,
                method = method
            )
#         sum_res <- nb_summary(clust_res = results, qc_plot = qc_plot)
        sum_res <- nb_summary(clust_res = results)
        
        return(sum_res)
    }

#' Summarize results from a forest. Plot trees together.
#'
#' @name nb_summary
#' @param clust_res    Clustering results from cut_trees call.
#' @return List
#'
#' @examples
#' data('tcga_aml_meth_rna_chr18',  package='netboost')
#'  cores <- as.integer(getOption('mc.cores', 2))
#'  datan <- as.data.frame(scale(tcga_aml_meth_rna_chr18, center=TRUE,
#'  scale=TRUE))
#'  filter <- nb_filter(datan=datan, stepno=20L, until=0L, progress=1000L,
#'  cores=cores,mode=2L)
#'  dist <- nb_dist(datan=datan, filter=filter, soft_power=3L, cores=cores)
#'  max_singleton = dim(tcga_aml_meth_rna_chr18)[2]
#'  forest <- nb_mcupgma(filter=filter,dist=dist,max_singleton=max_singleton,
#'  cores=cores)
#'  trees <- tree_search(forest)
#'  results <- cut_trees(trees=trees,datan=datan, forest=forest,
#'  min_cluster_size=10L, ME_diss_thres=0.25, qc_plot=FALSE)
#'  sum_res <- nb_summary(clust_res=results)
#'
#' @export
nb_summary <- function(#qc_plot = TRUE,
    clust_res) {
    res <- vector("list")
    # res$clust_res <- clust_res
    n_MEs <- 0
    n_MEs_background <- 0
    for (tree in seq(from = 1,
                     to = length(clust_res),
                     by = 1)) {
        res[["dendros"]][[tree]] <- clust_res[[tree]][["dendro"]]
        res[["names"]] <- c(res[["names"]], clust_res[[tree]][["names"]])
        tmp.col <- clust_res[[tree]][["colors"]]
        tmp.col.new <- tmp.col
        tmp_MEs <- clust_res[[tree]][["MEs"]]
        tmp_MEs_new <- tmp_MEs
        tmp_rotation <- clust_res[[tree]][["rotation"]]
        tmp_rotation <-
            do.call("cbind", lapply(
                tmp_rotation,
                FUN = function(x) {
                    y <-
                        matrix(0,
                               nrow = length(clust_res[[tree]][["names"]]),
                               ncol = ncol(x))
                    rownames(y) <- clust_res[[tree]][["names"]]
                    y[rownames(x),] <- x
                    colnames(y) <- colnames(x)
                    return(y)
                }
            ))
        tmp_rotation_new <- tmp_rotation
        for (col in unique(tmp.col)) {
            if (col != 0 | length(unique(tmp.col)) == 1) {
                n_MEs <- n_MEs + 1
                tmp.col.new[tmp.col == col] <- n_MEs
                colnames(tmp_MEs_new)[grepl(pattern = paste0("ME", col, "_"),
                                            colnames(tmp_MEs))] <-
                    gsub(
                        pattern = paste0("ME",
                                         col, "_"),
                        replacement = paste0("ME", (n_MEs), "_"),
                        colnames(tmp_MEs_new)[grepl(pattern = paste0("ME",
                                                                     col, "_"),
                                                    colnames(tmp_MEs))]
                    )
                colnames(tmp_rotation_new)[grepl(pattern = paste0("ME", col,
                                                                  "_"),
                                                 colnames(tmp_rotation))] <-
                    gsub(
                        pattern = paste0("ME", col,
                                         "_"),
                        replacement = paste0("ME", (n_MEs), "_"),
                        colnames(tmp_rotation_new)[grepl(pattern =
                                                paste0("ME", col, "_"),
                                                colnames(tmp_rotation))]
                    )
            }
            if (col == 0 & length(unique(tmp.col)) != 1) {
                n_MEs_background <- n_MEs_background + 1
                tmp.col.new[tmp.col == col] <- -n_MEs_background
                colnames(tmp_MEs_new)[grepl(pattern = "ME0_",
                                            colnames(tmp_MEs))] <-
                    gsub(
                        pattern = "ME0_",
                        replacement = paste0("ME0_", n_MEs_background, "_"),
                        colnames(tmp_MEs_new)[grepl(pattern = "ME0_",
                                                    colnames(tmp_MEs))]
                    )
                colnames(tmp_rotation_new)[grepl(pattern = "ME0_",
                                                 colnames(tmp_rotation))] <-
                    gsub(
                        pattern = "ME0_",
                        replacement = paste0("ME0_", n_MEs_background, "_"),
                        colnames(tmp_rotation_new)[grepl(pattern = "ME0_",
                                                     colnames(tmp_rotation))]
                    )
            }
        }
        res[["colors"]] <- c(res[["colors"]], tmp.col.new)
        if ("MEs" %in% names(res)) {
            res[["MEs"]] <- cbind(res[["MEs"]], tmp_MEs_new)
        } else {
            res[["MEs"]] <- tmp_MEs_new
        }
        if ("rotation" %in% names(res)) {
            res[["rotation"]] <- c(res[["rotation"]], list(tmp_rotation_new))
        } else {
            res[["rotation"]] <- list(tmp_rotation_new)
        }
        ## if('svd_PCs' %in% names(res)){res$svd_PCs <- cbind(res$svd_PCs,
        ## clust_res[[tree]]$svd_PCs)}else{res$svd_PCs <-
        ## clust_res[[tree]]$svd_PCs}
        if ("var_explained" %in% names(res)) {
            res[["var_explained"]] <-
                cbind(res[["var_explained"]],
                clust_res[[tree]][["var_explained"]])
        } else {
            res[["var_explained"]] <- clust_res[[tree]][["var_explained"]]
        }
    }
    rownames(res[["var_explained"]]) <-
        paste0("PC", seq(
            from = 1,
            to = nrow(res[["var_explained"]]),
            by = 1
        ))
    colnames(res[["var_explained"]]) <-
        unique(unlist(lapply(
            strsplit(split = "_pc",
                     colnames(res[["MEs"]])),
            FUN = function(x) {
                x[1]
            }
        )))
    
    res[["rotation"]] <-
        do.call("cbind", lapply(
            res[["rotation"]],
            FUN = function(x) {
                y <- matrix(0, nrow = length(res[["names"]]), ncol = ncol(x))
                rownames(y) <- res[["names"]]
                y[rownames(x),] <- x
                colnames(y) <- colnames(x)
                return(y)
            }
        ))
    
    message(
        "\nNetboost detected ",
        n_MEs,
        " modules and ",
        n_MEs_background,
        " background modules in ",
        length(clust_res),
        " trees resulting in ",
        ncol(res[["MEs"]]),
        " aggreagate measures.\n"
    )
    message("Average size of the modules was ",
        mean(table(res[["colors"]][!(res[["colors"]] <= 0)])), ".\n")
    message(
        sum(res[["colors"]] <= 0),
        " of ",
        length(res[["colors"]]),
        " features (",
        (sum(res[["colors"]] <=
                 0) * 100 /
             length(res[["colors"]])),
        "%) were not assigned to modules.\n"
    )
    
#     if (qc_plot == TRUE) {
#         nb_plot_dendro(nb_summary = res, labels = FALSE)
#     }
    return(res)
}

#' Transfer of Netboost clustering to new data.
#'
#' @name nb_tranfer
#' @param nb_summary    Netboost results as generated by the nb_summary 
#'   function.
#' @param new_data    Data frame were rows correspond to samples and columns to
#'   features.
#' @param scale    Logical. Should data be scaled and centered?
#' @param only_module_membership    Logical. Should only module memberships be
#'   transfered and PCs be newly computed?
#' @param robust_PCs	Should PCA be calculated on ranked data (Spearman PCA)?
#'   Rotations will not correspond to original data if this is applied.
#' @return List
#'
#' @examples
#' data('tcga_aml_meth_rna_chr18',  package='netboost')
#' results <- netboost(datan = tcga_aml_meth_rna_chr18, stepno = 20L,
#'     soft_power = 3L, min_cluster_size = 10L, n_pc = 2, scale=TRUE,
#'     ME_diss_thres = 0.25, qc_plot=FALSE)
#' ME_transfer <- nb_transfer(nb_summary = results,
#'     new_data = tcga_aml_meth_rna_chr18,
#'     scale = TRUE)
#' all(round(results[["MEs"]], 12) == round(ME_transfer, 12))
#'
#' @export
nb_transfer <-
    function(nb_summary = NULL,
             new_data = NULL,
             scale = FALSE,
             robust_PCs = FALSE,
             only_module_membership = FALSE) {
        if (!exists("new_data"))
            stop("datan must be provided")
        
        if (!(is.data.frame(new_data) &&
              (nrow(new_data) > 0) && (ncol(new_data) >
                                       0)))
            stop("new_data must be a data frame with dim() > (0,0).")
        
        if (length(nb_summary[["colors"]]) != ncol(new_data)) {
            stop("The number of features in new_data must ",
                       "correspond to the number in nb_summary.")
        }
        
        if (!identical(sort(nb_summary[["names"]]), sort(colnames(new_data)))) {
            stop("The features in new_data (colnames) must ", 
                       "correspond to the features in nb_summary ",
                       "(nb_summary$names).")
        }
        
        new_data <- new_data[, nb_summary[["names"]]]
        
        if (scale) {
            new_data <-
                as.data.frame(scale(new_data, center = TRUE, scale = TRUE))
        }
        
        if (!only_module_membership) {
            MEs <- as.matrix(new_data) %*% nb_summary[["rotation"]]
        } else {
            MEs <- netboost::nb_moduleEigengenes(expr = new_data,robust = robust_PCs,
                                                 colors = nb_summary[["colors"]])[["nb_eigengenes"]]
            colnames(MEs)[lapply(strsplit(x = colnames(MEs), split = "-"), FUN = length) > 1] <-
                paste0("ME0_", substring(text = colnames(MEs)[lapply(strsplit(x = colnames(MEs),
                                                                              split = "-"), FUN = length) > 1], first = 4))
        }
        
        MEs <- MEs[, colnames(nb_summary[["MEs"]])]
        rownames(MEs) <- rownames(new_data)
        return(MEs)
    }


#' Boosting via C++ function. Parallelisation by R-package parallel with forking
#' (overhead of this method does not fall into account as single steps are
#' ~10s).
#'
#' Parallelisation via multicore (via 'parallel'-package). So *nix only atm.
#'
#' @name nb_filter
#' @param datan    Data frame were rows correspond to samples and columns to
#'   features.
#' @param stepno    Integer amount of boosting steps
#' @param until    Stop at index/column (if 0: iterate through all columns)
#' @param progress    Integer. If > 0, print progress after every X steps (mind:
#'   parallel!)
#' @param filter_method    The following filtering methods are supported:
#'   "boosting" (non-zero coefficients in likelihood based boosting),
#'   "skip" (no filter), "kendall" (stats::cor.test),
#'   "spearman" (stats::cor.test), "pearson" (stats::cor.test)
#' @param cores    Integer. Amount of CPU cores used (<=1 : sequential)
#' @param mode    Integer. Mode (0: x86, 1: FMA, 2: AVX). Features are only
#'   available if compiled accordingly and available on the hardware.
#' @param verbose    Additional diagnostic messages.
#' @return matrix n times 2 matrix with the indicies of the n unique entrees of
#'   the filter
#'
#'
#' @examples
#' data('tcga_aml_meth_rna_chr18',  package='netboost')
#'  cores <- as.integer(getOption('mc.cores', 2))
#'  datan <- as.data.frame(scale(tcga_aml_meth_rna_chr18, center=TRUE,
#'  scale=TRUE))
#'  filter <- nb_filter(datan=datan, stepno=20L, until=0L, progress=1000L,
#'  cores=cores,mode=2L)
#'  head(filter)
#'  nrow(filter)/(ncol(datan)*(ncol(datan)-1)/2) # proportion of potential undirected edges
#'
#' @export
nb_filter <-
    function(datan,
             stepno = 20L,
             until = 0L,
             progress = 1000L,
             filter_method = c("boosting","skip",
                 "kendall", "spearman", "pearson"),
             cores = getOption("mc.cores",
                               2L),
             mode = 2L,
             verbose = getOption("verbose")) {
        if (!exists("datan"))
            stop("datan must be provided")
        
        if (!(is.data.frame(datan) &&
              (nrow(datan) > 0) && (ncol(datan) > 0)))
            stop("datan must be a data frame with dim() > (0,0).")
        
        if (!(is.integer(stepno) && (stepno > 0)))
            stop("stepno must be an integer > 0.")
        
        if (!(is.integer(until) && (until >= 0)))
            stop("until must be an integer >= 0.")
        
        if (until == 0)
            until <- ncol(datan)
        
        # check mode
        if (!(mode %in% c(0, 1, 2))) {
            stop("mode must be 0 (x86), 1 (FMA) or 2 (AVX).")
        }
        
        if (ncol(datan) > 5e+06) {
            stop("A bug in sparse UPGMA currently prevents analyses ",
                       "with more than 5 million features.")
        }
        
        if(filter_method[1] == "boosting"){
            if (verbose) message(paste("Netboost: Filtering (boosting)"))
                ## Initialize data structures for optimized boosting (once)
        cpp_filter_base(as.matrix(datan), stepno, mode_ = mode)
        
        ## Parallelization 'conventional' via mclapply.
        if (cores > 1) {
            # print(paste('Parallel version:', cores, 'cores'))
            
            boosting_filter <- mclapply(seq(1, until), function(x) {
                if ((((x - 1) %% progress) == 0)) {
                    message(sprintf("idx: %d (%.1f%%) - %s", x,
                                  x * 100 / until, date()))
                }
                
                cpp_filter_step(x)
            }, mc.cores = cores)
        } else {
            ## Sequential function for debugging.  print(paste('Sequential version'))
            boosting_filter <- lapply(seq(1, until), function(x) {
                if ((((x - 1) %% progress) == 0)) {
                    message(sprintf("idx: %d (%.1f%%) - %s", x,
                                  x * 100 / until, date()))
                }
                
                cpp_filter_step(x)
            })
        }
        
        ## Important!: stop (free memory, else suitable memory is still blocked)
        cpp_filter_end()
        
        filter <-
            do.call("rbind", lapply(seq_along(boosting_filter), function(x) {
                return(as.data.frame(cbind(
                    as.integer(boosting_filter[[x]]),
                    as.integer(rep(x,
                                   length(
                                       boosting_filter[[x]]
                                   )))
                )))
            }))

        filter <- unique(t(apply(filter, 1, sort)))
        } else if (filter_method[1] == "skip"){
            if (verbose) message(paste("Netboost: Filtering (skip)"))
            filter <- t(utils::combn(x=ncol(datan),m=2))
        } else if (filter_method[1] %in% c("pearson", "kendall", "spearman")){
            if (verbose) message(paste0("Netboost: Filtering (",
            filter_method[1],")"))
            combs <- utils::combn(x=ncol(datan),m=2)
            index <- mclapply(1:ncol(combs),FUN=function(i){
            stats::cor.test(x=datan[,combs[1,i]],y=datan[,
              combs[2,i]],alternative = "two.sided",
              method = filter_method[1])$p.value < 0.05
            })
            filter <- t(combs[,unlist(index)])
        } else {
            stop("filter_method in nb_filter not supported.")
        }
        colnames(filter) <- c("cluster_id1", "cluster_id2")
        rownames(filter) <- seq(from = 1,
                                to = nrow(filter),
                                by = 1)

        if (verbose) message("Netboost: Filter includes ",
        nrow(filter)," of ",(ncol(datan)*(ncol(datan)-1)/2)," potential edges.")
        return(filter)
    }

#' Plot dendrogram from Netboost output.
#'
#' @name nb_plot_dendro
#' @param nb_summary    Netboost results as generated by the nb_summary function.
#' @param labels    Boolean flag whether labels should be attached to
#'   the leafs.
#' @param main    Plot title.
#' @param colorsrandom    Boolean flag whether module colors should be
#'   shuffeled.
#' @return invisible null
#'
#' @examples
#' data('tcga_aml_meth_rna_chr18',  package='netboost')
#' results <- netboost(datan = tcga_aml_meth_rna_chr18, stepno = 20L,
#' soft_power = 3L, min_cluster_size = 10L, n_pc = 2, scale=TRUE,
#' ME_diss_thres = 0.25, qc_plot = FALSE)
#' set.seed(1234) # reproducible but shuffled color-module matching
#' nb_plot_dendro(nb_summary = results, labels = FALSE, main = 'Test',
#' colorsrandom = TRUE)
#'
#' @export
nb_plot_dendro <-
    function(nb_summary = NULL,
             labels = FALSE,
             main = "",
             colorsrandom = FALSE) {
        if (!exists("nb_summary"))
            stop("Netboost output (nb_summary) must be provided.")
        
        colorHeight <- 0.2
        graphics::layout(matrix(seq(
            from = 1,
            to = (2 * length(nb_summary[["dendros"]])),
            by = 1
        ),
        nrow = 2),
        heights = c(1 - colorHeight, colorHeight))
        
        last_col <- 0
        n_colors <- length(unique(nb_summary[["colors"]]))
        middle <- floor(n_colors / 2)
        if (colorsrandom) {
            shuffel_index <- sample(x = n_colors, size = n_colors)
        } else {
            shuffel_index <- seq(from = 1,
                                 to = n_colors,
                                 by = 1)
        }
        shuffel_index <-
            c(shuffel_index[(2 * middle + 1)][(2 * middle + 1) == n_colors],
              rbind(shuffel_index[seq(from = 1, to = middle, by = 1)], shuffel_index[seq(from = (middle +
                                                                                                     1),
                                                                                         to = (2 * middle),
                                                                                         by = 1)]))
        plot_colors <-
            colorspace::rainbow_hcl(n = (length(unique(
                nb_summary[["colors"]]
            ))))[shuffel_index][as.factor(nb_summary[["colors"]])]
        plot_colors[nb_summary[["colors"]] <= 0] <- grDevices::gray(level = 0.7)
        for (tree in seq_along(nb_summary[["dendros"]])) {
            graphics::par(mar = c(0, 4, 8, 4))
            first_col <- last_col + 1
            last_col <-
                last_col + length(nb_summary[["dendros"]][[tree]][["labels"]])
            if (labels) {
                graphics::plot(nb_summary[["dendros"]][[tree]],
                     labels = nb_summary[["names"]][seq(from = first_col,
                                                   to = last_col,
                                                   by = 1)],
                     main = main)
            } else {
                graphics::plot(nb_summary[["dendros"]][[tree]],
                     labels = FALSE,
                     main = main)
            }
            graphics::par(mar = c(4, 4, 0, 4))
            WGCNA::plotColorUnderTree(nb_summary[["dendros"]][[tree]],
                                      colors = plot_colors[seq(from = first_col,
                                                               to = last_col,
                                                               by = 1)],
                                      rowLabels = "")
        }
        invisible(NULL)     # Only to not have empty value section
    }


#' Netboost module aggregate extraction.
#'
#' This is a modification of WGCNA::moduleEigengenes() (version WGCNA_1.66) to
#' include more than the first principal component. For details see
#' WGCNA::moduleEigengenes().
#'
#' @name nb_moduleEigengenes
#' @param expr    Expression data for a single set in the form of a data frame
#'   where rows are samples and columns are genes (probes).
#' @param colors    A vector of the same length as the number of probes in
#'   ‘expr’, giving module color for all probes (genes). Color ‘'grey'’ is
#'   reserved for unassigned genes.     Expression
#' @param n_pc    Number of principal components and variance explained
#'   entries to be calculated. The number of returned variance explained entries
#'   is currently ‘min(n_pc,10)’. If given ‘n_pc’ is greater than 10, a warning is
#'   issued.
#' @param align    Controls whether eigengenes, whose orientation is
#'   undetermined, should be aligned with average expression (‘align = 'along
#'   average'’, the default) or left as they are (‘align = ''’). Any other value
#'   will trigger an error.
#' @param exclude_grey    Should the improper module consisting of 'grey' genes be
#'   excluded from the eigengenes?
#' @param grey    Value of ‘colors’ designating the improper module. Note
#'   that if ‘colors’ is a factor of numbers, the default value will be
#'   incorrect.
#' @param subHubs    Controls whether hub genes should be substituted for
#'   missing eigengenes. If ‘TRUE’, each missing eigengene (i.e., eigengene
#'   whose calculation failed and the error was trapped) will be replaced by a
#'   weighted average of the most connected hub genes in the corresponding
#'   module. If this calculation fails, or if ‘subHubs==FALSE’, the value of
#'   ‘trapErrors’ will determine whether the offending module will be removed or
#'   whether the function will issue an error and stop.
#' @param robust	Should PCA be calculated on ranked data (Spearman PCA)?
#'   Rotations will not correspond to original data if this is applied.
#' @param trapErrors    Controls handling of errors from that may arise when
#'   there are too many ‘NA’ entries in expression data. If ‘TRUE’, errors from
#'   calling these functions will be trapped without abnormal exit.  If ‘FALSE’,
#'   errors will cause the function to stop. Note, however, that ‘subHubs’ takes
#'   precedence in the sense that if ‘subHubs==TRUE’ and ‘trapErrors==FALSE’, an
#'   error will be issued only if both the principal component and the hubgene
#'   calculations have failed.
#' @param return_valid_only    logical; controls whether the returned data frame of
#'   module eigengenes contains columns corresponding only to modules whose
#'   eigengenes or hub genes could be calculated correctly (‘TRUE’), or whether
#'   the data frame should have columns for each of the input color labels
#'   (‘FALSE’).
#' @param soft_power    The power used in soft-thresholding the adjacency
#'   matrix. Only used when the hubgene approximation is necessary because the
#'   principal component calculation failed. It must be non-negative. The
#'   default value should only be changed if there is a clear indication that it
#'   leads to incorrect results.
#' @param scale    logical; can be used to turn off scaling of the
#'   expression data before calculating the singular value decomposition. The
#'   scaling should only be turned off if the data has been scaled previously,
#'   in which case the function can run a bit faster. Note however that the
#'   function first imputes, then scales the expression data in each module. If
#'   the expression contain missing data, scaling outside of the function and
#'   letting the function impute missing data may lead to slightly different
#'   results than if the data is scaled within the function.
#' @param verbose    Controls verbosity of printed progress messages. 0 means
#'   silent, up to (about) 5 the verbosity gradually increases.
#' @param indent    A single non-negative integer controlling indentation of
#'   printed messages. 0 means no indentation, each unit above that adds two
#'   spaces.
#' @param nb_min_varExpl    Minimum proportion of variance explained for
#'   returned module eigengenes. Is capped at n_pc.
#'
#' @return eigengenes    Module eigengenes in a dataframe, with each column
#'   corresponding to one eigengene. The columns are named by the corresponding
#'   color with an ‘'ME'’ prepended, e.g., ‘MEturquoise’ etc. If
#'   ‘return_valid_only==FALSE’, module eigengenes whose calculation failed have
#'   all components set to ‘NA’.
#' @return averageExpr  If ‘align == 'along average'’, a dataframe containing
#'   average normalized expression in each module. The columns are named by the
#'   corresponding color with an ‘'AE'’ prepended, e.g., ‘AEturquoise’ etc.
#' @return var_explained A dataframe in which each column corresponds to a
#'   module, with the component ‘var_explained[PC, module]’ giving the variance
#'   of module ‘module’ explained by the principal component no. ‘PC’. The
#'   calculation is exact irrespective of the number of computed principal
#'   components. At most 10 variance explained values are recorded in this
#'   dataframe.
#' @return n_pc          A copy of the input ‘n_pc’.
#' @return validMEs     A boolean vector. Each component (corresponding to the
#'   columns in ‘data’) is ‘TRUE’ if the corresponding eigengene is valid, and
#'   ‘FALSE’ if it is invalid. Valid eigengenes include both principal
#'   components and their hubgene approximations. When ‘return_valid_only==FALSE’,
#'   by definition all returned eigengenes are valid and the entries of
#'   ‘validMEs’ are all ‘TRUE’.
#' @return validColors  A copy of the input colors with entries corresponding to
#'   invalid modules set to ‘grey’ if given, otherwise 0 if ‘colors’ is numeric
#'   and 'grey' otherwise.
#' @return allOK        Boolean flag signalling whether all eigengenes have been
#'   calculated correctly, either as principal components or as the hubgene
#'   average approximation.
#' @return allPC        Boolean flag signalling whether all returned eigengenes
#'   are principal components.
#' @return isPC         Boolean vector. Each component (corresponding to the
#'   columns in ‘eigengenes’) is ‘TRUE’ if the corresponding eigengene is the
#'   first principal component and ‘FALSE’ if it is the hubgene approximation or
#'   is invalid.
#' @return isHub        Boolean vector. Each component (corresponding to the
#'   columns in ‘eigengenes’) is ‘TRUE’ if the corresponding eigengene is the
#'   hubgene approximation and ‘FALSE’ if it is the first principal component or
#'   is invalid.
#' @return validAEs     Boolean vector. Each component (corresponding to the
#'   columns in ‘eigengenes’) is ‘TRUE’ if the corresponding module average
#'   expression is valid.
#' @return allAEOK      Boolean flag signalling whether all returned module
#'   average expressions contain valid data. Note that ‘return_valid_only==TRUE’
#'   does not imply ‘allAEOK==TRUE’: some invalid average expressions may be
#'   returned if their corresponding eigengenes have been calculated correctly.
#' @export
nb_moduleEigengenes <-
    function(expr,
             colors,
             n_pc = 1,
             align = "along average",
             exclude_grey = FALSE,
             grey = if (is.numeric(colors))
                 0
             else
                 "grey",
             subHubs = TRUE,
             robust = FALSE,
             trapErrors = FALSE,
             return_valid_only = trapErrors,
             soft_power = 6,
             scale = TRUE,
             verbose = 0,
             indent = 0,
             nb_min_varExpl = 0.5) {
        spaces <- indentSpaces(indent)
        
        if (verbose == 1)
            message(
                paste(
                    spaces,
                    "moduleEigengenes: Calculating",
                    nlevels(as.factor(colors)),
                    "module eigengenes in given set."
                )
            )
        
        if (is.null(expr)) {
            stop("moduleEigengenes: Error: expr is NULL. ")
        }
        
        if (is.null(colors)) {
            stop("moduleEigengenes: Error: colors is NULL. ")
        }
        
        if (is.null(dim(expr)) || length(dim(expr)) != 2)
            stop("moduleEigengenes: Error: expr must be two-dimensional.")
        
        if (dim(expr)[2] != length(colors))
            stop(
                "moduleEigengenes: Error: ncol(expr) and length(colors) must be equal (one color per gene)."
            )
        
        if (is.factor(colors)) {
            nl <- nlevels(colors)
            nlDrop <- nlevels(colors[, drop = TRUE])
            if (nl > nlDrop)
                stop(
                        "Argument 'colors' contains unused levels (empty modules). ",
                        "Use colors[, drop=TRUE] to get rid of them."
                )
        }
        
        if (soft_power < 0)
            stop("soft_power must be non-negative")
        
        alignRecognizedValues <- c("", "along average")
        
        if (!is.element(align, alignRecognizedValues)) {
            message(
                paste(
                    "ModulePrincipalComponents: Error:",
                    "parameter align has an unrecognised value:",
                    align,
                    "; Recognized values are ",
                    alignRecognizedValues
                )
            )
            stop()
        }
        
        maxVarExplained <- 10
        
        if (n_pc > maxVarExplained)
            warning(paste("Given n_pc is too large. Will use value", maxVarExplained))
        
        nVarExplained <- min(n_pc, maxVarExplained)
        modlevels <- levels(factor(colors))
        
        if (exclude_grey)
            if (sum(as.character(modlevels) != as.character(grey)) > 0) {
                modlevels <-
                    modlevels[as.character(modlevels) != as.character(grey)]
            } else {
                stop(
                        "Color levels are empty. Possible reason: the only color is grey",
                        " and grey module is excluded from the calculation."
                )
            }
        
        PrinComps <- data.frame(matrix(NA, nrow = dim(expr)[[1]],
                                       ncol = length(modlevels)))
        nb_PrinComps <- data.frame(matrix(NA, nrow = dim(expr)[[1]],
                                          ncol = 0))
        rotation <- vector("list", length(modlevels))
        averExpr <-
            data.frame(matrix(NA, nrow = dim(expr)[[1]],
                              ncol = length(modlevels)))
        varExpl <-
            data.frame(matrix(NA, nrow = nVarExplained,
                              ncol = length(modlevels)))
        validMEs <- rep(TRUE, length(modlevels))
        validAEs <- rep(FALSE, length(modlevels))
        isPC <- rep(TRUE, length(modlevels))
        isHub <- rep(FALSE, length(modlevels))
        validColors <- colors
        names(PrinComps) <-
            paste(moduleColor.getMEprefix(), modlevels, sep = "")
        names(averExpr) <- paste("AE", modlevels, sep = "")
        for (i in seq_along(modlevels)) {
            if (verbose > 1)
                message(paste(
                    spaces,
                    "moduleEigengenes : Working on ME for module",
                    modlevels[i]
                ))
            modulename <- modlevels[i]
            restrict1 <-
                as.character(colors) == as.character(modulename)
            if (verbose > 2)
                message(paste(spaces, " ...", sum(restrict1), "features"))
            datModule <- as.matrix(t(expr[, restrict1]))
            n <- dim(datModule)[2]
            p <- dim(datModule)[1]
            pc <- try({
                if (scale){
                	if (verbose > 5)
                    	message(paste(spaces, " ...scaling"))
                    datModule <- t(scale(t(datModule)))
                }
                if(robust){
                	if (verbose > 5)
                    	message(paste(spaces, " ...ranking"))
                    datModule <- apply(X=datModule,MARGIN=1,FUN=rank)
                    datModule <- t(scale(datModule))
                }
                if (verbose > 5)
                    message(paste(spaces, " ...calculating SVD"))
                svd1 <-
                    svd(datModule,
                        nu = min(n, p, n_pc),
                        nv = min(n, p, n_pc))
                nb_PCA <- stats::prcomp(
                        x = t(datModule),
                        retx = TRUE,
                        center = FALSE,
                        scale. = FALSE,
                        tol = NULL,
                        rank. = NULL)
                nb_PCA[["x"]] <- t(t(nb_PCA[["x"]]) / svd1[["d"]])
                nb_PCA[["rotation"]] <- t(t(nb_PCA[["rotation"]]) / svd1[["d"]])
                if (verbose > 5)
                    message(paste(spaces, " ...calculating PVE"))
                veMat <-
                    WGCNA::cor(svd1[["v"]][, seq(from = 1,
                                            to = min(n, p, nVarExplained),
                                            by = 1)], t(datModule), use = "p")
                varExpl[seq(
                    from = 1,
                    to = min(n, p, nVarExplained),
                    by = 1
                ), i] <- rowMeans(veMat ^ 2,
                                  na.rm = TRUE)
                svd1[["v"]][, 1]
            }, silent = TRUE)
            
            if (methods::is(pc, "try-error")) {
                if (!trapErrors)
                    stop(pc)
                if (verbose > 0) {
                    message(
                        paste(
                            spaces,
                            " ..ME calculation of module",
                            modulename,
                            "failed with the following error:"
                        )
                    )
                    message(
                        paste(
                            spaces,
                            "     ",
                            pc,
                            spaces,
                            " ..the offending module has been removed."
                        )
                    )
                }
                warning(
                    paste(
                        "Eigengene calculation of module",
                        modulename,
                        "failed with the following error \n     ",
                        pc,
                        "The offending module has been removed.\n"
                    )
                )
                validMEs[i] <- FALSE
                isPC[i] <- FALSE
                isHub[i] <- FALSE
                validColors[restrict1] <- grey
            } else {
                PrinComps[, i] <- pc
                nb_n_pcs <-
                    min(c(which(
                        cumsum(varExpl[seq(
                            from = 1,
                            to = min(n, p,
                                     nVarExplained),
                            by = 1
                        ), i]) > nb_min_varExpl
                    ), nVarExplained))
                nb_PrinComps <-
                    base::cbind(nb_PrinComps, nb_PCA[["x"]][, seq(from = 1,
                                                       to = nb_n_pcs,
                                                       by = 1)])
                colnames(nb_PrinComps)[seq(
                    from = (ncol(nb_PrinComps) - nb_n_pcs + 1),
                    to = ncol(nb_PrinComps),
                    by = 1
                )] <- paste0(
                    moduleColor.getMEprefix(),
                    modlevels[i],
                    "_pc",
                    seq(
                        from = 1,
                        to = nb_n_pcs,
                        by = 1
                    )
                )
                colnames(nb_PCA[["rotation"]])[seq(from = 1,
                                              to = nb_n_pcs,
                                              by = 1)] <- paste0(
                                                  moduleColor.getMEprefix(),
                                                  modlevels[i],
                                                  "_pc",
                                                  seq(
                                                      from = 1,
                                                      to = nb_n_pcs,
                                                      by = 1
                                                  )
                                              )
                rotation[[i]] <-
                    nb_PCA[["rotation"]][, seq(from = 1,
                                          to = nb_n_pcs,
                                          by = 1),
                                    drop = FALSE]
                ae <- try({
                    if (isPC[i])
                        scaledExpr <- scale(t(datModule))
                    averExpr[, i] <-
                        rowMeans(scaledExpr, na.rm = TRUE)
                    if (align == "along average") {
                        if (verbose > 4)
                            message(
                                paste(
                                    spaces,
                                    " .. aligning module eigengene with average expression."
                                )
                            )
                        corAve <-
                            WGCNA::cor(averExpr[, i], PrinComps[, i], use = "p")
                        if (!is.finite(corAve))
                            corAve <- 0
                        if (corAve < 0)
                            PrinComps[, i] <- -PrinComps[, i]
                    }
                    0
                }, silent = TRUE)
                if (methods::is(ae, "try-error")) {
                    if (!trapErrors)
                        stop(ae)
                    if (verbose > 0) {
                        message(
                            paste(
                                spaces,
                                " ..Average expression calculation of module",
                                modulename,
                                "failed with the following error:"
                            )
                        )
                        message(
                            paste(
                                spaces,
                                "     ",
                                ae,
                                spaces,
                                " ..the returned average expression ",
                                "vector will be invalid."
                            )
                        )
                    }
                    warning(
                        paste(
                            "Average expression calculation of module",
                            modulename,
                            "failed with the following error \n     ",
                            ae,
                            "The returned average expression vector will",
                            " be invalid.\n"
                        )
                    )
                }
                validAEs[i] <- !methods::is(ae, "try-error")
            }
        }
        
        allOK <- (sum(!validMEs) == 0)
        if (return_valid_only && sum(!validMEs) > 0) {
            PrinComps <- PrinComps[, validMEs]
            averExpr <- averExpr[, validMEs]
            varExpl <- varExpl[, validMEs]
            validMEs <- rep(TRUE, times = ncol(PrinComps))
            isPC <- isPC[validMEs]
            isHub <- isHub[validMEs]
            validAEs <- validAEs[validMEs]
        }
        
        allPC <- (sum(!isPC) == 0)
        allAEOK <- (sum(!validAEs) == 0)
        list(
            eigengenes = PrinComps,
            averageExpr = averExpr,
            var_explained = varExpl,
            n_pc = n_pc,
            validMEs = validMEs,
            validColors = validColors,
            allOK = allOK,
            allPC = allPC,
            isPC = isPC,
            isHub = isHub,
            validAEs = validAEs,
            allAEOK = allAEOK,
            nb_eigengenes = nb_PrinComps,
            rotation = rotation
        )
    }

## #' Example to get access to the MCUPGMA executables.
## #' 
## #' @examples
## #' mcupgma_example()
## #' @export
## mcupgma_example <- function() {
##     exec <- netboostMCUPGMAPath()
##     files <- Sys.glob(file.path(exec, "*"))
##     paste("Available MCUPGMA executables and scripts under:", exec)
##     print(sapply(files, basename, USE.NAMES = FALSE))
## }

## #' Test/example code. Applies netboost to the TCGA-AML CHR18 DNA methylation and
## #' gene expression data supplied with the package.
## #'
## #' @param cores Integer. CPU cores to use.
## #' @param keep Logical. Keep mcupgma intermediate files.
## #' @return Netboost result
## #'
## #' @examples
## #' nb_example()
## #'
## #' @export
# nb_example <-
#     function(cores = getOption("mc.cores", 2L),
#              keep = FALSE) {
#         # Keep data local.
#         exa_env <- new.env()
#         
#         # load data methylation and RNA data 180 patients x 5283 features
#         data("tcga_aml_meth_rna_chr18",
#              package = "netboost",
#              envir = exa_env)
#         
#         pdfFile <- file.path(tempdir(), "results_netboost.pdf")
#         
#         # pdf(file=file.path(getwd(), 'results_netboost.pdf'), width = 30)
#         pdf(file = pdfFile, width = 30)
#         results <-
#             netboost(
#                 datan = exa_env[["tcga_aml_meth_rna_chr18"]],
#                 stepno = 20L,
#                 soft_power = 3L,
#                 min_cluster_size = 10L,
#                 n_pc = 2,
#                 scale = TRUE,
#                 ME_diss_thres = 0.25
#             )
#         # set.seed(1234)
#         nb_plot_dendro(nb_summary = results,
#                        labels = TRUE,
#                        colorsrandom = TRUE)
#         dev.off()
#         
#         if (file.exists(pdfFile)) {
#             message(paste0("PDF created:", pdfFile))
#             
#             # If default PDF viewer is assigned, try to show PDF.
#             if (!is.null(getOption("pdfviewer"))) {
#       #          system2(getOption("pdfviewer"), pdfFile)
#             }
#         }
#         
#         ### Transfer results to the same data (bug check)
#         ME_transfer <-
#             nb_transfer(
#                 nb_summary = results,
#                 new_data = exa_env[["tcga_aml_meth_rna_chr18"]],
#                 scale = TRUE
#             )
#         
#         all(round(results[["MEs"]], 12) == round(ME_transfer, 12))
#         
#         # Cleanup all produced temporary filed (esp. clustering/iteration_*)
#         if (!keep)
#             netboostTmpCleanup()
#         else
#             message(paste("Kept MCUPGMA temporary files in:", netboostTmpPath()))
#         
#         invisible(results)
#     }

Try the netboost package in your browser

Any scripts or data that you put into this service are public.

netboost documentation built on Nov. 8, 2020, 4:58 p.m.