R/SAFE.R

Defines functions SAFE

Documented in SAFE

# !/usr/bin/env Rscript
# 11/10/2017
#
#' @title Single-cell Aggregated clustering From Ensemble (SAFE)
#'
#' @description SAFE (Single-cell Aggregated clustering From Ensemble): Cluster ensemble for single-cell RNA-seq data
#'
#' @param cluster_results a J*N matrix with J individual clustering methods and N cells
#' @param program.dir defines the directory of two programs shmetis and gpmetis, which are required by HGPA algorithm and MCLA and CSPA algorithms, respectively.
#' Default is the current directory (".").
#' @param k_min defines the minimum number of clusters used for ensembel clustering. Default is 2.
#' @param k_max defines the maximum number of clusters used for ensembel clustering. Default is
#' the maximum cluster number estimated by all the single solutions.
#' @param MCLA a boolean parameter that defines whether to use MCLA algorithm for ensemble clustering.
#' Default is "TRUE".
#' @param HGPA a boolean parameter that defines whether to use HGPA algorithm for ensemble clustering.
#' Default is "FALSE".
#' @param CSPA a boolean parameter that defines whether to use CSPA algorithm for ensemble clustering.
#' Default is "FALSE".
#' @param cspc_cell_max defines the maximum number of cells above which CSPA is not run, when \code{CSPA = TRUE}.
#' @param SEED sets the seed of the random number generator. Setting the seed to a fixed value can produce
#' reproducible clustering results for MCLA and CSPA algorithms.
#'
#' @return SAFE returns a list object containing:
#' @return \itemize{
#'     \item optimal_clustering: optimal ensemble clustering result determined by ANMI
#'     \item optimal_k: optimal cluster number
#'     \item HGPA: optimal cluster result using HGPA algorithm
#'     \item HGPA_optimal_k: optimal cluster number estimated by HGPA algorithm
#'     \item HGPA_ANMI: Average normalized mutual information (ANMI) for the optimal cluster result using HGPA algorithm
#'     \item MCLA: optimal cluster result using MCLA algorithm
#'     \item MCLA_optimal_k: optimal cluster number estimated by MCLA algorithm
#'     \item MCLA_ANMI: ANMI for the optimal cluster result using MCLA algorithm
#'     \item CSPA: optimal cluster result using CSPA algorithm
#'     \item CSPA_optimal_k: optimal cluster number estimated by CSPA algorithm
#'     \item CSPA_ANMI: ANMI for the optimal cluster result using CSPA algorithm
#'     \item Summary: a summary of the ensemble clustering result
#' }
#' @author Yuchen Yang <[email protected]>, Ruth Huh <[email protected]>, Yun Li <[email protected]>
#' @references Yuchen Yang, Ruth Huh, Houston Culpepper, Yuan Lin, Michael Love, Yun Li. SAFE (Single-cell Aggregated clustering From Ensemble): Cluster ensemble for single-cell RNA-seq data. 2017
#' @examples
#' # Load the example data data_SAFE
#' data("data_SAFE")
#' working.dir = "~/Documents/single_cell_clustering"
#'
#' # Zheng dataset
#' # Run individual_clustering
#' cluster.result <- individual_clustering(inputTags=data_SAFE$Zheng.expr, SEED=123)
#'
#' # Cluster ensemble using SAFE clustering:
#' cluster.ensemble <- SAFE(cluster_results=cluster.result, program.dir = working.dir, SEED=123)
#'
#' # Biase dataset
#' # Run individual_clustering
#' cluster.result <- individual_clustering(inputTags = data_SAFE$Biase.expr, datatype = "FPKM", seurat_min_cell = 200, resolution_min = 1.2, tsne_min_cells = 200, tsne_min_perplexity = 10, SEED=123)
#'
#' # Cluster ensemble using SAFE clustering:
#' cluster.ensemble <- SAFE(cluster_results=cluster.result, program.dir = working.dir, SEED=123)
#' @import bit64
#' @import stringr
#' @import Matrix
#' @importFrom methods as
#' @importFrom utils write.table
#' @importFrom utils read.table
#' @importFrom graphics hist
#' @importFrom stats runif
#' @export
SAFE <- function(cluster_results, program.dir = ".", k_min = 2, k_max = NULL, MCLA = TRUE, HGPA = FALSE, CSPA = FALSE, cspc_cell_max = NULL, SEED = 1){
    ### Example for Input data
    ### cluster_results = matrix(data = c(1,1,1,2,2,3,3,2,2,2,3,3,1,1,1,1,2,2,3,3,3),nrow = 3, byrow = T)

    set.seed(SEED)

    ### Transforming missing data into NA
    for (i in 1:nrow(cluster_results)){
        cluster_results[i,which(is.na(cluster_results[i,]))] <- 0
    }

    ###remapping cluster labels where needed
    for (r in 1:nrow(cluster_results)){
        labels <- cluster_results[r,]
        if (max(labels) != length(levels(as.factor(labels)))){
            labels_new <- cbind(labels, t(t(c(1:length(labels)))))
            labels_inorder <- labels_new[order(labels_new[,1]),]
            last <- NULL
            curind <- 0
            for (i in 1:nrow(labels_inorder)){
                if (last != labels_inorder[i,1] || is.null(last)){
                    last <- labels_inorder[i,1]
                    curind <- curind + 1
                }
                labels_inorder[i,1] <- curind
            }
            labels_inorder <- labels_inorder[order(labels_inorder[,2]),]
            labels_new <- t(labels_inorder[,1])
            cluster_results[r,] <- labels_new
        }
    }

    if(is.null(k_max)){
        k_max <- max(cluster_results)
    }


    ### Function hypergraph_generation performs transformation from clustering results to hypergraph
    # object contains clustering results generated by all the single-cell clustering methods. nrow equals to No. of methods, and ncol is No. of cells
    hypergraph_generation <- function(object){
        hypergraph <- matrix(data = 0, nrow = length(object), ncol = max(object))
        for (i in 1:length(hypergraph[1,])){
            hypergraph[,i] <- as.numeric(object == i)
        }
        return(hypergraph)
    }


    ### Define function ints
    ints <- function(s, tafter = 1000000000){
        tbefore <- sum(s)
        while (tafter/tbefore <= 0.5){
            tafter = tafter * 10
        }
        if (tbefore != 0){
            s = round(s * (tafter/tbefore))
        }
        return(s)
    }


    ### Function wgraph
    # object is the hypergraph generated before
    wgraph <- function(object, w = NULL, method = NULL, dataname = NULL) {
        options(scipen = 100)
        if (is.null(method)){
            method <- 0
        }
        if (is.null(dataname)){
            dataname <- "graph"
        }
        dataname <- paste(dataname, method, sep = "")

        if (method == 0 || method == 1){
            object <- object - diag(diag(object))
        }

        object <- ints(object)

        if (method == 1){
            w <- ints(w)
        }

        while (as.numeric(file.exists(dataname))){
            dataname <- paste(dataname, method, sep = "")
        }

        print(paste("wgraph: writing ", dataname, sep = ""))

        file_name = paste(dataname, sep = "")
        if (method == 0){
            object_sum <- sum(as.numeric(object > 0))/2
            if (object_sum < ceiling(object_sum)){
                x <- as.numeric(c(nrow(object),format(object_sum, scientific = TRUE), 1))
            } else{
                x <- c(nrow(object), object_sum, 1)
            }
            x <- as.integer64(x)
            write.table(x, file = file_name, row.names = FALSE, col.names = FALSE, quote = FALSE, eol = " ")
            write.table("\n", file = file_name, row.names = FALSE, col.names = FALSE, quote = FALSE, eol = "", append = TRUE)
        } else if (method == 1){
            object_sum <- sum(as.numeric(object > 0))/2
            if (object_sum < ceiling(object_sum)){
                x <- as.numeric(c(nrow(object),format(object_sum, scientific = TRUE), 11))
            } else{
                x <- c(nrow(object), object_sum, 11)
            }
            x <- as.integer64(x)
            write.table(x, file = file_name, row.names = FALSE, col.names = FALSE, quote = FALSE, eol = " ")
            write.table("\n", file = file_name, row.names = FALSE, col.names = FALSE, quote = FALSE, eol = "", append = TRUE)
        } else {
            validcolumns <- which(colSums(object) > 0)
            if (length(validcolumns) != ncol(object)){
                print ("wgraph: removing empty feature columns")
                object <- object[, validcolumns]
            }
            x <- c(ncol(object), nrow(object), 1)
            x <- as.integer64(x)
            write.table(x, file = file_name, row.names = FALSE, col.names = FALSE, quote = FALSE, eol = " ")
            write.table("\n", file = file_name, row.names = FALSE, col.names = FALSE, quote = FALSE, eol = "", append = TRUE)
        }

        if (method == 0 || method == 1){
            for (i in 1:nrow(object)){
                if (sum(object[i,]) == 0){
                    write.table("\n", file = file_name, row.names = FALSE, col.names = FALSE, quote = FALSE, eol = "", append = TRUE)
                } else{
                    edges <- which(object[i,] > 0)
                    weights <- object[i,edges]
                    if (method == 0){
                        interlaced <- vector(mode = "numeric", length = 2 * length(edges))
                        interlaced[seq(1, 2 * length(edges) -1, 2)] <- edges
                        interlaced[seq(2, 2 * length(edges), 2)] <- weights
                    } else {
                        interlaced <- vector(mode = "numeric", length = 2 * length(edges) + 1)
                        interlaced[1] <- w[i]
                        interlaced[seq(2, 2 * length(edges), 2)] <- edges
                        interlaced[seq(3, 2 * length(edges) + 1, 2)] <- weights
                    }
                    interlaced <- as.integer64(interlaced)
                    write.table(interlaced, file = file_name, row.names = FALSE, col.names = FALSE, quote = FALSE, eol = " ", append = TRUE)
                    write.table("\n", file = file_name, row.names = FALSE, col.names = FALSE, quote = FALSE, eol = "", append = TRUE)
                }
            }
        } else {
            print (paste('wgraph: ', nrow(object), ' vertices and ', ncol(object), ' non-zero hyperedges', sep = ""))
            for (i in 1:ncol(object)){
                if (sum(object[,i]) == 0){
                    write.table("\n", file = file_name, row.names = FALSE, col.names = FALSE, quote = FALSE, eol = "", append = TRUE)
                } else {
                    edges <- which(object[,i] > 0)
                    if (method == 2){
                        weight <- sum(object[,i])
                    } else {
                        weight <- w[i]
                    }
                    interlaced <- c(weight, edges)
                    interlaced <- as.integer64(interlaced)
                    write.table(interlaced, file = file_name, row.names = FALSE, col.names = FALSE, quote = FALSE, eol = " ", append = TRUE)
                    write.table("\n", file = file_name, row.names = FALSE, col.names = FALSE, quote = FALSE, eol = "", append = TRUE)
                }
            }
        }
        return(dataname)
    }


    ### Function sgraph performs cluster ensembles using pmeits or shmetis.
    ### Be sure that both gpmetis and shmetis are located at the work directory and are executable.
    # k is the No. of clusters.
    # dataname is filename returned by function wgraph.
    # Moreover, this function requires to install package "stringr"
    sgraph <- function(k, dataname){
        if (is.null(dataname)){
            dataname <- "graph0"
        }
        result_name <- paste(dataname, '.part.', k, sep = '')

        last_char <- as.numeric(substr(dataname, str_length(dataname), str_length(dataname)))

        if (is.null(last_char)){
            print ("sgraph: file dose not comply to name convention")
            last_char <- 0
        }

        command <- NULL
        if (last_char < 2){
            command <- paste(program.dir, '/gpmetis -seed=', SEED, ' ', dataname, ' ', k, sep = '')
            employed_method <- "gpmetis"
            print (paste('cluster ensembles: ', command, sep = ''))
        } else {
            ubfactor <- 5
            command <- paste(program.dir, '/shmetis ', dataname, ' ', k, ' ', ubfactor, sep = '')
            employed_method <- "shmetis"
            print (paste('cluster ensembles: ', command, sep = ''))
        }

        system(command)

        fid <- read.table(result_name)
        if (is.null(fid)){
            print ('sgraph: partitioning not successful due to external error')
            fid <- read.table(dataname)
            if (is.null(fid)){
                print ('sgraph: graph file is not accessible')
            } else{
                if (last_char >= 2){
                    junk <- read.table("result_name", header = FALSE)
                }
                labels <- matrix(1, ncol = length(fid[,1]))
                if (is.null(labels)){
                    print ('sgraph: empty label vetor - suspecting file system full')
                }
            }
        } else{
            print (paste('sgraph: ', employed_method, ' completely loads ', result_name, sep = ''))
            labels <- t(as.matrix(fid + 1))
        }
        return(labels)
    }


    ### Function evolmutual is for transforming mutual information between the lables of ensemble cluster and clustering.
    # labels are the ensemble cluster labels generated by function sgraph.
    # object is the cluster labels of individual single-cell clustering method.
    evalmutual <- function(labels, object){
        remappedecl <- matrix(0, ncol = length(object))
        A = matrix(0, nrow = max(object), ncol = 2 + max(labels))
        for (i in 1:max(object)){
            activepoints <- which(object == i)
            composition <- hist(labels[activepoints], breaks = c(0:max(labels)), plot = FALSE)
            j <- which(composition$counts == max(composition$counts))
            j <- j[1]
            A[i,] <- c(j, i, composition$counts)
        }
        A <- A[order(A[,1]),]
        A <- A[,c(-1, -2)]

        pdf <- A/sum(A)
        pdf_trans <- pdf
        pdf_trans[which(pdf_trans == 0)] = 1
        px <- rowSums(pdf)
        px_trans <- px
        px_trans[which(px_trans == 0)] = 1
        py <- colSums(pdf)
        py_trans <- py
        py_trans[which(py_trans == 0)] = 1

        if (length(px) > 1){
            hxbk <- -sum(px * log2(px_trans))
        } else {
            hxbk <- 0
        }
        if (length(py) > 1){
            hybg <- -sum(py * log2(py_trans))
        } else {
            hybg <- 0
        }

        if (length(px) * length(py) == 1 || hxbk ==0 || hybg == 0){
            p <- 0
        } else {
            p <- sum(pdf * log2(pdf_trans/(px%*%t(py))))/sqrt(hxbk * hybg)
        }
        return(p)
    }


    ### Function check can validate and fix problems in cluster labels
    # labels are the ensemble cluster lables generated by function sgraph.
    check <- function(labels){
        labels_new <- cbind(t(labels), t(t(c(1:length(labels)))))
        labels_inorder <- labels_new[order(labels_new[,1]),]

        last <- NULL
        curind <- 0
        for (i in 1:nrow(labels_inorder)){
            if (last != labels_inorder[i,1] || is.null(last)){
                last <- labels_inorder[i,1]
                curind <- curind + 1
            }
            labels_inorder[i,1] <- curind
        }
        labels_inorder <- labels_inorder[order(labels_inorder[,2]),]
        labels_new <- t(labels_inorder[,1])

        if (max(labels) != max(labels_new)){
            print ('Result checking: not a dense integer mapping')
            labels <- NULL
            labels <- labels_new
            print (paste('Remapped to 1 to ', max(labels), ' clusters', sep = ''))
        }
        return(labels)
    }


    ### Function ANMI_calculation can calculate ANMI for each cluster ensemble result.
    # object is the clustering labels of individual single-cell clustering methods
    # labels are the enseble cluster labels generated by function sgraph.
    ANMI_calculation <- function(object, labels){
        m <- 0
        total_inds <- 0
        q <- numeric()
        object <- as.matrix(object)

        for (i in 1:nrow(object)){
            inds <- which(is.finite(object[i,]))
            q[i] <- evalmutual(labels = check(labels = t(as.matrix(labels[inds]))), object = check(labels = t(as.matrix(object[i,inds]))))
            m <- q[i] * length(inds) + m
            total_inds <- total_inds + length(inds)
        }

        m <- m/total_inds
        return(m)
    }


    ### Function hgpa performs cluster ensemble by HyperGraph-Paritioning Algorithm (HGPA)
    # object is the clustering results generated by all single-cell clustering methods
    # k is the maximum No. of clusters
    hgpa <- function(object, k = k){
        timestart<-Sys.time()
        hypergraph <- NULL
        print('Cluster ensembles using HGPA')

        # Generating hypergraph
        for (i in 1:nrow(object)) {
            hypergraph_each <- hypergraph_generation(object[i,])
            hypergraph = cbind(hypergraph, hypergraph_each)
        }

        filename <- wgraph(object = hypergraph, method = 2)
        clusters <- sgraph(k = k, dataname = filename)
        row.names(clusters) <- NULL

        system('rm graph2')
        rm_file <- paste('rm graph2.part.', k, sep = '')
        system(rm_file)

        timeend<-Sys.time()
        runningtime<-timeend-timestart
        print(timestart)
        print(timeend)
        hgpa_time = paste('Runing time of HGPA is ', runningtime, sep = "")
        print(hgpa_time)

        return(clusters)
    }


    ### Function mcla performs cluster ensemble by Meta-CLustering Algorithm (MCLA)
    # object is the clustering results generated by all single-cell clustering methods
    # k is the maximum No. of clusters
    mcla <- function(object, k = k){
        timestart<-Sys.time()
        hypergraph <- NULL
        print('Cluster ensembles using MCLA')

        for (i in 1:nrow(object)) {
            hypergraph_each <- hypergraph_generation(object[i,])
            hypergraph = cbind(hypergraph, hypergraph_each)
        }

        ### Function simxjac computes extended Jaccard similarity between row objects in matrices a and b
        # matrixA is the hypergraph generated by function hypergraph_generation
        simxjac <- function(matrixA){
            A_matrix_square <- matrixA %*% t(matrixA)
            A_square <- as.matrix(rowSums(matrixA ^2))
            sim_jac <- A_matrix_square/((A_square %*% matrix(1,ncol = nrow(matrixA))) + matrix(1, nrow = nrow(matrixA)) %*% t(A_square) - A_matrix_square)
            return(sim_jac)
        }

        sim_jac <- simxjac(matrixA = t(hypergraph))

        ### Function cmetis provides cluster labels 1 to k from edge weighted value balanced graph partitioning
        # object is the similarity matrix sim_jac generated by function simjac
        # k is the maximum No. of clusters
        cmetis <- function(sim_matrix, w, k){
            filename <- wgraph(object = sim_matrix, w = w, method = 1)
            labels <- sgraph(k = k, dataname = filename)
            return(labels)
        }

        labels <- cmetis(sim_matrix = sim_jac, w = rowSums(t(hypergraph)), k = k)

        hypergraph_cum <- NULL
        for (i in 1:max(labels)){
            matched_clusters <- which(labels == i)
            if (length(matched_clusters) > 1){
                hypergraph_cum <- rbind(hypergraph_cum, rowSums(hypergraph[,matched_clusters])/ncol(hypergraph[,matched_clusters]))
            } else if (length(matched_clusters) == 1){
                hypergraph_cum <- rbind(hypergraph_cum, hypergraph[,matched_clusters])
            }
        }

        ### Function hypergraphTOcluster generate the final cluster ensemble results using normalized hypergraph
        hypergraphTOcluster <- function(hypergraph_cum){
            randbreakties <- 1
            allzeroclumns <-NULL
            allzeroclumns <- which(colSums(hypergraph_cum) == 0)

            if (length(allzeroclumns) != 0){
                print (paste('hypergraphTOcluster: ', length(allzeroclumns), ' objects (', round(100*length(allzeroclumns)/ncol(hypergraph)), '%) with all zero associations', sep = ''))
                hypergraph_cum[,allzeroclumns] <- matrix(stats::runif(nrow(hypergraph_cum) * length(allzeroclumns)), nrow = nrow(hypergraph_cum), ncol = length(allzeroclumns))
            }
            hypergraph_cum <- hypergraph_cum + matrix(stats::runif(nrow(hypergraph_cum) * ncol(hypergraph_cum)), nrow = nrow(hypergraph_cum), ncol = ncol(hypergraph_cum))/10000

            ### Normalization
            #library("Matrix")
            hypergraph_cum <- t(as(diag(1/rowSums(abs(t(hypergraph_cum)))), "sparseMatrix") %*% t(hypergraph_cum))

            m <- apply(hypergraph_cum, 2, max)
            clusters <- matrix(0, ncol = ncol(hypergraph_cum))
            max_postp <- matrix(0, ncol = ncol(hypergraph_cum))
            for (i in nrow(hypergraph_cum):1){
                max_position <- which(hypergraph_cum[i,] == m)
                if (length(max_position) != 0){
                    clusters[max_position] <- i * matrix(1, ncol = length(max_position))
                    max_postp[max_position] <- hypergraph_cum[i, max_position]
                }
            }
            print(paste('Delivering ', max(clusters), ' clusters', sep = ""))
            print(paste('Average posterior probablity is ', mean(max_postp), sep = ""))
            return(clusters)
        }

        clusters <- hypergraphTOcluster(hypergraph_cum = hypergraph_cum)

        system('rm graph1')
        rm_file <- paste('rm graph1.part.', k, sep = '')
        system(rm_file)

        timeend<-Sys.time()
        runningtime<-timeend-timestart
        print(timestart)
        print(timeend)
        mcla_time = paste('Runing time of MCLA is ', runningtime, sep = "")
        print(mcla_time)

        return(clusters)
    }


    ### Function mcla performs cluster ensemble by Meta-CLustering Algorithm (MCLA)
    # object is the clustering results generated by all single-cell clustering methods
    # k is the maximum No. of clusters
    cspa <- function(object, k = k){
        timestart<-Sys.time()
        hypergraph <- NULL
        print('Cluster ensembles using CSPA')

        for (i in 1:nrow(object)) {
            hypergraph_each <- hypergraph_generation(object[i,])
            hypergraph = cbind(hypergraph, hypergraph_each)
        }

        ### Function checks
        checks <- function(object){
            if (nrow(object) == ncol(object)){
                if (sum(diag(object) != 1) > 0){
                    print ('self-similarity object[i,i] is not always 1')
                    for (i in 1:nrow(object)){
                        object[i,i] <- 1;
                    }
                    print ('diagonal is set to 1')
                }
            }
            return(object)
        }

        similarity_matrix <- (hypergraph %*% t(hypergraph))/nrow(object)

        filename <- wgraph(object = checks(similarity_matrix), method = 0)
        clusters <- sgraph(k = k, dataname = filename)
        row.names(clusters) <- NULL

        system('rm graph0')
        rm_file <- paste('rm graph0.part.', k, sep = '')
        system(rm_file)

        timeend<-Sys.time()
        runningtime<-timeend-timestart
        print(timestart)
        print(timeend)
        cspa_time = paste('Runing time of CSPA is ', runningtime, sep = "")
        print(cspa_time)

        return(clusters)
    }

    cluster_ensembles <- list()

    if (HGPA == TRUE){
        cluster_ensembles$HGPA_ANMI <- 0
        hgpa_ANMI_each <- numeric()
        hgpa_k_each <- numeric()

        for (K in k_min:k_max){
            hgpa_all_result <- matrix(0, nrow = 10, ncol = ncol(cluster_results))
            hgpa_all_ANMI = numeric()
            for (i in 1:10){
                hgpa_all_result[i,] <- check(labels = hgpa(object = cluster_results, k = K))
                hgpa_all_ANMI[i] <- ANMI_calculation(object = cluster_results, labels = hgpa_all_result[i,])
            }

            hgpa_result <- hgpa_all_result[order(hgpa_all_ANMI)[6],]
            hgpa_ANMI <- hgpa_all_ANMI[order(hgpa_all_ANMI)[6]]

            hgpa_ANMI_each[K-1] <- hgpa_ANMI
            hgpa_k_each[K-1] <- max(hgpa_result)

            if (hgpa_ANMI >= cluster_ensembles$HGPA_ANMI){
                cluster_ensembles$HGPA <- c(hgpa_result)
                cluster_ensembles$HGPA_optimal_k <- max(hgpa_result)
                cluster_ensembles$HGPA_ANMI <- hgpa_ANMI
            }
        }
    }

    if (MCLA == TRUE){
        cluster_ensembles$MCLA_ANMI <- 0
        mcla_ANMI_each <- numeric()
        mcla_k_each <- numeric()

        for (K in k_min:k_max){
            mcla_result <- check(mcla(object = cluster_results, k = K))
            mcla_ANMI <- ANMI_calculation(object = cluster_results, labels = mcla_result)

            mcla_ANMI_each[K-1] <- mcla_ANMI
            mcla_k_each[K-1] <- max(mcla_result)

            if (mcla_ANMI >= cluster_ensembles$MCLA_ANMI){
                cluster_ensembles$MCLA <- c(mcla_result)
                cluster_ensembles$MCLA_optimal_k <- max(mcla_result)
                cluster_ensembles$MCLA_ANMI <- mcla_ANMI
            }
        }
    }

    if (CSPA == TRUE & ((is.null(cspc_cell_max)) || ((!is.null(cspc_cell_max)) & (ncol(cluster_results) <= cspc_cell_max)))){
        cluster_ensembles$CSPA_ANMI <- 0
        cspa_ANMI_each <- numeric()
        cspa_k_each <- numeric()

        for (K in k_min:k_max){
            cspa_result <- check(cspa(object = cluster_results, k = K))
            cspa_ANMI <- ANMI_calculation(object = cluster_results, labels = cspa_result)

            cspa_ANMI_each[K-1] <- cspa_ANMI
            cspa_k_each[K-1] <- max(cspa_result)

            if (cspa_ANMI >= cluster_ensembles$CSPA_ANMI){
                cluster_ensembles$CSPA <- c(cspa_result)
                cluster_ensembles$CSPA_optimal_k <- max(cspa_result)
                cluster_ensembles$CSPA_ANMI <- cspa_ANMI
            }
        }
    }

    for (K in k_min:k_max){
        if (HGPA == TRUE){
            print(paste('HGPA partitioning at K = ', K, ': ', hgpa_k_each[K-1], ' clusters at ANMI = ', hgpa_ANMI_each[K-1], sep = ""))
        }
    }
    for (K in k_min:k_max){
        if (MCLA == TRUE){
            print(paste('MCLA partitioning at K = ', K, ': ', mcla_k_each[K-1], ' clusters at ANMI = ', mcla_ANMI_each[K-1], sep = ""))
        }
    }
    for (K in k_min:k_max){
        if (CSPA == TRUE & ((is.null(cspc_cell_max)) || ((!is.null(cspc_cell_max)) & (ncol(cluster_results) <= cspc_cell_max)))){
            print(paste('CSPA partitioning at K = ', K, ': ', cspa_k_each[K-1], ' clusters at ANMI = ', cspa_ANMI_each[K-1], sep = ""))
        }
    }

    if ((HGPA == TRUE) + (MCLA == TRUE) + (CSPA == TRUE) > 1){
        labels_all <- logical()
        ANMI <- logical()
        if (HGPA == TRUE){
            labels_all = rbind(labels_all, cluster_ensembles$HGPA)
            ANMI = cbind(ANMI, cluster_ensembles$HGPA_ANMI)
        }
        if (MCLA == TRUE){
            labels_all = rbind(labels_all, cluster_ensembles$MCLA)
            ANMI = cbind(ANMI, cluster_ensembles$MCLA_ANMI)
        }
        if (CSPA == TRUE){
            labels_all = rbind(labels_all, cluster_ensembles$CSPA)
            ANMI = cbind(ANMI, cluster_ensembles$CSPA_ANMI)
        }

        cluster_ensembles$optimal_k <- max(labels_all[which(ANMI == max(ANMI))[1],])
        cluster_ensembles$optimal_clustering <- labels_all[which(ANMI == max(ANMI))[1],]
        cluster_ensembles$Summary = paste('Optimal number of clusters is ', cluster_ensembles$optimal_k, ' with ANMI = ', max(ANMI), sep = "")
        print(cluster_ensembles$Summary)
    } else {
        if (HGPA == TRUE){
            cluster_ensembles$optimal_clustering = cluster_ensembles$HGPA
            cluster_ensembles$optimal_k = cluster_ensembles$HGPA_optimal_k

            cluster_ensembles$Summary = paste('Optimal number of clusters is ', cluster_ensembles$optimal_k, ' with ANMI = ', cluster_ensembles$HGPA_ANMI, sep = "")
            print(cluster_ensembles$Summary)
        }
        if (MCLA == TRUE){
            cluster_ensembles$optimal_clustering = cluster_ensembles$MCLA
            cluster_ensembles$optimal_k = cluster_ensembles$MCLA_optimal_k

            cluster_ensembles$Summary = paste('Optimal number of clusters is ', cluster_ensembles$optimal_k, ' with ANMI = ', cluster_ensembles$MCLA_ANMI, sep = "")
            print(cluster_ensembles$Summary)
        }
        if (CSPA == TRUE){
            cluster_ensembles$optimal_clustering = cluster_ensembles$CSPA
            cluster_ensembles$optimal_k = cluster_ensembles$CSPA_optimal_k

            cluster_ensembles$Summary = paste('Optimal number of clusters is ', cluster_ensembles$optimal_k, ' with ANMI = ', cluster_ensembles$CSPA_ANMI, sep = "")
            print(cluster_ensembles$Summary)
        }
    }

    return(cluster_ensembles)
}
yycunc/SAFEclustering documentation built on Dec. 8, 2018, 6:57 a.m.