#' similarity-based ensemble meta-clustering
#'
#' This function is to do similarity-based ensemble clustering for combining the results from different partitions of single cells. The similarity is measured by the group-group correlations.
#'
#' @param nC a m*n matrix, where m is the number of cells, n is the number of clustering algorithms, and the (i,j)-element of nC represents the cluter for the i-th cell by the j-th clutering predictor.
#'
#' @examples
#' finalrowColor = sMetaC(fColor, E1, folds, hmethod, N.cluster, minN.cluster, maxN.cluster, sil.thre, height.Ntimes)
#'
#' @import cluster
#'
#' @import clues
#'
#' @import clusterCrit
#'
#' @export
sMetaC <- function(rerowColor, sE1, folds, hmethod, finalN.cluster, minN.cluster,
maxN.cluster, sil.thre, height.Ntimes) {
# This is to do meta-clustering the results obtained for each smaller group of
# the original large-scale datasets
R = unique(rerowColor) #unique clusters
# print(R)
# cat("Unique clusters for similarity-based meta-clustering are: \n", R, "\n")
nC = length(R) #number of unique clusters
cat("Number of meta-clusters is:", nC, "\n")
# # get the number of clusters for each partition groups
# T = unique(folds)
# t0 = vector(mode = "numeric", length = length(T))
# # ff = numeric()
# for (t in 1:length(T)) {
# tmp = which(folds == T[t])
# t0[t] = length(unique(rerowColor[tmp])) #number of clusters for each partition
# # ff = c(ff, rep(t, each = t0[t]))
# }
# pind = which.max(t0)#the partition which has the most predicted clusters rind =
# setdiff(1:T, pind) #preprocessing
###using similarity matrix instead of correlation, we do not need these pre-processing steps
# sE1 = sE1[apply(sE1, 1, function(x) sd(x) != 0), ] #avoid NA when sd is 0 during scaling
# sE1 <- t(scale(t(sE1))) #vector scaling
# mE = matrix(0, nrow = length(R), ncol = dim(sE1)[2]) for(i in 1:length(R)){ G =
# sE1[rerowColor == R[i], , drop = FALSE] mE[i, ] = colMeans(G)*dim(G)[1] } mf =
# getwfperC(frowColor, E, p)#weighted features for each partitioned cluster
cb = combn(nC, 2) #all possible combinations (n*(n-1)/2)
# t = p alls = apply(cb, 2, getpaircor, rerowColor = rerowColor, E1 =
# E1)#calculate the correlation/simialrity for all combinations (apply to the
# columns) parallel programming####
h = dim(cb)[2]#n*(n-1)/2 combinations
cat("Number of combinations:", h, "\n")
# registerDoParallel(n.cores)
#get the mean vector for each cluster
aG = foreach(t = 1:nC, .combine = "rbind") %dopar%{
G1 = sE1[which(rerowColor %in% R[t]), , drop = FALSE] #find all the cells for cluster i; 'drop = F' is to avoid a one-row/column matrix being converted to a vector
aG1 = colMeans(G1)
return(aG1)
}
# alls = numeric(h)
#get the cluster-pairwise correlation
alls = foreach(t = 1:h, .combine = "c") %dopar% {
corss = cor(aG[cb[1, t], ], aG[cb[2, t], ])
# corss = getpaircor(cb[, t], rerowColor, sE1, R)
# # if(is.null(corss)){
# # cat(cb[, t], "(", t,"-th element) is NULL!\n")
# # corss = 0
# # }
# if(t %% ceiling(h/10) == 0){
# cat("The", t, "-th combination\n")
# }
return(corss)
}
# stopImplicitCluster()
# cat("The length of alls is:", length(alls), "\n")
# alls = foreach(t=1:dim(cb)[2], .combine = 'c') %dopar%{exp(-sum((mf[cb[1, t]] -
# mf[cb[2, t]])^2)/t)}
S0 = sparseMatrix(i = cb[1, ], j = cb[2, ], x = alls, dims = c(nC, nC)) #triangle part of the S
S = S0 + t(S0) + diag(nC)
# print(S)
# w = matrix(0, nrow = length(R), ncol = 2) for(i in 1:length(R)){ tind
# =setdiff(1:length(R), which(ff == ff[i]))#rest index w[i,1] = min(S[i, tind]) }
# # S = t(t(S)/(colSums(S)-1)) # diag(S) = 1 print(S)
# S = matrix(0, ncol = nC, nrow = nC)#initialization for a similarity matrix of
# all the clusters for(i in 1:nC){ for(j in (i+1):nC){ G1 = E1[rerowColor ==
# allC[i], ]#find all the cells for cluster i G2 = E1[rerowColor == allC[j],
# ]#find all the cells for cluster j cor1 = cor(t(G1), t(G2))#correlation between
# these two clusters S[i, j] = (max.col(cor1) + max.col(t(cor1)))/(nrow(G1) +
# nrow(G2))#similarity between two clusters S[j, i] = S[i, j] } S[i, i] = 1 }
# res = kmeans(mE, max(t0)) d=as.dist(1-cor(t(mE)))
ncells = length(rerowColor)
mm = floor(ncells/10000)
if(ncells < 1e6){
baseN = min(max(mm, 2), 10)
if(minN.cluster == 2 && min(maxN.cluster, nrow(S))-baseN >= 3){#in case the maximum tried ncluster is not large enough, we do not change the minimum one
minN.cluster = baseN
}
}else{
# maxN.cluster = max(maxN.cluster, mm)
mm3 = floor(ncells/50000)
# minN.cluster = max(minN.cluster, mm2)
mm2 = floor(ncells/5000)
# maxN.cluster = max(maxN.cluster, mm)
# minN.cluster = max(minN.cluster, mm3)
maxN.cluster = max(maxN.cluster, mm2)
minN.cluster = max(minN.cluster, mm3)
}
# if (missing(minN.cluster)) {
# minN.cluster = max(baseN, min(t0))
# }
# if (missing(maxN.cluster)) {
# maxN.cluster = max(mm*2, 40)
# }
hres = get_opt_hclust(S, hmethod, N.cluster = finalN.cluster, minN.cluster, maxN.cluster,
sil.thre, height.Ntimes)
print(hres$msil)#the silhouette index for different clusters
v = hres$v#different numbers of clustering results
# cat("Dim of v is:", dim(v), "\n")
s0 = hres$msil
# cat("Dim of s0 is:", dim(s0), "\n")
n = length(s0)
# cat("n:", n, "\n")
if (n > 1 && length(unique(hres$f)) == 2 && hres$maxsil > sil.thre) {
s1 = sort(s0,partial=n-1)[n-1]#the second largest value
# cat("s1:", s1, "\n")
s2 = which(s0 == s1)#the index
# cat("Selected s2/index is", s2, "\n")
s3 = hres$v[, s2]
tf = s3
}else{
tf = hres$f
cat("Number of predicted clusters:", hres$optN.cluster, "\n")
}
# d=as.dist(1-S) # d=as.dist(S) # print(S) # metah = hclust(d, method='ward.D') h
# = hclust(d, method='ward.D')#although it is also meta, it is for a single large
# dataset # S = mE # nc = 2:min(40, dim(S)[1]-1) nc = max(2, min(t0)):min(40,
# dim(S)[1]-1) v = matrix(0, nrow = dim(S)[1], ncol = length(nc))#for all
# different numbers of clusters msil = rep(0, length(nc))#declare a vector of
# zeros CHind = rep(0, length(nc)) print(paste('The height for the top 10 are: ',
# tail(h$height, n = 10), sep = '')) # stop('Stop here!') for(i in 1:length(nc)){
# # foreach(i=1:length(nc)) %dopar%{ v[,i] = cutree(h, k = nc[i])#for different
# numbers of clusters sil = silhouette(v[,i], d)#calculate the silhouette index #
# msil[i] = mean(sil[,3])#the mean value of the index msil[i] =
# median(sil[,3])#the mean value of the index # CHind[i] = get_CH(S, v[,i],
# disMethod = 'Euclidean') ch0 =
# intCriteria(data.matrix(S),as.integer(v[,i]),'Calinski_Harabasz') CHind[i] =
# unlist(ch0, use.names = F)#convert a list to a vector/value } print(msil)#the
# average sil index for all cases print(CHind) # oind = which.max(msil)#the index
# corresponding to the max sil index tmp = which(msil == max(msil))#in case there
# are more than one maximum if(length(tmp)>1){oind =
# tmp[ceiling(length(tmp)/2)]}else{oind = tmp} if(max(msil)<=0.35){ oind =
# which.max(CHind) if(oind ==1){#it's likely that the CH index is not reliable
# either tmp = tail(h$height, n =10)#the height diftmp = diff(tmp) flag = diftmp
# > tmp[1:(length(tmp)-1)]#require the height is more than 2 times of the
# immediate consecutive one if(any(flag)){#if any satifies the condition; make
# sure at least one satisfied pind = which.max(flag) opth = (tmp[pind] +
# tmp[pind+1])/2#the optimal height to cut optv = cutree(h, h = opth)#using the
# appropriate height to cut oind = length(unique(optv)) - 1#for consistency } } }
# # oind = 1 tf = v[,oind]#the optimal clustering results
# tf=cutree(metah,k=N.cluster)
rerowColor[] <- vapply(rerowColor, function(x) tf[match(x, R)], numeric(1)) #apply to every element;this is for matrices
# finalC = apply(rerowColor, 1, function(d)
# names(sort(table(d),decreasing=TRUE)[1])) #find the most repeated elements for
# each row
# f = rerowColor my = sE1 silthre = 0 nthre = 0.05*length(f) nf =
# character(length = length(f)) # if(hres$maxsil > silthre){ # nf =
# as.character(f) # }else{ uf = unique(f) r = length(unique(f)) for(t in 1:r){
# tind = which(f==uf[t]) if(length(tind)<= nthre){ nf[tind] =
# as.character(f[tind]) }else{ tE = my[tind,] d1=as.dist(1-cor(t(tE))) hres1 =
# gethclust(d1, tE) rf = hres1$f nf[tind] = paste(rf, 'sub', t, sep = '') } } # }
finalColor = rerowColor
N.cluster = length(unique(finalColor)) #note that the number of clusters for meta-clustering is not determined by previous selection, but by the unique number in the final round.
# print(paste('The optimal number of clusters for combining partitions is: ',
# N.cluster, sep = ''))
cat("The optimal number of clusters for combining partitions is:", N.cluster,
"\n")
finalres = list()
finalres$finalColor = finalColor
finalres$tf = tf#optimal clustering results
return(finalres)
# return(finalColor)
}
getpaircor <- function(pairind, rerowColor, sE1, R) {
# G1 = sE1[rerowColor == R[pairind[1]], , drop = FALSE] #find all the cells for cluster i; 'drop = F' is to avoid a one-row/column matrix being converted to a vector
# G2 = sE1[rerowColor == R[pairind[2]], , drop = FALSE] #find all the cells for cluster j
G1 = sE1[which(rerowColor %in% R[pairind[1]]), , drop = FALSE] #find all the cells for cluster i; 'drop = F' is to avoid a one-row/column matrix being converted to a vector
G2 = sE1[which(rerowColor %in% R[pairind[2]]), , drop = FALSE] #find all the cells for cluster j
# cor1 = cor(t(G1), t(G2))#correlation between these two clusters t =
# 1.5*dim(G1)[2]
aG1 = colMeans(G1)
aG2 = colMeans(G2)
# corss = dist(rbind(aG1, aG2)) corss = exp(-sum((aG1 - aG2)^2)/t)
corss = cor(aG1, aG2)
# if(dim(G1)[1] != 0 && dim(G2)[1] != 0){#if not available or missing, one fo the sd's is 0, and then correlation is 0
# aG1 = colMeans(G1)
# aG2 = colMeans(G2)
# # corss = dist(rbind(aG1, aG2)) corss = exp(-sum((aG1 - aG2)^2)/t)
# corss = cor(aG1, aG2)
# if(length(corss) == 0){
# cat("Cor NULL exists for", R[pairind[1]], "and", R[pairind[2]], "\n")
# }
# }else{
# cat(R[pairind[1]], "and", R[pairind[2]], ": at least one of them is with length 0\n")
# corss = 0
# }
# as1 = 1/(1 + exp(-aG1)) as2 = 1/(1 + exp(-aG2)) ass =
# sum(aG1*aG2)/sqrt(sum(aG1^2)*sum(aG2^2)) corss = 1/(1 + exp(-ass)) cor1 =
# foreach(i = 1:dim(G1)[1], .combine = 'cbind') %:% foreach(j = 1:dim(G2)[1],
# .combine = 'c') %do%{ # sum(G1[i,]*G2[j,])/sqrt(sum(G1[i,]^2)*sum(G2[j,]^2))
# exp(-sum((G1[i,] - G2[j,])^2)/t) } m = dim(cor1) if(m[1] <= m[2]){#use the
# smaller-size group to measure the correlation/similarity corss =
# sum(do.call(pmax, data.frame(cor1)))/nrow(G1)#similarity between two clusters;
# summing all the max correlation/similarity }else{ corss = sum(do.call(pmax,
# data.frame(t(cor1))))/nrow(G2) }
# c1 = as.vector(cor1) c2 = c1[order(-c1)]#sorting the number in descending order
# corss = sum(c2[1:ceiling(length(c2)/2)])/sum(c2) c1 = do.call(pmax,
# data.frame(cor1)) c2 = do.call(pmax, data.frame(t(cor1))) i0 = intersect(c1,
# c2) u0 = union(c1, c2) corss = sum(i0)/sum(u0) corss = (sum(do.call(pmax,
# data.frame(cor1))) + sum(do.call(pmax, data.frame(t(cor1)))))/(nrow(G1) +
# nrow(G2))#similarity between two clusters; summing all the max
# correlation/similarity corss = (sum(do.call(pmax, data.frame(cor1)))/nrow(G1) +
# sum(do.call(pmax, data.frame(t(cor1))))/nrow(G2))/2 corss = max(max(cor1))
# print(corss) corss = mean(mean(cor1))
return(corss)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.