library(ICN) source("../R/Auxiliary.R") library(RColorBrewer) library(pracma) library(rARPACK) library(gplots) library(fields) library(parallel) library(matlab) set.seed(1)
In this vignette, we provide an example analysis starting with a correlation matrix consisting of 100 nodes and 2 initial clusters. We will perform the 3 steps of clustering, identifying connected communities, and narrowing down the connected edges for this example.
## Simulate Two Communities (100 nodes) # Simulate true corr matrix true1 = matrix(0, nrow = 100, ncol = 100) true1[1:20,1:20] = 0.5 true1[21:30,21:30] = 0.5 true1[1:20,21:30] =0.2 true1[21:30, 1:20] = 0.2 true1[1:30,1:30] = true1[1:30,1:30] + 0.5*diag(30) true1[31:100,31:100] = true1[31:100,31:100] + diag(70); heatmap.2(true1, Rowv = FALSE, Colv = FALSE, margins = c(6,12), col = jet.colors(100), trace = "none", labRow = FALSE, labCol = FALSE, key.title = "Color Key", keysize = 0.9, key.par = list(cex=0.5), key.xlab = "value", breaks=seq(0,1,0.01), density.info = "none", dendrogram = "none") # Simulate samples based on true corr matrix, and then calculate the sample corr matrix. nvars = 100 nobs = 50 R = chol(true1) # upper triangular Z_vec = rnorm(nvars * nobs) Z = matrix(Z_vec, nrow = 100) samples = crossprod(R,Z) # can use crossprod samples = t(samples) # For each node (100 nodes in total), draw 50 samples. simu1 = cor(samples) # Get corr between nodes heatmap.2(simu1, Rowv = FALSE, Colv = FALSE, margins = c(6,12), col = jet.colors(100), trace = "none", labRow = FALSE, labCol = FALSE, key.title = "Color Key", keysize = 0.9, key.par = list(cex=0.5), key.xlab = "value", density.info = "none", dendrogram = "none") # Shuffle the sample corr matrix shuffle_index = sample(1:100) corr = simu1[shuffle_index, shuffle_index] heatmap.2(corr, Rowv = FALSE, Colv = FALSE, margins = c(6,12), col = jet.colors(100), trace = "none", labRow = FALSE, labCol = FALSE, key.title = "Color Key", keysize = 0.9, key.par = list(cex=0.5), key.xlab = "value", density.info = "none", dendrogram = "none") nice = NICE_fast(corr, 0.2)
We have obtained the correlation matrix corr.rds
and the NICE clustering output nice.rds
from step 1. Now we compute the KL-divergence between community pair correlations and the null distribution to establish the significance of the interconnectedness between these community pairs.
Here we count the number of communities (clusters containing more than 1 node), locate their indices, and construct a "pairs" matrix enumerating all indexed pairs of communities (ordered by size descending):
n = dim(corr)[1] nicer = NICER(nice) # find number of 1+ node communities K = nicer$K K.ind = nicer$K.ind # find community indices comm.ind = vector(mode = "list", length = K) for (ind in 1:K){ comm.ind[[ind]] = as.vector(which(nice$Cindx == K.ind[ind])) }
# create a vector of singletons ordered.ind = nice$Cindx[nice$Clist] num.comm.ind = max(which(ordered.ind %in% K.ind)) singles = nice$Clist[(num.comm.ind + 1):n] # create the singleton matrix G.R = corr[singles, singles] diag(G.R) = 0 true_dist = squareform(G.R)
pairs = lst_pairs(K) n.pairs = dim(to_matrix(pairs))[1] G.pairwise = vector(mode = "list", length = n.pairs) null = vector() for (p in 1:n.pairs){ pairs = to_matrix(pairs) nodes.comm1 = comm.ind[[pairs[p,1]]] nodes.comm2 = comm.ind[[pairs[p,2]]] CCp = corr[nodes.comm1, nodes.comm2] G.pairwise[[p]] = as.vector(CCp) null = c(null, G.pairwise[[p]]) } # write null distribution: Off_2 = corr[nice$Clist[1:num.comm.ind], singles] null = c(null, as.vector(Off_2))
connections = lapply(G.pairwise, KLtest, null = null, true_dist = true_dist, a = 0.05)
# transform list interconnection into a vector interResult = unlist(connections) # transform matrix of community pairs index into dataframe inRes = as.data.frame(pairs) colnames(inRes) = c('community_one', 'community_two') inRes$whether_interconnected = interResult
d = max(inRes$community_two) A.mat = matrix(0, nrow = d, ncol = d) diag(A.mat) = 1 for(i in c(1:nrow(inRes))){ A.mat[inRes$community_one[i], inRes$community_two[i]] = inRes$whether_interconnected[i] A.mat[inRes$community_two[i], inRes$community_one[i]] = inRes$whether_interconnected[i] } par(cex.main=0.8, par(mar = c(1, 2, 2, 6))) colors = rep(brewer.pal(9, "Blues"), each = 3)[1:17] plot.A.mat = A.mat + diag(rep(2, nrow(A.mat))) # emphasize diagonal in plot adj.mat = heatmap.2(A.mat, dendrogram='none', Rowv=FALSE, Colv=FALSE, trace='none', key = F, col = as.vector(colors), main = "Communities Interconnection", colsep = 1:n, rowsep = 1:n, sepcolor = "grey", sepwidth=c(0.005,0.005)) image.plot(legend.only=T, zlim=range(0,1), col=colors, legend.mar = 3, legend.shrink = 0.5, legend.width = 0.5, legend.cex = 0.3)
interCon = inRes[inRes$whether_interconnected == 1,] #print(interCon) ### Select two interconnected communities from dataframe interCon and detect ### interconnected edges. kID1 = nice$CID[1] # an example, 4 here means the 4th cluster in heatmap. kID2 = nice$CID[2] nodes1 = which(nice$Cindx == kID1) nodes2 = which(nice$Cindx == kID2) nodecom = c(nodes1, nodes2) mat1 = corr[nodes1, nodes1] mat2 = corr[nodes2, nodes2] mat12 = corr[nodecom, nodecom] inter12 = corr[nodes1, nodes2] result3 = InterCon(mat1, mat2, mat12, inter12, 0.6, c(0.3, 0.5, 0.6, 0.8, 0.85, 0.9), nodes1, nodes2) paste('best threshold r: ',result3[[1]]) print('Interconnected edges: ') print(result3[[2]])
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.