In lwa19/ICN: Use Interconnected Communities Network (ICN) to characterize population subgroup features.

library(ICN)
source("../R/Auxiliary.R")
library(RColorBrewer)
library(pracma)
library(rARPACK)
library(gplots)
library(fields)
library(parallel)
library(matlab)
set.seed(1)

In this vignette, we provide an example analysis starting with a correlation matrix consisting of 100 nodes and 2 initial clusters. We will perform the 3 steps of clustering, identifying connected communities, and narrowing down the connected edges for this example.

Step 1

## Simulate Two Communities (100 nodes)
# Simulate true corr matrix
true1 = matrix(0, nrow = 100, ncol = 100)
true1[1:20,1:20] = 0.5
true1[21:30,21:30] = 0.5
true1[1:20,21:30] =0.2
true1[21:30, 1:20] = 0.2
true1[1:30,1:30] = true1[1:30,1:30] + 0.5*diag(30)
true1[31:100,31:100] = true1[31:100,31:100] + diag(70);
heatmap.2(true1, Rowv = FALSE, Colv = FALSE, margins = c(6,12), col = jet.colors(100), 
          trace = "none", labRow = FALSE, labCol = FALSE, key.title = "Color Key", 
          keysize = 0.9, key.par = list(cex=0.5), key.xlab = "value", breaks=seq(0,1,0.01),
          density.info = "none", dendrogram = "none")

# Simulate samples based on true corr matrix, and then calculate the sample corr matrix.
nvars = 100
nobs = 50
R = chol(true1) # upper triangular
Z_vec = rnorm(nvars * nobs)
Z = matrix(Z_vec, nrow = 100)
samples =  crossprod(R,Z) # can use crossprod
samples = t(samples) # For each node (100 nodes in total), draw 50 samples.
simu1 = cor(samples) # Get corr between nodes
heatmap.2(simu1, Rowv = FALSE, Colv = FALSE, margins = c(6,12), col = jet.colors(100), 
          trace = "none", labRow = FALSE, labCol = FALSE, key.title = "Color Key", 
          keysize = 0.9, key.par = list(cex=0.5), key.xlab = "value", density.info = "none", 
          dendrogram = "none")

# Shuffle the sample corr matrix
shuffle_index = sample(1:100)
corr = simu1[shuffle_index, shuffle_index]
heatmap.2(corr, Rowv = FALSE, Colv = FALSE, margins = c(6,12), col = jet.colors(100), 
          trace = "none", labRow = FALSE, labCol = FALSE, key.title = "Color Key", 
          keysize = 0.9, key.par = list(cex=0.5), key.xlab = "value", density.info = "none", 
          dendrogram = "none")

nice = NICE_fast(corr, 0.2)

Step 2: Identify significant connections between communities

We have obtained the correlation matrix corr.rds and the NICE clustering output nice.rds from step 1. Now we compute the KL-divergence between community pair correlations and the null distribution to establish the significance of the interconnectedness between these community pairs.

NICE post-processing

Here we count the number of communities (clusters containing more than 1 node), locate their indices, and construct a "pairs" matrix enumerating all indexed pairs of communities (ordered by size descending):

n = dim(corr)[1]
nicer = NICER(nice)

# find number of 1+ node communities
K = nicer$K
K.ind = nicer$K.ind

# find community indices
comm.ind = vector(mode = "list", length = K)
for (ind in 1:K){
  comm.ind[[ind]] = as.vector(which(nice$Cindx == K.ind[ind]))
}

Identify the singletons:

# create a vector of singletons
ordered.ind = nice$Cindx[nice$Clist]
num.comm.ind = max(which(ordered.ind %in% K.ind))
singles = nice$Clist[(num.comm.ind + 1):n]  

# create the singleton matrix
G.R = corr[singles, singles]
diag(G.R) = 0
true_dist = squareform(G.R)

find all pairwise communities G(Vc, Vc')

pairs = lst_pairs(K)
n.pairs = dim(to_matrix(pairs))[1]
G.pairwise = vector(mode = "list", length = n.pairs)
null = vector()
for (p in 1:n.pairs){
  pairs = to_matrix(pairs)
  nodes.comm1 = comm.ind[[pairs[p,1]]]
  nodes.comm2 = comm.ind[[pairs[p,2]]]
  CCp = corr[nodes.comm1, nodes.comm2] 
  G.pairwise[[p]] = as.vector(CCp)     
  null = c(null, G.pairwise[[p]])
}

# write null distribution: 
Off_2 = corr[nice$Clist[1:num.comm.ind], singles]
null = c(null, as.vector(Off_2))

Compute KL-divergence

connections = lapply(G.pairwise, KLtest, null = null,
                     true_dist = true_dist, a = 0.05)

Generate a dataframe of all community relationships:

1 if interconnected otherwise 0

# transform list interconnection into a vector
interResult = unlist(connections)
# transform matrix of community pairs index into dataframe
inRes = as.data.frame(pairs)
colnames(inRes) = c('community_one', 'community_two')
inRes$whether_interconnected = interResult

plot adjacency matrix

d = max(inRes$community_two)
A.mat = matrix(0, nrow = d, ncol = d)
diag(A.mat) = 1
for(i in c(1:nrow(inRes))){
  A.mat[inRes$community_one[i], inRes$community_two[i]] = 
    inRes$whether_interconnected[i]
  A.mat[inRes$community_two[i], inRes$community_one[i]] = 
    inRes$whether_interconnected[i]
}

par(cex.main=0.8, par(mar = c(1, 2, 2, 6)))
colors = rep(brewer.pal(9, "Blues"), each = 3)[1:17]
plot.A.mat = A.mat + diag(rep(2, nrow(A.mat)))    # emphasize diagonal in plot
adj.mat = heatmap.2(A.mat, dendrogram='none', Rowv=FALSE, Colv=FALSE, trace='none', 
                    key = F, col = as.vector(colors), main = "Communities Interconnection", 
                    colsep = 1:n, rowsep = 1:n, sepcolor = "grey", sepwidth=c(0.005,0.005))
image.plot(legend.only=T, zlim=range(0,1), col=colors, legend.mar = 3, 
           legend.shrink = 0.5, legend.width = 0.5, legend.cex = 0.3)

Step 3

interCon = inRes[inRes$whether_interconnected == 1,]
#print(interCon)
### Select two interconnected communities from dataframe interCon and detect
### interconnected edges.

kID1 = nice$CID[1] # an example, 4 here means the 4th cluster in heatmap.
kID2 = nice$CID[2]
nodes1 = which(nice$Cindx == kID1)
nodes2 = which(nice$Cindx == kID2)
nodecom = c(nodes1, nodes2)
mat1 = corr[nodes1, nodes1]
mat2 = corr[nodes2, nodes2]
mat12 = corr[nodecom, nodecom]
inter12 = corr[nodes1, nodes2]

result3 = InterCon(mat1, mat2, mat12, inter12, 
                   0.6, c(0.3, 0.5, 0.6, 0.8, 0.85, 0.9), nodes1, nodes2)
paste('best threshold r: ',result3[[1]])
print('Interconnected edges: ')
print(result3[[2]])