In youngser/gmmase: Graph Spectral Clustering

source("~/Dropbox/Worm/Codes/Connectome/mbstructure/R/structure-utils.R")
#source("http://www.cis.jhu.edu/~parky/Semipar_vs_Nonpar/utils.r")
source("~/Dropbox/RFiles/ccc_utils.R")

suppressMessages(library(knitr))
#opts_knit$set(animation.fun = hook_scianimator)

#suppressMessages(library(tourr))
#suppressMessages(library(animint))
suppressMessages(library(RColorBrewer))

Problem

GM250_seed is an instance of the Graph Matching problem.
In this problem two graphs are given. G1 and G2.
A partial map between nodes of G1 and G2 are provided in the train data.
The task is to predict the mapping between the unmapped nodes in the test data.

Data

Data for GM250_seed consists of two graphs in raw_data dir:

G1.gml: attributed undirected graph; ~~250~~ 755 nodes; 5138 edges; ~~12~~ 4 features for each node
G2.gml: attributed undirected graph; ~~124~~ 755 nodes; ~~324~~ 5138 edges; ~~12~~ 4 features for each node
trainData.csv contains G1 nodes and trainTargets.csv contains G2 nodes. Together, they constitute 151 known mappings.
~~testData.csv contains G2 nodes, for which the mappings have to be predicted in testTargets.csv~~

suppressMessages(library(tidyverse))
suppressMessages(library(igraph))
suppressMessages(library(Matrix))
suppressMessages(library(VN))

dtifile <- "~/Dropbox/D3M/D3M/connectome_stats/dwimean_3216.edgelist"
mrifile <- "~/Dropbox/D3M/D3M/connectome_stats/fmrimean_2039.edgelist"

dtidat <- read.table(dtifile)
mridat <- read.table(mrifile)
gdti <- graph_from_edgelist(as.matrix(dtidat)[,-3], directed=FALSE); 
E(gdti)$weight <- dtidat[,3]; summary(gdti); is.connected(gdti)
gmri <- graph_from_edgelist(as.matrix(mridat)[,-3], directed=FALSE); 
E(gmri)$weight <- mridat[,3]; summary(gmri); is.connected(gmri)

out1 <- gmmase(gdti, dmax = 20, embed = "ASE", Kmax = 10, clustering = "GMM", verbose=FALSE)
#out1 <- gmmase(g, dmax = 100, embed = "LSE", Kmax = 10, clustering = "GMM", verbose=FALSE)
#out1 <- gmmase(g, dmax = 100, embed = "ASE", Kmax = 10, clustering = "Kmeans", verbose=FALSE)

Xhat1 <- out1$mc$data
Yhat <- out1$Y
df2 <- data.frame(Xhat=Xhat1, cluster=as.factor(Yhat))
ggpairs(df2, columns=1:(ncol(df2)-2), mapping=aes(color=cluster, alpha=0.5))

# rearrange the graphs so that seeds are the first m vertices
matched.id1 <- match(train$G1.nodeID, V(g1)$nodeID) # 151
perm.g1 <- invPerm(c(matched.id1, (1:vcount(g1))[-matched.id1]))
matched.id2 <- unique(match(train$G2.nodeID, V(g2)$nodeID)) # 145
perm.g2 <- invPerm(c(matched.id2, (1:vcount(g2))[-matched.id2]))
g1.new <- permute.vertices(g1, perm.g1)
g2.new <- permute.vertices(g2, perm.g2)
g2.sub <- induced.subgraph(g2.new, 1:nrow(train)); summary(g2.sub)

Seeded Graph Matching

So, m = r nrow(train) correspondence are given.
We will use the first $s = {0,30,60,90,120, 150}$ vertices as seeds and repeat the process 100 times to see the matching performance.

On Sep 9, 2017, at 12:18 PM, Vince Lyzinski vincelyzinski@gmail.com wrote:

hard seeding enforces the seeds throughout the problem (they can't change), while soft seeding just initializes at the seeds, but allows them to change in the course of the optimization

A1 <- as.matrix(g1.new[])
A2 <- as.matrix(g2.new[])
n <- nrow(A1)
m <- nrow(A2)
gamma <- 1
s <- 10 # numseed
M <- rsp(n-s,gamma)
S <- diag(n);
S[(s+1):n,(s+1):n] <- M
system.time(out <- sgm(A2, A1, 0, start=S, pad=0, iteration=30))
newA2 <- out$P %*% A1 %*% t(out$P)
(f <- norm(A2[1:m,1:m]-newA2[1:m,1:m], "F"))
(matchV <- sum(out$corr[1:m] == 1:m))
#The accuracy for the first `m` = `r m` vertices is `r matchV`, which is `r matchV/m`%.

set.seed(12345)

A1 <- as.matrix(g1.new[])
A2 <- as.matrix(g2.new[])
n <- nrow(A1)
m <- nrow(A2)
gamma <- 1

nmc <- 100
niter <- 30
svec <- seq(0, 150, by=30)

method <- "hard"

for (s in svec) {
    cat("Working on s = ", s, "\n")
    mc <- foreach (i=1:nmc) %dopar% {
        ## S is a starting point for softseeding
        if (method=="soft") {
            M <- rsp(n-s,gamma)
            S <- diag(n);
            S[(s+1):n,(s+1):n] <- M
            out <- sgm(A2,A1,0,start=S,pad=0,iteration=niter)
        } else { # "hard"
            S <-matrix(1/(n-s), n-s, n-s) 
            out <- sgm(A2,A1,s,start=S,pad=0,iteration=niter)
            if (s > 0) {
                out$corr <- c(1:s,out$corr)  
            }
        }

        newA2 <- out$P %*% A1 %*% t(out$P)
        f <- norm(A2[1:m,1:m]-newA2[1:m,1:m], "F")
        matchV <- sum(out$corr[1:m] == 1:m)
        matchE <- sum((A2[1:m,1:m]+newA2[1:m,1:m])>=2) / 2
        c(matchV, matchE, f)
    }
    save(mc,n,m,nmc,s,gamma,file=paste0("mc-r49-s",s,"-nmc",nmc,"-niter",niter,"-",method,".Rbin"))
}

matchV <- matchE <- f <- NULL
for (s in svec) {
    load(paste0("~/Dropbox/D3M/D3M/r49/mc-r49-s",s,"-nmc",nmc,"-niter",niter,".Rbin"))
    matchV <- c(matchV, unlist(sapply(mc,"[",1)))
    matchE <- c(matchE, unlist(sapply(mc,"[",2))/2)
    f <- c(f, unlist(sapply(mc,"[",3)))
}

df <- data.frame(init=rep(1:nmc,times=length(svec)),
                 f=f, matchV=matchV, matchE=matchE,
                 s=factor(rep(svec, each=nmc)))

df3 <- df %>% group_by(s) %>% arrange(f) %>% mutate(init2=order(f)) %>% select(-init)

# "hard"
niter <- 30
matchV <- matchE <- f <- NULL
for (s in svec) {
    load(paste0("~/Dropbox/D3M/D3M/r49/mc-r49-s",s,"-nmc1-niter",niter,"-hard.Rbin")); nmc=100
    matchV <- c(matchV, rep(unlist(sapply(mc,"[",1)),nmc))
    matchE <- c(matchE, rep(unlist(sapply(mc,"[",2)),nmc))
    f <- c(f, rep(unlist(sapply(mc,"[",3)),nmc))
}

df <- data.frame(init=rep(1:nmc,times=length(svec)),
                 f=f, matchV=matchV, matchE=matchE,
                 s=factor(rep(svec, each=nmc)))

df32 <- df %>% group_by(s) %>% arrange(f) %>% mutate(init2=order(f)) %>% select(-init)

df33 <- rbind(data.frame(method="soft", df3), data.frame(method="hard", df32))

df4 <- gather(df33, key="variable", value="value", -c(s,init2,method))

ggplot(df4, aes(x=init2,y=value)) +
    facet_grid(variable~., scales="free") + xlab("initialization") +
#    geom_point(aes(col=s, shape=method)) +
    geom_line(aes(col=s, linetype=method), size=1.2) + 
#    scale_color_brewer(palette = "Set1") +
    theme(text=element_text(size=12)) + 
    theme(strip.text=element_text(size=rel(1.1))) +
    theme(legend.title=element_text(size=rel(1.1),face="bold")) + labs(color="seeds") +
    theme(legend.text=element_text(size=rel(1.0),face="plain"))