CTD_Lab-Exercise.R
In CTD: A Method for 'Connecting The Dots' in Weighted Graphs

## ----setup, include=FALSE-----------------------------------------------------
knitr::opts_chunk$set(echo = TRUE, fig.align="center")
require(CTD)
require(huge)
require(ggplot2)
require(gplots)
require(RColorBrewer)
print(sprintf("Current directory is %s: ", getwd()))

## ----manually_build_graph-----------------------------------------------------
adj_mat = rbind(c(0,3,1,0,0,0,0,0,0), #A's neighbors
                c(3,0,2,2,0,0,0,0,0), #B's neighbors
                c(1,2,0,0,2,0,0,0,0), #C's neighbors
                c(0,2,0,0,1,0,1,1,0), #D's neighbors
                c(0,0,2,1,0,2,0,2,0), #E's neighbors
                c(0,0,0,0,2,0,0,0,0), #F's neighbors
                c(0,0,0,1,0,0,0,1,0), #G's neighbors
                c(0,0,0,1,2,0,1,0,1), #H's neighbors
                c(0,0,0,0,0,0,0,1,0) #I's neighbors
                )
rownames(adj_mat) = c("A", "B", "C", "D", "E", "F", "G", "H", "I")
colnames(adj_mat) = c("A", "B", "C", "D", "E", "F", "G", "H", "I")
# Convert adjacency matrices to igrpah objects for all three graphs.
ig = graph.adjacency(adj_mat, mode="undirected", weighted=TRUE,
                        add.colnames = "name")
print(ig)
# Multiply edge weights by 5 to amplify edge width
# differences
plot.igraph(ig, edge.width=5*E(ig)$weight)


## ----learn_graph--------------------------------------------------------------
# Load the Miller2015_Heparin dataset
data(Miller2015)
# Only include metabolites that are present in >90% reference samples.
fil.rate=as.numeric(Miller2015$`Times identifed in all 200 samples`[-1])/200
names(fil.rate) = rownames(Miller2015)[-1]
data_mx = Miller2015[,grep("IEM_", colnames(Miller2015))]
data_mx = data_mx[which(fil.rate>0.90), ]
dim(data_mx)
# Remove any metabolites where any profile has a z-score > 1000. 
# These are likely imputed raw values that were not z-scored.
rmMets = names(which(apply(data_mx, 1, function(i) any(i>20))))
if (length(rmMets)>0) {
    data_mx = data_mx[-which(rownames(data_mx) %in% rmMets),]
}
dim(data_mx)

# Get data from all patients with Argininemia
diags = Miller2015["diagnosis", grep("IEM", colnames(Miller2015))]
arg_data = data_mx[,which(diags=="Argininemia")]
# Add surrogate disease and surrogate reference profiles based on 1 standard
# deviation around profiles from real patients to improve rank of matrix when
# learning Gaussian Markov Random Field network on data. While surrogate
# profiles are not required, they tend to learn less complex networks
# (i.e., networks with less edges) and in faster time.
ind = which(diags=="No biochemical genetic diagnosis")
arg_data=data.surrogateProfiles(arg_data, 1, ref_data = data_mx[,ind])
dim(arg_data)

# Learn a Gaussian Markov Random Field model using the Graphical LASSO in
# the R package "huge". 
# Select the regularization parameter based on the "STARS" stability
# estimate.
#This will take 30 seconds - 1 minute.
arg = huge(t(arg_data), method="glasso")
plot(arg)
# This will take several minutes. For a faster option, you can use the
# "ebic" criterion instead of "stars", but we recommend "stars".
arg.select = huge.select(arg, criterion="stars")
plot(arg.select)
# This is the regularization parameter the STARS method selected.
print(arg.select$opt.lambda)
# This is the corresponding inverse of the covariance matrix that corresponds
# to the selected regularization level.
arg_icov = as.matrix(arg.select$opt.icov)
# Remove all "self" edges, as we are not interested in self-relationships.
diag(arg_icov) = 0
rownames(arg_icov) = rownames(arg_data)
colnames(arg_icov) = rownames(arg_data)
# Convert adjacency matrices to an igraph object.
ig_arg = graph.adjacency(arg_icov, mode="undirected", weighted=TRUE,
                            add.colnames="name")
print(ig_arg)

## ----diffusion_no_visited-----------------------------------------------------
if (!dir.exists(sprintf("%s/images", getwd()))) { 
    dir.create(sprintf("%s/images", getwd()))
}
G=vector(mode="list", length=length(V(ig)$name))
G[1:length(G)] = 0
names(G) = c("A", "B", "C", "D", "E", "F", "G", "H", "I")
startNode = "A"
visitedNodes = startNode
coords = layout.fruchterman.reingold(ig)
# Uncomment next lines if you want to generate movie images
#G_new = graph.diffuseP1(p1=1.0, sn=startNode, G=G, vNodes=visitedNodes, 
#                        thresholdDiff=0.01, adj_mat=adj_mat, verbose=TRUE, 
#                        out_dir = sprintf("%s/images", getwd()), 
#                        r_level = 1, coords = coords)
# Inherited probabilities across all nodes should add to 1.
#sum(unlist(G_new))
# Which node inherited the highest probability from startNode?
#G_new[which.max(G_new)]

## ----diffusion_with_visited---------------------------------------------------
# Now let's see how the probability diffusion algorithm diffuses probability
# after B has been "stepped" into.
visitedNodes = c("A", "B")
startNode = "B"
# Uncomment the next lines if you want to generate movie images
#G_new = graph.diffuseP1(p1=1.0, sn=startNode, G, vNodes=visitedNodes,
#                        thresholdDiff=0.01, adj_mat, TRUE, 
#                        out_dir = sprintf("%s/images", getwd()), 
#                        1, coords)
# Inherited probabilities across all nodes should add to 1.
#sum(unlist(G_new))
# Which node inherited the highest probability from startNode?
#G_new[which.max(G_new)]

## ----diffuse_through_visited--------------------------------------------------
adj_mat = rbind(c(0,1,2,0,0,0,0,0,0), #A's neighbors
                c(1,0,3,0,0,0,0,0,0), #B's neighbors
                c(2,3,0,0,1,0,0,0,0), #C's neighbors
                c(0,0,0,0,0,0,1,1,0), #D's neighbors
                c(0,0,1,0,0,1,0,0,0), #E's neighbors
                c(0,0,0,0,1,0,0,0,0), #F's neighbors
                c(0,0,0,1,0,0,0,1,0), #G's neighbors
                c(0,0,0,1,0,0,1,0,0), #H's neighbors
                c(0,0,0,0,0,0,0,0,0) #I's neighbors
                )
rownames(adj_mat) = c("A", "B", "C", "D", "E", "F", "G", "H", "I")
colnames(adj_mat) = c("A", "B", "C", "D", "E", "F", "G", "H", "I")
# Convert adjacency matrices to igrpah objects for all three graphs.
ig = graph.adjacency(adj_mat, mode="undirected", weighted=TRUE,
                        add.colnames = "name")
coords = layout.fruchterman.reingold(ig)
print(ig)
# Now let's see how the probability diffusion algorithm diffuses probability
# after B has been "stepped" into "C" and then "A". As you can see, startNode
# "A" is surrounded by visited nodes "B" and "C". It needs to be smart enough
# to weigh "E" and "F" before "D", "H", "G" and "I".
visitedNodes = c("B", "C", "A")
startNode = "A"
# Uncomment the next lines if you want to generate movie images
#G_new = graph.diffuseP1(p1=1.0, sn=startNode, G, vNodes=visitedNodes,
#                        thresholdDiff=0.01, adj_mat, TRUE, 
#                        out_dir = sprintf("%s/images", getwd()), 
#                        1, coords)
# Inherited probabilities across all nodes should add to 1.
#sum(unlist(G_new))
# Which node inherited the highest probability from startNode?
#G_new[which.max(G_new)]

## ----multinode_network_walker-------------------------------------------------
# The multi-node network walker tends to overfit on the network and is more
# computationally intensive/slow compared to the single-node network walker.
# It is therefore not recommended that you use this network walker over the
# single-node network walker. However, it is unclear if this network walker
# can be beneficial in some circumstances or application areas. We include
# it as an experimental feature only.
adj_mat = rbind(c(0,3,1,0,0,0,0,0,0), #A's neighbors
                c(3,0,2,2,0,0,0,0,0), #B's neighbors
                c(1,2,0,0,2,0,0,0,0), #C's neighbors
                c(0,2,0,0,1,0,1,1,0), #D's neighbors
                c(0,0,2,1,0,2,0,2,0), #E's neighbors
                c(0,0,0,0,2,0,0,0,0), #F's neighbors
                c(0,0,0,1,0,0,0,1,0), #G's neighbors
                c(0,0,0,1,2,0,1,0,1), #H's neighbors
                c(0,0,0,0,0,0,0,1,0) #I's neighbors
                )
rownames(adj_mat) = c("A", "B", "C", "D", "E", "F", "G", "H", "I")
colnames(adj_mat) = c("A", "B", "C", "D", "E", "F", "G", "H", "I")
# Convert adjacency matrices to igrpah objects for all three graphs.
ig = graph.adjacency(adj_mat, mode="undirected", weighted=TRUE,
                        add.colnames = "name")
coords = layout.fruchterman.reingold(ig)
print(ig)
# Uncomment following line to generate PNGs and animate the 
# multi-node encoding node ranks.
#ranks = multiNode.getNodeRanks(S = c("A", "B"), G, p1=1.0,
#                                thresholdDiff=0.01, adj_mat, 
#                                log2(length(G)), FALSE, 
#                                out_dir = sprintf("%s/images", getwd()),
#                                TRUE, coords)
# Get node ranks as list object, with no images generated
ranks = multiNode.getNodeRanks(S = c("A", "B"), G, p1=1.0, 
                                thresholdDiff=0.01, adj_mat, 
                                log2(length(G)), FALSE)

## ----singlenode_network_walker------------------------------------------------
# This network walker tends to avoid overfitting. Of further note, since
# single-node network walker is not parameterized by the subset being 
# encoded, you can pre-compute node rankingsusing dynamic programming.
# Pre-computing node ranks enables quick encoding of thousands of subsets
# at a time (see The Encoding Process).
S = c("A", "B")
#out_dir=sprintf("%s/images",getwd())
# Uncomment the following lines to generate PNGs and animate the 
# single-node encoding node ranks.
#ranks = list()
#for (n in seq_len(length(S))) {
#    ind = which(names(G)==S[n])
#    ranks[[n]]=singleNode.getNodeRanksN(ind,G,p1=1.0,thresholdDiff=0.01,
#                                        adj_mat,S=S,num.misses=log2(length(G)),
#                                        FALSE,out_dir,TRUE,coords)
#}
#names(ranks) = S
# Get node ranks as list object, with no images generated
S = c("A", "B")
ranks = list()
for (n in 1:length(S)) {
    ind = which(names(G)==S[n])
    ranks[[n]]=singleNode.getNodeRanksN(ind,G,p1=1.0,thresholdDiff=0.01,
                                        adj_mat,S=S, 
                                        num.misses=log2(length(G)),FALSE)
}
names(ranks) = S

## ----arg_network--------------------------------------------------------------
print(ig_arg)
adj_mat = as.matrix(get.adjacency(ig_arg, attr="weight"))
G=vector(mode="list", length=length(V(ig_arg)$name))
G[1:length(G)] = 0
names(G) = V(ig_arg)$name

## ----node_subset--------------------------------------------------------------
# Maximum subset size to inspect
kmx=15
# Get our node subset associated with the $KMX highest perturbed (up or down)
# in our first Arginase deficiency sample.
S_arg = sort(abs(arg_data[,1]), decreasing=TRUE)[1:kmx]
print(S_arg)

## ----node_permutations--------------------------------------------------------
# Get the single-node encoding node ranks starting from each node in the subset
# S_arg.
ranks = list()
for (n in 1:length(S_arg)) {
    ind = which(names(G)==names(S_arg)[n])
    ranks[[n]]=singleNode.getNodeRanksN(ind,G,p1=1.0,thresholdDiff=0.01,
                                        adj_mat,S=names(S_arg),
                                        num.misses=log2(length(G)),TRUE)
}
names(ranks) = names(S_arg)

## ----convert_to_bitstrings----------------------------------------------------
# Get the bitstrings associated with the patient's perturbed metabolites in
# "S_arg" based on the node ranks calculated in the previous step, "ranks".
ptBSbyK = mle.getPtBSbyK(names(S_arg), ranks)

## ----encoding_length----------------------------------------------------------
ind = which(colnames(arg_data) %in% names(diags))
data_mx.pvals=apply(arg_data[,ind], c(1,2),
                    function(i) 2*pnorm(abs(i), lower.tail=FALSE))
ptID = "IEM_1006"
res = mle.getEncodingLength(ptBSbyK, t(data_mx.pvals), ptID, G)
ind.mx = which.max(res$d.score)
res[ind.mx,]

## ----probability_of_set-------------------------------------------------------
# This is the lower bound of the probability associated with the metabolites
# in S_arg. The higher the probability relative to a random set of the same
# size, the more tightly connected the metabolite set is.
2^-res[ind.mx,"IS.alt"]
# Note the probability printed above may seem low, but there are
# log2(length(G), kmx) outcomes that probability is assigned between. 
# We should expect a probability for a node set of size kmx in a length(G)
# network to have probability:
2^-(log2(choose(length(G), kmx)))
# You'll notice the probability associated with the metabolite set we encoded, 
# S_arg, is orders of magnitude higher than a uniform probability model. This 
# implies the metabolites in S_arg are  connected in the network ig_arg more 
# than is expected by chance.

## ----pvalue_of_set------------------------------------------------------------
# You can interpret the probability assigned to this metabolite set by
# comparing it to a null encoding algorithm, which uses fixed-length codes
# for all metabolites in the set. The "d.score" is the difference in bitlength 
# between the null and alternative encoding models. Using the Algorithmic 
# Significance theorem, we can estimate the upper bounds on a p-value by
# 2^-d.score.
2^-res[ind.mx,"d.score"]
# All metabolites in S_arg 
names(S_arg)
# Which metabolites were in the 8 metabolite subset of patient IEM_1006's 
# top 15 perturbed metabolites that had the above p-value?
ptBSbyK[[ind.mx]] # all metabolites in the bitstring
# just the F metabolites that are in S_arg that were were "found"
names(which(ptBSbyK[[ind.mx]]==1))

## ----patient_distances--------------------------------------------------------
data_mx=arg_data[,which(colnames(arg_data) %in% names(diags))]
data_mx=data_mx[,seq_len(8)]
S_arg=c()
for (pt in 1:ncol(data_mx)) {
    ptID=colnames(data_mx)[pt]
    S_arg=c(S_arg,names(sort(abs(data_mx[,pt]),decreasing=TRUE)[1:kmx]))
}
S_arg = unique(S_arg)
# Pre-computing node ranks from all perturbed metabolites across all patients
# is the overhead we have to pay for when using this mutual information-based 
# similarity metric, but will pay off when we go to compute several pairwise 
# calculations of similarity.
# It feels like a lot of overhead when run serially, but when run in parallel 
# (recommended) (e.g., a computing cluster) this finishes in minutes. 
ranks=list()
for (n in 1:length(S_arg)) {
    ind=which(names(G)==S_arg[n])
    ranks[[n]]=singleNode.getNodeRanksN(ind,G,p1=1.0,thresholdDiff=0.01, 
                                        adj_mat,S=S_arg, 
                                        num.misses=log2(length(G)),FALSE)
}
names(ranks)=S_arg
# Calculate patient bitstrings
ptBSbyK=list()
for (pt in 1:ncol(data_mx)) {
    ptID=colnames(data_mx)[pt]
    S_pt=names(sort(abs(data_mx[,pt]),decreasing=TRUE)[1:kmx])
    ptBSbyK[[ptID]]=mle.getPtBSbyK(S_pt, ranks)
}
# Now perform mutual information-based patient similarity scoring
res = list()
t = list(ncd=matrix(NA, nrow=ncol(data_mx), ncol=ncol(data_mx)))
rownames(t$ncd) = colnames(data_mx)
colnames(t$ncd) = colnames(data_mx)
for (i in 1:kmx) { res[[i]] = t }
for (pt in 1:ncol(data_mx)) {
    ptID=colnames(data_mx)[pt]
    for (pt2 in pt:ncol(data_mx)) {
        ptID2=colnames(data_mx)[pt2]
        # Because we pre-computed node ranks for all perturbed metabolites
        # across our 8 patients, this will complete very quickly.
        tmp = mle.getPtDist(ptBSbyK[[ptID]],ptID,ptBSbyK[[ptID2]],ptID2, 
                            data_mx,ranks,p1=1.0,thresholdDiff=0.01,adj_mat)
        for (k in 1:kmx) {
            res[[k]]$ncd[ptID, ptID2] = tmp$NCD[k]
            res[[k]]$ncd[ptID2, ptID] = tmp$NCD[k]
        }
    }
}

## ----visualizations-----------------------------------------------------------
# Multi-dimensional scaling
plot.mdsDist = function(patientDist, diagnoses, diag) {
    if (is.null(diagnoses)) {
        print("To view patient clusters, please provide clinical labels.")
        return(0)
    }
    fitDist = cmdscale(patientDist, eig=FALSE, k=2)
    x = round(fitDist[,1], 2)
    y = round(fitDist[,2], 2)
    df=data.frame(x=x,y=y,color=as.factor(diagnoses),label=colnames(patientDist))
    p=ggplot(df,aes(x,y,label=label))+geom_point(aes(colour=color),size=5)+
        geom_label(nudge_x = 0.1)
    return(p)
}
# K-nearest neighbors
plot.knnDist = function(patientDist, diagnoses, diag) {
    diagnoses = diagnoses[colnames(patientDist)]
    # Add a GREEN edge between patient nodes if k nearest neighbor is
    # correct diagnosis (either TP or TN)
    # Add a RED edge between patient nodes if k nearest neighbor is
    # incorrect diagnosis (either FP or FN)
    tp = 0
    fp = 0
    tn = 0
    fn = 0
    ig = make_empty_graph(n=ncol(patientDist), directed=TRUE)
    V(ig)$name = colnames(patientDist)
    for (pt1 in 1:length(diagnoses)) {
        diag_pt1 = diagnoses[pt1]
        ind = sort(patientDist[pt1,-pt1], decreasing = FALSE)
        ind = ind[which(ind==min(ind))]
        diag_pt_ind = diagnoses[which(names(diagnoses) %in% names(ind))]
        if (any(diag_pt_ind==diag) && diag_pt1==diag) { # True positive
            tp=tp + 1
            ind=ind[which(diag_pt_ind==diag)]
            ig=add.edges(ig,
                            edges=c(colnames(patientDist)[pt1],names(ind[1])),
                            attr=list(color="green", lty=1))
        } else if (diag_pt_ind!=diag && diag_pt1!=diag) { # True negative
            tn=tn + 1
            ig=add.edges(ig,
                            edges=c(colnames(patientDist)[pt1],names(ind[1])),
                            attr=list(color="green", lty=1))
        } else if (diag_pt_ind==diag && diag_pt1!=diag) { # False positive
            fp=fp + 1
            ig=add.edges(ig,
                            edges=c(colnames(patientDist)[pt1],names(ind[1])),
                            attr=list(color="red", lty=1))
        } else { # False negative
            fn=fn + 1
            ig=add.edges(ig,
                            edges=c(colnames(patientDist)[pt1],names(ind[1])),
                            attr=list(color="red", lty=1))
        }
    }
    print(sprintf("Tp = %d, Tn= %d, Fp = %d, Fn=%d", tp, tn, fp, fn))
    sens = tp / (tp+fn)
    spec = tn / (tn+fp)
    print(sprintf("Sens = %.2f, Spec= %.2f", sens, spec))
    V(ig)$label = rep("", length(V(ig)$name))
    return(ig)
}

# If you have diagnostic labels associated with the colnames(data_mx),
# send them using diagnoses parameter
res_ncd = lapply(res, function(i) i$ncd)
ncd = mle.getMinPtDistance(res_ncd)
dd = colnames(data_mx)
dd[which(dd %in% names(diags)[which(diags=="Argininemia")])] = "ARG"
dd[which(dd %in% names(diags)[which(diags!="Argininemia")])] = "negCntl"
names(dd) = colnames(res[[1]]$ncd)

colnames(ncd)=colnames(res[[1]]$ncd)
rownames(ncd)=colnames(res[[1]]$ncd)
p=plot.mdsDist(ncd, dd, NULL)
p 
# Hierarchical clustering
dd_f = as.numeric(as.factor(as.character(dd)))
heatmap.2(x=ncd,dendrogram="both", Rowv=TRUE,Colv=TRUE,
            ColSideColors=c(brewer.pal(12,"Set3"),brewer.pal(9,"BrBG"))[dd_f],
            RowSideColors=c(brewer.pal(12,"Set3"),brewer.pal(9,"BrBG"))[dd_f],
            cexRow=0.75,cexCol=1, margins=c(12,12), trace="none", key=TRUE,
            col=bluered, notecol="black")
legend("left", legend=unique(sort(as.character(dd))), 
        fill=c(brewer.pal(12,"Set3"), brewer.pal(9,"BrBG")), cex=2)
# K-NN
ig=plot.knnDist(ncd, dd, diag="ARG")
grps=list()
grps[[1]]=names(dd)[which(dd=="ARG")]
grps[[2]]=names(dd)[which(dd!="ARG")]
names(grps)=names(table(dd))
plot.igraph(ig, mark.groups=grps, mark.col=c("white", "black"), 
            layout=layout.circle, edge.width=3, edge.arrow.size=0.5)