## CTD package vignette: R code extracted from the R Markdown source
## ----setup, include=FALSE-----------------------------------------------------
# Vignette setup: echo all code chunks and center figures.
knitr::opts_chunk$set(echo = TRUE, fig.align="center")
# Use library() rather than require() for mandatory dependencies:
# require() only warns (and returns FALSE) when a package is missing,
# while library() fails fast with an error.
library(CTD)
library(huge)
library(ggplot2)
library(gplots)
library(RColorBrewer)
# Report where output (e.g., movie frames) will be written.
print(sprintf("Current directory is: %s", getwd()))
## ----manually_build_graph-----------------------------------------------------
# Build a small 9-node weighted, undirected toy graph as a symmetric
# adjacency matrix. Entry [i, j] is the edge weight between nodes i and j
# (0 = no edge).
node_names = c("A", "B", "C", "D", "E", "F", "G", "H", "I")
adj_mat = rbind(c(0,3,1,0,0,0,0,0,0), #A's neighbors
                c(3,0,2,2,0,0,0,0,0), #B's neighbors
                c(1,2,0,0,2,0,0,0,0), #C's neighbors
                c(0,2,0,0,1,0,1,1,0), #D's neighbors
                c(0,0,2,1,0,2,0,2,0), #E's neighbors
                c(0,0,0,0,2,0,0,0,0), #F's neighbors
                c(0,0,0,1,0,0,0,1,0), #G's neighbors
                c(0,0,0,1,2,0,1,0,1), #H's neighbors
                c(0,0,0,0,0,0,0,1,0)  #I's neighbors
                )
# Name both dimensions from a single source of truth instead of two
# duplicated literal vectors.
dimnames(adj_mat) = list(node_names, node_names)
# Convert the adjacency matrix to an igraph object.
# (graph_from_adjacency_matrix is the current name of the deprecated
# graph.adjacency; the modern igraph API is already used elsewhere in
# this file, e.g. make_empty_graph.)
ig = graph_from_adjacency_matrix(adj_mat, mode="undirected", weighted=TRUE,
                                 add.colnames = "name")
print(ig)
# Multiply edge weights by 5 to amplify edge width
# differences in the plot.
plot.igraph(ig, edge.width=5*E(ig)$weight)
## ----learn_graph--------------------------------------------------------------
# Load the Miller2015_Heparin dataset shipped with CTD.
data(Miller2015)
# Only include metabolites that are present in >90% reference samples.
# (The column name's "identifed" spelling matches the dataset; do not "fix" it.)
fil.rate=as.numeric(Miller2015$`Times identifed in all 200 samples`[-1])/200
names(fil.rate) = rownames(Miller2015)[-1]
# Patient profiles are the "IEM_"-prefixed columns.
data_mx = Miller2015[,grep("IEM_", colnames(Miller2015))]
# NOTE(review): fil.rate was computed on rows [-1] of Miller2015, but its
# indices are applied here to data_mx, which still contains the first row —
# confirm the intended row alignment.
data_mx = data_mx[which(fil.rate>0.90), ]
dim(data_mx)
# Remove any metabolites where any profile has an extreme z-score.
# These are likely imputed raw values that were not z-scored.
# NOTE(review): the comment originally said "z-score > 1000" but the filter
# below uses > 20 — confirm which threshold is intended.
rmMets = names(which(apply(data_mx, 1, function(i) any(i>20))))
if (length(rmMets)>0) {
data_mx = data_mx[-which(rownames(data_mx) %in% rmMets),]
}
dim(data_mx)
# Get data from all patients with Argininemia.
diags = Miller2015["diagnosis", grep("IEM", colnames(Miller2015))]
arg_data = data_mx[,which(diags=="Argininemia")]
# Add surrogate disease and surrogate reference profiles based on 1 standard
# deviation around profiles from real patients to improve rank of matrix when
# learning Gaussian Markov Random Field network on data. While surrogate
# profiles are not required, they tend to learn less complex networks
# (i.e., networks with less edges) and in faster time.
ind = which(diags=="No biochemical genetic diagnosis")
arg_data=data.surrogateProfiles(arg_data, 1, ref_data = data_mx[,ind])
dim(arg_data)
# Learn a Gaussian Markov Random Field model using the Graphical LASSO in
# the R package "huge".
# Select the regularization parameter based on the "STARS" stability
# estimate.
# This will take 30 seconds - 1 minute.
arg = huge(t(arg_data), method="glasso")
plot(arg)
# This will take several minutes. For a faster option, you can use the
# "ebic" criterion instead of "stars", but we recommend "stars".
arg.select = huge.select(arg, criterion="stars")
plot(arg.select)
# This is the regularization parameter the STARS method selected.
print(arg.select$opt.lambda)
# This is the corresponding inverse of the covariance matrix that corresponds
# to the selected regularization level.
arg_icov = as.matrix(arg.select$opt.icov)
# Remove all "self" edges, as we are not interested in self-relationships.
diag(arg_icov) = 0
# Carry the metabolite names over to the precision matrix.
rownames(arg_icov) = rownames(arg_data)
colnames(arg_icov) = rownames(arg_data)
# Convert the precision (partial-correlation structure) matrix to an igraph
# object.
ig_arg = graph.adjacency(arg_icov, mode="undirected", weighted=TRUE,
add.colnames="name")
print(ig_arg)
## ----diffusion_no_visited-----------------------------------------------------
# Create an images/ subdirectory for the optional diffusion movie frames.
if (!dir.exists(file.path(getwd(), "images"))) {
  dir.create(file.path(getwd(), "images"))
}
# G holds the probability assigned to each node; start everything at 0.
G = vector(mode="list", length=length(V(ig)$name))
G[seq_along(G)] = 0
names(G) = c("A", "B", "C", "D", "E", "F", "G", "H", "I")
# Diffuse from "A", with "A" itself already marked as visited.
startNode = "A"
visitedNodes = startNode
# Fixed layout so successive frames are comparable.
# (layout_with_fr is the current name of the deprecated
# layout.fruchterman.reingold.)
coords = layout_with_fr(ig)
# Uncomment next lines if you want to generate movie images
#G_new = graph.diffuseP1(p1=1.0, sn=startNode, G=G, vNodes=visitedNodes,
#                        thresholdDiff=0.01, adj_mat=adj_mat, verbose=TRUE,
#                        out_dir = sprintf("%s/images", getwd()),
#                        r_level = 1, coords = coords)
# Inherited probabilities across all nodes should add to 1.
#sum(unlist(G_new))
# Which node inherited the highest probability from startNode?
#G_new[which.max(G_new)]
## ----diffusion_with_visited---------------------------------------------------
# Re-run the diffusion demo after "stepping" into node "B": both "A" and
# "B" are now marked visited, and diffusion restarts from "B".
startNode <- "B"
visitedNodes <- c("A", "B")
# Uncomment the next lines if you want to generate movie images
#G_new = graph.diffuseP1(p1=1.0, sn=startNode, G, vNodes=visitedNodes,
#                        thresholdDiff=0.01, adj_mat, TRUE,
#                        out_dir = sprintf("%s/images", getwd()),
#                        1, coords)
# Inherited probabilities across all nodes should add to 1.
#sum(unlist(G_new))
# Which node inherited the highest probability from startNode?
#G_new[which.max(G_new)]
## ----diffuse_through_visited--------------------------------------------------
adj_mat = rbind(c(0,1,2,0,0,0,0,0,0), #A's neighbors
c(1,0,3,0,0,0,0,0,0), #B's neighbors
c(2,3,0,0,1,0,0,0,0), #C's neighbors
c(0,0,0,0,0,0,1,1,0), #D's neighbors
c(0,0,1,0,0,1,0,0,0), #E's neighbors
c(0,0,0,0,1,0,0,0,0), #F's neighbors
c(0,0,0,1,0,0,0,1,0), #G's neighbors
c(0,0,0,1,0,0,1,0,0), #H's neighbors
c(0,0,0,0,0,0,0,0,0) #I's neighbors
)
rownames(adj_mat) = c("A", "B", "C", "D", "E", "F", "G", "H", "I")
colnames(adj_mat) = c("A", "B", "C", "D", "E", "F", "G", "H", "I")
# Convert the adjacency matrix to an igraph object.
# (graph_from_adjacency_matrix / layout_with_fr are the current names of
# the deprecated graph.adjacency / layout.fruchterman.reingold.)
ig = graph_from_adjacency_matrix(adj_mat, mode="undirected", weighted=TRUE,
                                 add.colnames = "name")
coords = layout_with_fr(ig)
print(ig)
# Now let's see how the probability diffusion algorithm diffuses probability
# after B has been "stepped" into "C" and then "A". As you can see, startNode
# "A" is surrounded by visited nodes "B" and "C". It needs to be smart enough
# to weigh "E" and "F" before "D", "H", "G" and "I".
visitedNodes = c("B", "C", "A")
startNode = "A"
# Uncomment the next lines if you want to generate movie images
#G_new = graph.diffuseP1(p1=1.0, sn=startNode, G, vNodes=visitedNodes,
#                        thresholdDiff=0.01, adj_mat, TRUE,
#                        out_dir = sprintf("%s/images", getwd()),
#                        1, coords)
# Inherited probabilities across all nodes should add to 1.
#sum(unlist(G_new))
# Which node inherited the highest probability from startNode?
#G_new[which.max(G_new)]
## ----multinode_network_walker-------------------------------------------------
# The multi-node network walker tends to overfit on the network and is more
# computationally intensive/slow compared to the single-node network walker.
# It is therefore not recommended that you use this network walker over the
# single-node network walker. However, it is unclear if this network walker
# can be beneficial in some circumstances or application areas. We include
# it as an experimental feature only.
# Rebuild the first toy graph's adjacency matrix.
node_names = c("A", "B", "C", "D", "E", "F", "G", "H", "I")
adj_mat = rbind(c(0,3,1,0,0,0,0,0,0), #A's neighbors
                c(3,0,2,2,0,0,0,0,0), #B's neighbors
                c(1,2,0,0,2,0,0,0,0), #C's neighbors
                c(0,2,0,0,1,0,1,1,0), #D's neighbors
                c(0,0,2,1,0,2,0,2,0), #E's neighbors
                c(0,0,0,0,2,0,0,0,0), #F's neighbors
                c(0,0,0,1,0,0,0,1,0), #G's neighbors
                c(0,0,0,1,2,0,1,0,1), #H's neighbors
                c(0,0,0,0,0,0,0,1,0)  #I's neighbors
                )
# Name both dimensions from a single source of truth instead of two
# duplicated literal vectors.
dimnames(adj_mat) = list(node_names, node_names)
# Convert the adjacency matrix to an igraph object (modern igraph API names
# replace the deprecated graph.adjacency / layout.fruchterman.reingold).
ig = graph_from_adjacency_matrix(adj_mat, mode="undirected", weighted=TRUE,
                                 add.colnames = "name")
coords = layout_with_fr(ig)
print(ig)
# Uncomment following line to generate PNGs and animate the
# multi-node encoding node ranks.
#ranks = multiNode.getNodeRanks(S = c("A", "B"), G, p1=1.0,
#                               thresholdDiff=0.01, adj_mat,
#                               log2(length(G)), FALSE,
#                               out_dir = sprintf("%s/images", getwd()),
#                               TRUE, coords)
# Get node ranks as list object, with no images generated.
# NOTE(review): the unnamed arguments are positional (presumably adj_mat,
# num.misses, verbose) — confirm against multiNode.getNodeRanks's signature.
ranks = multiNode.getNodeRanks(S = c("A", "B"), G, p1=1.0,
                               thresholdDiff=0.01, adj_mat,
                               log2(length(G)), FALSE)
## ----singlenode_network_walker------------------------------------------------
# This network walker tends to avoid overfitting. Of further note, since the
# single-node network walker is not parameterized by the subset being
# encoded, you can pre-compute node rankings using dynamic programming.
# Pre-computing node ranks enables quick encoding of thousands of subsets
# at a time (see The Encoding Process).
S = c("A", "B")
#out_dir=sprintf("%s/images",getwd())
# Uncomment the following lines to generate PNGs and animate the
# single-node encoding node ranks.
#ranks = list()
#for (n in seq_len(length(S))) {
#  ind = which(names(G)==S[n])
#  ranks[[n]]=singleNode.getNodeRanksN(ind,G,p1=1.0,thresholdDiff=0.01,
#                                      adj_mat,S=S,num.misses=log2(length(G)),
#                                      FALSE,out_dir,TRUE,coords)
#}
#names(ranks) = S
# Get node ranks as a list object, with no images generated.
# (S is already defined above; preallocate the result list instead of
# growing it.)
ranks = vector("list", length(S))
for (n in seq_along(S)) {
  ind = which(names(G)==S[n])
  ranks[[n]]=singleNode.getNodeRanksN(ind,G,p1=1.0,thresholdDiff=0.01,
                                      adj_mat,S=S,
                                      num.misses=log2(length(G)),FALSE)
}
names(ranks) = S
## ----arg_network--------------------------------------------------------------
# Pull the learned Argininemia network back out as a dense weighted
# adjacency matrix, and reset the per-node probability container G to zeros.
print(ig_arg)
# (as_adjacency_matrix is the current name of the deprecated get.adjacency;
# as.matrix densifies the sparse result.)
adj_mat = as.matrix(as_adjacency_matrix(ig_arg, attr="weight"))
G = vector(mode="list", length=length(V(ig_arg)$name))
G[seq_along(G)] = 0
names(G) = V(ig_arg)$name
## ----node_subset--------------------------------------------------------------
# Maximum subset size to inspect.
kmx <- 15
# Rank metabolites in the first Arginase deficiency sample by absolute
# perturbation (up or down) and keep the kmx most perturbed.
S_arg <- sort(abs(arg_data[, 1]), decreasing = TRUE)[seq_len(kmx)]
print(S_arg)
## ----node_permutations--------------------------------------------------------
# Get the single-node encoding node ranks starting from each node in the
# subset S_arg. Preallocate the list instead of growing it in the loop.
# NOTE(review): the trailing TRUE is passed positionally — presumably a
# verbose flag; confirm against singleNode.getNodeRanksN's signature.
ranks = vector("list", length(S_arg))
for (n in seq_along(S_arg)) {
  ind = which(names(G)==names(S_arg)[n])
  ranks[[n]]=singleNode.getNodeRanksN(ind,G,p1=1.0,thresholdDiff=0.01,
                                      adj_mat,S=names(S_arg),
                                      num.misses=log2(length(G)),TRUE)
}
names(ranks) = names(S_arg)
## ----convert_to_bitstrings----------------------------------------------------
# Encode the patient's perturbed metabolite set S_arg as per-k bitstrings,
# using the node rankings ("ranks") computed in the previous step.
ptBSbyK <- mle.getPtBSbyK(names(S_arg), ranks)
## ----encoding_length----------------------------------------------------------
ind = which(colnames(arg_data) %in% names(diags))
data_mx.pvals=apply(arg_data[,ind], c(1,2),
function(i) 2*pnorm(abs(i), lower.tail=FALSE))
ptID = "IEM_1006"
res = mle.getEncodingLength(ptBSbyK, t(data_mx.pvals), ptID, G)
ind.mx = which.max(res$d.score)
res[ind.mx,]
## ----probability_of_set-------------------------------------------------------
# This is the lower bound of the probability associated with the metabolites
# in S_arg. The higher the probability relative to a random set of the same
# size, the more tightly connected the metabolite set is.
2^-res[ind.mx,"IS.alt"]
# Note the probability printed above may seem low, but there are
# choose(length(G), kmx) outcomes that probability is assigned between.
# We should expect a probability for a node set of size kmx in a length(G)
# network to have probability:
2^-(log2(choose(length(G), kmx)))
# You'll notice the probability associated with the metabolite set we encoded,
# S_arg, is orders of magnitude higher than a uniform probability model. This
# implies the metabolites in S_arg are connected in the network ig_arg more
# than is expected by chance.
## ----pvalue_of_set------------------------------------------------------------
# You can interpret the probability assigned to this metabolite set by
# comparing it to a null encoding algorithm, which uses fixed-length codes
# for all metabolites in the set. The "d.score" is the difference in bitlength
# between the null and alternative encoding models. Using the Algorithmic
# Significance theorem, we can estimate the upper bounds on a p-value by
# 2^-d.score.
2^-res[ind.mx,"d.score"]
# All metabolites in S_arg
names(S_arg)
# Which metabolites were in the 8 metabolite subset of patient IEM_1006's
# top 15 perturbed metabolites that had the above p-value?
ptBSbyK[[ind.mx]] # all metabolites in the bitstring
# Just the metabolites in S_arg that were "found" (bits set to 1).
names(which(ptBSbyK[[ind.mx]]==1))
## ----patient_distances--------------------------------------------------------
# Keep the labeled patient columns, restricted to the first 8 patients.
data_mx = arg_data[,which(colnames(arg_data) %in% names(diags))]
data_mx = data_mx[,seq_len(8)]
# Union of every patient's kmx most perturbed metabolites (built without
# growing a vector inside a loop).
S_arg = unique(unlist(lapply(seq_len(ncol(data_mx)), function(pt)
    names(sort(abs(data_mx[,pt]),decreasing=TRUE)[seq_len(kmx)]))))
# Pre-computing node ranks from all perturbed metabolites across all patients
# is the overhead we have to pay for when using this mutual information-based
# similarity metric, but will pay off when we go to compute several pairwise
# calculations of similarity.
# It feels like a lot of overhead when run serially, but when run in parallel
# (recommended) (e.g., a computing cluster) this finishes in minutes.
ranks = vector("list", length(S_arg))
for (n in seq_along(S_arg)) {
  ind = which(names(G)==S_arg[n])
  ranks[[n]]=singleNode.getNodeRanksN(ind,G,p1=1.0,thresholdDiff=0.01,
                                      adj_mat,S=S_arg,
                                      num.misses=log2(length(G)),FALSE)
}
names(ranks) = S_arg
# Calculate patient bitstrings from each patient's top kmx perturbations.
ptBSbyK = list()
for (pt in seq_len(ncol(data_mx))) {
  ptID = colnames(data_mx)[pt]
  S_pt = names(sort(abs(data_mx[,pt]),decreasing=TRUE)[seq_len(kmx)])
  ptBSbyK[[ptID]] = mle.getPtBSbyK(S_pt, ranks)
}
# Now perform mutual information-based patient similarity scoring.
# Use a dedicated name for the per-k result template; the original code
# called this "t", which shadowed base::t().
res = list()
ncd_template = list(ncd=matrix(NA, nrow=ncol(data_mx), ncol=ncol(data_mx)))
rownames(ncd_template$ncd) = colnames(data_mx)
colnames(ncd_template$ncd) = colnames(data_mx)
for (i in seq_len(kmx)) { res[[i]] = ncd_template }
# Fill the symmetric NCD distance matrix for every subset size k.
for (pt in seq_len(ncol(data_mx))) {
  ptID = colnames(data_mx)[pt]
  for (pt2 in pt:ncol(data_mx)) {
    ptID2 = colnames(data_mx)[pt2]
    # Because we pre-computed node ranks for all perturbed metabolites
    # across our 8 patients, this will complete very quickly.
    tmp = mle.getPtDist(ptBSbyK[[ptID]],ptID,ptBSbyK[[ptID2]],ptID2,
                        data_mx,ranks,p1=1.0,thresholdDiff=0.01,adj_mat)
    for (k in seq_len(kmx)) {
      res[[k]]$ncd[ptID, ptID2] = tmp$NCD[k]
      res[[k]]$ncd[ptID2, ptID] = tmp$NCD[k]
    }
  }
}
## ----visualizations-----------------------------------------------------------
# Multi-dimensional scaling
# Project the patient distance matrix into 2D with classical MDS and plot
# each patient as a labeled point colored by diagnosis.
# Returns a ggplot object, or 0 when no diagnostic labels are supplied.
# (The unused "diag" parameter is kept for interface compatibility.)
plot.mdsDist = function(patientDist, diagnoses, diag) {
  if (is.null(diagnoses)) {
    print("To view patient clusters, please provide clinical labels.")
    return(0)
  }
  mds_fit = cmdscale(patientDist, eig=FALSE, k=2)
  plot_df = data.frame(x=round(mds_fit[, 1], 2),
                       y=round(mds_fit[, 2], 2),
                       color=as.factor(diagnoses),
                       label=colnames(patientDist))
  gg = ggplot(plot_df, aes(x, y, label=label)) +
    geom_point(aes(colour=color), size=5) +
    geom_label(nudge_x = 0.1)
  return(gg)
}
# K-nearest neighbors
# Draw a directed edge from each patient to one of its nearest neighbor(s)
# under patientDist: GREEN when the neighbor's diagnosis implies a correct
# call for "diag" (TP or TN), RED when incorrect (FP or FN). Prints the
# confusion counts and sensitivity/specificity, and returns the igraph
# object (with blank vertex labels) for plotting.
plot.knnDist = function(patientDist, diagnoses, diag) {
  # Align label order to the distance matrix's column order.
  diagnoses = diagnoses[colnames(patientDist)]
  # Add a GREEN edge between patient nodes if k nearest neighbor is
  # correct diagnosis (either TP or TN)
  # Add a RED edge between patient nodes if k nearest neighbor is
  # incorrect diagnosis (either FP or FN)
  tp = 0
  fp = 0
  tn = 0
  fn = 0
  ig = make_empty_graph(n=ncol(patientDist), directed=TRUE)
  V(ig)$name = colnames(patientDist)
  for (pt1 in seq_along(diagnoses)) {
    diag_pt1 = diagnoses[pt1]
    # Nearest neighbor(s): keep every patient tied at the minimum distance.
    ind = sort(patientDist[pt1,-pt1], decreasing = FALSE)
    ind = ind[which(ind==min(ind))]
    diag_pt_ind = diagnoses[which(names(diagnoses) %in% names(ind))]
    # diag_pt_ind can have length > 1 on ties; use any()/all() so the if()
    # conditions stay scalar (bare vector comparisons with && are an error
    # in modern R). For a single nearest neighbor this is identical to the
    # original scalar comparisons.
    if (any(diag_pt_ind==diag) && diag_pt1==diag) { # True positive
      tp=tp + 1
      # NOTE(review): this indexes ind by positions in diag_pt_ind, whose
      # order follows diagnoses rather than ind — confirm for tie cases.
      ind=ind[which(diag_pt_ind==diag)]
      ig=add.edges(ig,
                   edges=c(colnames(patientDist)[pt1],names(ind[1])),
                   attr=list(color="green", lty=1))
    } else if (all(diag_pt_ind!=diag) && diag_pt1!=diag) { # True negative
      tn=tn + 1
      ig=add.edges(ig,
                   edges=c(colnames(patientDist)[pt1],names(ind[1])),
                   attr=list(color="green", lty=1))
    } else if (any(diag_pt_ind==diag) && diag_pt1!=diag) { # False positive
      fp=fp + 1
      ig=add.edges(ig,
                   edges=c(colnames(patientDist)[pt1],names(ind[1])),
                   attr=list(color="red", lty=1))
    } else { # False negative
      fn=fn + 1
      ig=add.edges(ig,
                   edges=c(colnames(patientDist)[pt1],names(ind[1])),
                   attr=list(color="red", lty=1))
    }
  }
  print(sprintf("Tp = %d, Tn= %d, Fp = %d, Fn=%d", tp, tn, fp, fn))
  sens = tp / (tp+fn)
  spec = tn / (tn+fp)
  print(sprintf("Sens = %.2f, Spec= %.2f", sens, spec))
  # Blank labels: group membership is shown by color/marks in the caller.
  V(ig)$label = rep("", length(V(ig)$name))
  return(ig)
}
# If you have diagnostic labels associated with the colnames(data_mx),
# send them using the diagnoses parameter.
# Collect the per-k NCD matrices computed in the previous chunk.
res_ncd = lapply(res, function(i) i$ncd)
# Minimum NCD across subset sizes k gives the final pairwise distance.
ncd = mle.getMinPtDistance(res_ncd)
# Build a label vector: "ARG" for Argininemia patients, "negCntl" otherwise.
dd = colnames(data_mx)
dd[which(dd %in% names(diags)[which(diags=="Argininemia")])] = "ARG"
dd[which(dd %in% names(diags)[which(diags!="Argininemia")])] = "negCntl"
names(dd) = colnames(res[[1]]$ncd)
colnames(ncd)=colnames(res[[1]]$ncd)
rownames(ncd)=colnames(res[[1]]$ncd)
# Multi-dimensional scaling plot of the patient distance matrix.
p=plot.mdsDist(ncd, dd, NULL)
p
# Hierarchical clustering with side colors keyed to the diagnosis labels.
dd_f = as.numeric(as.factor(as.character(dd)))
heatmap.2(x=ncd,dendrogram="both", Rowv=TRUE,Colv=TRUE,
ColSideColors=c(brewer.pal(12,"Set3"),brewer.pal(9,"BrBG"))[dd_f],
RowSideColors=c(brewer.pal(12,"Set3"),brewer.pal(9,"BrBG"))[dd_f],
cexRow=0.75,cexCol=1, margins=c(12,12), trace="none", key=TRUE,
col=bluered, notecol="black")
legend("left", legend=unique(sort(as.character(dd))),
fill=c(brewer.pal(12,"Set3"), brewer.pal(9,"BrBG")), cex=2)
# K-NN
# Nearest-neighbor classification graph with "ARG" as the positive class.
ig=plot.knnDist(ncd, dd, diag="ARG")
# Highlight the two diagnosis groups in a circular layout.
grps=list()
grps[[1]]=names(dd)[which(dd=="ARG")]
grps[[2]]=names(dd)[which(dd!="ARG")]
names(grps)=names(table(dd))
plot.igraph(ig, mark.groups=grps, mark.col=c("white", "black"),
layout=layout.circle, edge.width=3, edge.arrow.size=0.5)
# Note: the following lines are boilerplate from the hosting service,
# kept as comments so the script remains runnable:
# Any scripts or data that you put into this service are public.
# Add the following code to your website.
# For more information on customizing the embed code, read Embedding Snippets.