# Workbook setup: remember the working directory for knitr and create the
# output folders used throughout the analysis.
w.dir=getwd() #""
knitr::opts_chunk$set(echo = TRUE)
knitr::opts_knit$set(root.dir = w.dir)
setwd(w.dir)

require(R.utils)
# Output directories: learned networks, precomputed node ranks, fitted
# model scores, and miscellaneous output ("OP").
graph.dir="graphs"
rank.dir="ranks"
model.dir="models"
OP.dir="OP"
# R.utils::mkdirs creates each directory only if it does not already exist.
sapply(c(graph.dir,rank.dir,model.dir,OP.dir),mkdirs)

## Data Load

# Data load: download the published z-score matrix, strip metabolite
# annotation columns, alias 6 diagnosed ALAIMO samples under their EDTA
# cohort names, build per-diagnosis cohort lists, and binary-code
# argininosuccinate from raw intensities. Results saved to clinical_data.RData.
rm(list=setdiff(ls(),grep("dir",ls(),value = T)))
require(R.utils)
require(CTD)
require(CTDext)
require(openxlsx)
url="https://static-content.springer.com/esm/art%3A10.1038%2Fs41598-022-10415-5/MediaObjects/41598_2022_10415_MOESM2_ESM.xlsx"
download.file(url,destfile = "data_mx.xlsx")
# Fixed: the original call ended with a trailing comma, which passes an
# extra empty (missing) argument to read.xlsx.
data_mx.og=read.xlsx("data_mx.xlsx",
                  sheet = "Z-score",startRow = 2,
                  colNames = T,rowNames = T,skipEmptyRows = TRUE,skipEmptyCols = TRUE)
# Keep sample columns only; drop metabolite annotation columns.
data_mx.og=data_mx.og[
  ,!colnames(data_mx.og) %in% c("PATHWAY_SORTORDER", "SUPER_PATHWAY", "SUB_PATHWAY",
                                "CHEMICAL_ID", "RI", "MASS",
                                "PUBCHEM", "CAS", "KEGG", "HMDB_ID" )]
# 6 ALAIMO samples with diagnosed diseases were also used for model training;
# duplicate their columns under the corresponding EDTA cohort names.
dupNames=matrix(nrow = 6, ncol = 2)
dupNames[,1]=c(
  "ALAIMO-68b", "ALAIMO-68a","ALAIMO-44", "ALAIMO-85", "ALAIMO-136", "ALAIMO-13" )
dupNames[,2]=c(
  "EDTA-ABAT-5", "EDTA-ABAT-6", "EDTA-AADC-2", "EDTA-ABAT-7", "EDTA-AADC-3", "EDTA-OTC-17")
temp=data_mx.og[,dupNames[,1]]
colnames(temp)=dupNames[,2]
data_mx.og=cbind(data_mx.og,temp)
# Cohort membership: sample columns are matched by their "<prefix>-" names.
models=sort(c("zsd","rcdp","arg","otc","asld","abat","aadc","adsl","cit","cob","ga","gamt","msud","mma","pa","pku"))
cohorts=sapply(c(models,"hep-ref", "edta-ref", "alaimo"),
               function(x) grep(paste(x,"-",sep = ""),colnames(data_mx.og),value = T,ignore.case = T))
names(cohorts)[names(cohorts)=="hep-ref"]="hep_refs"
names(cohorts)[names(cohorts)=="edta-ref"]="edta_refs"
# Binary code argininosuccinate: absent in raw intensities -> 0, present -> 10.
data_mx.raw=read.xlsx(system.file("dataset/dataset1.xlsx",package = "CTDext"),
                  sheet = "Raw intensity",startRow = 2,
                  colNames = T,rowNames = T,skipEmptyRows = TRUE,skipEmptyCols = TRUE)
data_mx.raw=data_mx.raw[,colnames(data_mx.og)[colnames(data_mx.og)%in%colnames(data_mx.raw)]]
ptZeroFromRaw=colnames(data_mx.raw[,is.na(unlist(data_mx.raw["argininosuccinate",]))])
pt10FromRaw=colnames(data_mx.raw[,!is.na(unlist(data_mx.raw["argininosuccinate",]))])
# Samples without raw data: treat NA or 0 z-scores as absent.
temp=data_mx.og[,!colnames(data_mx.og) %in% colnames(data_mx.raw)]
ptZeroFromOg=colnames(temp)[ is.na(unlist(temp["argininosuccinate",]))| as.numeric(unlist(temp["argininosuccinate",]))==0]
data_mx.ogBnry=data_mx.og # Jul
data_mx.ogBnry["argininosuccinate",c(ptZeroFromRaw,ptZeroFromOg)]=0
data_mx.ogBnry["argininosuccinate",pt10FromRaw]=10
# data_mx.og: load("clinical_data_July2020.RData")
# data_mx.ogBnry : "clinical_data_Oct2020.RData"
rm(list=setdiff(ls(),grep("dir|data_mx|cohorts|dupNames",ls(),value = T)))
save.image("clinical_data.RData")

## Get number of metabolites detected

``` {r counts}
# Count non-missing metabolite measurements per sample, separately for
# unnamed (rows like "x - 12345") and named metabolites.
rm(list=setdiff(ls(),c(grep("dir|model|cohort",ls(),value = T),lsf.str())))
data_mx=loadToEnv("clinical_data.RData")[["data_mx.raw"]]

# Unnamed metabolites.
data_mx_unnamed = data_mx[grep("x -", rownames(data_mx)), which(colnames(data_mx) %in% c(unlist(cohorts)))]
zscore_cts = apply(data_mx_unnamed, 2, function(i) length(which(!is.na(i))))
mean(zscore_cts)
min(zscore_cts)
max(zscore_cts)

# Named (annotated) metabolites.
data_mx_named = data_mx[-grep("x -", rownames(data_mx)), which(colnames(data_mx) %in% c(unlist(cohorts)))]
zscore_cts = apply(data_mx_named, 2, function(i) length(which(!is.na(i))))
mean(zscore_cts)
min(zscore_cts)
max(zscore_cts)
```

## Building a Gaussian Graphical Model to Diagnose 16 Different Metabolic Disorders (skip if using github-saved networks)
Metabolites represented in at least 50% of reference samples and 50% of disease cases were included in network learning. This is a time-consuming step.
``` {r learn-nets_heparin}
require(R.utils)
require(CTD)
require(huge)
rm(list=setdiff(ls(),c(grep("dir|model|cohort",ls(),value = T),lsf.str())))
data_mx=loadToEnv("clinical_data.RData")[["data_mx.ogBnry"]]
cohorts=loadToEnv("clinical_data.RData")[["cohorts"]]
# Restrict to heparin samples; drop metabolites missing in every sample.
data_mx=data_mx[,grep("hep-",colnames(data_mx),ignore.case = T)]
data_mx=data_mx[rowSums(is.na(data_mx)) != ncol(data_mx),]

# Per-metabolite missingness fraction across heparin reference samples.
refs=data_mx[,grep("ref",colnames(data_mx),ignore.case = T)]
ref_fill = apply(refs, 1, function(i) sum(is.na(i))/length(i))

# refs2=refs[which(ref_fill[rownames(data_mx)]>0.8),]
refs2=refs
# Keep only cohort members that are present in the heparin matrix.
cohorts = lapply(cohorts, function(i) i[which(i %in% colnames(data_mx))])

# We perfer training models on edta samples for various reasons (latest dataset); here we show an example of network learning on Citrullinemia disease that only contains heparin samples; include "msud", "mma", "pa", "pku", "cob", "ga", "gamt" if you want to train all 8 models
# Learn one disease network per leave-one-out fold. Per fold:
#  1. embed the held-in disease profiles with surrogate reference profiles,
#  2. fit a disease+control GLASSO network (lambda selected by StARS),
#  3. fit a control-only GLASSO network at the same lambda,
#  4. prune disease-network edges also present in the control network.
# We prefer training models on EDTA samples (latest dataset); citrullinemia
# (CIT) only has heparin samples, so it is shown here. Include "msud",
# "mma", "pa", "pku", "cob", "ga", "gamt" to train all 8 heparin models.
for (model in c("cit")) { #, "msud", "mma", "pa", "pku", "cob", "ga", "gamt"
  for (fold in 1:length(cohorts[[model]])) {
    print(sprintf("Learning graphs for diag %s, fold %d...", model, fold))
    diag_pts = cohorts[[model]][-fold]  # hold out one case per fold
    print(diag_pts)
    fill.rate = apply(data_mx[,which(colnames(data_mx) %in% diag_pts)], 1, function(i) sum(is.na(i))/length(i))
    # Keep metabolites observed in >=50% of references AND >=50% of cases.
    diag_data = data_mx[intersect(which(ref_fill<0.5), which(fill.rate<=0.50)), which(colnames(data_mx) %in% diag_pts)]
    diag_data = diag_data[which(rownames(diag_data) %in% rownames(refs2)),]
    print("Individual samples as training data. Latent variable embedding and network pruning.")
    diag_data = CTD::data.surrogateProfiles(data = diag_data, std = 1, ref_data = refs2)
    print(dim(diag_data))

    # Disease Network: GLASSO approach
    inv_covmatt = huge(t(diag_data), method="glasso")
    inv_covmatt_select = huge.select(inv_covmatt, criterion = "stars")
    inv_covmat = as.matrix(inv_covmatt_select$icov[[which(inv_covmatt_select$lambda==inv_covmatt_select$opt.lambda)]])
    diag(inv_covmat) = 0  # no self-edges
    colnames(inv_covmat) = rownames(diag_data)
    ig = graph.adjacency(as.matrix(inv_covmat), mode="undirected", weighted=TRUE, add.colnames='name')
    V(ig)$name = rownames(diag_data)
    print(ig)

    # Reference Network: GLASSO at the disease network's optimal lambda
    ref_data = data.surrogateProfiles(data = refs2, std = 1, ref_data = refs2)
    ref_data = ref_data[,-which(duplicated(colnames(ref_data)))]
    print(dim(ref_data))
    inv_covmatt = huge(t(ref_data), method="glasso", lambda = inv_covmatt_select$opt.lambda)
    inv_covmat = as.matrix(inv_covmatt$icov[[1]])
    diag(inv_covmat) = 0
    colnames(inv_covmat) = rownames(ref_data)
    ig_ref = graph.adjacency(as.matrix(inv_covmat), mode="undirected", weighted=TRUE, add.colnames='name')
    V(ig_ref)$name = rownames(ref_data)
    print(ig_ref)

    # Remove edges that also appear in the reference-only network.
    ig_pruned = graph.naivePruning(ig, ig_ref)
    print(ig_pruned)
    save(ig, ig_ref, ig_pruned, file=sprintf("%s/bg_%s_fold%d.RData",graph.dir, model, fold))
    # Fixed: the original only removed ig and ig_pruned, leaving a stale
    # ig_ref in the workspace between folds.
    rm(ig, ig_ref, ig_pruned)
  }
}

``` {r learn-networks_edta}
# EDTA counterpart of the heparin setup: restrict to EDTA samples,
# zero-fill missing argininosuccinate (binary-coded), and compute
# reference missingness.
rm(list=setdiff(ls(),c(grep("dir|model|cohort",ls(),value = T),lsf.str())))
require(CTD)
require(huge)
require(R.utils)

data_mx=loadToEnv("clinical_data.RData")[["data_mx.ogBnry"]]
cohorts=loadToEnv("clinical_data.RData")[["cohorts"]]
data_mx=data_mx[,grep("edta",colnames(data_mx),ignore.case = T)]
data_mx=data_mx[rowSums(is.na(data_mx)) != ncol(data_mx),]
data_mx["argininosuccinate", which(is.na(data_mx["argininosuccinate",]))] = 0
refs = data_mx[, which(colnames(data_mx) %in% cohorts$edta_refs)]
ref_fill = apply(refs, 1, function(i) sum(is.na(i))/length(i))
cohorts = lapply(cohorts, function(i) i[which(i %in% colnames(data_mx))])

Here we show an example of training one disease cohort that only contains EDTA samples; include "aadc", "abat", "adsl", "arg", "zsd", "rcdp", "otc" if you want to train all EDTA models.

# Learn EDTA disease networks per leave-one-out fold (same procedure as the
# heparin loop above). ASLD is shown as the example model.
for (model in c("asld")) {#,"aadc", "abat", "adsl", "arg","zsd", "rcdp","otc"
  for (fold in 1:length(cohorts[[model]])) {
    print(sprintf("Learning graphs for diag %s, fold %d...", model, fold))
    diag_pts = cohorts[[model]][-fold]  # hold out one case per fold
    print(diag_pts)
    fill.rate = apply(data_mx[,which(colnames(data_mx) %in% diag_pts)], 1, function(i) sum(is.na(i))/length(i))
    # Keep metabolites observed in >=50% of references AND >50% of cases.
    diag_data = data_mx[intersect(which(ref_fill<0.5), which(fill.rate<0.5)), which(colnames(data_mx) %in% diag_pts)]
    print(nrow(diag_data))
    print(diag_data["argininosuccinate",])
    print("Individual samples as training data. Latent variable embedding and network pruning.")
    refs2 = refs[which(rownames(refs) %in% rownames(diag_data)), ]
    diag_data = data.surrogateProfiles(data = diag_data, std = 1, ref_data = refs2)
    print(dim(diag_data))

    # Disease Network: GLASSO approach
    inv_covmatt = huge(t(diag_data), method="glasso")
    inv_covmatt_select = huge.select(inv_covmatt, criterion = "stars")
    inv_covmat = as.matrix(inv_covmatt_select$icov[[which(inv_covmatt_select$lambda==inv_covmatt_select$opt.lambda)]])
    diag(inv_covmat) = 0;
    colnames(inv_covmat) = rownames(diag_data)
    ig = graph.adjacency(as.matrix(inv_covmat), mode="undirected", weighted=TRUE, add.colnames='name')
    V(ig)$name = rownames(diag_data)
    print(ig)

    # Reference Network: GLASSO at the disease network's optimal lambda
    ref_data = data.surrogateProfiles(data = refs2, std = 1, ref_data = refs2)
    ref_data = ref_data[,-which(duplicated(colnames(ref_data)))]
    print(dim(ref_data))
    inv_covmatt = huge(t(ref_data), method="glasso", lambda = inv_covmatt_select$opt.lambda)
    inv_covmat = as.matrix(inv_covmatt$icov[[1]])
    diag(inv_covmat) = 0;
    colnames(inv_covmat) = rownames(ref_data)
    ig_ref = graph.adjacency(as.matrix(inv_covmat), mode="undirected", weighted=TRUE, add.colnames='name')
    V(ig_ref)$name = rownames(ref_data)
    print(ig_ref)

    # Remove edges shared with the reference-only network.
    ig_pruned = graph.naivePruning(ig, ig_ref)
    print(ig_pruned)
    print(degree(ig_pruned, "argininosuccinate"))
    save(ig, ig_ref, ig_pruned, file=sprintf("%s/bg_%s_fold%d.RData",graph.dir , model, fold))
    rm(ig, ig_ref, ig_pruned)

  }
}

## Network descriptive statistics

# Print node and edge counts for each saved network (disease+control,
# control-only, pruned disease-specific) across all models and folds.
for (model in c("arg", "asld", "cit", "otc", "rcdp", "zsd", "abat", "aadc", "adsl", "mma", "msud", "pa", "pku", "cob", "ga", "gamt")) { #
  print(model)
  for (fold in 1:length(cohorts[[model]])) {
    print(fold)
    load(sprintf("Clinical_paper/graphs/bg_%s_fold%d.RData", model, fold))
    print(sprintf("Disease-Control Network: %d-%d", length(V(ig)$name), length(E(ig)$weight)))
    print(sprintf("Control-only Network: %d-%d", length(V(ig_ref)$name), length(E(ig_ref)$weight)))
    print(sprintf("Disease-Specific Network: %d-%d", length(V(ig_pruned)$name), length(E(ig_pruned)$weight)))
  }
}

## Get pre-computed ranks for each Gaussian Graphical Model (skip if using github-saved ranks)
This part should preferably be run on a cluster (PBS system).
Scripts "wrapper_ranks.r" "getRanksN.r" "getRanks.pbs" are commented out.
```r
# 
# cohorts=loadToEnv("clinical_data.RData")[["cohorts"]]
# models=c(c("asld", "aadc", "abat", "adsl", "arg","zsd", "rcdp","otc"))
# cohorts[models]=sapply(models, function(x) grep("edta",cohorts[[x]],ignore.case = T,value = T))
# cohorts[["alaimo"]]=NULL
# 
# for (model in models){
#   fold.begin=1
#   fold.end=length(cohorts[[model]])
#   system(sprintf("Rscript wrapper_ranks.r %s %s %s %s %s",
#                model, fold.begin, fold.end, w.dir, rank.dir, graph.dir))
# }

######################################## wrapper_ranks.r ########################################
# require(R.utils)
# require(igraph)
# args = commandArgs(trailingOnly=TRUE)
# model=as.character(args[1])
# fold.begin=as.numeric(args[2])
# fold.end=as.numeric(args[3])
# w.dir=as.character(args[4])
# rank.dir=as.character(args[5])
# graph.dir=as.character(args[6])
# diag=model
# setwd(sprintf("%s/%s",w.dir,rank.dir))
# 
# for (fold in fold.begin:fold.end){
#     dir.create(sprintf("./%s", model), showWarnings = FALSE)
#     output_dir = sprintf("./%s/diag%s%d", model,diag,fold)
#     str = sprintf("mkdir %s", output_dir)
#     system(str)
# 
# ig = loadToEnv(sprintf("../%s/bg_%s_fold%d.RData",graph.dir,model,fold))[["ig_pruned"]]
# 
# totalN = length(V(ig)$name)
# print(sprintf("Total N = %d", totalN))
# 
# for (n in 1:totalN) {
#   str = sprintf("qsub -v diag=%s,fold=%d,n=%d,wkdir=%s,rankdir=%s,graphdir=%s getRanks.pbs",
#                 diag, fold, n, w.dir, rank.dir, graph.dir)
#   system(str, wait=FALSE)
#   system("sleep 0.1")
# }
# 
#   
# ready=FALSE
# last_sum = 0
# while (!ready) {
#   f = list.files(sprintf("./%s/diag%s%d", model, diag, fold), pattern=".RData")
#   print(sprintf("#permutation files ready = %d", length(f)))
#   if (length(f)==totalN) {
#     ready=TRUE
#   } else {
#     system("sleep 30")
#     system("rm *.pbs.*")
#     curr_sum = length(f)
#     if (curr_sum > last_sum) {
#       last_sum = curr_sum
#     } 
#   }
# }
# system("rm *.pbs.*")
#   
# # collate permutations into one R object
# all_nodes = V(ig)$name
# permutationByStartNode = list()
# for (n in 1:length(all_nodes)) {
#   load(sprintf("./%s/diag%s%d/%d.RData",model, diag, fold, n))
#   permutationByStartNode[[n]] = toupper(current_node_set)
# }
# names(permutationByStartNode) = all_nodes
# save.image(file=sprintf("./%s/%s%d-ranks.RData",model, toupper(diag), fold))
# print("collate complete...")
# str = sprintf("rm -r ./%s/diag%s%d",model, diag, fold)
# system(str)
# 
# }

################################ end of wrapper_ranks.r #########################################


################################ getRanksN.r ################################################
# require(igraph)
# require(CTD)
# require(R.utils)
# singleNode.getNodeRanksN = function(n, G, S=NULL, num.misses=NULL, verbose=FALSE) {
#   if (!is.null(num.misses)) {
#     if (is.null(S)) {
#       print("You must supply a subset of nodes as parameter S if you supply num.misses.")
#       return(0)
#     }
#   }
#   all_nodes = names(G)
#   if (verbose) {
#     print(sprintf("Calculating node rankings %d of %d.", n, length(all_nodes)))
#   }
#   current_node_set = NULL
#   stopIterating=FALSE
#   startNode = all_nodes[n]
#   currentGraph = G
#   numMisses = 0
#   current_node_set = c(current_node_set, startNode)
#   while (stopIterating==FALSE) {
#     # Clear probabilities
#     currentGraph[1:length(currentGraph)] = 0 #set probabilities of all nodes to 0
#     #determine base p0 probability
#     baseP = p0/(length(currentGraph)-length(current_node_set))
#     #set probabilities of unseen nodes to baseP
#     currentGraph[!(names(currentGraph) %in% current_node_set)] = baseP
#     # Sanity check. p0_event should add up to exactly p0 (global variable)
#     p0_event = sum(unlist(currentGraph[!(names(currentGraph) %in% current_node_set)]))
#     currentGraph = graph.diffuseP1(p1, startNode, currentGraph, current_node_set, 1, verbose=FALSE)
#     # Sanity check. p1_event should add up to exactly p1 (global variable)
#     p1_event = sum(unlist(currentGraph[!(names(currentGraph) %in% current_node_set)]))
#     if (abs(p1_event-1)>thresholdDiff) {
#       extra.prob.to.diffuse = 1-p1_event
#       currentGraph[names(current_node_set)] = 0
#       currentGraph[!(names(currentGraph) %in% names(current_node_set))] = unlist(currentGraph[!(names(currentGraph) %in% names(current_node_set))]) + extra.prob.to.diffuse/sum(!(names(currentGraph) %in% names(current_node_set)))
#     }
#     #Set startNode to a node that is the max probability in the new currentGraph
#     maxProb = names(which.max(currentGraph))
#     # Break ties: When there are ties, choose the first of the winners.
#     startNode = names(currentGraph[maxProb[1]])
#     if (!is.null(num.misses)) {
#       if (startNode %in% S) {
#         numMisses = 0
#       } else {
#         numMisses = numMisses + 1
#       }
#       current_node_set = c(current_node_set, startNode)
#       if (numMisses>num.misses || length(c(startNode,current_node_set))>=(length(G))) {
#         stopIterating = TRUE
#       }
#     } else {
#       # Keep drawing until you've drawn all nodes.
#       current_node_set = c(current_node_set, startNode)
#       if (length(current_node_set)>=(length(G))) {
#         stopIterating = TRUE
#       }
#     }
#     
#   }
#   return(current_node_set)
# }
# 
# args = commandArgs(trailingOnly=TRUE)
# diag = as.character(args[1])
# fold = as.numeric(args[2])
# n = as.numeric(args[3])
# wkdir = as.character(args[4])
# rankdir = as.character(args[5])
# graphdir = as.character(args[6])
# 
# model=diag
# print(diag)
# print(fold)
# print(n)
# 
# setwd(sprintf("%s/%s",wkdir,rankdir))
# ig = loadToEnv(sprintf("../%s/bg_%s_fold%d.RData", graphdir, model,fold))[["ig_pruned"]]
# 
# print(ig)
# V(ig)$name = tolower(V(ig)$name)
# G = vector(mode="list", length=length(V(ig)$name))
# names(G) = V(ig)$name
# adjacency_matrix = list(as.matrix(get.adjacency(ig, attr="weight")))
# p0=0.1
# p1=0.9
# thresholdDiff=0.01
# all_nodes = tolower(V(ig)$name)
# print(head(all_nodes))
# current_node_set = singleNode.getNodeRanksN(n, G)
# new_dir =  sprintf("./%s/diag%s%d",model, diag, fold)
# if (!dir.exists(new_dir)) {
#   str = sprintf("mkdir %s", new_dir)
#   system(str)
# }
# save(current_node_set, file=sprintf("./%s/diag%s%d/%d.RData",model, diag, fold, n))
# 
################################ end of getRanksN.r ##########################################

################################### getRanks.pbs ##############################################
# # Request 1 processors on 1 node
# #PBS -l nodes=1:ppn=1
# 
# #Request x number of hours of walltime
# #PBS -l walltime=1:00:00
# 
# #Request that regular output and terminal output go to the same file
# #PBS -j oe
# #PBS -m abe
# 
# module load R/3.5
# 
# diag=${diag}
# fold=${fold}
# n=${n}
# wkdir=${wkdir}
# rankdir=${rankdir}
# cd ${wkdir}/${rankdir}
# 
# Rscript ../getRanksN.r $diag $fold $n $wkdir ${rankdir} ${graphdir} > ranks:$diag-$fold-$n.out
# rm ranks:$diag-$fold-$n.out
###################################### end of getRanks.pbs ########################################

## Get the main disease module for each disease-specific network model (skip if using github-saved ranks)

rm(list=setdiff(ls(),c(grep("dir|models",ls(),value = T),lsf.str())))

# Add CTDsim distance from "downtown module" as second quantitative variable.
# Computes encoding lengths for the disease module (p1.e), a patient's most
# perturbed metabolite set (p2.e), and their union (p12.e), then derives a
# normalized compression distance (NCD) between patient and disease module.
# NOTE(review): reads globals ptID / ptID2 from the calling environment when
# mle.getEncodingLength is evaluated -- TODO confirm they are defined there.
disFromDowntown = function(dis_mod, ptBSbyK.dis, p2.sig.nodes, p2.optBS, ranks, G) {
  p1.e = mle.getEncodingLength(ptBSbyK.dis, NULL, ptID, G)[,"IS.alt"]
  p2.e = mle.getEncodingLength(p2.optBS, NULL, ptID2, G)[,"IS.alt"]

  # Using predefined node ranks, get optimal bitstring for encoding of patient1's union patient2's subsets.
  p12.sig.nodes = unique(c(dis_mod, p2.sig.nodes))
  p12.e = c()
  for (k in 1:length(ptBSbyK.dis)) {
    dis_mod_cpy = dis_mod
    p2.sig.nodes_cpy = p2.sig.nodes

    # Pad each size-k subset up to k nodes using the head of the full lists.
    dis_mod_k = names(which(ptBSbyK.dis[[k]]==1))
    p2.sig.nodes_k = names(which(p2.optBS[[k]]==1))
    while (length(dis_mod_k)<k) {
      dis_mod_k = unique(c(dis_mod_k, dis_mod_cpy[1]))
      dis_mod_cpy = dis_mod_cpy[-1]
    }
    while (length(p2.sig.nodes_k)<k) {
      p2.sig.nodes_k = unique(c(p2.sig.nodes_k, p2.sig.nodes_cpy[1]))
      p2.sig.nodes_cpy = p2.sig.nodes_cpy[-1]
    }
    p12.sig.nodes_k = sapply(unique(c(dis_mod_k, p2.sig.nodes_k)), trimws)
    p12.optBS = mle.getPtBSbyK(p12.sig.nodes_k, ranks)
    res = mle.getEncodingLength(p12.optBS, NULL, NULL, G)
    # Best-scoring subset's encoding length plus the cost of the leftover nodes.
    p12.e[k] = res[which.max(res[,"d.score"]), "IS.alt"] + log2(choose(length(G), 1))*(length(p12.sig.nodes_k)-which.max(res[,"d.score"]))
  }
  return (list(p1.e=p1.e, p2.e=p2.e, p12.e=p12.e,
               NCD=(p12.e-apply(cbind(p1.e, p2.e), 1, min))/apply(cbind(p1.e, p2.e), 1, max)))
}

# Load the binary-coded z-score matrix; restrict models trained on EDTA
# samples to their EDTA cohort members and drop the ALAIMO cohort.
data_mx=apply(loadToEnv("clinical_data.RData")[["data_mx.ogBnry"]],c(1,2),as.numeric)
cohorts=loadToEnv("clinical_data.RData")[["cohorts"]]
models=c(c("asld", "aadc", "abat", "adsl", "arg","zsd", "rcdp","otc"))
cohorts[models]=sapply(models, function(x) grep("edta",cohorts[[x]],ignore.case = T,value = T))
cohorts[["alaimo"]]=NULL
data_mx=data_mx[,colnames(data_mx) %in% unlist(cohorts)]
data_mx=data_mx[rowSums(is.na(data_mx)) != ncol(data_mx),]

table(data_mx["argininosuccinate",])
# CTD diffusion hyperparameters (globals read by CTD functions).
p0=0.1
p1=0.9
thresholdDiff=0.01

# Get downtown disease modules for all modelled disease states
disMod = list()
data_mx0=data_mx
# Use EDTA samples for models arg and asld; heparin samples for otc.
cohorts[["arg"]]=cohorts[["arg"]][grep("EDTA",cohorts[["arg"]])]
cohorts[["asld"]]=cohorts[["asld"]][grep("EDTA",cohorts[["asld"]])]
cohorts[["otc"]]=cohorts[["otc"]][grep("HEP",cohorts[["otc"]])]

# showing one example
# Derive the "downtown" disease module for each modelled disease: per
# training fold, seed CTD with the cohort's most perturbed metabolites
# (|mean z| > 2, relaxed until >= 15 seeds) and keep the best-connected
# subset; then choose between the full union module and a majority-vote
# module using a purity estimate. Showing one example model.
for (model in c("cit")) { # "aadc", "abat", "adsl", "arg", "asld", ,"otc", "cob", "ga", "gamt", "msud", "mma", "pa", "zsd", "rcdp", "pku"
  print(sprintf("%s...", toupper(model)))
  data_mx=data_mx0
  # Mean z-score profile across the disease cohort.
  mn = apply(data_mx[,which(colnames(data_mx) %in% cohorts[[model]])], 1, function(i) mean(na.omit(i)))
  mn = mn[-which(is.na(mn))]
  downtown_disease_module = c()
  for (pt in 1:length(cohorts[[model]])) {
    # ig = loadToEnv(sprintf("%s/bg_%s_fold%d.RData",graph.dir, model, pt))[["ig_pruned"]]
    ig = loadToEnv(system.file(sprintf("networks/ind_foldNets/bg_%s_fold%d.RData", model, pt),package = "CTDext"))[["ig_pruned"]]
    # ranks = loadToEnv(sprintf("%s/bg_%s_fold%d.RData",rank.dir, model, pt))[["ig_pruned"]]
    ranks = loadToEnv(system.file(sprintf("ranks/ind_ranks/%s%d-ranks.RData", model, pt),package = "CTDext"))[["permutationByStartNode"]]
    ranks = lapply(ranks, tolower)
    adjacency_matrix = list(as.matrix(get.adjacency(ig, attr="weight")))
    G = vector(mode="list", length=length(V(ig)$name))
    names(G) = V(ig)$name
    # Seed set: network metabolites with |mean z| > 2; relax the threshold
    # in 0.1 steps until at least 15 seeds are found.
    S = mn[which(abs(mn)>2)]
    S = S[which(names(S) %in% names(G))]
    it=0
    while (length(S)<15) {
      S = mn[which(abs(mn)>(2-0.1*it))]
      S = S[which(names(S) %in% names(G))]
      it = it + 1
    }
    ptBSbyK = mle.getPtBSbyK(names(S), ranks)
    res = mle.getEncodingLength(ptBSbyK, NULL, NULL, G)
    # Accumulate the best-scoring subset's members across folds.
    downtown_disease_module = c(downtown_disease_module, names(which(ptBSbyK[[which.max(res[,"d.score"])]]==1)))
  }
  print(model)
  length(unique(downtown_disease_module))

  # Estimate disease module "purity" based on 2 size thresholds and known case mean distances
  fold=1
  # ig = loadToEnv(sprintf("graphs_GMpaper/bg_%s_fold%d.RData",graph.dir, model, fold))[["ig_pruned"]]
  ig = loadToEnv(system.file(sprintf("networks/ind_foldNets/bg_%s_fold%d.RData", model, fold),package = "CTDext"))[["ig_pruned"]]
  # ranks = loadToEnv(sprintf("%s/%s%d-ranks.RData",rank.dir ,model, fold))[["permutationByStartNode"]]
  ranks = loadToEnv(system.file(sprintf("ranks/ind_ranks/%s%d-ranks.RData", model, fold),package = "CTDext"))[["permutationByStartNode"]]
  ranks = lapply(ranks, tolower)
  adjacency_matrix = list(as.matrix(get.adjacency(ig, attr="weight")))
  G = vector(mode="list", length=length(V(ig)$name))
  names(G) = V(ig)$name
  data_mx = data_mx[which(rownames(data_mx) %in% V(ig)$name), ]
  data_mx = data_mx[,which(colnames(data_mx) %in% cohorts[[model]])]
  purity_mets = list()
  purity = c()
  # Size1: Full downtown disease module
  tmp.mm = unique(downtown_disease_module)
  tmp.mm = tmp.mm[which(tmp.mm %in% names(G))]
  ptBSbyK.dis = mle.getPtBSbyK(tmp.mm, ranks)
  # NOTE(review): ptID is not defined at this point in the chunk -- TODO
  # confirm it is inherited from a previous evaluation of the environment.
  res = mle.getEncodingLength(ptBSbyK.dis, NULL, ptID, G)
  downtown_disease_mod = names(which(ptBSbyK.dis[[which.max(res[,"d.score"])]]==1))
  print(length(unique(downtown_disease_mod)))
  dd = c()
  # Mean NCD between each case and the module = purity estimate 1.
  for (pt in 1:length(cohorts[[model]])) {
    ptBSbyK.dis = mle.getPtBSbyK(unique(downtown_disease_mod), ranks)
    p2.sig.nodes = names(data_mx[order(abs(data_mx[,pt]), decreasing = TRUE), pt][1:length(unique(downtown_disease_mod))])
    p2.optBS = mle.getPtBSbyK(p2.sig.nodes, ranks)
    ctdsim = disFromDowntown(unique(downtown_disease_mod), ptBSbyK.dis, p2.sig.nodes, p2.optBS, ranks, G)
    dd[pt] = min(ctdsim$NCD)
  }
  purity_mets[[1]] = unique(downtown_disease_module)
  purity[1] = mean(dd)
  # Size2: Smaller disease module (metabolites selected in > half of folds)
  downtown_disease_module = names(which(table(downtown_disease_module)>(length(cohorts[[model]])/2)))
  tmp.mm = unique(downtown_disease_module)
  tmp.mm = tmp.mm[which(tmp.mm %in% names(G))]
  ptBSbyK.dis = mle.getPtBSbyK(unique(downtown_disease_module), ranks)
  res = mle.getEncodingLength(ptBSbyK.dis, NULL, ptID, G)
  downtown_disease_mod = names(which(ptBSbyK.dis[[which.max(res[,"d.score"])]]==1))
  print(length(unique(downtown_disease_mod)))
  dd = c()
  for (pt in 1:length(cohorts[[model]])) {
    ptBSbyK.dis = mle.getPtBSbyK(unique(downtown_disease_mod), ranks)
    p2.sig.nodes = names(data_mx[order(abs(data_mx[,pt]), decreasing = TRUE), pt][1:length(unique(downtown_disease_mod))])
    p2.optBS = mle.getPtBSbyK(p2.sig.nodes, ranks)
    ctdsim = disFromDowntown(unique(downtown_disease_mod), ptBSbyK.dis, p2.sig.nodes, p2.optBS, ranks, G)
    dd[pt] = min(ctdsim$NCD)
  }
  purity_mets[[2]] = downtown_disease_module
  purity[2] = mean(dd)

  # Selected disease module based on "purity" estimates
  downtown_disease_module = purity_mets[[which.min(purity)]]

  disMod[[model]] = downtown_disease_module
}
lapply(disMod, length)
save(disMod, file=sprintf("%s/disMod.RData",model.dir))

## Run CTD on Metabolomics Profiles to Interpret Perturbation Patterns in Disease Context

``` {r runCTD}

Note: Do not include Alaimo patients in this result — each patient must have a known diagnosis from one of the 16 disease network models, or be a healthy control (heparin or EDTA).

# Setup for running CTD: load the binary-coded matrix, drop the duplicated
# ALAIMO columns, and load the saved disease modules.
require(R.utils)
require(CTD)
rm(list=setdiff(ls(),c(grep("dir|models",ls(),value = T),lsf.str())))
data_mx=apply(loadToEnv("clinical_data.RData")[["data_mx.ogBnry"]],c(1,2),as.numeric)
cohorts=loadToEnv("clinical_data.RData")[["cohorts"]]

dupNames=loadToEnv("clinical_data.RData")[["dupNames"]]
data_mx=data_mx[,-which(colnames(data_mx) %in% dupNames[,1])]
data_mx=data_mx[rowSums(is.na(data_mx)) != ncol(data_mx),]
data_mx.0=data_mx

# CTD diffusion hyperparameters and maximum subset size.
p0=0.1
p1=0.9
thresholdDiff=0.01
kmx=30

disMod=loadToEnv(system.file("shiny-app/disMod_Oct2020.RData",package = "CTDext"))[["disMod"]]

Add the CTDsim distance from the "downtown module" as a second quantitative variable.

# Add CTDsim distance from the "downtown module" as a second quantitative
# variable. Extends the earlier disFromDowntown with a direction-aware
# Jaccard dissimilarity (JAC) between the module's mean-profile signs
# (mn_prof) and the patient's profile signs (p2_prof), alongside the NCD.
# NOTE(review): reads globals ptID / ptID2 from the calling environment --
# TODO confirm they are defined when this is called.
disFromDowntown = function(dis_mod, ptBSbyK.dis, p2.sig.nodes, p2.optBS, ranks, G, mn_prof, p2_prof) {
  p1.e = mle.getEncodingLength(ptBSbyK.dis, NULL, ptID, G)[,"IS.alt"]
  p2.e = mle.getEncodingLength(p2.optBS, NULL, ptID2, G)[,"IS.alt"]

  # Direction-aware Jaccard: tag each node with 1 (up) or 0 (down/zero)
  # before comparing the two size-k sets.
  dirSim = vector("numeric", length = length(ptBSbyK.dis))
  for (k in 1:length(ptBSbyK.dis)) {
    p1.sn = dis_mod[1:k]
    p2.sn = p2.sig.nodes[1:k]
    p1.dirs = mn_prof[p1.sn]
    p1.dirs[which(!(p1.dirs > 0))] = 0
    p1.dirs[which(p1.dirs > 0)] = 1
    p2.dirs = p2_prof[p2.sn]
    p2.dirs[which(!(p2.dirs > 0))] = 0
    p2.dirs[which(p2.dirs > 0)] = 1
    p1.snn = sprintf("%s%d", p1.sn, p1.dirs)
    p2.snn = sprintf("%s%d", p2.sn, p2.dirs)
    dirSim[k] = 1 - (length(intersect(p1.snn, p2.snn))/length(union(p1.snn, p2.snn)))
  }
  # Using predefined node ranks, get optimal bitstring for encoding of patient1's union patient2's subsets.
  p12.sig.nodes = unique(c(dis_mod, p2.sig.nodes))
  p12.e = c()
  for (k in 1:length(ptBSbyK.dis)) {
    dis_mod_cpy = dis_mod
    p2.sig.nodes_cpy = p2.sig.nodes

    # Pad each size-k subset up to k nodes using the head of the full lists.
    dis_mod_k = names(which(ptBSbyK.dis[[k]]==1))
    p2.sig.nodes_k = names(which(p2.optBS[[k]]==1))
    while (length(dis_mod_k)<k) {
      dis_mod_k = unique(c(dis_mod_k, dis_mod_cpy[1]))
      dis_mod_cpy = dis_mod_cpy[-1]
    }
    while (length(p2.sig.nodes_k)<k) {
      p2.sig.nodes_k = unique(c(p2.sig.nodes_k, p2.sig.nodes_cpy[1]))
      p2.sig.nodes_cpy = p2.sig.nodes_cpy[-1]
    }
    p12.sig.nodes_k = sapply(unique(c(dis_mod_k, p2.sig.nodes_k)), trimws)
    p12.optBS = mle.getPtBSbyK(p12.sig.nodes_k, ranks)
    res = mle.getEncodingLength(p12.optBS, NULL, NULL, G)
    # Best-scoring subset's encoding length plus the cost of the leftover nodes.
    p12.e[k] = res[which.max(res[,"d.score"]), "IS.alt"] + log2(choose(length(G), 1))*(length(p12.sig.nodes_k)-which.max(res[,"d.score"]))
  }
  return (list(p1.e=p1.e, p2.e=p2.e, p12.e=p12.e, JAC = dirSim,
               NCD=(p12.e-apply(cbind(p1.e, p2.e), 1, min))/apply(cbind(p1.e, p2.e), 1, max)))
}

First, run CTDdisMod for all models (kmx doesn't matter).

# Run CTDdisMod: score every patient against each model's selected disease
# module (kmx is irrelevant here). Fixed a duplicated "pa" entry in the
# original model vector, which re-ran that model and rewrote its files.
for (model in c("aadc","cob","pa","abat", "adsl", "arg", "asld", "cit", "otc", "ga", "gamt", "msud", "mma", "zsd", "rcdp", "pku")) { #
  data_mx=data_mx.0
  table(data_mx["argininosuccinate",])
  # Selected disease module
  mn = apply(data_mx[,which(colnames(data_mx) %in% cohorts[[model]])], 1, function(i) mean(na.omit(i)))
  mn = mn[-which(is.na(mn))]
  downtown_disease_module = disMod[[model]]
  # For each network fold, run CTDdm against several diagnoses
  for (fold in 1:length(cohorts[[model]])) {
    print(sprintf("Fold %d/%d...", fold, length(cohorts[[model]])))
    # ig = loadToEnv(sprintf("%s/bg_%s_fold%d.RData", graph.dir, model, fold))[["ig_pruned"]]
    ig = loadToEnv(system.file(sprintf("networks/ind_foldNets/bg_%s_fold%d.RData", model, fold),package = "CTDext"))[["ig_pruned"]]
    # ranks = loadToEnv(sprintf("%s/%s%d-ranks.RData",rank.dir, model, fold))[["permutationByStartNode"]]
    ranks = loadToEnv(system.file(sprintf("ranks/ind_ranks/%s%d-ranks.RData", toupper(model), fold),package = "CTDext"))[["permutationByStartNode"]]
    ranks = lapply(ranks, tolower)
    adjacency_matrix = list(as.matrix(get.adjacency(ig, attr="weight")))
    G = vector(mode="list", length=length(V(ig)$name))
    names(G) = V(ig)$name
    data_mx = data_mx[which(rownames(data_mx) %in% V(ig)$name), ]
    data_mx = data_mx[,which(colnames(data_mx) %in% unlist(cohorts))]

    # Anchor the disease module to this fold's network.
    tmp.mm = unique(downtown_disease_module)
    tmp.mm = tmp.mm[which(tmp.mm %in% names(G))]
    ptBSbyK.dis = mle.getPtBSbyK(tmp.mm, ranks)
    res = mle.getEncodingLength(ptBSbyK.dis, NULL, ptID, G)
    downtown_disease_mod = names(which(ptBSbyK.dis[[which.max(res[,"d.score"])]]==1))
    ptBSbyK.dis = mle.getPtBSbyK(downtown_disease_mod, ranks)

    df_disMod = data.frame(ptID=character(), diag=character(), ctdDisMod=numeric(), jac=numeric(), stringsAsFactors = FALSE)
    for (p in 1:ncol(data_mx)) {
      #print(sprintf("Patient %d/%d...", p, ncol(data_mx)))
      ptID = colnames(data_mx)[p]
      if (ptID %in% unlist(cohorts)) {diag = names(cohorts)[which(unlist(lapply(cohorts, function(i) ptID %in% i)))]}else{diag = "reference"}
      # CTD: get module best explained by network's connectivity
      S = data_mx[order(abs(data_mx[,p]), decreasing = TRUE),p]
      p2.sig.nodes = names(S)[1:length(downtown_disease_mod)]
      p2.optBS = mle.getPtBSbyK(p2.sig.nodes, ranks)
      ctdDisMod = disFromDowntown(downtown_disease_mod, ptBSbyK.dis, p2.sig.nodes, p2.optBS, ranks, G, mn, S)
      #if (ptID=="X606789") {print(min(ctdDisMod$NCD))}
      df_disMod[p, "ptID"] = colnames(data_mx)[p]
      df_disMod[p, "diag"] = diag[1]
      df_disMod[p, "ctdDisMod"] = min(ctdDisMod$NCD)
      df_disMod[p, "jac"] = min(ctdDisMod$JAC)
    }
    save(df_disMod, file=sprintf("%s/ctdDisMod_%s_fold%d.RData",model.dir, model, fold))

  }
}

Next, run CTD on all models with K = 30.

# Run CTD proper: for each model and fold, score every patient's top-kmx
# most perturbed metabolites against the fold's disease-specific network.
# Fixed a stray ", ," (empty vector element, a syntax error) in the original
# model vector, and a missing comma in the commented-out ranks path.
for (kmx in c(30)) {
  for (model in c( "aadc","cob","pa", "abat", "adsl", "arg", "asld", "cit", "otc", "ga", "gamt", "msud", "mma", "zsd", "rcdp", "pku")) { #
    data_mx=data_mx.0
    # table(data_mx["argininosuccinate",])
    # For each network fold, runCTD against several diagnoses
    for (fold in 1:length(cohorts[[model]])) {
      # ig = loadToEnv(sprintf("%s/bg_%s_fold%d.RData", graph.dir, model, fold))[["ig_pruned"]]
      ig = loadToEnv(system.file(sprintf("networks/ind_foldNets/bg_%s_fold%d.RData", model, fold),package = "CTDext"))[["ig_pruned"]]
      # ranks = loadToEnv(sprintf("%s/%s%d-ranks.RData", rank.dir, model, fold))[["permutationByStartNode"]]
      ranks = loadToEnv(system.file(sprintf("ranks/ind_ranks/%s%d-ranks.RData", toupper(model), fold),package = "CTDext"))[["permutationByStartNode"]]

      ranks = lapply(ranks, tolower)
      adjacency_matrix = list(as.matrix(get.adjacency(ig, attr="weight")))
      G = vector(mode="list", length=length(V(ig)$name))
      names(G) = V(ig)$name
      data_mx = data_mx[which(rownames(data_mx) %in% V(ig)$name), ]
      data_mx = data_mx[,which(colnames(data_mx) %in% unlist(cohorts))]

      df_ctd = data.frame(ptID=character(), S=character(), lenS=numeric(), optT=numeric(), fishers=numeric(), I0=numeric(), IA=numeric(), d=numeric(), stringsAsFactors = FALSE)
      r=1
      ptBSbyK = list()
      for (p in 1:ncol(data_mx)) {
        ptID = colnames(data_mx)[p]
        if (ptID %in% unlist(cohorts)) {diag = names(cohorts)[which(unlist(lapply(cohorts, function(i) ptID %in% i)))]}else{diag = "reference"}
        # Top kmx most perturbed metabolites for this patient.
        S = data_mx[order(abs(data_mx[,p]), decreasing = TRUE),p][1:kmx]
        ptBSbyK[[ptID]] = mle.getPtBSbyK(names(S), ranks)
        res = mle.getEncodingLength(ptBSbyK[[ptID]], NULL, ptID, G)
        #if (ptID=="X606789") {
        #  print(sprintf("Patient %d/%d...", p, ncol(data_mx)))
        #  print(max(res[,"d.score"]))
        #}
        # Record one row per subset size k.
        for (k in 1:kmx) {
          df_ctd[r, "ptID"] = colnames(data_mx)[p]
          df_ctd[r, "diag"] = diag[1]
          df_ctd[r, "S"] = paste(names(S)[1:k], collapse="/")
          df_ctd[r, "lenS"] = k
          df_ctd[r, "optT"] = res[k, "opt.T"]
          df_ctd[r, "I0"] = res[k, "IS.null"]
          df_ctd[r, "IA"] = res[k, "IS.alt"]
          df_ctd[r, "d"] = res[k, "d.score"]
          r = r + 1
        }
      }
      save(ptBSbyK, df_ctd, file=sprintf("%s/ctd_%s_fold%d_kmx%d.RData", model.dir, model, fold, kmx))
    }
  }
}

Row-bind the per-patient disease-module scores (df_disMod) onto the per-k CTD results (df_ctd).

# Join per-patient disease-module scores (df_disMod) onto the per-k CTD
# results (df_ctd), then replace the two intermediate files with a single
# combined model file. Fixed duplicated "cob"/"pa" entries in the model
# vector: a second pass would try to load files already deleted below.
rm(list=setdiff(ls(),c(grep("dir|cohorts",ls(),value = T),lsf.str())))

for (model in c("aadc","cob","pa", "abat", "adsl", "arg", "asld", "cit", "otc", "ga", "gamt", "msud", "mma", "zsd", "rcdp", "pku")) { ##"
  for (fold in 1:length(cohorts[[model]])) {
    load(sprintf("%s/ctdDisMod_%s_fold%d.RData",model.dir, model, fold))
    for (kmx in c(30)) {
      load(sprintf("%s/ctd_%s_fold%d_kmx%d.RData", model.dir, model, fold, kmx))
      df = data.frame(ptID=character(), S=character(), lenS=numeric(), optT=numeric(), fishers=numeric(), I0=numeric(), IA=numeric(), d=numeric(), ctdDisMod=numeric(), jac=numeric(), stringsAsFactors = FALSE)
      r = 1
      # df_ctd has kmx rows per patient; df_disMod has one row per patient.
      for (p in 1:nrow(df_disMod)) {
        for (k in 1:kmx) {
          df[r, "ptID"] = df_ctd[r,"ptID"]
          df[r, "diag"] = df_ctd[r,"diag"]
          df[r, "S"] = df_ctd[r,"S"]
          df[r, "lenS"] = df_ctd[r,"lenS"]
          df[r, "optT"] = df_ctd[r,"optT"]
          df[r, "I0"] = df_ctd[r,"I0"]
          df[r, "IA"] = df_ctd[r,"IA"]
          df[r, "d"] = df_ctd[r,"d"]
          df[r, "ctdDisMod"] = df_disMod[p,"ctdDisMod"]
          df[r, "jac"] = df_disMod[p,"jac"]
          r = r + 1
        }
      }
      save(ptBSbyK, df, file=sprintf("%s/model_%s_fold%d_kmx%d.RData", model.dir, model, fold, kmx))
      system(sprintf("rm %s/ctd_%s_fold%d_kmx%d.RData", model.dir, model, fold, kmx))
    }
    system(sprintf("rm %s/ctdDisMod_%s_fold%d.RData", model.dir, model, fold))
  }
}

Collapse per-fold signal to LOOCV signal: each test patient's score is averaged across folds, while each case's signal is the score from its left-out fold.

# Collapse per-fold scores to LOOCV scores: for each patient, average
# scores across folds; for disease cases, keep only the left-out fold's
# score. (Duplicate require() calls in the original were removed.)
require(CTD)
require(R.utils)
rm(list=setdiff(ls(),c(grep("dir|models",ls(),value = T),lsf.str())))
data_mx=apply(loadToEnv("clinical_data.RData")[["data_mx.ogBnry"]],c(1,2),as.numeric)
cohorts=loadToEnv("clinical_data.RData")[["cohorts"]]

dupNames=loadToEnv("clinical_data.RData")[["dupNames"]]
data_mx=data_mx[,-which(colnames(data_mx) %in% dupNames[,1])]
data_mx=data_mx[rowSums(is.na(data_mx)) != ncol(data_mx),]
data_mx.0=data_mx

# Sanity check: samples whose non-missing values are all zero.
which(apply(data_mx, 2, function(i) sum(na.omit(i)==0)/length(na.omit(i)))==1)
for (kmx in c(30)) { #
  for (model in c("aadc", "abat", "adsl", "arg", "asld", "cit", "otc", "cob", "ga", "gamt", "zsd", "rcdp", "msud", "mma", "pa", "pku")) { #
    df_model = data.frame(fold=numeric(), pt=numeric(), bits=numeric(), ctdDisMod=numeric(), jac=numeric(), diag=character(), stringsAsFactors = FALSE)
    for (fold in 1:length(cohorts[[model]])) {
      df = loadToEnv(sprintf("%s/model_%s_fold%d_kmx%d.RData", model.dir , model, fold, kmx))[["df"]]
      df = df[!is.na(df$ptID),]
      pts = unique(df$ptID)
      df_best = data.frame(pt=numeric(), ptID=character(), bits=numeric(), diag=character(), ctdDisMod=numeric(), jac=numeric(), stringsAsFactors = FALSE)
      for (pt in 1:length(pts)) {
        pt_data = df[which(df$ptID==pts[pt]),]
        ptID = unique(df[which(df$ptID==pts[pt]), "ptID"])
        # For cases of this model, record their index within the cohort.
        if (pt_data[1,"diag"]==model) { df_best[pt, "pt"] = which(cohorts[[model]]==ptID) }
        df_best[pt, "ptID"] = ptID
        # Best d-score across k, penalized by the number of rows considered.
        df_best[pt, "bits"] = max(df[which(df$ptID==pts[pt]), "d"])-log2(nrow(pt_data))
        df_best[pt, "jac"] = pt_data[1, "jac"]
        df_best[pt, "diag"] = unique(pt_data[,"diag"])
        df_best[pt, "ctdDisMod"] = pt_data[1, "ctdDisMod"]
      }
      df_best$bits[which(df_best$bits<0)] = 0
      df_best$fold = rep(fold, nrow(df_best))
      df_model = rbind(df_model, df_best)
    }
    # Do not do multiple hypothesis test correction here.
    #df_model$bits = -log2(p.adjust(2^-(df_model$bits), method="fdr"))
    df_model$loocv = rep(0, nrow(df_model))
    df_model$loocv[which(df_model$pt==df_model$fold)] = 1
    # Drop case rows from folds where the case was part of training.
    df_model = df_model[-intersect(which(df_model$loocv==0), which(df_model$diag==model)), ]
    b_bits = cbind(unique(df_model$ptID), sapply(unique(df_model$ptID), function(i) mean(df_model[which(df_model$ptID==i),"bits"])))
    ctdDisMod_loocv = cbind(unique(df_model$ptID), sapply(unique(df_model$ptID), function(i) mean(df_model[which(df_model$ptID==i),"ctdDisMod"])))
    jac_loocv = cbind(unique(df_model$ptID), sapply(unique(df_model$ptID), function(i) mean(df_model[which(df_model$ptID==i),"jac"])))
    df_model = df_model[-which(duplicated(df_model$ptID)),]
    df_model = df_model[order(df_model$ptID),]
    b_bits = b_bits[order(b_bits[,1]),]
    ctdDisMod_loocv = ctdDisMod_loocv[order(ctdDisMod_loocv[,1]),]
    jac_loocv = jac_loocv[order(jac_loocv[,1]),]
    df_model$bits = as.numeric(b_bits[,2])
    df_model$ctdDisMod = as.numeric(ctdDisMod_loocv[,2])
    df_model$jac = as.numeric(jac_loocv[,2])
    print(any(is.na(df_model$bits)))
    print(any(is.na(df_model$ctdDisMod)))
    # NOTE(review): kmx is reassigned from nrow(pt_data) before saving, so the
    # file name's kmx may differ from 30 -- TODO confirm this is intended.
    kmx = nrow(pt_data)
    save(df_model, file=sprintf("%s/dd-%s-loocv-kmx%d.RData",model.dir, model, kmx))
    #sapply(1:length(cohorts[[model]]), function(i) system(sprintf("rm model/model_%s_fold%d_kmx%d.RData", model, i, kmx)))
  }
}

kmx=30

# Map raw patient IDs to their coded (published) names.
load("../clnms_map-macbook.RData")
clnms_map=as.data.frame(clnms_map)

# Re-key the 6 duplicated ALAIMO samples to their coded EDTA-* names.
# (loop variable renamed from `names`, which shadowed base::names)
for (nm in dupNames[,1]){
  clnms_map[clnms_map$clnms==nm,]=dupNames[dupNames[,1]==nm,2]
}

# Fix: the original wrapped the load/map/save below in an extra
# `for (fold in 1:length(cohorts[[model]]))` loop over the SAME file.
# After the first save the ptIDs were already coded, so every later
# iteration re-ran match() against the original names, got NA, and
# overwrote ptID with NA for any model with more than one fold. The
# mapping must be applied exactly once per model file.
for(model in c("aadc", "cob", "abat", "adsl", "arg", "asld", "cit", "otc", "ga", "gamt", "msud", "mma", "pa","zsd", "rcdp", "pku")){
  load(sprintf("%s/dd-%s-loocv-kmx%d.RData",model.dir, model, kmx))
  df_model$ptID=clnms_map[,2][match(df_model$ptID,clnms_map[,1])]
  save(df_model, file=sprintf("%s/dd-%s-loocv-kmx%d.RData",model.dir, model, kmx))
}

For each kmx, get a list of patient rankings across all 16 disease-specific network models

Be sure to change ptIDs to coded ptIDs

# Assemble per-patient scores from all 16 disease models and build a
# ranked list of candidate diagnoses for every patient.
require(EmpiricalBrownsMethod)
setwd(model.dir)
models = c("aadc", "abat", "adsl", "arg", "asld", "cit", "cob", "ga", "gamt", "mma", "msud", "otc", "pa", "pku", "rcdp", "zsd")

ff = list.files(".", pattern=sprintf("loocv-kmx%d.RData", 30))
dff = loadToEnv(ff[1])[["df_model"]]
pts = dff$ptID
for (kmx in 30) { # previously also tried: 30, 5
  ff = list.files(".", pattern=sprintf("loocv-kmx%d.RData", kmx))
  df_models = list()
  for (model in 1:length(models)) {
    df_model = loadToEnv(ff[model])[["df_model"]]
    # Uncorrected and Bonferroni-corrected CTD p-values, capped at 1.
    ctd_pval_uncorrected = ifelse(2^-df_model[, "bits"]>1, 1, 2^-df_model[, "bits"])
    df_model$bits2 = -log2(p.adjust(2^-(df_model$bits), method="bonferroni"))
    ctd_pval = ifelse(2^-df_model[, "bits2"]>1, 1, 2^-df_model[, "bits2"])
    # Empirical percentile of each CTDdisMod score within this model.
    ctdDisMod_percentile = sapply(df_model[, "ctdDisMod"], function(i) length(which(df_model$ctdDisMod<=i))/nrow(df_model))
    # Brown's method combines the two dependent p-values per patient.
    combined_network = apply(cbind(ctd_pval, ctdDisMod_percentile), 1, function(i) empiricalBrownsMethod(data_matrix = rbind(df_model$bits, df_model$ctdDisMod), p_values = i, TRUE)$P_test)
    names(ctd_pval) = df_model$ptID
    names(ctdDisMod_percentile) = df_model$ptID
    names(combined_network) = df_model$ptID
    df_models[[models[model]]][["ctd_pval_uncorrected"]] = ctd_pval_uncorrected
    df_models[[models[model]]][["ctd_pval"]] = ctd_pval
    df_models[[models[model]]][["ctdDisMod_percentile"]] = ctdDisMod_percentile
    df_models[[models[model]]][["combined_network"]] = combined_network
    print(model)
  }

  pt_ranks = list()
  for (pt in 1:length(pts)) {
    ptID = pts[pt]
    # ptID_coded = clnms_map[which(clnms_map[,1]==ptID), 2]
    ptID_coded = ptID
    for (pc in ptID_coded) {
      df_pt = data.frame(model=character(), ctd=numeric(), ctdDisMod=numeric(), brown.comb=numeric(), stringsAsFactors = FALSE)
      r = 1
      for (model in 1:length(models)) {
        ctd_pval = df_models[[models[model]]][["ctd_pval"]]
        ctdDisMod_percentile = df_models[[models[model]]][["ctdDisMod_percentile"]]
        combined_network = df_models[[models[model]]][["combined_network"]]
        df_pt[r, "model"] = gsub(sprintf("dd-|-loocv-kmx%d.RData", kmx), "", ff[model])
        # NOTE(review): indexing by position `pt` assumes every model's
        # df_model lists patients in the same order -- TODO confirm.
        df_pt[r, "ctd"] = ctd_pval[pt]
        df_pt[r, "ctdDisMod"] = ctdDisMod_percentile[pt]
        df_pt[r, "brown.comb"] = combined_network[pt]   # Use Brown's combined for dependent tests
        r = r + 1
      }
      pt_ranks[[pc]] = df_pt
    }
  }
  save(pt_ranks, file=sprintf("ptRanks_kmx%d.RData", kmx))
}

AUCs

# Per-model AUCs for CTD alone, CTD+CTDdisMod (network-combined), and
# CTD+CTDdisMod+Jaccard (all combined via Brown's method).
# Fixes: dff previously declared columns (ctddismod.auc/jac.auc/
# combined.auc) that were never filled while network.auc/all.auc were
# added ad hoc; row filter used x[-which(...),] which drops every row
# when nothing matches; case/control labeling used the same footgun.
setwd("../")   # undo the setwd(model.dir) from the ranking chunk
require(pROC)
require(pls)
require(EmpiricalBrownsMethod)

dff = data.frame(kmx=numeric(), model=character(), ctd.auc=numeric(), network.auc=numeric(), all.auc=numeric(), stringsAsFactors = FALSE)
r = 1
for (model in c("aadc", "abat", "adsl", "arg", "asld", "cit", "cob", "ga", "gamt", "msud", "mma", "otc", "pa", "pku", "rcdp", "zsd")) {
  for (kmx in c(30)) {
    print(sprintf("MODEL %s...", model))
    load(sprintf("%s/dd-%s-loocv-kmx%d.RData", model.dir, model, kmx))
    df_model = df_model[!(df_model$diag %in% c("alaimo","maps")),]
    # Bonferroni-correct the CTD bits, then strip the "test_" prefix.
    df_model$bits = -log2(p.adjust(2^-(df_model$bits), method="bonferroni"))
    df_model$diag = gsub("test_", "", df_model$diag)
    print(toupper(model))
    print(dim(df_model))
    # Binary labels: 1 = diagnosed with this model's disease, 0 = rest.
    diags = df_model$diag
    is_case = diags == model
    diags[!is_case] = 0
    diags[is_case] = 1

    # AUC, CTD only
    ctd.auc = roc(diags, df_model$bits)
    print(ctd.auc$auc)

    # Shared p-values for the combined scores (computed once, reused).
    ctd_pvals = 2^-(df_model$bits)
    ctdDisMod_pvals = sapply(df_model$ctdDisMod, function(i) length(which(df_model$ctdDisMod<=i))/nrow(df_model))
    jac_pvals = sapply(df_model$jac, function(i) length(which(df_model$jac<=i))/nrow(df_model))

    # AUC, Network-combined: CTD+CTDdisMod via Brown's method
    combined_brown = apply(cbind(ctd_pvals, ctdDisMod_pvals), 1, function(i)  empiricalBrownsMethod(data_matrix = rbind(df_model$bits, df_model$ctdDisMod), p_values = i, TRUE)$P_test)
    network.auc = roc(diags, combined_brown)
    print(network.auc$auc)

    # AUC, all CTD+CTDdisMod+Jaccard, Brown's combined
    combined_brown = apply(cbind(ctd_pvals, ctdDisMod_pvals, jac_pvals), 1, function(i)  empiricalBrownsMethod(data_matrix = rbind(df_model$bits, df_model$ctdDisMod, df_model$jac), p_values = i, TRUE)$P_test)
    all.auc = roc(diags, combined_brown)
    print(all.auc$auc)

    dff[r, "kmx"] = kmx
    dff[r, "model"] = model
    dff[r, "ctd.auc"] = ctd.auc$auc
    dff[r, "network.auc"] = network.auc$auc
    dff[r, "all.auc"] = all.auc$auc
    r = r + 1
  }
}
save(dff,file=sprintf("%s/AUC_kmx%s.RData",model.dir,kmx))

require(ggplot2)

# Bar chart of the network-combined (CTD+CTDdisMod) AUC per disease
# model, annotated with cohort sizes and compact AUC labels.
dff$kmx = as.factor(dff$kmx)

temp=sapply(cohorts,length)
dff$cohort.size=paste(toupper(dff$model)," (n=",temp[dff$model],")",sep = "")

# Order bars by AUC (ascending) and freeze that order as factor levels.
dff=dff[order(dff$network.auc),]
fixed_level=dff$cohort.size
dff$cohort.size=factor(dff$cohort.size,levels = fixed_level)

# Format AUCs as ".XX" (two significant digits of 100*AUC); a perfect
# AUC renders as ".100" above, so map it to "1".
dff$AUCformat=sapply(dff$network.auc,function(x) sprintf(".%s",format(100*x,digits = 2)))
dff$AUCformat[which(dff$AUCformat==".100")]="1"

p=ggplot(dff,aes(x=cohort.size, y=network.auc)) +
  geom_bar(stat="identity",width = 0.8,fill="lightblue") +
  geom_text(aes(label=AUCformat)) +
  theme_bw()+theme(axis.text.x = element_text(angle = 45) ) +xlab("Disease model") +ylab("CTD+CTDdm AUC")
p

# Fix: write into OP.dir (created by the setup chunk); the original
# "revisionOutput/" directory is never created by this script, so the
# ggsave call failed.
ggsave(p,device = cairo_pdf,filename = sprintf("%s/16IEMauc.pdf",OP.dir),width = 5,height = 4)

## Compare Usefulness of Information Mined in a Pathway Context to an Unbiased Network Context
``` {r pathway-vs-ctd-global}
# Pathway discrimination: PLS w/ metabolites in urea cycle. Diagnose patients for urea cycle defects.
# CTD Global discrimination: CTD disease specific models call "yes" or "no" based on significance of patient's top metabolite perturbations.

require(CTD)
require(R.utils)
require(pls)
require(pROC)

# Reset workspace, keeping directory paths (*.dir), the model list, and
# all functions; reload the binarized z-score matrix and cohort lists.
rm(list=setdiff(ls(),c(grep("dir|models",ls(),value = T),lsf.str())))
data_mx=apply(loadToEnv("clinical_data.RData")[["data_mx.ogBnry"]],c(1,2),as.numeric)
cohorts=loadToEnv("clinical_data.RData")[["cohorts"]]
# models=c(c("asld", "aadc", "abat", "adsl", "arg","zsd", "rcdp","otc"))
# cohorts[models]=sapply(models, function(x) grep("edta",cohorts[[x]],ignore.case = T,value = T))
# cohorts[["alaimo"]]=NULL
# Keep only cohort samples; drop metabolites missing in every sample.
data_mx=data_mx[,colnames(data_mx) %in% unlist(cohorts)]
data_mx=data_mx[rowSums(is.na(data_mx)) != ncol(data_mx),]
data_mx.0=data_mx

# CTD hyperparameters; kmx = number of top perturbations per patient.
kmx=30
p0=0.1
p1=0.9
thresholdDiff=0.01

refs = data_mx[,which(colnames(data_mx) %in% cohorts$edta_refs)]
#cohorts$arg = c(cohorts$arg, cohorts$test_arg)
#cohorts$asld = c(cohorts$asld, cohorts$test_asld)
#cohorts$otc = c(cohorts$otc, cohorts$test_otc)
# Restrict to the four urea-cycle disorders (arg/asld/cit/otc) plus the
# EDTA reference controls, then keep metabolites with <20% missingness.
cohorts = cohorts[-which(names(cohorts) %in% c("alaimo", "maps", "abat", "aadc", "adsl", "zsd", "rcdp", "test_arg", "test_asld", "test_otc", "cob", "ga", "gamt", "msud", "mma", "pa", "pku", "hep_refs"))]
data_mx = data_mx[,which(colnames(data_mx) %in% c(unlist(cohorts)))]
fill.rate = apply(data_mx, 1, function(i) sum(is.na(i))/length(i))
data_mx = data_mx[which(fill.rate<0.20),]
data_mx = data_mx[,sort(colnames(data_mx))]
print(dim(data_mx))

# Binary-code argininosuccinate: any nonzero call becomes 1.
table(data_mx["argininosuccinate",])
data_mx["argininosuccinate", which(data_mx["argininosuccinate",]!=0)] = 1

# Arginine-metabolism pathway graph shipped with CTDext; metabolite
# nodes are the circle-shaped vertices.
load(system.file(sprintf("extdata/RData/%s.RData", "Arginine-Metabolism"), package="CTDext"))

uc_pathway_mets = V(ig)$label[which(V(ig)$shape=="circle")]
# Drop cofactors/currency metabolites and pathway-name nodes.
# NOTE(review): x[-which(...)] empties x if nothing matches; these labels
# are expected to occur in this graph -- re-check if the graph changes.
uc_pathway_mets = uc_pathway_mets[-which(uc_pathway_mets %in% c("H2O", "FAD", "O2", "NADP+", "CO2", "ADP", "NADH", "ATP", "PPi", "H+", "Mg2+", "Ca2+", "FMN", "PLP", "AMP", "Mg2+", "Mn2+", "NAD+", "ARGININE METABOLISM AND UREA CYCLE", "Aspartate Glutamate Metabolism", "Proline Metabolism", "Polyamine Metabolism", "TCA Cycle", "Pi"))]
uc_pathway_mets = uc_pathway_mets[-which(uc_pathway_mets %in% c("NADPH", "Co2+", "NH3"))]
uc_pathway_metsAll = tolower(uc_pathway_mets)
# The four classic urea-cycle markers used for the "local" pathway view.
uc_pathway_metsLocal = c("argininosuccinate", "citrulline", "arginine", "ornithine")
# For each urea-cycle disorder, compare four diagnostic feature sets by
# LOOCV PLS regression + ROC: (1) 4 local pathway metabolites, (2) all
# Arginine-Metabolism pathway metabolites, (3) CTD/CTDdisMod network
# scores, (4) all frequently detected metabolites (fill rate >= 0.8).
# Fixes: (a) the "pls-allMets" label count used pls_ctd_auc's curve
# length (copy-paste), misaligning labels whenever the two ROC curves
# differed in length; (b) x[-which(duplicated(x))] returns an EMPTY
# vector when there are no duplicates -- replaced with !duplicated().
dff_all = data.frame(diag=character(), model=character(), sensitivity=numeric(), specificity=numeric(), stringsAsFactors = FALSE)
for (model in c("arg", "asld", "cit", "otc")) {
  print(sprintf("%s...", toupper(model)))
  # Binary labels: 1 = in this model's cohort, 0 = everyone else.
  diags = colnames(data_mx)
  diags[which(!(diags %in% cohorts[[model]]))] = 0
  diags[which(diags %in% cohorts[[model]])] = 1

  # Data frame for pathway (local) view - PLS on zscores, predicting diagnosis (0=control, 1=case)
  uc_data = rbind(diags, data_mx)
  rownames(uc_data)[1] = "diag"
  uc_data = uc_data[which(rownames(uc_data) %in% c("diag", uc_pathway_metsLocal)),]
  uc_data = apply(uc_data, c(1,2), as.numeric)

  # Full pathway view
  uc_data2 = rbind(diags, data_mx)
  rownames(uc_data2)[1] = "diag"
  uc_data2 = uc_data2[which(rownames(uc_data2) %in% c("diag", uc_pathway_metsAll)),]
  uc_data2 = apply(uc_data2, c(1,2), as.numeric)

  # All frequently detected molecules (fill rate >= 0.8); remaining NAs
  # imputed to the per-sample minimum.
  uc_data3 = rbind(diags, data_mx)
  rownames(uc_data3)[1] = "diag"
  fillrate = apply(uc_data3, 1, function(x) sum(!is.na(x))/dim(uc_data3)[2])
  uc_data3 = uc_data3[fillrate>=0.8,]
  uc_data3 = apply(uc_data3, c(1,2), as.numeric)
  # Replace NAs in a vector with its observed minimum.
  NAasMin = function(x){
    if (sum(is.na(x))>0) {x[is.na(x)]=min(x,na.rm = T)}
    return(x)
  }
  for (r in 1:ncol(uc_data3)){
    uc_data3[,r]=NAasMin(uc_data3[,r])
  }

  # Data frame for CTD/CTDdisMod network-quantified variables
  load(sprintf("%s/dd-%s-loocv-kmx%d.RData", model.dir, model, kmx))
  df_model = df_model[which(df_model$ptID %in% colnames(data_mx)),]
  df_model$bits = -log2(p.adjust(2^-(df_model$bits), method="bonferroni"))
  ctd_pvals = 2^-(df_model$bits)
  ctdDisMod_pvals = sapply(df_model$ctdDisMod, function(i) length(which(df_model$ctdDisMod<=i))/nrow(df_model))
  combined_brown = apply(cbind(ctd_pvals, ctdDisMod_pvals), 1, function(i)  empiricalBrownsMethod(data_matrix = rbind(df_model$bits, df_model$ctdDisMod), p_values = i, TRUE)$P_test)
  df2_ctd = data.frame(combined=-log2(combined_brown), ctd=df_model$bits, ctddisMod=-log2(ctdDisMod_pvals), diag=as.numeric(diags))
  rownames(df2_ctd) = df_model$ptID

  # LOOCV PLS: hold out one sample, fit on the rest, predict held-out.
  dff_model = data.frame(pls_pwyLocal=numeric(), pls_pwyAll=numeric(), pls_ctd_ctddm=numeric(), pls_allfreqmets=numeric(), stringsAsFactors = FALSE)
  for (it in 1:length(diags)) {
    isTrain = c(1:length(diags))[-it]
    # Pathway Z-score feature selection: Local
    model_res = plsr(diag~., data = as.data.frame(t(uc_data[,isTrain])))
    tst_data = uc_data[,-isTrain]
    model_tst =  predict(model_res, ncomp=model_res$ncomp, newdata=as.data.frame(t(tst_data)))
    dff_model[it, "pls_pwyLocal"] = model_tst

    # Pathway Z-score feature selection: All (impute missing test values
    # with the minimum observed for those metabolites)
    model_res = plsr(diag~., data = as.data.frame(t(uc_data2[,isTrain])))
    tst_data = uc_data2[,-isTrain]
    if (length(which(is.na(tst_data))) >0) {
      tst_data[which(is.na(tst_data))] = min(na.omit(uc_data2[which(is.na(tst_data)),]))
    }
    model_tst =  predict(model_res, ncomp=model_res$ncomp, newdata=as.data.frame(t(tst_data)))
    dff_model[it, "pls_pwyAll"] = model_tst

    # CTD and CTDdm as covariates
    ctd_res = plsr(diag~., data = df2_ctd[isTrain,])
    tst_data = as.matrix(df2_ctd[-isTrain,-which(colnames(df2_ctd)=="diag")])
    if (ncol(tst_data)==1) { colnames(tst_data) = "combined" }
    ctd_tst = predict(ctd_res, ncomp=ctd_res$ncomp, newdata=tst_data)
    dff_model[it, "pls_ctd_ctddm"] = ctd_tst
  }
  # All frequently detected molecules: use pls's built-in LOO validation
  # once rather than refitting inside the loop above.
  model_res = plsr(diag~., data = as.data.frame(t(uc_data3)),validation = "LOO")
  dff_model[, "pls_allfreqmets"] = model_res$validation$pred[,,model_res$ncomp]

  pls_ctd_auc = roc(diags, dff_model$pls_ctd_ctddm)
  pls_pwyLocal_auc = roc(diags, dff_model$pls_pwyLocal)
  pls_pwyAll_auc = roc(diags, dff_model$pls_pwyAll)
  pls_allfreqmets_auc = roc(diags, dff_model$pls_allfreqmets)
  print(sprintf("PLS.ctd.ctddisMod  AUC = %.3f", pls_ctd_auc$auc))
  print(sprintf("PLS.pathwayLocal     AUC = %.3f", pls_pwyLocal_auc$auc))
  print(sprintf("PLS.pathwayAll     AUC = %.3f", pls_pwyAll_auc$auc))
  print(sprintf("PLS.allfreq     AUC = %.3f", pls_allfreqmets_auc$auc))

  # Collapse each ROC curve to the first occurrence of each specificity.
  keep1 = !duplicated(pls_pwyLocal_auc$specificities)
  keep2 = !duplicated(pls_pwyAll_auc$specificities)
  keep3 = !duplicated(pls_ctd_auc$specificities)
  keep4 = !duplicated(pls_allfreqmets_auc$specificities)

  # One long-format frame per model; each curve gets a (1, 0) anchor row.
  dff2 = data.frame(diag=character(), model=character(), sensitivity=numeric(), specificity=numeric(), stringsAsFactors = FALSE)
  nrw = 4 + sum(keep1) + sum(keep2) + sum(keep3) + sum(keep4)
  dff2[1:nrw, "diag"] = rep(model, nrw)
  dff2[1:nrw, "model"] = c(rep("pls-pwyLocal", 1+sum(keep1)),
                           rep("pls-pwyAll", 1+sum(keep2)),
                           rep("plsctd", 1+sum(keep3)),
                           rep("pls-allMets", 1+sum(keep4)))
  dff2[1:nrw, "specificity"] = c(pls_pwyLocal_auc$specificities[keep1], 1,
                                 pls_pwyAll_auc$specificities[keep2], 1,
                                 pls_ctd_auc$specificities[keep3], 1,
                                 pls_allfreqmets_auc$specificities[keep4], 1)
  dff2[1:nrw, "sensitivity"] = c(pls_pwyLocal_auc$sensitivities[keep1], 0,
                                 pls_pwyAll_auc$sensitivities[keep2], 0,
                                 pls_ctd_auc$sensitivities[keep3], 0,
                                 pls_allfreqmets_auc$sensitivities[keep4], 0)

  dff_all = rbind(dff_all, dff2)
}
save(dff_all,file=sprintf("%s/pwy-vs-global_MJM_v1.RData",OP.dir))

# load(sprintf("%s/pwy-vs-global_MJM_v1.RData",OP.dir))
require(ggplot2)
require(svglite)  # svglite device is referenced by the svg ggsave below

# Human-readable labels for the four feature sets and four diagnoses.
dff_all$model=c("Asa-Arg-Orn-Cit","Pathway","Network","Full profile")[match(dff_all$model,c("pls-pwyLocal","pls-pwyAll","plsctd","pls-allMets"))]
dff_all$model=factor(dff_all$model,levels = c("Pathway","Asa-Arg-Orn-Cit","Network","Full profile"))
dff_all$diag=c("Argininosuccinic aciduria","Arginase deficiency","Ornithine transcarbamylase deficiency","Citrullinemia")[match(dff_all$diag,c("asld","arg","otc","cit"))]

# NOTE(review): the x aesthetic is 1-specificity but the axis label says
# "specificity" (kept as in the published figure) -- confirm intended.
p=ggplot(dff_all) + geom_line(aes(x=1-specificity, y=sensitivity, colour=model), size=1.5) +
  facet_wrap(~diag) + 
  theme_bw()+
  theme(legend.position = c(0.25,0.75),
        legend.title = element_blank(),
        legend.box.background = element_rect(colour = "black"),
        legend.background = element_blank(),
        legend.key.size = unit(0.1, "cm"),
        text = element_text(family = "Arial")) +
  xlab("specificity")
p
# Fixes: the v1/v2 saves were byte-identical duplicates; the svg calls
# passed OP.dir to sprintf() templates that had no format specifier and
# wrote into "revisionOutput/", a directory this script never creates.
# Write one pdf and one svg into OP.dir instead.
ggsave(p,file=sprintf("%s/pwy-vs-global_MJM_v1.pdf",OP.dir),width = 4,height=4, device = cairo_pdf)
ggsave(p,file=sprintf("%s/pwy-vs-global_MJM_v1.svg",OP.dir),width = 6,height=6, device = svglite)

Haijes et al 2020 Comparison: Rank Diagnoses for Each Patient

``` {r rank-diagnoses}
# Fresh workspace for the diagnosis-ranking comparison (Haijes et al.).
require(EmpiricalBrownsMethod)
rm(list=setdiff(ls(),c(grep("dir|models",ls(),value = T),lsf.str())))

16 models for k=30

# Rank all 16 disease models per patient by p-value and report top-1 and
# top-3 accuracy plus differential-diagnosis (DD) statistics, for three
# scores: CTD, CTDdisMod, and Brown's combined.

# Print top-1 accuracy, top-3 accuracy, DD length (median, 5th/95th
# percentile), and the fraction of patients whose true diagnosis is in
# their DD (p < 0.05), for one p-value column of dff.
printRankStats = function(dff, pval_col) {
  pts = unique(dff$ptID)
  # Top 1 is correct diagnosis
  top1 = sapply(pts, function(i) dff[which(dff$ptID==i),][which.min(dff[which(dff$ptID==i), pval_col]),c("model", "diag")])
  top1 = sum(unlist(apply(top1, 2, function(i) i$model==i$diag)))/length(pts)
  print(top1)
  # Top 3 is correct diagnosis
  top3 = sapply(pts, function(i) dff[which(dff$ptID==i),][order(dff[which(dff$ptID==i), pval_col], decreasing = FALSE)[1:3],c("model", "diag")])
  top3 = sum(unlist(apply(top3, 2, function(i) i$diag[1] %in% i$model)))/length(pts)
  print(top3)
  # Length of DD
  len_dd = unlist(sapply(pts, function(i) length(which(dff[which(dff$ptID==i), pval_col]<0.05))))
  print(median(len_dd))
  print(quantile(len_dd, 0.05))
  print(quantile(len_dd, 0.95))
  # Percentage in DD
  dd = sapply(pts, function(i) dff[which(dff$ptID==i),][which(dff[which(dff$ptID==i), pval_col]<0.05), c("model", "diag")])
  dd = sum(unlist(apply(dd, 2, function(i) i$diag[1] %in% i$model)))/length(pts)
  print(dd)
}

for (kmx in c(30)) {
  dff = data.frame(model=character(), ptID=character(), diag=character(), ctd_pval=numeric(), ctdDisMod_pval=numeric(), combined_pval=numeric(), stringsAsFactors = FALSE)
  r = 1
  print(sprintf("Kmx = %d", kmx))
  # Include "asld" and "otc" in the 16 model run; comment them out for the 14 model run.
  for (model in c("aadc", "abat", "adsl", "arg", "cit", "cob", "ga", "gamt", "msud", "mma", "pa", "pku", "rcdp", "zsd", "asld", "otc")) {
    load(sprintf("%s/dd-%s-loocv-kmx%d.RData", model.dir, model, kmx))
    # Bonferroni-correct the CTD bits (the saved scores are uncorrected).
    df_model$bits = -log2(p.adjust(2^-(df_model$bits), method="bonferroni"))
    df_model$diag = gsub("test_", "", df_model$diag)
    # Only rank diagnosed patients (not healthy reference controls).
    # Fix: was [-which(...),], which drops ALL rows when nothing matches.
    df_model = df_model[!(df_model$diag %in% c("maps", "hep_refs", "edta_refs")),]

    ctd_pvals = ifelse(2^-(df_model$bits)>1, 1, 2^-(df_model$bits))
    ctdDisMod_pvals = sapply(df_model$ctdDisMod, function(i) length(which(df_model$ctdDisMod<=i))/nrow(df_model))
    jac_pvals = sapply(df_model$jac, function(i) length(which(df_model$jac<=i))/nrow(df_model))
    combined_brown = apply(cbind(ctd_pvals, ctdDisMod_pvals, jac_pvals), 1, function(i)  empiricalBrownsMethod(data_matrix = rbind(df_model$bits, df_model$ctdDisMod, df_model$jac), p_values = i, TRUE)$P_test)

    for (pt in 1:nrow(df_model)) {
      dff[r, "model"] = model
      dff[r, "ptID"] = df_model$ptID[pt]
      dff[r, "diag"] = df_model$diag[pt]
      dff[r, "ctd_pval"] = ctd_pvals[pt]
      dff[r, "ctdDisMod_pval"] = ctdDisMod_pvals[pt]
      dff[r, "combined_pval"] = combined_brown[pt]
      r = r + 1
    }
    save(dff,file=sprintf("%s/ptpval_kmx%s.RData",OP.dir,kmx))
  }

  # CTD only
  printRankStats(dff, "ctd_pval")
  # CTDdisMod only
  printRankStats(dff, "ctdDisMod_pval")
  # Combined Network
  printRankStats(dff, "combined_pval")
}

15 models for k=30

# 15-model run (OTC excluded): rank remaining diagnoses per patient
# using Brown's combined p-value only.
for (kmx in c(30)) { #seq(30,30,5)
  dff = data.frame(model=character(), ptID=character(), diag=character(), ctd_pval=numeric(), ctdDisMod_pval=numeric(), combined_pval=numeric(), stringsAsFactors = FALSE)
  r = 1
  print(sprintf("Kmx = %d", kmx))
  for (model in c("aadc", "abat", "adsl", "arg", "asld", "cit", "cob", "ga", "gamt", "msud", "mma", "pa", "pku", "rcdp", "zsd")) {
    load(sprintf("%s/dd-%s-loocv-kmx%d.RData", model.dir, model, kmx))
    df_model$bits = -log2(p.adjust(2^-(df_model$bits), method="bonferroni"))
    df_model$diag = gsub("test_", "", df_model$diag)
    # Exclude OTC patients and reference controls from the ranking.
    # Fix: was [-which(...),], which drops ALL rows when nothing matches.
    df_model = df_model[!(df_model$diag %in% c("maps", "hep_refs", "edta_refs", "otc")),]

    ctd_pvals = ifelse(2^-(df_model$bits)>1, 1, 2^-(df_model$bits))
    ctdDisMod_pvals = sapply(df_model$ctdDisMod, function(i) length(which(df_model$ctdDisMod<=i))/nrow(df_model))
    jac_pvals = sapply(df_model$jac, function(i) length(which(df_model$jac<=i))/nrow(df_model))
    combined_brown = apply(cbind(ctd_pvals, ctdDisMod_pvals, jac_pvals), 1, function(i)  empiricalBrownsMethod(data_matrix = rbind(df_model$bits, df_model$ctdDisMod, df_model$jac), p_values = i, TRUE)$P_test)

    for (pt in 1:nrow(df_model)) {
      dff[r, "model"] = model
      dff[r, "ptID"] = df_model$ptID[pt]
      dff[r, "diag"] = df_model$diag[pt]
      dff[r, "ctd_pval"] = ctd_pvals[pt]
      dff[r, "ctdDisMod_pval"] = ctdDisMod_pvals[pt]
      dff[r, "combined_pval"] = combined_brown[pt]
      r = r + 1
    }
  }

  # Combined Network
  # Top 1 is correct diagnosis
  top1 = sapply(unique(dff$ptID), function(i) dff[which(dff$ptID==i),][which.min(dff[which(dff$ptID==i), "combined_pval"]),c("model", "diag")])
  top1 = sum(unlist(apply(top1, 2, function(i) i$model==i$diag)))/length(unique(dff$ptID))
  print(top1)
  # Top 3 is correct diagnosis
  top3 = sapply(unique(dff$ptID), function(i) dff[which(dff$ptID==i),][order(dff[which(dff$ptID==i), "combined_pval"], decreasing = FALSE)[1:3],c("model", "diag")])
  top3 = sum(unlist(apply(top3, 2, function(i) i$diag[1] %in% i$model)))/length(unique(dff$ptID))
  print(top3)
  # Length of DD
  len_dd = unlist(sapply(unique(dff$ptID), function(i) length(which(dff[which(dff$ptID==i),"combined_pval"]<0.05))))
  print(median(len_dd))
  print(quantile(len_dd, 0.05))
  print(quantile(len_dd, 0.95))
  # Percentage in DD
  dd = sapply(unique(dff$ptID), function(i) dff[which(dff$ptID==i),][which(dff[which(dff$ptID==i),"combined_pval"]<0.05), c("model", "diag")])
  dd = sum(unlist(apply(dd, 2, function(i) i$diag[1] %in% i$model)))/length(unique(dff$ptID))
  print(dd)
}

Get number of DDs in reference patients only

# Differential-diagnosis counts for reference (control) patients only:
# score each control against all 16 disease models.
dff = data.frame(model=character(), ptID=character(), diag=character(), ctd_pval=numeric(), ctdDisMod_pval=numeric(), combined_pval=numeric(), stringsAsFactors = FALSE)
r = 1
for (model in c("aadc", "abat", "adsl", "arg", "asld", "cit", "cob", "ga", "gamt", "msud", "mma", "otc", "pa", "pku", "rcdp", "zsd")) {
  print(sprintf("MODEL %s...", model))
  load(sprintf("%s/dd-%s-loocv-kmx30.RData", model.dir, model))
  df_model$bits = -log2(p.adjust(2^-df_model$bits, method="bonferroni"))
  df_model$diag = gsub("test_", "", df_model$diag)
  # Keep only the healthy reference cohorts.
  df_model = df_model[which(df_model$diag %in% c("hep_refs", "edta_refs")),]

  ctd_pvals = ifelse(2^-(df_model$bits)>1, 1, 2^-(df_model$bits))
  ctdDisMod_pvals = sapply(df_model$ctdDisMod, function(i) length(which(df_model$ctdDisMod<=i))/nrow(df_model))
  jac_pvals = sapply(df_model$jac, function(i) length(which(df_model$jac<=i))/nrow(df_model))
  combined_brown = apply(cbind(ctd_pvals, ctdDisMod_pvals, jac_pvals), 1, function(i) empiricalBrownsMethod(data_matrix = rbind(df_model$bits, df_model$ctdDisMod, df_model$jac), p_values = i, TRUE)$P_test)

  for (pt in 1:nrow(df_model)) {
    dff[r, "model"] = model
    dff[r, "ptID"] = df_model$ptID[pt]
    dff[r, "diag"] = df_model$diag[pt]
    dff[r, "ctd_pval"] = ctd_pvals[pt]
    dff[r, "ctdDisMod_pval"] = ctdDisMod_pvals[pt]
    dff[r, "combined_pval"] = combined_brown[pt]
    r = r + 1
  }
}

Length of DD

# Differential-diagnosis length per reference patient (combined p < 0.05).
len_dd = unlist(sapply(unique(dff$ptID), function(i) length(which(dff[which(dff$ptID==i), "combined_pval"] < 0.05))))
median(len_dd)
quantile(len_dd, 0.05)
quantile(len_dd, 0.95)

## Zellweger Spectrum Disorders: PEX1 (Peroxisome Biogenesis disorders) vs. PEX7 (RCDP)
``` {r single-node_ptSim}
rm(list=setdiff(ls(),c(grep("dir|models",ls(),value = T),lsf.str())))
# Compute the patient similarity between negative controls, true cases and the "unknown disease" patient cohort.
require(R.utils)
require(CTD)
mkdirs("ptSim")
# CTD hyperparameters (kmx = number of top perturbations per patient).
thresholdDiff=0.01
kmx=30
p0=0.1
p1=0.9

# Original (non-binarized) z-score matrix, restricted to cohort samples.
data_mx=apply(loadToEnv("clinical_data.RData")[["data_mx.og"]],c(1,2),as.numeric)
cohorts=loadToEnv("clinical_data.RData")[["cohorts"]]
data_mx=data_mx[,colnames(data_mx) %in% unlist(cohorts)]
data_mx=data_mx[rowSums(is.na(data_mx)) != ncol(data_mx),]
data_mx.0=data_mx



refs = data_mx[,which(colnames(data_mx) %in% cohorts$edta_refs)]
data_mx = data_mx[,which(colnames(data_mx) %in% unlist(cohorts))]
# For every ZSD/RCDP fold network, compute all pairwise patient
# distances (NCD and directional similarity) at each k up to kmx-1.
for (model in c("rcdp", "zsd")) { 
  for (fold in 1:length(cohorts[[model]])) {
    # Load pre-computed node ranks (run on cluster in parallel and collated)
    # ranks = loadToEnv(sprintf("%s/%s%d-ranks.RData", rank.dir, toupper(model), fold))[["permutationByStartNode"]]
    ranks  = loadToEnv(system.file(sprintf("ranks/ind_ranks/%s%d-ranks.RData", toupper(model), fold),package = "CTDext"))[["permutationByStartNode"]]
    ig = loadToEnv(system.file(sprintf("networks/ind_foldNets/bg_%s_fold%d.RData", model, fold),package = "CTDext"))[["ig_pruned"]]
    # ig = loadToEnv(sprintf("%s/bg_%s_fold%d.RData", graph.dir, model, fold))[["ig_pruned"]]
    ranks = lapply(ranks, tolower)
    # Name each permutation by its start node (first element).
    names(ranks) = as.character(lapply(ranks, function(i) i[[1]][1]))
    adjacency_matrix = list(as.matrix(get.adjacency(ig, attr="weight")))
    # NOTE(review): data_mx is reassigned inside the loop, so the matrix
    # shrinks cumulatively across folds/models -- confirm intended.
    data_mx = data_mx[which(rownames(data_mx) %in% V(ig)$name), ]

    # Get patient bitstrings ahead of time
    ptBSbyK = list()
    for (pt in 1:ncol(data_mx)) {
      print(pt)
      ptID = colnames(data_mx)[pt]
      # Top-kmx most perturbed metabolites (by |z|) for this patient.
      S1 = rownames(data_mx)[order(abs(data_mx[,ptID]), decreasing = TRUE)][1:kmx]
      ptBSbyK[[ptID]] = mle.getPtBSbyK(S1, ranks)
    }

    # res[[k]] holds two patient-by-patient matrices at module size k:
    # $ncd (normalized compression distance) and $jac (directional sim).
    res = list()
    t = list(ncd=matrix(NA, nrow=ncol(data_mx), ncol=ncol(data_mx)), jac=matrix(NA, nrow=ncol(data_mx), ncol=ncol(data_mx)))
    rownames(t$ncd) = colnames(data_mx)
    colnames(t$ncd) = colnames(data_mx)
    rownames(t$jac) = colnames(data_mx)
    colnames(t$jac) = colnames(data_mx)
    for (i in 1:(kmx-1)) { res[[i]] = t }
    # Pairwise upper-triangle loop; distances are symmetric, so each
    # value is mirrored into both triangles.
    for (pt in 1:(ncol(data_mx)-1)) {
      print(pt)
      ptID = colnames(data_mx)[pt]
      for (pt2 in (pt+1):ncol(data_mx)) {  
        print(sprintf(" vs. %d...", pt2))
        ptID2 = colnames(data_mx)[pt2]
        tmp = mle.getPtDist(ptBSbyK[[ptID]], ptID, ptBSbyK[[ptID2]], ptID2, data_mx, ranks)
        for (k in 1:(kmx-1)) {
          res[[k]]$ncd[ptID, ptID2] = tmp$NCD[k]
          res[[k]]$jac[ptID, ptID2] = tmp$dirSim[k]
          res[[k]]$ncd[ptID2, ptID] = tmp$NCD[k]
          res[[k]]$jac[ptID2, ptID] = tmp$dirSim[k]
        }
      }
    }
    # Normalize rows by max value size k
    res_norm = res
    for (k in 1:(kmx-1)) {
      diag(res_norm[[k]]$ncd) = 0
      diag(res_norm[[k]]$jac) = 0
      res_norm[[k]]$ncd = res_norm[[k]]$ncd/max(na.omit(res_norm[[k]]$ncd))
    }
    save(res, res_norm, file=sprintf("ptSim/ptSim_%s%d_kmx%d.RData", model, fold, kmx))
  }
}


# For RCDP and ZSD similarity matrices, take the min patient distance for each k, and then across k, for each pairwise patient comparison. You will end up with one similarity matrix.
rm(list=setdiff(ls(),c(grep("dir|models",ls(),value = T),lsf.str())))
require(CTD)
require(R.utils)

# Reload the original z-score matrix and cohort membership.
data_mx=apply(loadToEnv("clinical_data.RData")[["data_mx.og"]],c(1,2),as.numeric)
cohorts=loadToEnv("clinical_data.RData")[["cohorts"]]
data_mx=data_mx[,colnames(data_mx) %in% unlist(cohorts)]
data_mx=data_mx[rowSums(is.na(data_mx)) != ncol(data_mx),]
data_mx.0=data_mx

kmx=30
# Element-wise mean across a list of equally-sized similarity matrices.
#
# @param allSimMatrices  non-empty list of numeric matrices, all the
#                        same dimensions.
# @return one matrix: the per-cell mean across the list.
# Fix: the original looped `for (ind in 2:length(allSimMatrices))`,
# which indexes out of bounds for a length-1 list because 2:1 == c(2,1).
mle.getMeanPtDistance = function (allSimMatrices) {
  stopifnot(length(allSimMatrices) > 0)
  Reduce(`+`, allSimMatrices) / length(allSimMatrices)
}
# Collapse per-fold distance lists to one normalized NCD matrix per model:
# min across k within a fold, then mean across folds.
all_models_list = list()
for (model in c("rcdp", "zsd")) {
  print(model)
  model_list = list()
  for (fold in 1:length(cohorts[[model]])) {
    print(fold)
    # Fix: the pairwise-distance chunk above saves to
    # ptSim/ptSim_<model><fold>_kmx<kmx>.RData (no kmx subdirectory);
    # the old load path pointed at ptSim/kmx<kmx>/... and could not
    # find the files this script just wrote.
    res = loadToEnv(sprintf("ptSim/ptSim_%s%d_kmx%d.RData", model, fold, kmx))[["res"]]
    res = lapply(res, function(i) i$ncd)
    model_list[[fold]] = mle.getMinPtDistance(res)
  }
  res_ncd = mle.getMeanPtDistance(model_list)
  diag(res_ncd) = 0
  res_ncd = res_ncd / max(res_ncd)
  # Restore patient names from any one fold's raw matrices.
  res = loadToEnv(sprintf("ptSim/ptSim_%s%d_kmx%d.RData", model, 1, kmx))[["res"]]
  colnames(res_ncd) = colnames(res[[1]]$ncd)
  rownames(res_ncd) = colnames(res[[1]]$ncd)
  all_models_list[[model]] = res_ncd
}
save(all_models_list, file=sprintf("ptSim/all_models_list_kmx%d.RData", kmx))


rm(list=setdiff(ls(),c(grep("dir|models",ls(),value = T),lsf.str())))
require(CTD)
require(R.utils)

data_mx=apply(loadToEnv("clinical_data.RData")[["data_mx.og"]],c(1,2),as.numeric)
cohorts=loadToEnv("clinical_data.RData")[["cohorts"]]
data_mx=data_mx[,colnames(data_mx) %in% unlist(cohorts)]
data_mx=data_mx[rowSums(is.na(data_mx)) != ncol(data_mx),]
data_mx.0=data_mx


# Patient-patient NCD (network compression distance) analysis at kmx = 30,
# followed by a 2-D classical-MDS embedding of the averaged distance matrix.
kmx=30
load(sprintf("ptSim/all_models_list_kmx%d.RData", kmx))

# ZSD patients aged 10 and under get their own label in the plot below.
wangler_ages10_andDown = c("EDTA-ZSD-1", "EDTA-ZSD-2", "EDTA-ZSD-3", "EDTA-ZSD-7", 
                           "EDTA-ZSD-9", "EDTA-ZSD-10", "EDTA-ZSD-11", "EDTA-ZSD-13",
                           "EDTA-ZSD-14", "EDTA-ZSD-17", "EDTA-ZSD-18")
# Drop metabolite rows whose names contain "x - " (presumably unidentified
# compounds -- TODO confirm naming convention in the source spreadsheet).
# NOTE(review): if no row matched, -grep(...) would be -integer(0) and drop
# EVERY row; safe only while such rows exist in data_mx.
data_mx = data_mx[-grep("x - ", rownames(data_mx)),]
# EDTA reference-sample submatrix (not referenced again in this section).
refs = data_mx[,which(colnames(data_mx) %in% cohorts$edta_refs)]
# Average the per-model NCD matrices, zero the diagonal, rescale to [0, 1].
res_overall = mle.getMeanPtDistance(all_models_list)
diag(res_overall) = 0
res_overall = res_overall / max(res_overall)
colnames(res_overall) = colnames(all_models_list[[1]])
# Restrict to ZSD, RCDP, and EDTA reference samples.
# NOTE(review): only colnames are assigned above; subsetting rows by `ind`
# assumes rows are in the same order as columns.
ind = which(colnames(res_overall) %in% unlist(cohorts[c("zsd", "rcdp", "edta_refs")]))
res_overall = res_overall[ind, ind]
# Human-readable diagnosis label per sample, in column order.
diags = colnames(res_overall)
diags[which(diags %in% wangler_ages10_andDown)] = "ZSD (<10 y.o.)"
diags[which(diags %in% cohorts$zsd)] = "ZSD (10+ y.o.)" 
diags[which(diags %in% cohorts$rcdp)] = "RCDP"
diags[which(diags %in% cohorts$edta_refs)] = "REF"

# Classical MDS of the NCD distances into 2 dimensions (3-D variant commented out).
fitSim = cmdscale(res_overall, eig = FALSE, k = 2)
x = round(fitSim[, 1], 2)
y = round(fitSim[, 2], 2)
#z = round(fitSim[, 3], 2)
#df = data.frame(x = x, y = y, z = z, color = diags, label = colnames(res_overall))
#p = plot_ly(df, x = ~x, y = ~y, z = ~z, color = ~color, text = ~label, marker = list(size = 20))
df = data.frame(x = x, y = y, color = diags, label = colnames(res_overall))
# NOTE(review): plot_ly()/orca() require the plotly package, which is not
# loaded in this chunk -- presumably attached earlier in the document; confirm.
p = plot_ly(df, x = ~x, y = ~y, color = ~color, text = ~label, marker = list(size = 20))
orca(p, sprintf("ptSim/CTDncd_origZscore_kmx%d.svg", kmx), format = "svg")

# Get UNK patients that cluster with PEX1 or PEX7 samples
# K-means seeded at the per-class centroids, so clusters 1/2/3 correspond to
# ZSD/RCDP/REF respectively.
# drop = FALSE keeps single-row subsets as matrices, so colMeans() (the
# matrix equivalent of apply(x, 2, mean)) works even for a one-sample group.
zsd.centroid = colMeans(res_overall[grep("ZSD", diags), , drop = FALSE])
rcdp.centroid = colMeans(res_overall[which(diags=="RCDP"), , drop = FALSE])
ref.centroid = colMeans(res_overall[which(diags=="REF"), , drop = FALSE])
kmns = kmeans(res_overall, centers = rbind(zsd.centroid, rcdp.centroid, ref.centroid))
# Character matrix: cluster assignment vs. true diagnosis per patient.
df = cbind(as.numeric(kmns$cluster), diags, colnames(all_models_list[[1]])[ind])
colnames(df) = c("cluster.num", "diag", "ptID")
# df is a character matrix, so compare against quoted cluster ids throughout
# (the original mixed "1"/"2" with a bare 3).
zsd.cluster = df[which(df[,1]=="1"), ]
rcdp.cluster = df[which(df[,1]=="2"), ]
ref.cluster = df[which(df[,1]=="3"), ]
# Purity of a clustering: the fraction of samples whose true class is the
# majority class of the cluster they were assigned to.
#   clusters - cluster assignment per sample
#   classes  - true class label per sample (same length as `clusters`)
# Returns a value in (0, 1]; 1 means every cluster is single-class.
clusterPurity <- function(clusters, classes) {
  contingency <- table(classes, clusters)
  majority_per_cluster <- apply(contingency, 2, max)
  sum(majority_per_cluster) / length(clusters)
}
clusterPurity(df[,1], df[,2])


# Compare to euclidean distance + MDS
# BUG FIX: the original cleanup removed `diags`, `ind` and `res_overall`,
# which are still referenced below (plot colors/labels and the k-means
# bookkeeping), causing "object not found" errors. Keep them across the reset.
rm(list=setdiff(ls(),c(grep("dir|models|diags|ind|res_overall",ls(),value = T),lsf.str())))
data_mx=apply(loadToEnv("clinical_data.RData")[["data_mx.og"]],c(1,2),as.numeric)
cohorts=loadToEnv("clinical_data.RData")[["cohorts"]]
data_mx=data_mx[,colnames(data_mx) %in% unlist(cohorts)]
data_mx=data_mx[rowSums(is.na(data_mx)) != ncol(data_mx),]
data_mx.0=data_mx

# Restrict to ZSD / RCDP / EDTA reference samples and embed their pairwise
# euclidean distances with classical MDS (2 dimensions).
data_mx = data_mx[,which(colnames(data_mx) %in% c(cohorts$zsd, cohorts$rcdp, cohorts$edta_refs))]
ptSim_euc = as.matrix(dist(t(data_mx), method="euclidean"))
fitSim = cmdscale(ptSim_euc, eig = FALSE, k = 2)
x = round(fitSim[, 1], 2)
y = round(fitSim[, 2], 2)
# NOTE(review): labels/colors reuse `diags` and colnames(res_overall) from the
# NCD section -- assumes the euclidean matrix has the same sample order; verify.
df = data.frame(x = x, y = y, color = diags, label = colnames(res_overall))
p_euc = plot_ly(df, x = ~x, y = ~y, color = ~color, text = ~label, marker = list(size = 20))
orca(p_euc, "ptSim/zsd_rcdp_euc_origZscore.svg", format = "svg")

# K-means on the euclidean distance matrix, seeded at the class centroids.
zsd.centroid = apply(ptSim_euc[which(colnames(ptSim_euc) %in% cohorts$zsd),], 2, mean)
rcdp.centroid = apply(ptSim_euc[which(colnames(ptSim_euc) %in% cohorts$rcdp),], 2, mean)
ref.centroid = apply(ptSim_euc[which(colnames(ptSim_euc) %in% cohorts$edta_refs),], 2, mean)
kmns = kmeans(ptSim_euc, centers = rbind(zsd.centroid, rcdp.centroid, ref.centroid))
df = cbind(as.numeric(kmns$cluster), diags, colnames(all_models_list[[1]])[ind])
colnames(df) = c("cluster.num", "diag", "ptID")
zsd.cluster = df[which(df[,1]=="1"), ]
rcdp.cluster = df[which(df[,1]=="2"), ]
ref.cluster = df[which(df[,1]=="3"), ]
# clusterPurity() is defined earlier in the file and survives the workspace
# reset (lsf.str() keeps functions), so the duplicate definition was removed.
clusterPurity(df[,1], df[,2])


# Compare to mahalanobis distance + basic plot
require(CTDext)
require(ggplot2)  # ggplot() is used below; only attached later in the file otherwise
# Impute missing values using the EDTA reference samples as the background.
data_mx = data.imputeData(data_mx, data_mx[,which(colnames(data_mx) %in% cohorts$edta_refs)])
# BUG FIX: t(data_mx) is samples x metabolites, so `center` must hold one mean
# per METABOLITE. colMeans(data_mx) gave one mean per sample (wrong length, so
# mahalanobis() fails its dimension check); rowMeans(data_mx) is the
# per-metabolite mean, i.e. colMeans(t(data_mx)).
ptSim_mah = mahalanobis(t(data_mx), 
                        center=rowMeans(data_mx), 
                        cov(t(data_mx)), inverted = FALSE, tol=1e-50)
# One mahalanobis distance per sample, colored by diagnosis.
# NOTE(review): `diags` must still be present in the workspace from the NCD
# section for the coloring to work.
ggplot() + geom_point(aes(1:ncol(data_mx), ptSim_mah, colour=diags))




# Visualize main disease module for ZSD and RCDP
rm(list=setdiff(ls(),c(grep("dir|models",ls(),value = T),lsf.str())))
require(R.utils)
load(system.file("shiny-app/disMod_July2020.RData", package = "CTDext"))
# Selected disease module for ZSD
mets = disMod$zsd
ig = loadToEnv(system.file("networks/ind_foldNets/bg_zsd_fold1.RData",package = "CTDext"))[["ig_pruned"]]
# Keep only module metabolites present in this fold's pruned network --
# subgraph() errors on unknown vertex names (same guard the RCDP section uses).
ig = subgraph(ig, v=mets[which(mets %in% V(ig)$name)])
# Scale edge widths by |partial weight| and color the module purple.
E(ig)$weight = abs(E(ig)$weight)*100
E(ig)$color = "#6a0dad"
coords = layout.fruchterman.reingold(ig)
png("ptSim/zsd-blowout_origZscore.png")
plot(ig, layout=coords, edge.width=E(ig)$weight)
dev.off()
# Redraw with vertex labels blanked for the publication SVG.
V(ig)$name = rep("", length(V(ig)$name))
svg("ptSim/zsd-blowout_origZscore.svg")
plot(ig, layout=coords, edge.width=E(ig)$weight)
dev.off()


# Selected disease module for RCDP
mets = disMod$rcdp
# BUG FIX: the original line had an unbalanced parenthesis (syntax error) and
# omitted the `package` argument, so system.file() would have returned "".
ig = loadToEnv(system.file("networks/ind_foldNets/bg_rcdp_fold3.RData", package = "CTDext"))[["ig_pruned"]]
# Keep only module metabolites present in this fold's pruned network.
ig = subgraph(ig, v=mets[which(mets %in% V(ig)$name)])
# Scale edge widths by |partial weight| and color the module green.
E(ig)$weight = abs(E(ig)$weight)*100
E(ig)$color = "#228B22"
coords = layout.fruchterman.reingold(ig)
png("ptSim/rcdp-blowout_origZscore.png")
plot(ig, layout=coords, edge.width=E(ig)$weight)
dev.off()
# Redraw with vertex labels blanked for the publication SVG.
V(ig)$name = rep("", length(V(ig)$name))
svg("ptSim/rcdp-blowout_origZscore.svg")
plot(ig, layout=coords, edge.width=E(ig)$weight)
dev.off()

Model Sensitivity and Specificity

# Build per-patient top-1 / top-3 model calls from the combined p-value table
# `dff` (one row per patient x disease model; uses columns ptID, model, diag,
# combined_pval).
require(pROC)
require(R.utils)
require(caret)
require(dplyr)
kmx=30
load(sprintf("%s/ptpval_kmx%s.RData",OP.dir,kmx))
# Top-1 call: for each patient, the model with the smallest combined p-value.
# sapply() over data.frame rows returns a list-matrix, hence the t()/unlist below.
top1 = sapply(unique(dff$ptID), function(i) dff[which(dff$ptID==i),][which.min(dff[which(dff$ptID==i), "combined_pval"]),c("model", "diag")])
correct.top1=unlist(apply(top1, 2, function(i) i$model==i$diag))
top1=as.data.frame(t(top1))
top1=as.data.frame(apply(top1,2,unlist))
# Top-3 calls: the three models with the smallest combined p-values per patient.
top3 = sapply(unique(dff$ptID), function(i) dff[which(dff$ptID==i),][order(dff[which(dff$ptID==i), "combined_pval"], decreasing = FALSE)[1:3],c("model", "diag")])
top3=as.data.frame(top3)
# TRUE when the patient's diagnosis appears among its three called models.
correct.top3=unlist(apply(top3, 2, function(i) i$diag[1] %in% i$model))

# One metrics row per disease model (filled by the loop below).
all.models=c("aadc", "abat", "adsl", "arg", "cit", "cob", "ga", "gamt", "msud", "mma", "pa", "pku", "rcdp", "zsd", "asld", "otc")
df.ModelMetrics=data.frame(matrix(ncol = 8,nrow=length(all.models)),stringsAsFactors = F)
rownames(df.ModelMetrics)=all.models
colnames(df.ModelMetrics)=c("Model","Sensitivity.top1","Specificity.top1","Accuracy.top1","Sensitivity.top3","Specificity.top3","Accuracy.top3","AUC")
options(digits=3)
options(digits=3)
# Confusion-matrix metrics (sensitivity/specificity/accuracy) per disease
# model, under both the top-1 and top-3 decision rules.
for (model in all.models) { #

  # Combined Network
  # Top 1 is correct diagnosis
  # One (deduplicated) row per patient for this model, in unique(dff$ptID) order.
  df=dff[dff$model==model,]
  df=unique(df)
  df=df[match(unique(dff$ptID),df$ptID),]
  # Condition positive: the patient's true diagnosis IS this model.
  CP.v=df$diag==model
  # NOTE(review): this ROC object is computed but never used below -- the AUC
  # column is filled afterwards from the saved AUC results file.
  AUC=roc(response=CP.v,predictor = df$combined_pval)

  # Predicted positive under the top-1 and top-3 rules, respectively.
  PP.top1.v=top1$model==model
  PP.top3.v = sapply(unique(dff$ptID),function(x) model %in% top3[[x]]$model)

  # Factor levels are pinned to TRUE/FALSE so caret treats TRUE as the
  # positive class consistently.
  conf_mx.top1 = confusionMatrix(data=factor(PP.top1.v,levels = c("TRUE","FALSE")),reference = factor(CP.v,levels = c("TRUE","FALSE")))
  # roc.top1=roc(response=as.numeric(CP.v),predictor = as.numeric(PP.top1.v))
  conf_mx.top3 = confusionMatrix(data=factor(PP.top3.v,levels = c("TRUE","FALSE")),reference = factor(CP.v,levels = c("TRUE","FALSE")))
  # roc.top3=roc(response=as.numeric(CP.v),predictor = as.numeric(PP.top3.v))

  # AUC slot is left NA here; it is populated later from the AUC results file.
  df.ModelMetrics[model,]=c(toupper(model),
                            conf_mx.top1$byClass[c("Sensitivity","Specificity")],
                            conf_mx.top1$overall["Accuracy"],
                            # sprintf("%.3f(%.3f-%.3f)",
                            #         conf_mx.top1$overall["Accuracy"],
                            #         conf_mx.top1$overall["AccuracyLower"],
                            #         conf_mx.top1$overall["AccuracyUpper"]),
                            # roc.top1$auc,
                            conf_mx.top3$byClass[c("Sensitivity","Specificity")],
                            conf_mx.top3$overall["Accuracy"],
                            # sprintf("%.3f(%.3f-%.3f)",
                            #         conf_mx.top3$overall["Accuracy"],
                            #         conf_mx.top3$overall["AccuracyLower"],
                            #         conf_mx.top3$overall["AccuracyUpper"]),
                            # roc.top3$auc
                            NA
                            )
}

# Fill the AUC column from the per-model network AUCs saved earlier, matching
# on the upper-cased model name, then coerce all metric columns to numeric.
AUC.df=loadToEnv(sprintf("%s/AUC_kmx%s.RData",model.dir,kmx))[["dff"]]
df.ModelMetrics$AUC=AUC.df$network.auc[match(df.ModelMetrics$Model,toupper(AUC.df$model))]
df.ModelMetrics[,!grepl("Model",colnames(df.ModelMetrics))]=apply(df.ModelMetrics[,!grepl("Model",colnames(df.ModelMetrics))],c(1,2),as.numeric)

require(openxlsx)
# NOTE(review): the setup chunk only creates graphs/ranks/models/OP;
# "revisionOutput/" must already exist or write.xlsx() will fail.
write.xlsx(df.ModelMetrics,file="revisionOutput/ModelMetrics.xlsx")
 # write.table(df.ModelMetrics,file="revisionOutput/ModelMetrics.tsv",sep = "\t",quote = F)


# plot model performance metrics
require(tidyr)
require(reshape2)
# Ascending levels here vs. descending row order below: after coord_flip()
# the most sensitive (top-1) model ends up at the top of the chart.
levels.fixed=df.ModelMetrics$Model[order(df.ModelMetrics$Sensitivity.top1,decreasing = F)]
df2 = df.ModelMetrics[order(df.ModelMetrics$Sensitivity.top1,decreasing = T),]
# df2 = df2[,!grepl("Accuracy",colnames(df2))]
# Long format: split e.g. "Sensitivity.top1" into metrics="Sensitivity" and
# rule="top1" ("AUC" has no rule part; those rows are filtered out below).
df2 = df2 %>% melt(id=("Model")) %>%
  separate(col = "variable",into = c("metrics","rule"),sep = "[[:punct:]]+",remove = TRUE,convert = FALSE)
df2$Model=factor(df2$Model,levels = levels.fixed)
df2$value=as.numeric(df2$value)
df2$metrics=factor(df2$metrics,levels = c("Sensitivity","Specificity","Accuracy","AUC"))
# df2=data.frame(xaxis=rep(df$xaxis,3),
#                value=c(df$Score2.mean.SharedEdges.,df$AUC.top1,df$AUC.top3),
#                metrics=as.vector(sapply(c("Stablity","AUC.top1","AUC.top3"),function(x) rep(x,dim(df)[1]))))
# Top-1 Sensitivity/Specificity/Accuracy per model, one facet per metric.
# NOTE(review): aes_string() is deprecated in current ggplot2 (still works).
p=ggplot(data=df2[(!grepl("top3",df2$rule)) & !grepl("AUC",df2$metrics),],aes_string(x="Model",y="value")) + theme_minimal()+
  facet_wrap(~metrics)+
  geom_bar(stat="identity", position=position_dodge(), width = 0.7)+
  coord_flip()+
  theme(legend.position = "top",legend.title = element_blank(),panel.spacing = unit(1, "lines"))+
  labs(fill="Metrics rule: ")
p
ggsave(p,filename = sprintf("%s/ModelMetrics2.pdf", OP.dir), width = 4, height = 4, device = cairo_pdf)
require(svglite)
ggsave(p,filename = sprintf("%s/ModelMetrics2.svg", OP.dir), width = 6, height = 4, device = svglite)

Network Stability

# Network stability: for each disease model, count edges shared between every
# pair of per-fold pruned networks (ig_pruned), relative to edge-set unions.
require(R.utils)
require(CTD)

score_mx=list()
# One row per disease. Note data.frame() mangles the backticked names to
# Cohort.Size, Score1.mean.SharedEdges., etc. (check.names default TRUE);
# "stablity" spelling is kept because downstream code/files use it.
stablity_score=data.frame(Disease=character(),
                          `Cohort Size`=integer(),
                          `Score1 mean(SharedEdges)`=numeric(),
                          `Score1 sd(SharedEdges)`=numeric(),
                          `Score2 mean(SharedEdges)`=numeric(),
                          `Score2 sd(SharedEdges)`=numeric(),
                          stringsAsFactors = F
                          )
r=0
# When TRUE, a shared edge only counts if its weights in the two fold
# networks agree in sign.
countDirection=TRUE
for (model in c("aadc", "abat", "adsl", "arg", "asld", "cit", "otc", "cob", "ga", "gamt", "msud", "mma", "pa", "zsd", "rcdp", "pku")) { # 
  print(sprintf("%s...", toupper(model)))
  r=r+1
  # NOTE(review): loop-invariant load -- could be hoisted above the loop.
  cohorts=loadToEnv("clinical_data.RData")[["cohorts"]]
  # Upper-triangular matrices of pairwise edge counts: intersection ("ByPair")
  # and union ("ByPairOuter") of each fold-network pair.
  score_mx[["edgeCountByPairOuter"]][[model]]=matrix(nrow=length(cohorts[[model]]),ncol=length(cohorts[[model]]))
  colnames(score_mx[["edgeCountByPairOuter"]][[model]])=rownames(score_mx[["edgeCountByPairOuter"]][[model]])=cohorts[[model]]
  score_mx[["edgeCountByPair"]][[model]]=score_mx[["edgeCountByPairOuter"]][[model]]
  # ig.outer accumulates the union of ALL fold networks for this model.
  ig.outer=loadToEnv(sprintf("graphs_GMpaper0/bg_%s_fold%d.RData", model, 1))[["ig_pruned"]]
  # NOTE(review): 1:(n-1) assumes every cohort has at least 2 samples.
  for (pt1 in 1:(length(cohorts[[model]])-1)) {
    ig1=loadToEnv(sprintf("graphs_GMpaper0/bg_%s_fold%d.RData", model, pt1))[["ig_pruned"]]
    ig.outer=ig.outer%u%loadToEnv(sprintf("graphs_GMpaper0/bg_%s_fold%d.RData", model, pt1+1))[["ig_pruned"]]
    for (pt2 in (pt1+1):length(cohorts[[model]]) ){
      ig2=loadToEnv(sprintf("graphs_GMpaper0/bg_%s_fold%d.RData", model, pt2))[["ig_pruned"]]
      # Edges present in both fold networks.
      ig.inner=ig1 %s% ig2
      # weight_1/weight_2 are the two graphs' edge weights as renamed by the
      # igraph intersection; drop edges whose weights disagree in sign.
      if(countDirection){ig.inner=delete_edges(ig.inner,E(ig.inner)[E(ig.inner)$weight_1*E(ig.inner)$weight_2<=0])}
      ig.pairouter=ig1 %u% ig2
      score_mx[["edgeCountByPair"]][[model]][pt1,pt2]=length(E(ig.inner))
      score_mx[["edgeCountByPairOuter"]][[model]][pt1,pt2]=length(E(ig.pairouter))
    }
  }
  # Score1: mean shared edges / edges in the union of ALL folds.
  # Score2: mean of per-pair shared/union ratios (Jaccard-like).
  # c() coerces this row to character; values are re-cast to numeric later.
  stablity_score[r,]=c(model,
                       length(cohorts[[model]]),
                       mean(score_mx[["edgeCountByPair"]][[model]],na.rm = T)/length(E(ig.outer)),
                       sd(score_mx[["edgeCountByPair"]][[model]],na.rm = T)/length(E(ig.outer)),
                       mean(score_mx[["edgeCountByPair"]][[model]]/score_mx[["edgeCountByPairOuter"]][[model]],na.rm = T),
                       sd(score_mx[["edgeCountByPair"]][[model]]/score_mx[["edgeCountByPairOuter"]][[model]],na.rm = T)
                       )

}
# Coefficient of variation of Score2 ("normalized sd").
stablity_score$Score2.Normalized.sd=as.numeric(stablity_score$Score2.sd.SharedEdges.)/as.numeric(stablity_score$Score2.mean.SharedEdges.)
colnames(stablity_score)[colnames(stablity_score)=="Disease"]="Diagnosis"
stablity_score$Diagnosis=toupper(stablity_score$Diagnosis)
require(openxlsx)
# write.xlsx(stablity_score,file = "revisionOutput/NetworkStablity.xlsx")

# Plot network stability scores alongside per-model performance metrics.
require(ggplot2)
# NOTE(review): reads from OP/ while the (commented-out) write above targets
# revisionOutput/ -- confirm which copy of NetworkStablity.xlsx is current.
stablity_score=read.xlsx("OP/NetworkStablity.xlsx",rowNames = F)
stablity_score[,2:6]=apply(stablity_score[,2:6],c(1,2),as.numeric)
ModelMetrics=read.xlsx("OP/ModelMetrics.xlsx",rowNames = F)
# NOTE(review): this assumes a 9-column file (incl. AUC.top1/AUC.top3), unlike
# the 8-column table written earlier in this document -- verify file version.
colnames(ModelMetrics)=c("Model","Sensitivity.top1","Specificity.top1","Accuracy.top1","AUC.top1","Sensitivity.top3","Specificity.top3","Accuracy.top3","AUC.top3")
ModelMetrics=ModelMetrics[!is.na(ModelMetrics$Model),]
# Join stability scores with the per-model performance metrics by diagnosis.
df=cbind(stablity_score[,c("Diagnosis","Cohort.Size","Score2.mean.SharedEdges.","Score2.sd.SharedEdges.","Score2.Normalized.sd")],
         ModelMetrics[match(stablity_score$Diagnosis,ModelMetrics$Model),c("Sensitivity.top1","Specificity.top1","AUC.top1","Sensitivity.top3","Specificity.top3","AUC.top3")])
# Order by cohort size and freeze that order as x-axis factor levels,
# with labels like "ZSD (n=19)".
df=df[order(as.integer(df$Cohort.Size)),]
df$xaxis=paste(paste(df$Diagnosis,df$Cohort.Size,sep = " (n="),")",sep = "")
fixed_level=df$xaxis
df$xaxis=factor(df$xaxis,levels = fixed_level)
# NOTE(review): effectively a no-op -- the levels already equal these values.
levels(df$xaxis)=df$xaxis

# Bar chart of mean Score2 per disease with +/- 1 sd error bars, flipped so
# diseases read top-to-bottom; fill encodes cohort size.
p = ggplot(data=df,aes_string(x="xaxis",y="Score2.mean.SharedEdges.",fill = "Cohort.Size")) +
  geom_bar(stat="identity", position=position_dodge(),color="black")+
  scale_fill_gradient(low="#ffcccc",high="darkred")+
  geom_errorbar(aes(ymin=Score2.mean.SharedEdges.-Score2.sd.SharedEdges., ymax=Score2.mean.SharedEdges.+Score2.sd.SharedEdges.), width=.2,
                 position=position_dodge(.9)) +
  theme_minimal()+
  theme(legend.position = "none")+
  # geom_col(aes(fill = Cohort.Size)) + 
  # scale_color_gradient(low = "blue", high = "red") +
  xlab("Disease")+ylab("Network Stability Score") + coord_flip() 
ggsave(p, filename = "OP/StablityScore.pdf", device = cairo_pdf, 
       width = 4, height = 4)


BRL-BCM/CTDext documentation built on May 7, 2022, 5:31 a.m.