# Workbook setup: remember the working directory for knitr and create the
# output folders used throughout the analysis.
w.dir=getwd() #""
knitr::opts_chunk$set(echo = TRUE)
knitr::opts_knit$set(root.dir = w.dir)
setwd(w.dir)

require(R.utils)
# Output directories: learned networks, precomputed node ranks, fitted
# model scores, and miscellaneous output ("OP").
graph.dir="graphs"
rank.dir="ranks"
model.dir="models"
OP.dir="OP"
# R.utils::mkdirs creates each directory only if it does not already exist.
sapply(c(graph.dir,rank.dir,model.dir,OP.dir),mkdirs)

## Data Load

# Data load: download the published z-score matrix, strip metabolite
# annotation columns, alias 6 diagnosed ALAIMO samples under their EDTA
# cohort names, build per-diagnosis cohort lists, and binary-code
# argininosuccinate from raw intensities. Results saved to clinical_data.RData.
rm(list=setdiff(ls(),grep("dir",ls(),value = T)))
require(R.utils)
require(CTD)
require(CTDext)
require(openxlsx)
url="https://static-content.springer.com/esm/art%3A10.1038%2Fs41598-022-10415-5/MediaObjects/41598_2022_10415_MOESM2_ESM.xlsx"
download.file(url,destfile = "data_mx.xlsx")
# Fixed: the original call ended with a trailing comma, which passes an
# extra empty (missing) argument to read.xlsx.
data_mx.og=read.xlsx("data_mx.xlsx",
                  sheet = "Z-score",startRow = 2,
                  colNames = T,rowNames = T,skipEmptyRows = TRUE,skipEmptyCols = TRUE)
# Keep sample columns only; drop metabolite annotation columns.
data_mx.og=data_mx.og[
  ,!colnames(data_mx.og) %in% c("PATHWAY_SORTORDER", "SUPER_PATHWAY", "SUB_PATHWAY",
                                "CHEMICAL_ID", "RI", "MASS",
                                "PUBCHEM", "CAS", "KEGG", "HMDB_ID" )]
# 6 ALAIMO samples with diagnosed diseases were also used for model training;
# duplicate their columns under the corresponding EDTA cohort names.
dupNames=matrix(nrow = 6, ncol = 2)
dupNames[,1]=c(
  "ALAIMO-68b", "ALAIMO-68a","ALAIMO-44", "ALAIMO-85", "ALAIMO-136", "ALAIMO-13" )
dupNames[,2]=c(
  "EDTA-ABAT-5", "EDTA-ABAT-6", "EDTA-AADC-2", "EDTA-ABAT-7", "EDTA-AADC-3", "EDTA-OTC-17")
temp=data_mx.og[,dupNames[,1]]
colnames(temp)=dupNames[,2]
data_mx.og=cbind(data_mx.og,temp)
# Cohort membership: sample columns are matched by their "<prefix>-" names.
models=sort(c("zsd","rcdp","arg","otc","asld","abat","aadc","adsl","cit","cob","ga","gamt","msud","mma","pa","pku"))
cohorts=sapply(c(models,"hep-ref", "edta-ref", "alaimo"),
               function(x) grep(paste(x,"-",sep = ""),colnames(data_mx.og),value = T,ignore.case = T))
names(cohorts)[names(cohorts)=="hep-ref"]="hep_refs"
names(cohorts)[names(cohorts)=="edta-ref"]="edta_refs"
# Binary code argininosuccinate: absent in raw intensities -> 0, present -> 10.
data_mx.raw=read.xlsx(system.file("dataset/dataset1.xlsx",package = "CTDext"),
                  sheet = "Raw intensity",startRow = 2,
                  colNames = T,rowNames = T,skipEmptyRows = TRUE,skipEmptyCols = TRUE)
data_mx.raw=data_mx.raw[,colnames(data_mx.og)[colnames(data_mx.og)%in%colnames(data_mx.raw)]]
ptZeroFromRaw=colnames(data_mx.raw[,is.na(unlist(data_mx.raw["argininosuccinate",]))])
pt10FromRaw=colnames(data_mx.raw[,!is.na(unlist(data_mx.raw["argininosuccinate",]))])
# Samples without raw data: treat NA or 0 z-scores as absent.
temp=data_mx.og[,!colnames(data_mx.og) %in% colnames(data_mx.raw)]
ptZeroFromOg=colnames(temp)[ is.na(unlist(temp["argininosuccinate",]))| as.numeric(unlist(temp["argininosuccinate",]))==0]
data_mx.ogBnry=data_mx.og # Jul
data_mx.ogBnry["argininosuccinate",c(ptZeroFromRaw,ptZeroFromOg)]=0
data_mx.ogBnry["argininosuccinate",pt10FromRaw]=10
# data_mx.og: load("clinical_data_July2020.RData")
# data_mx.ogBnry : "clinical_data_Oct2020.RData"
rm(list=setdiff(ls(),grep("dir|data_mx|cohorts|dupNames",ls(),value = T)))
save.image("clinical_data.RData")

## Get number of metabolites detected

``` {r counts}
# Count non-missing metabolite measurements per sample, separately for
# unnamed (rows like "x - 12345") and named metabolites.
rm(list=setdiff(ls(),c(grep("dir|model|cohort",ls(),value = T),lsf.str())))
data_mx=loadToEnv("clinical_data.RData")[["data_mx.raw"]]

# Unnamed metabolites.
data_mx_unnamed = data_mx[grep("x -", rownames(data_mx)), which(colnames(data_mx) %in% c(unlist(cohorts)))]
zscore_cts = apply(data_mx_unnamed, 2, function(i) length(which(!is.na(i))))
mean(zscore_cts)
min(zscore_cts)
max(zscore_cts)

# Named (annotated) metabolites.
data_mx_named = data_mx[-grep("x -", rownames(data_mx)), which(colnames(data_mx) %in% c(unlist(cohorts)))]
zscore_cts = apply(data_mx_named, 2, function(i) length(which(!is.na(i))))
mean(zscore_cts)
min(zscore_cts)
max(zscore_cts)
```

## Building a Gaussian Graphical Model to Diagnose 16 Different Metabolic Disorders (skip if using github-saved networks)
Metabolites represented in at least 50% of reference samples and 50% of disease cases were included in network learning. This is a time-consuming step.
``` {r learn-nets_heparin}
require(R.utils)
require(CTD)
require(huge)
rm(list=setdiff(ls(),c(grep("dir|model|cohort",ls(),value = T),lsf.str())))
data_mx=loadToEnv("clinical_data.RData")[["data_mx.ogBnry"]]
cohorts=loadToEnv("clinical_data.RData")[["cohorts"]]
# Restrict to heparin samples; drop metabolites missing in every sample.
data_mx=data_mx[,grep("hep-",colnames(data_mx),ignore.case = T)]
data_mx=data_mx[rowSums(is.na(data_mx)) != ncol(data_mx),]

# Per-metabolite missingness fraction across heparin reference samples.
refs=data_mx[,grep("ref",colnames(data_mx),ignore.case = T)]
ref_fill = apply(refs, 1, function(i) sum(is.na(i))/length(i))

# refs2=refs[which(ref_fill[rownames(data_mx)]>0.8),]
refs2=refs
# Keep only cohort members that are present in the heparin matrix.
cohorts = lapply(cohorts, function(i) i[which(i %in% colnames(data_mx))])

# We perfer training models on edta samples for various reasons (latest dataset); here we show an example of network learning on Citrullinemia disease that only contains heparin samples; include "msud", "mma", "pa", "pku", "cob", "ga", "gamt" if you want to train all 8 models
# Learn one disease network per leave-one-out fold. Per fold:
#  1. embed the held-in disease profiles with surrogate reference profiles,
#  2. fit a disease+control GLASSO network (lambda selected by StARS),
#  3. fit a control-only GLASSO network at the same lambda,
#  4. prune disease-network edges also present in the control network.
# We prefer training models on EDTA samples (latest dataset); citrullinemia
# (CIT) only has heparin samples, so it is shown here. Include "msud",
# "mma", "pa", "pku", "cob", "ga", "gamt" to train all 8 heparin models.
for (model in c("cit")) { #, "msud", "mma", "pa", "pku", "cob", "ga", "gamt"
  for (fold in 1:length(cohorts[[model]])) {
    print(sprintf("Learning graphs for diag %s, fold %d...", model, fold))
    diag_pts = cohorts[[model]][-fold]  # hold out one case per fold
    print(diag_pts)
    fill.rate = apply(data_mx[,which(colnames(data_mx) %in% diag_pts)], 1, function(i) sum(is.na(i))/length(i))
    # Keep metabolites observed in >=50% of references AND >=50% of cases.
    diag_data = data_mx[intersect(which(ref_fill<0.5), which(fill.rate<=0.50)), which(colnames(data_mx) %in% diag_pts)]
    diag_data = diag_data[which(rownames(diag_data) %in% rownames(refs2)),]
    print("Individual samples as training data. Latent variable embedding and network pruning.")
    diag_data = CTD::data.surrogateProfiles(data = diag_data, std = 1, ref_data = refs2)
    print(dim(diag_data))

    # Disease Network: GLASSO approach
    inv_covmatt = huge(t(diag_data), method="glasso")
    inv_covmatt_select = huge.select(inv_covmatt, criterion = "stars")
    inv_covmat = as.matrix(inv_covmatt_select$icov[[which(inv_covmatt_select$lambda==inv_covmatt_select$opt.lambda)]])
    diag(inv_covmat) = 0  # no self-edges
    colnames(inv_covmat) = rownames(diag_data)
    ig = graph.adjacency(as.matrix(inv_covmat), mode="undirected", weighted=TRUE, add.colnames='name')
    V(ig)$name = rownames(diag_data)
    print(ig)

    # Reference Network: GLASSO at the disease network's optimal lambda
    ref_data = data.surrogateProfiles(data = refs2, std = 1, ref_data = refs2)
    ref_data = ref_data[,-which(duplicated(colnames(ref_data)))]
    print(dim(ref_data))
    inv_covmatt = huge(t(ref_data), method="glasso", lambda = inv_covmatt_select$opt.lambda)
    inv_covmat = as.matrix(inv_covmatt$icov[[1]])
    diag(inv_covmat) = 0
    colnames(inv_covmat) = rownames(ref_data)
    ig_ref = graph.adjacency(as.matrix(inv_covmat), mode="undirected", weighted=TRUE, add.colnames='name')
    V(ig_ref)$name = rownames(ref_data)
    print(ig_ref)

    # Remove edges that also appear in the reference-only network.
    ig_pruned = graph.naivePruning(ig, ig_ref)
    print(ig_pruned)
    save(ig, ig_ref, ig_pruned, file=sprintf("%s/bg_%s_fold%d.RData",graph.dir, model, fold))
    # Fixed: the original only removed ig and ig_pruned, leaving a stale
    # ig_ref in the workspace between folds.
    rm(ig, ig_ref, ig_pruned)
  }
}

``` {r learn-networks_edta}
# EDTA counterpart of the heparin setup: restrict to EDTA samples,
# zero-fill missing argininosuccinate (binary-coded), and compute
# reference missingness.
rm(list=setdiff(ls(),c(grep("dir|model|cohort",ls(),value = T),lsf.str())))
require(CTD)
require(huge)
require(R.utils)

data_mx=loadToEnv("clinical_data.RData")[["data_mx.ogBnry"]]
cohorts=loadToEnv("clinical_data.RData")[["cohorts"]]
data_mx=data_mx[,grep("edta",colnames(data_mx),ignore.case = T)]
data_mx=data_mx[rowSums(is.na(data_mx)) != ncol(data_mx),]
data_mx["argininosuccinate", which(is.na(data_mx["argininosuccinate",]))] = 0
refs = data_mx[, which(colnames(data_mx) %in% cohorts$edta_refs)]
ref_fill = apply(refs, 1, function(i) sum(is.na(i))/length(i))
cohorts = lapply(cohorts, function(i) i[which(i %in% colnames(data_mx))])

Here we show an example of training one disease cohort that only contains EDTA samples; include "aadc", "abat", "adsl", "arg", "zsd", "rcdp", "otc" if you want to train all EDTA models.

# Learn EDTA disease networks per leave-one-out fold (same procedure as the
# heparin loop above). ASLD is shown as the example model.
for (model in c("asld")) {#,"aadc", "abat", "adsl", "arg","zsd", "rcdp","otc"
  for (fold in 1:length(cohorts[[model]])) {
    print(sprintf("Learning graphs for diag %s, fold %d...", model, fold))
    diag_pts = cohorts[[model]][-fold]  # hold out one case per fold
    print(diag_pts)
    fill.rate = apply(data_mx[,which(colnames(data_mx) %in% diag_pts)], 1, function(i) sum(is.na(i))/length(i))
    # Keep metabolites observed in >=50% of references AND >50% of cases.
    diag_data = data_mx[intersect(which(ref_fill<0.5), which(fill.rate<0.5)), which(colnames(data_mx) %in% diag_pts)]
    print(nrow(diag_data))
    print(diag_data["argininosuccinate",])
    print("Individual samples as training data. Latent variable embedding and network pruning.")
    refs2 = refs[which(rownames(refs) %in% rownames(diag_data)), ]
    diag_data = data.surrogateProfiles(data = diag_data, std = 1, ref_data = refs2)
    print(dim(diag_data))

    # Disease Network: GLASSO approach
    inv_covmatt = huge(t(diag_data), method="glasso")
    inv_covmatt_select = huge.select(inv_covmatt, criterion = "stars")
    inv_covmat = as.matrix(inv_covmatt_select$icov[[which(inv_covmatt_select$lambda==inv_covmatt_select$opt.lambda)]])
    diag(inv_covmat) = 0;
    colnames(inv_covmat) = rownames(diag_data)
    ig = graph.adjacency(as.matrix(inv_covmat), mode="undirected", weighted=TRUE, add.colnames='name')
    V(ig)$name = rownames(diag_data)
    print(ig)

    # Reference Network: GLASSO at the disease network's optimal lambda
    ref_data = data.surrogateProfiles(data = refs2, std = 1, ref_data = refs2)
    ref_data = ref_data[,-which(duplicated(colnames(ref_data)))]
    print(dim(ref_data))
    inv_covmatt = huge(t(ref_data), method="glasso", lambda = inv_covmatt_select$opt.lambda)
    inv_covmat = as.matrix(inv_covmatt$icov[[1]])
    diag(inv_covmat) = 0;
    colnames(inv_covmat) = rownames(ref_data)
    ig_ref = graph.adjacency(as.matrix(inv_covmat), mode="undirected", weighted=TRUE, add.colnames='name')
    V(ig_ref)$name = rownames(ref_data)
    print(ig_ref)

    # Remove edges shared with the reference-only network.
    ig_pruned = graph.naivePruning(ig, ig_ref)
    print(ig_pruned)
    print(degree(ig_pruned, "argininosuccinate"))
    save(ig, ig_ref, ig_pruned, file=sprintf("%s/bg_%s_fold%d.RData",graph.dir , model, fold))
    rm(ig, ig_ref, ig_pruned)

  }
}

## Network descriptive statistics

# Print node and edge counts for each saved network (disease+control,
# control-only, pruned disease-specific) across all models and folds.
for (model in c("arg", "asld", "cit", "otc", "rcdp", "zsd", "abat", "aadc", "adsl", "mma", "msud", "pa", "pku", "cob", "ga", "gamt")) { #
  print(model)
  for (fold in 1:length(cohorts[[model]])) {
    print(fold)
    load(sprintf("Clinical_paper/graphs/bg_%s_fold%d.RData", model, fold))
    print(sprintf("Disease-Control Network: %d-%d", length(V(ig)$name), length(E(ig)$weight)))
    print(sprintf("Control-only Network: %d-%d", length(V(ig_ref)$name), length(E(ig_ref)$weight)))
    print(sprintf("Disease-Specific Network: %d-%d", length(V(ig_pruned)$name), length(E(ig_pruned)$weight)))
  }
}

## Get pre-computed ranks for each Gaussian Graphical Model (skip if using github-saved ranks)
This part should preferably be run on a cluster (PBS system).
Scripts "wrapper_ranks.r" "getRanksN.r" "getRanks.pbs" are commented out.
```r
# 
# cohorts=loadToEnv("clinical_data.RData")[["cohorts"]]
# models=c(c("asld", "aadc", "abat", "adsl", "arg","zsd", "rcdp","otc"))
# cohorts[models]=sapply(models, function(x) grep("edta",cohorts[[x]],ignore.case = T,value = T))
# cohorts[["alaimo"]]=NULL
# 
# for (model in models){
#   fold.begin=1
#   fold.end=length(cohorts[[model]])
#   system(sprintf("Rscript wrapper_ranks.r %s %s %s %s %s",
#                model, fold.begin, fold.end, w.dir, rank.dir, graph.dir))
# }

######################################## wrapper_ranks.r ########################################
# require(R.utils)
# require(igraph)
# args = commandArgs(trailingOnly=TRUE)
# model=as.character(args[1])
# fold.begin=as.numeric(args[2])
# fold.end=as.numeric(args[3])
# w.dir=as.character(args[4])
# rank.dir=as.character(args[5])
# graph.dir=as.character(args[6])
# diag=model
# setwd(sprintf("%s/%s",w.dir,rank.dir))
# 
# for (fold in fold.begin:fold.end){
#     dir.create(sprintf("./%s", model), showWarnings = FALSE)
#     output_dir = sprintf("./%s/diag%s%d", model,diag,fold)
#     str = sprintf("mkdir %s", output_dir)
#     system(str)
# 
# ig = loadToEnv(sprintf("../%s/bg_%s_fold%d.RData",graph.dir,model,fold))[["ig_pruned"]]
# 
# totalN = length(V(ig)$name)
# print(sprintf("Total N = %d", totalN))
# 
# for (n in 1:totalN) {
#   str = sprintf("qsub -v diag=%s,fold=%d,n=%d,wkdir=%s,rankdir=%s,graphdir=%s getRanks.pbs",
#                 diag, fold, n, w.dir, rank.dir, graph.dir)
#   system(str, wait=FALSE)
#   system("sleep 0.1")
# }
# 
#   
# ready=FALSE
# last_sum = 0
# while (!ready) {
#   f = list.files(sprintf("./%s/diag%s%d", model, diag, fold), pattern=".RData")
#   print(sprintf("#permutation files ready = %d", length(f)))
#   if (length(f)==totalN) {
#     ready=TRUE
#   } else {
#     system("sleep 30")
#     system("rm *.pbs.*")
#     curr_sum = length(f)
#     if (curr_sum > last_sum) {
#       last_sum = curr_sum
#     } 
#   }
# }
# system("rm *.pbs.*")
#   
# # collate permutations into one R object
# all_nodes = V(ig)$name
# permutationByStartNode = list()
# for (n in 1:length(all_nodes)) {
#   load(sprintf("./%s/diag%s%d/%d.RData",model, diag, fold, n))
#   permutationByStartNode[[n]] = toupper(current_node_set)
# }
# names(permutationByStartNode) = all_nodes
# save.image(file=sprintf("./%s/%s%d-ranks.RData",model, toupper(diag), fold))
# print("collate complete...")
# str = sprintf("rm -r ./%s/diag%s%d",model, diag, fold)
# system(str)
# 
# }

################################ end of wrapper_ranks.r #########################################


################################ getRanksN.r ################################################
# require(igraph)
# require(CTD)
# require(R.utils)
# singleNode.getNodeRanksN = function(n, G, S=NULL, num.misses=NULL, verbose=FALSE) {
#   if (!is.null(num.misses)) {
#     if (is.null(S)) {
#       print("You must supply a subset of nodes as parameter S if you supply num.misses.")
#       return(0)
#     }
#   }
#   all_nodes = names(G)
#   if (verbose) {
#     print(sprintf("Calculating node rankings %d of %d.", n, length(all_nodes)))
#   }
#   current_node_set = NULL
#   stopIterating=FALSE
#   startNode = all_nodes[n]
#   currentGraph = G
#   numMisses = 0
#   current_node_set = c(current_node_set, startNode)
#   while (stopIterating==FALSE) {
#     # Clear probabilities
#     currentGraph[1:length(currentGraph)] = 0 #set probabilities of all nodes to 0
#     #determine base p0 probability
#     baseP = p0/(length(currentGraph)-length(current_node_set))
#     #set probabilities of unseen nodes to baseP
#     currentGraph[!(names(currentGraph) %in% current_node_set)] = baseP
#     # Sanity check. p0_event should add up to exactly p0 (global variable)
#     p0_event = sum(unlist(currentGraph[!(names(currentGraph) %in% current_node_set)]))
#     currentGraph = graph.diffuseP1(p1, startNode, currentGraph, current_node_set, 1, verbose=FALSE)
#     # Sanity check. p1_event should add up to exactly p1 (global variable)
#     p1_event = sum(unlist(currentGraph[!(names(currentGraph) %in% current_node_set)]))
#     if (abs(p1_event-1)>thresholdDiff) {
#       extra.prob.to.diffuse = 1-p1_event
#       currentGraph[names(current_node_set)] = 0
#       currentGraph[!(names(currentGraph) %in% names(current_node_set))] = unlist(currentGraph[!(names(currentGraph) %in% names(current_node_set))]) + extra.prob.to.diffuse/sum(!(names(currentGraph) %in% names(current_node_set)))
#     }
#     #Set startNode to a node that is the max probability in the new currentGraph
#     maxProb = names(which.max(currentGraph))
#     # Break ties: When there are ties, choose the first of the winners.
#     startNode = names(currentGraph[maxProb[1]])
#     if (!is.null(num.misses)) {
#       if (startNode %in% S) {
#         numMisses = 0
#       } else {
#         numMisses = numMisses + 1
#       }
#       current_node_set = c(current_node_set, startNode)
#       if (numMisses>num.misses || length(c(startNode,current_node_set))>=(length(G))) {
#         stopIterating = TRUE
#       }
#     } else {
#       # Keep drawing until you've drawn all nodes.
#       current_node_set = c(current_node_set, startNode)
#       if (length(current_node_set)>=(length(G))) {
#         stopIterating = TRUE
#       }
#     }
#     
#   }
#   return(current_node_set)
# }
# 
# args = commandArgs(trailingOnly=TRUE)
# diag = as.character(args[1])
# fold = as.numeric(args[2])
# n = as.numeric(args[3])
# wkdir = as.character(args[4])
# rankdir = as.character(args[5])
# graphdir = as.character(args[6])
# 
# model=diag
# print(diag)
# print(fold)
# print(n)
# 
# setwd(sprintf("%s/%s",wkdir,rankdir))
# ig = loadToEnv(sprintf("../%s/bg_%s_fold%d.RData", graphdir, model,fold))[["ig_pruned"]]
# 
# print(ig)
# V(ig)$name = tolower(V(ig)$name)
# G = vector(mode="list", length=length(V(ig)$name))
# names(G) = V(ig)$name
# adjacency_matrix = list(as.matrix(get.adjacency(ig, attr="weight")))
# p0=0.1
# p1=0.9
# thresholdDiff=0.01
# all_nodes = tolower(V(ig)$name)
# print(head(all_nodes))
# current_node_set = singleNode.getNodeRanksN(n, G)
# new_dir =  sprintf("./%s/diag%s%d",model, diag, fold)
# if (!dir.exists(new_dir)) {
#   str = sprintf("mkdir %s", new_dir)
#   system(str)
# }
# save(current_node_set, file=sprintf("./%s/diag%s%d/%d.RData",model, diag, fold, n))
# 
################################ end of getRanksN.r ##########################################

################################### getRanks.pbs ##############################################
# # Request 1 processors on 1 node
# #PBS -l nodes=1:ppn=1
# 
# #Request x number of hours of walltime
# #PBS -l walltime=1:00:00
# 
# #Request that regular output and terminal output go to the same file
# #PBS -j oe
# #PBS -m abe
# 
# module load R/3.5
# 
# diag=${diag}
# fold=${fold}
# n=${n}
# wkdir=${wkdir}
# rankdir=${rankdir}
# cd ${wkdir}/${rankdir}
# 
# Rscript ../getRanksN.r $diag $fold $n $wkdir ${rankdir} ${graphdir} > ranks:$diag-$fold-$n.out
# rm ranks:$diag-$fold-$n.out
###################################### end of getRanks.pbs ########################################

## Get the main disease module for each disease-specific network model (skip if using github-saved ranks)

rm(list=setdiff(ls(),c(grep("dir|models",ls(),value = T),lsf.str())))

# Add CTDsim distance from "downtown module" as second quantitative variable.
# Computes encoding lengths for the disease module (p1.e), a patient's most
# perturbed metabolite set (p2.e), and their union (p12.e), then derives a
# normalized compression distance (NCD) between patient and disease module.
# NOTE(review): reads globals ptID / ptID2 from the calling environment when
# mle.getEncodingLength is evaluated -- TODO confirm they are defined there.
disFromDowntown = function(dis_mod, ptBSbyK.dis, p2.sig.nodes, p2.optBS, ranks, G) {
  p1.e = mle.getEncodingLength(ptBSbyK.dis, NULL, ptID, G)[,"IS.alt"]
  p2.e = mle.getEncodingLength(p2.optBS, NULL, ptID2, G)[,"IS.alt"]

  # Using predefined node ranks, get optimal bitstring for encoding of patient1's union patient2's subsets.
  p12.sig.nodes = unique(c(dis_mod, p2.sig.nodes))
  p12.e = c()
  for (k in 1:length(ptBSbyK.dis)) {
    dis_mod_cpy = dis_mod
    p2.sig.nodes_cpy = p2.sig.nodes

    # Pad each size-k subset up to k nodes using the head of the full lists.
    dis_mod_k = names(which(ptBSbyK.dis[[k]]==1))
    p2.sig.nodes_k = names(which(p2.optBS[[k]]==1))
    while (length(dis_mod_k)<k) {
      dis_mod_k = unique(c(dis_mod_k, dis_mod_cpy[1]))
      dis_mod_cpy = dis_mod_cpy[-1]
    }
    while (length(p2.sig.nodes_k)<k) {
      p2.sig.nodes_k = unique(c(p2.sig.nodes_k, p2.sig.nodes_cpy[1]))
      p2.sig.nodes_cpy = p2.sig.nodes_cpy[-1]
    }
    p12.sig.nodes_k = sapply(unique(c(dis_mod_k, p2.sig.nodes_k)), trimws)
    p12.optBS = mle.getPtBSbyK(p12.sig.nodes_k, ranks)
    res = mle.getEncodingLength(p12.optBS, NULL, NULL, G)
    # Best-scoring subset's encoding length plus the cost of the leftover nodes.
    p12.e[k] = res[which.max(res[,"d.score"]), "IS.alt"] + log2(choose(length(G), 1))*(length(p12.sig.nodes_k)-which.max(res[,"d.score"]))
  }
  return (list(p1.e=p1.e, p2.e=p2.e, p12.e=p12.e,
               NCD=(p12.e-apply(cbind(p1.e, p2.e), 1, min))/apply(cbind(p1.e, p2.e), 1, max)))
}

# Load the binary-coded z-score matrix; restrict models trained on EDTA
# samples to their EDTA cohort members and drop the ALAIMO cohort.
data_mx=apply(loadToEnv("clinical_data.RData")[["data_mx.ogBnry"]],c(1,2),as.numeric)
cohorts=loadToEnv("clinical_data.RData")[["cohorts"]]
models=c(c("asld", "aadc", "abat", "adsl", "arg","zsd", "rcdp","otc"))
cohorts[models]=sapply(models, function(x) grep("edta",cohorts[[x]],ignore.case = T,value = T))
cohorts[["alaimo"]]=NULL
data_mx=data_mx[,colnames(data_mx) %in% unlist(cohorts)]
data_mx=data_mx[rowSums(is.na(data_mx)) != ncol(data_mx),]

table(data_mx["argininosuccinate",])
# CTD diffusion hyperparameters (globals read by CTD functions).
p0=0.1
p1=0.9
thresholdDiff=0.01

# Get downtown disease modules for all modelled disease states
disMod = list()
data_mx0=data_mx
# Use EDTA samples for models arg and asld; heparin samples for otc.
cohorts[["arg"]]=cohorts[["arg"]][grep("EDTA",cohorts[["arg"]])]
cohorts[["asld"]]=cohorts[["asld"]][grep("EDTA",cohorts[["asld"]])]
cohorts[["otc"]]=cohorts[["otc"]][grep("HEP",cohorts[["otc"]])]

# showing one example
# Derive the "downtown" disease module for each modelled disease: per
# training fold, seed CTD with the cohort's most perturbed metabolites
# (|mean z| > 2, relaxed until >= 15 seeds) and keep the best-connected
# subset; then choose between the full union module and a majority-vote
# module using a purity estimate. Showing one example model.
for (model in c("cit")) { # "aadc", "abat", "adsl", "arg", "asld", ,"otc", "cob", "ga", "gamt", "msud", "mma", "pa", "zsd", "rcdp", "pku"
  print(sprintf("%s...", toupper(model)))
  data_mx=data_mx0
  # Mean z-score profile across the disease cohort.
  mn = apply(data_mx[,which(colnames(data_mx) %in% cohorts[[model]])], 1, function(i) mean(na.omit(i)))
  mn = mn[-which(is.na(mn))]
  downtown_disease_module = c()
  for (pt in 1:length(cohorts[[model]])) {
    # ig = loadToEnv(sprintf("%s/bg_%s_fold%d.RData",graph.dir, model, pt))[["ig_pruned"]]
    ig = loadToEnv(system.file(sprintf("networks/ind_foldNets/bg_%s_fold%d.RData", model, pt),package = "CTDext"))[["ig_pruned"]]
    # ranks = loadToEnv(sprintf("%s/bg_%s_fold%d.RData",rank.dir, model, pt))[["ig_pruned"]]
    ranks = loadToEnv(system.file(sprintf("ranks/ind_ranks/%s%d-ranks.RData", model, pt),package = "CTDext"))[["permutationByStartNode"]]
    ranks = lapply(ranks, tolower)
    adjacency_matrix = list(as.matrix(get.adjacency(ig, attr="weight")))
    G = vector(mode="list", length=length(V(ig)$name))
    names(G) = V(ig)$name
    # Seed set: network metabolites with |mean z| > 2; relax the threshold
    # in 0.1 steps until at least 15 seeds are found.
    S = mn[which(abs(mn)>2)]
    S = S[which(names(S) %in% names(G))]
    it=0
    while (length(S)<15) {
      S = mn[which(abs(mn)>(2-0.1*it))]
      S = S[which(names(S) %in% names(G))]
      it = it + 1
    }
    ptBSbyK = mle.getPtBSbyK(names(S), ranks)
    res = mle.getEncodingLength(ptBSbyK, NULL, NULL, G)
    # Accumulate the best-scoring subset's members across folds.
    downtown_disease_module = c(downtown_disease_module, names(which(ptBSbyK[[which.max(res[,"d.score"])]]==1)))
  }
  print(model)
  length(unique(downtown_disease_module))

  # Estimate disease module "purity" based on 2 size thresholds and known case mean distances
  fold=1
  # ig = loadToEnv(sprintf("graphs_GMpaper/bg_%s_fold%d.RData",graph.dir, model, fold))[["ig_pruned"]]
  ig = loadToEnv(system.file(sprintf("networks/ind_foldNets/bg_%s_fold%d.RData", model, fold),package = "CTDext"))[["ig_pruned"]]
  # ranks = loadToEnv(sprintf("%s/%s%d-ranks.RData",rank.dir ,model, fold))[["permutationByStartNode"]]
  ranks = loadToEnv(system.file(sprintf("ranks/ind_ranks/%s%d-ranks.RData", model, fold),package = "CTDext"))[["permutationByStartNode"]]
  ranks = lapply(ranks, tolower)
  adjacency_matrix = list(as.matrix(get.adjacency(ig, attr="weight")))
  G = vector(mode="list", length=length(V(ig)$name))
  names(G) = V(ig)$name
  data_mx = data_mx[which(rownames(data_mx) %in% V(ig)$name), ]
  data_mx = data_mx[,which(colnames(data_mx) %in% cohorts[[model]])]
  purity_mets = list()
  purity = c()
  # Size1: Full downtown disease module
  tmp.mm = unique(downtown_disease_module)
  tmp.mm = tmp.mm[which(tmp.mm %in% names(G))]
  ptBSbyK.dis = mle.getPtBSbyK(tmp.mm, ranks)
  # NOTE(review): ptID is not defined at this point in the chunk -- TODO
  # confirm it is inherited from a previous evaluation of the environment.
  res = mle.getEncodingLength(ptBSbyK.dis, NULL, ptID, G)
  downtown_disease_mod = names(which(ptBSbyK.dis[[which.max(res[,"d.score"])]]==1))
  print(length(unique(downtown_disease_mod)))
  dd = c()
  # Mean NCD between each case and the module = purity estimate 1.
  for (pt in 1:length(cohorts[[model]])) {
    ptBSbyK.dis = mle.getPtBSbyK(unique(downtown_disease_mod), ranks)
    p2.sig.nodes = names(data_mx[order(abs(data_mx[,pt]), decreasing = TRUE), pt][1:length(unique(downtown_disease_mod))])
    p2.optBS = mle.getPtBSbyK(p2.sig.nodes, ranks)
    ctdsim = disFromDowntown(unique(downtown_disease_mod), ptBSbyK.dis, p2.sig.nodes, p2.optBS, ranks, G)
    dd[pt] = min(ctdsim$NCD)
  }
  purity_mets[[1]] = unique(downtown_disease_module)
  purity[1] = mean(dd)
  # Size2: Smaller disease module (metabolites selected in > half of folds)
  downtown_disease_module = names(which(table(downtown_disease_module)>(length(cohorts[[model]])/2)))
  tmp.mm = unique(downtown_disease_module)
  tmp.mm = tmp.mm[which(tmp.mm %in% names(G))]
  ptBSbyK.dis = mle.getPtBSbyK(unique(downtown_disease_module), ranks)
  res = mle.getEncodingLength(ptBSbyK.dis, NULL, ptID, G)
  downtown_disease_mod = names(which(ptBSbyK.dis[[which.max(res[,"d.score"])]]==1))
  print(length(unique(downtown_disease_mod)))
  dd = c()
  for (pt in 1:length(cohorts[[model]])) {
    ptBSbyK.dis = mle.getPtBSbyK(unique(downtown_disease_mod), ranks)
    p2.sig.nodes = names(data_mx[order(abs(data_mx[,pt]), decreasing = TRUE), pt][1:length(unique(downtown_disease_mod))])
    p2.optBS = mle.getPtBSbyK(p2.sig.nodes, ranks)
    ctdsim = disFromDowntown(unique(downtown_disease_mod), ptBSbyK.dis, p2.sig.nodes, p2.optBS, ranks, G)
    dd[pt] = min(ctdsim$NCD)
  }
  purity_mets[[2]] = downtown_disease_module
  purity[2] = mean(dd)

  # Selected disease module based on "purity" estimates
  downtown_disease_module = purity_mets[[which.min(purity)]]

  disMod[[model]] = downtown_disease_module
}
lapply(disMod, length)
save(disMod, file=sprintf("%s/disMod.RData",model.dir))

## Run CTD on Metabolomics Profiles to Interpret Perturbation Patterns in Disease Context

``` {r runCTD}

Note: Do not include Alaimo patients in this result — each patient must have a known diagnosis from one of the 16 disease network models, or be a healthy control (heparin or EDTA).

# Setup for running CTD: load the binary-coded matrix, drop the duplicated
# ALAIMO columns, and load the saved disease modules.
require(R.utils)
require(CTD)
rm(list=setdiff(ls(),c(grep("dir|models",ls(),value = T),lsf.str())))
data_mx=apply(loadToEnv("clinical_data.RData")[["data_mx.ogBnry"]],c(1,2),as.numeric)
cohorts=loadToEnv("clinical_data.RData")[["cohorts"]]

dupNames=loadToEnv("clinical_data.RData")[["dupNames"]]
data_mx=data_mx[,-which(colnames(data_mx) %in% dupNames[,1])]
data_mx=data_mx[rowSums(is.na(data_mx)) != ncol(data_mx),]
data_mx.0=data_mx

# CTD diffusion hyperparameters and maximum subset size.
p0=0.1
p1=0.9
thresholdDiff=0.01
kmx=30

disMod=loadToEnv(system.file("shiny-app/disMod_Oct2020.RData",package = "CTDext"))[["disMod"]]

Add the CTDsim distance from the "downtown module" as a second quantitative variable.

# Add CTDsim distance from the "downtown module" as a second quantitative
# variable. Extends the earlier disFromDowntown with a direction-aware
# Jaccard dissimilarity (JAC) between the module's mean-profile signs
# (mn_prof) and the patient's profile signs (p2_prof), alongside the NCD.
# NOTE(review): reads globals ptID / ptID2 from the calling environment --
# TODO confirm they are defined when this is called.
disFromDowntown = function(dis_mod, ptBSbyK.dis, p2.sig.nodes, p2.optBS, ranks, G, mn_prof, p2_prof) {
  p1.e = mle.getEncodingLength(ptBSbyK.dis, NULL, ptID, G)[,"IS.alt"]
  p2.e = mle.getEncodingLength(p2.optBS, NULL, ptID2, G)[,"IS.alt"]

  # Direction-aware Jaccard: tag each node with 1 (up) or 0 (down/zero)
  # before comparing the two size-k sets.
  dirSim = vector("numeric", length = length(ptBSbyK.dis))
  for (k in 1:length(ptBSbyK.dis)) {
    p1.sn = dis_mod[1:k]
    p2.sn = p2.sig.nodes[1:k]
    p1.dirs = mn_prof[p1.sn]
    p1.dirs[which(!(p1.dirs > 0))] = 0
    p1.dirs[which(p1.dirs > 0)] = 1
    p2.dirs = p2_prof[p2.sn]
    p2.dirs[which(!(p2.dirs > 0))] = 0
    p2.dirs[which(p2.dirs > 0)] = 1
    p1.snn = sprintf("%s%d", p1.sn, p1.dirs)
    p2.snn = sprintf("%s%d", p2.sn, p2.dirs)
    dirSim[k] = 1 - (length(intersect(p1.snn, p2.snn))/length(union(p1.snn, p2.snn)))
  }
  # Using predefined node ranks, get optimal bitstring for encoding of patient1's union patient2's subsets.
  p12.sig.nodes = unique(c(dis_mod, p2.sig.nodes))
  p12.e = c()
  for (k in 1:length(ptBSbyK.dis)) {
    dis_mod_cpy = dis_mod
    p2.sig.nodes_cpy = p2.sig.nodes

    # Pad each size-k subset up to k nodes using the head of the full lists.
    dis_mod_k = names(which(ptBSbyK.dis[[k]]==1))
    p2.sig.nodes_k = names(which(p2.optBS[[k]]==1))
    while (length(dis_mod_k)<k) {
      dis_mod_k = unique(c(dis_mod_k, dis_mod_cpy[1]))
      dis_mod_cpy = dis_mod_cpy[-1]
    }
    while (length(p2.sig.nodes_k)<k) {
      p2.sig.nodes_k = unique(c(p2.sig.nodes_k, p2.sig.nodes_cpy[1]))
      p2.sig.nodes_cpy = p2.sig.nodes_cpy[-1]
    }
    p12.sig.nodes_k = sapply(unique(c(dis_mod_k, p2.sig.nodes_k)), trimws)
    p12.optBS = mle.getPtBSbyK(p12.sig.nodes_k, ranks)
    res = mle.getEncodingLength(p12.optBS, NULL, NULL, G)
    # Best-scoring subset's encoding length plus the cost of the leftover nodes.
    p12.e[k] = res[which.max(res[,"d.score"]), "IS.alt"] + log2(choose(length(G), 1))*(length(p12.sig.nodes_k)-which.max(res[,"d.score"]))
  }
  return (list(p1.e=p1.e, p2.e=p2.e, p12.e=p12.e, JAC = dirSim,
               NCD=(p12.e-apply(cbind(p1.e, p2.e), 1, min))/apply(cbind(p1.e, p2.e), 1, max)))
}

First, run CTDdisMod for all models (kmx doesn't matter).

# Run CTDdisMod: score every patient against each model's selected disease
# module (kmx is irrelevant here). Fixed a duplicated "pa" entry in the
# original model vector, which re-ran that model and rewrote its files.
for (model in c("aadc","cob","pa","abat", "adsl", "arg", "asld", "cit", "otc", "ga", "gamt", "msud", "mma", "zsd", "rcdp", "pku")) { #
  data_mx=data_mx.0
  table(data_mx["argininosuccinate",])
  # Selected disease module
  mn = apply(data_mx[,which(colnames(data_mx) %in% cohorts[[model]])], 1, function(i) mean(na.omit(i)))
  mn = mn[-which(is.na(mn))]
  downtown_disease_module = disMod[[model]]
  # For each network fold, run CTDdm against several diagnoses
  for (fold in 1:length(cohorts[[model]])) {
    print(sprintf("Fold %d/%d...", fold, length(cohorts[[model]])))
    # ig = loadToEnv(sprintf("%s/bg_%s_fold%d.RData", graph.dir, model, fold))[["ig_pruned"]]
    ig = loadToEnv(system.file(sprintf("networks/ind_foldNets/bg_%s_fold%d.RData", model, fold),package = "CTDext"))[["ig_pruned"]]
    # ranks = loadToEnv(sprintf("%s/%s%d-ranks.RData",rank.dir, model, fold))[["permutationByStartNode"]]
    ranks = loadToEnv(system.file(sprintf("ranks/ind_ranks/%s%d-ranks.RData", toupper(model), fold),package = "CTDext"))[["permutationByStartNode"]]
    ranks = lapply(ranks, tolower)
    adjacency_matrix = list(as.matrix(get.adjacency(ig, attr="weight")))
    G = vector(mode="list", length=length(V(ig)$name))
    names(G) = V(ig)$name
    data_mx = data_mx[which(rownames(data_mx) %in% V(ig)$name), ]
    data_mx = data_mx[,which(colnames(data_mx) %in% unlist(cohorts))]

    # Anchor the disease module to this fold's network.
    tmp.mm = unique(downtown_disease_module)
    tmp.mm = tmp.mm[which(tmp.mm %in% names(G))]
    ptBSbyK.dis = mle.getPtBSbyK(tmp.mm, ranks)
    res = mle.getEncodingLength(ptBSbyK.dis, NULL, ptID, G)
    downtown_disease_mod = names(which(ptBSbyK.dis[[which.max(res[,"d.score"])]]==1))
    ptBSbyK.dis = mle.getPtBSbyK(downtown_disease_mod, ranks)

    df_disMod = data.frame(ptID=character(), diag=character(), ctdDisMod=numeric(), jac=numeric(), stringsAsFactors = FALSE)
    for (p in 1:ncol(data_mx)) {
      #print(sprintf("Patient %d/%d...", p, ncol(data_mx)))
      ptID = colnames(data_mx)[p]
      if (ptID %in% unlist(cohorts)) {diag = names(cohorts)[which(unlist(lapply(cohorts, function(i) ptID %in% i)))]}else{diag = "reference"}
      # CTD: get module best explained by network's connectivity
      S = data_mx[order(abs(data_mx[,p]), decreasing = TRUE),p]
      p2.sig.nodes = names(S)[1:length(downtown_disease_mod)]
      p2.optBS = mle.getPtBSbyK(p2.sig.nodes, ranks)
      ctdDisMod = disFromDowntown(downtown_disease_mod, ptBSbyK.dis, p2.sig.nodes, p2.optBS, ranks, G, mn, S)
      #if (ptID=="X606789") {print(min(ctdDisMod$NCD))}
      df_disMod[p, "ptID"] = colnames(data_mx)[p]
      df_disMod[p, "diag"] = diag[1]
      df_disMod[p, "ctdDisMod"] = min(ctdDisMod$NCD)
      df_disMod[p, "jac"] = min(ctdDisMod$JAC)
    }
    save(df_disMod, file=sprintf("%s/ctdDisMod_%s_fold%d.RData",model.dir, model, fold))

  }
}

Next, run CTD on all models with K = 30.

# Run CTD proper: for each model and fold, score every patient's top-kmx
# most perturbed metabolites against the fold's disease-specific network.
# Fixed a stray ", ," (empty vector element, a syntax error) in the original
# model vector, and a missing comma in the commented-out ranks path.
for (kmx in c(30)) {
  for (model in c( "aadc","cob","pa", "abat", "adsl", "arg", "asld", "cit", "otc", "ga", "gamt", "msud", "mma", "zsd", "rcdp", "pku")) { #
    data_mx=data_mx.0
    # table(data_mx["argininosuccinate",])
    # For each network fold, runCTD against several diagnoses
    for (fold in 1:length(cohorts[[model]])) {
      # ig = loadToEnv(sprintf("%s/bg_%s_fold%d.RData", graph.dir, model, fold))[["ig_pruned"]]
      ig = loadToEnv(system.file(sprintf("networks/ind_foldNets/bg_%s_fold%d.RData", model, fold),package = "CTDext"))[["ig_pruned"]]
      # ranks = loadToEnv(sprintf("%s/%s%d-ranks.RData", rank.dir, model, fold))[["permutationByStartNode"]]
      ranks = loadToEnv(system.file(sprintf("ranks/ind_ranks/%s%d-ranks.RData", toupper(model), fold),package = "CTDext"))[["permutationByStartNode"]]

      ranks = lapply(ranks, tolower)
      adjacency_matrix = list(as.matrix(get.adjacency(ig, attr="weight")))
      G = vector(mode="list", length=length(V(ig)$name))
      names(G) = V(ig)$name
      data_mx = data_mx[which(rownames(data_mx) %in% V(ig)$name), ]
      data_mx = data_mx[,which(colnames(data_mx) %in% unlist(cohorts))]

      df_ctd = data.frame(ptID=character(), S=character(), lenS=numeric(), optT=numeric(), fishers=numeric(), I0=numeric(), IA=numeric(), d=numeric(), stringsAsFactors = FALSE)
      r=1
      ptBSbyK = list()
      for (p in 1:ncol(data_mx)) {
        ptID = colnames(data_mx)[p]
        if (ptID %in% unlist(cohorts)) {diag = names(cohorts)[which(unlist(lapply(cohorts, function(i) ptID %in% i)))]}else{diag = "reference"}
        # Top kmx most perturbed metabolites for this patient.
        S = data_mx[order(abs(data_mx[,p]), decreasing = TRUE),p][1:kmx]
        ptBSbyK[[ptID]] = mle.getPtBSbyK(names(S), ranks)
        res = mle.getEncodingLength(ptBSbyK[[ptID]], NULL, ptID, G)
        #if (ptID=="X606789") {
        #  print(sprintf("Patient %d/%d...", p, ncol(data_mx)))
        #  print(max(res[,"d.score"]))
        #}
        # Record one row per subset size k.
        for (k in 1:kmx) {
          df_ctd[r, "ptID"] = colnames(data_mx)[p]
          df_ctd[r, "diag"] = diag[1]
          df_ctd[r, "S"] = paste(names(S)[1:k], collapse="/")
          df_ctd[r, "lenS"] = k
          df_ctd[r, "optT"] = res[k, "opt.T"]
          df_ctd[r, "I0"] = res[k, "IS.null"]
          df_ctd[r, "IA"] = res[k, "IS.alt"]
          df_ctd[r, "d"] = res[k, "d.score"]
          r = r + 1
        }
      }
      save(ptBSbyK, df_ctd, file=sprintf("%s/ctd_%s_fold%d_kmx%d.RData", model.dir, model, fold, kmx))
    }
  }
}

Row-bind the per-patient disease-module scores (df_disMod) onto the per-k CTD results (df_ctd).

# Join per-patient disease-module scores (df_disMod) onto the per-k CTD
# results (df_ctd), then replace the two intermediate files with a single
# combined model file. Fixed duplicated "cob"/"pa" entries in the model
# vector: a second pass would try to load files already deleted below.
rm(list=setdiff(ls(),c(grep("dir|cohorts",ls(),value = T),lsf.str())))

for (model in c("aadc","cob","pa", "abat", "adsl", "arg", "asld", "cit", "otc", "ga", "gamt", "msud", "mma", "zsd", "rcdp", "pku")) { ##"
  for (fold in 1:length(cohorts[[model]])) {
    load(sprintf("%s/ctdDisMod_%s_fold%d.RData",model.dir, model, fold))
    for (kmx in c(30)) {
      load(sprintf("%s/ctd_%s_fold%d_kmx%d.RData", model.dir, model, fold, kmx))
      df = data.frame(ptID=character(), S=character(), lenS=numeric(), optT=numeric(), fishers=numeric(), I0=numeric(), IA=numeric(), d=numeric(), ctdDisMod=numeric(), jac=numeric(), stringsAsFactors = FALSE)
      r = 1
      # df_ctd has kmx rows per patient; df_disMod has one row per patient.
      for (p in 1:nrow(df_disMod)) {
        for (k in 1:kmx) {
          df[r, "ptID"] = df_ctd[r,"ptID"]
          df[r, "diag"] = df_ctd[r,"diag"]
          df[r, "S"] = df_ctd[r,"S"]
          df[r, "lenS"] = df_ctd[r,"lenS"]
          df[r, "optT"] = df_ctd[r,"optT"]
          df[r, "I0"] = df_ctd[r,"I0"]
          df[r, "IA"] = df_ctd[r,"IA"]
          df[r, "d"] = df_ctd[r,"d"]
          df[r, "ctdDisMod"] = df_disMod[p,"ctdDisMod"]
          df[r, "jac"] = df_disMod[p,"jac"]
          r = r + 1
        }
      }
      save(ptBSbyK, df, file=sprintf("%s/model_%s_fold%d_kmx%d.RData", model.dir, model, fold, kmx))
      system(sprintf("rm %s/ctd_%s_fold%d_kmx%d.RData", model.dir, model, fold, kmx))
    }
    system(sprintf("rm %s/ctdDisMod_%s_fold%d.RData", model.dir, model, fold))
  }
}

Collapse per-fold signal to LOOCV signal: each test patient's score is averaged across folds, while each case's signal is the score from its left-out fold.

# Collapse per-fold scores to LOOCV scores: for each patient, average
# scores across folds; for disease cases, keep only the left-out fold's
# score. (Duplicate require() calls in the original were removed.)
require(CTD)
require(R.utils)
rm(list=setdiff(ls(),c(grep("dir|models",ls(),value = T),lsf.str())))
data_mx=apply(loadToEnv("clinical_data.RData")[["data_mx.ogBnry"]],c(1,2),as.numeric)
cohorts=loadToEnv("clinical_data.RData")[["cohorts"]]

dupNames=loadToEnv("clinical_data.RData")[["dupNames"]]
data_mx=data_mx[,-which(colnames(data_mx) %in% dupNames[,1])]
data_mx=data_mx[rowSums(is.na(data_mx)) != ncol(data_mx),]
data_mx.0=data_mx

# Sanity check: samples whose non-missing values are all zero.
which(apply(data_mx, 2, function(i) sum(na.omit(i)==0)/length(na.omit(i)))==1)
for (kmx in c(30)) { #
  for (model in c("aadc", "abat", "adsl", "arg", "asld", "cit", "otc", "cob", "ga", "gamt", "zsd", "rcdp", "msud", "mma", "pa", "pku")) { #
    df_model = data.frame(fold=numeric(), pt=numeric(), bits=numeric(), ctdDisMod=numeric(), jac=numeric(), diag=character(), stringsAsFactors = FALSE)
    for (fold in 1:length(cohorts[[model]])) {
      df = loadToEnv(sprintf("%s/model_%s_fold%d_kmx%d.RData", model.dir , model, fold, kmx))[["df"]]
      df = df[!is.na(df$ptID),]
      pts = unique(df$ptID)
      df_best = data.frame(pt=numeric(), ptID=character(), bits=numeric(), diag=character(), ctdDisMod=numeric(), jac=numeric(), stringsAsFactors = FALSE)
      for (pt in 1:length(pts)) {
        pt_data = df[which(df$ptID==pts[pt]),]
        ptID = unique(df[which(df$ptID==pts[pt]), "ptID"])
        # For cases of this model, record their index within the cohort.
        if (pt_data[1,"diag"]==model) { df_best[pt, "pt"] = which(cohorts[[model]]==ptID) }
        df_best[pt, "ptID"] = ptID
        # Best d-score across k, penalized by the number of rows considered.
        df_best[pt, "bits"] = max(df[which(df$ptID==pts[pt]), "d"])-log2(nrow(pt_data))
        df_best[pt, "jac"] = pt_data[1, "jac"]
        df_best[pt, "diag"] = unique(pt_data[,"diag"])
        df_best[pt, "ctdDisMod"] = pt_data[1, "ctdDisMod"]
      }
      df_best$bits[which(df_best$bits<0)] = 0
      df_best$fold = rep(fold, nrow(df_best))
      df_model = rbind(df_model, df_best)
    }
    # Do not do multiple hypothesis test correction here.
    #df_model$bits = -log2(p.adjust(2^-(df_model$bits), method="fdr"))
    df_model$loocv = rep(0, nrow(df_model))
    df_model$loocv[which(df_model$pt==df_model$fold)] = 1
    # Drop case rows from folds where the case was part of training.
    df_model = df_model[-intersect(which(df_model$loocv==0), which(df_model$diag==model)), ]
    b_bits = cbind(unique(df_model$ptID), sapply(unique(df_model$ptID), function(i) mean(df_model[which(df_model$ptID==i),"bits"])))
    ctdDisMod_loocv = cbind(unique(df_model$ptID), sapply(unique(df_model$ptID), function(i) mean(df_model[which(df_model$ptID==i),"ctdDisMod"])))
    jac_loocv = cbind(unique(df_model$ptID), sapply(unique(df_model$ptID), function(i) mean(df_model[which(df_model$ptID==i),"jac"])))
    df_model = df_model[-which(duplicated(df_model$ptID)),]
    df_model = df_model[order(df_model$ptID),]
    b_bits = b_bits[order(b_bits[,1]),]
    ctdDisMod_loocv = ctdDisMod_loocv[order(ctdDisMod_loocv[,1]),]
    jac_loocv = jac_loocv[order(jac_loocv[,1]),]
    df_model$bits = as.numeric(b_bits[,2])
    df_model$ctdDisMod = as.numeric(ctdDisMod_loocv[,2])
    df_model$jac = as.numeric(jac_loocv[,2])
    print(any(is.na(df_model$bits)))
    print(any(is.na(df_model$ctdDisMod)))
    # NOTE(review): kmx is reassigned from nrow(pt_data) before saving, so the
    # file name's kmx may differ from 30 -- TODO confirm this is intended.
    kmx = nrow(pt_data)
    save(df_model, file=sprintf("%s/dd-%s-loocv-kmx%d.RData",model.dir, model, kmx))
    #sapply(1:length(cohorts[[model]]), function(i) system(sprintf("rm model/model_%s_fold%d_kmx%d.RData", model, i, kmx)))
  }
}

kmx=30

# Map raw patient IDs to their coded (published) names.
load("../clnms_map-macbook.RData")
clnms_map=as.data.frame(clnms_map)

# Re-key the 6 duplicated ALAIMO samples to their coded EDTA-* names.
# (loop variable renamed from `names`, which shadowed base::names)
for (nm in dupNames[,1]){
  clnms_map[clnms_map$clnms==nm,]=dupNames[dupNames[,1]==nm,2]
}

# Fix: the original wrapped the load/map/save below in an extra
# `for (fold in 1:length(cohorts[[model]]))` loop over the SAME file.
# After the first save the ptIDs were already coded, so every later
# iteration re-ran match() against the original names, got NA, and
# overwrote ptID with NA for any model with more than one fold. The
# mapping must be applied exactly once per model file.
for(model in c("aadc", "cob", "abat", "adsl", "arg", "asld", "cit", "otc", "ga", "gamt", "msud", "mma", "pa","zsd", "rcdp", "pku")){
  load(sprintf("%s/dd-%s-loocv-kmx%d.RData",model.dir, model, kmx))
  df_model$ptID=clnms_map[,2][match(df_model$ptID,clnms_map[,1])]
  save(df_model, file=sprintf("%s/dd-%s-loocv-kmx%d.RData",model.dir, model, kmx))
}

For each kmx, get a list of patient rankings across all 16 disease-specific network models

Be sure to change ptIDs to coded ptIDs

# Assemble per-patient scores from all 16 disease models and build a
# ranked list of candidate diagnoses for every patient.
require(EmpiricalBrownsMethod)
setwd(model.dir)
models = c("aadc", "abat", "adsl", "arg", "asld", "cit", "cob", "ga", "gamt", "mma", "msud", "otc", "pa", "pku", "rcdp", "zsd")

ff = list.files(".", pattern=sprintf("loocv-kmx%d.RData", 30))
dff = loadToEnv(ff[1])[["df_model"]]
pts = dff$ptID
for (kmx in 30) { # previously also tried: 30, 5
  ff = list.files(".", pattern=sprintf("loocv-kmx%d.RData", kmx))
  df_models = list()
  for (model in 1:length(models)) {
    df_model = loadToEnv(ff[model])[["df_model"]]
    # Uncorrected and Bonferroni-corrected CTD p-values, capped at 1.
    ctd_pval_uncorrected = ifelse(2^-df_model[, "bits"]>1, 1, 2^-df_model[, "bits"])
    df_model$bits2 = -log2(p.adjust(2^-(df_model$bits), method="bonferroni"))
    ctd_pval = ifelse(2^-df_model[, "bits2"]>1, 1, 2^-df_model[, "bits2"])
    # Empirical percentile of each CTDdisMod score within this model.
    ctdDisMod_percentile = sapply(df_model[, "ctdDisMod"], function(i) length(which(df_model$ctdDisMod<=i))/nrow(df_model))
    # Brown's method combines the two dependent p-values per patient.
    combined_network = apply(cbind(ctd_pval, ctdDisMod_percentile), 1, function(i) empiricalBrownsMethod(data_matrix = rbind(df_model$bits, df_model$ctdDisMod), p_values = i, TRUE)$P_test)
    names(ctd_pval) = df_model$ptID
    names(ctdDisMod_percentile) = df_model$ptID
    names(combined_network) = df_model$ptID
    df_models[[models[model]]][["ctd_pval_uncorrected"]] = ctd_pval_uncorrected
    df_models[[models[model]]][["ctd_pval"]] = ctd_pval
    df_models[[models[model]]][["ctdDisMod_percentile"]] = ctdDisMod_percentile
    df_models[[models[model]]][["combined_network"]] = combined_network
    print(model)
  }

  pt_ranks = list()
  for (pt in 1:length(pts)) {
    ptID = pts[pt]
    # ptID_coded = clnms_map[which(clnms_map[,1]==ptID), 2]
    ptID_coded = ptID
    for (pc in ptID_coded) {
      df_pt = data.frame(model=character(), ctd=numeric(), ctdDisMod=numeric(), brown.comb=numeric(), stringsAsFactors = FALSE)
      r = 1
      for (model in 1:length(models)) {
        ctd_pval = df_models[[models[model]]][["ctd_pval"]]
        ctdDisMod_percentile = df_models[[models[model]]][["ctdDisMod_percentile"]]
        combined_network = df_models[[models[model]]][["combined_network"]]
        df_pt[r, "model"] = gsub(sprintf("dd-|-loocv-kmx%d.RData", kmx), "", ff[model])
        # NOTE(review): indexing by position `pt` assumes every model's
        # df_model lists patients in the same order -- TODO confirm.
        df_pt[r, "ctd"] = ctd_pval[pt]
        df_pt[r, "ctdDisMod"] = ctdDisMod_percentile[pt]
        df_pt[r, "brown.comb"] = combined_network[pt]   # Use Brown's combined for dependent tests
        r = r + 1
      }
      pt_ranks[[pc]] = df_pt
    }
  }
  save(pt_ranks, file=sprintf("ptRanks_kmx%d.RData", kmx))
}

AUCs

# Per-model AUCs for CTD alone, CTD+CTDdisMod (network-combined), and
# CTD+CTDdisMod+Jaccard (all combined via Brown's method).
# Fixes: dff previously declared columns (ctddismod.auc/jac.auc/
# combined.auc) that were never filled while network.auc/all.auc were
# added ad hoc; row filter used x[-which(...),] which drops every row
# when nothing matches; case/control labeling used the same footgun.
setwd("../")   # undo the setwd(model.dir) from the ranking chunk
require(pROC)
require(pls)
require(EmpiricalBrownsMethod)

dff = data.frame(kmx=numeric(), model=character(), ctd.auc=numeric(), network.auc=numeric(), all.auc=numeric(), stringsAsFactors = FALSE)
r = 1
for (model in c("aadc", "abat", "adsl", "arg", "asld", "cit", "cob", "ga", "gamt", "msud", "mma", "otc", "pa", "pku", "rcdp", "zsd")) {
  for (kmx in c(30)) {
    print(sprintf("MODEL %s...", model))
    load(sprintf("%s/dd-%s-loocv-kmx%d.RData", model.dir, model, kmx))
    df_model = df_model[!(df_model$diag %in% c("alaimo","maps")),]
    # Bonferroni-correct the CTD bits, then strip the "test_" prefix.
    df_model$bits = -log2(p.adjust(2^-(df_model$bits), method="bonferroni"))
    df_model$diag = gsub("test_", "", df_model$diag)
    print(toupper(model))
    print(dim(df_model))
    # Binary labels: 1 = diagnosed with this model's disease, 0 = rest.
    diags = df_model$diag
    is_case = diags == model
    diags[!is_case] = 0
    diags[is_case] = 1

    # AUC, CTD only
    ctd.auc = roc(diags, df_model$bits)
    print(ctd.auc$auc)

    # Shared p-values for the combined scores (computed once, reused).
    ctd_pvals = 2^-(df_model$bits)
    ctdDisMod_pvals = sapply(df_model$ctdDisMod, function(i) length(which(df_model$ctdDisMod<=i))/nrow(df_model))
    jac_pvals = sapply(df_model$jac, function(i) length(which(df_model$jac<=i))/nrow(df_model))

    # AUC, Network-combined: CTD+CTDdisMod via Brown's method
    combined_brown = apply(cbind(ctd_pvals, ctdDisMod_pvals), 1, function(i)  empiricalBrownsMethod(data_matrix = rbind(df_model$bits, df_model$ctdDisMod), p_values = i, TRUE)$P_test)
    network.auc = roc(diags, combined_brown)
    print(network.auc$auc)

    # AUC, all CTD+CTDdisMod+Jaccard, Brown's combined
    combined_brown = apply(cbind(ctd_pvals, ctdDisMod_pvals, jac_pvals), 1, function(i)  empiricalBrownsMethod(data_matrix = rbind(df_model$bits, df_model$ctdDisMod, df_model$jac), p_values = i, TRUE)$P_test)
    all.auc = roc(diags, combined_brown)
    print(all.auc$auc)

    dff[r, "kmx"] = kmx
    dff[r, "model"] = model
    dff[r, "ctd.auc"] = ctd.auc$auc
    dff[r, "network.auc"] = network.auc$auc
    dff[r, "all.auc"] = all.auc$auc
    r = r + 1
  }
}
save(dff,file=sprintf("%s/AUC_kmx%s.RData",model.dir,kmx))

require(ggplot2)

# Bar chart of the network-combined (CTD+CTDdisMod) AUC per disease
# model, annotated with cohort sizes and compact AUC labels.
dff$kmx = as.factor(dff$kmx)

temp=sapply(cohorts,length)
dff$cohort.size=paste(toupper(dff$model)," (n=",temp[dff$model],")",sep = "")

# Order bars by AUC (ascending) and freeze that order as factor levels.
dff=dff[order(dff$network.auc),]
fixed_level=dff$cohort.size
dff$cohort.size=factor(dff$cohort.size,levels = fixed_level)

# Format AUCs as ".XX" (two significant digits of 100*AUC); a perfect
# AUC renders as ".100" above, so map it to "1".
dff$AUCformat=sapply(dff$network.auc,function(x) sprintf(".%s",format(100*x,digits = 2)))
dff$AUCformat[which(dff$AUCformat==".100")]="1"

p=ggplot(dff,aes(x=cohort.size, y=network.auc)) +
  geom_bar(stat="identity",width = 0.8,fill="lightblue") +
  geom_text(aes(label=AUCformat)) +
  theme_bw()+theme(axis.text.x = element_text(angle = 45) ) +xlab("Disease model") +ylab("CTD+CTDdm AUC")
p

# Fix: write into OP.dir (created by the setup chunk); the original
# "revisionOutput/" directory is never created by this script, so the
# ggsave call failed.
ggsave(p,device = cairo_pdf,filename = sprintf("%s/16IEMauc.pdf",OP.dir),width = 5,height = 4)

## Compare Usefulness of Information Mined in a Pathway Context to an Unbiased Network Context
``` {r pathway-vs-ctd-global}
# Pathway discrimination: PLS w/ metabolites in urea cycle. Diagnose patients for urea cycle defects.
# CTD Global discrimination: CTD disease specific models call "yes" or "no" based on significance of patient's top metabolite perturbations.

require(CTD)
require(R.utils)
require(pls)
require(pROC)

# Reset workspace, keeping directory paths (*.dir), the model list, and
# all functions; reload the binarized z-score matrix and cohort lists.
rm(list=setdiff(ls(),c(grep("dir|models",ls(),value = T),lsf.str())))
data_mx=apply(loadToEnv("clinical_data.RData")[["data_mx.ogBnry"]],c(1,2),as.numeric)
cohorts=loadToEnv("clinical_data.RData")[["cohorts"]]
# models=c(c("asld", "aadc", "abat", "adsl", "arg","zsd", "rcdp","otc"))
# cohorts[models]=sapply(models, function(x) grep("edta",cohorts[[x]],ignore.case = T,value = T))
# cohorts[["alaimo"]]=NULL
# Keep only cohort samples; drop metabolites missing in every sample.
data_mx=data_mx[,colnames(data_mx) %in% unlist(cohorts)]
data_mx=data_mx[rowSums(is.na(data_mx)) != ncol(data_mx),]
data_mx.0=data_mx

# CTD hyperparameters; kmx = number of top perturbations per patient.
kmx=30
p0=0.1
p1=0.9
thresholdDiff=0.01

refs = data_mx[,which(colnames(data_mx) %in% cohorts$edta_refs)]
#cohorts$arg = c(cohorts$arg, cohorts$test_arg)
#cohorts$asld = c(cohorts$asld, cohorts$test_asld)
#cohorts$otc = c(cohorts$otc, cohorts$test_otc)
# Restrict to the four urea-cycle disorders (arg/asld/cit/otc) plus the
# EDTA reference controls, then keep metabolites with <20% missingness.
cohorts = cohorts[-which(names(cohorts) %in% c("alaimo", "maps", "abat", "aadc", "adsl", "zsd", "rcdp", "test_arg", "test_asld", "test_otc", "cob", "ga", "gamt", "msud", "mma", "pa", "pku", "hep_refs"))]
data_mx = data_mx[,which(colnames(data_mx) %in% c(unlist(cohorts)))]
fill.rate = apply(data_mx, 1, function(i) sum(is.na(i))/length(i))
data_mx = data_mx[which(fill.rate<0.20),]
data_mx = data_mx[,sort(colnames(data_mx))]
print(dim(data_mx))

# Binary-code argininosuccinate: any nonzero call becomes 1.
table(data_mx["argininosuccinate",])
data_mx["argininosuccinate", which(data_mx["argininosuccinate",]!=0)] = 1

# Arginine-metabolism pathway graph shipped with CTDext; metabolite
# nodes are the circle-shaped vertices.
load(system.file(sprintf("extdata/RData/%s.RData", "Arginine-Metabolism"), package="CTDext"))

uc_pathway_mets = V(ig)$label[which(V(ig)$shape=="circle")]
# Drop cofactors/currency metabolites and pathway-name nodes.
# NOTE(review): x[-which(...)] empties x if nothing matches; these labels
# are expected to occur in this graph -- re-check if the graph changes.
uc_pathway_mets = uc_pathway_mets[-which(uc_pathway_mets %in% c("H2O", "FAD", "O2", "NADP+", "CO2", "ADP", "NADH", "ATP", "PPi", "H+", "Mg2+", "Ca2+", "FMN", "PLP", "AMP", "Mg2+", "Mn2+", "NAD+", "ARGININE METABOLISM AND UREA CYCLE", "Aspartate Glutamate Metabolism", "Proline Metabolism", "Polyamine Metabolism", "TCA Cycle", "Pi"))]
uc_pathway_mets = uc_pathway_mets[-which(uc_pathway_mets %in% c("NADPH", "Co2+", "NH3"))]
uc_pathway_metsAll = tolower(uc_pathway_mets)
# The four classic urea-cycle markers used for the "local" pathway view.
uc_pathway_metsLocal = c("argininosuccinate", "citrulline", "arginine", "ornithine")
# For each urea-cycle disorder, compare four diagnostic feature sets by
# LOOCV PLS regression + ROC: (1) 4 local pathway metabolites, (2) all
# Arginine-Metabolism pathway metabolites, (3) CTD/CTDdisMod network
# scores, (4) all frequently detected metabolites (fill rate >= 0.8).
# Fixes: (a) the "pls-allMets" label count used pls_ctd_auc's curve
# length (copy-paste), misaligning labels whenever the two ROC curves
# differed in length; (b) x[-which(duplicated(x))] returns an EMPTY
# vector when there are no duplicates -- replaced with !duplicated().
dff_all = data.frame(diag=character(), model=character(), sensitivity=numeric(), specificity=numeric(), stringsAsFactors = FALSE)
for (model in c("arg", "asld", "cit", "otc")) {
  print(sprintf("%s...", toupper(model)))
  # Binary labels: 1 = in this model's cohort, 0 = everyone else.
  diags = colnames(data_mx)
  diags[which(!(diags %in% cohorts[[model]]))] = 0
  diags[which(diags %in% cohorts[[model]])] = 1

  # Data frame for pathway (local) view - PLS on zscores, predicting diagnosis (0=control, 1=case)
  uc_data = rbind(diags, data_mx)
  rownames(uc_data)[1] = "diag"
  uc_data = uc_data[which(rownames(uc_data) %in% c("diag", uc_pathway_metsLocal)),]
  uc_data = apply(uc_data, c(1,2), as.numeric)

  # Full pathway view
  uc_data2 = rbind(diags, data_mx)
  rownames(uc_data2)[1] = "diag"
  uc_data2 = uc_data2[which(rownames(uc_data2) %in% c("diag", uc_pathway_metsAll)),]
  uc_data2 = apply(uc_data2, c(1,2), as.numeric)

  # All frequently detected molecules (fill rate >= 0.8); remaining NAs
  # imputed to the per-sample minimum.
  uc_data3 = rbind(diags, data_mx)
  rownames(uc_data3)[1] = "diag"
  fillrate = apply(uc_data3, 1, function(x) sum(!is.na(x))/dim(uc_data3)[2])
  uc_data3 = uc_data3[fillrate>=0.8,]
  uc_data3 = apply(uc_data3, c(1,2), as.numeric)
  # Replace NAs in a vector with its observed minimum.
  NAasMin = function(x){
    if (sum(is.na(x))>0) {x[is.na(x)]=min(x,na.rm = T)}
    return(x)
  }
  for (r in 1:ncol(uc_data3)){
    uc_data3[,r]=NAasMin(uc_data3[,r])
  }

  # Data frame for CTD/CTDdisMod network-quantified variables
  load(sprintf("%s/dd-%s-loocv-kmx%d.RData", model.dir, model, kmx))
  df_model = df_model[which(df_model$ptID %in% colnames(data_mx)),]
  df_model$bits = -log2(p.adjust(2^-(df_model$bits), method="bonferroni"))
  ctd_pvals = 2^-(df_model$bits)
  ctdDisMod_pvals = sapply(df_model$ctdDisMod, function(i) length(which(df_model$ctdDisMod<=i))/nrow(df_model))
  combined_brown = apply(cbind(ctd_pvals, ctdDisMod_pvals), 1, function(i)  empiricalBrownsMethod(data_matrix = rbind(df_model$bits, df_model$ctdDisMod), p_values = i, TRUE)$P_test)
  df2_ctd = data.frame(combined=-log2(combined_brown), ctd=df_model$bits, ctddisMod=-log2(ctdDisMod_pvals), diag=as.numeric(diags))
  rownames(df2_ctd) = df_model$ptID

  # LOOCV PLS: hold out one sample, fit on the rest, predict held-out.
  dff_model = data.frame(pls_pwyLocal=numeric(), pls_pwyAll=numeric(), pls_ctd_ctddm=numeric(), pls_allfreqmets=numeric(), stringsAsFactors = FALSE)
  for (it in 1:length(diags)) {
    isTrain = c(1:length(diags))[-it]
    # Pathway Z-score feature selection: Local
    model_res = plsr(diag~., data = as.data.frame(t(uc_data[,isTrain])))
    tst_data = uc_data[,-isTrain]
    model_tst =  predict(model_res, ncomp=model_res$ncomp, newdata=as.data.frame(t(tst_data)))
    dff_model[it, "pls_pwyLocal"] = model_tst

    # Pathway Z-score feature selection: All (impute missing test values
    # with the minimum observed for those metabolites)
    model_res = plsr(diag~., data = as.data.frame(t(uc_data2[,isTrain])))
    tst_data = uc_data2[,-isTrain]
    if (length(which(is.na(tst_data))) >0) {
      tst_data[which(is.na(tst_data))] = min(na.omit(uc_data2[which(is.na(tst_data)),]))
    }
    model_tst =  predict(model_res, ncomp=model_res$ncomp, newdata=as.data.frame(t(tst_data)))
    dff_model[it, "pls_pwyAll"] = model_tst

    # CTD and CTDdm as covariates
    ctd_res = plsr(diag~., data = df2_ctd[isTrain,])
    tst_data = as.matrix(df2_ctd[-isTrain,-which(colnames(df2_ctd)=="diag")])
    if (ncol(tst_data)==1) { colnames(tst_data) = "combined" }
    ctd_tst = predict(ctd_res, ncomp=ctd_res$ncomp, newdata=tst_data)
    dff_model[it, "pls_ctd_ctddm"] = ctd_tst
  }
  # All frequently detected molecules: use pls's built-in LOO validation
  # once rather than refitting inside the loop above.
  model_res = plsr(diag~., data = as.data.frame(t(uc_data3)),validation = "LOO")
  dff_model[, "pls_allfreqmets"] = model_res$validation$pred[,,model_res$ncomp]

  pls_ctd_auc = roc(diags, dff_model$pls_ctd_ctddm)
  pls_pwyLocal_auc = roc(diags, dff_model$pls_pwyLocal)
  pls_pwyAll_auc = roc(diags, dff_model$pls_pwyAll)
  pls_allfreqmets_auc = roc(diags, dff_model$pls_allfreqmets)
  print(sprintf("PLS.ctd.ctddisMod  AUC = %.3f", pls_ctd_auc$auc))
  print(sprintf("PLS.pathwayLocal     AUC = %.3f", pls_pwyLocal_auc$auc))
  print(sprintf("PLS.pathwayAll     AUC = %.3f", pls_pwyAll_auc$auc))
  print(sprintf("PLS.allfreq     AUC = %.3f", pls_allfreqmets_auc$auc))

  # Collapse each ROC curve to the first occurrence of each specificity.
  keep1 = !duplicated(pls_pwyLocal_auc$specificities)
  keep2 = !duplicated(pls_pwyAll_auc$specificities)
  keep3 = !duplicated(pls_ctd_auc$specificities)
  keep4 = !duplicated(pls_allfreqmets_auc$specificities)

  # One long-format frame per model; each curve gets a (1, 0) anchor row.
  dff2 = data.frame(diag=character(), model=character(), sensitivity=numeric(), specificity=numeric(), stringsAsFactors = FALSE)
  nrw = 4 + sum(keep1) + sum(keep2) + sum(keep3) + sum(keep4)
  dff2[1:nrw, "diag"] = rep(model, nrw)
  dff2[1:nrw, "model"] = c(rep("pls-pwyLocal", 1+sum(keep1)),
                           rep("pls-pwyAll", 1+sum(keep2)),
                           rep("plsctd", 1+sum(keep3)),
                           rep("pls-allMets", 1+sum(keep4)))
  dff2[1:nrw, "specificity"] = c(pls_pwyLocal_auc$specificities[keep1], 1,
                                 pls_pwyAll_auc$specificities[keep2], 1,
                                 pls_ctd_auc$specificities[keep3], 1,
                                 pls_allfreqmets_auc$specificities[keep4], 1)
  dff2[1:nrw, "sensitivity"] = c(pls_pwyLocal_auc$sensitivities[keep1], 0,
                                 pls_pwyAll_auc$sensitivities[keep2], 0,
                                 pls_ctd_auc$sensitivities[keep3], 0,
                                 pls_allfreqmets_auc$sensitivities[keep4], 0)

  dff_all = rbind(dff_all, dff2)
}
save(dff_all,file=sprintf("%s/pwy-vs-global_MJM_v1.RData",OP.dir))

# load(sprintf("%s/pwy-vs-global_MJM_v1.RData",OP.dir))
require(ggplot2)
require(svglite)  # svglite device is referenced by the svg ggsave below

# Human-readable labels for the four feature sets and four diagnoses.
dff_all$model=c("Asa-Arg-Orn-Cit","Pathway","Network","Full profile")[match(dff_all$model,c("pls-pwyLocal","pls-pwyAll","plsctd","pls-allMets"))]
dff_all$model=factor(dff_all$model,levels = c("Pathway","Asa-Arg-Orn-Cit","Network","Full profile"))
dff_all$diag=c("Argininosuccinic aciduria","Arginase deficiency","Ornithine transcarbamylase deficiency","Citrullinemia")[match(dff_all$diag,c("asld","arg","otc","cit"))]

# NOTE(review): the x aesthetic is 1-specificity but the axis label says
# "specificity" (kept as in the published figure) -- confirm intended.
p=ggplot(dff_all) + geom_line(aes(x=1-specificity, y=sensitivity, colour=model), size=1.5) +
  facet_wrap(~diag) + 
  theme_bw()+
  theme(legend.position = c(0.25,0.75),
        legend.title = element_blank(),
        legend.box.background = element_rect(colour = "black"),
        legend.background = element_blank(),
        legend.key.size = unit(0.1, "cm"),
        text = element_text(family = "Arial")) +
  xlab("specificity")
p
# Fixes: the v1/v2 saves were byte-identical duplicates; the svg calls
# passed OP.dir to sprintf() templates that had no format specifier and
# wrote into "revisionOutput/", a directory this script never creates.
# Write one pdf and one svg into OP.dir instead.
ggsave(p,file=sprintf("%s/pwy-vs-global_MJM_v1.pdf",OP.dir),width = 4,height=4, device = cairo_pdf)
ggsave(p,file=sprintf("%s/pwy-vs-global_MJM_v1.svg",OP.dir),width = 6,height=6, device = svglite)

Haijes et al 2020 Comparison: Rank Diagnoses for Each Patient

``` {r rank-diagnoses}
# Fresh workspace for the diagnosis-ranking comparison (Haijes et al.).
require(EmpiricalBrownsMethod)
rm(list=setdiff(ls(),c(grep("dir|models",ls(),value = T),lsf.str())))

16 models for k=30

# Rank all 16 disease models per patient by p-value and report top-1 and
# top-3 accuracy plus differential-diagnosis (DD) statistics, for three
# scores: CTD, CTDdisMod, and Brown's combined.

# Print top-1 accuracy, top-3 accuracy, DD length (median, 5th/95th
# percentile), and the fraction of patients whose true diagnosis is in
# their DD (p < 0.05), for one p-value column of dff.
printRankStats = function(dff, pval_col) {
  pts = unique(dff$ptID)
  # Top 1 is correct diagnosis
  top1 = sapply(pts, function(i) dff[which(dff$ptID==i),][which.min(dff[which(dff$ptID==i), pval_col]),c("model", "diag")])
  top1 = sum(unlist(apply(top1, 2, function(i) i$model==i$diag)))/length(pts)
  print(top1)
  # Top 3 is correct diagnosis
  top3 = sapply(pts, function(i) dff[which(dff$ptID==i),][order(dff[which(dff$ptID==i), pval_col], decreasing = FALSE)[1:3],c("model", "diag")])
  top3 = sum(unlist(apply(top3, 2, function(i) i$diag[1] %in% i$model)))/length(pts)
  print(top3)
  # Length of DD
  len_dd = unlist(sapply(pts, function(i) length(which(dff[which(dff$ptID==i), pval_col]<0.05))))
  print(median(len_dd))
  print(quantile(len_dd, 0.05))
  print(quantile(len_dd, 0.95))
  # Percentage in DD
  dd = sapply(pts, function(i) dff[which(dff$ptID==i),][which(dff[which(dff$ptID==i), pval_col]<0.05), c("model", "diag")])
  dd = sum(unlist(apply(dd, 2, function(i) i$diag[1] %in% i$model)))/length(pts)
  print(dd)
}

for (kmx in c(30)) {
  dff = data.frame(model=character(), ptID=character(), diag=character(), ctd_pval=numeric(), ctdDisMod_pval=numeric(), combined_pval=numeric(), stringsAsFactors = FALSE)
  r = 1
  print(sprintf("Kmx = %d", kmx))
  # Include "asld" and "otc" in the 16 model run; comment them out for the 14 model run.
  for (model in c("aadc", "abat", "adsl", "arg", "cit", "cob", "ga", "gamt", "msud", "mma", "pa", "pku", "rcdp", "zsd", "asld", "otc")) {
    load(sprintf("%s/dd-%s-loocv-kmx%d.RData", model.dir, model, kmx))
    # Bonferroni-correct the CTD bits (the saved scores are uncorrected).
    df_model$bits = -log2(p.adjust(2^-(df_model$bits), method="bonferroni"))
    df_model$diag = gsub("test_", "", df_model$diag)
    # Only rank diagnosed patients (not healthy reference controls).
    # Fix: was [-which(...),], which drops ALL rows when nothing matches.
    df_model = df_model[!(df_model$diag %in% c("maps", "hep_refs", "edta_refs")),]

    ctd_pvals = ifelse(2^-(df_model$bits)>1, 1, 2^-(df_model$bits))
    ctdDisMod_pvals = sapply(df_model$ctdDisMod, function(i) length(which(df_model$ctdDisMod<=i))/nrow(df_model))
    jac_pvals = sapply(df_model$jac, function(i) length(which(df_model$jac<=i))/nrow(df_model))
    combined_brown = apply(cbind(ctd_pvals, ctdDisMod_pvals, jac_pvals), 1, function(i)  empiricalBrownsMethod(data_matrix = rbind(df_model$bits, df_model$ctdDisMod, df_model$jac), p_values = i, TRUE)$P_test)

    for (pt in 1:nrow(df_model)) {
      dff[r, "model"] = model
      dff[r, "ptID"] = df_model$ptID[pt]
      dff[r, "diag"] = df_model$diag[pt]
      dff[r, "ctd_pval"] = ctd_pvals[pt]
      dff[r, "ctdDisMod_pval"] = ctdDisMod_pvals[pt]
      dff[r, "combined_pval"] = combined_brown[pt]
      r = r + 1
    }
    save(dff,file=sprintf("%s/ptpval_kmx%s.RData",OP.dir,kmx))
  }

  # CTD only
  printRankStats(dff, "ctd_pval")
  # CTDdisMod only
  printRankStats(dff, "ctdDisMod_pval")
  # Combined Network
  printRankStats(dff, "combined_pval")
}

15 models for k=30

# 15-model run (OTC excluded): rank remaining diagnoses per patient
# using Brown's combined p-value only.
for (kmx in c(30)) { #seq(30,30,5)
  dff = data.frame(model=character(), ptID=character(), diag=character(), ctd_pval=numeric(), ctdDisMod_pval=numeric(), combined_pval=numeric(), stringsAsFactors = FALSE)
  r = 1
  print(sprintf("Kmx = %d", kmx))
  for (model in c("aadc", "abat", "adsl", "arg", "asld", "cit", "cob", "ga", "gamt", "msud", "mma", "pa", "pku", "rcdp", "zsd")) {
    load(sprintf("%s/dd-%s-loocv-kmx%d.RData", model.dir, model, kmx))
    df_model$bits = -log2(p.adjust(2^-(df_model$bits), method="bonferroni"))
    df_model$diag = gsub("test_", "", df_model$diag)
    # Exclude OTC patients and reference controls from the ranking.
    # Fix: was [-which(...),], which drops ALL rows when nothing matches.
    df_model = df_model[!(df_model$diag %in% c("maps", "hep_refs", "edta_refs", "otc")),]

    ctd_pvals = ifelse(2^-(df_model$bits)>1, 1, 2^-(df_model$bits))
    ctdDisMod_pvals = sapply(df_model$ctdDisMod, function(i) length(which(df_model$ctdDisMod<=i))/nrow(df_model))
    jac_pvals = sapply(df_model$jac, function(i) length(which(df_model$jac<=i))/nrow(df_model))
    combined_brown = apply(cbind(ctd_pvals, ctdDisMod_pvals, jac_pvals), 1, function(i)  empiricalBrownsMethod(data_matrix = rbind(df_model$bits, df_model$ctdDisMod, df_model$jac), p_values = i, TRUE)$P_test)

    for (pt in 1:nrow(df_model)) {
      dff[r, "model"] = model
      dff[r, "ptID"] = df_model$ptID[pt]
      dff[r, "diag"] = df_model$diag[pt]
      dff[r, "ctd_pval"] = ctd_pvals[pt]
      dff[r, "ctdDisMod_pval"] = ctdDisMod_pvals[pt]
      dff[r, "combined_pval"] = combined_brown[pt]
      r = r + 1
    }
  }

  # Combined Network
  # Top 1 is correct diagnosis
  top1 = sapply(unique(dff$ptID), function(i) dff[which(dff$ptID==i),][which.min(dff[which(dff$ptID==i), "combined_pval"]),c("model", "diag")])
  top1 = sum(unlist(apply(top1, 2, function(i) i$model==i$diag)))/length(unique(dff$ptID))
  print(top1)
  # Top 3 is correct diagnosis
  top3 = sapply(unique(dff$ptID), function(i) dff[which(dff$ptID==i),][order(dff[which(dff$ptID==i), "combined_pval"], decreasing = FALSE)[1:3],c("model", "diag")])
  top3 = sum(unlist(apply(top3, 2, function(i) i$diag[1] %in% i$model)))/length(unique(dff$ptID))
  print(top3)
  # Length of DD
  len_dd = unlist(sapply(unique(dff$ptID), function(i) length(which(dff[which(dff$ptID==i),"combined_pval"]<0.05))))
  print(median(len_dd))
  print(quantile(len_dd, 0.05))
  print(quantile(len_dd, 0.95))
  # Percentage in DD
  dd = sapply(unique(dff$ptID), function(i) dff[which(dff$ptID==i),][which(dff[which(dff$ptID==i),"combined_pval"]<0.05), c("model", "diag")])
  dd = sum(unlist(apply(dd, 2, function(i) i$diag[1] %in% i$model)))/length(unique(dff$ptID))
  print(dd)
}

Get number of DDs in reference patients only

# Differential-diagnosis counts for reference (control) patients only:
# score each control against all 16 disease models.
dff = data.frame(model=character(), ptID=character(), diag=character(), ctd_pval=numeric(), ctdDisMod_pval=numeric(), combined_pval=numeric(), stringsAsFactors = FALSE)
r = 1
for (model in c("aadc", "abat", "adsl", "arg", "asld", "cit", "cob", "ga", "gamt", "msud", "mma", "otc", "pa", "pku", "rcdp", "zsd")) {
  print(sprintf("MODEL %s...", model))
  load(sprintf("%s/dd-%s-loocv-kmx30.RData", model.dir, model))
  df_model$bits = -log2(p.adjust(2^-df_model$bits, method="bonferroni"))
  df_model$diag = gsub("test_", "", df_model$diag)
  # Keep only the healthy reference cohorts.
  df_model = df_model[which(df_model$diag %in% c("hep_refs", "edta_refs")),]

  ctd_pvals = ifelse(2^-(df_model$bits)>1, 1, 2^-(df_model$bits))
  ctdDisMod_pvals = sapply(df_model$ctdDisMod, function(i) length(which(df_model$ctdDisMod<=i))/nrow(df_model))
  jac_pvals = sapply(df_model$jac, function(i) length(which(df_model$jac<=i))/nrow(df_model))
  combined_brown = apply(cbind(ctd_pvals, ctdDisMod_pvals, jac_pvals), 1, function(i) empiricalBrownsMethod(data_matrix = rbind(df_model$bits, df_model$ctdDisMod, df_model$jac), p_values = i, TRUE)$P_test)

  for (pt in 1:nrow(df_model)) {
    dff[r, "model"] = model
    dff[r, "ptID"] = df_model$ptID[pt]
    dff[r, "diag"] = df_model$diag[pt]
    dff[r, "ctd_pval"] = ctd_pvals[pt]
    dff[r, "ctdDisMod_pval"] = ctdDisMod_pvals[pt]
    dff[r, "combined_pval"] = combined_brown[pt]
    r = r + 1
  }
}

Length of DD

# Differential-diagnosis length per reference patient (combined p < 0.05).
len_dd = unlist(sapply(unique(dff$ptID), function(i) length(which(dff[which(dff$ptID==i), "combined_pval"] < 0.05))))
median(len_dd)
quantile(len_dd, 0.05)
quantile(len_dd, 0.95)

## Zellweger Spectrum Disorders: PEX1 (Peroxisome Biogenesis disorders) vs. PEX7 (RCDP)
``` {r single-node_ptSim}
rm(list=setdiff(ls(),c(grep("dir|models",ls(),value = T),lsf.str())))
# Compute the patient similarity between negative controls, true cases and the "unknown disease" patient cohort.
require(R.utils)
require(CTD)
mkdirs("ptSim")
# CTD hyperparameters (kmx = number of top perturbations per patient).
thresholdDiff=0.01
kmx=30
p0=0.1
p1=0.9

# Original (non-binarized) z-score matrix, restricted to cohort samples.
data_mx=apply(loadToEnv("clinical_data.RData")[["data_mx.og"]],c(1,2),as.numeric)
cohorts=loadToEnv("clinical_data.RData")[["cohorts"]]
data_mx=data_mx[,colnames(data_mx) %in% unlist(cohorts)]
data_mx=data_mx[rowSums(is.na(data_mx)) != ncol(data_mx),]
data_mx.0=data_mx



refs = data_mx[,which(colnames(data_mx) %in% cohorts$edta_refs)]
data_mx = data_mx[,which(colnames(data_mx) %in% unlist(cohorts))]
# For every ZSD/RCDP fold network, compute all pairwise patient
# distances (NCD and directional similarity) at each k up to kmx-1.
for (model in c("rcdp", "zsd")) { 
  for (fold in 1:length(cohorts[[model]])) {
    # Load pre-computed node ranks (run on cluster in parallel and collated)
    # ranks = loadToEnv(sprintf("%s/%s%d-ranks.RData", rank.dir, toupper(model), fold))[["permutationByStartNode"]]
    ranks  = loadToEnv(system.file(sprintf("ranks/ind_ranks/%s%d-ranks.RData", toupper(model), fold),package = "CTDext"))[["permutationByStartNode"]]
    ig = loadToEnv(system.file(sprintf("networks/ind_foldNets/bg_%s_fold%d.RData", model, fold),package = "CTDext"))[["ig_pruned"]]
    # ig = loadToEnv(sprintf("%s/bg_%s_fold%d.RData", graph.dir, model, fold))[["ig_pruned"]]
    ranks = lapply(ranks, tolower)
    # Name each permutation by its start node (first element).
    names(ranks) = as.character(lapply(ranks, function(i) i[[1]][1]))
    adjacency_matrix = list(as.matrix(get.adjacency(ig, attr="weight")))
    # NOTE(review): data_mx is reassigned inside the loop, so the matrix
    # shrinks cumulatively across folds/models -- confirm intended.
    data_mx = data_mx[which(rownames(data_mx) %in% V(ig)$name), ]

    # Get patient bitstrings ahead of time
    ptBSbyK = list()
    for (pt in 1:ncol(data_mx)) {
      print(pt)
      ptID = colnames(data_mx)[pt]
      # Top-kmx most perturbed metabolites (by |z|) for this patient.
      S1 = rownames(data_mx)[order(abs(data_mx[,ptID]), decreasing = TRUE)][1:kmx]
      ptBSbyK[[ptID]] = mle.getPtBSbyK(S1, ranks)
    }

    # res[[k]] holds two patient-by-patient matrices at module size k:
    # $ncd (normalized compression distance) and $jac (directional sim).
    res = list()
    t = list(ncd=matrix(NA, nrow=ncol(data_mx), ncol=ncol(data_mx)), jac=matrix(NA, nrow=ncol(data_mx), ncol=ncol(data_mx)))
    rownames(t$ncd) = colnames(data_mx)
    colnames(t$ncd) = colnames(data_mx)
    rownames(t$jac) = colnames(data_mx)
    colnames(t$jac) = colnames(data_mx)
    for (i in 1:(kmx-1)) { res[[i]] = t }
    # Pairwise upper-triangle loop; distances are symmetric, so each
    # value is mirrored into both triangles.
    for (pt in 1:(ncol(data_mx)-1)) {
      print(pt)
      ptID = colnames(data_mx)[pt]
      for (pt2 in (pt+1):ncol(data_mx)) {  
        print(sprintf(" vs. %d...", pt2))
        ptID2 = colnames(data_mx)[pt2]
        tmp = mle.getPtDist(ptBSbyK[[ptID]], ptID, ptBSbyK[[ptID2]], ptID2, data_mx, ranks)
        for (k in 1:(kmx-1)) {
          res[[k]]$ncd[ptID, ptID2] = tmp$NCD[k]
          res[[k]]$jac[ptID, ptID2] = tmp$dirSim[k]
          res[[k]]$ncd[ptID2, ptID] = tmp$NCD[k]
          res[[k]]$jac[ptID2, ptID] = tmp$dirSim[k]
        }
      }
    }
    # Normalize rows by max value size k
    res_norm = res
    for (k in 1:(kmx-1)) {
      diag(res_norm[[k]]$ncd) = 0
      diag(res_norm[[k]]$jac) = 0
      res_norm[[k]]$ncd = res_norm[[k]]$ncd/max(na.omit(res_norm[[k]]$ncd))
    }
    save(res, res_norm, file=sprintf("ptSim/ptSim_%s%d_kmx%d.RData", model, fold, kmx))
  }
}


# For RCDP and ZSD similarity matrices, take the min patient distance for each k, and then across k, for each pairwise patient comparison. You will end up with one similarity matrix.
rm(list=setdiff(ls(),c(grep("dir|models",ls(),value = T),lsf.str())))
require(CTD)
require(R.utils)

# Reload the original z-score matrix and cohort membership.
data_mx=apply(loadToEnv("clinical_data.RData")[["data_mx.og"]],c(1,2),as.numeric)
cohorts=loadToEnv("clinical_data.RData")[["cohorts"]]
data_mx=data_mx[,colnames(data_mx) %in% unlist(cohorts)]
data_mx=data_mx[rowSums(is.na(data_mx)) != ncol(data_mx),]
data_mx.0=data_mx

kmx=30
# Element-wise mean across a list of equally-sized similarity matrices.
#
# @param allSimMatrices  non-empty list of numeric matrices, all the
#                        same dimensions.
# @return one matrix: the per-cell mean across the list.
# Fix: the original looped `for (ind in 2:length(allSimMatrices))`,
# which indexes out of bounds for a length-1 list because 2:1 == c(2,1).
mle.getMeanPtDistance = function (allSimMatrices) {
  stopifnot(length(allSimMatrices) > 0)
  Reduce(`+`, allSimMatrices) / length(allSimMatrices)
}
# Collapse per-fold distance lists to one normalized NCD matrix per model:
# min across k within a fold, then mean across folds.
all_models_list = list()
for (model in c("rcdp", "zsd")) {
  print(model)
  model_list = list()
  for (fold in 1:length(cohorts[[model]])) {
    print(fold)
    # Fix: the pairwise-distance chunk above saves to
    # ptSim/ptSim_<model><fold>_kmx<kmx>.RData (no kmx subdirectory);
    # the old load path pointed at ptSim/kmx<kmx>/... and could not
    # find the files this script just wrote.
    res = loadToEnv(sprintf("ptSim/ptSim_%s%d_kmx%d.RData", model, fold, kmx))[["res"]]
    res = lapply(res, function(i) i$ncd)
    model_list[[fold]] = mle.getMinPtDistance(res)
  }
  res_ncd = mle.getMeanPtDistance(model_list)
  diag(res_ncd) = 0
  res_ncd = res_ncd / max(res_ncd)
  # Restore patient names from any one fold's raw matrices.
  res = loadToEnv(sprintf("ptSim/ptSim_%s%d_kmx%d.RData", model, 1, kmx))[["res"]]
  colnames(res_ncd) = colnames(res[[1]]$ncd)
  rownames(res_ncd) = colnames(res[[1]]$ncd)
  all_models_list[[model]] = res_ncd
}
save(all_models_list, file=sprintf("ptSim/all_models_list_kmx%d.RData", kmx))


rm(list=setdiff(ls(),c(grep("dir|models",ls(),value = T),lsf.str())))
require(CTD)
require(R.utils)

data_mx=apply(loadToEnv("clinical_data.RData")[["data_mx.og"]],c(1,2),as.numeric)
cohorts=loadToEnv("clinical_data.RData")[["cohorts"]]
data_mx=data_mx[,colnames(data_mx) %in% unlist(cohorts)]
data_mx=data_mx[rowSums(is.na(data_mx)) != ncol(data_mx),]
data_mx.0=data_mx


# Patient-patient NCD (network compression distance) analysis at kmx = 30,
# followed by a 2-D classical-MDS embedding of the averaged distance matrix.
kmx=30
load(sprintf("ptSim/all_models_list_kmx%d.RData", kmx))

# ZSD patients aged 10 and under get their own label in the plot below.
wangler_ages10_andDown = c("EDTA-ZSD-1", "EDTA-ZSD-2", "EDTA-ZSD-3", "EDTA-ZSD-7", 
                           "EDTA-ZSD-9", "EDTA-ZSD-10", "EDTA-ZSD-11", "EDTA-ZSD-13",
                           "EDTA-ZSD-14", "EDTA-ZSD-17", "EDTA-ZSD-18")
# Drop metabolite rows whose names contain "x - " (presumably unidentified
# compounds -- TODO confirm naming convention in the source spreadsheet).
# NOTE(review): if no row matched, -grep(...) would be -integer(0) and drop
# EVERY row; safe only while such rows exist in data_mx.
data_mx = data_mx[-grep("x - ", rownames(data_mx)),]
# EDTA reference-sample submatrix (not referenced again in this section).
refs = data_mx[,which(colnames(data_mx) %in% cohorts$edta_refs)]
# Average the per-model NCD matrices, zero the diagonal, rescale to [0, 1].
res_overall = mle.getMeanPtDistance(all_models_list)
diag(res_overall) = 0
res_overall = res_overall / max(res_overall)
colnames(res_overall) = colnames(all_models_list[[1]])
# Restrict to ZSD, RCDP, and EDTA reference samples.
# NOTE(review): only colnames are assigned above; subsetting rows by `ind`
# assumes rows are in the same order as columns.
ind = which(colnames(res_overall) %in% unlist(cohorts[c("zsd", "rcdp", "edta_refs")]))
res_overall = res_overall[ind, ind]
# Human-readable diagnosis label per sample, in column order.
diags = colnames(res_overall)
diags[which(diags %in% wangler_ages10_andDown)] = "ZSD (<10 y.o.)"
diags[which(diags %in% cohorts$zsd)] = "ZSD (10+ y.o.)" 
diags[which(diags %in% cohorts$rcdp)] = "RCDP"
diags[which(diags %in% cohorts$edta_refs)] = "REF"

# Classical MDS of the NCD distances into 2 dimensions (3-D variant commented out).
fitSim = cmdscale(res_overall, eig = FALSE, k = 2)
x = round(fitSim[, 1], 2)
y = round(fitSim[, 2], 2)
#z = round(fitSim[, 3], 2)
#df = data.frame(x = x, y = y, z = z, color = diags, label = colnames(res_overall))
#p = plot_ly(df, x = ~x, y = ~y, z = ~z, color = ~color, text = ~label, marker = list(size = 20))
df = data.frame(x = x, y = y, color = diags, label = colnames(res_overall))
# NOTE(review): plot_ly()/orca() require the plotly package, which is not
# loaded in this chunk -- presumably attached earlier in the document; confirm.
p = plot_ly(df, x = ~x, y = ~y, color = ~color, text = ~label, marker = list(size = 20))
orca(p, sprintf("ptSim/CTDncd_origZscore_kmx%d.svg", kmx), format = "svg")

# Get UNK patients that cluster with PEX1 or PEX7 samples
# K-means seeded at the per-class centroids, so clusters 1/2/3 correspond to
# ZSD/RCDP/REF respectively.
# drop = FALSE keeps single-row subsets as matrices, so colMeans() (the
# matrix equivalent of apply(x, 2, mean)) works even for a one-sample group.
zsd.centroid = colMeans(res_overall[grep("ZSD", diags), , drop = FALSE])
rcdp.centroid = colMeans(res_overall[which(diags=="RCDP"), , drop = FALSE])
ref.centroid = colMeans(res_overall[which(diags=="REF"), , drop = FALSE])
kmns = kmeans(res_overall, centers = rbind(zsd.centroid, rcdp.centroid, ref.centroid))
# Character matrix: cluster assignment vs. true diagnosis per patient.
df = cbind(as.numeric(kmns$cluster), diags, colnames(all_models_list[[1]])[ind])
colnames(df) = c("cluster.num", "diag", "ptID")
# df is a character matrix, so compare against quoted cluster ids throughout
# (the original mixed "1"/"2" with a bare 3).
zsd.cluster = df[which(df[,1]=="1"), ]
rcdp.cluster = df[which(df[,1]=="2"), ]
ref.cluster = df[which(df[,1]=="3"), ]
# Purity of a clustering: the fraction of samples whose true class is the
# majority class of the cluster they were assigned to.
#   clusters - cluster assignment per sample
#   classes  - true class label per sample (same length as `clusters`)
# Returns a value in (0, 1]; 1 means every cluster is single-class.
clusterPurity <- function(clusters, classes) {
  contingency <- table(classes, clusters)
  majority_per_cluster <- apply(contingency, 2, max)
  sum(majority_per_cluster) / length(clusters)
}
clusterPurity(df[,1], df[,2])


# Compare to euclidean distance + MDS
# BUG FIX: the original cleanup removed `diags`, `ind` and `res_overall`,
# which are still referenced below (plot colors/labels and the k-means
# bookkeeping), causing "object not found" errors. Keep them across the reset.
rm(list=setdiff(ls(),c(grep("dir|models|diags|ind|res_overall",ls(),value = T),lsf.str())))
data_mx=apply(loadToEnv("clinical_data.RData")[["data_mx.og"]],c(1,2),as.numeric)
cohorts=loadToEnv("clinical_data.RData")[["cohorts"]]
data_mx=data_mx[,colnames(data_mx) %in% unlist(cohorts)]
data_mx=data_mx[rowSums(is.na(data_mx)) != ncol(data_mx),]
data_mx.0=data_mx

# Restrict to ZSD / RCDP / EDTA reference samples and embed their pairwise
# euclidean distances with classical MDS (2 dimensions).
data_mx = data_mx[,which(colnames(data_mx) %in% c(cohorts$zsd, cohorts$rcdp, cohorts$edta_refs))]
ptSim_euc = as.matrix(dist(t(data_mx), method="euclidean"))
fitSim = cmdscale(ptSim_euc, eig = FALSE, k = 2)
x = round(fitSim[, 1], 2)
y = round(fitSim[, 2], 2)
# NOTE(review): labels/colors reuse `diags` and colnames(res_overall) from the
# NCD section -- assumes the euclidean matrix has the same sample order; verify.
df = data.frame(x = x, y = y, color = diags, label = colnames(res_overall))
p_euc = plot_ly(df, x = ~x, y = ~y, color = ~color, text = ~label, marker = list(size = 20))
orca(p_euc, "ptSim/zsd_rcdp_euc_origZscore.svg", format = "svg")

# K-means on the euclidean distance matrix, seeded at the class centroids.
zsd.centroid = apply(ptSim_euc[which(colnames(ptSim_euc) %in% cohorts$zsd),], 2, mean)
rcdp.centroid = apply(ptSim_euc[which(colnames(ptSim_euc) %in% cohorts$rcdp),], 2, mean)
ref.centroid = apply(ptSim_euc[which(colnames(ptSim_euc) %in% cohorts$edta_refs),], 2, mean)
kmns = kmeans(ptSim_euc, centers = rbind(zsd.centroid, rcdp.centroid, ref.centroid))
df = cbind(as.numeric(kmns$cluster), diags, colnames(all_models_list[[1]])[ind])
colnames(df) = c("cluster.num", "diag", "ptID")
zsd.cluster = df[which(df[,1]=="1"), ]
rcdp.cluster = df[which(df[,1]=="2"), ]
ref.cluster = df[which(df[,1]=="3"), ]
# clusterPurity() is defined earlier in the file and survives the workspace
# reset (lsf.str() keeps functions), so the duplicate definition was removed.
clusterPurity(df[,1], df[,2])


# Compare to mahalanobis distance + basic plot
require(CTDext)
require(ggplot2)  # ggplot() is used below; only attached later in the file otherwise
# Impute missing values using the EDTA reference samples as the background.
data_mx = data.imputeData(data_mx, data_mx[,which(colnames(data_mx) %in% cohorts$edta_refs)])
# BUG FIX: t(data_mx) is samples x metabolites, so `center` must hold one mean
# per METABOLITE. colMeans(data_mx) gave one mean per sample (wrong length, so
# mahalanobis() fails its dimension check); rowMeans(data_mx) is the
# per-metabolite mean, i.e. colMeans(t(data_mx)).
ptSim_mah = mahalanobis(t(data_mx), 
                        center=rowMeans(data_mx), 
                        cov(t(data_mx)), inverted = FALSE, tol=1e-50)
# One mahalanobis distance per sample, colored by diagnosis.
# NOTE(review): `diags` must still be present in the workspace from the NCD
# section for the coloring to work.
ggplot() + geom_point(aes(1:ncol(data_mx), ptSim_mah, colour=diags))




# Visualize main disease module for ZSD and RCDP
rm(list=setdiff(ls(),c(grep("dir|models",ls(),value = T),lsf.str())))
require(R.utils)
load(system.file("shiny-app/disMod_July2020.RData", package = "CTDext"))
# Selected disease module for ZSD
mets = disMod$zsd
ig = loadToEnv(system.file("networks/ind_foldNets/bg_zsd_fold1.RData",package = "CTDext"))[["ig_pruned"]]
# Keep only module metabolites present in this fold's pruned network --
# subgraph() errors on unknown vertex names (same guard the RCDP section uses).
ig = subgraph(ig, v=mets[which(mets %in% V(ig)$name)])
# Scale edge widths by |partial weight| and color the module purple.
E(ig)$weight = abs(E(ig)$weight)*100
E(ig)$color = "#6a0dad"
coords = layout.fruchterman.reingold(ig)
png("ptSim/zsd-blowout_origZscore.png")
plot(ig, layout=coords, edge.width=E(ig)$weight)
dev.off()
# Redraw with vertex labels blanked for the publication SVG.
V(ig)$name = rep("", length(V(ig)$name))
svg("ptSim/zsd-blowout_origZscore.svg")
plot(ig, layout=coords, edge.width=E(ig)$weight)
dev.off()


# Selected disease module for RCDP
mets = disMod$rcdp
# BUG FIX: the original line had an unbalanced parenthesis (syntax error) and
# omitted the `package` argument, so system.file() would have returned "".
ig = loadToEnv(system.file("networks/ind_foldNets/bg_rcdp_fold3.RData", package = "CTDext"))[["ig_pruned"]]
# Keep only module metabolites present in this fold's pruned network.
ig = subgraph(ig, v=mets[which(mets %in% V(ig)$name)])
# Scale edge widths by |partial weight| and color the module green.
E(ig)$weight = abs(E(ig)$weight)*100
E(ig)$color = "#228B22"
coords = layout.fruchterman.reingold(ig)
png("ptSim/rcdp-blowout_origZscore.png")
plot(ig, layout=coords, edge.width=E(ig)$weight)
dev.off()
# Redraw with vertex labels blanked for the publication SVG.
V(ig)$name = rep("", length(V(ig)$name))
svg("ptSim/rcdp-blowout_origZscore.svg")
plot(ig, layout=coords, edge.width=E(ig)$weight)
dev.off()

Model Sensitivity and Specificity

# Build per-patient top-1 / top-3 model calls from the combined p-value table
# `dff` (one row per patient x disease model; uses columns ptID, model, diag,
# combined_pval).
require(pROC)
require(R.utils)
require(caret)
require(dplyr)
kmx=30
load(sprintf("%s/ptpval_kmx%s.RData",OP.dir,kmx))
# Top-1 call: for each patient, the model with the smallest combined p-value.
# sapply() over data.frame rows returns a list-matrix, hence the t()/unlist below.
top1 = sapply(unique(dff$ptID), function(i) dff[which(dff$ptID==i),][which.min(dff[which(dff$ptID==i), "combined_pval"]),c("model", "diag")])
correct.top1=unlist(apply(top1, 2, function(i) i$model==i$diag))
top1=as.data.frame(t(top1))
top1=as.data.frame(apply(top1,2,unlist))
# Top-3 calls: the three models with the smallest combined p-values per patient.
top3 = sapply(unique(dff$ptID), function(i) dff[which(dff$ptID==i),][order(dff[which(dff$ptID==i), "combined_pval"], decreasing = FALSE)[1:3],c("model", "diag")])
top3=as.data.frame(top3)
# TRUE when the patient's diagnosis appears among its three called models.
correct.top3=unlist(apply(top3, 2, function(i) i$diag[1] %in% i$model))

# One metrics row per disease model (filled by the loop below).
all.models=c("aadc", "abat", "adsl", "arg", "cit", "cob", "ga", "gamt", "msud", "mma", "pa", "pku", "rcdp", "zsd", "asld", "otc")
df.ModelMetrics=data.frame(matrix(ncol = 8,nrow=length(all.models)),stringsAsFactors = F)
rownames(df.ModelMetrics)=all.models
colnames(df.ModelMetrics)=c("Model","Sensitivity.top1","Specificity.top1","Accuracy.top1","Sensitivity.top3","Specificity.top3","Accuracy.top3","AUC")
options(digits=3)
options(digits=3)
# Confusion-matrix metrics (sensitivity/specificity/accuracy) per disease
# model, under both the top-1 and top-3 decision rules.
for (model in all.models) { #

  # Combined Network
  # Top 1 is correct diagnosis
  # One (deduplicated) row per patient for this model, in unique(dff$ptID) order.
  df=dff[dff$model==model,]
  df=unique(df)
  df=df[match(unique(dff$ptID),df$ptID),]
  # Condition positive: the patient's true diagnosis IS this model.
  CP.v=df$diag==model
  # NOTE(review): this ROC object is computed but never used below -- the AUC
  # column is filled afterwards from the saved AUC results file.
  AUC=roc(response=CP.v,predictor = df$combined_pval)

  # Predicted positive under the top-1 and top-3 rules, respectively.
  PP.top1.v=top1$model==model
  PP.top3.v = sapply(unique(dff$ptID),function(x) model %in% top3[[x]]$model)

  # Factor levels are pinned to TRUE/FALSE so caret treats TRUE as the
  # positive class consistently.
  conf_mx.top1 = confusionMatrix(data=factor(PP.top1.v,levels = c("TRUE","FALSE")),reference = factor(CP.v,levels = c("TRUE","FALSE")))
  # roc.top1=roc(response=as.numeric(CP.v),predictor = as.numeric(PP.top1.v))
  conf_mx.top3 = confusionMatrix(data=factor(PP.top3.v,levels = c("TRUE","FALSE")),reference = factor(CP.v,levels = c("TRUE","FALSE")))
  # roc.top3=roc(response=as.numeric(CP.v),predictor = as.numeric(PP.top3.v))

  # AUC slot is left NA here; it is populated later from the AUC results file.
  df.ModelMetrics[model,]=c(toupper(model),
                            conf_mx.top1$byClass[c("Sensitivity","Specificity")],
                            conf_mx.top1$overall["Accuracy"],
                            # sprintf("%.3f(%.3f-%.3f)",
                            #         conf_mx.top1$overall["Accuracy"],
                            #         conf_mx.top1$overall["AccuracyLower"],
                            #         conf_mx.top1$overall["AccuracyUpper"]),
                            # roc.top1$auc,
                            conf_mx.top3$byClass[c("Sensitivity","Specificity")],
                            conf_mx.top3$overall["Accuracy"],
                            # sprintf("%.3f(%.3f-%.3f)",
                            #         conf_mx.top3$overall["Accuracy"],
                            #         conf_mx.top3$overall["AccuracyLower"],
                            #         conf_mx.top3$overall["AccuracyUpper"]),
                            # roc.top3$auc
                            NA
                            )
}

# Fill the AUC column from the per-model network AUCs saved earlier, matching
# on the upper-cased model name, then coerce all metric columns to numeric.
AUC.df=loadToEnv(sprintf("%s/AUC_kmx%s.RData",model.dir,kmx))[["dff"]]
df.ModelMetrics$AUC=AUC.df$network.auc[match(df.ModelMetrics$Model,toupper(AUC.df$model))]
df.ModelMetrics[,!grepl("Model",colnames(df.ModelMetrics))]=apply(df.ModelMetrics[,!grepl("Model",colnames(df.ModelMetrics))],c(1,2),as.numeric)

require(openxlsx)
# NOTE(review): the setup chunk only creates graphs/ranks/models/OP;
# "revisionOutput/" must already exist or write.xlsx() will fail.
write.xlsx(df.ModelMetrics,file="revisionOutput/ModelMetrics.xlsx")
 # write.table(df.ModelMetrics,file="revisionOutput/ModelMetrics.tsv",sep = "\t",quote = F)


# plot model performance metrics
require(tidyr)
require(reshape2)
# Ascending levels here vs. descending row order below: after coord_flip()
# the most sensitive (top-1) model ends up at the top of the chart.
levels.fixed=df.ModelMetrics$Model[order(df.ModelMetrics$Sensitivity.top1,decreasing = F)]
df2 = df.ModelMetrics[order(df.ModelMetrics$Sensitivity.top1,decreasing = T),]
# df2 = df2[,!grepl("Accuracy",colnames(df2))]
# Long format: split e.g. "Sensitivity.top1" into metrics="Sensitivity" and
# rule="top1" ("AUC" has no rule part; those rows are filtered out below).
df2 = df2 %>% melt(id=("Model")) %>%
  separate(col = "variable",into = c("metrics","rule"),sep = "[[:punct:]]+",remove = TRUE,convert = FALSE)
df2$Model=factor(df2$Model,levels = levels.fixed)
df2$value=as.numeric(df2$value)
df2$metrics=factor(df2$metrics,levels = c("Sensitivity","Specificity","Accuracy","AUC"))
# df2=data.frame(xaxis=rep(df$xaxis,3),
#                value=c(df$Score2.mean.SharedEdges.,df$AUC.top1,df$AUC.top3),
#                metrics=as.vector(sapply(c("Stablity","AUC.top1","AUC.top3"),function(x) rep(x,dim(df)[1]))))
# Top-1 Sensitivity/Specificity/Accuracy per model, one facet per metric.
# NOTE(review): aes_string() is deprecated in current ggplot2 (still works).
p=ggplot(data=df2[(!grepl("top3",df2$rule)) & !grepl("AUC",df2$metrics),],aes_string(x="Model",y="value")) + theme_minimal()+
  facet_wrap(~metrics)+
  geom_bar(stat="identity", position=position_dodge(), width = 0.7)+
  coord_flip()+
  theme(legend.position = "top",legend.title = element_blank(),panel.spacing = unit(1, "lines"))+
  labs(fill="Metrics rule: ")
p
ggsave(p,filename = sprintf("%s/ModelMetrics2.pdf", OP.dir), width = 4, height = 4, device = cairo_pdf)
require(svglite)
ggsave(p,filename = sprintf("%s/ModelMetrics2.svg", OP.dir), width = 6, height = 4, device = svglite)

Network Stability

# Network stability: for each disease model, count edges shared between every
# pair of per-fold pruned networks (ig_pruned), relative to edge-set unions.
require(R.utils)
require(CTD)

score_mx=list()
# One row per disease. Note data.frame() mangles the backticked names to
# Cohort.Size, Score1.mean.SharedEdges., etc. (check.names default TRUE);
# "stablity" spelling is kept because downstream code/files use it.
stablity_score=data.frame(Disease=character(),
                          `Cohort Size`=integer(),
                          `Score1 mean(SharedEdges)`=numeric(),
                          `Score1 sd(SharedEdges)`=numeric(),
                          `Score2 mean(SharedEdges)`=numeric(),
                          `Score2 sd(SharedEdges)`=numeric(),
                          stringsAsFactors = F
                          )
r=0
# When TRUE, a shared edge only counts if its weights in the two fold
# networks agree in sign.
countDirection=TRUE
for (model in c("aadc", "abat", "adsl", "arg", "asld", "cit", "otc", "cob", "ga", "gamt", "msud", "mma", "pa", "zsd", "rcdp", "pku")) { # 
  print(sprintf("%s...", toupper(model)))
  r=r+1
  # NOTE(review): loop-invariant load -- could be hoisted above the loop.
  cohorts=loadToEnv("clinical_data.RData")[["cohorts"]]
  # Upper-triangular matrices of pairwise edge counts: intersection ("ByPair")
  # and union ("ByPairOuter") of each fold-network pair.
  score_mx[["edgeCountByPairOuter"]][[model]]=matrix(nrow=length(cohorts[[model]]),ncol=length(cohorts[[model]]))
  colnames(score_mx[["edgeCountByPairOuter"]][[model]])=rownames(score_mx[["edgeCountByPairOuter"]][[model]])=cohorts[[model]]
  score_mx[["edgeCountByPair"]][[model]]=score_mx[["edgeCountByPairOuter"]][[model]]
  # ig.outer accumulates the union of ALL fold networks for this model.
  ig.outer=loadToEnv(sprintf("graphs_GMpaper0/bg_%s_fold%d.RData", model, 1))[["ig_pruned"]]
  # NOTE(review): 1:(n-1) assumes every cohort has at least 2 samples.
  for (pt1 in 1:(length(cohorts[[model]])-1)) {
    ig1=loadToEnv(sprintf("graphs_GMpaper0/bg_%s_fold%d.RData", model, pt1))[["ig_pruned"]]
    ig.outer=ig.outer%u%loadToEnv(sprintf("graphs_GMpaper0/bg_%s_fold%d.RData", model, pt1+1))[["ig_pruned"]]
    for (pt2 in (pt1+1):length(cohorts[[model]]) ){
      ig2=loadToEnv(sprintf("graphs_GMpaper0/bg_%s_fold%d.RData", model, pt2))[["ig_pruned"]]
      # Edges present in both fold networks.
      ig.inner=ig1 %s% ig2
      # weight_1/weight_2 are the two graphs' edge weights as renamed by the
      # igraph intersection; drop edges whose weights disagree in sign.
      if(countDirection){ig.inner=delete_edges(ig.inner,E(ig.inner)[E(ig.inner)$weight_1*E(ig.inner)$weight_2<=0])}
      ig.pairouter=ig1 %u% ig2
      score_mx[["edgeCountByPair"]][[model]][pt1,pt2]=length(E(ig.inner))
      score_mx[["edgeCountByPairOuter"]][[model]][pt1,pt2]=length(E(ig.pairouter))
    }
  }
  # Score1: mean shared edges / edges in the union of ALL folds.
  # Score2: mean of per-pair shared/union ratios (Jaccard-like).
  # c() coerces this row to character; values are re-cast to numeric later.
  stablity_score[r,]=c(model,
                       length(cohorts[[model]]),
                       mean(score_mx[["edgeCountByPair"]][[model]],na.rm = T)/length(E(ig.outer)),
                       sd(score_mx[["edgeCountByPair"]][[model]],na.rm = T)/length(E(ig.outer)),
                       mean(score_mx[["edgeCountByPair"]][[model]]/score_mx[["edgeCountByPairOuter"]][[model]],na.rm = T),
                       sd(score_mx[["edgeCountByPair"]][[model]]/score_mx[["edgeCountByPairOuter"]][[model]],na.rm = T)
                       )

}
# Coefficient of variation of Score2 ("normalized sd").
stablity_score$Score2.Normalized.sd=as.numeric(stablity_score$Score2.sd.SharedEdges.)/as.numeric(stablity_score$Score2.mean.SharedEdges.)
colnames(stablity_score)[colnames(stablity_score)=="Disease"]="Diagnosis"
stablity_score$Diagnosis=toupper(stablity_score$Diagnosis)
require(openxlsx)
# write.xlsx(stablity_score,file = "revisionOutput/NetworkStablity.xlsx")

# Plot network stability scores alongside per-model performance metrics.
require(ggplot2)
# NOTE(review): reads from OP/ while the (commented-out) write above targets
# revisionOutput/ -- confirm which copy of NetworkStablity.xlsx is current.
stablity_score=read.xlsx("OP/NetworkStablity.xlsx",rowNames = F)
stablity_score[,2:6]=apply(stablity_score[,2:6],c(1,2),as.numeric)
ModelMetrics=read.xlsx("OP/ModelMetrics.xlsx",rowNames = F)
# NOTE(review): this assumes a 9-column file (incl. AUC.top1/AUC.top3), unlike
# the 8-column table written earlier in this document -- verify file version.
colnames(ModelMetrics)=c("Model","Sensitivity.top1","Specificity.top1","Accuracy.top1","AUC.top1","Sensitivity.top3","Specificity.top3","Accuracy.top3","AUC.top3")
ModelMetrics=ModelMetrics[!is.na(ModelMetrics$Model),]
# Join stability scores with the per-model performance metrics by diagnosis.
df=cbind(stablity_score[,c("Diagnosis","Cohort.Size","Score2.mean.SharedEdges.","Score2.sd.SharedEdges.","Score2.Normalized.sd")],
         ModelMetrics[match(stablity_score$Diagnosis,ModelMetrics$Model),c("Sensitivity.top1","Specificity.top1","AUC.top1","Sensitivity.top3","Specificity.top3","AUC.top3")])
# Order by cohort size and freeze that order as x-axis factor levels,
# with labels like "ZSD (n=19)".
df=df[order(as.integer(df$Cohort.Size)),]
df$xaxis=paste(paste(df$Diagnosis,df$Cohort.Size,sep = " (n="),")",sep = "")
fixed_level=df$xaxis
df$xaxis=factor(df$xaxis,levels = fixed_level)
# NOTE(review): effectively a no-op -- the levels already equal these values.
levels(df$xaxis)=df$xaxis

# Bar chart of mean Score2 per disease with +/- 1 sd error bars, flipped so
# diseases read top-to-bottom; fill encodes cohort size.
p = ggplot(data=df,aes_string(x="xaxis",y="Score2.mean.SharedEdges.",fill = "Cohort.Size")) +
  geom_bar(stat="identity", position=position_dodge(),color="black")+
  scale_fill_gradient(low="#ffcccc",high="darkred")+
  geom_errorbar(aes(ymin=Score2.mean.SharedEdges.-Score2.sd.SharedEdges., ymax=Score2.mean.SharedEdges.+Score2.sd.SharedEdges.), width=.2,
                 position=position_dodge(.9)) +
  theme_minimal()+
  theme(legend.position = "none")+
  # geom_col(aes(fill = Cohort.Size)) + 
  # scale_color_gradient(low = "blue", high = "red") +
  xlab("Disease")+ylab("Network Stability Score") + coord_flip() 
ggsave(p, filename = "OP/StablityScore.pdf", device = cairo_pdf, 
       width = 4, height = 4)


BRL-BCM/CTDext documentation built on May 7, 2022, 5:31 a.m.