# Setup: remember the working directory, configure knitr, and create the
# output directories used by every later chunk.
w.dir=getwd() #""
knitr::opts_chunk$set(echo = TRUE)
knitr::opts_knit$set(root.dir = w.dir)
setwd(w.dir)
require(R.utils)
# Output locations: learned networks, precomputed node ranks, fitted models,
# and other outputs.
graph.dir="graphs"
rank.dir="ranks"
model.dir="models"
OP.dir="OP"
# mkdirs() (R.utils) creates each directory if it does not already exist.
sapply(c(graph.dir,rank.dir,model.dir,OP.dir),mkdirs)
# Download the published metabolomics z-score matrix (Sci Rep 2022
# supplementary table), build cohort membership lists, binary-code the
# argininosuccinate row, and snapshot everything to "clinical_data.RData".
rm(list=setdiff(ls(),grep("dir",ls(),value = T)))
require(R.utils)
require(CTD)
require(CTDext)
require(openxlsx)
url="https://static-content.springer.com/esm/art%3A10.1038%2Fs41598-022-10415-5/MediaObjects/41598_2022_10415_MOESM2_ESM.xlsx"
download.file(url,destfile = "data_mx.xlsx")
# Z-score matrix: rows = metabolites, columns = samples.
data_mx.og=read.xlsx("data_mx.xlsx", sheet = "Z-score",startRow = 2, colNames = T,rowNames = T,skipEmptyRows = TRUE,skipEmptyCols = TRUE,)
# Drop metabolite-annotation columns so only sample columns remain.
data_mx.og=data_mx.og[ ,!colnames(data_mx.og) %in% c("PATHWAY_SORTORDER", "SUPER_PATHWAY", "SUB_PATHWAY", "CHEMICAL_ID", "RI", "MASS", "PUBCHEM", "CAS", "KEGG", "HMDB_ID" )]
# 6 alaimo samples with diseases diagnosed were also used for model training.
# dupNames maps each ALAIMO sample (col 1) to its duplicated EDTA alias (col 2)
# so the EDTA cohort greps below pick the copies up.
dupNames=matrix(nrow = 6, ncol = 2)
dupNames[,1]=c( "ALAIMO-68b", "ALAIMO-68a","ALAIMO-44", "ALAIMO-85", "ALAIMO-136", "ALAIMO-13" )
dupNames[,2]=c( "EDTA-ABAT-5", "EDTA-ABAT-6", "EDTA-AADC-2", "EDTA-ABAT-7", "EDTA-AADC-3", "EDTA-OTC-17")
temp=data_mx.og[,dupNames[,1]]
colnames(temp)=dupNames[,2]
data_mx.og=cbind(data_mx.og,temp)
# The 16 modelled disease states; cohorts maps each disease (plus the two
# reference groups and the alaimo set) to its matching sample column names.
models=sort(c("zsd","rcdp","arg","otc","asld","abat","aadc","adsl","cit","cob","ga","gamt","msud","mma","pa","pku"))
cohorts=sapply(c(models,"hep-ref", "edta-ref", "alaimo"), function(x) grep(paste(x,"-",sep = ""),colnames(data_mx.og),value = T,ignore.case = T))
names(cohorts)[names(cohorts)=="hep-ref"]="hep_refs"
names(cohorts)[names(cohorts)=="edta-ref"]="edta_refs"
# binary code argininosuccinate
# Raw intensities distinguish "not detected" (NA in raw) from measured values.
data_mx.raw=read.xlsx(system.file("dataset/dataset1.xlsx",package = "CTDext"), sheet = "Raw intensity",startRow = 2, colNames = T,rowNames = T,skipEmptyRows = TRUE,skipEmptyCols = TRUE,)
data_mx.raw=data_mx.raw[,colnames(data_mx.og)[colnames(data_mx.og)%in%colnames(data_mx.raw)]]
# Samples with missing raw argininosuccinate are coded 0; detected ones 10.
ptZeroFromRaw=colnames(data_mx.raw[,is.na(unlist(data_mx.raw["argininosuccinate",]))])
pt10FromRaw=colnames(data_mx.raw[,!is.na(unlist(data_mx.raw["argininosuccinate",]))])
# Samples absent from the raw table: treat NA or 0 z-score as "not detected".
temp=data_mx.og[,!colnames(data_mx.og) %in% colnames(data_mx.raw)]
ptZeroFromOg=colnames(temp)[
is.na(unlist(temp["argininosuccinate",]))| as.numeric(unlist(temp["argininosuccinate",]))==0]
data_mx.ogBnry=data_mx.og # Jul
data_mx.ogBnry["argininosuccinate",c(ptZeroFromRaw,ptZeroFromOg)]=0
data_mx.ogBnry["argininosuccinate",pt10FromRaw]=10
# data_mx.og: load("clinical_data_July2020.RData")
# data_mx.ogBnry : "clinical_data_Oct2020.RData"
# Keep only the objects later chunks reload, then snapshot the workspace.
rm(list=setdiff(ls(),grep("dir|data_mx|cohorts|dupNames",ls(),value = T)))
save.image("clinical_data.RData")
``` {r counts}
# Count non-missing measurements per sample in the raw-intensity matrix,
# split into unnamed ("x - ...") and named metabolites.
# NOTE(review): `cohorts` must survive the rm() below from the previous chunk
# (names matching "cohort" are deliberately kept) — confirm chunk order.
rm(list=setdiff(ls(),c(grep("dir|model|cohort",ls(),value = T),lsf.str())))
data_mx=loadToEnv("clinical_data.RData")[["data_mx.raw"]]
# Unnamed metabolites: row names of the form "x - <id>".
data_mx_unnamed = data_mx[grep("x -", rownames(data_mx)), which(colnames(data_mx) %in% c(unlist(cohorts)))]
zscore_cts = apply(data_mx_unnamed, 2, function(i) length(which(!is.na(i))))
mean(zscore_cts)
min(zscore_cts)
max(zscore_cts)
# Named metabolites: all remaining rows.
data_mx_named = data_mx[-grep("x -", rownames(data_mx)), which(colnames(data_mx) %in% c(unlist(cohorts)))]
zscore_cts = apply(data_mx_named, 2, function(i) length(which(!is.na(i))))
mean(zscore_cts)
min(zscore_cts)
max(zscore_cts)
## Building a Gaussian Graphical Model to Diagnose 16 Different Metabolic Disorders (skip if using github-saved networks)
Metabolites represented in at least 50% of reference and 50% of disease cases were included in network learning. This is a time-consuming step
``` {r learn-nets_heparin}
require(R.utils)
require(CTD)
require(huge)
rm(list=setdiff(ls(),c(grep("dir|model|cohort",ls(),value = T),lsf.str())))
# Heparin samples only: learn one disease network per leave-one-out fold.
data_mx=loadToEnv("clinical_data.RData")[["data_mx.ogBnry"]]
cohorts=loadToEnv("clinical_data.RData")[["cohorts"]]
data_mx=data_mx[,grep("hep-",colnames(data_mx),ignore.case = T)]
# Drop metabolites missing in every heparin sample.
data_mx=data_mx[rowSums(is.na(data_mx)) != ncol(data_mx),]
refs=data_mx[,grep("ref",colnames(data_mx),ignore.case = T)]
# Per-metabolite missingness rate among heparin reference samples.
ref_fill = apply(refs, 1, function(i) sum(is.na(i))/length(i))
# refs2=refs[which(ref_fill[rownames(data_mx)]>0.8),]
refs2=refs
# Restrict cohort membership to samples present in the heparin matrix.
cohorts = lapply(cohorts, function(i) i[which(i %in% colnames(data_mx))])
# We prefer training models on edta samples for various reasons (latest dataset); here we show an example of network learning on Citrullinemia disease that only contains heparin samples; include "msud", "mma", "pa", "pku", "cob", "ga", "gamt" if you want to train all 8 models
for (model in c("cit")) { #, "msud", "mma", "pa", "pku", "cob", "ga", "gamt"
  for (fold in 1:length(cohorts[[model]])) {
    print(sprintf("Learning graphs for diag %s, fold %d...", model, fold))
    # Leave one patient out; train on the rest.
    diag_pts = cohorts[[model]][-fold]
    print(diag_pts)
    fill.rate = apply(data_mx[,which(colnames(data_mx) %in% diag_pts)], 1, function(i) sum(is.na(i))/length(i))
    # Keep metabolites with <50% missingness in references and <=50% in cases.
    diag_data = data_mx[intersect(which(ref_fill<0.5), which(fill.rate<=0.50)), which(colnames(data_mx) %in% diag_pts)]
    diag_data = diag_data[which(rownames(diag_data) %in% rownames(refs2)),]
    print("Individual samples as training data. 
Latent variable embedding and network pruning.")
    diag_data = CTD::data.surrogateProfiles(data = diag_data, std = 1, ref_data = refs2)
    print(dim(diag_data))
    # Disease Network: GLASSO approach
    inv_covmatt = huge(t(diag_data), method="glasso")
    # StARS criterion selects the regularization parameter.
    inv_covmatt_select = huge.select(inv_covmatt, criterion = "stars")
    inv_covmat = as.matrix(inv_covmatt_select$icov[[which(inv_covmatt_select$lambda==inv_covmatt_select$opt.lambda)]])
    diag(inv_covmat) = 0; colnames(inv_covmat) = rownames(diag_data)
    ig = graph.adjacency(as.matrix(inv_covmat), mode="undirected", weighted=TRUE, add.colnames='name')
    V(ig)$name = rownames(diag_data)
    print(ig)
    # Reference Network: GLASSO approach
    ref_data = data.surrogateProfiles(data = refs2, std = 1, ref_data = refs2)#useMnDiseaseProfile = FALSE, addHealthyControls = TRUE,
    ref_data = ref_data[,-which(duplicated(colnames(ref_data)))]
    print(dim(ref_data))
    # Reuse the disease network's optimal lambda for the reference network.
    inv_covmatt = huge(t(ref_data), method="glasso", lambda = inv_covmatt_select$opt.lambda)
    inv_covmat = as.matrix(inv_covmatt$icov[[1]])
    diag(inv_covmat) = 0; colnames(inv_covmat) = rownames(ref_data)
    ig_ref = graph.adjacency(as.matrix(inv_covmat), mode="undirected", weighted=TRUE, add.colnames='name')
    V(ig_ref)$name = rownames(ref_data)
    print(ig_ref)
    # Prune edges also present in the reference-only network.
    ig_pruned = graph.naivePruning(ig, ig_ref)
    print(ig_pruned)
    save(ig, ig_ref, ig_pruned, file=sprintf("%s/bg_%s_fold%d.RData",graph.dir, model, fold))
    rm(ig, ig_pruned)
  }
}
``` {r learn-networks_edta}
rm(list=setdiff(ls(),c(grep("dir|model|cohort",ls(),value = T),lsf.str())))
require(CTD)
require(huge)
require(R.utils)
# EDTA samples only: learn one disease network per leave-one-out fold.
data_mx=loadToEnv("clinical_data.RData")[["data_mx.ogBnry"]]
cohorts=loadToEnv("clinical_data.RData")[["cohorts"]]
data_mx=data_mx[,grep("edta",colnames(data_mx),ignore.case = T)]
data_mx=data_mx[rowSums(is.na(data_mx)) != ncol(data_mx),]
# Binary-coded argininosuccinate: missing means "not detected", i.e. 0.
data_mx["argininosuccinate", which(is.na(data_mx["argininosuccinate",]))] = 0
refs = data_mx[, which(colnames(data_mx) %in% cohorts$edta_refs)]
# Per-metabolite missingness rate among EDTA reference samples.
ref_fill = apply(refs, 1, function(i) sum(is.na(i))/length(i))
# Restrict cohort membership to samples present in the EDTA matrix.
cohorts = lapply(cohorts, function(i) i[which(i %in% colnames(data_mx))])
# Example: ASLD only; uncomment the other diseases to train all EDTA models.
for (model in c("asld")) {#,"aadc", "abat", "adsl", "arg","zsd", "rcdp","otc"
  for (fold in 1:length(cohorts[[model]])) {
    print(sprintf("Learning graphs for diag %s, fold %d...", model, fold))
    # Leave one patient out; train on the rest.
    diag_pts = cohorts[[model]][-fold]
    print(diag_pts)
    fill.rate = apply(data_mx[,which(colnames(data_mx) %in% diag_pts)], 1, function(i) sum(is.na(i))/length(i))
    # Keep metabolites with <50% missingness in both references and cases.
    # NOTE(review): strict < here vs <= in the heparin chunk — confirm intended.
    diag_data = data_mx[intersect(which(ref_fill<0.5), which(fill.rate<0.5)), which(colnames(data_mx) %in% diag_pts)]
    print(nrow(diag_data))
    print(diag_data["argininosuccinate",])
    print("Individual samples as training data. Latent variable embedding and network pruning.")
    refs2 = refs[which(rownames(refs) %in% rownames(diag_data)), ]
    diag_data = data.surrogateProfiles(data = diag_data, std = 1, ref_data = refs2)
    print(dim(diag_data))
    # Disease Network: GLASSO approach
    inv_covmatt = huge(t(diag_data), method="glasso")
    # StARS criterion selects the regularization parameter.
    inv_covmatt_select = huge.select(inv_covmatt, criterion = "stars")
    inv_covmat = as.matrix(inv_covmatt_select$icov[[which(inv_covmatt_select$lambda==inv_covmatt_select$opt.lambda)]])
    diag(inv_covmat) = 0; colnames(inv_covmat) = rownames(diag_data)
    ig = graph.adjacency(as.matrix(inv_covmat), mode="undirected", weighted=TRUE, add.colnames='name')
    V(ig)$name = rownames(diag_data)
    print(ig)
    # Reference Network: GLASSO approach
    ref_data = data.surrogateProfiles(data = refs2, std = 1, ref_data = refs2)
    ref_data = ref_data[,-which(duplicated(colnames(ref_data)))]
    print(dim(ref_data))
    # Reuse the disease network's optimal lambda for the reference network.
    inv_covmatt = huge(t(ref_data), method="glasso", lambda = inv_covmatt_select$opt.lambda)
    inv_covmat = as.matrix(inv_covmatt$icov[[1]])
    diag(inv_covmat) = 0; colnames(inv_covmat) = rownames(ref_data)
    ig_ref = graph.adjacency(as.matrix(inv_covmat), mode="undirected", weighted=TRUE, add.colnames='name')
    V(ig_ref)$name = rownames(ref_data)
    print(ig_ref)
    # Prune edges also present in the reference-only network.
    ig_pruned = graph.naivePruning(ig, ig_ref)
    print(ig_pruned)
    print(degree(ig_pruned, "argininosuccinate"))
    save(ig, ig_ref, ig_pruned, file=sprintf("%s/bg_%s_fold%d.RData",graph.dir , model, fold))
    rm(ig, ig_ref, ig_pruned)
  }
}
# Report the size (node count - edge count) of every learned network — the
# disease+control network (ig), the control-only network (ig_ref), and the
# pruned disease-specific network (ig_pruned) — for each model and fold.
for (model in c("arg", "asld", "cit", "otc", "rcdp", "zsd", "abat", "aadc", "adsl", "mma", "msud", "pa", "pku", "cob", "ga", "gamt")) { # print(model)
  for (fold in 1:length(cohorts[[model]])) {
    print(fold)
    # FIX(review): networks are saved under graph.dir ("graphs") by the
    # learning chunks above; the previous hard-coded "Clinical_paper/graphs/"
    # path pointed at a directory this script never creates.
    load(sprintf("%s/bg_%s_fold%d.RData", graph.dir, model, fold))
    print(sprintf("Disease-Control Network: %d-%d", length(V(ig)$name), length(E(ig)$weight)))
    print(sprintf("Control-only Network: %d-%d", length(V(ig_ref)$name), length(E(ig_ref)$weight)))
    print(sprintf("Disease-Specific Network: %d-%d", length(V(ig_pruned)$name), length(E(ig_pruned)$weight)))
  }
}
## Get pre-compute ranks for each Gaussian Graphical Model. (skip if using github-saved ranks) This part should preferably run on cluster (pbs system). Scripts "wrapper_ranks.r" "getRanksN.r" "getRanks.pbs" are commented out. ```r # # cohorts=loadToEnv("clinical_data.RData")[["cohorts"]] # models=c(c("asld", "aadc", "abat", "adsl", "arg","zsd", "rcdp","otc")) # cohorts[models]=sapply(models, function(x) grep("edta",cohorts[[x]],ignore.case = T,value = T)) # cohorts[["alaimo"]]=NULL # # for (model in models){ # fold.begin=1 # fold.end=length(cohorts[[model]]) # system(sprintf("Rscript wrapper_ranks.r %s %s %s %s %s", # model, fold.begin, fold.end, w.dir, rank.dir, graph.dir)) # } ######################################## wrapper_ranks.r ######################################## # require(R.utils) # require(igraph) # args = commandArgs(trailingOnly=TRUE) # model=as.character(args[1]) # fold.begin=as.numeric(args[2]) # fold.end=as.numeric(args[3]) # w.dir=as.character(args[4]) # rank.dir=as.character(args[5]) # graph.dir=as.character(args[6]) # diag=model # setwd(sprintf("%s/%s",w.dir,rank.dir)) # # for (fold in fold.begin:fold.end){ # dir.create(sprintf("./%s", model), showWarnings = FALSE) # output_dir = sprintf("./%s/diag%s%d", model,diag,fold) # str = sprintf("mkdir %s", output_dir) # system(str) # # ig = loadToEnv(sprintf("../%s/bg_%s_fold%d.RData",graph.dir,model,fold))[["ig_pruned"]] # # totalN = length(V(ig)$name) # print(sprintf("Total N = %d", totalN)) # # for (n in 1:totalN) { # str = sprintf("qsub -v diag=%s,fold=%d,n=%d,wkdir=%s,rankdir=%s,graphdir=%s getRanks.pbs", # diag, fold, n, w.dir, rank.dir, graph.dir) # system(str, wait=FALSE) # system("sleep 0.1") # } # # # ready=FALSE # last_sum = 0 # while (!ready) { # f = list.files(sprintf("./%s/diag%s%d", model, diag, fold), pattern=".RData") # print(sprintf("#permutation files ready = %d", length(f))) # if (length(f)==totalN) { # ready=TRUE # } else { # system("sleep 30") # system("rm *.pbs.*") # curr_sum = 
length(f) # if (curr_sum > last_sum) { # last_sum = curr_sum # } # } # } # system("rm *.pbs.*") # # # collate permutations into one R object # all_nodes = V(ig)$name # permutationByStartNode = list() # for (n in 1:length(all_nodes)) { # load(sprintf("./%s/diag%s%d/%d.RData",model, diag, fold, n)) # permutationByStartNode[[n]] = toupper(current_node_set) # } # names(permutationByStartNode) = all_nodes # save.image(file=sprintf("./%s/%s%d-ranks.RData",model, toupper(diag), fold)) # print("collate complete...") # str = sprintf("rm -r ./%s/diag%s%d",model, diag, fold) # system(str) # # } ################################ end of wrapper_ranks.r ######################################### ################################ getRanksN.r ################################################ # require(igraph) # require(CTD) # require(R.utils) # singleNode.getNodeRanksN = function(n, G, S=NULL, num.misses=NULL, verbose=FALSE) { # if (!is.null(num.misses)) { # if (is.null(S)) { # print("You must supply a subset of nodes as parameter S if you supply num.misses.") # return(0) # } # } # all_nodes = names(G) # if (verbose) { # print(sprintf("Calculating node rankings %d of %d.", n, length(all_nodes))) # } # current_node_set = NULL # stopIterating=FALSE # startNode = all_nodes[n] # currentGraph = G # numMisses = 0 # current_node_set = c(current_node_set, startNode) # while (stopIterating==FALSE) { # # Clear probabilities # currentGraph[1:length(currentGraph)] = 0 #set probabilities of all nodes to 0 # #determine base p0 probability # baseP = p0/(length(currentGraph)-length(current_node_set)) # #set probabilities of unseen nodes to baseP # currentGraph[!(names(currentGraph) %in% current_node_set)] = baseP # # Sanity check. p0_event should add up to exactly p0 (global variable) # p0_event = sum(unlist(currentGraph[!(names(currentGraph) %in% current_node_set)])) # currentGraph = graph.diffuseP1(p1, startNode, currentGraph, current_node_set, 1, verbose=FALSE) # # Sanity check. 
p1_event should add up to exactly p1 (global variable) # p1_event = sum(unlist(currentGraph[!(names(currentGraph) %in% current_node_set)])) # if (abs(p1_event-1)>thresholdDiff) { # extra.prob.to.diffuse = 1-p1_event # currentGraph[names(current_node_set)] = 0 # currentGraph[!(names(currentGraph) %in% names(current_node_set))] = unlist(currentGraph[!(names(currentGraph) %in% names(current_node_set))]) + extra.prob.to.diffuse/sum(!(names(currentGraph) %in% names(current_node_set))) # } # #Set startNode to a node that is the max probability in the new currentGraph # maxProb = names(which.max(currentGraph)) # # Break ties: When there are ties, choose the first of the winners. # startNode = names(currentGraph[maxProb[1]]) # if (!is.null(num.misses)) { # if (startNode %in% S) { # numMisses = 0 # } else { # numMisses = numMisses + 1 # } # current_node_set = c(current_node_set, startNode) # if (numMisses>num.misses || length(c(startNode,current_node_set))>=(length(G))) { # stopIterating = TRUE # } # } else { # # Keep drawing until you've drawn all nodes. 
# current_node_set = c(current_node_set, startNode) # if (length(current_node_set)>=(length(G))) { # stopIterating = TRUE # } # } # # } # return(current_node_set) # } # # args = commandArgs(trailingOnly=TRUE) # diag = as.character(args[1]) # fold = as.numeric(args[2]) # n = as.numeric(args[3]) # wkdir = as.character(args[4]) # rankdir = as.character(args[5]) # graphdir = as.character(args[6]) # # model=diag # print(diag) # print(fold) # print(n) # # setwd(sprintf("%s/%s",wkdir,rankdir)) # ig = loadToEnv(sprintf("../%s/bg_%s_fold%d.RData", graphdir, model,fold))[["ig_pruned"]] # # print(ig) # V(ig)$name = tolower(V(ig)$name) # G = vector(mode="list", length=length(V(ig)$name)) # names(G) = V(ig)$name # adjacency_matrix = list(as.matrix(get.adjacency(ig, attr="weight"))) # p0=0.1 # p1=0.9 # thresholdDiff=0.01 # all_nodes = tolower(V(ig)$name) # print(head(all_nodes)) # current_node_set = singleNode.getNodeRanksN(n, G) # new_dir = sprintf("./%s/diag%s%d",model, diag, fold) # if (!dir.exists(new_dir)) { # str = sprintf("mkdir %s", new_dir) # system(str) # } # save(current_node_set, file=sprintf("./%s/diag%s%d/%d.RData",model, diag, fold, n)) # ################################ end of getRanksN.r ########################################## ################################### getRanks.pbs ############################################## # # Request 1 processors on 1 node # #PBS -l nodes=1:ppn=1 # # #Request x number of hours of walltime # #PBS -l walltime=1:00:00 # # #Request that regular output and terminal output go to the same file # #PBS -j oe # #PBS -m abe # # module load R/3.5 # # diag=${diag} # fold=${fold} # n=${n} # wkdir=${wkdir} # rankdir=${rankdir} # cd ${wkdir}/${rankdir} # # Rscript ../getRanksN.r $diag $fold $n $wkdir ${rankdir} ${graphdir} > ranks:$diag-$fold-$n.out # rm ranks:$diag-$fold-$n.out ###################################### end of getRanks.pbs ########################################
rm(list=setdiff(ls(),c(grep("dir|models",ls(),value = T),lsf.str())))
# Add CTDsim distance from "downtown module" as second quantitative variable
# disFromDowntown(): CTD-based distance between a disease module and one
# patient's top perturbed metabolites, via encoding lengths of the separate
# vs joint node sets (normalized compression distance, NCD).
# NOTE(review): `ptID` and `ptID2` are read from the global environment, not
# passed in — confirm the caller sets them (this version is later replaced by
# a richer one in the runCTD chunk).
disFromDowntown = function(dis_mod, ptBSbyK.dis, p2.sig.nodes, p2.optBS, ranks, G) {
  # Encoding lengths of the disease module (p1) and the patient (p2) alone.
  p1.e = mle.getEncodingLength(ptBSbyK.dis, NULL, ptID, G)[,"IS.alt"]
  p2.e = mle.getEncodingLength(p2.optBS, NULL, ptID2, G)[,"IS.alt"]
  # Using predefined node ranks, get optimal bitstring for encoding of patient1's union patient2's subsets.
  p12.sig.nodes = unique(c(dis_mod, p2.sig.nodes))
  p12.e = c()
  for (k in 1:length(ptBSbyK.dis)) {
    dis_mod_cpy = dis_mod
    p2.sig.nodes_cpy = p2.sig.nodes
    dis_mod_k = names(which(ptBSbyK.dis[[k]]==1))
    p2.sig.nodes_k = names(which(p2.optBS[[k]]==1))
    # Pad each size-k subset up to k nodes, drawing from the original lists.
    while (length(dis_mod_k)<k) {
      dis_mod_k = unique(c(dis_mod_k, dis_mod_cpy[1]))
      dis_mod_cpy = dis_mod_cpy[-1]
    }
    while (length(p2.sig.nodes_k)<k) {
      p2.sig.nodes_k = unique(c(p2.sig.nodes_k, p2.sig.nodes_cpy[1]))
      p2.sig.nodes_cpy = p2.sig.nodes_cpy[-1]
    }
    p12.sig.nodes_k = sapply(unique(c(dis_mod_k, p2.sig.nodes_k)), trimws)
    p12.optBS = mle.getPtBSbyK(p12.sig.nodes_k, ranks)
    res = mle.getEncodingLength(p12.optBS, NULL, NULL, G)
    # Joint encoding length: best-compressed prefix plus the uncompressed rest.
    p12.e[k] = res[which.max(res[,"d.score"]), "IS.alt"] + log2(choose(length(G), 1))*(length(p12.sig.nodes_k)-which.max(res[,"d.score"]))
  }
  return (list(p1.e=p1.e, p2.e=p2.e, p12.e=p12.e, NCD=(p12.e-apply(cbind(p1.e, p2.e), 1, min))/apply(cbind(p1.e, p2.e), 1, max)))
}
# Numeric binary-coded matrix restricted to EDTA-preferred cohorts.
data_mx=apply(loadToEnv("clinical_data.RData")[["data_mx.ogBnry"]],c(1,2),as.numeric)
cohorts=loadToEnv("clinical_data.RData")[["cohorts"]]
# These 8 diseases are modelled on EDTA samples only.
models=c(c("asld", "aadc", "abat", "adsl", "arg","zsd", "rcdp","otc"))
cohorts[models]=sapply(models, function(x) grep("edta",cohorts[[x]],ignore.case = T,value = T))
cohorts[["alaimo"]]=NULL
data_mx=data_mx[,colnames(data_mx) %in% unlist(cohorts)]
data_mx=data_mx[rowSums(is.na(data_mx)) != ncol(data_mx),]
table(data_mx["argininosuccinate",])
# CTD hyperparameters: baseline/diffusion probabilities and threshold.
p0=0.1
p1=0.9
thresholdDiff=0.01
# Get downtown disease modules for all modelled disease states
disMod 
= list()
data_mx0=data_mx
# use edta samples for model arg; edta samples for model asld; hep samples for otc
cohorts[["arg"]]=cohorts[["arg"]][grep("EDTA",cohorts[["arg"]])]
cohorts[["asld"]]=cohorts[["asld"]][grep("EDTA",cohorts[["asld"]])]
cohorts[["otc"]]=cohorts[["otc"]][grep("HEP",cohorts[["otc"]])]
# showing one example
for (model in c("cit")) { # "aadc", "abat", "adsl", "arg", "asld", ,"otc", "cob", "ga", "gamt", "msud", "mma", "pa", "zsd", "rcdp", "pku"
  print(sprintf("%s...", toupper(model)))
  data_mx=data_mx0
  # Mean cohort profile per metabolite.
  mn = apply(data_mx[,which(colnames(data_mx) %in% cohorts[[model]])], 1, function(i) mean(na.omit(i)))
  mn = mn[-which(is.na(mn))]
  downtown_disease_module = c()
  # Pool the best-explained module from each leave-one-out fold's network.
  for (pt in 1:length(cohorts[[model]])) {
    # ig = loadToEnv(sprintf("%s/bg_%s_fold%d.RData",graph.dir, model, pt))[["ig_pruned"]]
    ig = loadToEnv(system.file(sprintf("networks/ind_foldNets/bg_%s_fold%d.RData", model, pt),package = "CTDext"))[["ig_pruned"]]
    # ranks = loadToEnv(sprintf("%s/bg_%s_fold%d.RData",rank.dir, model, pt))[["ig_pruned"]]
    ranks = loadToEnv(system.file(sprintf("ranks/ind_ranks/%s%d-ranks.RData", model, pt),package = "CTDext"))[["permutationByStartNode"]]
    ranks = lapply(ranks, tolower)
    adjacency_matrix = list(as.matrix(get.adjacency(ig, attr="weight")))
    G = vector(mode="list", length=length(V(ig)$name))
    names(G) = V(ig)$name
    # Seed set: metabolites with |mean z| > 2, relaxing the threshold in 0.1
    # steps until at least 15 in-network metabolites are selected.
    S = mn[which(abs(mn)>2)]
    S = S[which(names(S) %in% names(G))]
    it=0
    while (length(S)<15) {
      S = mn[which(abs(mn)>(2-0.1*it))]
      S = S[which(names(S) %in% names(G))]
      it = it + 1
    }
    ptBSbyK = mle.getPtBSbyK(names(S), ranks)
    res = mle.getEncodingLength(ptBSbyK, NULL, NULL, G)
    downtown_disease_module = c(downtown_disease_module, names(which(ptBSbyK[[which.max(res[,"d.score"])]]==1)))
  }
  print(model)
  length(unique(downtown_disease_module))
  # Estimate disease module "purity" based on 2 size thresholds and known case mean distances
  fold=1
  # ig = loadToEnv(sprintf("graphs_GMpaper/bg_%s_fold%d.RData",graph.dir, model, fold))[["ig_pruned"]]
  ig = 
loadToEnv(system.file(sprintf("networks/ind_foldNets/bg_%s_fold%d.RData", model, fold),package = "CTDext"))[["ig_pruned"]]
  # ranks = loadToEnv(sprintf("%s/%s%d-ranks.RData",rank.dir ,model, fold))[["permutationByStartNode"]]
  ranks = loadToEnv(system.file(sprintf("ranks/ind_ranks/%s%d-ranks.RData", model, fold),package = "CTDext"))[["permutationByStartNode"]]
  ranks = lapply(ranks, tolower)
  adjacency_matrix = list(as.matrix(get.adjacency(ig, attr="weight")))
  G = vector(mode="list", length=length(V(ig)$name))
  names(G) = V(ig)$name
  data_mx = data_mx[which(rownames(data_mx) %in% V(ig)$name), ]
  data_mx = data_mx[,which(colnames(data_mx) %in% cohorts[[model]])]
  purity_mets = list()
  purity = c()
  # Size1: Full downtown disease module
  tmp.mm = unique(downtown_disease_module)
  tmp.mm = tmp.mm[which(tmp.mm %in% names(G))]
  ptBSbyK.dis = mle.getPtBSbyK(tmp.mm, ranks)
  # NOTE(review): ptID is not defined in this chunk — this call appears to
  # rely on a leftover global; verify.
  res = mle.getEncodingLength(ptBSbyK.dis, NULL, ptID, G)
  downtown_disease_mod = names(which(ptBSbyK.dis[[which.max(res[,"d.score"])]]==1))
  print(length(unique(downtown_disease_mod)))
  # Mean NCD between the module and each known case = purity estimate.
  dd = c()
  for (pt in 1:length(cohorts[[model]])) {
    ptBSbyK.dis = mle.getPtBSbyK(unique(downtown_disease_mod), ranks)
    p2.sig.nodes = names(data_mx[order(abs(data_mx[,pt]), decreasing = TRUE), pt][1:length(unique(downtown_disease_mod))])
    p2.optBS = mle.getPtBSbyK(p2.sig.nodes, ranks)
    ctdsim = disFromDowntown(unique(downtown_disease_mod), ptBSbyK.dis, p2.sig.nodes, p2.optBS, ranks, G)
    dd[pt] = min(ctdsim$NCD)
  }
  purity_mets[[1]] = unique(downtown_disease_module)
  purity[1] = mean(dd)
  # Size2: Smaller disease module — nodes recurring in >half the folds.
  downtown_disease_module = names(which(table(downtown_disease_module)>(length(cohorts[[model]])/2)))
  tmp.mm = unique(downtown_disease_module)
  tmp.mm = tmp.mm[which(tmp.mm %in% names(G))]
  ptBSbyK.dis = mle.getPtBSbyK(unique(downtown_disease_module), ranks)
  res = mle.getEncodingLength(ptBSbyK.dis, NULL, ptID, G)
  downtown_disease_mod = names(which(ptBSbyK.dis[[which.max(res[,"d.score"])]]==1))
  print(length(unique(downtown_disease_mod))) 
  dd = c()
  for (pt in 1:length(cohorts[[model]])) {
    ptBSbyK.dis = mle.getPtBSbyK(unique(downtown_disease_mod), ranks)
    p2.sig.nodes = names(data_mx[order(abs(data_mx[,pt]), decreasing = TRUE), pt][1:length(unique(downtown_disease_mod))])
    p2.optBS = mle.getPtBSbyK(p2.sig.nodes, ranks)
    ctdsim = disFromDowntown(unique(downtown_disease_mod), ptBSbyK.dis, p2.sig.nodes, p2.optBS, ranks, G)
    dd[pt] = min(ctdsim$NCD)
  }
  purity_mets[[2]] = downtown_disease_module
  purity[2] = mean(dd)
  # Selected disease module based on "purity" estimates
  downtown_disease_module = purity_mets[[which.min(purity)]]
  disMod[[model]] = downtown_disease_module
}
lapply(disMod, length)
save(disMod, file=sprintf("%s/disMod.RData",model.dir))
``` {r runCTD}
require(R.utils)
require(CTD)
rm(list=setdiff(ls(),c(grep("dir|models",ls(),value = T),lsf.str())))
# Coerce the binary-coded z-score matrix to numeric and reload cohorts.
data_mx=apply(loadToEnv("clinical_data.RData")[["data_mx.ogBnry"]],c(1,2),as.numeric)
cohorts=loadToEnv("clinical_data.RData")[["cohorts"]]
# Drop the duplicated ALAIMO columns (kept under their EDTA aliases).
dupNames=loadToEnv("clinical_data.RData")[["dupNames"]]
data_mx=data_mx[,-which(colnames(data_mx) %in% dupNames[,1])]
data_mx=data_mx[rowSums(is.na(data_mx)) != ncol(data_mx),]
# data_mx.0 preserves the full matrix; each model run restarts from it.
data_mx.0=data_mx
# CTD hyperparameters: baseline/diffusion probabilities, convergence
# threshold, and maximum perturbed-metabolite subset size.
p0=0.1
p1=0.9
thresholdDiff=0.01
kmx=30
# Precomputed disease modules shipped with the CTDext package.
disMod=loadToEnv(system.file("shiny-app/disMod_Oct2020.RData",package = "CTDext"))[["disMod"]]
# disFromDowntown(): CTD-based similarity between a disease module and one
# patient's top perturbed metabolites.
#
# Args:
#   dis_mod      - character vector of disease-module node names.
#   ptBSbyK.dis  - optimal bitstrings by subset size k for dis_mod.
#   p2.sig.nodes - the patient's top perturbed node names.
#   p2.optBS     - optimal bitstrings by subset size k for the patient.
#   ranks        - precomputed node-rank permutations for the network.
#   G            - named list of network nodes.
#   mn_prof      - mean disease-cohort profile (gives module direction).
#   p2_prof      - the patient's profile (gives patient direction).
# Returns a list: encoding lengths p1.e/p2.e/p12.e, per-k direction-aware
# Jaccard dissimilarity (JAC), and normalized compression distance (NCD).
# NOTE(review): `ptID` and `ptID2` are read from the global environment, not
# passed in — confirm the caller sets them before invoking.
disFromDowntown = function(dis_mod, ptBSbyK.dis, p2.sig.nodes, p2.optBS, ranks, G, mn_prof, p2_prof) {
  # Encoding lengths of the disease module (p1) and the patient (p2) alone.
  p1.e = mle.getEncodingLength(ptBSbyK.dis, NULL, ptID, G)[,"IS.alt"]
  p2.e = mle.getEncodingLength(p2.optBS, NULL, ptID2, G)[,"IS.alt"]
  # Direction-aware Jaccard: nodes are tagged 1 (up) or 0 (not up) before
  # intersecting, so a shared metabolite only matches if it moves the same way.
  dirSim = vector("numeric", length = length(ptBSbyK.dis))
  for (k in 1:length(ptBSbyK.dis)) {
    p1.sn = dis_mod[1:k]
    p2.sn = p2.sig.nodes[1:k]
    p1.dirs = mn_prof[p1.sn]
    p1.dirs[which(!(p1.dirs > 0))] = 0
    p1.dirs[which(p1.dirs > 0)] = 1
    p2.dirs = p2_prof[p2.sn]
    p2.dirs[which(!(p2.dirs > 0))] = 0
    p2.dirs[which(p2.dirs > 0)] = 1
    p1.snn = sprintf("%s%d", p1.sn, p1.dirs)
    p2.snn = sprintf("%s%d", p2.sn, p2.dirs)
    dirSim[k] = 1 - (length(intersect(p1.snn, p2.snn))/length(union(p1.snn, p2.snn)))
  }
  # Using predefined node ranks, get optimal bitstring for encoding of patient1's union patient2's subsets.
  p12.sig.nodes = unique(c(dis_mod, p2.sig.nodes))
  p12.e = c()
  for (k in 1:length(ptBSbyK.dis)) {
    dis_mod_cpy = dis_mod
    p2.sig.nodes_cpy = p2.sig.nodes
    dis_mod_k = names(which(ptBSbyK.dis[[k]]==1))
    p2.sig.nodes_k = names(which(p2.optBS[[k]]==1))
    # Pad each size-k subset up to k nodes, drawing from the original lists.
    while (length(dis_mod_k)<k) {
      dis_mod_k = unique(c(dis_mod_k, dis_mod_cpy[1]))
      dis_mod_cpy = dis_mod_cpy[-1]
    }
    while (length(p2.sig.nodes_k)<k) {
      p2.sig.nodes_k = unique(c(p2.sig.nodes_k, p2.sig.nodes_cpy[1]))
      p2.sig.nodes_cpy = p2.sig.nodes_cpy[-1]
    }
    p12.sig.nodes_k = sapply(unique(c(dis_mod_k, p2.sig.nodes_k)), trimws)
    p12.optBS = mle.getPtBSbyK(p12.sig.nodes_k, ranks)
    res = mle.getEncodingLength(p12.optBS, NULL, NULL, G)
    # Joint encoding length: best-compressed prefix plus the uncompressed rest.
    p12.e[k] = res[which.max(res[,"d.score"]), "IS.alt"] + log2(choose(length(G), 1))*(length(p12.sig.nodes_k)-which.max(res[,"d.score"]))
  }
  return (list(p1.e=p1.e, p2.e=p2.e, p12.e=p12.e, JAC = dirSim, NCD=(p12.e-apply(cbind(p1.e, p2.e), 1, min))/apply(cbind(p1.e, p2.e), 1, max)))
}
# CTDdm scoring: for every disease model and leave-one-out network fold,
# score each patient against the model's "downtown" disease module (NCD and
# directional Jaccard) and save one df_disMod per (model, fold) in model.dir.
# FIX(review): "pa" appeared twice in the model vector; the second pass only
# recomputed and re-saved identical files, so the duplicate was removed.
for (model in c("aadc","cob","pa","abat", "adsl", "arg", "asld", "cit", "otc", "ga", "gamt", "msud", "mma", "zsd", "rcdp", "pku")) { #
  # Restart each model from the full (de-duplicated) matrix.
  data_mx=data_mx.0
  table(data_mx["argininosuccinate",])
  # Selected disease module
  # Mean cohort profile; gives the perturbation direction in disFromDowntown().
  mn = apply(data_mx[,which(colnames(data_mx) %in% cohorts[[model]])], 1, function(i) mean(na.omit(i)))
  mn = mn[-which(is.na(mn))]
  downtown_disease_module = disMod[[model]]
  # For each network fold, run CTDdm against several diagnoses
  for (fold in 1:length(cohorts[[model]])) {
    print(sprintf("Fold %d/%d...", fold, length(cohorts[[model]])))
    # ig = loadToEnv(sprintf("%s/bg_%s_fold%d.RData", graph.dir, model, fold))[["ig_pruned"]]
    ig = loadToEnv(system.file(sprintf("networks/ind_foldNets/bg_%s_fold%d.RData", model, fold),package = "CTDext"))[["ig_pruned"]]
    # ranks = loadToEnv(sprintf("%s/%s%d-ranks.RData",rank.dir, model, fold))[["permutationByStartNode"]]
    ranks = loadToEnv(system.file(sprintf("ranks/ind_ranks/%s%d-ranks.RData", toupper(model), fold),package = "CTDext"))[["permutationByStartNode"]]
    ranks = lapply(ranks, tolower)
    adjacency_matrix = list(as.matrix(get.adjacency(ig, attr="weight")))
    G = vector(mode="list", length=length(V(ig)$name))
    names(G) = V(ig)$name
    # NOTE(review): data_mx is re-subset inside the fold loop, so row/column
    # filtering accumulates across folds — confirm this is intended.
    data_mx = data_mx[which(rownames(data_mx) %in% V(ig)$name), ]
    data_mx = data_mx[,which(colnames(data_mx) %in% unlist(cohorts))]
    # Restrict the disease module to this fold's network, then keep the
    # best-explained (max d.score) connected sub-module.
    tmp.mm = unique(downtown_disease_module)
    tmp.mm = tmp.mm[which(tmp.mm %in% names(G))]
    ptBSbyK.dis = mle.getPtBSbyK(tmp.mm, ranks)
    # NOTE(review): ptID is read here before the patient loop first assigns
    # it — the first use relies on a leftover global; verify.
    res = mle.getEncodingLength(ptBSbyK.dis, NULL, ptID, G)
    downtown_disease_mod = names(which(ptBSbyK.dis[[which.max(res[,"d.score"])]]==1))
    ptBSbyK.dis = mle.getPtBSbyK(downtown_disease_mod, ranks)
    df_disMod = data.frame(ptID=character(), diag=character(), ctdDisMod=numeric(), jac=numeric(), stringsAsFactors = FALSE)
    for (p in 1:ncol(data_mx)) {
      #print(sprintf("Patient %d/%d...", p, ncol(data_mx)))
      ptID = colnames(data_mx)[p]
      # Known diagnosis if the sample belongs to any cohort, else "reference".
      if (ptID %in% unlist(cohorts)) {diag = names(cohorts)[which(unlist(lapply(cohorts, function(i) ptID %in% i)))]}else{diag = "reference"}
      # CTD: get module best explained by network's connectivity
      S = data_mx[order(abs(data_mx[,p]), decreasing = TRUE),p]
      p2.sig.nodes = names(S)[1:length(downtown_disease_mod)]
      p2.optBS = mle.getPtBSbyK(p2.sig.nodes, ranks)
      ctdDisMod = disFromDowntown(downtown_disease_mod, ptBSbyK.dis, p2.sig.nodes, p2.optBS, ranks, G, mn, S)
      #if (ptID=="X606789") {print(min(ctdDisMod$NCD))}
      df_disMod[p, "ptID"] = colnames(data_mx)[p]
      df_disMod[p, "diag"] = diag[1]
      df_disMod[p, "ctdDisMod"] = min(ctdDisMod$NCD)
      df_disMod[p, "jac"] = min(ctdDisMod$JAC)
    }
    save(df_disMod, file=sprintf("%s/ctdDisMod_%s_fold%d.RData",model.dir, model, fold))
  }
}
# Plain CTD scoring: for every disease model and fold, encode each patient's
# kmx most-perturbed metabolites and record per-k encoding statistics.
# FIX(review): the model vector contained an empty argument ("mma", ,"zsd"),
# which makes c() error with "argument is empty" at runtime; removed.
for (kmx in c(30)) {
  for (model in c( "aadc","cob","pa", "abat", "adsl", "arg", "asld", "cit", "otc", "ga", "gamt", "msud", "mma", "zsd", "rcdp", "pku")) { #
    # Restart from the full (de-duplicated) matrix.
    data_mx=data_mx.0
    # table(data_mx["argininosuccinate",])
    # For each network fold, runCTD against several diagnoses
    for (fold in 1:length(cohorts[[model]])) {
      # ig = loadToEnv(sprintf("%s/bg_%s_fold%d.RData", graph.dir, model, fold))[["ig_pruned"]]
      ig = loadToEnv(system.file(sprintf("networks/ind_foldNets/bg_%s_fold%d.RData", model, fold),package = "CTDext"))[["ig_pruned"]]
      # ranks = loadToEnv(sprintf("%s/%s%d-ranks.RData", rank.dir, model, fold))[["permutationByStartNode"]]
      ranks = loadToEnv(system.file(sprintf("ranks/ind_ranks/%s%d-ranks.RData", toupper(model), fold),package = "CTDext"))[["permutationByStartNode"]]
      ranks = lapply(ranks, tolower)
      adjacency_matrix = list(as.matrix(get.adjacency(ig, attr="weight")))
      G = vector(mode="list", length=length(V(ig)$name))
      names(G) = V(ig)$name
      # NOTE(review): data_mx is re-subset inside the fold loop, so filtering
      # accumulates across folds — confirm this is intended.
      data_mx = data_mx[which(rownames(data_mx) %in% V(ig)$name), ]
      data_mx = data_mx[,which(colnames(data_mx) %in% unlist(cohorts))]
      df_ctd = data.frame(ptID=character(), S=character(), lenS=numeric(), optT=numeric(), fishers=numeric(), I0=numeric(), IA=numeric(), d=numeric(), stringsAsFactors = FALSE)
      r=1
      ptBSbyK = list()
      for (p in 1:ncol(data_mx)) {
        ptID = colnames(data_mx)[p]
        # Known diagnosis if the sample belongs to any cohort, else "reference".
        if (ptID %in% unlist(cohorts)) {diag = names(cohorts)[which(unlist(lapply(cohorts, function(i) ptID %in% i)))]}else{diag = "reference"}
        # Top kmx most-perturbed metabolites for this patient.
        S = data_mx[order(abs(data_mx[,p]), decreasing = TRUE),p][1:kmx]
        ptBSbyK[[ptID]] = mle.getPtBSbyK(names(S), ranks)
        res = mle.getEncodingLength(ptBSbyK[[ptID]], NULL, ptID, G)
        #if (ptID=="X606789") {
        #  print(sprintf("Patient %d/%d...", p, ncol(data_mx)))
        #  print(max(res[,"d.score"]))
        #}
        # One row per subset size k with its encoding statistics.
        for (k in 1:kmx) {
          df_ctd[r, "ptID"] = colnames(data_mx)[p]
          df_ctd[r, "diag"] = diag[1]
          df_ctd[r, "S"] = paste(names(S)[1:k], collapse="/")
          df_ctd[r, "lenS"] = k
          df_ctd[r, "optT"] = res[k, "opt.T"]
          df_ctd[r, "I0"] = res[k, "IS.null"]
          df_ctd[r, "IA"] = res[k, "IS.alt"]
          df_ctd[r, "d"] = res[k, "d.score"]
          r = r + 1
        }
      }
      save(ptBSbyK, df_ctd, file=sprintf("%s/ctd_%s_fold%d_kmx%d.RData", model.dir, model, fold, kmx))
    }
  }
}
# Keep only directory paths and the cohort lists.
rm(list=setdiff(ls(),c(grep("dir|cohorts",ls(),value = T),lsf.str())))
# Merge per-patient CTD results (df_ctd) with disease-module scores
# (df_disMod) into one data frame per (model, fold, kmx), then delete the
# intermediate files.
# FIX(review): "cob" and "pa" each appeared twice in the model vector; the
# intermediate .RData files are removed at the end of each model's first
# pass, so the second pass crashed on load(). Duplicates removed.
for (model in c("aadc","cob","pa", "abat", "adsl", "arg", "asld", "cit", "otc", "ga", "gamt", "msud", "mma", "zsd", "rcdp", "pku")) { ##"
  for (fold in 1:length(cohorts[[model]])) {
    load(sprintf("%s/ctdDisMod_%s_fold%d.RData",model.dir, model, fold))
    for (kmx in c(30)) {
      load(sprintf("%s/ctd_%s_fold%d_kmx%d.RData", model.dir, model, fold, kmx))
      df = data.frame(ptID=character(), S=character(), lenS=numeric(), optT=numeric(), fishers=numeric(), I0=numeric(), IA=numeric(), d=numeric(), ctdDisMod=numeric(), jac=numeric(), stringsAsFactors = FALSE)
      r = 1
      # df_ctd has kmx rows per patient (one per subset size k); the
      # disease-module scores are constant across k for a given patient.
      for (p in 1:nrow(df_disMod)) {
        for (k in 1:kmx) {
          df[r, "ptID"] = df_ctd[r,"ptID"]
          df[r, "diag"] = df_ctd[r,"diag"]
          df[r, "S"] = df_ctd[r,"S"]
          df[r, "lenS"] = df_ctd[r,"lenS"]
          df[r, "optT"] = df_ctd[r,"optT"]
          df[r, "I0"] = df_ctd[r,"I0"]
          df[r, "IA"] = df_ctd[r,"IA"]
          df[r, "d"] = df_ctd[r,"d"]
          df[r, "ctdDisMod"] = df_disMod[p,"ctdDisMod"]
          df[r, "jac"] = df_disMod[p,"jac"]
          r = r + 1
        }
      }
      save(ptBSbyK, df, file=sprintf("%s/model_%s_fold%d_kmx%d.RData", model.dir, model, fold, kmx))
      system(sprintf("rm %s/ctd_%s_fold%d_kmx%d.RData", model.dir, model, fold, kmx))
    }
    system(sprintf("rm %s/ctdDisMod_%s_fold%d.RData", model.dir, model, fold))
  }
}
# LOOCV aggregation: per disease model, collapse the per-fold result tables
# into one leave-one-out table and save it as dd-<model>-loocv-kmx<k>.RData.
require(CTD)
require(R.utils)
require(R.utils)
require(CTD)
rm(list=setdiff(ls(),c(grep("dir|models",ls(),value = T),lsf.str())))
data_mx=apply(loadToEnv("clinical_data.RData")[["data_mx.ogBnry"]],c(1,2),as.numeric)
cohorts=loadToEnv("clinical_data.RData")[["cohorts"]]
# Drop the duplicated ALAIMO columns (kept under their EDTA aliases).
dupNames=loadToEnv("clinical_data.RData")[["dupNames"]]
data_mx=data_mx[,-which(colnames(data_mx) %in% dupNames[,1])]
data_mx=data_mx[rowSums(is.na(data_mx)) != ncol(data_mx),]
data_mx.0=data_mx
# Diagnostic: samples whose non-missing values are all zero.
which(apply(data_mx, 2, function(i) sum(na.omit(i)==0)/length(na.omit(i)))==1)
for (kmx in c(30)) { #
  for (model in c("aadc", "abat", "adsl", "arg", "asld", "cit", "otc", "cob", "ga", "gamt", "zsd", "rcdp", "msud", "mma", "pa", "pku")) { #
    df_model = data.frame(fold=numeric(), pt=numeric(), bits=numeric(), ctdDisMod=numeric(), jac=numeric(), diag=character(), stringsAsFactors = FALSE)
    for (fold in 1:length(cohorts[[model]])) {
      df = loadToEnv(sprintf("%s/model_%s_fold%d_kmx%d.RData", model.dir , model, fold, kmx))[["df"]]
      df = df[!is.na(df$ptID),]
      pts = unique(df$ptID)
      # Best (max d.score) encoding per patient within this fold; the
      # log2(#rows) term penalizes choosing the best k post hoc.
      df_best = data.frame(pt=numeric(), ptID=character(), bits=numeric(), diag=character(), ctdDisMod=numeric(), jac=numeric(), stringsAsFactors = FALSE)
      for (pt in 1:length(pts)) {
        pt_data = df[which(df$ptID==pts[pt]),]
        ptID = unique(df[which(df$ptID==pts[pt]), "ptID"])
        # For known cases, record the patient's index within the cohort so
        # the left-out fold can be identified below.
        if (pt_data[1,"diag"]==model) { df_best[pt, "pt"] = which(cohorts[[model]]==ptID) }
        df_best[pt, "ptID"] = ptID
        df_best[pt, "bits"] = max(df[which(df$ptID==pts[pt]), "d"])-log2(nrow(pt_data))
        df_best[pt, "jac"] = pt_data[1, "jac"]
        df_best[pt, "diag"] = unique(pt_data[,"diag"])
        df_best[pt, "ctdDisMod"] = pt_data[1, "ctdDisMod"]
      }
      df_best$bits[which(df_best$bits<0)] = 0
      df_best$fold = rep(fold, nrow(df_best))
      df_model = rbind(df_model, df_best)
    }
    # Do not do multiple hypothesis test correction here. 
    #df_model$bits = -log2(p.adjust(2^-(df_model$bits), method="fdr"))
    # loocv==1 marks a case scored by the fold that left it out; other
    # same-disease rows are dropped (their fold's network saw them).
    df_model$loocv = rep(0, nrow(df_model))
    df_model$loocv[which(df_model$pt==df_model$fold)] = 1
    df_model = df_model[-intersect(which(df_model$loocv==0), which(df_model$diag==model)), ]
    # Average each remaining patient's scores across folds.
    b_bits = cbind(unique(df_model$ptID), sapply(unique(df_model$ptID), function(i) mean(df_model[which(df_model$ptID==i),"bits"])))
    ctdDisMod_loocv = cbind(unique(df_model$ptID), sapply(unique(df_model$ptID), function(i) mean(df_model[which(df_model$ptID==i),"ctdDisMod"])))
    jac_loocv = cbind(unique(df_model$ptID), sapply(unique(df_model$ptID), function(i) mean(df_model[which(df_model$ptID==i),"jac"])))
    df_model = df_model[-which(duplicated(df_model$ptID)),]
    df_model = df_model[order(df_model$ptID),]
    b_bits = b_bits[order(b_bits[,1]),]
    ctdDisMod_loocv = ctdDisMod_loocv[order(ctdDisMod_loocv[,1]),]
    jac_loocv = jac_loocv[order(jac_loocv[,1]),]
    df_model$bits = as.numeric(b_bits[,2])
    df_model$ctdDisMod = as.numeric(ctdDisMod_loocv[,2])
    df_model$jac = as.numeric(jac_loocv[,2])
    print(any(is.na(df_model$bits)))
    print(any(is.na(df_model$ctdDisMod)))
    # NOTE(review): this overwrites the kmx loop variable with the number of
    # rows per patient (used only in the output filename) — confirm intended.
    kmx = nrow(pt_data)
    save(df_model, file=sprintf("%s/dd-%s-loocv-kmx%d.RData",model.dir, model, kmx))
    #sapply(1:length(cohorts[[model]]), function(i) system(sprintf("rm model/model_%s_fold%d_kmx%d.RData", model, i, kmx)))
  }
}
# Combine CTD and disease-module evidence with Brown's method for dependent
# p-values; work from inside the model directory for the rest of this chunk.
require(EmpiricalBrownsMethod) setwd(model.dir)
# The 16 single-disease diagnostic models, alphabetical.
models = c("aadc", "abat", "adsl", "arg", "asld", "cit", "cob", "ga", "gamt", "mma", "msud", "otc", "pa", "pku", "rcdp", "zsd")
# Load every per-model LOOCV result, convert each patient's scores to p-values,
# and assemble a per-patient ranking table across all 16 disease models.
ff = list.files(".", pattern=sprintf("loocv-kmx%d.RData", 30))
dff = loadToEnv(ff[1])[["df_model"]]
# Patient order is taken from the FIRST model's file; the per-model score
# vectors below are indexed positionally by `pt`, which assumes every model's
# df_model lists the same patients in the same order -- TODO confirm.
pts = dff$ptID
for (kmx in 30) {#, 30, 5
  ff = list.files(".", pattern=sprintf("loocv-kmx%d.RData", kmx))
  df_models = list()
  for (model in 1:length(models)) {
    df_model = loadToEnv(ff[model])[["df_model"]]
    # Raw (uncorrected) CTD p-value: bits hold -log2 p, clipped at 1.
    ctd_pval_uncorrected = ifelse(2^-df_model[, "bits"]>1, 1, 2^-df_model[, "bits"])
    # Bonferroni-corrected CTD p-value, stored back as -log2 p in bits2.
    df_model$bits2 = -log2(p.adjust(2^-(df_model$bits), method="bonferroni"))
    ctd_pval = ifelse(2^-df_model[, "bits2"]>1, 1, 2^-df_model[, "bits2"])
    # Empirical percentile of each patient's disease-module score.
    ctdDisMod_percentile = sapply(df_model[, "ctdDisMod"], function(i) length(which(df_model$ctdDisMod<=i))/nrow(df_model))
    # Brown's method combines the two (dependent) p-values per patient.
    combined_network = apply(cbind(ctd_pval, ctdDisMod_percentile), 1, function(i) empiricalBrownsMethod(data_matrix = rbind(df_model$bits, df_model$ctdDisMod), p_values = i, TRUE)$P_test)
    names(ctd_pval) = df_model$ptID
    names(ctdDisMod_percentile) = df_model$ptID
    names(combined_network) = df_model$ptID
    df_models[[models[model]]][["ctd_pval_uncorrected"]] = ctd_pval_uncorrected
    df_models[[models[model]]][["ctd_pval"]] = ctd_pval
    df_models[[models[model]]][["ctdDisMod_percentile"]] = ctdDisMod_percentile
    df_models[[models[model]]][["combined_network"]] = combined_network
    print(model)
  }
  # One ranking table per patient, one row per disease model.
  pt_ranks = list()
  for (pt in 1:length(pts)) {
    ptID = pts[pt]
    # ptID_coded = clnms_map[which(clnms_map[,1]==ptID), 2]
    ptID_coded = ptID
    for (pc in ptID_coded) {
      df_pt = data.frame(model=character(), ctd=numeric(), ctdDisMod=numeric(), brown.comb=numeric(), stringsAsFactors = FALSE)
      r = 1
      for (model in 1:length(models)) {
        ctd_pval = df_models[[models[model]]][["ctd_pval"]]
        ctdDisMod_percentile = df_models[[models[model]]][["ctdDisMod_percentile"]]
        combined_network = df_models[[models[model]]][["combined_network"]]
        # Model name recovered from the filename.
        df_pt[r, "model"] = gsub(sprintf("dd-|-loocv-kmx%d.RData", kmx), "", ff[model])
        # NOTE(review): positional indexing by pt (not by ptID name) -- relies
        # on identical patient ordering across all model files.
        df_pt[r, "ctd"] = ctd_pval[pt]
        df_pt[r, "ctdDisMod"] = ctdDisMod_percentile[pt]
        df_pt[r, "brown.comb"] = combined_network[pt] # Use Brown's combined for dependent tests
        r = r + 1
      }
      pt_ranks[[pc]] = df_pt
    }
  }
  save(pt_ranks, file=sprintf("ptRanks_kmx%d.RData", kmx))
}
setwd("../")
require(pROC)
require(pls)
require(EmpiricalBrownsMethod)
# Per-model AUCs: CTD alone, CTD+CTDdisMod (Brown's), and CTD+CTDdisMod+Jaccard.
# FIX(review): dff was declared with ctddismod.auc/jac.auc/combined.auc columns
# that were never assigned (those computations were commented out), while the
# assigned network.auc/all.auc columns were undeclared. The declaration now
# matches what is actually written.
dff = data.frame(kmx=numeric(), model=character(), ctd.auc=numeric(), network.auc=numeric(), all.auc=numeric(), stringsAsFactors = FALSE)
r = 1
for (model in c("aadc", "abat", "adsl", "arg", "asld", "cit", "cob", "ga", "gamt", "msud", "mma", "otc", "pa", "pku", "rcdp", "zsd")) {
  for (kmx in c(30)) {
    print(sprintf("MODEL %s...", model))
    load(sprintf("%s/dd-%s-loocv-kmx%d.RData", model.dir, model, kmx))
    # FIX(review): `x[-which(...), ]` drops EVERY row when there are no matches;
    # logical negation is a no-op in that case.
    df_model = df_model[!(df_model$diag %in% c("alaimo","maps")), ]
    # Bonferroni-correct the CTD p-values (df_model$bits holds -log2 p).
    df_model$bits = -log2(p.adjust(2^-(df_model$bits), method="bonferroni"))
    df_model$diag = gsub("test_", "", df_model$diag)
    print(toupper(model))
    print(dim(df_model))
    # 0/1 labels: 1 = diagnosed with this model's disease. (The original
    # `diags[-which(diags==model)] = 0` silently labeled nothing when the model
    # cohort was absent.)
    is_case = df_model$diag == model
    diags = df_model$diag
    diags[!is_case] = 0
    diags[is_case] = 1
    # AUC, CTD only
    ctd.auc = roc(diags, df_model$bits)
    print(ctd.auc$auc)
    # Per-patient p-values (hoisted: computed once, reused by both combinations).
    ctd_pvals = 2^-(df_model$bits)
    ctdDisMod_pvals = sapply(df_model$ctdDisMod, function(i) length(which(df_model$ctdDisMod<=i))/nrow(df_model))
    jac_pvals = sapply(df_model$jac, function(i) length(which(df_model$jac<=i))/nrow(df_model))
    # AUC, Network-combined: CTD+CTDdisMod via Brown's method
    combined_brown = apply(cbind(ctd_pvals, ctdDisMod_pvals), 1, function(i) empiricalBrownsMethod(data_matrix = rbind(df_model$bits, df_model$ctdDisMod), p_values = i, TRUE)$P_test)
    network.auc = roc(diags, combined_brown)
    print(network.auc$auc)
    # AUC, all of CTD+CTDdisMod+Jaccard, Brown's combined
    combined_brown = apply(cbind(ctd_pvals, ctdDisMod_pvals, jac_pvals), 1, function(i) empiricalBrownsMethod(data_matrix = rbind(df_model$bits, df_model$ctdDisMod, df_model$jac), p_values = i, TRUE)$P_test)
    all.auc = roc(diags, combined_brown)
    print(all.auc$auc)
    dff[r, "kmx"] = kmx
    dff[r, "model"] = model
    dff[r, "ctd.auc"] = ctd.auc$auc
    dff[r, "network.auc"] = network.auc$auc
    dff[r, "all.auc"] = all.auc$auc
    r = r + 1
  }
}
save(dff,file=sprintf("%s/AUC_kmx%s.RData",model.dir,kmx))
## Compare Usefulness of Information Mined in a Pathway Context to an Unbiased Network Context
``` {r pathway-vs-ctd-global}
# Pathway discrimination: PLS with metabolites in the urea cycle, used to
# diagnose patients with urea cycle defects.
# CTD global discrimination: CTD disease-specific models call "yes"/"no" based
# on the significance of a patient's top metabolite perturbations.
require(CTD)
require(R.utils)
require(pls)
require(pROC)
rm(list=setdiff(ls(),c(grep("dir|models",ls(),value = T),lsf.str())))
# Load clinical data once (the original re-read the file per object).
clin = loadToEnv("clinical_data.RData")
data_mx = apply(clin[["data_mx.ogBnry"]], c(1,2), as.numeric)
cohorts = clin[["cohorts"]]
data_mx = data_mx[,colnames(data_mx) %in% unlist(cohorts)]
data_mx = data_mx[rowSums(is.na(data_mx)) != ncol(data_mx),]
data_mx.0 = data_mx
# CTD hyperparameters.
kmx = 30
p0 = 0.1
p1 = 0.9
thresholdDiff = 0.01
refs = data_mx[,which(colnames(data_mx) %in% cohorts$edta_refs)]
# Restrict to the four urea-cycle cohorts plus EDTA reference samples.
cohorts = cohorts[-which(names(cohorts) %in% c("alaimo", "maps", "abat", "aadc", "adsl", "zsd", "rcdp", "test_arg", "test_asld", "test_otc", "cob", "ga", "gamt", "msud", "mma", "pa", "pku", "hep_refs"))]
data_mx = data_mx[,which(colnames(data_mx) %in% c(unlist(cohorts)))]
# Keep metabolites observed in at least 80% of samples.
fill.rate = apply(data_mx, 1, function(i) sum(is.na(i))/length(i))
data_mx = data_mx[which(fill.rate<0.20),]
data_mx = data_mx[,sort(colnames(data_mx))]
print(dim(data_mx))
table(data_mx["argininosuccinate",])
# Binarize argininosuccinate (any non-zero call -> 1).
data_mx["argininosuccinate", which(data_mx["argininosuccinate",]!=0)] = 1
# Urea-cycle pathway metabolites from the curated Arginine-Metabolism map,
# minus cofactors/currency metabolites and pathway-name nodes.
load(system.file(sprintf("extdata/RData/%s.RData", "Arginine-Metabolism"), package="CTDext"))
uc_pathway_mets = V(ig)$label[which(V(ig)$shape=="circle")]
uc_pathway_mets = uc_pathway_mets[-which(uc_pathway_mets %in% c("H2O", "FAD", "O2", "NADP+", "CO2", "ADP", "NADH", "ATP", "PPi", "H+", "Mg2+", "Ca2+", "FMN", "PLP", "AMP", "Mg2+", "Mn2+", "NAD+", "ARGININE METABOLISM AND UREA CYCLE", "Aspartate Glutamate Metabolism", "Proline Metabolism", "Polyamine Metabolism", "TCA Cycle", "Pi"))]
uc_pathway_mets = uc_pathway_mets[-which(uc_pathway_mets %in% c("NADPH", "Co2+", "NH3"))]
uc_pathway_metsAll = tolower(uc_pathway_mets)
uc_pathway_metsLocal = c("argininosuccinate", "citrulline", "arginine", "ornithine")
dff_all = data.frame(diag=character(), model=character(), sensitivity=numeric(), specificity=numeric(), stringsAsFactors = FALSE)
for (model in c("arg", "asld", "cit", "otc")) {
  print(sprintf("%s...", toupper(model)))
  # 0/1 diagnosis labels (1 = member of this model's cohort).
  diags = colnames(data_mx)
  diags[which(!(diags %in% cohorts[[model]]))] = 0
  diags[which(diags %in% cohorts[[model]])] = 1
  # Local pathway view: PLS on z-scores of the 4 key urea-cycle metabolites.
  uc_data = rbind(diags, data_mx)
  rownames(uc_data)[1] = "diag"
  uc_data = uc_data[which(rownames(uc_data) %in% c("diag", uc_pathway_metsLocal)),]
  uc_data = apply(uc_data, c(1,2), as.numeric)
  # Full pathway view: all curated urea-cycle metabolites.
  uc_data2 = rbind(diags, data_mx)
  rownames(uc_data2)[1] = "diag"
  uc_data2 = uc_data2[which(rownames(uc_data2) %in% c("diag", uc_pathway_metsAll)),]
  uc_data2 = apply(uc_data2, c(1,2), as.numeric)
  # All frequently detected molecules (fill rate >= 0.8); NAs imputed to row min.
  uc_data3 = rbind(diags, data_mx)
  rownames(uc_data3)[1] = "diag"
  fillrate = apply(uc_data3, 1, function(x) sum(!is.na(x))/dim(uc_data3)[2])
  uc_data3 = uc_data3[fillrate>=0.8,]
  uc_data3 = apply(uc_data3, c(1,2), as.numeric)
  # Replace NAs in a vector with its minimum observed value.
  NAasMin = function(x){
    if (sum(is.na(x))>0) { x[is.na(x)] = min(x, na.rm = T) }
    return(x)
  }
  for (r in 1:ncol(uc_data3)){
    uc_data3[,r] = NAasMin(uc_data3[,r])
  }
  # Data frame for CTD/CTDdisMod network-quantified variables.
  load(sprintf("%s/dd-%s-loocv-kmx%d.RData", model.dir, model, kmx))
  df_model = df_model[which(df_model$ptID %in% colnames(data_mx)),]
  df_model$bits = -log2(p.adjust(2^-(df_model$bits), method="bonferroni"))
  ctd_pvals = 2^-(df_model$bits)
  ctdDisMod_pvals = sapply(df_model$ctdDisMod, function(i) length(which(df_model$ctdDisMod<=i))/nrow(df_model))
  combined_brown = apply(cbind(ctd_pvals, ctdDisMod_pvals), 1, function(i) empiricalBrownsMethod(data_matrix = rbind(df_model$bits, df_model$ctdDisMod), p_values = i, TRUE)$P_test)
  # FIX(review): df2_ctd was commented out in the original but is used by the
  # PLS fit below, so the chunk failed with "object 'df2_ctd' not found".
  # Restored from the most recent commented-out version.
  # NOTE(review): assumes df_model rows align with colnames(data_mx)/diags order.
  df2_ctd = data.frame(combined=-log2(combined_brown), ctd=df_model$bits, ctddisMod=-log2(ctdDisMod_pvals), diag=as.numeric(diags))
  rownames(df2_ctd) = df_model$ptID
  # LOOCV PLS over the three per-fold feature sets.
  dff_model = data.frame(pls_pwyLocal=numeric(), pls_pwyAll=numeric(), pls_ctd_ctddm=numeric(), pls_allfreqmets=numeric(), stringsAsFactors = FALSE)
  for (it in 1:length(diags)) {
    isTrain = c(1:length(diags))[-it]
    # Pathway Z-score feature selection: Local
    model_res = plsr(diag~., data = as.data.frame(t(uc_data[,isTrain])))
    tst_data = uc_data[,-isTrain]
    model_tst = predict(model_res, ncomp=model_res$ncomp, newdata=as.data.frame(t(tst_data)))
    dff_model[it, "pls_pwyLocal"] = model_tst
    # Pathway Z-score feature selection: All
    model_res = plsr(diag~., data = as.data.frame(t(uc_data2[,isTrain])))
    tst_data = uc_data2[,-isTrain]
    if (length(which(is.na(tst_data))) >0) {
      tst_data[which(is.na(tst_data))] = min(na.omit(uc_data2[which(is.na(tst_data)),]))
    }
    model_tst = predict(model_res, ncomp=model_res$ncomp, newdata=as.data.frame(t(tst_data)))
    dff_model[it, "pls_pwyAll"] = model_tst
    # CTD and CTDdisMod as covariates
    ctd_res = plsr(diag~., data = df2_ctd[isTrain,])
    tst_data = as.matrix(df2_ctd[-isTrain,-which(colnames(df2_ctd)=="diag")])
    if (ncol(tst_data)==1) { colnames(tst_data) = "combined" }
    ctd_tst = predict(ctd_res, ncomp=ctd_res$ncomp, newdata=tst_data)
    dff_model[it, "pls_ctd_ctddm"] = ctd_tst
  }
  # All frequently detected molecules: use pls's built-in leave-one-out validation.
  model_res = plsr(diag~., data = as.data.frame(t(uc_data3)),validation = "LOO")
  dff_model[, "pls_allfreqmets"] = model_res$validation$pred[,,model_res$ncomp]
  pls_ctd_auc = roc(diags, dff_model$pls_ctd_ctddm)
  pls_pwyLocal_auc = roc(diags, dff_model$pls_pwyLocal)
  pls_pwyAll_auc = roc(diags, dff_model$pls_pwyAll)
  pls_allfreqmets_auc = roc(diags, dff_model$pls_allfreqmets)
  print(sprintf("PLS.ctd.ctddisMod AUC = %.3f", pls_ctd_auc$auc))
  print(sprintf("PLS.pathwayLocal AUC = %.3f", pls_pwyLocal_auc$auc))
  print(sprintf("PLS.pathwayAll AUC = %.3f", pls_pwyAll_auc$auc))
  print(sprintf("PLS.allfreq AUC = %.3f", pls_allfreqmets_auc$auc))
  # Drop duplicated specificity values per ROC curve; a (spec=1, sens=0) anchor
  # is appended for each curve when assembling dff2 below.
  ind = which(duplicated(pls_pwyLocal_auc$specificities))
  ind2 = which(duplicated(pls_pwyAll_auc$specificities))
  ind3 = which(duplicated(pls_ctd_auc$specificities))
  ind4 = which(duplicated(pls_allfreqmets_auc$specificities))
  dff2 = data.frame(diag=character(), model=character(), sensitivity=numeric(), specificity=numeric(), stringsAsFactors = FALSE)
  nrw = 4+length(pls_pwyLocal_auc$specificities[-ind]) + length(pls_pwyAll_auc$specificities[-ind2]) + length(pls_ctd_auc$specificities[-ind3]) + length(pls_allfreqmets_auc$specificities[-ind4])
  dff2[1:nrw, "diag"] = rep(model, nrw)
  # FIX(review): the "pls-allMets" label count used pls_ctd_auc with ind4 in the
  # original (copy-paste error), mis-sizing the label vector whenever the two
  # curves differed in length. It now uses pls_allfreqmets_auc, matching the
  # specificity/sensitivity vectors below.
  dff2[1:nrw, "model"] = c(rep("pls-pwyLocal", 1+length(pls_pwyLocal_auc$specificities[-ind])), rep("pls-pwyAll", 1+length(pls_pwyAll_auc$specificities[-ind2])), rep("plsctd", 1+length(pls_ctd_auc$specificities[-ind3])), rep("pls-allMets", 1+length(pls_allfreqmets_auc$specificities[-ind4])))
  dff2[1:nrw, "specificity"] = c(pls_pwyLocal_auc$specificities[-ind], 1, pls_pwyAll_auc$specificities[-ind2], 1, pls_ctd_auc$specificities[-ind3], 1, pls_allfreqmets_auc$specificities[-ind4], 1)
  dff2[1:nrw, "sensitivity"] = c(pls_pwyLocal_auc$sensitivities[-ind], 0, pls_pwyAll_auc$sensitivities[-ind2], 0, pls_ctd_auc$sensitivities[-ind3], 0, pls_allfreqmets_auc$sensitivities[-ind4], 0)
  dff_all = rbind(dff_all, dff2)
}
save(dff_all,file=sprintf("%s/pwy-vs-global_MJM_v1.RData",OP.dir))
require(ggplot2)
# Human-readable labels for plotting.
dff_all$model = c("Asa-Arg-Orn-Cit","Pathway","Network","Full profile")[match(dff_all$model,c("pls-pwyLocal","pls-pwyAll","plsctd","pls-allMets"))]
dff_all$model = factor(dff_all$model,levels = c("Pathway","Asa-Arg-Orn-Cit","Network","Full profile"))
dff_all$diag = c("Argininosuccinic aciduria","Arginase deficiency","Ornithine transcarbamylase deficiency","Citrullinemia")[match(dff_all$diag,c("asld","arg","otc","cit"))]
p = ggplot(dff_all) +
  geom_line(aes(x=1-specificity, y=sensitivity, colour=model), size=1.5) +
  facet_wrap(~diag) +
  theme_bw() +
  theme(legend.position = c(0.25,0.75),
        legend.title = element_blank(),
        legend.box.background = element_rect(colour = "black"),
        legend.background = element_blank(),
        legend.key.size = unit(0.1, "cm"),
        text = element_text(family = "Arial")) +
  xlab("specificity")
p
ggsave(p,file=sprintf("%s/pwy-vs-global_MJM_v1.pdf",OP.dir),width = 4,height=4, device = cairo_pdf)
ggsave(p,file=sprintf("%s/pwy-vs-global_MJM_v2.pdf",OP.dir),width = 4,height=4, device = cairo_pdf)
# FIX(review): the SVG outputs hardcoded "revisionOutput/" -- a directory this
# script never creates -- while passing OP.dir as an unused sprintf argument.
# They now write into OP.dir like the PDFs. svglite was also never loaded
# although its device function is referenced; made that dependency explicit.
require(svglite)
ggsave(p,file=sprintf("%s/pwy-vs-global_MJM_v1.svg",OP.dir),width = 6,height=6, device = svglite)
ggsave(p,file=sprintf("%s/pwy-vs-global_MJM_v2.svg",OP.dir),width = 6,height=6, device = svglite)
``` {r rank-diagnoses}
# Rank candidate diagnoses per patient from the per-model LOOCV p-values.
require(EmpiricalBrownsMethod)
# Reset the workspace, keeping only *dir*/models variables and functions.
rm(list=setdiff(ls(),c(grep("dir|models",ls(),value = T),lsf.str())))
# 16-model run: for each patient and each disease model, convert LOOCV scores to
# p-values and combine CTD, disease-module, and Jaccard evidence via Brown's method.
for (kmx in c(30)) {
  dff = data.frame(model=character(), ptID=character(), diag=character(), ctd_pval=numeric(), ctdDisMod_pval=numeric(), combined_pval=numeric(), stringsAsFactors = FALSE)
  r = 1
  print(sprintf("Kmx = %d", kmx))
  # Include "asld" and "otc" in the 16 model run, then comment them out for 14 model run.
  for (model in c("aadc", "abat", "adsl", "arg", "cit", "cob", "ga", "gamt", "msud", "mma", "pa", "pku", "rcdp", "zsd", "asld", "otc")) { #
    print(sprintf("MODEL %s...", model))
    load(sprintf("%s/dd-%s-loocv-kmx%d.RData", model.dir, model, kmx))
    # Bonferroni-correct the CTD scores (df_model$bits holds -log2 p-values;
    # the saved file is NOT pre-corrected -- the original comment here claimed
    # otherwise).
    df_model$bits = -log2(p.adjust(2^-(df_model$bits), method="bonferroni"))
    df_model$diag = gsub("test_", "", df_model$diag)
    # Exclude OTC and ASLD for the 14 model run. Only rank diagnosed patients
    # (not healthy control reference patients).
    # NOTE(review): `-which()` drops ALL rows when nothing matches -- safe only
    # while these cohorts are guaranteed present in df_model.
    df_model = df_model[-which(df_model$diag %in% c("maps", "hep_refs", "edta_refs")),] # "otc", "asld"
    # Clip CTD p-values at 1; empirical percentiles serve as p-values for the
    # disease-module and Jaccard scores.
    ctd_pvals = ifelse(2^-(df_model$bits)>1, 1, 2^-(df_model$bits))
    ctdDisMod_pvals = sapply(df_model$ctdDisMod, function(i) length(which(df_model$ctdDisMod<=i))/nrow(df_model))
    jac_pvals = sapply(df_model$jac, function(i) length(which(df_model$jac<=i))/nrow(df_model))
    #combined_hm = apply(cbind(ctd_pvals, ctdDisMod_pvals, jac_pvals), 1, function(i) mean(i))
    combined_brown = apply(cbind(ctd_pvals, ctdDisMod_pvals, jac_pvals), 1, function(i) empiricalBrownsMethod(data_matrix = rbind(df_model$bits, df_model$ctdDisMod, df_model$jac), p_values = i, TRUE)$P_test)
    #combined_fisher = apply(cbind(ctd_pvals, ctdDisMod_pvals, jac_pvals), 1, function(i) stats.fishersMethod(i))
    for (pt in 1:nrow(df_model)) {
      dff[r, "model"] = model
      dff[r, "ptID"] = df_model$ptID[pt]
      dff[r, "diag"] = df_model$diag[pt]
      dff[r, "ctd_pval"] = ctd_pvals[pt]
      dff[r, "ctdDisMod_pval"] = ctdDisMod_pvals[pt]
      dff[r, "combined_pval"] = combined_brown[pt]
      r = r + 1
    }
    # Saved after every model so partial progress survives interruption.
    save(dff,file=sprintf("%s/ptpval_kmx%s.RData",OP.dir,kmx))
  }
  # --- Diagnostic-ranking summary statistics over dff (still inside the kmx loop) ---
  # CTD only
  # Top 1 is correct diagnosis: fraction of patients whose lowest CTD p-value
  # belongs to their true diagnosis.
  top1 = sapply(unique(dff$ptID), function(i) dff[which(dff$ptID==i),][which.min(dff[which(dff$ptID==i), "ctd_pval"]),c("model", "diag")])
  top1 = sum(unlist(apply(top1, 2, function(i) i$model==i$diag)))/length(unique(dff$ptID))
  print(top1)
  # Top 3 is correct diagnosis
  top3 = sapply(unique(dff$ptID), function(i) dff[which(dff$ptID==i),][order(dff[which(dff$ptID==i), "ctd_pval"], decreasing = FALSE)[1:3],c("model", "diag")])
  top3 = sum(unlist(apply(top3, 2, function(i) i$diag[1] %in% i$model)))/length(unique(dff$ptID))
  print(top3)
  # Length of DD: per-patient count of models with p < 0.05 ("differential diagnosis" size).
  len_dd = unlist(sapply(unique(dff$ptID), function(i) length(which(dff[which(dff$ptID==i),"ctd_pval"]<0.05))))
  print(median(len_dd))
  print(quantile(len_dd, 0.05))
  print(quantile(len_dd, 0.95))
  # Percentage in DD: fraction of patients whose true diagnosis is among the significant models.
  dd = sapply(unique(dff$ptID), function(i) dff[which(dff$ptID==i),][which(dff[which(dff$ptID==i),"ctd_pval"]<0.05), c("model", "diag")])
  dd = sum(unlist(apply(dd, 2, function(i) i$diag[1] %in% i$model)))/length(unique(dff$ptID))
  print(dd)
  # CTDdisMod only -- same four statistics on the disease-module percentile p-values.
  # Top 1 is correct diagnosis
  top1 = sapply(unique(dff$ptID), function(i) dff[which(dff$ptID==i),][which.min(dff[which(dff$ptID==i), "ctdDisMod_pval"]),c("model", "diag")])
  top1 = sum(unlist(apply(top1, 2, function(i) i$model==i$diag)))/length(unique(dff$ptID))
  print(top1)
  # Top 3 is correct diagnosis
  top3 = sapply(unique(dff$ptID), function(i) dff[which(dff$ptID==i),][order(dff[which(dff$ptID==i), "ctdDisMod_pval"], decreasing = FALSE)[1:3],c("model", "diag")])
  top3 = sum(unlist(apply(top3, 2, function(i) i$diag[1] %in% i$model)))/length(unique(dff$ptID))
  print(top3)
  # Length of DD
  len_dd = unlist(sapply(unique(dff$ptID), function(i) length(which(dff[which(dff$ptID==i),"ctdDisMod_pval"]<0.05))))
  print(median(len_dd))
  print(quantile(len_dd, 0.05))
  print(quantile(len_dd, 0.95))
  # Percentage in DD
  dd = sapply(unique(dff$ptID), function(i) dff[which(dff$ptID==i),][which(dff[which(dff$ptID==i),"ctdDisMod_pval"]<0.05), c("model", "diag")])
  dd = sum(unlist(apply(dd, 2, function(i) i$diag[1] %in% i$model)))/length(unique(dff$ptID))
  print(dd)
  # Combined Network -- same four statistics on Brown's combined p-values.
  # Top 1 is correct diagnosis
  top1 = sapply(unique(dff$ptID), function(i) dff[which(dff$ptID==i),][which.min(dff[which(dff$ptID==i), "combined_pval"]),c("model", "diag")])
  top1 = sum(unlist(apply(top1, 2, function(i) i$model==i$diag)))/length(unique(dff$ptID))
  print(top1)
  # Top 3 is correct diagnosis
  top3 = sapply(unique(dff$ptID), function(i) dff[which(dff$ptID==i),][order(dff[which(dff$ptID==i), "combined_pval"], decreasing = FALSE)[1:3],c("model", "diag")])
  top3 = sum(unlist(apply(top3, 2, function(i) i$diag[1] %in% i$model)))/length(unique(dff$ptID))
  print(top3)
  # Length of DD
  len_dd = unlist(sapply(unique(dff$ptID), function(i) length(which(dff[which(dff$ptID==i),"combined_pval"]<0.05))))
  print(median(len_dd))
  print(quantile(len_dd, 0.05))
  print(quantile(len_dd, 0.95))
  # Percentage in DD
  dd = sapply(unique(dff$ptID), function(i) dff[which(dff$ptID==i),][which(dff[which(dff$ptID==i),"combined_pval"]<0.05), c("model", "diag")])
  dd = sum(unlist(apply(dd, 2, function(i) i$diag[1] %in% i$model)))/length(unique(dff$ptID))
  print(dd)
}
# Reduced run: OTC-diagnosed patients excluded from ranking.
# NOTE(review): unlike the previous run, dff is never save()d here -- results
# live only in memory; confirm that is intended.
for (kmx in c(30)) { #seq(30,30,5)
  dff = data.frame(model=character(), ptID=character(), diag=character(), ctd_pval=numeric(), ctdDisMod_pval=numeric(), combined_pval=numeric(), stringsAsFactors = FALSE)
  r = 1
  print(sprintf("Kmx = %d", kmx))
  # Include "asld" and "otc" in the 16 model run, then comment them out for 14 model run.
  # NOTE(review): "asld" is still in the model list below although the comment
  # describes a 14 model run -- confirm 15 models is intended.
  for (model in c("aadc", "abat", "adsl", "arg", "asld", "cit", "cob", "ga", "gamt", "msud", "mma", "pa", "pku", "rcdp", "zsd")) { #
    print(sprintf("MODEL %s...", model))
    load(sprintf("%s/dd-%s-loocv-kmx%d.RData", model.dir, model, kmx))
    # Bonferroni-correct the CTD scores (df_model$bits holds -log2 p-values).
    df_model$bits = -log2(p.adjust(2^-(df_model$bits), method="bonferroni"))
    df_model$diag = gsub("test_", "", df_model$diag)
    # Exclude OTC diagnoses and non-patient reference samples; only rank
    # diagnosed patients.
    df_model = df_model[-which(df_model$diag %in% c("maps", "hep_refs", "edta_refs", "otc")),]
    ctd_pvals = ifelse(2^-(df_model$bits)>1, 1, 2^-(df_model$bits))
    # Empirical percentiles serve as p-values for the module/Jaccard scores.
    ctdDisMod_pvals = sapply(df_model$ctdDisMod, function(i) length(which(df_model$ctdDisMod<=i))/nrow(df_model))
    jac_pvals = sapply(df_model$jac, function(i) length(which(df_model$jac<=i))/nrow(df_model))
    combined_brown = apply(cbind(ctd_pvals, ctdDisMod_pvals, jac_pvals), 1, function(i) empiricalBrownsMethod(data_matrix = rbind(df_model$bits, df_model$ctdDisMod, df_model$jac), p_values = i, TRUE)$P_test)
    for (pt in 1:nrow(df_model)) {
      dff[r, "model"] = model
      dff[r, "ptID"] = df_model$ptID[pt]
      dff[r, "diag"] = df_model$diag[pt]
      dff[r, "ctd_pval"] = ctd_pvals[pt]
      dff[r, "ctdDisMod_pval"] = ctdDisMod_pvals[pt]
      dff[r, "combined_pval"] = combined_brown[pt]
      r = r + 1
    }
  }
  # Combined Network -- ranking statistics for the reduced run (closes the kmx loop).
  # Top 1 is correct diagnosis
  top1 = sapply(unique(dff$ptID), function(i) dff[which(dff$ptID==i),][which.min(dff[which(dff$ptID==i), "combined_pval"]),c("model", "diag")])
  top1 = sum(unlist(apply(top1, 2, function(i) i$model==i$diag)))/length(unique(dff$ptID))
  print(top1)
  # Top 3 is correct diagnosis
  top3 = sapply(unique(dff$ptID), function(i) dff[which(dff$ptID==i),][order(dff[which(dff$ptID==i), "combined_pval"], decreasing = FALSE)[1:3],c("model", "diag")])
  top3 = sum(unlist(apply(top3, 2, function(i) i$diag[1] %in% i$model)))/length(unique(dff$ptID))
  print(top3)
  # Length of DD: per-patient count of models with combined p < 0.05.
  len_dd = unlist(sapply(unique(dff$ptID), function(i) length(which(dff[which(dff$ptID==i),"combined_pval"]<0.05))))
  print(median(len_dd))
  print(quantile(len_dd, 0.05))
  print(quantile(len_dd, 0.95))
  # Percentage in DD: fraction of patients whose true diagnosis is significant.
  dd = sapply(unique(dff$ptID), function(i) dff[which(dff$ptID==i),][which(dff[which(dff$ptID==i),"combined_pval"]<0.05), c("model", "diag")])
  dd = sum(unlist(apply(dd, 2, function(i) i$diag[1] %in% i$model)))/length(unique(dff$ptID))
  print(dd)
}
# Specificity check: run all 16 models over the healthy reference samples only
# (hep_refs / edta_refs); a well-calibrated pipeline should call few of them.
dff = data.frame(model=character(), ptID=character(), diag=character(), ctd_pval=numeric(), ctdDisMod_pval=numeric(), combined_pval=numeric(), stringsAsFactors = FALSE)
r = 1
for (model in c("aadc", "abat", "adsl", "arg", "asld", "cit", "cob", "ga", "gamt", "msud", "mma", "otc", "pa", "pku", "rcdp", "zsd")) {
  print(sprintf("MODEL %s...", model))
  load(sprintf("%s/dd-%s-loocv-kmx30.RData", model.dir, model))
  # Bonferroni-correct the CTD scores (df_model$bits holds -log2 p-values).
  df_model$bits = -log2(p.adjust(2^-df_model$bits, method="bonferroni"))
  df_model$diag = gsub("test_", "", df_model$diag)
  # Keep ONLY the reference (control) samples this time.
  df_model = df_model[which(df_model$diag %in% c("hep_refs", "edta_refs")),]
  ctd_pvals = ifelse(2^-(df_model$bits)>1, 1, 2^-(df_model$bits))
  # Empirical percentiles serve as p-values for the module/Jaccard scores.
  ctdDisMod_pvals = sapply(df_model$ctdDisMod, function(i) length(which(df_model$ctdDisMod<=i))/nrow(df_model))
  jac_pvals = sapply(df_model$jac, function(i) length(which(df_model$jac<=i))/nrow(df_model))
  combined_brown = apply(cbind(ctd_pvals, ctdDisMod_pvals, jac_pvals), 1, function(i) empiricalBrownsMethod(data_matrix = rbind(df_model$bits, df_model$ctdDisMod, df_model$jac), p_values = i, TRUE)$P_test)
  for (pt in 1:nrow(df_model)) {
    dff[r, "model"] = model
    dff[r, "ptID"] = df_model$ptID[pt]
    dff[r, "diag"] = df_model$diag[pt]
    dff[r, "ctd_pval"] = ctd_pvals[pt]
    dff[r, "ctdDisMod_pval"] = ctdDisMod_pvals[pt]
    dff[r, "combined_pval"] = combined_brown[pt]
    r = r + 1
  }
}
# Differential-diagnosis length per reference sample: how many of the 16 models
# call a given healthy control significant (combined p < 0.05). Report the
# median and the 5th/95th percentiles.
len_dd = vapply(unique(dff$ptID),
                function(pid) length(which(dff[which(dff$ptID == pid), "combined_pval"] < 0.05)),
                numeric(1))
median(len_dd)
quantile(len_dd, 0.05)
quantile(len_dd, 0.95)
## Zellweger Spectrum Disorders: PEX1 (Peroxisome Biogenesis disorders) vs. PEX7 (RCDP)
``` {r single-node_ptSim}
rm(list=setdiff(ls(),c(grep("dir|models",ls(),value = T),lsf.str())))
# Compute the patient similarity between negative controls, true cases and the "unknown disease" patient cohort.
require(R.utils)
require(CTD)
mkdirs("ptSim")
# CTD hyperparameters.
thresholdDiff=0.01
kmx=30
p0=0.1
p1=0.9
data_mx=apply(loadToEnv("clinical_data.RData")[["data_mx.og"]],c(1,2),as.numeric)
cohorts=loadToEnv("clinical_data.RData")[["cohorts"]]
data_mx=data_mx[,colnames(data_mx) %in% unlist(cohorts)]
data_mx=data_mx[rowSums(is.na(data_mx)) != ncol(data_mx),]
data_mx.0=data_mx
refs = data_mx[,which(colnames(data_mx) %in% cohorts$edta_refs)]
data_mx = data_mx[,which(colnames(data_mx) %in% unlist(cohorts))]
for (model in c("rcdp", "zsd")) {
  for (fold in 1:length(cohorts[[model]])) {
    # Load pre-computed node ranks (run on cluster in parallel and collated)
    # ranks = loadToEnv(sprintf("%s/%s%d-ranks.RData", rank.dir, toupper(model), fold))[["permutationByStartNode"]]
    ranks = loadToEnv(system.file(sprintf("ranks/ind_ranks/%s%d-ranks.RData", toupper(model), fold),package = "CTDext"))[["permutationByStartNode"]]
    ig = loadToEnv(system.file(sprintf("networks/ind_foldNets/bg_%s_fold%d.RData", model, fold),package = "CTDext"))[["ig_pruned"]]
    # ig = loadToEnv(sprintf("%s/bg_%s_fold%d.RData", graph.dir, model, fold))[["ig_pruned"]]
    ranks = lapply(ranks, tolower)
    # Name each permutation by its start node.
    names(ranks) = as.character(lapply(ranks, function(i) i[[1]][1]))
    adjacency_matrix = list(as.matrix(get.adjacency(ig, attr="weight")))
    # NOTE(review): data_mx is overwritten here inside the fold loop, so each
    # successive fold/model intersects against an already-shrunk matrix --
    # confirm this cumulative filtering is intended.
    data_mx = data_mx[which(rownames(data_mx) %in% V(ig)$name), ]
    # Get patient bitstrings ahead of time
    ptBSbyK = list()
    for (pt in 1:ncol(data_mx)) {
      print(pt)
      ptID = colnames(data_mx)[pt]
      # Top-kmx most perturbed metabolites seed the bitstring search.
      S1 = rownames(data_mx)[order(abs(data_mx[,ptID]), decreasing = TRUE)][1:kmx]
      ptBSbyK[[ptID]] = mle.getPtBSbyK(S1, ranks)
    }
    # res[[k]] holds the pairwise NCD and directional-similarity matrices at
    # bitstring size k. (NOTE: local `t` shadows base::t within this scope.)
    res = list()
    t = list(ncd=matrix(NA, nrow=ncol(data_mx), ncol=ncol(data_mx)), jac=matrix(NA, nrow=ncol(data_mx), ncol=ncol(data_mx)))
    rownames(t$ncd) = colnames(data_mx)
    colnames(t$ncd) = colnames(data_mx)
    rownames(t$jac) = colnames(data_mx)
    colnames(t$jac) = colnames(data_mx)
    for (i in 1:(kmx-1)) { res[[i]] = t }
    # Fill the symmetric distance matrices over all unordered patient pairs.
    for (pt in 1:(ncol(data_mx)-1)) {
      print(pt)
      ptID = colnames(data_mx)[pt]
      for (pt2 in (pt+1):ncol(data_mx)) {
        print(sprintf(" vs. %d...", pt2))
        ptID2 = colnames(data_mx)[pt2]
        tmp = mle.getPtDist(ptBSbyK[[ptID]], ptID, ptBSbyK[[ptID2]], ptID2, data_mx, ranks)
        for (k in 1:(kmx-1)) {
          res[[k]]$ncd[ptID, ptID2] = tmp$NCD[k]
          res[[k]]$jac[ptID, ptID2] = tmp$dirSim[k]
          res[[k]]$ncd[ptID2, ptID] = tmp$NCD[k]
          res[[k]]$jac[ptID2, ptID] = tmp$dirSim[k]
        }
      }
    }
    # Normalize rows by max value size k
    res_norm = res
    for (k in 1:(kmx-1)) {
      diag(res_norm[[k]]$ncd) = 0
      diag(res_norm[[k]]$jac) = 0
      res_norm[[k]]$ncd = res_norm[[k]]$ncd/max(na.omit(res_norm[[k]]$ncd))
    }
    # NOTE(review): saved under ptSim/ but the next chunk loads from
    # ptSim/kmx30/ -- confirm files are relocated between chunks.
    save(res, res_norm, file=sprintf("ptSim/ptSim_%s%d_kmx%d.RData", model, fold, kmx))
  }
}
# For RCDP and ZSD similarity matrices, take the min patient distance for each k, and then across k, for each pairwise patient comparison. You will end up with one similarity matrix.
# Collapse the per-fold similarity matrices into one matrix per disease, embed
# with MDS, and compare k-means cluster purity of the CTD-NCD distance against
# euclidean and mahalanobis baselines.
rm(list=setdiff(ls(),c(grep("dir|models",ls(),value = T),lsf.str())))
require(CTD)
require(R.utils)
data_mx=apply(loadToEnv("clinical_data.RData")[["data_mx.og"]],c(1,2),as.numeric)
cohorts=loadToEnv("clinical_data.RData")[["cohorts"]]
data_mx=data_mx[,colnames(data_mx) %in% unlist(cohorts)]
data_mx=data_mx[rowSums(is.na(data_mx)) != ncol(data_mx),]
data_mx.0=data_mx
kmx=30
# Element-wise mean over a list of equally-sized similarity matrices.
# FIX(review): the original always looped `2:length(allSimMatrices)`, which
# indexes out of bounds for a single-element list (2:1 yields c(2, 1)); the
# accumulation is now guarded so a length-1 list returns its only matrix.
mle.getMeanPtDistance = function (allSimMatrices) {
  ptSim = allSimMatrices[[1]]
  if (length(allSimMatrices) > 1) {
    for (ind in 2:length(allSimMatrices)) {
      ptSim = ptSim + allSimMatrices[[ind]]
    }
  }
  ptSim = ptSim/length(allSimMatrices)
  return(ptSim)
}
all_models_list = list()
for (model in c("rcdp", "zsd")) {
  print(model)
  model_list = list()
  for (fold in 1:length(cohorts[[model]])) {
    print(fold)
    # NOTE(review): loads from ptSim/kmx30/ although the previous chunk saved
    # into ptSim/ -- confirm the files are moved between chunks.
    res = loadToEnv(sprintf("ptSim/kmx%d/ptSim_%s%d_kmx%d.RData", kmx, model, fold, kmx))[["res"]]
    res = lapply(res, function(i) i$ncd)
    # mle.getMinPtDistance comes from the CTD package: min distance across k.
    model_list[[fold]] = mle.getMinPtDistance(res)
  }
  res_ncd = mle.getMeanPtDistance(model_list)
  diag(res_ncd) = 0
  res_ncd = res_ncd / max(res_ncd)
  # Recover patient names from the first fold's matrices.
  res = loadToEnv(sprintf("ptSim/kmx%d/ptSim_%s%d_kmx%d.RData", kmx, model, 1, kmx))[["res"]]
  colnames(res_ncd) = colnames(res[[1]]$ncd)
  rownames(res_ncd) = colnames(res[[1]]$ncd)
  all_models_list[[model]] = res_ncd
}
save(all_models_list, file=sprintf("ptSim/all_models_list_kmx%d.RData", kmx))
# --- MDS + k-means on the combined CTD-NCD distance ---
rm(list=setdiff(ls(),c(grep("dir|models",ls(),value = T),lsf.str())))
require(CTD)
require(R.utils)
# NOTE(review): plot_ly()/orca() below come from plotly, which was never
# loaded; made the dependency explicit.
require(plotly)
data_mx=apply(loadToEnv("clinical_data.RData")[["data_mx.og"]],c(1,2),as.numeric)
cohorts=loadToEnv("clinical_data.RData")[["cohorts"]]
data_mx=data_mx[,colnames(data_mx) %in% unlist(cohorts)]
data_mx=data_mx[rowSums(is.na(data_mx)) != ncol(data_mx),]
data_mx.0=data_mx
kmx=30
load(sprintf("ptSim/all_models_list_kmx%d.RData", kmx))
# ZSD patients aged 10 or under in the Wangler cohort.
wangler_ages10_andDown = c("EDTA-ZSD-1", "EDTA-ZSD-2", "EDTA-ZSD-3", "EDTA-ZSD-7", "EDTA-ZSD-9", "EDTA-ZSD-10", "EDTA-ZSD-11", "EDTA-ZSD-13", "EDTA-ZSD-14", "EDTA-ZSD-17", "EDTA-ZSD-18")
# Drop unidentified ("x - ") metabolites.
data_mx = data_mx[-grep("x - ", rownames(data_mx)),]
refs = data_mx[,which(colnames(data_mx) %in% cohorts$edta_refs)]
res_overall = mle.getMeanPtDistance(all_models_list)
diag(res_overall) = 0
res_overall = res_overall / max(res_overall)
colnames(res_overall) = colnames(all_models_list[[1]])
ind = which(colnames(res_overall) %in% unlist(cohorts[c("zsd", "rcdp", "edta_refs")]))
res_overall = res_overall[ind, ind]
# Group labels for plotting/clustering.
diags = colnames(res_overall)
diags[which(diags %in% wangler_ages10_andDown)] = "ZSD (<10 y.o.)"
diags[which(diags %in% cohorts$zsd)] = "ZSD (10+ y.o.)"
diags[which(diags %in% cohorts$rcdp)] = "RCDP"
diags[which(diags %in% cohorts$edta_refs)] = "REF"
# 2-D multidimensional scaling of the distance matrix.
fitSim = cmdscale(res_overall, eig = FALSE, k = 2)
x = round(fitSim[, 1], 2)
y = round(fitSim[, 2], 2)
df = data.frame(x = x, y = y, color = diags, label = colnames(res_overall))
p = plot_ly(df, x = ~x, y = ~y, color = ~color, text = ~label, marker = list(size = 20))
orca(p, sprintf("ptSim/CTDncd_origZscore_kmx%d.svg", kmx), format = "svg")
# Get UNK patients that cluster with PEX1 or PEX7 samples
# K-means seeded at the per-group centroids.
zsd.centroid = apply(res_overall[grep("ZSD", diags),], 2, mean)
rcdp.centroid = apply(res_overall[which(diags=="RCDP"),], 2, mean)
ref.centroid = apply(res_overall[which(diags=="REF"),], 2, mean)
kmns = kmeans(res_overall, centers = rbind(zsd.centroid, rcdp.centroid, ref.centroid))
df = cbind(as.numeric(kmns$cluster), diags, colnames(all_models_list[[1]])[ind])
colnames(df) = c("cluster.num", "diag", "ptID")
zsd.cluster = df[which(df[,1]=="1"), ]
rcdp.cluster = df[which(df[,1]=="2"), ]
ref.cluster = df[which(df[,1]==3), ]
# Fraction of samples in the majority class of their assigned cluster.
clusterPurity <- function(clusters, classes) {
  sum(apply(table(classes, clusters), 2, max)) / length(clusters)
}
clusterPurity(df[,1], df[,2])
# --- Baseline 1: euclidean distance + MDS ---
# NOTE(review): this rm() deletes diags, res_overall and ind (functions survive
# via lsf.str()), yet all three are referenced again below -- as written this
# section errors unless those objects are recreated first. Flagged, not
# changed, since the intended recomputation is not visible here.
rm(list=setdiff(ls(),c(grep("dir|models",ls(),value = T),lsf.str())))
data_mx=apply(loadToEnv("clinical_data.RData")[["data_mx.og"]],c(1,2),as.numeric)
cohorts=loadToEnv("clinical_data.RData")[["cohorts"]]
data_mx=data_mx[,colnames(data_mx) %in% unlist(cohorts)]
data_mx=data_mx[rowSums(is.na(data_mx)) != ncol(data_mx),]
data_mx.0=data_mx
data_mx = data_mx[,which(colnames(data_mx) %in% c(cohorts$zsd, cohorts$rcdp, cohorts$edta_refs))]
ptSim_euc = as.matrix(dist(t(data_mx), method="euclidean"))
fitSim = cmdscale(ptSim_euc, eig = FALSE, k = 2)
x = round(fitSim[, 1], 2)
y = round(fitSim[, 2], 2)
df = data.frame(x = x, y = y, color = diags, label = colnames(res_overall))
p_euc = plot_ly(df, x = ~x, y = ~y, color = ~color, text = ~label, marker = list(size = 20))
orca(p_euc, sprintf("ptSim/zsd_rcdp_euc_origZscore.svg"), format = "svg")
zsd.centroid = apply(ptSim_euc[which(colnames(ptSim_euc) %in% cohorts$zsd),], 2, mean)
rcdp.centroid = apply(ptSim_euc[which(colnames(ptSim_euc) %in% cohorts$rcdp),], 2, mean)
ref.centroid = apply(ptSim_euc[which(colnames(ptSim_euc) %in% cohorts$edta_refs),], 2, mean)
kmns = kmeans(ptSim_euc, centers = rbind(zsd.centroid, rcdp.centroid, ref.centroid))
df = cbind(as.numeric(kmns$cluster), diags, colnames(all_models_list[[1]])[ind])
colnames(df) = c("cluster.num", "diag", "ptID")
zsd.cluster = df[which(df[,1]=="1"), ]
rcdp.cluster = df[which(df[,1]=="2"), ]
ref.cluster = df[which(df[,1]==3), ]
clusterPurity <- function(clusters, classes) {
  sum(apply(table(classes, clusters), 2, max)) / length(clusters)
}
clusterPurity(df[,1], df[,2])
# --- Baseline 2: mahalanobis distance + basic plot ---
require(CTDext)
data_mx = data.imputeData(data_mx, data_mx[,which(colnames(data_mx) %in% cohorts$edta_refs)])
ptSim_mah = mahalanobis(t(data_mx), center=colMeans(data_mx), cov(t(data_mx)), inverted = FALSE, tol=1e-50)
ggplot() + geom_point(aes(1:ncol(data_mx), ptSim_mah, colour=diags))
# Visualize main disease module for ZSD and RCDP
# (Next chunk's preamble; it continues beyond this file view.)
rm(list=setdiff(ls(),c(grep("dir|models",ls(),value = T),lsf.str())))
require(R.utils)
load(system.file("shiny-app/disMod_July2020.RData", package = "CTDext")) # Selected disease module for ZSD mets = disMod$zsd ig = loadToEnv(system.file("networks/ind_foldNets/bg_zsd_fold1.RData",package = "CTDext"))[["ig_pruned"]] ig = subgraph(ig, v=mets) E(ig)$weight = abs(E(ig)$weight)*100 E(ig)$color = "#6a0dad" coords = layout.fruchterman.reingold(ig) png("ptSim/zsd-blowout_origZscore.png") plot(ig, layout=coords, edge.width=E(ig)$weight) dev.off() V(ig)$name = rep("", length(V(ig)$name)) svg("ptSim/zsd-blowout_origZscore.svg") plot(ig, layout=coords, edge.width=E(ig)$weight) dev.off() # Selected disease module for RCDP mets = disMod$rcdp ig = loadToEnv(system.file("networks/ind_foldNets/bg_rcdp_fold3.RData")[["ig_pruned"]] ig = subgraph(ig, v=mets[which(mets %in% V(ig)$name)]) E(ig)$weight = abs(E(ig)$weight)*100 E(ig)$color = "#228B22" coords = layout.fruchterman.reingold(ig) png("ptSim/rcdp-blowout_origZscore.png") plot(ig, layout=coords, edge.width=E(ig)$weight) dev.off() V(ig)$name = rep("", length(V(ig)$name)) svg("ptSim/rcdp-blowout_origZscore.svg") plot(ig, layout=coords, edge.width=E(ig)$weight) dev.off()
# ---------------------------------------------------------------------------
# Build a per-disease performance table (sensitivity / specificity / accuracy
# under top-1 and top-3 ranking rules, plus AUC) from the combined p-values in
# `dff` (loaded from OP/ptpval_kmx30.RData), write it to Excel, and plot the
# top-1 metrics as faceted bar charts.
# ---------------------------------------------------------------------------
require(pROC)
require(R.utils)
require(caret)   # confusionMatrix(); also attaches ggplot2 used below
require(dplyr)
kmx=30
# Loads `dff`: one row per (patient, model) with columns incl. ptID, model,
# diag, combined_pval.
load(sprintf("%s/ptpval_kmx%s.RData",OP.dir,kmx))
# For each patient, the model with the smallest combined p-value (top-1 call).
top1 = sapply(unique(dff$ptID), function(i) dff[which(dff$ptID==i),][which.min(dff[which(dff$ptID==i), "combined_pval"]),c("model", "diag")])
correct.top1=unlist(apply(top1, 2, function(i) i$model==i$diag))
top1=as.data.frame(t(top1))
top1=as.data.frame(apply(top1,2,unlist))
# For each patient, the three models with the smallest combined p-values.
top3 = sapply(unique(dff$ptID), function(i) dff[which(dff$ptID==i),][order(dff[which(dff$ptID==i), "combined_pval"], decreasing = FALSE)[1:3],c("model", "diag")])
top3=as.data.frame(top3)
correct.top3=unlist(apply(top3, 2, function(i) i$diag[1] %in% i$model))
all.models=c("aadc", "abat", "adsl", "arg", "cit", "cob", "ga", "gamt", "msud", "mma", "pa", "pku", "rcdp", "zsd", "asld", "otc")
# One row per disease model; AUC filled in from a separate file at the end.
df.ModelMetrics=data.frame(matrix(ncol = 8,nrow=length(all.models)),stringsAsFactors = F)
rownames(df.ModelMetrics)=all.models
colnames(df.ModelMetrics)=c("Model","Sensitivity.top1","Specificity.top1","Accuracy.top1","Sensitivity.top3","Specificity.top3","Accuracy.top3","AUC")
options(digits=3)
for (model in all.models) {
  # # Combined Network
  # Top 1 is correct diagnosis
  # One row per patient for this model's p-values.
  df=dff[dff$model==model,]
  df=unique(df)
  df=df[match(unique(dff$ptID),df$ptID),]
  # CP.v: condition-positive vector (patient truly has this diagnosis).
  CP.v=df$diag==model
  AUC=roc(response=CP.v,predictor = df$combined_pval)
  # PP.v: predicted-positive under each ranking rule.
  PP.top1.v=top1$model==model
  PP.top3.v = sapply(unique(dff$ptID),function(x) model %in% top3[[x]]$model)
  conf_mx.top1 = confusionMatrix(data=factor(PP.top1.v,levels = c("TRUE","FALSE")),reference = factor(CP.v,levels = c("TRUE","FALSE")))
  # roc.top1=roc(response=as.numeric(CP.v),predictor = as.numeric(PP.top1.v))
  conf_mx.top3 = confusionMatrix(data=factor(PP.top3.v,levels = c("TRUE","FALSE")),reference = factor(CP.v,levels = c("TRUE","FALSE")))
  # roc.top3=roc(response=as.numeric(CP.v),predictor = as.numeric(PP.top3.v))
  # AUC column left NA here; replaced wholesale after the loop.
  df.ModelMetrics[model,]=c(toupper(model),
                            conf_mx.top1$byClass[c("Sensitivity","Specificity")],
                            conf_mx.top1$overall["Accuracy"],
                            # sprintf("%.3f(%.3f-%.3f)",
                            #         conf_mx.top1$overall["Accuracy"],
                            #         conf_mx.top1$overall["AccuracyLower"],
                            #         conf_mx.top1$overall["AccuracyUpper"]),
                            # roc.top1$auc,
                            conf_mx.top3$byClass[c("Sensitivity","Specificity")],
                            conf_mx.top3$overall["Accuracy"],
                            # sprintf("%.3f(%.3f-%.3f)",
                            #         conf_mx.top3$overall["Accuracy"],
                            #         conf_mx.top3$overall["AccuracyLower"],
                            #         conf_mx.top3$overall["AccuracyUpper"]),
                            # roc.top3$auc
                            NA )
}
# Pull precomputed network AUCs and attach by (uppercased) model name.
AUC.df=loadToEnv(sprintf("%s/AUC_kmx%s.RData",model.dir,kmx))[["dff"]]
df.ModelMetrics$AUC=AUC.df$network.auc[match(df.ModelMetrics$Model,toupper(AUC.df$model))]
# Everything except the Model column back to numeric (c() above coerced to character).
df.ModelMetrics[,!grepl("Model",colnames(df.ModelMetrics))]=apply(df.ModelMetrics[,!grepl("Model",colnames(df.ModelMetrics))],c(1,2),as.numeric)
require(openxlsx)
write.xlsx(df.ModelMetrics,file="revisionOutput/ModelMetrics.xlsx")
# write.table(df.ModelMetrics,file="revisionOutput/ModelMetrics.tsv",sep = "\t",quote = F)
# plot model performance metrics
require(tidyr)
require(reshape2)
# Order bars by top-1 sensitivity.
levels.fixed=df.ModelMetrics$Model[order(df.ModelMetrics$Sensitivity.top1,decreasing = F)]
df2 = df.ModelMetrics[order(df.ModelMetrics$Sensitivity.top1,decreasing = T),]
# df2 = df2[,!grepl("Accuracy",colnames(df2))]
# Long format; column names like "Sensitivity.top1" split into metric + rule.
df2 = df2 %>% melt(id=("Model")) %>% separate(col = "variable",into = c("metrics","rule"),sep = "[[:punct:]]+",remove = TRUE,convert = FALSE)
df2$Model=factor(df2$Model,levels = levels.fixed)
df2$value=as.numeric(df2$value)
df2$metrics=factor(df2$metrics,levels = c("Sensitivity","Specificity","Accuracy","AUC"))
# df2=data.frame(xaxis=rep(df$xaxis,3),
#                value=c(df$Score2.mean.SharedEdges.,df$AUC.top1,df$AUC.top3),
#                metrics=as.vector(sapply(c("Stablity","AUC.top1","AUC.top3"),function(x) rep(x,dim(df)[1]))))
# Plot only top-1 rows and non-AUC metrics, one facet per metric.
p=ggplot(data=df2[(!grepl("top3",df2$rule)) & !grepl("AUC",df2$metrics),],aes_string(x="Model",y="value")) +
  theme_minimal()+
  facet_wrap(~metrics)+
  geom_bar(stat="identity", position=position_dodge(), width = 0.7)+
  coord_flip()+
  theme(legend.position = "top",legend.title = element_blank(),panel.spacing = unit(1, "lines"))+
  labs(fill="Metrics rule: ")
p
ggsave(p,filename = sprintf("%s/ModelMetrics2.pdf", OP.dir), width = 4, height = 4, device = cairo_pdf)
require(svglite)
ggsave(p,filename = sprintf("%s/ModelMetrics2.svg", OP.dir), width = 6, height = 4, device = svglite)
# ---------------------------------------------------------------------------
# Network stability: for each disease, compare every pair of leave-one-out
# fold networks by counting shared edges (graph intersection) vs. total edges
# (graph union). Score1 normalizes shared edges by the all-fold union; Score2
# normalizes per pair. Then plot Score2 per disease, ordered by cohort size.
# ---------------------------------------------------------------------------
require(R.utils)
require(CTD)   # provides igraph operators %u% (union) and %s% (intersection)
score_mx=list()
# Result table; backticked names are sanitized by data.frame() to e.g.
# "Cohort.Size", "Score2.mean.SharedEdges." — referenced that way below.
stablity_score=data.frame(Disease=character(),
                          `Cohort Size`=integer(),
                          `Score1 mean(SharedEdges)`=numeric(),
                          `Score1 sd(SharedEdges)`=numeric(),
                          `Score2 mean(SharedEdges)`=numeric(),
                          `Score2 sd(SharedEdges)`=numeric(),
                          stringsAsFactors = F )
r=0
# When TRUE, a shared edge only counts if its weights in both fold networks
# have the same sign (product > 0).
countDirection=TRUE
for (model in c("aadc", "abat", "adsl", "arg", "asld", "cit", "otc", "cob", "ga", "gamt", "msud", "mma", "pa", "zsd", "rcdp", "pku")) {
  # print(sprintf("%s...", toupper(model)))
  r=r+1
  cohorts=loadToEnv("clinical_data.RData")[["cohorts"]]
  # Upper-triangular matrices of pairwise edge counts, indexed by fold.
  score_mx[["edgeCountByPairOuter"]][[model]]=matrix(nrow=length(cohorts[[model]]),ncol=length(cohorts[[model]]))
  colnames(score_mx[["edgeCountByPairOuter"]][[model]])=rownames(score_mx[["edgeCountByPairOuter"]][[model]])=cohorts[[model]]
  score_mx[["edgeCountByPair"]][[model]]=score_mx[["edgeCountByPairOuter"]][[model]]
  # ig.outer accumulates the union of ALL fold networks for this model.
  ig.outer=loadToEnv(sprintf("graphs_GMpaper0/bg_%s_fold%d.RData", model, 1))[["ig_pruned"]]
  for (pt1 in 1:(length(cohorts[[model]])-1)) {
    ig1=loadToEnv(sprintf("graphs_GMpaper0/bg_%s_fold%d.RData", model, pt1))[["ig_pruned"]]
    ig.outer=ig.outer%u%loadToEnv(sprintf("graphs_GMpaper0/bg_%s_fold%d.RData", model, pt1+1))[["ig_pruned"]]
    for (pt2 in (pt1+1):length(cohorts[[model]]) ){
      ig2=loadToEnv(sprintf("graphs_GMpaper0/bg_%s_fold%d.RData", model, pt2))[["ig_pruned"]]
      # Shared edges between the two fold networks; weight_1/weight_2 are the
      # per-graph weight attributes the intersection operator produces.
      ig.inner=ig1 %s% ig2
      if(countDirection){ig.inner=delete_edges(ig.inner,E(ig.inner)[E(ig.inner)$weight_1*E(ig.inner)$weight_2<=0])}
      ig.pairouter=ig1 %u% ig2
      score_mx[["edgeCountByPair"]][[model]][pt1,pt2]=length(E(ig.inner))
      score_mx[["edgeCountByPairOuter"]][[model]][pt1,pt2]=length(E(ig.pairouter))
    }
  }
  # c() coerces this row to character; converted back to numeric after reload.
  stablity_score[r,]=c(model,
                       length(cohorts[[model]]),
                       mean(score_mx[["edgeCountByPair"]][[model]],na.rm = T)/length(E(ig.outer)),
                       sd(score_mx[["edgeCountByPair"]][[model]],na.rm = T)/length(E(ig.outer)),
                       mean(score_mx[["edgeCountByPair"]][[model]]/score_mx[["edgeCountByPairOuter"]][[model]],na.rm = T),
                       sd(score_mx[["edgeCountByPair"]][[model]]/score_mx[["edgeCountByPairOuter"]][[model]],na.rm = T) )
}
# Coefficient of variation of the pairwise Score2.
stablity_score$Score2.Normalized.sd=as.numeric(stablity_score$Score2.sd.SharedEdges.)/as.numeric(stablity_score$Score2.mean.SharedEdges.)
colnames(stablity_score)[colnames(stablity_score)=="Disease"]="Diagnosis"
stablity_score$Diagnosis=toupper(stablity_score$Diagnosis)
require(openxlsx)
# write.xlsx(stablity_score,file = "revisionOutput/NetworkStablity.xlsx")
#plot
require(ggplot2)
# Re-read the scores and metrics from previously exported spreadsheets.
stablity_score=read.xlsx("OP/NetworkStablity.xlsx",rowNames = F)
stablity_score[,2:6]=apply(stablity_score[,2:6],c(1,2),as.numeric)
ModelMetrics=read.xlsx("OP/ModelMetrics.xlsx",rowNames = F)
# NOTE(review): nine names are assigned here, but the ModelMetrics table built
# earlier in this file has eight columns (single AUC) — presumably
# OP/ModelMetrics.xlsx is an older nine-column export; verify before rerunning.
colnames(ModelMetrics)=c("Model","Sensitivity.top1","Specificity.top1","Accuracy.top1","AUC.top1","Sensitivity.top3","Specificity.top3","Accuracy.top3","AUC.top3")
ModelMetrics=ModelMetrics[!is.na(ModelMetrics$Model),]
# Join stability scores with performance metrics by diagnosis.
df=cbind(stablity_score[,c("Diagnosis","Cohort.Size","Score2.mean.SharedEdges.","Score2.sd.SharedEdges.","Score2.Normalized.sd")],
         ModelMetrics[match(stablity_score$Diagnosis,ModelMetrics$Model),c("Sensitivity.top1","Specificity.top1","AUC.top1","Sensitivity.top3","Specificity.top3","AUC.top3")])
df=df[order(as.integer(df$Cohort.Size)),]
# Axis labels like "ZSD (n=19)", kept in cohort-size order via factor levels.
df$xaxis=paste(paste(df$Diagnosis,df$Cohort.Size,sep = " (n="),")",sep = "")
fixed_level=df$xaxis
df$xaxis=factor(df$xaxis,levels = fixed_level)
levels(df$xaxis)=df$xaxis
#plot stablity
p = ggplot(data=df,aes_string(x="xaxis",y="Score2.mean.SharedEdges.",fill = "Cohort.Size")) +
  geom_bar(stat="identity", position=position_dodge(),color="black")+
  scale_fill_gradient(low="#ffcccc",high="darkred")+
  geom_errorbar(aes(ymin=Score2.mean.SharedEdges.-Score2.sd.SharedEdges., ymax=Score2.mean.SharedEdges.+Score2.sd.SharedEdges.), width=.2, position=position_dodge(.9)) +
  theme_minimal()+
  theme(legend.position = "none")+
  # geom_col(aes(fill = Cohort.Size)) +
  # scale_color_gradient(low = "blue", high = "red") +
  xlab("Disease")+ylab("Network Stability Score") +
  coord_flip()
ggsave(p, filename = "OP/StablityScore.pdf", device = cairo_pdf, width = 4, height = 4)
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.