knitr::opts_chunk$set( collapse = TRUE, comment = "#>", eval = FALSE )
The following Python script downloads the ogbl-biokg
(i.e., BioKG) knowledge graph from the Open Graph Benchmark (OGB).
The ogbl-biokg
dataset is described by OGB as follows:
"The
ogbl-biokg
dataset is a Knowledge Graph (KG), which we created using data from a large number of biomedical data repositories. It contains 5 types of entities: diseases (10,687 nodes), proteins (17,499), drugs (10,533 nodes), side effects (9,969 nodes), and protein functions (45,085 nodes). There are 51 types of directed relations connecting two types of entities, including 38 kinds of drug-drug interactions, 8 kinds of protein-protein interaction, as well as drug-protein, drug-side effect, function-function relations. All relations are modeled as directed edges, among which the relations connecting the same entity types (e.g., protein-protein, drug-drug, function-function) are always symmetric, i.e., the edges are bi-directional.This dataset is relevant to both biomedical and fundamental ML research. On the biomedical side, the dataset allows us to get better insights into human biology and generate predictions that can guide downstream biomedical research. On the fundamental ML side, the dataset presents challenges in handling a noisy, incomplete KG with possible contradictory observations. This is because the
ogbl-biokg
dataset involves heterogeneous interactions that span from the molecular scale (e.g., protein-protein interactions within a cell) to whole populations (e.g., reports of unwanted side effects experienced by patients in a particular country). Further, triplets in the KG come from sources with a variety of confidence levels, including experimental readouts, human-curated annotations, and automatically extracted metadata."
```{python download-kg} import numpy as np import copy import json from ogb.linkproppred import LinkPropPredDataset
dataset = LinkPropPredDataset(name = "ogbl-biokg")
split_edge = dataset.get_edge_split() train_edge, valid_edge, test_edge = split_edge["train"], split_edge["valid"], split_edge["test"] graph = dataset[0] # graph: library-agnostic graph object
The edge index in the `graph` object is converted into JSON format to be read into R. ```{python edge-types} print(graph.keys()) edge_index = graph["edge_index_dict"].copy()
To make the conversion, the dictionary keys must be renamed (i.e., cannot be tuples).
```{python rename-key} old_keys = list(edge_index.keys()) for old_name in old_keys: new_name = "--".join(old_name) edge_index[new_name] = edge_index[old_name] del edge_index[old_name]
A special numpy encoder is defined, borrowed from [this StackOverflow post](https://stackoverflow.com/questions/57269741/typeerror-object-of-type-ndarray-is-not-json-serializable). ```{python numpy-encoder} class NumpyEncoder(json.JSONEncoder): """ Special json encoder for numpy types """ def default(self, obj): if isinstance(obj, (np.int_, np.intc, np.intp, np.int8, np.int16, np.int32, np.int64, np.uint8, np.uint16, np.uint32, np.uint64)): return int(obj) elif isinstance(obj, (np.float_, np.float16, np.float32, np.float64)): return float(obj) elif isinstance(obj, (np.ndarray,)): return obj.tolist() return json.JSONEncoder.default(self, obj)
Finally, we convert the edge index to JSON and write to a file.
```{python convert-json} edge_index_json = json.dumps(edge_index, cls = NumpyEncoder)
with open('inst/extdata/edge_index.json', 'a') as f: f.write(edge_index_json + '\n')
# Parse KG in R Read the JSON dataset saved previously. ```r # load libraries library(data.table) library(purrr) library(magrittr) library(rjson) # load metapaths library library(metapaths) # read data print(getwd()) biokg = fromJSON(file = "inst/extdata/edge_index.json")
Read mappings downloaded from OGB.
mapping_dir = "inst/extdata/biokg_mappings" read_mapping = function(mapping_path) { mapping = fread(file.path(mapping_dir, mapping_path)) mapping[, Type := strsplit(mapping_path, "_")[[1]][1]] return(mapping) } mappings = list.files(mapping_dir) %>% .[grepl("entidx2name", .)] %>% map_dfr(read_mapping) mappings = mappings %>% copy() %>% setnames(c("ent idx", "ent name"), c("Index", "Name")) %>% setcolorder("Type") %>% .[, Label := paste(Type, Index, sep = "_")]
Create a function to convert the edge list to a data.table
. Note that the node IDs are specific to each type, so we must add a type-specific prefix.
convert_biokg = function(sub_kg, sub_label) { # split label split_label = strsplit(sub_label, "--")[[1]] origin_label = paste(split_label[1], sub_kg[[1]], sep = "_") destination_label = paste(split_label[3], sub_kg[[2]], sep = "_") # create data table kg_dt = data.table( Origin = origin_label, Destination = destination_label, OriginType = split_label[1], DestinationType = split_label[3], EdgeType = split_label[2]) }
Now, map the conversion function over the biokg
list.
biokg_edge_list = imap_dfr(biokg, convert_biokg) biokg_node_list = get_node_list(biokg_edge_list) head(biokg_edge_list)
Check that the counts of each node type conform with graph["num_nodes_dict"]
from the Python script.
| disease | drug | function | protein | sideeffect | |:-------:|:-----:|:--------:|:-------:|:----------:| | 10687 | 10533 | 45085 | 17499 | 9969 |
biokg_node_list[, .N, by = NodeType]
Add node names from mappings
table.
# add node names biokg_node_list = merge(biokg_node_list, mappings[, .(Label, Name)], by.x = "Node", by.y = "Label", sort = F) %>% setnames("Name", "NodeName")
Sample the knowledge graph to generate a small, connected test set.
# load igraph library library(igraph) # generate graph biokg_graph = igraph::graph.data.frame(biokg_edge_list, vertices = biokg_node_list, directed = T) # sample random nodes set.seed(42) seed_nodes = sample(biokg_node_list$Node, 5000) # generate induced subgraph biokg_sample = igraph::induced.subgraph(graph = biokg_graph, vids = seed_nodes) # get largest connected component comp = igraph::components(biokg_sample) comp_idx = which.max(comp$csize) lcc = comp$membership %>% {names(.)[. == comp_idx]} # generate connected subgraph biokg_sub = igraph::induced.subgraph(graph = biokg_graph, vids = lcc) # generate subsamples node and edge lists sub_edge_list = biokg_edge_list[Origin %in% lcc & Destination %in% lcc] sub_node_list = biokg_node_list[Node %in% lcc] # convenience function lookup = function(node) { # return(sub_node_list[Node == node, NodeName]) return(biokg_node_list[Node == node, NodeName]) }
Save node list and edge list to file.
save(sub_edge_list, file = "data/sub_edge_list.rda") save(sub_node_list, file = "data/sub_node_list.rda")
Check possible edge types.
message("- ", paste(unique(sub_node_list[, NodeType]), collapse = "\n- "))
Find hub genes.
# calculate degree sub_degrees = degree(biokg_sub) # find candidate nodes candidate_nodes = sub_degrees %>% .[which(. > 20 & . < 30)] %>% names() %>% .[grepl("protein", .)] %>% { sub_node_list[Node %in% ., NodeName] } cat(paste("HGNC:", candidate_nodes, sep = ""), sep = "\n")
Test similarity between an Alzheimer's disease (AD)-related drug (donepezil) and an AD-related pathway ("regulation of amyloid fibril formation").
total_time = 0 for (i in 1:100) { start_time = Sys.time() sim_test = get_similarity( "drug_762", "function_42893", mp = c("drug", "disease", "protein", "function"), node_list = biokg_node_list, edge_list = biokg_edge_list, verbose = F) end_time = Sys.time() time_required = end_time - start_time total_time = total_time + time_required } average_time = total_time/100 message("Computed in ", as.character(average_time), " seconds.") sim_res = sim_test$OriginPaths %>% .[.[["function"]] == "function_36342", ]
Visualize identified metapaths in the KG.
sim_nodes = unique(unlist(sim_res)) # generate connected subgraph sim_graph = igraph::induced.subgraph(graph = biokg_graph, vids = sim_nodes) # plot subgraph plot(sim_graph, vertex.size = 15, edge.arrow.size = 0.25)
As a negative control, randomly sample 100 pathways (i.e., nodes of type "function"
likely not AD-related) and compute the similarity.
set.seed(42) # sample 100 pathways random_neg_pathways = biokg_node_list[NodeType == "function", ] %>% .[sample(1:nrow(.), 100), ] neg_tests = list() neg_sims = data.table() # for each pathway, compute similarity total_time = 0 for (i in 1:100) { message("Pathway #", i, ": ", random_neg_pathways[i, NodeName]) start_time = Sys.time() sim_neg_test = get_similarity( "drug_762", random_neg_pathways[i, Node], # "function_29825", mp = c("drug", "disease", "protein", "function"), metric = c("pc", "npc", "dwpc"), node_list = biokg_node_list, edge_list = biokg_edge_list, verbose = F) end_time = Sys.time() time_required = end_time - start_time total_time = total_time + time_required neg_tests = append(neg_tests, sim_neg_test) neg_sims = rbind(neg_sims, sim_neg_test$Similarity[, Pathway := random_neg_pathways[i, NodeName]]) } average_time = total_time/100 message("Computed in ", as.character(average_time), " seconds.") # calculate statistics (mean, median, SD, etc.) neg_sims[, sum(Similarity == 0), by = "Metric"] neg_sims[, mean(Similarity), by = "Metric"] neg_sims[, median(Similarity), by = "Metric"] neg_sims[, sd(Similarity), by = "Metric"] # saveRDS(neg_tests, "<insert file path>.RDS") # fwrite(neg_sims, "<insert file path>.csv")
Again, test set-to-set similarity by computing similarity scores between three AD-related drugs (i.e., donepezil, memantine, and galantamine) and a set of six AD-related pathways.
# set of three AD drugs alzheimer_drugs = c("drug_762", "drug_1075", "drug_895") # set of six AD-related pathways alzheimer_pathways = c("function_36342", "function_26258", "function_21333", "function_42901", "function_17601", "function_36167") # compare sets alzheimer_test = compare_sets( alzheimer_drugs, alzheimer_pathways, mp = c("drug", "disease", "protein", "function"), method = c("minimum", "maximum"), node_list = biokg_node_list, edge_list = biokg_edge_list)
As a negative control, test set-to-set similarity by computing similarity scores between the same three AD-related drugs and a set of six randomly subset pathways.
set.seed(1234) # selecting different seed # randomly subset pathways random_pathways = biokg_node_list[NodeType == "function", ] %>% .[sample(1:nrow(.), 6), ] # compare sets random_test = compare_sets( alzheimer_drugs, random_pathways$Node, mp = c("drug", "disease", "protein", "function"), method = c("minimum", "maximum"), node_list = biokg_node_list, edge_list = biokg_edge_list)
Example utility function to generate random meta paths of specified lengths; variants are used below.
set.seed(42) # generate random metapath of specified length random_mp = function(origin_type, mp_length, edge_types, verbose = T) { # construct random meta path starting at origin gen_mp = c(origin_type) if(verbose) { cat(origin_type) } # construct meta path for (k in 1:(mp_length - 1)) { possible_types = edge_types[OriginType == tail(gen_mp, 1), unique(DestinationType)] if(length(possible_types) == 0) { return(random_mp(origin_type, mp_length, edge_types, verbose)) } next_type = sample(possible_types, 1) if(verbose) { cat(" >", next_type) } gen_mp = append(gen_mp, next_type) } if(verbose) { cat("\n") } # return meta path return(gen_mp) }
Randomly subsample BioKG by starting at a randomly sampled node and adding all neighbors (irrespective of edge direction) until the desired graph size is reached.
# generate connnected BioKG subgraph of specific size subsample_biokg = function(sample_size, origin_types = NULL, verbose = F) { # sample seed node if(!is.null(origin_types)) { subgraph_nodes = sample(biokg_node_list[NodeType %in% origin_types, Node], 1) } else { subgraph_nodes = sample(biokg_node_list[, Node], 1) } subgraph_size = 1 if(verbose) { message("Subgraph Size: ", subgraph_size) } sub_index = 1 # augment subgraph until subgraph is as large as desired while (subgraph_size < sample_size) { # augment subgraph by appending neighbors of i-th node in subgraph # take neighbors irrespective of destination new_neighbors = biokg_edge_list %>% .[Origin == subgraph_nodes[sub_index] | Destination == subgraph_nodes[sub_index]] %>% .[, c(Origin, Destination)] # append nodes to subgraph subgraph_nodes = c(subgraph_nodes, new_neighbors) %>% unique() subgraph_size = length(subgraph_nodes) if(verbose) { message("Subgraph Size: ", subgraph_size) } # increment index sub_index = sub_index + 1 } # subset nodes to size subgraph_nodes = subgraph_nodes[1:sample_size] subgraph_size = length(subgraph_nodes) if(verbose) { message("Final Subgraph Size: ", subgraph_size) } # generate node and edge lists of subgraph sub_edge_list = biokg_edge_list[Origin %in% subgraph_nodes & Destination %in% subgraph_nodes] sub_node_list = biokg_node_list[Node %in% subgraph_nodes] # return node and edge list return(list(Nodes = sub_node_list, Edges = sub_edge_list)) }
Write function to evaluate how execution time of the get_similarity()
function scales with graph size. Generate n_graphs
number of random subgraphs of size graph_size
; for each subgraph, run n_trials
trials (where, in each trial, a random metapath of length mp_length
is evaluated.
# run experiments on graph of given size test_size = function(graph_size, mp_length = 3, n_graphs = 10, n_trials = 10, origin_types = NULL, verbose = F) { # test with the parameters: # graph_size = 500 # mp_length = 3 # n_graphs = 10 # n_trials = 10 # define overall averages graph_total_time = 0 graph_average_times = rep(NA, times = n_graphs) graph_trial_times = list() # time trial for (graph_number in 1:n_graphs) { # print graph number if(verbose) { message("\nGraph Number: ", graph_number) } # subsample graph random_graph = subsample_biokg(graph_size, origin_types = origin_types, verbose = verbose) sample_node_list = random_graph$Nodes sample_edge_list = random_graph$Edges # generate possible edge types sample_edge_types = sample_edge_list[, .(OriginType, DestinationType)] %>% unique() # remove terminal types sample_edge_types = sample_edge_types[DestinationType %in% OriginType] if(is.null(origin_types)) { origin_types = sample_edge_types$OriginType } else { origin_types = origin_types[origin_types %in% sample_edge_types$OriginType] } if(verbose) { message("-- Origin Types: ", origin_types) # print(sample_edge_types) } # total time variable total_time = 0 trial_times = rep(NA, times = n_trials) # time trial for (trial_number in 1:n_trials) { # sample random origin node random_origin = sample_edge_list[OriginType %in% origin_types, Origin] %>% unique() %>% sample(1) random_origin_type = sample_node_list[Node == random_origin, NodeType] # print trial number if(verbose) { message("-- Trial Number: ", trial_number, " (", random_origin_type, ")") } # construct random meta path starting at origin test_mp = random_mp(random_origin_type, mp_length, sample_edge_types, verbose = verbose) # sample random destination node random_destination = sample_node_list %>% .[NodeType == tail(test_mp, 1)] %>% .[Node == sample(Node, 1), Node] # time similarity search start_time = Sys.time() sim_result = get_similarity( random_origin, random_destination, mp = test_mp, metric = c("dwpc"), node_list = sample_node_list, edge_list = sample_edge_list, verbose = F) end_time = Sys.time() # compute time required time_required = end_time - start_time trial_times[trial_number] = time_required total_time = total_time + time_required } # compute average time average_time = total_time / n_trials # append to graph list graph_total_time = graph_total_time + average_time graph_average_times[graph_number] = average_time graph_trial_times = append(graph_trial_times, list(trial_times)) } # generate return object graph_average_time = graph_total_time / n_graphs names(graph_trial_times) = paste("Graph", 1:n_graphs) return(list(Average = graph_average_time, `Graph Averages` = graph_average_times, Trials = graph_trial_times)) }
Test on randomly generated graphs of various sizes.
# map over sizes # size_list = c(128, 256, 512, 1024, 2048, 4096, 8192, 16384, 32768, 65536) size_list = c(128, 256, 512, 1024, 2048, 4096, 8192, 16384) time_test = map(size_list, ~test_size(.x, mp_length = 3, n_graphs = 10, n_trials = 10, origin_types = c("disease", "protein", "function"), verbose = T))
Plot execution time by graph size.
library(ggplot2) # parse times time_vals = data.table( Size = rep(size_list, each = 10), Time = unlist(map(time_test, ~.x$`Graph Averages`))) # generate statistics time_stats = time_vals[, .(Mean = mean(Time), SD = sd(Time)), by = Size] %>% .[, Upper := Mean + SD] %>% .[, Lower := Mean - SD] # generate plot p = ggplot(time_stats, aes(x = Size, y = Mean, fill = log2(Size))) + geom_bar(stat = "identity", color = "black") + geom_errorbar(data = time_stats, aes(x = Size, ymin = Lower, ymax = Upper), width = 0.2, inherit.aes = FALSE) + scale_fill_gradientn(colors = c("#FFDAB9", "#FBC4AB", "#F8AD9D", "#F4978E", "#F08080")) + labs(x = "Graph Size", y = "Mean Running Time (seconds)") + scale_x_continuous(trans = "log2", n.breaks = length(size_list)) + scale_y_continuous(expand = expansion(mult = c(0, 0.06))) + theme_bw() + theme( axis.title = element_text(face = "bold", size = 14), axis.text = element_text(size = 12), legend.position = "None" ) # save plot ggsave(filename = "../Figures/size_runtime.pdf", p, width = 8, height = 5, units = "in") ggsave(filename = "../Figures/size_runtime.svg", p, width = 8, height = 5, units = "in") ggsave(filename = "../Figures/size_runtime.png", p, width = 8, height = 5, units = "in")
Test how execution time scales with meta path length.
# get all node types node_types = unique(biokg_node_list$NodeType) # list possible edge types from unique(biokg_edge_list$EdgeType) # excluding "drug-sideeffect" since all sideeffect nodes are terminal # excluding "protein-function" and "function-function" to prevent uninformative "function" x n metapaths edge_types = c("disease-protein", "drug-disease", "drug-drug", "drug-protein", "protein-protein", "protein-function", "function-function") edge_types = data.table(Origin = map_chr(strsplit(edge_types, "-"), ~.x[1]), Destination = map_chr(strsplit(edge_types, "-"), ~.x[2])) # list origin types origin_types = unique(edge_types$Origin) # list destination types destination_types = unique(edge_types$Destination) # define random performance evaluation function # meta path length must be at least 2 random_test = function(mp_length) { # sample random origin node random_origin = biokg_node_list[NodeType %in% origin_types][Node == sample(Node, 1), ] # construct random meta path starting at origin random_mp = c(random_origin$NodeType) for (k in 1:(mp_length - 1)) { next_type = edge_types[Origin == tail(random_mp, 1), Destination] %>% sample(1) random_mp = append(random_mp, next_type) } # sample random destination node random_destination = biokg_node_list %>% .[NodeType == tail(random_mp, 1)] %>% .[Node == sample(Node, 1), ] # time similarity search start_time = Sys.time() sim_result = get_similarity( random_origin$Node, random_destination$Node, mp = random_mp, metric = c("dwpc"), node_list = biokg_node_list, edge_list = biokg_edge_list, verbose = T) end_time = Sys.time() # compute time required time_required = end_time - start_time # construct return object out = list( Origin = random_origin, Destination = random_destination, MP = random_mp, Time = time_required ) return(out) } # test meta paths with lengths 2-4 length_2 = map(1:50, ~random_test(2)) length_3 = map(1:50, ~random_test(3)) length_4 = map(1:50, ~random_test(4)) all_times = c(length_2, length_3, length_4) # get times times_2 = map_dbl(length_2, ~.x$Time) times_3 = map_dbl(length_3, ~.x$Time) times_4 = map_dbl(length_4, ~.x$Time)
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.