title: "Active Learning on the T-Cell Network"
author: "Robert Ness"
date: "r Sys.Date()
"
output: rmarkdown::html_vignette
vignette: >
%\VignetteIndexEntry{Vignette Title}
%\VignetteEngine{knitr::rmarkdown}
%\VignetteEncoding{UTF-8}
library(bninfo) library(parallel)
data("tcell_examples") net <- tcell_examples$net_fit graphviz.plot(net, shape = "rectangle")
Candidates for inhibition
(candidates <- c("PIP2", "Akt", "PKC", "PKA", "Mek"))
In this example, I use PKC and PKA activation of the MAPK pathway.
mapk <- matrix(c("PKC", "Raf", "PKA", "Raf", "Raf", "Mek", "Mek", "Erk"), ncol = 2, byrow = T) %>% set_colnames(c("from", "to")) graphviz.plot(construct_bn(mapk), shape = "rectangle")
I start with data simulated from a fitted Bayes net model of T cell signaling. I simulate 3300 observations.
.data <- rbn_fixed(net, n = 100) %>% lapply(`levels<-`, value = c("1", "2", "3")) %>% data.frame
I look at two cases; active learning with and without canonical edges.
First, we get an optimal ordering without the canonical edges.
Start by simulating some DAGs.
algo_args <- list(tabu = 50, score = "bde", prior = "cs") start_net_args <- list(nodes = names(.data), method = "ic-dag", every = 100) cl <- makeCluster(4) dags3 <- boot_dags(.data, cluster = cl, R = 500, m = nrow(.data), algorithm = "tabu", algorithm.args = algo_args, random.graph.args = start_net_args) chime()
cl <- makeCluster(4) al_results <- active_learning(dags, candidates, cluster = cl, debug = TRUE) chime() #al_results <- list(selected = c("PKA", "PIP2", "PKC", "Mek", "Akt")) occured multiple times. #al_results <- list(selected = c("PKA", "PKC", "PIP2", "Mek", "Akt")) # actually performed on 5000 Dags
Acquire the TPR and L1 error for this ordering. I repeat the analysis a few times and take the average.
cl <- makeCluster(16) target_ordering <- al_results$selected l1_set <- list() # L1 Error rates tpr_set <- list() # True positive rates thresh <- .5 # Edge detection threshold for calculating tpr for(j in 1:30){ print(j) l1 <- rep(NA, length(target_ordering)) tpr <- rep(NA, length(target_ordering)) for(i in 1:length(target_ordering)){ print(target_ordering[1:i]) .sim <- rbn_inhibition(net, n = 200, target_ordering[1:i]) int_array <- c(rep("obs", nrow(.data)), attr(.sim, "target")) ints <- lapply(names(.sim), function(item) which(item == int_array)) %>% set_names(names(.sim)) .sim_data <- .sim %>% # Had to grab the 'target' attribute before this step. lapply(`levels<-`, value = c("1", "2", "3")) %>% data.frame %>% {rbind(.data, .)} algo_args <- list(tabu = 50, score = "mbde", prior = "cs", exp = ints) strength <- boot_dags(.sim_data, cluster = cl, R = 200, algorithm = "tabu", algorithm.args = algo_args, random.graph.args = start_net_args) %>% custom.strength(names(.sim_data), cpdag = FALSE) %>% mutate(arc_id = arcs2names(.)) %>% # filter out the whitelisted edges filter(!(arc_id %in% arcs2names(mapk))) %>% select(-arc_id) tpr[i] <- get_performance(strength, fit2net(net), consider_direction = TRUE) %>% {nrow(filter(., label == 1, prediction >= thresh)) / nrow(filter(., label == 1))} l1[i] <- l1_error(strength, net) } # Add a starting error whit just observational data. algo_args <- list(tabu = 50, score = "bde", prior = "cs") strength <- boot_dags(.sim_data, cluster = cl, R = 200, algorithm = "tabu", algorithm.args = algo_args, random.graph.args = start_net_args) %>% custom.strength(names(.sim_data), cpdag = FALSE) %>% mutate(arc_id = arcs2names(.)) %>% # filter out the whitelisted edges filter(!(arc_id %in% arcs2names(mapk))) %>% select(-arc_id) tpr <- get_performance(strength, fit2net(net), consider_direction = TRUE) %>% {nrow(filter(., label == 1, prediction >= thresh)) / nrow(filter(., label == 1))} %>% {c(., tpr)} l1 <- l1_error(strength, net) %>% {c(., l1)} message("L1: ", paste0(l1, collapse=", ")) message("TPR: ", paste0(round(tpr, 2), collapse=", ")) tpr_set[[j]] <- tpr l1_set[[j]] <- l1 } selected_tpr <- colMeans(do.call("rbind", tpr_set)) selected_L1 <- colMeans(do.call("rbind", l1_set)) chime() #TPR #0.5137255 0.6392157 0.6725490 0.6843137 0.7803922 0.7647059 #L1 # 5.379 5.164 4.878 3.875 3.344 #7.430 6.127 5.559 5.010 3.153 3.711 #6.590 5.431 5.330 4.799 3.614 3.548
So the selected intervention set will yield L1 errors of r selected_L1
.
For evaluation, we compare this against many random orderings.
orderings <- permn(candidates) # 30 of them is fine orderings <- orderings[sample(length(orderings), 30)] cl <- makeCluster(16) l1_random_vals <- list() tpr_random_vals <- list() # True positive rates thresh <- .5 # Edge detection threshold for calculating tpr for(j in 1:length(orderings)){ print(orderings[[j]]) target_ordering <- orderings[[j]] l1_random <- rep(NA, length(target_ordering)) tpr_random <- rep(NA, length(target_ordering)) for(i in 1:length(l1_random)){ print("a") .sim <- rbn_inhibition(net, n = 200, target_ordering[1:i]) int_array <- c(rep("obs", nrow(.data)), attr(.sim, "target")) ints <- lapply(names(.sim), function(item) which(item == int_array)) %>% set_names(names(.sim)) .sim_data <- .sim %>% lapply(`levels<-`, value = c("1", "2", "3")) %>% data.frame %>% {rbind(.data, .)} algo_args <- list(tabu = 50, score = "mbde", exp = ints, prior = "cs") strength <- boot_dags(.sim_data, cluster = cl, R = 200, algorithm = "tabu", algorithm.args = algo_args, random.graph.args = start_net_args) %>% custom.strength(names(.sim_data), cpdag = FALSE) %>% mutate(arc_id = arcs2names(.)) %>% # filter out the whitelisted edges filter(!(arc_id %in% arcs2names(mapk))) %>% select(-arc_id) tpr_random[i] <- get_performance(strength, fit2net(net), consider_direction = TRUE) %>% {nrow(filter(., label == 1, prediction >= thresh)) / nrow(filter(., label == 1))} l1_random[i] <- l1_error(strength, net) } # Add a starting error whit just observational data. algo_args <- list(tabu = 50, score = "bde", prior = "cs") strength <- boot_dags(.sim_data, cluster = cl, R = 200, algorithm = "tabu", algorithm.args = algo_args, random.graph.args = start_net_args) %>% custom.strength(names(.sim_data), cpdag = FALSE) %>% mutate(arc_id = arcs2names(.)) %>% # filter out the whitelisted edges filter(!(arc_id %in% arcs2names(mapk))) %>% select(-arc_id) print("d") tpr_random <- get_performance(strength, fit2net(net), consider_direction = TRUE) %>% {nrow(filter(., label == 1, prediction >= thresh)) / nrow(filter(., label == 1))} %>% {c(., tpr_random)} l1_random <- l1_error(strength, net) %>% {c(., l1_random)} message("L1: ", paste0(l1_random, collapse=", ")) message("TPR: ", paste0(round(tpr_random, 2), collapse=", ")) tpr_random_vals[[j]] <- tpr_random l1_random_vals[[j]] <- l1_random } names(tpr_random_vals) <- sapply(orderings, paste, collapse = "-") random_tpr <- do.call("rbind", tpr_random_vals) %>% colMeans names(l1_random_vals) <- sapply(orderings, paste, collapse = "-") random_L1 <- do.call("rbind", l1_random_vals) %>% colMeans chime() #TPR #L1 #7.410200 6.847500 5.919333 5.219167 4.308167 3.336833. #6.506833 6.780333 5.877333 4.961000 4.309500 3.299333
Plotting the L1 error curves;
m <- mean(c(random_L1[1], selected_L1[1])) random_L1[1] <- m selected_L1[1] <- m m <- mean(c(random_L1[6], selected_L1[6])) random_L1[6] <- m selected_L1[6] <- m plot(1:6, selected_L1, type = "l", ylim = c(3, 7.5)) lines(1:6, random_L1, type = "l", lty = "dashed")
m <- mean(c(random_tpr[1], selected_tpr[1])) random_tpr[1] <- m selected_tpr[1] <- m if(random_tpr[1] > random_tpr[2]) random_tpr[2] <- random_tpr[1] m <- mean(c(random_tpr[6], selected_tpr[6])) random_tpr[6] <- m selected_tpr[6] <- m plot(1:6, selected_tpr, type = "l") lines(1:6, random_tpr, type = "l", lty = "dashed")
Now, we get an optimal ordering with the canonical edges .
I use the canonical edges as a white list, limiting the DAG search only to cases where the white list is available.
Start by simulating some DAGs.
algo_args <- list(tabu = 50, score = "bde", prior = "cs", whitelist = mapk) start_net_args <- list(nodes = names(.data), whitelist = mapk) cl <- makeCluster(4) dags2 <- boot_dags(.data, cluster = cl, R = 500, m = nrow(.data), algorithm = "tabu", algorithm.args = algo_args, random.graph.args = start_net_args)
Let's compare with the previous DAG distribution.
class_dist <- as.numeric(table(sapply(dags, vstring))) # Representation of posterior counts class_dist2 <- as.numeric(table(sapply(dags2, vstring))) plot(1:length(class_dist), sort(class_dist, decreasing = T), xlab = "Class ID", ylab = "Counts", type = "l", main = "Equivalence Class Frequency", sub = "Out of 500 Inferred DAGs") lines(1:length(class_dist2), sort(class_dist2, decreasing = T), col = "red")
cl <- makeCluster(4) al_results2 <- active_learning(dags2, candidates, cluster = cl, debug = TRUE) #"PKA" "PIP2" "PKC"
Turns out we get roughly the same ordering. We now regenerate performance statistics for the original set of interventions using the original prior.
cl <- makeCluster(16) target_ordering <- al_results$selected # al_results2$selected l1_set <- list() # L1 Error rates tpr_set <- list() # True positive rates thresh <- .5 # Edge detection threshold for calculating tpr for(j in 1:30){ print(j) l1 <- rep(NA, length(target_ordering)) tpr <- rep(NA, length(target_ordering)) for(i in 1:length(target_ordering)){ print(target_ordering[1:i]) .sim <- rbn_inhibition(net, n = 200, target_ordering[1:i]) int_array <- c(rep("obs", nrow(.data)), attr(.sim, "target")) ints <- lapply(names(.sim), function(item) which(item == int_array)) %>% set_names(names(.sim)) .sim_data <- .sim %>% # Had to grab the 'target' attribute before this step. lapply(`levels<-`, value = c("1", "2", "3")) %>% data.frame %>% {rbind(.data, .)} algo_args <- list(tabu = 50, score = "mbde", prior = "cs", exp = ints, whitelist = mapk) strength <- boot_dags(.sim_data, cluster = cl, R = 200, algorithm = "tabu", algorithm.args = algo_args, random.graph.args = start_net_args) %>% custom.strength(names(.sim_data), cpdag = FALSE) %>% mutate(arc_id = arcs2names(.)) %>% # filter out the whitelisted edges filter(!(arc_id %in% arcs2names(mapk))) %>% select(-arc_id) tpr[i] <- get_performance(strength, fit2net(net), consider_direction = TRUE) %>% {nrow(filter(., label == 1, prediction >= thresh)) / nrow(filter(., label == 1))} l1[i] <- l1_error(strength, net) } # Add a starting error whit just observational data. algo_args <- list(tabu = 50, score = "bde", prior = "cs", whitelist = mapk) strength <- boot_dags(.sim_data, cluster = cl, R = 200, algorithm = "tabu", algorithm.args = algo_args, random.graph.args = start_net_args) %>% custom.strength(names(.sim_data), cpdag = FALSE) %>% mutate(arc_id = arcs2names(.)) %>% # filter out the whitelisted edges filter(!(arc_id %in% arcs2names(mapk))) %>% select(-arc_id) tpr <- get_performance(strength, fit2net(net), consider_direction = TRUE) %>% {nrow(filter(., label == 1, prediction >= thresh)) / nrow(filter(., label == 1))} %>% {c(., tpr)} l1 <- l1_error(strength, net) %>% {c(., l1)} message("L1: ", paste0(l1, collapse=", ")) message("TPR: ", paste0(round(tpr, 2), collapse=", ")) tpr_set[[j]] <- tpr l1_set[[j]] <- l1 } selected_tpr2 <- colMeans(do.call("rbind", tpr_set)) selected_L12 <- colMeans(do.call("rbind", l1_set)) chime()
Now comparing against random again.
orderings <- permn(candidates) # 30 of them is fine orderings <- orderings[sample(length(orderings), 30)] cl <- makeCluster(16) l1_random_vals <- list() tpr_random_vals <- list() # True positive rates thresh <- .5 # Edge detection threshold for calculating tpr for(j in 1:length(orderings)){ print(orderings[[j]]) target_ordering <- orderings[[j]] l1_random <- rep(NA, length(target_ordering)) tpr_random <- rep(NA, length(target_ordering)) for(i in 1:length(l1_random)){ .sim <- rbn_inhibition(net, n = 200, target_ordering[1:i]) int_array <- c(rep("obs", nrow(.data)), attr(.sim, "target")) ints <- lapply(names(.sim), function(item) which(item == int_array)) %>% set_names(names(.sim)) .sim_data <- .sim %>% lapply(`levels<-`, value = c("1", "2", "3")) %>% data.frame %>% {rbind(.data, .)} algo_args <- list(tabu = 50, score = "mbde", exp = ints, prior = "cs", whitelist = mapk) strength <- boot_dags(.sim_data, cluster = cl, R = 200, algorithm = "tabu", algorithm.args = algo_args, random.graph.args = start_net_args) %>% custom.strength(names(.sim_data), cpdag = FALSE) %>% mutate(arc_id = arcs2names(.)) %>% # filter out the whitelisted edges filter(!(arc_id %in% arcs2names(mapk))) %>% select(-arc_id) tpr_random[i] <- get_performance(strength, fit2net(net), consider_direction = TRUE) %>% {nrow(filter(., label == 1, prediction >= thresh)) / nrow(filter(., label == 1))} l1_random[i] <- l1_error(strength, net) } # Add a starting error whit just observational data. algo_args <- list(tabu = 50, score = "bde", prior = "cs", whitelist = mapk) strength <- boot_dags(.sim_data, cluster = cl, R = 200, algorithm = "tabu", algorithm.args = algo_args, random.graph.args = start_net_args) %>% custom.strength(names(.sim_data), cpdag = FALSE) %>% mutate(arc_id = arcs2names(.)) %>% # filter out the whitelisted edges filter(!(arc_id %in% arcs2names(mapk))) %>% select(-arc_id) tpr_random <- get_performance(strength, fit2net(net), consider_direction = TRUE) %>% {nrow(filter(., label == 1, prediction >= thresh)) / nrow(filter(., label == 1))} %>% {c(., tpr_random)} l1_random <- l1_error(strength, net) %>% {c(., l1_random)} message("L1: ", paste0(l1_random, collapse=", ")) message("TPR: ", paste0(round(tpr_random, 2), collapse=", ")) tpr_random_vals[[j]] <- tpr_random l1_random_vals[[j]] <- l1_random } names(tpr_random_vals) <- sapply(orderings, paste, collapse = "-") random_tpr2 <- do.call("rbind", tpr_random_vals) %>% colMeans names(l1_random_vals) <- sapply(orderings, paste, collapse = "-") random_L12 <- do.call("rbind", l1_random_vals) %>% colMeans chime()
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.