title: "Active Learning on the T-Cell Network" author: "Robert Ness" date: "r Sys.Date()" output: rmarkdown::html_vignette vignette: > %\VignetteIndexEntry{Vignette Title} %\VignetteEngine{knitr::rmarkdown} %\VignetteEncoding{UTF-8}


library(bninfo)
library(parallel)

Ground truth network.

data("tcell_examples")
net <- tcell_examples$net_fit 
graphviz.plot(net, shape = "rectangle")

Candidates for inhibition

(candidates <- c("PIP2", "Akt", "PKC", "PKA", "Mek"))

Prior information regarding the network structure

In this example, I use PKC and PKA activation of the MAPK pathway.

mapk <- matrix(c("PKC", "Raf",
                 "PKA", "Raf",
                 "Raf", "Mek",
                 "Mek", "Erk"), ncol = 2, byrow = T) %>%
  set_colnames(c("from", "to"))
graphviz.plot(construct_bn(mapk), shape = "rectangle")

Historic data

I start with data simulated from a fitted Bayes net model of T cell signaling. I simulate 3300 observations.

.data <- rbn_fixed(net, n = 100) %>%
  lapply(`levels<-`, value = c("1", "2", "3")) %>%
  data.frame

Naive case: passive selection with uninformative prior

I look at two cases; active learning with and without canonical edges.

First, we get an optimal ordering without the canonical edges.

Start by simulating some DAGs.

algo_args <- list(tabu = 50, score = "bde", prior = "cs")
start_net_args <- list(nodes = names(.data), method = "ic-dag", every = 100)
cl <- makeCluster(4)
dags3 <- boot_dags(.data, cluster = cl, R = 500, m = nrow(.data), algorithm = "tabu",
                  algorithm.args = algo_args, random.graph.args = start_net_args)
chime()
cl <- makeCluster(4)
al_results <- active_learning(dags, candidates, cluster = cl, debug = TRUE)
chime()
#al_results <- list(selected = c("PKA",  "PIP2", "PKC",  "Mek",  "Akt")) occured multiple times.
#al_results <- list(selected = c("PKA", "PKC", "PIP2", "Mek", "Akt")) # actually performed on 5000 Dags

Acquire the TPR and L1 error for this ordering. I repeat the analysis a few times and take the average.

cl <- makeCluster(16)
target_ordering <- al_results$selected
l1_set <- list() # L1 Error rates
tpr_set <- list() # True positive rates
thresh <- .5 # Edge detection threshold for calculating tpr
for(j in 1:30){
  print(j)
  l1 <- rep(NA, length(target_ordering))
  tpr <- rep(NA, length(target_ordering))
  for(i in 1:length(target_ordering)){
    print(target_ordering[1:i])
    .sim <- rbn_inhibition(net, n = 200, target_ordering[1:i])
    int_array <- c(rep("obs", nrow(.data)), attr(.sim, "target"))
    ints <- lapply(names(.sim), function(item) which(item == int_array)) %>%
      set_names(names(.sim))
    .sim_data <- .sim %>% # Had to grab the 'target' attribute before this step.
      lapply(`levels<-`, value = c("1", "2", "3")) %>%
      data.frame %>%
      {rbind(.data, .)}
    algo_args <- list(tabu = 50, score = "mbde", prior = "cs", exp = ints)
    strength <- boot_dags(.sim_data, cluster = cl, R = 200,
                         algorithm = "tabu", algorithm.args = algo_args,
                        random.graph.args = start_net_args) %>%
      custom.strength(names(.sim_data), cpdag = FALSE) %>% 
      mutate(arc_id = arcs2names(.))  %>% # filter out the whitelisted edges
      filter(!(arc_id %in% arcs2names(mapk))) %>%
      select(-arc_id)
    tpr[i] <- get_performance(strength, fit2net(net), consider_direction = TRUE) %>%
      {nrow(filter(., label == 1, prediction >= thresh)) / nrow(filter(., label == 1))}
    l1[i] <- l1_error(strength, net)
  }
  # Add a starting error whit just observational data.
  algo_args <- list(tabu = 50, score = "bde", prior = "cs")
  strength <- boot_dags(.sim_data, cluster = cl, R = 200,
                         algorithm = "tabu", algorithm.args = algo_args,
                        random.graph.args = start_net_args) %>%
      custom.strength(names(.sim_data), cpdag = FALSE) %>% 
      mutate(arc_id = arcs2names(.))  %>% # filter out the whitelisted edges
      filter(!(arc_id %in% arcs2names(mapk))) %>%
      select(-arc_id)
    tpr <- get_performance(strength, fit2net(net), consider_direction = TRUE) %>%
      {nrow(filter(., label == 1, prediction >= thresh)) / nrow(filter(., label == 1))} %>%
      {c(., tpr)}
    l1 <- l1_error(strength, net) %>%
      {c(., l1)}
  message("L1: ", paste0(l1, collapse=", "))  
  message("TPR: ", paste0(round(tpr, 2), collapse=", "))  
  tpr_set[[j]] <- tpr
  l1_set[[j]] <- l1 
}
selected_tpr <- colMeans(do.call("rbind", tpr_set))
selected_L1 <-  colMeans(do.call("rbind", l1_set)) 
chime()
#TPR
#0.5137255 0.6392157 0.6725490 0.6843137 0.7803922 0.7647059
#L1 
#       5.379 5.164 4.878 3.875 3.344
#7.430 6.127 5.559 5.010 3.153 3.711
#6.590 5.431 5.330 4.799 3.614 3.548

So the selected intervention set will yield L1 errors of r selected_L1.

For evaluation, we compare this against many random orderings.

orderings <- permn(candidates)
# 30 of them is fine
orderings <- orderings[sample(length(orderings), 30)]
cl <- makeCluster(16)
l1_random_vals <- list()
tpr_random_vals <- list() # True positive rates
thresh <- .5 # Edge detection threshold for calculating tpr
for(j in 1:length(orderings)){
  print(orderings[[j]])
  target_ordering <- orderings[[j]]
  l1_random <- rep(NA, length(target_ordering))
  tpr_random <- rep(NA, length(target_ordering))
  for(i in 1:length(l1_random)){
    print("a")
    .sim <- rbn_inhibition(net, n = 200, target_ordering[1:i])
    int_array <- c(rep("obs", nrow(.data)), attr(.sim, "target"))
    ints <- lapply(names(.sim), function(item) which(item == int_array)) %>%
      set_names(names(.sim))
    .sim_data <- .sim %>%
      lapply(`levels<-`, value = c("1", "2", "3")) %>%
      data.frame %>%
      {rbind(.data, .)}
    algo_args <- list(tabu = 50, score = "mbde", exp = ints, prior = "cs")
    strength <- boot_dags(.sim_data, cluster = cl, R = 200,
                         algorithm = "tabu", algorithm.args = algo_args,
                        random.graph.args = start_net_args)  %>%
      custom.strength(names(.sim_data), cpdag = FALSE) %>%
      mutate(arc_id = arcs2names(.))  %>% # filter out the whitelisted edges
      filter(!(arc_id %in% arcs2names(mapk))) %>%
      select(-arc_id)
    tpr_random[i] <- get_performance(strength, fit2net(net), consider_direction = TRUE) %>%
      {nrow(filter(., label == 1, prediction >= thresh)) / nrow(filter(., label == 1))}
    l1_random[i] <- l1_error(strength, net)
  }
  # Add a starting error whit just observational data.
  algo_args <- list(tabu = 50, score = "bde", prior = "cs")
  strength <- boot_dags(.sim_data, cluster = cl, R = 200,
                         algorithm = "tabu", algorithm.args = algo_args,
                        random.graph.args = start_net_args) %>%
    custom.strength(names(.sim_data), cpdag = FALSE) %>% 
      mutate(arc_id = arcs2names(.))  %>% # filter out the whitelisted edges
      filter(!(arc_id %in% arcs2names(mapk))) %>%
      select(-arc_id)
  print("d")
  tpr_random <- get_performance(strength, fit2net(net), consider_direction = TRUE) %>%
      {nrow(filter(., label == 1, prediction >= thresh)) / nrow(filter(., label == 1))} %>%
      {c(., tpr_random)}
  l1_random <- l1_error(strength, net) %>%
      {c(., l1_random)}
  message("L1: ", paste0(l1_random, collapse=", "))  
  message("TPR: ", paste0(round(tpr_random, 2), collapse=", "))  
  tpr_random_vals[[j]] <- tpr_random
  l1_random_vals[[j]] <- l1_random
}
names(tpr_random_vals) <- sapply(orderings, paste, collapse = "-")
random_tpr <- do.call("rbind", tpr_random_vals) %>%
  colMeans 
names(l1_random_vals) <- sapply(orderings, paste, collapse = "-")
random_L1 <- do.call("rbind", l1_random_vals) %>%
  colMeans 
chime()
#TPR


#L1
#7.410200 6.847500 5.919333 5.219167 4.308167 3.336833. 
#6.506833 6.780333 5.877333 4.961000 4.309500 3.299333

Plotting the L1 error curves;

m <- mean(c(random_L1[1], selected_L1[1]))
random_L1[1] <- m
selected_L1[1] <- m
m <- mean(c(random_L1[6], selected_L1[6]))
random_L1[6] <- m
selected_L1[6] <- m

plot(1:6, selected_L1, type = "l", ylim = c(3, 7.5))
lines(1:6, random_L1, type = "l", lty = "dashed")

Plotting the TPR curves;

m <- mean(c(random_tpr[1], selected_tpr[1]))
random_tpr[1] <- m
selected_tpr[1] <- m

if(random_tpr[1] > random_tpr[2]) random_tpr[2] <- random_tpr[1]
m <- mean(c(random_tpr[6], selected_tpr[6]))
random_tpr[6] <- m
selected_tpr[6] <- m
plot(1:6, selected_tpr, type = "l")
lines(1:6, random_tpr, type = "l", lty = "dashed")

Now, we get an optimal ordering with the canonical edges .

I use the canonical edges as a white list, limiting the DAG search only to cases where the white list is available.

Start by simulating some DAGs.

algo_args <- list(tabu = 50, score = "bde", prior = "cs", whitelist = mapk)
start_net_args <- list(nodes = names(.data), whitelist = mapk)
cl <- makeCluster(4)
dags2 <- boot_dags(.data, cluster = cl, R = 500, m = nrow(.data), algorithm = "tabu",
                  algorithm.args = algo_args, random.graph.args = start_net_args)

Let's compare with the previous DAG distribution.

class_dist <- as.numeric(table(sapply(dags, vstring))) # Representation of posterior counts
class_dist2 <- as.numeric(table(sapply(dags2, vstring)))
plot(1:length(class_dist), sort(class_dist, decreasing = T), xlab = "Class ID", ylab = "Counts",
     type = "l", main = "Equivalence Class Frequency", sub = "Out of 500 Inferred DAGs")
lines(1:length(class_dist2), sort(class_dist2, decreasing = T), col = "red")
cl <- makeCluster(4)
al_results2 <- active_learning(dags2, candidates, cluster = cl, debug = TRUE)
#"PKA"  "PIP2" "PKC"  

Turns out we get roughly the same ordering. We now regenerate performance statistics for the original set of interventions using the original prior.

cl <- makeCluster(16)
target_ordering <- al_results$selected # al_results2$selected
l1_set <- list() # L1 Error rates
tpr_set <- list() # True positive rates
thresh <- .5 # Edge detection threshold for calculating tpr
for(j in 1:30){
  print(j)
  l1 <- rep(NA, length(target_ordering))
  tpr <- rep(NA, length(target_ordering))
  for(i in 1:length(target_ordering)){
    print(target_ordering[1:i])
    .sim <- rbn_inhibition(net, n = 200, target_ordering[1:i])
    int_array <- c(rep("obs", nrow(.data)), attr(.sim, "target"))
    ints <- lapply(names(.sim), function(item) which(item == int_array)) %>%
      set_names(names(.sim))
    .sim_data <- .sim %>% # Had to grab the 'target' attribute before this step.
      lapply(`levels<-`, value = c("1", "2", "3")) %>%
      data.frame %>%
      {rbind(.data, .)}
    algo_args <- list(tabu = 50, score = "mbde", prior = "cs", exp = ints, whitelist = mapk)
    strength <- boot_dags(.sim_data, cluster = cl, R = 200,
                         algorithm = "tabu", algorithm.args = algo_args,
                        random.graph.args = start_net_args)  %>%
      custom.strength(names(.sim_data), cpdag = FALSE) %>% 
      mutate(arc_id = arcs2names(.))  %>% # filter out the whitelisted edges
      filter(!(arc_id %in% arcs2names(mapk))) %>%
      select(-arc_id)
    tpr[i] <- get_performance(strength, fit2net(net), consider_direction = TRUE) %>%
      {nrow(filter(., label == 1, prediction >= thresh)) / nrow(filter(., label == 1))}
    l1[i] <- l1_error(strength, net)
  }
  # Add a starting error whit just observational data.
  algo_args <- list(tabu = 50, score = "bde", prior = "cs", whitelist = mapk)
  strength <- boot_dags(.sim_data, cluster = cl, R = 200,
                         algorithm = "tabu", algorithm.args = algo_args,
                        random.graph.args = start_net_args) %>%
      custom.strength(names(.sim_data), cpdag = FALSE) %>% 
      mutate(arc_id = arcs2names(.))  %>% # filter out the whitelisted edges
      filter(!(arc_id %in% arcs2names(mapk))) %>%
      select(-arc_id)
  tpr <- get_performance(strength, fit2net(net), consider_direction = TRUE) %>%
      {nrow(filter(., label == 1, prediction >= thresh)) / nrow(filter(., label == 1))} %>%
      {c(., tpr)}
  l1 <- l1_error(strength, net) %>%
      {c(., l1)}
  message("L1: ", paste0(l1, collapse=", "))  
  message("TPR: ", paste0(round(tpr, 2), collapse=", "))  
  tpr_set[[j]] <- tpr
  l1_set[[j]] <- l1 
}
selected_tpr2 <- colMeans(do.call("rbind", tpr_set))
selected_L12 <-  colMeans(do.call("rbind", l1_set)) 
chime()

Now comparing against random again.

orderings <- permn(candidates)
# 30 of them is fine
orderings <- orderings[sample(length(orderings), 30)]
cl <- makeCluster(16)
l1_random_vals <- list()
tpr_random_vals <- list() # True positive rates
thresh <- .5 # Edge detection threshold for calculating tpr
for(j in 1:length(orderings)){
  print(orderings[[j]])
  target_ordering <- orderings[[j]]
  l1_random <- rep(NA, length(target_ordering))
  tpr_random <- rep(NA, length(target_ordering))
  for(i in 1:length(l1_random)){
    .sim <- rbn_inhibition(net, n = 200, target_ordering[1:i])
    int_array <- c(rep("obs", nrow(.data)), attr(.sim, "target"))
    ints <- lapply(names(.sim), function(item) which(item == int_array)) %>%
      set_names(names(.sim))
    .sim_data <- .sim %>%
      lapply(`levels<-`, value = c("1", "2", "3")) %>%
      data.frame %>%
      {rbind(.data, .)}
    algo_args <- list(tabu = 50, score = "mbde", exp = ints, prior = "cs", whitelist = mapk)
    strength <- boot_dags(.sim_data, cluster = cl, R = 200,
                         algorithm = "tabu", algorithm.args = algo_args,
                        random.graph.args = start_net_args)  %>%
      custom.strength(names(.sim_data), cpdag = FALSE) %>% 
      mutate(arc_id = arcs2names(.))  %>% # filter out the whitelisted edges
      filter(!(arc_id %in% arcs2names(mapk))) %>%
      select(-arc_id)
    tpr_random[i] <- get_performance(strength, fit2net(net), consider_direction = TRUE) %>%
      {nrow(filter(., label == 1, prediction >= thresh)) / nrow(filter(., label == 1))}
    l1_random[i] <- l1_error(strength, net)
  }
  # Add a starting error whit just observational data.
  algo_args <- list(tabu = 50, score = "bde", prior = "cs", whitelist = mapk)
  strength <- boot_dags(.sim_data, cluster = cl, R = 200,
                         algorithm = "tabu", algorithm.args = algo_args,
                        random.graph.args = start_net_args)  %>%
      custom.strength(names(.sim_data), cpdag = FALSE) %>% 
      mutate(arc_id = arcs2names(.))  %>% # filter out the whitelisted edges
      filter(!(arc_id %in% arcs2names(mapk))) %>%
      select(-arc_id)
  tpr_random <- get_performance(strength, fit2net(net), consider_direction = TRUE) %>%
      {nrow(filter(., label == 1, prediction >= thresh)) / nrow(filter(., label == 1))} %>%
      {c(., tpr_random)}
  l1_random <- l1_error(strength, net) %>%
      {c(., l1_random)}
  message("L1: ", paste0(l1_random, collapse=", "))  
  message("TPR: ", paste0(round(tpr_random, 2), collapse=", "))  
  tpr_random_vals[[j]] <- tpr_random
  l1_random_vals[[j]] <- l1_random
}
names(tpr_random_vals) <- sapply(orderings, paste, collapse = "-")
random_tpr2 <- do.call("rbind", tpr_random_vals) %>%
  colMeans 
names(l1_random_vals) <- sapply(orderings, paste, collapse = "-")
random_L12 <- do.call("rbind", l1_random_vals) %>%
  colMeans
chime()


robertness/bninfo documentation built on May 27, 2019, 10:32 a.m.