R/dimensionality.R

# Functions specific to the dimensionality vignette.

#' Simulating the Effect of Sample Size on Inference with Constraint-Based Algorithms
#'
#' Given a parameterized Bayesian network, this simulation draws data of a given
#' sample size, learns undirected edges from the simulated data using a constraint-based
#' algorithm, and records the number of tests used in the inference as well as
#' the numbers of true and false positives.  Several iterations are conducted and
#' the median number of tests, false positive count, and true positive count are returned.
#' @param net a parameterized Bayesian network of class \emph{bn.fit}
#' @param N a numeric vector of sample size values
#' @param algo a constraint-based algorithm: \emph{gs}, \emph{iamb}, \emph{fast.iamb}, or \emph{inter.iamb}
#' @param alpha the target nominal type I error rate
#' @param m the number of iterations per sample size
#' @param verbose if TRUE, print the sample size currently being tested
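#' @examples
#' \dontrun{
#' # A minimal usage sketch: the triplet network below is only an assumed
#' # ground-truth model; any Gaussian bn.fit object would work.
#' true_net <- triplet_network(m = 5, rho = 0.6)
#' sim <- sample_size_sim(true_net, N = c(100, 500, 1000), algo = gs, m = 20)
#' sim$median_tp_count
#' }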
#' @export
sample_size_sim <- function(net, N, algo, alpha = .05, m = 100, verbose = FALSE){
  if(!(deparse(substitute(algo)) %in% c("gs", "iamb", "fast.iamb", "inter.iamb"))){
    stop("algo must be a constraint-based algorithm: gs, iamb, fast.iamb, or inter.iamb.")
  }
  test_num <- rep(NA, length(N))
  fp_num <- rep(NA, length(N))
  tp_num <- rep(NA, length(N))
  for(j in seq_along(N)){
    if(verbose) message("testing for sample size = ", N[j])
    tests <- rep(NA, m)
    fps <- rep(NA, m)
    tps <- rep(NA, m)
    for(i in 1:m){
      inferred <- rbn(net, n = N[j]) %>%
        algo(undirected = TRUE, alpha = alpha)
      perf <- count_positives(inferred, moral(fit2net(net)))
      fps[i] <- perf$fp
      tps[i] <- perf$tp
      tests[i] <- inferred$learning$ntests
    }
    fp_num[j] <- median(fps)
    tp_num[j] <- median(tps)
    test_num[j] <- median(tests)
  }
  list(median_test_num = test_num, median_fp_count = fp_num, median_tp_count = tp_num)
}




#' Internal Helpers for Building Triplet Networks
#'
#' The simplest case for testing conditional independence is one of three
#' variables -- two to compare, and one variable upon which to condition the comparison.
#' These functions generate a Gaussian Bayesian network structure composed of
#' three-variable subnetworks of the form B <- C -> A and set their conditional
#' probability distributions.
#' @param m number of triplets.
#' @param reg_line an element of a \emph{bn.fit.gnet} object, treated here as a list
#' element since the class uses S3.  The element is a regression-line CPD with
#' coefficients of length 1 or 2.
#' @param rho the desired value of the correlation.
#' @param net an object of classes \emph{bn.fit} and \emph{bn.fit.gnet}.
#' @keywords internal
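#' @examples
#' \dontrun{
#' # Illustrative only: for m = 2, the arc set is
#' # C1 -> A1, C1 -> B1, C2 -> A2, C2 -> B2.
#' arcs(build_triplet_network(2))
#' }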
build_triplet_network <- function(m){
  arc_mat <- matrix(rep(0, 2 * 2 * m), ncol = 2)
  for(i in 1:m){
    r <- i * 2
    parent <- paste0("C", i)
    child1 <- paste0("A", i)
    child2 <- paste0("B", i)
    arc_mat[r - 1, ] <- c(parent, child1)
    arc_mat[r, ] <- c(parent, child2)
  }
  net <- empty.graph(c(paste0("A", 1:m),
                       paste0("B", 1:m),
                       paste0("C", 1:m)))
  arcs(net) <- arc_mat
  net
}
#' @rdname build_triplet_network
modify_triplet_cpd <- function(reg_line, rho){
  if(length(coef(reg_line)) == 1){
    reg_line$coefficients <- 0
    reg_line$sd <- 1
  } else{
    reg_line$coefficients <- c(0, rho)
    reg_line$sd <- sqrt(1 - rho^2)
  }
  reg_line
}
#' @rdname build_triplet_network
modify_triplet_cpds <- function(net, rho){
  # modify each node's local distribution with lapply
  net <- lapply(net, modify_triplet_cpd, rho = rho)
  class(net) <- c("bn.fit", "bn.fit.gnet")
  net
}

#' Gaussian Network Comprised of 3-Node Subnetworks
#'
#' The simplest case for testing conditional independence is one of three
#' variables -- two to compare, and one variable upon which to condition the comparison.
#' This function generates a Gaussian Bayesian network composed of three-variable
#' subnetworks of the form B <- C -> A.  Intercepts are fixed at 0 and marginal variances at 1.
#' @param m number of subnetworks.
#' @param rho the desired value of the correlation of A and B with C.
#' @return an object of classes \emph{bn.fit} and \emph{bn.fit.gnet}
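#' @examples
#' \dontrun{
#' # Sketch: three independent B <- C -> A triplets with rho = 0.7.
#' net <- triplet_network(m = 3, rho = 0.7)
#' # In simulated data, cor(A1, C1) and cor(B1, C1) should be near 0.7,
#' # while cor(A1, B1) reflects only the correlation induced through C1.
#' round(cor(rbn(net, 10000))[c("A1", "B1", "C1"), c("A1", "B1", "C1")], 2)
#' }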
#' @export
triplet_network <- function(m, rho){
  build_triplet_network(m) %>%
    sim_gbn %>%
    modify_triplet_cpds(rho)
}

#' Add Single Nodes to a Gaussian Bayesian Network
#'
#' Adds single nodes to a Gaussian Bayesian Network.  Used here to
#' increase dimension without increasing the number of edges.
#' @param gbn an object of the \emph{bn.fit} and \emph{bn.fit.gnet} classes.  The function will add single nodes to this object.
#' @param k number of nodes to add
#' @return an object of \emph{bn.fit}, \emph{bn.fit.gnet}
#' @keywords simulation
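#' @examples
#' \dontrun{
#' # Sketch: grow a 9-node triplet network to 29 nodes without adding edges.
#' net <- triplet_network(m = 3, rho = 0.7)
#' bigger <- add_singletons(net, k = 20)
#' length(nodes(bigger))  # 29
#' }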
#' @export
add_singletons <- function(gbn, k){
  new_gbn <- empty.graph(c(paste0("D", 1:k))) %>%
    sim_gbn %>%
    modify_triplet_cpds(1) %>%
    c(gbn)
  class(new_gbn) <- c("bn.fit", "bn.fit.gnet")
  new_gbn
}

#' Simulate Max Absolute Correlation
#'
#' This is specific to the dimensionality vignette.  Finds the maximum absolute value of
#' correlation between conditionally independent proteins.
#'
#' @param net a bn.fit.gnet object. This won't work with a graph that simulates non-numeric data.
#' @param n number of observations
#' @keywords simulation
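#' @examples
#' \dontrun{
#' # Sketch: the largest absolute sample correlation among node pairs that are
#' # non-adjacent in the moral graph, here for an assumed 10-triplet network.
#' net <- triplet_network(m = 10, rho = 0.5)
#' sim_cond_ind_cor(net, n = 200)
#' }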
#' @export
sim_cond_ind_cor <- function(net, n){
  independence_matrix <- net %>%
    fit2net %>% # convert fitted bn to a network structure
    moral %>% # convert to an undirected network
    amat %>% # get the adj matrix, where 0's indicate conditional independence
    {abs(. - 1)} # make the 0's into 1's and 1's into 0's
  rbn(net, n) %>% # Sim data
    cor %>% # Get the correlation matrix
    {. * independence_matrix} %>% # keep only independent elements
    {. * lower.tri(., diag = FALSE)} %>% # take half the matrix
    as.numeric %>% # convert to a numeric
    abs %>% # we want to look at the absolute correlation
    unique %>% # collapse the many 0's and duplicate values to single entries
    max # return the highest correlation
}
#' Simulate Median Correlation
#'
#' This is specific to the dimensionality vignette.  Finds the median
#' correlation between nodes that share edges.
#'
#' @param net a bn.fit.gnet object. This won't work with a graph that simulates non-numeric data.
#' @param n number of observations
#' @keywords simulation
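#' @examples
#' \dontrun{
#' # Sketch: with every edge generated at rho = 0.6, the median sample
#' # correlation across adjacent pairs should be close to 0.6.
#' net <- triplet_network(m = 10, rho = 0.6)
#' sim_median_cor(net, n = 1000)
#' }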
#' @export
sim_median_cor <- function(net, n){
  markov_elements <- net %>%
    fit2net %>%
    moral %>%
    amat %>%
    {which(. == 1, arr.ind = TRUE)}
  rbn(net, n) %>% # Sim data
    cor %>%
    {apply(markov_elements, 1, function(idx) .[idx[1], idx[2]] )} %>%
    unique %>% # not necessary, but each correlation appears twice in the symmetric matrix
    median # return the median correlation
}

#' Performance of Causal Inference
#'
#' This is specific to the dimensionality vignette.  Simulates data from
#' a network, applies a structure learning algorithm, and counts the true and false positives
#' of network inference.
#' @param net a bn.fit.gnet object. This won't work with a graph that simulates non-numeric data.
#' @param n number of observations
#' @param algo a constraint-based structure learning algorithm (defaults to \emph{gs})
#' @keywords simulation
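#' @examples
#' \dontrun{
#' # Sketch: learn the skeleton of an assumed triplet network with grow-shrink
#' # and count true/false positive edges against the true moral graph.
#' net <- triplet_network(m = 5, rho = 0.6)
#' test_markov_dependence_learning(net, n = 500, algo = gs)
#' }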
#' @export
test_markov_dependence_learning <- function(net, n, algo = gs){
  successful <- FALSE
  while(!successful){
    inferred <- tryCatch(algo(rbn(net, n), undirected = TRUE),
                         error = function(e) "failed")
    if(!identical(inferred, "failed")) successful <- TRUE
  }
  count_positives(inferred, moral(fit2net(net)))
}