R/tools.R

#' Scramble Labels in a Bayesian Network
#'
#' Scrambles the labels in random networks.  Intended to address any node ordering bias present in
#' random network generation algorithms
#' @param g a Bayesian network structure of class bn.struct
#' @export
scramble <- function(g){
  am <- amat(g)
  scramble <- sample(nodes(g))
  colnames(am) <- scramble
  rownames(am) <- scramble
  h <- empty.graph(scramble)
  amat(h) <- am
  h
}

#' Confirm a list of edges forms a DAG.
test_dag <- function(arc_list, debug = debug){
  if(debug) message("testing DAG constraint")
  isdag <- arc_list %>%
    as.data.frame %>%
    unique %>%
    graph.data.frame %>%
    is.dag
  if(!isdag && debug) message("failed DAG test")
  isdag
}

#' Convert bn.fit to bn
#' @export
fit2net <- function(netfit){
  net <- empty.graph(nodes(netfit))
  arcs(net) <- arcs(netfit)
  net
}

#' Covert Dataframe to a Model Averaging Object
#'
#'  Converts a data frame to an object of bn.strength, the class
#'  produced by model averaging in \emph{bn.learn}. The columns of the
#'  typical bn.strength object are "from", "to", "strength", and "direction".
#'  However this function does not check for presence of exclusivity of those items, it
#'  assumes you know what you are doing.
#'
#' @param a data frame
#' @param threshold desired value for threshold attribute
#' @param mode desired value for mode attribute
#' @export
rebuild_strength <- function(df, threshold, mode){
  if(!("data.frame" %in% class(df))) stop("df must be a data frame.")
  attr(df, "threshold") <- threshold
  attr(df, "mode") <- mode
  class(df) <- c("data.frame", "bn.strength")
  df
}

#' Convert an edge matrix to array of edge names
#'
#' Takes a matrix or data frame of edges, with from nodes in the first column, to nodes in the second.
#' Returns an array of edge names in the form of 'A->B' or 'A--B' for directed and undirected
#' edges respectfully.
#'
#' This is useful when you need a way of referencing edges, and you can easily infer the name by
#' which nodes are in the edge.  For example, when combine two data frames with different annotations
#' on the same set of edges, you can create a new 'edge_name' variable in both data frames, and merge the
#' two objects, using that variable in 'by' argument for the 'merge' function.
#' @export
#' @seealso \code{\link{name_edge_df}}, \code{\link{name_edges}}
arcs2names <- function(edges, directed_edges = TRUE, return_unique = FALSE){
  if(directed_edges){
    return(apply(edges[, 1:2, drop = F], 1, function(edge) paste(edge, collapse = "->")))
  }
  apply(edges[, 1:2, drop = F], 1, function(edge) paste(sort(edge), collapse = "--"))
}

#' Return array of edge names
#'
#' Given a network structure of class \emph{bn} returns an array of all the edge names.
#' Works with both undirected, directed, and partially directed graphs.
#' @param net object of class \emph{bn}
#' @return a vector of edge names of the form "A->B" or "A--B" depending on whether or not the edge
#' is directed or not.
#' @seealso \code{\link{name_edge_df}}, \code{\link{arcs2names}}
#' @export
name_edges <- function(net){
  directed_df <- data.frame(directed.arcs(net), stringsAsFactors = F) %>%
    mutate(edge_names = arcs2names(., directed_edges = TRUE),
           directed = TRUE)
  undirected_df <- data.frame(undirected.arcs(net), stringsAsFactors = F) %>%
    mutate(edge_names = arcs2names(., directed_edges = FALSE),
           directed = FALSE)
  unique(c(directed_df$edge_names, undirected_df$edge_names))
}

#' Convert arc matrix to a data frame with named edges
#'
#' Takes a network, and returns a dataframe containing its edges.  The first to
#' columns are from and to nodes, the third column is the name of the edge, i.e. a
#' string in the form "A->B" or "A--B", depending on whether or not the edge is
#' directed.  The first two columns correspond to the output of \emph{arcs},
#' \emph{undirected.arcs}, and \emph{directed.arcs}.  In the case of an undirected arc,
#' these functions will produce two rows in their output matrix, eg A--B has two rows
#' "A, B", and "B, A".  This function treats undirected arcs the same way.
#' @param net object of class \emph{bn}
#' @return a dataframe corresponding to a edge list, where the third column is an edge
#' attribute corresponding to a character name of the edge.
#' @export
#' @seealso \code{\link{name_edges}}, \code{\link{arcs2names}}
name_edge_df <- function(net){
  rbind(
    { directed.arcs(net) %>%
      data.frame(stringsAsFactors=FALSE) %>%
      mutate(edge_name = arcs2names(., directed_edges = TRUE))
    },
    { undirected.arcs(net) %>%
      data.frame(stringsAsFactors=FALSE) %>%
      mutate(edge_name = arcs2names(., directed_edges = FALSE))
    }
  )
}

#' DAG Simulation with Preferential Attachment
#'
#' Simulates a DAG using preferential attachment, and accepts a set of whitelisted
#' edges.
#' @param node_names The names of the nodes in the network.
#' @param whitelist A white list of edges, as a matrix of two columns.
#' The preferential attachment algorithm uses these edges as starting edges.
#' @export
ba_whitelist <- function(nodes, whitelist, ...){
  start <- igraph::graph_from_edgelist(whitelist, directed = TRUE)
  g <- igraph::sample_pa(length(nodes), start.graph = start, ...)
  igraph::V(g)[1:igraph::vcount(start)]$name <- igraph::V(start)$name
  igraph::V(g)[is.na(igraph::V(g)$name)]$name <- setdiff(nodes,igraph::V(start)$name)
  net <- bninfo::construct_bn(igraph::as_edgelist(g))
  net$learning$algo <- "preferential attachment"
  net
}

#' Generate random graphs with a white list of edges
#'
#' The bnlearn package has a random graph generator in the 'random.graph' function.
#' These random graphs are used as starting graphs in structure inference.  However,
#' if there is a set of white listed edges, some of those edges may clash with random
#' edges in the random graph.  For example, the whitelisted edges and random edges might
#' introduce cycles.  This function is a wrapper for bn.learn's random.graph function
#' that allows for a whitelist.  It uses the same arguments as random.graph, except for the
#' whitelist argument, which is the same as the whitelist argument used in bnlearn's structure
#' inference methods.  Ideally, the random.graph function would take a whitelist
#' argument and add random edges on a starter graph that included those whitelisted edges.
#' This presents a less elegant work-around, where random edges are added to nodes that are
#' not in the whitelist.  If all the nodes are in the whitelist, then simply a graph based on the
#' whitelist is return.
#' @param nodes a vector of character strings, the labels of the nodes.
#' @param a matrix with two columns (optionally labeled "from" and "to"), containing a set of arcs to be included in the graph.
#' @param num an integer, the number of graphs to be generated.
#' @param ... additional arguments passed to either the preferential attachment algorithm (if there is a whitelist) or the random.graph algorithm.
#' @export
random_graph <- function(nodes, whitelist = NULL, num = 1, ..., debug = FALSE){
  extra_args <- list(...)
  if(is.null(whitelist)){
    random.graph.args <- c(list(nodes = nodes, num = num), extra_args)
    start_nets <- do.call("random.graph", random.graph.args)
    if(num == 1) start_nets <- list(start_nets)
  }else{
    start_nets <- lapply(1:num, function(i) {
      ba_whitelist_args <- c(list(nodes = nodes, whitelist = whitelist), extra_args)
      do.call("ba_whitelist", ba_whitelist_args)
    })
  }
  start_nets
}

#' Apply Structure Inference from a Starting Net
#'
#' Applies a structure inference algorithm given a starting net.
#' @param net A starting net
#' @param algorithm character string for the structure inference algorithm to be applied.
#' @param algorithm.args named list of the arguments to be passed to the algorithm.
#' @param m a positive integer, the size of each bootstrap replicate.
#' @param int_array provided only if using the "mbde" score for causal inference, provide a character array where each element corresponds to a row in the dataframe, and whose value is the name of the feature that was subject to intervention in that row.
#' @export
infer_from_start_net <- function(net, algorithm, algorithm.args, m = NULL, int_array = NULL){
  require(bnlearn)
  if(is.null(m)) m <- nrow(algorithm.args$x)
  algorithm.args$start <- net
  sample_dex <- sample(nrow(algorithm.args$x), size = m, replace = TRUE)
  if(algorithm.args$score == "mbde"){
    sim_int <- int_array[sample_dex]
    exp <- lapply(names(algorithm.args$x), function(item){
      which(sim_int == item)
    })
    names(exp) <- names(algorithm.args$x)
    algorithm.args$exp <- exp
  }
  algorithm.args$x <- algorithm.args$x[sample_dex, ]
  do.call(algorithm, algorithm.args)
}

#' Infer Multiple Structures with Different Starting Graphs
#'
#' Infers multiple structures from different starting graphs.  In each iteration
#' the data is resampled.  If a whitelist is included in the inference algorithm
#' arguments, random graphs are generated from the whitelist using preferential
#' attachment.  Currently not implemented for blacklists.
#' @param data the data set targeted for inference.
#' @param cluster an optional cluster object from package parallel.
#' @param R a positive integer, the number of bootstrap replicates.
#' @param m a positive integer, the size of each bootstrap replicate.
#' @param algorithm a character string, the learning algorithm to be applied to the bootstrap replicates. Possible values are gs, iamb, fast.iamb, inter.iamb, mmpc, hc, tabu, mmhc and rsmax2. See bnlearn-package and the documentation of each algorithm for details.
#' @param algorithm.args a list of extra arguments to be passed to the learning algorithm.
#' @param random.graph.args arguments to be passed in random graph generation. If a whitelist
#' is provided in algorithm.args, the arguments are used in preferential attachment. Otherwise
#' they are passed to bnlearn's random.graph generation.
#' @export
boot_dags <- function(x, cluster = NULL, R = 200, m = nrow(data), algorithm,
                          algorithm.args = list(), random.graph.args = list()){
  algorithm.args$x <- x
  random.graph.args$nodes <- names(x)
  random.graph.args$num <- R
  if("whitelist" %in% names(algorithm.args)){
    random.graph.args$whitelist <- algorithm.args$whitelist
  }
  nets <- do.call("random_graph", random.graph.args)
  int_array <- NULL
  if("exp" %in% names(algorithm.args)){
    INT <- algorithm.args$exp
    int_array <- rep(NA, nrow(x))
    for(l in 1:length(INT)){
      int_array[INT[[l]]] <- names(INT)[l]
    }
    int_array[is.na(int_array)] <- "obs"
  }
  if(is.null(cluster)){
    result <- lapply(nets, infer_from_start_net, algorithm = algorithm, algorithm.args = algorithm.args, m = m, int_array = int_array)
  }else{
    result <- parLapply(cl = cluster, nets, infer_from_start_net, algorithm = algorithm, algorithm.args = algorithm.args, m = m, int_array = int_array)
  }
  result
}

#' Construct Bayesian network structure directly from arc matrix
#'
#' @param arc_mat a matrix of arcs (edge list), from nodes in the first column, to nodes in the
#' second column.
#' @return an object of bn.fit
#' @export
construct_bn <- function(arc_mat){
  node_names <- unique(c(arc_mat[, 1], arc_mat[, 2]))
  net <- bnlearn::empty.graph(node_names)
  bnlearn::`arcs<-`(net, value = arc_mat)
}



#' Break cycles to produce a DAG
#' Given a set of arcs of a directed graph, if the arcs contain a cycle,
#' break the cycle by reversing low probability arcs.
#' @param arc_list a list of arcs/edges from a directed graph structure
#' @param white_list a list of arcs/edges that are not to be broken.
#' @param probabilities associated with the arcs.  The algorithm will prioritize
#' low probability edges for reversal.
#' @export
break_cycles <- function(arc_list, probabilities = NULL){
  if(any(probabilities == 0)) stop("Don't include 0 probability edges.")
  if(is.null(probabilities)) probabilities <- rep(.5, nrow(arc_list))
  g <- arc_list[, c(1, 2)] %>%
    as.data.frame %>%
    mutate(prob = probabilities) %>%
    graph.data.frame(directed = TRUE)
  if(is_dag(g)) return(arc_list)
  E(g)$weight <- E(g)$prob ^ -1
  g_init <- g
  reversed_edges <- NULL
  untouchable_edges <- E(g)[prob == 1]
  repeat{
    path <- detect_cycle(g)
    if(length(path) == 0) {
      break
    }
    candidates_for_reversal <- E(g)[setdiff(path, c(reversed_edges, untouchable_edges))]
    if(length(candidates_for_reversal) == 0) stop("There are no edges eligible for reversal. Try adjusting probabilities.")
    target_edge <- candidates_for_reversal[which(candidates_for_reversal$weight ==
                                max(candidates_for_reversal$weight))][1]
    g <- reverse_edge(g, target_edge)
    E(g)$weight <- E(g_init)$weight
    reversed_edges <- c(reversed_edges, target_edge)
  }
  get.edgelist(g) %>%
    set_colnames(c("from", "to"))
}

#' Convert a DAG to an array of strings representing v-structures
#' Concatenates all the v-structures in a DAG into an array of strings.  Useful for counting equivalence classes.
#' For a v-structure of X->Z<-Y, returns an element Z~X,Y, where the order of X and Y is determined
#' by a sort.
#' @export
v_array <- function(net){
  mat <- vstructs(net, moral = FALSE)
  apply(mat, 1, function(vs){
    Ps <-  sort(c(vs[1], vs[3]))
    paste0(vs[2], "~", Ps[1],",", Ps[2])
  })
}
#' @rdname v_array
#' @export
v_string <- function(net){
  v_array(net) %>%
    sort %>%
    paste(collapse = ";")
}
robertness/bninfo documentation built on May 27, 2019, 10:32 a.m.