R/bnsl.R

Defines functions phgs ppc bnsl

Documented in bnsl phgs ppc

#' Bayesian network structure learning
#'
#' Wrapper function for executing general structure learning algorithms, much like \code{\link[bnlearn]{rsmax2}} from the \code{\link[bnlearn]{bnlearn}} package, with the addition of the partitioned PC (pPC) algorithm, the \eqn{p}-value adjacency thresholding (PATH) algorithm, and the hybrid greedy initialization (HGI) algorithm.
#'
#' @param x a data frame containing the variables in the model. Currently, the implementations of pPC, PATH, and HGI only support discrete data.
#' @param restrict an argument as in \code{\link[bnlearn]{rsmax2}}, with the following additional options. 
#' \itemize{
#' \item \code{restrict = "ppc"} for the pPC algorithm (\code{\link{ppc}}).
#' \item \code{restrict = "true"} with \code{true_bn} supplied to perfectly restrict to the true skeleton.
#' \item \code{restrict = "cig"} with \code{true_bn} supplied to perfectly restrict to the conditional independence graph (CIG).
#' \item \code{restrict = ""} for no restriction method (i.e., for score-based methods).
#' }
#' @param maximize an argument as in \code{\link[bnlearn]{rsmax2}}, with the additional option of \code{maximize = ""} for no maximization method (i.e., for constraint-based methods).
#' @param restrict.args an argument as in \code{\link[bnlearn]{rsmax2}}, with the addition of the following arguments only applicable to \code{restrict = "ppc"} (\code{\link{ppc}}). 
#' \itemize{
#' \item \code{max_groups = 20}: a numeric value indicating maximum number of groups to partition into. The PC(-stable) algorithm may be recovered with \code{max_groups = 1}, allowing for the \code{sort_pval} argument.
#' \item \code{sort_pval = TRUE}: a logical value indicating whether or not to sort the order of the consideration of conditioning sets by the current p-values. 
#' \item \code{max_wthn_sx}: a numeric value indicating the maximum size of considered conditioning sets when estimating edges within clusters. 
#' \item \code{max_btwn_sx}: a numeric value indicating the maximum size of considered conditioning sets when estimating edges between clusters.
#' \item \code{max_btwn_nbr}: a numeric value indicating the maximum neighborhood size when estimating edges between clusters.
#' \item \code{maxp}: a numeric value indicating the maximum number of parents for a node, as in \code{\link[bnlearn]{hc}} and \code{\link[bnlearn]{tabu}}. Useful when \code{hgi = TRUE}. Should be less than or equal to \code{maximize.args$maxp}.
#' }
#' Additionally, \code{\link{ppc}} only uses \code{test = "mi"} for the clustering step, though subsequent stages of the algorithm can employ different tests.
#' @param maximize.args an argument as in \code{\link[bnlearn]{rsmax2}}, with the \code{maxp} argument likewise used constraint-based edge orientation, if applicable. Additionally, the \code{score} argument determines the score used in PATH (\code{score = "pred-loglik"} not supported), and HGI exclusively uses \code{score = "bic"}.
#' @param undirected a logical value indicating, for constraint-based algorithms, whether the skeleton should be output without learning edge orientations. Not applicable for hybrid methods, score-based methods, or when \code{path} or \code{hgi} are activated.
#' @param path a numeric value indicating the number of solution(s) to be generated by the PATH algorithm. By default, PATH remains deactivated with \code{path = 1}. Only applicable for \code{restrict = "ppc"}.
#' @param min_alpha a numeric value between \code{0} and \code{restrict.args$alpha} that indicates the minimum threshold value for the PATH algorithm.
#' @param hgi a logical value activating the HGI algorithm for greedy edge orientation. Not applicable if \code{restrict = ""}.
#' @param true_bn a \code{bn} object with the true underlying structure of \code{x} to evaluate d-separation tests instead of conditional independence tests. Only applicable for \code{restrict = "ppc"} or \code{restrict = "true"}.
#' @param whitelist,blacklist,debug arguments as in \code{\link[bnlearn]{rsmax2}}.
#' @details None.
#' @return A Bayesian network as an object of class \code{bn}.
#' @author Jireh Huang (\email{jirehhuang@@ucla.edu})
#' @references \href{https://doi.org/10.1007/s10994-022-06145-4}{Huang, J., & Zhou, Q. (2022). Partitioned hybrid learning of Bayesian network structures. \emph{Machine Learning}. https://doi.org/10.1007/s10994-022-06145-4}
#' @seealso \code{\link{ppc}}, \code{\link{phgs}}
#' @examples
#' ## Read Bayesian network object 
#' true_bn <- bnrepository("child")
#' 
#' ## Generate data and relevel for simplicity
#' set.seed(1)
#' x <- bnlearn::rbn(true_bn, n = 1e4)
#' x <- as.data.frame(sapply(x, function(x) as.factor(as.integer(x) - 1L)),
#'                    stringsAsFactors = TRUE)
#' 
#' ## pPC with PATH
#' bn1 <- bnsl(x = x, restrict = "ppc", maximize = "",
#'             restrict.args = list(alpha = 1e-3, max.sx = 3, sort_pval = TRUE),
#'             maximize.args = list(maxp = 8), path = 10, min_alpha = 1e-5,
#'             hgi = FALSE, debug = TRUE)
#' 
#' ## pHGS (pPC with PATH, HGI, and greedy search)
#' bn2 <- bnsl(x = x, restrict = "ppc", maximize = "tabu",
#'             restrict.args = list(alpha = 1e-3, max.sx = 3, sort_pval = TRUE),
#'             maximize.args = list(maxp = 8, tabu = 10, max.tabu = 10),
#'             path = 10, min_alpha = 1e-5, hgi = TRUE, debug = TRUE)
#' 
#' ## MMHC with HGI
#' bn3 <- bnsl(x = x, restrict = "mmpc", maximize = "hc",
#'             restrict.args = list(alpha = 1e-3, max.sx = 3, sort_pval = TRUE),
#'             maximize.args = list(maxp = 8), hgi = TRUE, debug = TRUE)
#' @export

bnsl <- function(x, restrict = "ppc", maximize = "tabu", 
                 restrict.args = list(), maximize.args = list(), 
                 undirected = FALSE, path = 1, min_alpha = 1e-5, 
                 hgi = FALSE, true_bn = NULL, whitelist = NULL, 
                 blacklist = NULL, debug = FALSE) 
{
  times <- tests <- 
    c(restrict = 0, orient = 0, maximize = 0)
  
  nodes <- names(x)
  if (!(restrict <- ifelse(is.null(restrict), "", restrict)) %in% c("ppc", "true", 
                                                                    "cig", "")){
    bnlearn:::check.learning.algorithm(restrict, class = c("constraint"))
  }
  if (!(maximize <- ifelse(is.null(maximize), "", maximize)) %in% c(""
  )){
    bnlearn:::check.learning.algorithm(maximize, class = "score")
  }
  bnlearn:::check.logical(hgi)
  debug_cli(!is.numeric(path) || (path %% 1 != 0) || is.infinite(path), 
            cli::cli_abort, "path must be a positive numeric integer")
  if (restrict == "" && maximize == "")
    debug_cli(TRUE, cli::cli_abort, 
              "no method given for both restrict and maximize")
  else if ((restrict == "mmpc") && (maximize == "hc"))
    method <- "mmhc"
  else if ((restrict == "hpc") && (maximize == "hc")) 
    method <- "h2pc"
  else 
    method <- sprintf("%s%s", restrict, maximize)
  method <- sprintf("%s%s", method, ifelse(hgi, "-hgi", ""))
  method <- ifelse(method %in% c("ppchc-hgi",
                                 "ppctabu-hgi"), "phgs", method)
  algo <- gsub("-.*", "", method)
  if (restrict == ""){
    if (path > 1){
      debug_cli(TRUE, cli::cli_alert_warning,
                "deactivating path because no restrict method supplied")
      path <- 1
    }
    if (hgi){
      debug_cli(TRUE, cli::cli_alert_warning,
                "deactivating hgi because no restrict method supplied")
      hgi <- FALSE
    }
  }
  else if (maximize == ""){
    if (restrict %in% c("ppc", "true", "cig")){
      algo <- "pc.stable"
    }
  }
  else if (!method %in% bnlearn:::hybrid.algorithms){
    algo <- "rsmax2"
  }
  if (path > 1 && (restrict != "ppc" ||
                   !is.null(true_bn))){
    debug_cli(TRUE, cli::cli_alert_warning,
              c("path can only be used for finite-sample executions of ppc; path deactivated"))
    path <- 1
  }
  min_alpha <- bnlearn:::check.alpha(min_alpha)
  bnlearn:::check.logical(undirected)
  undirected <- (undirected || restrict == "") &&
    !(path > 1) && (!hgi)  # not undirected if path or hgi
  if (!is.null(true_bn)){
    if (any(class(true_bn) == "bn.fit")){
      arcs <- bnlearn::arcs(true_bn)
      true_bn <- bnlearn::empty.graph(nodes = bnlearn::nodes(true_bn))
      bnlearn::arcs(true_bn) <- arcs
    }
    bnlearn:::check.bn(true_bn)
  }
  
  debug_cli(debug, cli::cli_alert_info,
            c("structure learning with restrict = {restrict}, ",
              "path = {path}, hgi = {hgi}, maximize = {maximize}"))
  
  rst <- NULL  
  if (restrict %in% c(bnlearn:::constraint.based.algorithms, "ppc", "true", "cig")) {
    
    start_time <- Sys.time()
    
    critical.arguments <- c("x", "method", "whitelist", 
                            "blacklist", "true_bn", "debug")
    named.arguments <- names(formals(skeleton))
    named.arguments <- setdiff(named.arguments, critical.arguments)
    other.arguments <- setdiff(names(restrict.args), named.arguments)
    bnlearn:::check.unused.args(other.arguments, character(0))
    restrict.args[critical.arguments] <- list(x, method = restrict, whitelist = whitelist, 
                                              blacklist = blacklist, true_bn = true_bn, debug = debug)
    rst <- do.call("skeleton", restrict.args)
    
    end_time <- Sys.time()
    times[1] <- as.numeric(end_time - start_time, unit = "secs")
    tests[1] <- sum(attr(rst, "learning")$ntests)
  }
  if (undirected){  
    
    ## no path or hgi
    debug_cli(all(c(restrict, maximize) == ""), cli::cli_abort, 
              "valid constraint-based algorithm must be supplied if undirected graph desired")
    arcs <- bnlearn:::nbr2arcs(rst)
    learning <- c(attr(rst, "learning"), 
                  list(algo = method, undirected = undirected,
                       illegal = bnlearn:::check.arcs.against.assumptions(NULL, x, 
                                                                          attr(rst, "learning")$test)))
    res <- list(learning = learning, 
                nodes = bnlearn:::cache.structure(names(x), arcs = arcs), arcs = arcs)
  }
  else if (restrict != ""){  ## TODO: check if necessary
    
    start_time <- Sys.time()
    
    ## orient, with or without path and with or without hgi
    restrict.args[c("whitelist", "blacklist")] <- attr(rst, "learning")[c("whitelist", 
                                                                          "blacklist")]
    args <- c(restrict.args,
              list(local.structure = rst, score = maximize.args$score,
                   path = path, min_alpha = min_alpha, hgi = hgi, 
                   complete = attr(rst, "learning")$complete))
    if (is.null(args$maxp))
      args$maxp <- maximize.args$maxp
    else if (!is.null(maximize.args$maxp))
      args$maxp <- min(args$maxp, maximize.args$maxp)
    res <- do.call("orient", args[names(args) %in% names(formals(orient))])
    # res$learning["blacklist"] = list(blacklist)  # TODO: check if necessary
    
    end_time <- Sys.time()
    times[2] <- as.numeric(end_time - start_time, unit = "secs")
    tests[2] <- res$learning$tests
  }
  if (maximize == ""){
    res <- structure(res, class = "bn")
  } 
  else{
    
    start_time <- Sys.time()
    
    if (restrict != ""){
      
      ## restricted greedy search
      start <- bnlearn::empty.graph(nodes = nodes)
      bnlearn::arcs(start) <- res$arcs
      bnlearn::arcs(start) <- bnlearn::directed.arcs(start)
      constraints <- bnlearn:::arcs.to.be.added(amat = res$arcs, nodes = nodes, 
                                                whitelist = res$learning$blacklist)
    } 
    else{
      
      constraints <- start <- NULL
    }
    critical.arguments <- c("x", "start", "heuristic", 
                            "whitelist", "blacklist", "debug")
    named.arguments <- names(formals(bnlearn:::greedy.search))
    named.arguments <- setdiff(named.arguments, critical.arguments)
    bnlearn:::check.unused.args(intersect(critical.arguments, names(maximize.args)), 
                                character(0))
    if (any(bnlearn:::which.undirected(whitelist, nodes = nodes))) 
      whitelist <- bnlearn:::cpdag.arc.extension(whitelist, nodes = nodes)
    maximize.args[critical.arguments] <- list(x, start = start, heuristic = maximize, 
                                              whitelist = whitelist, blacklist = constraints, 
                                              debug = debug >= 3)
    rst <- res[c("learning", "nodes", "arcs")]
    res <- do.call(bnlearn:::greedy.search, maximize.args)
    
    ## rst args
    res$learning$args[c("alpha", "min_alpha",
                        "best_alpha")] <- rst$learning$args[c("alpha", "min_alpha",
                                                              "best_alpha")]
    ## rst
    res$learning[c("rstest", "max.sx", "group", "path", 
                   "hgi", "whitelist", "blacklist", "arcs1")] <- 
      rst$learning[c("test", "max.sx", "group", "path", 
                     "hgi", "whitelist", "blacklist", "arcs1")]
    
    end_time <- Sys.time()
    times[3] <- as.numeric(end_time - start_time, unit = "secs")
    tests[3] <- res$learning$ntests - 
      (hgi * length(nodes))  # reference scores already computed in hgi
    
    debug_cli(debug, cli::cli_alert_success,
              c("completed greedy search with {maximize} ",
                "in {prettyunits::pretty_sec(times[3])} with {tests[3]} calls"))
  }
  ## other
  res$learning$maxscore <- res$learning$test
  res$learning[c("ntests", "algo", "method",
                 "undirected", "restrict", "maximize")] <- list(sum(tests), algo, method, 
                                                                undirected, restrict, maximize)
  res$learning[c("times", "tests")] <- list(times, tests)
  
  debug_cli(debug, cli::cli_alert_success,
            c("completed structure learning ",
              "in {prettyunits::pretty_sec(sum(times))} with {sum(tests)} calls"))
  
  return(invisible(res))
}



#' The partitioned PC (pPC) algorithm
#'
#' Function for executing the partitioned PC (pPC) algorithm, including the option to use the \eqn{p}-value adjacency thresholding (PATH) algorithm. 
#'
#' @param x,undirected,alpha,max.sx,maxp,path,min_alpha,hgi,max_wthn_sx,max_btwn_sx,max_btwn_nbr,sort_pval,max_groups,true_bn,cluster,whitelist,blacklist,debug see \code{\link{bnsl}} for a description of arguments.
#' @return A Bayesian network as an object of class \code{bn}.
#' @author Jireh Huang (\email{jirehhuang@@ucla.edu})
#' #' @references \href{https://doi.org/10.1007/s10994-022-06145-4}{Huang, J., & Zhou, Q. (2022). Partitioned hybrid learning of Bayesian network structures. \emph{Machine Learning}. https://doi.org/10.1007/s10994-022-06145-4}
#' @seealso \code{\link{bnsl}}, \code{\link{phgs}}
#' @examples
#' ## Read Bayesian network object 
#' true_bn <- bnrepository("child")
#' 
#' ## Generate data and relevel for simplicity
#' set.seed(1)
#' x <- bnlearn::rbn(true_bn, n = 1e4)
#' x <- as.data.frame(sapply(x, function(x) as.factor(as.integer(x) - 1L)),
#'                    stringsAsFactors = TRUE)
#' 
#' ## pPC with PATH
#' bn1 <- ppc(x = x, alpha = 1e-2, max.sx = 3, maxp = 8, path = 10,
#'            min_alpha = 1e-5, sort_pval = TRUE, max_groups = 20, 
#'            debug = TRUE)
#' 
#' ## PC(-stable) with PATH
#' bn2 <- ppc(x = x, alpha = 1e-2, max.sx = 3, maxp = 8, path = 10,
#'            min_alpha = 1e-5, sort_pval = TRUE, max_groups = 1, 
#'            debug = TRUE)
#' 
#' ## pPC with PATH and HGI (no greedy search)
#' bn3 <- ppc(x = x, alpha = 1e-2, max.sx = 3, maxp = 8, path = 10,
#'            min_alpha = 1e-5, hgi = TRUE, sort_pval = TRUE, 
#'            max_groups = 20, debug = TRUE)
#' @export

ppc <- function(x, undirected = FALSE, alpha = NULL, max.sx = NULL, 
                maxp = NULL, path = 1, min_alpha = 1e-5, hgi = FALSE,
                max_wthn_sx = max.sx, max_btwn_sx = max.sx, max_btwn_nbr = ncol(x)-2,
                sort_pval = TRUE, max_groups = 20, true_bn = NULL,
                cluster = NULL, whitelist = NULL, blacklist = NULL, debug = FALSE){
  
  restrict.args <- list(cluster = cluster, test = "mi", alpha = alpha, 
                        max.sx = max.sx, max_wthn_sx = max_wthn_sx, 
                        max_btwn_sx = max_btwn_sx, max_btwn_nbr = max_btwn_nbr, 
                        sort_pval = sort_pval, max_groups = max_groups)
  
  maximize.args <- list(maxp = maxp)
  
  bnsl(x = x, restrict = "ppc", maximize = "",
       restrict.args = restrict.args, maximize.args = maximize.args,
       undirected = undirected, path = path, min_alpha = min_alpha, 
       hgi = hgi, true_bn = true_bn, whitelist = whitelist,
       blacklist = blacklist, debug = debug)
}



#' The partitioned hybrid greedy search (pHGS) algorithm
#'
#' Function for executing the partitioned hybrid greedy search (pHGS) algorithm, a combination of the partitioned PC (pPC) algorithm, the \eqn{p}-value adjacency thresholding (PATH) algorithm, and the hybrid greedy initialization (HGI) algorithm.
#'
#' @param x,maximize,maximize.args,alpha,max.sx,maxp,path,min_alpha,max_wthn_sx,max_btwn_sx,max_btwn_nbr,sort_pval,max_groups,true_bn,cluster,whitelist,blacklist,debug see \code{\link{bnsl}} for a description of arguments.
#' @details See \code{\link{bnsl}}.
#' @return A Bayesian network as an object of class \code{bn}.
#' @author Jireh Huang (\email{jirehhuang@@ucla.edu})
#' #' @references \href{https://doi.org/10.1007/s10994-022-06145-4}{Huang, J., & Zhou, Q. (2022). Partitioned hybrid learning of Bayesian network structures. \emph{Machine Learning}. https://doi.org/10.1007/s10994-022-06145-4}
#' @seealso \code{\link{bnsl}}, \code{\link{ppc}}
#' @examples
#' ## Read Bayesian network object 
#' true_bn <- bnrepository("child")
#' 
#' ## Generate data and relevel for simplicity
#' set.seed(1)
#' x <- bnlearn::rbn(true_bn, n = 1e4)
#' x <- as.data.frame(sapply(x, function(x) as.factor(as.integer(x) - 1L)),
#'                    stringsAsFactors = TRUE)
#' 
#' ## pHGS (pPC with PATH and HGI) with hill-climbing
#' bn1 <- phgs(x = x, maximize = "hc", 
#'             maximize.args = list(maxp = 8, restart = 10, perturb = 10),
#'             alpha = 1e-2, max.sx = 3, maxp = 8, path = 10,
#'             min_alpha = 1e-5, sort_pval = TRUE, max_groups = 20, 
#'             debug = TRUE)
#' 
#' ## pHGS (pPC with PATH and HGI) with tabu search
#' bn2 <- phgs(x = x, maximize = "tabu", 
#'             maximize.args = list(maxp = 8, tabu = 10, max.tabu = 10),
#'             alpha = 1e-2, max.sx = 3, maxp = 8, path = 10,
#'             min_alpha = 1e-5, sort_pval = TRUE, max_groups = 20, 
#'             debug = TRUE)
#' @export

phgs <- function(x, maximize = "tabu", maximize.args = NULL, alpha = NULL, 
                 max.sx = NULL, maxp = NULL, path = 1, min_alpha = 1e-5,
                 max_wthn_sx = max.sx, max_btwn_sx = max.sx, max_btwn_nbr = ncol(x)-2,
                 sort_pval = TRUE, max_groups = 20, true_bn = NULL,
                 cluster = NULL, whitelist = NULL, blacklist = NULL, debug = FALSE){
  
  restrict.args <- list(cluster = cluster, test = "mi", alpha = alpha, 
                        max.sx = max.sx, max_wthn_sx = max_wthn_sx,
                        max_btwn_sx = max_btwn_sx, max_btwn_nbr = max_btwn_nbr, 
                        sort_pval = sort_pval, max_groups = max_groups)
  
  if (!is.null(maxp) && is.null(maximize.args$maxp))
    maximize.args$maxp <- maxp
  
  bnsl(x = x, restrict = "ppc", maximize = maximize,
       restrict.args = restrict.args, maximize.args = maximize.args,
       undirected = FALSE, path = path, min_alpha = min_alpha, 
       hgi = TRUE, true_bn = true_bn, whitelist = whitelist,
       blacklist = blacklist, debug = debug)
}
jirehhuang/phsl documentation built on May 23, 2022, 4:19 a.m.