R/orient.R

Defines functions arcs2dag get_rs_delta build_alpha_path score_arcs_path orient

# Orient edges
# 
# Extended version of bnlearn:::learn.arc.directions() that includes
# p-value adjacency thresholding (PATH) and hybrid greedy initialization
# (HGI). 

orient <- function(x, cluster = NULL, local.structure, whitelist, blacklist, 
                   test, score, alpha, complete, B = NULL, max.sx = ncol(x)-2, 
                   maxp = ncol(x)-1, path = 1, min_alpha = 1e-5, hgi = FALSE, 
                   true_bn = NULL, debug = FALSE)
{
  times <- tests <- 
    c(detect = bnlearn::test.counter(), apply = 0, rules = 0, select = 0)
  
  test <- bnlearn:::check.test(test, x)
  score <- bnlearn:::check.score(score, x)
  alpha <- bnlearn:::check.alpha(alpha)
  B <- bnlearn:::check.B(B, test)
  max.sx <- bnlearn:::check.largest.sx.set(max.sx, 
                                           x)
  maxp <- bnlearn:::check.maxp(maxp,
                               x)
  nodes = names(x)
  arcs = bnlearn:::mb2arcs(local.structure, nodes)
  to.drop = !apply(arcs, 1, function(x) {
    bnlearn:::is.blacklisted(blacklist, x)
  })
  arcs = arcs[to.drop, , 
              drop = FALSE]
  
  is_discrete <- class(x[[1]]) %in% c("factor", "integer")
  if (path <= 1){
    path <- 1
    min_alpha <- alpha
    alpha_path <- alpha
  } else{
    debug_cli(min_alpha >= alpha, cli::cli_abort, 
              "min_alpha must be less than alpha")
    min_alpha <- bnlearn:::check.alpha(min_alpha)
  }
  best_alpha <- alpha
  
  
  ## detect v-structures
  start_time_d <- Sys.time()
  if (!is.null(true_bn)){  
    ## if true_bn is supplied, local.structure is the true skeleton,
    ## so vstruct.detect_ will recover all and only true v-structures
    ## thus, for speed purposes, simply extract v-structures from true_bn
    vs <- cbind(data.frame(max_a = 0), 
                as.data.frame(vstructs_(x = true_bn), 
                              stringsAsFactors = FALSE))
    names(vs) <- c("max_a", "y", "x", "z")
  } else{
    vs <- do.call("rbind", vstruct.detect_(nodes = nodes, arcs = arcs, mb = local.structure, 
                                           data = x, alpha = alpha, B = B, test = test, 
                                           blacklist = blacklist, max.sx = max.sx, 
                                           complete = complete, true_bn = true_bn, 
                                           debug = debug))
    if (is.null(vs))
      vs <- matrix(nrow = 0, ncol = 4)
  }
  rownames(vs) = NULL
  end_time_d <- Sys.time()
  times[1] <- ifelse(is.null(true_bn), 
                     as.numeric(end_time_d - start_time_d, unit = 'secs'), 0)
  tests[1] <- bnlearn::test.counter() - tests[1]
  
  debug_cli(debug, cli::cli_alert,
            c("detected {nrow(vs)} v-structures ",
              "in {prettyunits::pretty_sec(times[1])} with {tests[1]} calls"))
  
  
  ## initialize
  lambda <- 0.5  # TODO: lambda parameter
  attr(arcs, "alpha") <- alpha
  
  pdag <- list(learning = list(whitelist = whitelist, blacklist = blacklist, test = test, 
                               args = list(alpha = alpha, min_alpha = alpha, best_alpha = alpha),
                               ntests = 0, max.sx = max.sx, 
                               group = attr(local.structure, "learning")$group,
                               path = 1, hgi = hgi, valid = TRUE),
               nodes = NULL, arcs = arcs, 
               arcs_path = list(arcs))
  if (!is.null(B)) 
    pdag$learning$args$B = B
  names(pdag$arcs_path) <- alpha
  
  ## if there are any estimated arcs
  if (nrow(arcs)){
    
    key <- build_key(nodes = nodes)
    
    if (path > 1){
      
      ## build alpha_path
      arc_indices <- apply(arcs, 1, function(arc) find_arc_index(arc_ind = key[arc], 
                                                                 nnodes = length(nodes)))
      p_vec <- unname(sapply(arc_indices, function(x) attr(local.structure, 
                                                           "dsep.set")[[x]]$p.value))
      path <- min(path, sum((unique_p_vec <- unique(p_vec)) <= alpha) + 1)
      alpha_path <- build_alpha_path(p_vec = p_vec, alpha = alpha, 
                                     min_alpha = min_alpha, n_alpha = path)
      min_alpha <- max(min_alpha, min(alpha_path))
      path <- length(alpha_path)
      
      ## list of v-structure indicators for each alpha
      bool_vs_path <- lapply(alpha_path, function(a){
        bool_arcs <- (p_vec <= a)
        if (!is.null(vs) && nrow(vs) && any(bool_arcs)){
          bool_vs <- apply(vs, 1, function(v){
            i <- find_arc_index(arc_ind = sort(key[v[c("x", "y")]]), 
                                nnodes = length(nodes))
            j <- find_arc_index(arc_ind = sort(key[v[c("x", "z")]]), 
                                nnodes = length(nodes))
            if (!setequal(v[c("x", "y")],
                          attr(local.structure, "dsep.set")[[i]]$arc) ||
                !setequal(v[c("x", "z")],
                          attr(local.structure, "dsep.set")[[j]]$arc)){
              cat("Problem\n")
            }
            max(attr(local.structure, "dsep.set")[[i]]$p.value,
                attr(local.structure, "dsep.set")[[j]]$p.value) <= a
          })
          bool_vs <- vs[, "max_a"] > a & bool_vs
        } else{
          bool_vs <- logical(0)
        }
        attr(bool_vs, "alpha") <- a
        return(bool_vs)
      })
    } else {
      p_vec <- numeric(nrow(arcs))
      bool_vs_path <- list(sapply(seq_len(nrow(vs)), function(x) TRUE))
      attr(bool_vs_path[[1]], "alpha") <- alpha
    }
    ## cache initial reference scores and score deltas for hgs
    if (hgi){
      rs_delta <- get_rs_delta(arcs = arcs, vs = vs, nodes = nodes, data = x, debug = FALSE)
    } else{
      rs_delta <- NULL
    }
    ## determine arcs_path
    pdag$arcs_path <- lapply(seq_len(length(bool_vs_path)), function(i){
      
      bool_vs <- bool_vs_path[[i]]
      
      ## TODO: check if all necessary
      nundir <- 0
      arcs <- arcs[p_vec <= attr(bool_vs, "alpha"), , drop=FALSE]
      vs_temp <- vs[as.logical(bool_vs), , drop=FALSE]
      rs <- sapply(nodes, function(x) 0)
      
      ## apply v-structures, if any
      if (nrow(vs_temp)){
        
        ## TODO: check if still having memory reference issues
        if (hgi){
          deep_copy_NumericVector(original = rs_delta$rs, copy = rs)
          deep_copy_NumericVector(original = rs_delta$delta[bool_vs], 
                                  copy = vs_temp$max_a)
        }
        arcs <- vstruct.apply_(arcs = arcs, vs = vs_temp, nodes = nodes, 
                               data = if (hgi) x else NULL, rs = rs,
                               lambda = lambda, maxp = maxp, debug = debug)
        times[2] <- attr(arcs, "time")
        tests[2] <- attr(arcs, "nscores")
        
        ## TODO: check if still having memory reference issues
        if (!is.null(attr(arcs, "rs")))
          deep_copy_NumericVector(original = attr(arcs, "rs"), copy = rs)
      }
      ## apply orientation rules (R1-4 or extend)
      if (nrow(arcs)){
        
        start_time <- Sys.time()
        if (!hgi){
          nscores <- 0
          pdag = list(learning = list(), nodes = structure(rep(0, length(nodes)), 
                                                           names = nodes), arcs = arcs)
          amat <- bnlearn:::arcs2amat(arcs = pdag$arcs, nodes = nodes)
          amat <- apply_cpdag_rules(pdag = amat, nodes = nodes, 
                                    remove_invalid = TRUE, debug = debug >= 3)
          arcs <- bnlearn:::amat2arcs(a = amat, nodes = nodes)
          
        } else{
          
          ## initialize
          pdag <- bnlearn:::arcs2amat(arcs, nodes)
          dag <- pdag - pdag * t(pdag)
          mode(dag) <- "integer"
          nscores <- numeric(1)
          
          ## greedy extension
          a <- pdag2dag_greedy(a = pdag,
                               d = dag,
                               reference = rs,
                               nodes = nodes,
                               data = x,
                               nscores = nscores,
                               is_discrete = is_discrete,
                               k = lambda * log(nrow(x)), 
                               maxp = maxp,
                               verbose = FALSE)
          
          ## transfer to arcs, including deleted arcs as undirected edges
          ## TODO: check if undirecting cycles necessary
          # dag <- undirect_cycles(dag)
          dag <- dag - dag * t(dag)
          pdag <- bnlearn:::arcs2amat(arcs, nodes)
          pdag[dag | t(dag)] <- 0
          mode(pdag) <- "integer"
          arcs <- bnlearn:::amat2arcs(pdag + dag, nodes)
          attr(arcs, "rs") <- rs
          # attr(arcs, "nundir") <- (sum(a * t(a)) / 2)
        }
        end_time <- Sys.time()
        times[3] <- as.numeric(end_time - start_time, unit = 'secs')
        tests[3] <- nscores
        
        debug_cli(debug, cli::cli_alert,
                  c("applied rules for estimate {i} ",
                    "in {prettyunits::pretty_sec(times[3])} with {tests[3]} calls"), 
                  .envir = environment())
      }
      
      attr(arcs, "alpha") <- attr(bool_vs, "alpha")
      attr(arcs, "times") <- times
      attr(arcs, "tests") <- tests
      
      return(arcs)
    })
    names(pdag$arcs_path) <- alpha_path
    for (i in pdag$arcs_path){
      times[2:3] <- times[2:3] + attr(i, "times")[2:3]
      tests[2:3] <- tests[2:3] + attr(i, "tests")[2:3]
    }
    if (length(pdag$arcs_path) > 1){
      
      start_time_s <- Sys.time()
      
      if (!hgi){
        
        pdag$dag_path <- score_arcs_path(arcs_path = pdag$arcs_path, nodes = nodes, x = x, 
                                         score = score, lambda = lambda, debug = debug)
        tests[4] <- attr(pdag$dag_path, "nscores")
        which_best <- attr(pdag$dag_path, "which_best")
        pdag$learning$valid <- attr(pdag$dag_path, "success")
        pdag$arcs <- pdag$arcs_path[[which_best]]
        
      } else{
        
        which_best <- which.max(sapply(pdag$arcs_path, 
                                       function(arcs) sum(attr(arcs, "rs"))))
        ## keep all undirected edges from the original alpha
        ## but keep only directed edges in the best hgi estimate
        a0 <- bnlearn:::arcs2amat(pdag$arcs_path[[1]], nodes)
        a1 <- bnlearn:::arcs2amat(pdag$arcs_path[[which_best]], nodes)
        a1 <- (a0 | t(a0)) - (a1 | t(a1)) + a1
        pdag$arcs <- bnlearn:::amat2arcs(a1,
                                         nodes)
        attr(pdag$arcs, "alpha") <- attr(pdag$arcs_path[[which_best]], "alpha")
      }
      end_time_s <- Sys.time()
      times[4] <- as.numeric(end_time_s - start_time_s, unit = "secs")
    } else{
      
      pdag$arcs <- pdag$arcs_path[[1]]
      pdag$learning$valid <- attr(arcs2dag(arcs = pdag$arcs, 
                                           nodes = nodes), "success")
    }
    pdag$learning[c("tests", "path")] <- list(sum(tests),
                                              length(alpha_path))
    pdag$learning$arcs1 <- pdag$arcs_path[[1]]
    pdag$learning$args[c("min_alpha", "best_alpha")] <- list(min(alpha_path),
                                                             attr(pdag$arcs, "alpha"))
    pdag$nodes <- bnlearn:::cache.structure(nodes, 
                                            arcs = pdag$arcs)
  }
  debug_cli(debug, cli::cli_alert_success,
            c("completed edge orientation for {length(pdag$arcs_path)} ",
              "estimates with hgi = {hgi} ",
              "in {prettyunits::pretty_sec(sum(times))} with {sum(tests)} calls"))
  
  return(pdag)
}



# Score solution path generated by PATH and selects highest-scoring estimate.

score_arcs_path <- function(arcs_path, nodes, x, score, 
                            lambda = 0.5, debug = FALSE){
  
  start_time <- Sys.time()
  nscores <- bnlearn::test.counter()
  ns0 <- sapply(nodes, function(x) 0)
  
  if (length(arcs_path) > 1){
    
    bn_i <- bn_im1 <- bnlearn::empty.graph(nodes = nodes)
    # extra.args <- list(k = lambda * log(nrow(x)))
    dag_path <- list()
    
    for (i in seq_len(length(arcs_path))){
      
      arcs <- arcs_path[[i]]
      if (is.null(attr(arcs, "dag"))) 
        attr(arcs, "dag") <- arcs2dag(arcs = arcs, nodes = nodes, 
                                      direct_all = TRUE)
      dag_path[[i]] <- list(dag = attr(arcs, "dag"),
                            delta = 0, ns = ns0,
                            alpha = attr(arcs, 'alpha'))
      dag_path[[i]]$eL <- sparsebnUtils::edgeList(dag_path[[i]]$dag)
      
      if (i > 1){
        differences <- Matrix::colSums(!(dag_path[[i-1]]$dag == 
                                           dag_path[[i]]$dag))
        different <- names(differences[differences > 0])
        dag_path[[i]]$ns <- dag_path[[i-1]]$ns
        if (length(different)){
          
          bnlearn::amat(bn_i) <- as.matrix(dag_path[[i]]$dag)
          bnlearn::amat(bn_im1) <- as.matrix(dag_path[[i-1]]$dag)
          
          for (node in different){
            if (dag_path[[i-1]]$ns[node] >= 0){
              
              ## score previous node if not yet scored
              extra.args <- bnlearn:::check.score.args(score = score, network = bn_im1, data = x, 
                                                       extra.args = list(), learning = TRUE)
              dag_path[[i-1]]$ns[node] <- bnlearn:::per.node.score(bn_im1, x, score, node,
                                                                   extra.args, debug = debug >= 3)
            }
            ## score current node
            increment.test.counter_(1)  # one per score difference
            extra.args <- bnlearn:::check.score.args(score = score, network = bn_i, data = x, 
                                                     extra.args = list(), learning = TRUE)
            dag_path[[i]]$ns[node] <- bnlearn:::per.node.score(bn_i, x, score, node,
                                                               extra.args, debug = debug >= 3)
          }
        }
        ## compute score difference between estimates
        dag_path[[i]]$delta <- sum(dag_path[[i]]$ns[different] - 
                                     dag_path[[i-1]]$ns[different])
        
        debug_cli(debug >= 2, cli::cli_alert,
                  c("score delta of {format(dag_path[[i]]$delta, digits = 3, nsmall = 3)} ",
                    "between estimates {i-1} and {i}"))
      }
    }
  } else{
    
    dag_path <- list(list(dag = arcs2dag(arcs = arcs_path[[1]], nodes = nodes),
                          delta = 0, ns = ns0,
                          alpha = attr(arcs_path[[1]], 'alpha')))
  }
  successes <- sapply(dag_path, function(x) attr(x$dag, "success"))
  if (success <- any(sapply(successes, function(x) is.logical(x) && x))){
    which_best <- which(successes)[which.max(cumsum(sapply(dag_path, 
                                                           `[[`, 'delta'))[successes])]
  } else{
    which_best <- which.max(cumsum(sapply(dag_path, `[[`, 'delta')))
  }
  debug_cli(debug >= 2, cli::cli_alert,
            "{sum(successes)} out of {length(successes)} estimates are admissible")
  
  names(dag_path) <- sapply(arcs_path, function(x) attr(x, 'alpha'))
  attr(dag_path, 'which_best') <- which_best
  attr(dag_path, 'success') <- success
  attr(dag_path, 'nscores') <- bnlearn::test.counter() - nscores
  end_time <- Sys.time()
  
  debug_cli(debug >= 2, cli::cli_alert_success,
            c("chose alpha = {signif(dag_path[[which_best]]$alpha, digits = 3)} ",
              "out of {length(dag_path)} significance levels ",
              "in {prettyunits::pretty_sec(as.numeric(end_time - start_time, units = 'secs'))} ",
              "with {attr(dag_path, 'nscores')} calls"))
  
  return(dag_path)
}



# Generate evenly spaced out sequence of alpha values for PATH.

build_alpha_path <- function(p_vec, alpha, 
                             min_alpha = 1e-5, n_alpha = 10){
  
  if (min_alpha >= alpha || 
      abs(alpha - min_alpha) < alpha^2){
    if (n_alpha == 1)
      return(alpha)
    min_alpha <- alpha * 1e-1
  }
  
  # n_alpha <- n_alpha - (n_alpha > 1)
  alpha_path <- unique(sort(p_vec[p_vec >= min_alpha & p_vec <= 1], 
                            decreasing = TRUE))[-1]  # first element corresponds to alpha
  alpha_path <- union(alpha, alpha_path)
  
  if (length(alpha_path) <= 1)
    return(alpha)
  
  ind <- unique(floor(seq(1, length(alpha_path), length = n_alpha)))
  return(alpha_path[ind])
}



# Compute reference scores (rs) and score differences (delta). 

get_rs_delta <- function(arcs, vs, nodes, data, lambda = 0.5, debug = FALSE){
  
  ## currently, only discrete implementation of hgi
  is_discrete <- TRUE
  rs <- sapply(nodes, R_loglik_dnode, parents = character(0),
               data = data, k = lambda * log(nrow(data)), debug = debug >= 3, USE.NAMES = TRUE)
  
  ## TODO: eventually generalizable
  # network <- bnlearn::empty.graph(nodes = nodes)
  # score <- bnlearn:::check.score(NULL, data = data)
  # extra.args <- bnlearn:::check.score.args(score = score, network = network, 
  #                                          data = x, extra.args = list(), learning = TRUE)
  # rs <- bnlearn:::per.node.score(network = network, data = data, score = score, 
  #                                targets = nodes, extra.args = extra.args, debug = debug)
  
  if (is.null(vs) || nrow(vs) == 0)
    return(list(rs = rs, delta = numeric(0), 
                nscores = integer(1)))
  
  vs_cpp <- vs2vs_cpp(vs, nodes)
  undirMat <- bnlearn:::arcs2amat(arcs = arcs, 
                                  nodes = nodes)
  out <- vstruct_apply_hgi(undirMat = undirMat,
                           reference = rs,
                           nodes = nodes,
                           vs = vs_cpp,
                           data = data,
                           delta = numeric(0),
                           just_delta = TRUE,
                           is_discrete = is_discrete,
                           k = lambda * log(nrow(data)),
                           debug = debug >= 3)
  return(list(rs = rs, 
              delta = c(out$delta), nscores = out$nscores))
}



# Extend a PDAG in the form of arcs to a DAG in the form
# of an adjacency matrix.

arcs2dag <- function(arcs, nodes, direct_all = FALSE){
  
  ## TODO: add maxp argument
  
  a <- bnlearn:::arcs2amat(arcs, nodes)
  ## TODO: check if necessary
  # if (undirect)
  #   a <- undirect_cycles(a)
  ## if any undirected edges
  if (any(a == 1 & a == t(a))){
    extend <- pdag2dag_cpp(g = a, nodes = nodes,
                           direct_all = direct_all)
    dag <- Matrix::Matrix(extend$graph, sparse = TRUE)
    attr(dag, "success") <- extend$success
    attr(dag, "undirected") <- extend$undirected
    attr(dag, "nundir") <- (sum(a | t(a)) - sum(extend$graph | t(extend$graph))) / 2
  } else{
    dag <- Matrix::Matrix(a, sparse = TRUE)
    attr(dag, "success") <- TRUE
    attr(dag, "undirected") <- 0
    attr(dag, "nundir") <- 0
  }
  rownames(dag) <- colnames(dag) <- nodes
  return(dag)
}
jirehhuang/phsl documentation built on May 23, 2022, 4:19 a.m.