# R/sanitization-scores.R

# check score labels.
check.score = function(score, data, allowed = available.scores) {

  metadata = attr(data, "metadata")

  if (!is.null(score)) {

    # check the score label.
    check.label(score, choices = allowed, labels = score.labels,
      argname = "score", see = "bnlearn-package")
    # check if it's the right score for the data (discrete, continuous, mixed).
    if ((metadata$type %!in% discrete.data.types) &&
         (score %in% available.discrete.scores))
      stop("score '", score, "' may only be used with discrete data.")
    if ((metadata$type != "continuous") &&
         (score %in% available.continuous.scores))
      stop("score '", score, "' may only be used with continuous data.")
    if ((metadata$type != "mixed-cg") &&
         (score %in% available.mixedcg.scores))
      stop("score '", score, "' may only be used with a mixture of continuous and discrete data.")
    # if the data are incomplete, check whether the score can be used at all.
    if (any(!metadata$complete.nodes) &&
         (score %!in% scores.for.incomplete.data))
      stop("score '", score, "' cannot be used with incomplete data.")

    return(score)

  }#THEN
  else {

    # warn about ordinal data modelled as unordered categorical ones.
    if (metadata$type %in% c("ordered", "mixed-do"))
      warning("no score is available for ordinal data, disregarding the ordering of the levels.")

    if (any(!metadata$complete.nodes))
      stop("no available score for incomplete data other than 'custom'.")

    if (metadata$type %in% discrete.data.types)
      return("bic")
    else if (metadata$type == "continuous")
      return("bic-g")
    else if (metadata$type == "mixed-cg")
      return("bic-cg")

  }#ELSE

}#CHECK.SCORE
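
# Illustrative sketch, not part of the package sources: how the defaults
# chosen above surface through the public score() interface, assuming the
# bundled learning.test (discrete) and gaussian.test (continuous) data sets.
#
#   score(hc(learning.test), learning.test)    # no 'type', defaults to "bic".
#   score(hc(gaussian.test), gaussian.test)    # no 'type', defaults to "bic-g".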

# check whether a score is score equivalent.
is.score.equivalent = function(score, nodes, extra) {

  # log-likelihood for discrete and Gaussian data is always score equivalent.
  if (score %in% c("loglik", "loglik-g", "pred-loglik", "pred-loglik-g"))
    return(TRUE)
  # same with AIC and BIC.
  if (score %in% c("aic", "aic-g", "bic", "bic-g", "ebic", "ebic-g"))
    return(TRUE)
  # the quotient NML is score equivalent, but the factorized NML is not.
  if (score %in% c("qnml"))
    return(TRUE)
  # BDe and BGe can be score equivalent depending on the graph prior.
  if ((score %in% c("bde", "bge")) &&
      (extra$prior %in% c("uniform", "marginal", "vsp")))
    return(TRUE)
  # all the node-average likelihood scores are score equivalent.
  if (score %in% c("nal", "nal-g", "pnal", "pnal-g"))
    return(TRUE)

  # a conservative default (BDla, BDs, BDj, all *-cg scores).
  return(FALSE)

}#IS.SCORE.EQUIVALENT
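
# Illustrative sketch, not part of the package sources: score equivalence in
# practice, assuming the bundled learning.test data set; the two DAGs below
# encode the same conditional independencies, so their BDe scores coincide.
#
#   e1 = set.arc(empty.graph(c("A", "B")), from = "A", to = "B")
#   e2 = set.arc(empty.graph(c("A", "B")), from = "B", to = "A")
#   all.equal(score(e1, learning.test[, c("A", "B")], type = "bde"),
#             score(e2, learning.test[, c("A", "B")], type = "bde"))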

# check whether a score is decomposable.
is.score.decomposable = function(score, extra) {

  # Castelo & Siebes prior is not decomposable.
  if ((score %in% c("bde", "bds", "bdj", "mbde", "bdla", "bge")) &&
      (extra$prior %in% c("cs", "marginal")))
    return(FALSE)

  # a sensible default.
  return(TRUE)

}#IS.SCORE.DECOMPOSABLE

# sanitize the extra arguments passed to the network scores.
check.score.args = function(score, network, data, extra.args, learning = FALSE) {

  # check the imaginary sample size.
  if (has.argument(score, "iss", score.extra.args))
    extra.args[["iss"]] =
      check.iss(iss = extra.args[["iss"]], network = network)

  # check the graph prior distribution.
  if (has.argument(score, "prior", score.extra.args))
    extra.args[["prior"]] =
      check.graph.prior(prior = extra.args[["prior"]], network = network)

  # check the sparsity parameter of the graph prior distribution.
  if (has.argument(score, "beta", score.extra.args))
    extra.args[["beta"]] =
      check.graph.hyperparameters(beta = extra.args[["beta"]],
        prior = extra.args[["prior"]], network = network, data = data,
        learning = learning)

  # check the list of the experimental observations in the data set.
  if (has.argument(score, "exp", score.extra.args))
    extra.args[["exp"]] =
      check.experimental(exp = extra.args[["exp"]], network = network,
        data = data)

  # check the likelihood penalty.
  if (has.argument(score, "k", score.extra.args))
    extra.args[["k"]] =
      check.penalty(k = extra.args[["k"]], network = network, data = data,
        score = score)

  # check the additional penalty in the extended BIC.
  if (has.argument(score, "gamma", score.extra.args))
    extra.args[["gamma"]] =
      check.extended.penalty(gamma = extra.args[["gamma"]], network = network)

  # check the number of scores to average.
  if (has.argument(score, "l", score.extra.args))
    extra.args[["l"]] = check.l(l = extra.args[["l"]])

  # check the normal-Wishart prior arguments.
  if (has.argument(score, "nu", score.extra.args))
    extra.args$nu =
      check.nu(nu = extra.args[["nu"]], network = network, data = data)

  if (has.argument(score, "iss.mu", score.extra.args))
    extra.args[["iss.mu"]] =
      check.iss.mu(iss.mu = extra.args[["iss.mu"]], network = network)

  if (has.argument(score, "iss.w", score.extra.args))
    extra.args[["iss.w"]] =
      check.iss.w(iss.w = extra.args[["iss.w"]], network = network, data = data)

  # check the test data for predictive scores.
  if (has.argument(score, "newdata", score.extra.args))
    extra.args[["newdata"]] =
      check.newdata(newdata = extra.args[["newdata"]], network = network,
        data = data)

  # check the R function implementing a custom score.
  if (has.argument(score, "fun", score.extra.args))
    extra.args[["fun"]] = check.custom.score.function(fun = extra.args[["fun"]])
  if (has.argument(score, "args", score.extra.args))
    extra.args[["args"]] =
      check.custom.score.arguments(args = extra.args[["args"]])

  # warn about and remove unused arguments.
  extra.args = check.unused.args(extra.args, score.extra.args[[score]])

  return(extra.args)

}#CHECK.SCORE.ARGS

# check the imaginary sample size of the Dirichlet prior.
check.iss = function(iss, network) {

  if (!is.null(iss)) {

    # validate the imaginary sample size.
    if (!is.positive(iss))
      stop("the imaginary sample size must be a positive number.")
    if (iss < 1)
      warning("very small imaginary sample size, the results may be affected by numeric problems.")

  }#THEN
  else {

    # check whether there is an imaginary sample size stored in the bn object;
    # otherwise use the de facto standard value of 1.
    if (!is.null(network$learning$args$iss))
      iss = network$learning$args$iss
    else
      iss = 1

  }#ELSE

  # make sure iss is stored as a plain numeric value.
  return(as.numeric(iss))

}#CHECK.ISS
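
# Illustrative sketch, not part of the package sources: the imaginary sample
# size as it is passed from the public interface, assuming the bundled
# learning.test data set; omitting 'iss' falls back to the value stored in
# the bn object by structure learning, or to 1.
#
#   dag = hc(learning.test, score = "bde", iss = 10)
#   score(dag, learning.test, type = "bde", iss = 10)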

# check the prior mean in the BGe score.
check.nu = function(nu, network, data) {

  if (!is.null(nu)) {

    # check type and length.
    if (!is.real.vector(nu))
      stop("the prior mean vector must contain real numbers.")
    if (length(nu) != ncol(data))
      stop("the length of the prior mean vector does not match the number of variables in the data.")

    # check the names.
    if (is.null(names(nu)))
      names(nu) = names(data)
    else if (!setequal(names(nu), names(data)))
      stop("the prior mean vector and the data contain variables with different names.")

  }#THEN
  else {

    if (!is.null(network$learning$args$nu))
      nu = network$learning$args$nu
    else
      nu = structure(colMeans(data), names = names(data))

  }#ELSE

  return(nu)

}#CHECK.NU

# check the imaginary sample size for the normal prior over the mean.
check.iss.mu = function(iss.mu, network) {

  if (!is.null(iss.mu)) {

    # validate the imaginary sample size.
    if (!is.positive(iss.mu))
      stop("the imaginary sample size must be a positive number.")
    if (iss.mu < 1)
      warning("very small imaginary sample size, the results may be affected by numeric problems.")

  }#THEN
  else {

    # check whether there is an imaginary sample size stored in the bn object;
    # otherwise use the de facto standard value of 1.
    if (!is.null(network$learning$args$iss.mu))
      iss.mu = network$learning$args$iss.mu
    else
      iss.mu = 1

  }#ELSE

  # make sure iss.mu is stored as a plain numeric value.
  return(as.numeric(iss.mu))

}#CHECK.ISS.MU

# check the imaginary sample size for the Wishart prior over the covariance.
check.iss.w = function(iss.w, network, data) {

  if (!is.null(iss.w)) {

    if (!is.positive.integer(iss.w))
      stop("the imaginary sample size for the Wishart prior component must be a positive integer number.")

  }#THEN
  else {

    if (!is.null(network$learning$args$iss.w))
      iss.w = network$learning$args$iss.w
    else
      iss.w = ncol(data) + 2

  }#ELSE

  return(iss.w)

}#CHECK.ISS.W
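
# Illustrative sketch, not part of the package sources: the normal-Wishart
# hyperparameters of the BGe score as they appear in the public interface,
# assuming the bundled gaussian.test data set; omitted arguments take the
# defaults computed above (nu = column means, iss.mu = 1, iss.w = ncol + 2).
#
#   dag = hc(gaussian.test)
#   score(dag, gaussian.test, type = "bge", nu = colMeans(gaussian.test),
#     iss.mu = 1, iss.w = ncol(gaussian.test) + 2)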

# check the experimental data list.
check.experimental = function(exp, network, data) {

  if (!is.null(exp)) {

    if (!is.list(exp))
      stop("experimental data must be specified via a list of indexes.")
    if (any(names(exp) %!in% names(data)) || (length(names(exp)) == 0))
      stop("unkown variables specified in the experimental data list.")
    for (var in names(exp)) {

      if (!is.positive.vector(exp[[var]]))
        stop("indexes of experimental data must be positive integer numbers.")
      if (any(duplicated(exp[[var]])))
        stop("duplicated indexes for experimental data.")
      if (any(exp[[var]] > length(data[, var])))
        stop("out of bounds indexes for experimental data.")

      # just kill empty elements.
      if (length(exp[[var]]) == 0)
        exp[[var]] = NULL
      # also, convert everything to integers to make things simpler at the
      # C level.
      exp[[var]] = as.integer(exp[[var]])

    }#FOR

  }#THEN
  else {

    # check whether there is a list stored in the bn object; if no experimental
    # data is specified, return an empty list (which is the same as using the
    # plain BDe score).
    if (!is.null(network$learning$args$exp))
      exp = network$learning$args$exp
    else
      exp = structure(vector(ncol(data), mode = "list"), names = names(data))

  }#ELSE

  return(exp)

}#CHECK.EXPERIMENTAL
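
# Illustrative sketch, not part of the package sources: the mixture BDe score
# takes the interventions as a named list of row indexes, one element per
# manipulated node, assuming the bundled learning.test data set; an empty
# list reduces it to the plain BDe score, as noted above.
#
#   dag = hc(learning.test)
#   score(dag, learning.test, type = "mbde", exp = list(A = 1:100))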

# check the penalty used in AIC and BIC.
check.penalty = function(k, network, data, score) {

  if (!is.null(k)) {

    # validate the penalty weight.
    if (!is.positive(k))
      stop("the penalty weight must be a positive numeric value.")

    # warn if using a non-standard penalty.
    if (grepl("^aic", score) && (k != 1))
      warning("using AIC with a non-standard penalty k = ", k, ".")
    if (grepl("^[e]*bic", score) && (k != log(nrow(data)) / 2))
      warning("using BIC with a non-standard penalty k = ", k, ".")

  }#THEN
  else {

    # check whether there is a penalization coefficient stored in the bn object,
    # use the default for the score function otherwise.
    if (!is.null(network$learning$args$k) && (score == network$learning$test))
      k = network$learning$args$k
    else {

      if (grepl("^aic", score))
        k = 1
      else if (grepl("^[e]*bic", score))
        k = log(nrow(data)) / 2
      else if (grepl("^[p]*nal", score))
        k = 1 / length(network$nodes) * nrow(data) ^ -0.25

    }#ELSE

  }#ELSE

  return(as.numeric(k))

}#CHECK.PENALTY
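
# Illustrative sketch, not part of the package sources: the penalty weight as
# it appears in the public interface, assuming the bundled learning.test data
# set; the values below are the defaults computed above.
#
#   dag = hc(learning.test)
#   score(dag, learning.test, type = "aic", k = 1)
#   score(dag, learning.test, type = "bic", k = log(nrow(learning.test)) / 2)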

# check the additional penalty used in the extended BIC.
check.extended.penalty = function(gamma, network) {

  if (!is.null(gamma)) {

    if (!is.probability(gamma))
      stop("'gamma' must be a number between 0 and 1.")

  }#THEN
  else {

    if (!is.null(network$learning$args$gamma))
      gamma = network$learning$args$gamma
    else
      gamma = 0.5

  }#ELSE

  return(as.numeric(gamma))

}#CHECK.EXTENDED.PENALTY

# sanitize prior distributions over the graph space.
check.graph.prior = function(prior, network) {

  if (is.null(prior)) {

    # check whether there is a graph prior stored in the bn object, use the
    # uniform one otherwise.
    if (!is.null(network$learning$args$prior))
      prior = network$learning$args$prior
    else
      prior = "uniform"

  }#THEN
  else {

    # check whether prior is a string, and whether the label matches a known prior.
    check.label(prior, choices = prior.distributions, labels = prior.labels,
      argname = "prior distribution", see = score)

  }#ELSE

  return(prior)

}#CHECK.GRAPH.PRIOR

# check the sparsity parameter of the prior distribution over the graph space.
check.graph.hyperparameters = function(beta, prior, network, data,
    learning = FALSE) {

  default.beta =
    list("uniform" = NULL, "vsp" = 1/(ncol(data) - 1),
      "marginal" = structure(0.5, nodes = names(network$nodes)),
      "cs" = cs.completed.prior(data.frame(character(0), character(0),
             numeric(0)), names(data)))

  if (is.null(beta)) {

    # check whether there is a graph prior stored in the bn object, use the
    # uniform one otherwise.
    if (!is.null(network$learning$args$prior))
      beta = network$learning$args$beta
    else
      beta = default.beta[[prior]]

  }#THEN
  else {

    if (prior %in% c("uniform")) {

      warning("unused argument beta.")
      beta = NULL

    }#THEN
    else if (prior == "marginal") {

      if (!is.probability(beta) || (beta >= 1))
       stop("beta must be a probability smaller than 1.")

      beta = structure(noattr(beta, ok = character(0)), nodes = names(network$nodes))

    }#THEN
    else if (prior == "vsp") {

      if (!is.probability(beta) || (beta >= 1))
       stop("beta must be a probability smaller than 1.")

    }#THEN
    else if (prior == "cs") {

      # arcs' prior probabilities should be provided in a data frame.
      if (!is.data.frame(beta) || (ncol(beta) != 3) ||
          !identical(colnames(beta), c("from", "to", "prob")))
        stop("beta must be a data frame with three colums: 'from', 'to' and 'prob'.")
      # the probs cloumns must contain only probabilities.
      if (!is.probability.vector(beta$prob, zero = TRUE))
        stop("arcs prior must contain only probabilities.")
      # check that the first two columns contain only valid arcs.
      check.arcs(beta[, c("from", "to")], nodes = names(data))

      # complete the user-specified prior.
      beta = cs.completed.prior(beta, names(data), learning)

    }#THEN

  }#ELSE

  return(beta)

}#CHECK.GRAPH.HYPERPARAMETERS
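
# Illustrative sketch, not part of the package sources: the Castelo & Siebes
# prior expects 'beta' in the three-column format validated above, assuming
# the bundled learning.test data set.
#
#   arc.probs = data.frame(from = c("A", "B"), to = c("B", "C"),
#                 prob = c(0.9, 0.2))
#   score(hc(learning.test), learning.test, type = "bde", prior = "cs",
#     beta = arc.probs)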

# check the number of scores to average.
check.l = function(l) {

  if (is.null(l))
    l = 5
  else
    if (!is.positive.integer(l))
      stop("l must be a positive integer, the number of scores to average.")

  return(as.numeric(l))

}#CHECK.L

# check the test data for predictive scores.
check.newdata = function(newdata, network, data, required = TRUE,
    allow.missing = FALSE) {

  if (is.null(newdata)) {

    if (required)
      stop("predictive scores require a test set passed as 'newdata'.")
    else
      return(NULL)

  }#THEN

  # newdata must be a data frame.
  if (!is.data.frame(newdata))
    stop("newdata must be in a data frame.")

  # check whether data and newdata have the same columns.
  names.data = names(data)
  names.newdata = names(newdata)

  if (length(names.data) < length(names.newdata)) {

    if (all(names.newdata %in% names(data))) {

       warning("extra variables in newdata will be ignored.")
       newdata = .data.frame.column(newdata, names.data, drop = FALSE)

    }#THEN
    else {

      stop("some variables in the data are missing in newdata.")

    }#ELSE

  }#THEN
  else if (length(names.data) > length(names.newdata)) {

    stop("some variables in the data are missing in newdata.")

  }#THEN

  # reorder the columns of newdata to match data.
  newdata = .data.frame.column(newdata, names.data, drop = FALSE)

  # check whether data and newdata have the same data types.
  types.data = lapply(data, class)
  types.newdata = lapply(newdata, class)

  for (i in seq_along(types.data))
    if (!isTRUE(all.equal(types.data[[i]], types.newdata[[i]])))
      stop("variable ", names.data[i], " has a different class in newdata.")

  # for discrete variables, also check that the levels match.
  levels.data = lapply(data, levels)
  levels.newdata = lapply(newdata, levels)

  for (i in seq_along(types.data))
    if (any(types.data[[i]] %in% "factor"))
      if (!isTRUE(all.equal(levels.data[[i]], levels.newdata[[i]]))) {

        if (setequal(levels.data[[i]], levels.newdata[[i]]))
          levels(newdata[[i]]) = levels(data[[i]])
        else
          stop("variable ", names.data[i], " has a different levels in newdata.")

      }#THEN

  # make sure to return a data frame with column names.
  newdata = .data.frame(newdata)
  names(newdata) = names.data

  # sanitize the reworked data frame and add the metadata.
  newdata = check.data(newdata, label = "newdata", allow.levels = TRUE,
              allow.missing = allow.missing)

  return(newdata)

}#CHECK.NEWDATA
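
# Illustrative sketch, not part of the package sources: predictive scores take
# a test set with the same variables, types and levels as the data, which is
# what the checks above enforce; assumes the bundled learning.test data set.
#
#   dag = hc(learning.test[1:4000, ])
#   score(dag, learning.test[1:4000, ], type = "pred-loglik",
#     newdata = learning.test[4001:5000, ])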

# check the R function implementing a custom score.
check.custom.score.function = function(fun) {

  # there is no possible default value.
  if (is.null(fun))
    stop("missing the custom score function.")

  # check the argument list.
  fun.arguments = names(formals(fun))
  if (!setequal(fun.arguments, c("node", "parents", "data", "args")) ||
      anyDuplicated(fun.arguments))
    stop("the custom score function must have signature function(node, parents, data, args).")

  return(fun)

}#CHECK.CUSTOM.SCORE.FUNCTION

# check the additional arguments' list passed to a custom score.
check.custom.score.arguments = function(args) {

  # default to an empty argument list.
  if (is.null(args))
    args = list()
  else if (!is.list(args))
    stop("the arguments for the custom score must be passed as a list.")

  return(args)

}#CHECK.CUSTOM.SCORE.ARGUMENTS
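
# Illustrative sketch, not part of the package sources: a custom score must
# have the signature enforced above; a toy decomposable score that penalizes
# the number of parents, assuming the 'custom' score label of the public
# interface and the bundled learning.test data set.
#
#   negparents = function(node, parents, data, args) -length(parents)
#   score(hc(learning.test), learning.test, type = "custom",
#     fun = negparents, args = list())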
