# Nothing  -- stray non-R text from the page scrape; commented out so the file parses
#
# ccdrAlgorithm-main.R
# ccdrAlgorithm
#
# Created by Bryon Aragam (local) on 1/22/16.
# Copyright (c) 2014-2017 Bryon Aragam. All rights reserved.
#
#
# PACKAGE CCDRALGORITHM: Main CCDr methods
#
# CONTENTS:
# ccdr.run
# ccdr_call
# ccdr_gridR
# ccdr_singleR
#
###--- These two lines are necessary to import the auto-generated Rcpp methods in RcppExports.R---###
#' @useDynLib ccdrAlgorithm, .registration = TRUE
#' @importFrom Rcpp sourceCpp
NULL
#' Main CCDr Algorithm
#'
#' Estimate a Bayesian network (directed acyclic graph) from observational data using the
#' CCDr algorithm as described in \href{https://jmlr.org/papers/v16/aragam15a.html}{Aragam and Zhou (2015)}.
#'
#' Instead of producing a single estimate, this algorithm computes a solution path of estimates based
#' on the values supplied to \code{lambdas} or \code{lambdas.length}. The CCDr algorithm approximates
#' the solution to a nonconvex optimization problem using coordinate descent. Instead of AIC or BIC,
#' CCDr uses continuous regularization based on concave penalties such as the minimax concave penalty
#' (MCP).
#'
#' This implementation includes two options for the penalty: (1) MCP, and (2) L1 (or Lasso). This option
#' is controlled by the \code{gamma} argument.
#'
#' @param data Data as \code{\link[sparsebnUtils]{sparsebnData}} object. Must be numeric and contain no missing values.
#' @param lambdas Numeric vector containing a grid of lambda values (i.e. regularization
#' parameters) to use in the solution path. If missing, a default grid of values will be
#' used based on a decreasing log-scale (see also \link{generate.lambdas}).
#' @param lambdas.length Integer number of values to include in the solution path. If \code{lambdas}
#' has also been specified, this value will be ignored. Note also that the final
#' solution path may contain fewer estimates (see
#' \code{alpha}).
#' @param whitelist A two-column matrix of edges that are guaranteed to be in each
#' estimate (a "white list"). Each row in this matrix corresponds
#' to an edge that is to be whitelisted. These edges can be
#' specified by node name (as a \code{character} matrix), or by
#' index (as a \code{numeric} matrix).
#' @param blacklist A two-column matrix of edges that are guaranteed to be absent
#' from each estimate (a "black list"). See argument
#' "\code{whitelist}" above for more details.
#' @param gamma Value of concavity parameter. If \code{gamma > 0}, then the MCP will be used
#' with \code{gamma} as the concavity parameter. If \code{gamma < 0}, then the L1 penalty
#' will be used and this value is otherwise ignored.
#' @param error.tol Error tolerance for the algorithm, used to test for convergence.
#' @param max.iters Maximum number of iterations for each internal sweep.
#' @param alpha Threshold parameter used to terminate the algorithm whenever the number of edges in the
#' current DAG estimate is \code{> alpha * ncol(data)}.
#' @param betas Initial guess for the algorithm. Represents the weighted adjacency matrix
#' of a DAG where the algorithm will begin searching for an optimal structure.
#' @param sigmas Numeric vector of known values of conditional variances for each node in the network. If this is
#' set by the user, these parameters will not be computed and the input will
#' be used as the "true" values of the variances in the algorithm. Note that setting
#' this to be all ones (i.e. \code{sigmas[j] = 1} for all \code{j}) is
#' equivalent to using the least-squares loss.
#' @param verbose \code{TRUE / FALSE} whether or not to print out progress and summary reports.
#'
#' @return A \code{\link[sparsebnUtils]{sparsebnPath}} object.
#'
#' @examples
#'
#' ### Generate some random data
#' dat <- matrix(rnorm(1000), nrow = 20)
#' dat <- sparsebnUtils::sparsebnData(dat, type = "continuous")
#'
#' # Run with default settings
#' ccdr.run(data = dat, lambdas.length = 20)
#'
#' ### Optional: Adjust settings
#' pp <- ncol(dat$data)
#'
#' # Initialize algorithm with a random initial value
#' init.betas <- matrix(0, nrow = pp, ncol = pp)
#' init.betas[1,2] <- init.betas[1,3] <- init.betas[4,2] <- 1
#'
#' # Run with adjusted settings
#' ccdr.run(data = dat, betas = init.betas, lambdas.length = 20, alpha = 10, verbose = TRUE)
#'
#' @export
ccdr.run <- function(data,
                     lambdas = NULL,
                     lambdas.length = NULL,
                     whitelist = NULL,
                     blacklist = NULL,
                     gamma = 2.0,
                     error.tol = 1e-4,
                     max.iters = NULL,
                     alpha = 10,
                     betas,
                     sigmas = NULL,
                     verbose = FALSE
){
    ### Check data format
    if(!sparsebnUtils::is.sparsebnData(data)) stop(sparsebnUtils::input_not_sparsebnData(data))

    ### Extract the data and ivn
    ### CCDr now works on both observational data and interventional data, and a mixture of both
    data_matrix <- data$data
    ivn_list <- data$ivn

    ### If ivn_list contains character node names, convert them to column indices
    ### (assumes data_matrix is a data.frame -- names() would be NULL for a plain matrix)
    if("character" %in% sparsebnUtils::list_classes(ivn_list)){
        ivn_list <- lapply(ivn_list, function(x){
            idx <- match(x, names(data_matrix))

            # match() returns NA for labels with no match -- NOT a zero-length
            # vector -- so unmatched names must be dropped explicitly; otherwise
            # the NAs would propagate into the downstream intervention counts.
            if(anyNA(idx)){
                warning(sprintf("Ignoring unrecognized node name(s) in ivn: %s",
                                paste(unique(x[is.na(idx)]), collapse = ", ")))
                idx <- idx[!is.na(idx)]
            }

            if(length(idx) == 0) NULL # no valid labels => treat row as observational
            else idx
        })
    }

    ### Call the CCDr algorithm
    ccdr_call(data = data_matrix,
              ivn = ivn_list,
              betas = betas,
              sigmas = sigmas,
              lambdas = lambdas,
              lambdas.length = lambdas.length,
              whitelist = whitelist,
              blacklist = blacklist,
              gamma = gamma,
              error.tol = error.tol,
              rlam = NULL,
              max.iters = max.iters,
              alpha = alpha,
              verbose = verbose)
} # END CCDR.RUN
### Maximum number of nodes allowed
#
# Upper bound on the number of columns (nodes) supported by the internal
# C++ arrays; ccdr_call() rejects any data set wider than this.
MAX_CCS_ARRAY_SIZE <- function() {
    10000
}
# ccdr_call
#
# Handles most of the bookkeeping for CCDr. Sets default values and prepares arguments for
# passing to ccdr_gridR and ccdr_singleR. Some type-checking as well, although most of
# this is handled internally by ccdr_gridR and ccdr_singleR.
#
# Arguments mirror ccdr.run(); additionally:
#   ivn  - list with one element per row of data, each a vector of intervened
#          node indices (NULL element => observational row), or NULL overall
#   rlam - ratio min.lam / max.lam for the auto-generated lambda grid;
#          NULL is a sentinel for the default (1e-2), and it must be supplied
#          (possibly as NULL) whenever lambdas is NULL
#
# Returns a sparsebnUtils::sparsebnPath object.
ccdr_call <- function(data,
                      ivn = NULL,
                      betas,
                      sigmas,
                      lambdas,
                      lambdas.length,
                      whitelist = NULL,
                      blacklist = NULL,
                      gamma,
                      error.tol,
                      rlam,
                      max.iters,
                      alpha,
                      verbose = FALSE
){
    node_names <- names(data)

    ### Check data format
    if(!sparsebnUtils::check_if_data_matrix(data)) stop("'data' argument must be a data.frame or matrix!")

    # Could use check_if_complete_data here, but we avoid this in order to save a (small) amount of computation
    # and give a more informative error message
    num_missing_values <- sparsebnUtils::count_nas(data)
    if(num_missing_values > 0) stop(sprintf("%d missing values detected!", num_missing_values))

    ### Get the dimensions of the data matrix
    nn <- as.integer(nrow(data))
    pp <- as.integer(ncol(data))
    if(pp > MAX_CCS_ARRAY_SIZE()){
        stop(max_nodes_warning(pp))
    }

    if(is.null(ivn)) ivn <- vector("list", nn) # to pass testthat for observational data cases

    ### Check ivn
    if(!check_if_ivn_list(ivn)) stop("ivn must be a list of NULLs or vectors!")
    if(!check_ivn_size(ivn, data)) stop(sprintf("Length of ivn is %d, expected to match the number of rows in data: %d.", length(ivn), nn))
    check_ivn_label(ivn, data)

    ### "Intervention times vector": nj[j] counts the number of rows in which
    ### node j was NOT under intervention, i.e. the samples usable for node j.
    ### (vapply is used instead of sapply for type-stable results.)
    nj <- vapply(seq_len(pp),
                 function(j) sum(!vapply(ivn, function(obs) j %in% obs, logical(1L))),
                 integer(1L))

    ### Set default for sigmas (negative values => ignore initial value and update as usual)
    if(is.null(sigmas)){
        sigmas <- rep(-1., pp)
    }

    ### Use default values for lambda if not specified
    if(is.null(lambdas)){
        if(is.null(lambdas.length)){
            stop("Both lambdas and lambdas.length unspecified: Must specify a value for at least one of these arguments!")
        } else{
            ### Check lambdas.length if specified
            if(!is.numeric(lambdas.length)) stop("lambdas.length must be numeric!")
            if(lambdas.length <= 0) stop("lambdas.length must be positive!")
        }

        if(missing(rlam)){
            ### Even though ccdr_call should never be called on its own, this behaviour is left for testing backwards-compatibility
            stop("rlam must be specified if lambdas is not explicitly specified.")
        } else if(is.null(rlam)){
            ### rlam = NULL is used as a sentinel value to indicate a default value should be used
            rlam <- 1e-2
        } else{
            ### Check rlam if specified
            if(!is.numeric(rlam)) stop("rlam must be numeric!")
            if(rlam < 0) stop("rlam must be >= 0!")
        }

        # If no grid of lambdas is passed, then use the standard log-scale that starts at
        # max.lam = sqrt(nn) and descends to min.lam = rlam * max.lam
        lambdas <- sparsebnUtils::generate.lambdas(lambda.max = sqrt(nn),
                                                   lambdas.ratio = rlam,
                                                   lambdas.length = as.integer(lambdas.length),
                                                   scale = "log")
    }

    ### Check lambdas
    if(!is.numeric(lambdas)) stop("lambdas must be a numeric vector!")
    if(any(lambdas < 0)) stop("lambdas must contain only nonnegative values!")

    ### By default, set the initial guess for betas to be all zeroes
    if(missing(betas)){
        betas <- matrix(0, nrow = pp, ncol = pp)
        betas <- .init_sbm(betas, rep(0, pp))
        # If the initial matrix is the zero matrix, indexing does not matter so we don't need to use reIndexC here
        # Still need to set start = 0, though.
        betas$start <- 0
    } # Type-checking for betas happens in ccdr_singleR

    # This parameter can be set by the user, but in order to prevent the algorithm from taking too long to run
    # it is a good idea to keep the threshold used by default which is O(sqrt(pp))
    if(is.null(max.iters)){
        max.iters <- sparsebnUtils::default_max_iters(pp)
    }

    ### White/black lists
    # Be careful about handling various NULL cases
    if(!is.null(whitelist)) whitelist <- bwlist_check(whitelist, node_names)
    if(!is.null(blacklist)) blacklist <- bwlist_check(blacklist, node_names)
    if(!is.null(whitelist) && !is.null(blacklist)){
        if(length(intersect(whitelist, blacklist)) > 0){
            badinput <- vapply(intersect(whitelist, blacklist), function(x) sprintf("\t[%s]\n", paste(x, collapse = ",")), FUN.VALUE = "vector")
            badinput <- paste(badinput, collapse = "")
            msg <- sprintf("Duplicate entries found in blacklist and whitelist: \n%s", badinput)
            stop(msg)
        }
    }
    weights <- bwlist_to_weights(blacklist, whitelist, nnode = pp)

    ### Pre-process correlation data (handles intervention-aware correlations)
    t1.cor <- proc.time()[3]
    corlist <- sparsebnUtils::cor_vector_ivn(data, ivn)
    cors <- corlist$cors
    indexj <- corlist$indexj
    t2.cor <- proc.time()[3]

    fit <- ccdr_gridR(cors,
                      as.integer(pp),
                      as.integer(nn),
                      as.integer(nj),
                      as.integer(indexj),
                      betas,
                      as.numeric(sigmas),
                      as.numeric(lambdas),
                      as.integer(weights),
                      as.numeric(gamma),
                      as.numeric(error.tol),
                      as.integer(max.iters),
                      as.numeric(alpha),
                      verbose)

    #
    # Output DAGs as edge lists (i.e. edgeList objects).
    # This is NOT the same as sbm$rows since some of these rows may correspond to edges with zero coefficients.
    # See docs for SparseBlockMatrixR class for details.
    #
    for(k in seq_along(fit)){
        ### Coerce sbm output to edgeList
        names(fit[[k]])[1] <- "edges" # rename 'sbm' slot to 'edges': After the next line, this slot will no longer be an SBM object
        fit[[k]]$edges <- sparsebnUtils::as.edgeList(fit[[k]]$edges) # Before coercion, li$edges is actually an SBM object
        names(fit[[k]]$edges) <- names(data)

        ### Add node names to output
        fit[[k]] <- append(fit[[k]], list(node_names), after = 1) # insert node names into second slot
        names(fit[[k]])[2] <- "nodes"
    }

    fit <- lapply(fit, sparsebnUtils::sparsebnFit) # convert everything to sparsebnFit objects
    sparsebnUtils::sparsebnPath(fit) # wrap as sparsebnPath object
} # END CCDR_CALL
# ccdr_gridR
#
# Main subroutine for running the CCDr algorithm on a grid of lambda values.
#
# Runs ccdr_singleR once per lambda, warm-starting each fit from the previous
# estimate. Terminates early as soon as an estimate exceeds alpha * pp edges;
# the offending estimate is dropped from the returned list.
#
# Returns a list of single-lambda fits (see ccdr_singleR for their structure);
# the list may be shorter than length(lambdas) due to early termination.
ccdr_gridR <- function(cors,
                       pp, nn,
                       nj = NULL,
                       indexj = NULL,
                       betas,
                       sigmas,
                       lambdas,
                       weights,
                       gamma,
                       eps,
                       maxIters,
                       alpha,
                       verbose
){
    ### Check alpha
    if(!is.numeric(alpha)) stop("alpha must be numeric!")
    if(alpha < 0) stop("alpha must be >= 0!")

    ### nlam is now set automatically
    nlam <- length(lambdas)

    ### Check indexj (default: single correlation block)
    if(is.null(indexj)) indexj <- rep(0L, pp + 1)

    ### Check nj (default: fully observational, all nn samples usable per node)
    if(is.null(nj)) nj <- as.integer(rep(nn, pp))

    ccdr.out <- list()
    for(i in seq_len(nlam)){ # seq_len is safe for nlam == 0 (1:0 would iterate)
        if(verbose) message("Working on lambda = ", round(lambdas[i], 5), " [", i, "/", nlam, "]")

        t1.ccdr <- proc.time()[3]
        ccdr.out[[i]] <- ccdr_singleR(cors,
                                      pp, nn,
                                      nj,
                                      indexj,
                                      betas,
                                      sigmas,
                                      lambdas[i],
                                      weights,
                                      gamma = gamma,
                                      eps = eps,
                                      maxIters = maxIters,
                                      alpha = alpha,
                                      verbose = verbose
        )
        t2.ccdr <- proc.time()[3]

        ### Warm start: the next lambda begins from the current estimate
        betas <- ccdr.out[[i]]$sbm
        betas <- sparsebnUtils::reIndexC(betas) # use C-friendly indexing

        if(verbose){
            message("  Estimated number of edges: ", ccdr.out[[i]]$nedge)
        }

        # 7-16-14: Added code below to check edge threshold via alpha parameter
        if(ccdr.out[[i]]$nedge > alpha * pp){
            # Drop the offending (ith) model since it did not finish under the
            # threshold. seq_len(i - 1) correctly yields an empty index when
            # i == 1, whereas 1:(i-1) would evaluate to c(1, 0) and wrongly
            # keep the over-threshold model (and ccdr.out[[i-1]] would be a
            # subscript-out-of-bounds error in the message below).
            ccdr.out <- ccdr.out[seq_len(i - 1)]
            if(verbose){
                if(i > 1){
                    message("Edge threshold met, terminating algorithm with ", ccdr.out[[i - 1]]$nedge, " edges.")
                } else{
                    message("Edge threshold met on the first lambda, terminating algorithm with no estimates.")
                }
            }
            break
        }
    }

    ccdr.out
} # END CCDR_GRIDR
# ccdr_singleR
#
# Internal subroutine for handling calls to singleCCDr: This is the only place where C++ is directly
# called. Type-checking is strongly enforced here.
#
# Arguments:
#   cors     numeric vector of stacked upper-triangular correlation entries;
#            expected length = length(unique(indexj)) * pp * (pp + 1) / 2
#   pp, nn   integer number of nodes / samples (must be positive integers)
#   nj       integer vector of length pp: per-node count of samples in which
#            the node was NOT under intervention; defaults to nn everywhere
#            (purely observational data)
#   indexj   integer index vector for the correlation blocks in cors;
#            defaults to all zeroes (a single block)
#   betas    initial guess: a matrix (converted to SBM here) or a
#            SparseBlockMatrixR object
#   sigmas   conditional variances; -1 is a sentinel meaning "estimate via
#            the coordinate-descent updates"
#   lambda   single nonnegative regularization parameter
#   weights  numeric vector of length pp^2 with entries in [-1, 1]
#            (white/blacklist encoding)
#   gamma    concavity parameter: >= 0 for MCP, or -1 for the Lasso
#   eps      convergence tolerance; 0 is allowed but then only maxIters
#            terminates the algorithm (a warning is issued)
#   maxIters positive integer cap on iterations per sweep
#   alpha    unused in the R-level logic here, but forwarded because the C++
#            call expects it (see inline comment on the signature)
#
# Returns a list with elements: sbm (R-indexed SparseBlockMatrixR estimate),
# lambda, nedge (edge count), pp, nn, and time (seconds spent in C++).
ccdr_singleR <- function(cors,
pp, nn,
nj = NULL,
indexj = NULL,
betas,
sigmas,
lambda,
weights,
gamma,
eps,
maxIters,
alpha, # 2-9-15: No longer necessary in ccdr_singleR, but needed since the C++ call asks for it
verbose = FALSE
){
### Check dimension parameters
if(!is.integer(pp) || !is.integer(nn)) stop("Both pp and nn must be integers!")
if(pp <= 0 || nn <= 0) stop("Both pp and nn must be positive!")
### These variables, if NULL, need to be initialized before checking anything
if(is.null(indexj)) indexj <- rep(0L, pp + 1) # initialize indexj
if(is.null(nj)) nj <- as.integer(rep(nn, pp)) # initialize nj
### Check indexj
if(!is.vector(indexj)) stop("Index vector for cors is not a vector.")
if(length(indexj) > pp + 1) stop(sprintf("Index vector for cors is too long, expected to be no greater than %d, the number of columns of data.", pp))
if(!is.integer(indexj)) stop("Index vector for cors has non-integer component(s).")
# NOTE(review): is.null(indexj) inside any() is always a single FALSE here
# (indexj was initialized above), so only the is.na() part can trigger this stop.
if(any(is.na(indexj) | is.null(indexj))) stop("Index vector cannot have missing or NULL values.")
# NOTE(review): the check admits values up to pp + 1 although the message says
# "between 0 and pp" -- confirm which bound is intended.
if(any(indexj < 0 | indexj > pp + 1)) stop(sprintf("Index vector for cors has out-of-range component(s), expected to be between 0 and %d.", pp))
### Check nj
if(!is.vector(nj)) stop("Intervention times vector is not a vector.")
if(length(nj) != pp) stop(sprintf("Length of intervention times vector is %d, expected to match the number of columns of data = %d", length(nj), pp))
if(!is.integer(nj)) stop("Intervention times vector has non-integer component(s).")
if(any(is.na(nj) | is.null(nj))) stop("Intervention times vector cannot have missing or NULL values.")
if(any(nj < 0 | nj > nn)) stop(sprintf("Intervention times vector has out-of-range component(s), expected to be between 0 and %d.", nn))
### Check cors
### This check must come after the checks for indexj, nj since these values are used to check cors
if(!is.numeric(cors)) stop("cors must be a numeric vector!")
if(length(cors) != length(unique(indexj))*pp*(pp+1)/2) stop(paste0("cors has incorrect length: Expected length = ", length(unique(indexj))*pp*(pp+1)/2, " input length = ", length(cors)))
### add a weight a_j to penalty on beta_{ij}
### since now with intervention data, beta_{ij} only appears n_j times out of total nn samples
aj <- nj / nn
### Check betas
if(sparsebnUtils::check_if_matrix(betas)){ # if the input is a matrix, convert to SBM object
betas <- .init_sbm(betas, rep(0, pp)) # if betas is non-numeric, SparseBlockMatrixR constructor should throw error
# NOTE(review): reIndexC is called here without the sparsebnUtils:: prefix used
# elsewhere in this file -- presumably imported into the package namespace; confirm.
betas <- reIndexC(betas) # use C-friendly indexing
} else if(!is.SparseBlockMatrixR(betas)){ # otherwise check that it is an object of class SparseBlockMatrixR
stop("Incompatible data passed for betas parameter: Should be either matrix or list in SparseBlockMatrixR format.")
}
### Check sigmas
if(!is.numeric(sigmas)) stop("sigmas must be numeric!")
if(length(sigmas) != pp) stop(sprintf("sigmas must have length = %d!", pp))
if(any(sigmas < 0)){
# -1 is a sentinel value for updating sigmas via the CD updates
if(any(sigmas != -1.)){
stop("sigmas must be > 0!")
}
}
### Check lambda
if(!is.numeric(lambda)) stop("lambda must be numeric!")
if(lambda < 0) stop("lambda must be >= 0!")
### Check weights
if(length(weights) != pp*pp) stop(sprintf("weights must have length p^2 = %d!", pp*pp))
if(!is.numeric(weights)) stop("weights must be numeric!")
if(any(weights < -1) || any(weights > 1)) stop("weights out of bounds!")
### Check gamma
if(!is.numeric(gamma)) stop("gamma must be numeric!")
if(gamma < 0 && gamma != -1) stop("gamma must be >= 0 (MCP) or = -1 (Lasso)!")
### Check eps
if(!is.numeric(eps)) stop("eps must be numeric!")
if(eps <= 0){
if(eps < 0) stop("eps must be >= 0!")
if(eps == 0) warning("eps is set to zero: This may cause the algorithm to fail to converge, and maxIters will be used to terminate the algorithm.")
}
### Check maxIters
if(!is.integer(maxIters)) stop("maxIters must be an integer!")
if(maxIters <= 0) stop("maxIters must be > 0!")
### alpha check is in ccdr_gridR
# if(verbose) cat("Opening C++ connection...")
### Hand off to the compiled routine; the scalar tuning parameters are packed
### into a single numeric vector c(gamma, eps, maxIters, alpha) as expected
### by the C++ interface.
t1.ccdr <- proc.time()[3]
ccdr.out <- singleCCDr(cors,
betas,
sigmas,
nj,
indexj,
aj,
lambda,
weights,
c(gamma, eps, maxIters, alpha),
verbose = verbose)
t2.ccdr <- proc.time()[3]
# if(verbose) cat("C++ connection closed. Total time in C++: ", t2.ccdr-t1.ccdr, "\n")
#
# Convert output back to SBM format
#
ccdr.out <- list(sbm = SparseBlockMatrixR(list(rows = ccdr.out$rows, vals = ccdr.out$vals, blocks = ccdr.out$blocks, sigmas = ccdr.out$sigmas, start = 0)),
lambda = ccdr.out$lambda,
nedge = ccdr.out$length,
pp = pp,
nn = nn,
time = t2.ccdr - t1.ccdr)
# Convert the C-indexed (0-based) output back to R-friendly 1-based indexing
ccdr.out$sbm <- sparsebnUtils::reIndexR(ccdr.out$sbm)
# sparsebnFit(ccdr.out)
ccdr.out
} # END CCDR_SINGLER
# Any scripts or data that you put into this service are public.
# Add the following code to your website.
# For more information on customizing the embed code, read Embedding Snippets.
# (The three lines above are rdrr.io page boilerplate, not package code;
# commented out so the file parses as valid R.)