#
# ccdr-main-R.R
# ccdr
#
# Created by Bryon Aragam (local) on 2/4/15.
# Copyright (c) 2014-2015 Bryon Aragam (local). All rights reserved.
#
#
# PACKAGE CCDR: Main CCDr methods
#
# CONTENTS:
# ccdr.run
# ccdr_call
# ccdr_gridR
# ccdr_singleR
#
###--- These two lines are necessary to import the auto-generated Rcpp methods in RcppExports.R---###
#' @useDynLib ccdr
#' @importFrom Rcpp sourceCpp
NULL
#' Main CCDr Algorithm
#'
#' Estimate a Bayesian network (directed acyclic graph) from observational data using the
#' CCDr algorithm as described in \href{http://arxiv.org/abs/1401.0852}{Aragam and Zhou (2015), JMLR}.
#'
#' Instead of producing a single estimate, this algorithm computes a solution path of estimates based
#' on the values supplied to \code{lambdas} or \code{lambdas.length}. The CCDr algorithm approximates
#' the solution to a nonconvex optimization problem using coordinate descent. Instead of AIC or BIC,
#' CCDr uses continuous regularization based on concave penalties such as the minimax concave penalty
#' (MCP).
#'
#' This implementation includes two options for the penalty: (1) MCP, and (2) L1 (or Lasso). This option
#' is controlled by the \code{gamma} argument.
#'
#' @param data Data matrix. Must be numeric and contain no missing values.
#' @param betas Initial guess for the algorithm. Represents the weighted adjacency matrix
#' of a DAG where the algorithm will begin searching for an optimal structure.
#' @param lambdas (optional) Numeric vector containing a grid of lambda values (i.e. regularization
#' parameters) to use in the solution path. If missing, a default grid of values will be
#' used based on a decreasing log-scale (see also \link{generate.lambdas}).
#' @param lambdas.length Integer number of values to include in the solution path. If \code{lambdas}
#' has also been specified, this value will be ignored. Note also that the final
#' solution path may contain fewer estimates (see
#' \code{alpha}).
#' @param gamma Value of concavity parameter. If \code{gamma > 0}, then the MCP will be used
#' with \code{gamma} as the concavity parameter. If \code{gamma < 0}, then the L1 penalty
#' will be used and this value is otherwise ignored.
#' @param error.tol Error tolerance for the algorithm, used to test for convergence.
#' @param max.iters Maximum number of iterations for each internal sweep.
#' @param alpha Threshold parameter used to terminate the algorithm whenever the number of edges in the
#' current estimation is \code{> alpha * ncol(data)}.
#' @param verbose \code{TRUE / FALSE} whether or not to print out progress and summary reports.
#'
#' @return A \code{\link{ccdrPath-class}} object.
#'
#' @examples
#'
#' \dontrun{
#'
#' ### Generate some random data
#' dat <- matrix(rnorm(1000), nrow = 20)
#'
#' ### Run with default settings
#' ccdr.run(data = dat)
#'
#' ### Optional: Adjust settings
#' pp <- ncol(dat)
#' init.betas <- matrix(0, nrow = pp, ncol = pp) # initialize algorithm with a random initial value
#' init.betas[1,2] <- init.betas[1,3] <- init.betas[4,2] <- 1 #
#' ccdr.run(data = dat, betas = init.betas, lambdas.length = 10, alpha = 10, verbose = TRUE)
#' }
#'
#' @export
ccdr.run <- function(data,
betas,
lambdas,
lambdas.length = NULL,
gamma = 2.0,
error.tol = 1e-4,
max.iters = NULL,
alpha = 10,
verbose = FALSE
){
### This is just a wrapper for the internal implementation given by ccdr_call
ccdr_call(data = data,
betas = betas,
lambdas = lambdas,
lambdas.length = lambdas.length,
gamma = gamma,
error.tol = error.tol,
rlam = NULL,
max.iters = max.iters,
alpha = alpha,
verbose = verbose)
} # END CCDR.RUN
# ccdr_call
#
# Handles most of the bookkeeping for CCDr. Sets default values and prepares arguments for
# passing to ccdr_gridR and ccdr_singleR. Some type-checking as well, although most of
# this is handled internally by ccdr_gridR and ccdr_singleR.
#
ccdr_call <- function(data,
betas,
lambdas,
lambdas.length,
gamma,
error.tol,
rlam,
max.iters,
alpha,
verbose = FALSE
){
### Check data
if(!check_if_data_matrix(data)) stop("Data must be either a data.frame or a numeric matrix!")
if(count_nas(data) > 0) stop(paste0(count_nas(data), " missing values detected!"))
### Get the dimensions of the data matrix
nn <- as.integer(nrow(data))
pp <- as.integer(ncol(data))
### Use default values for lambda if not specified
if(missing(lambdas)){
if(is.null(lambdas.length)){
stop("Both lambdas and lambdas.length unspecified: Must specify a value for at least one of these arguments!")
} else{
### Check lambdas.length if specified
if(!is.numeric(lambdas.length)) stop("lambdas.length must be numeric!")
if(lambdas.length <= 0) stop("lambdas.length must be positive!")
}
if(missing(rlam)){
### Even though ccdr_call should never be called on its own, this behaviour is left for testing backwards-compatibility
stop("rlam must be specified if lambdas is not explicitly specified.")
} else if(is.null(rlam)){
### rlam = NULL is used as a sentinel value to indicate a default value should be used
rlam <- 1e-2
} else{
### Check rlam if specified
if(!is.numeric(rlam)) stop("rlam must be numeric!")
if(rlam < 0) stop("rlam must be >= 0!")
}
# If no grid of lambdas is passed, then use the standard log-scale that starts at
# max.lam = sqrt(nn) and descends to min.lam = rlam * max.lam
lambdas <- generate.lambdas(lambda.max = sqrt(nn),
lambdas.ratio = rlam,
lambdas.length = as.integer(lambdas.length),
scale = "log")
}
### Check lambdas
if(!is.numeric(lambdas)) stop("lambdas must be a numeric vector!")
if(any(lambdas < 0)) stop("lambdas must contain only nonnegative values!")
# if(length(lambdas) != lambdas.length){
# warning("Length of lambdas vector does not match lambdas.length. The specified lambdas vector will be used and lambdas.length will be overwritten.")
# }
### By default, set the initial guess for betas to be all zeroes
if(missing(betas)){
betas <- matrix(0, nrow = pp, ncol = pp)
# betas <- SparseBlockMatrixR(betas) # 2015-03-26: Deprecated and replaced with .init_sbm below
betas <- .init_sbm(betas, rep(0, pp))
# If the initial matrix is the zero matrix, indexing does not matter so we don't need to use reIndexC here
# Still need to set start = 0, though.
betas$start <- 0
} # Type-checking for betas happens in ccdr_singleR
# This parameter can be set by the user, but in order to prevent the algorithm from taking too long to run
# it is a good idea to keep the threshold used by default which is O(sqrt(pp))
if(is.null(max.iters)){
max.iters <- 2 * max(10, sqrt(pp))
}
t1.cor <- proc.time()[3]
# cors <- cor(data)
# cors <- cors[upper.tri(cors, diag = TRUE)]
cors <- cor_vector(data)
t2.cor <- proc.time()[3]
fit <- ccdr_gridR(cors,
as.integer(pp),
as.integer(nn),
betas,
as.numeric(lambdas),
as.numeric(gamma),
as.numeric(error.tol),
as.integer(max.iters),
as.numeric(alpha),
verbose)
fit <- lapply(fit, ccdrFit.list) # convert everything to ccdrFit objects
ccdrPath.list(fit) # wrap as ccdrPath object
} # END CCDR_CALL
# ccdr_gridR
#
# Main subroutine for running the CCDr algorithm on a grid of lambda values.
ccdr_gridR <- function(cors,
pp, nn,
betas,
lambdas,
gamma,
eps,
maxIters,
alpha,
verbose
){
### Check alpha
if(!is.numeric(alpha)) stop("alpha must be numeric!")
if(alpha < 0) stop("alpha must be >= 0!")
### nlam is now set automatically
nlam <- length(lambdas)
ccdr.out <- list()
for(i in 1:nlam){
if(verbose) message("Working on lambda = ", round(lambdas[i], 5), " [", i, "/", nlam, "]")
t1.ccdr <- proc.time()[3]
ccdr.out[[i]] <- ccdr_singleR(cors,
pp, nn,
betas,
lambdas[i],
gamma = gamma,
eps = eps,
maxIters = maxIters,
alpha = alpha,
verbose = verbose
)
t2.ccdr <- proc.time()[3]
betas <- ccdr.out[[i]]$sbm
betas <- reIndexC(betas) # use C-friendly indexing
if(verbose){
test.nedge <- sum(as.matrix(betas) != 0)
message(" Estimated number of edges: ", ccdr.out[[i]]$nedge, " / test = ", test.nedge)
# message(" Estimated total variance: ", sum(1 / (betas$sigmas)^2))
}
# 7-16-14: Added code below to check edge threshold via alpha parameter
if(ccdr.out[[i]]$nedge > alpha * pp){
if(verbose) message("Edge threshold met, terminating algorithm with ", ccdr.out[[i-1]]$nedge, " edges.")
break
}
}
ccdr.out[1:(i-1)] # only return up to i - 1 since the last (ith) model would not have finished running anyway
} # END CCDR_GRIDR
# ccdr_singleR
#
# Internal subroutine for handling calls to singleCCDr: This is the only place where C++ is directly
# called. Type-checking is strongly enforced here.
ccdr_singleR <- function(cors,
pp, nn,
betas,
lambda,
gamma,
eps,
maxIters,
alpha, # 2-9-15: No longer necessary in ccdr_singleR, but needed since the C++ call asks for it
verbose = FALSE
){
### Check cors
if(!is.numeric(cors)) stop("cors must be a numeric vector!")
if(length(cors) != pp*(pp+1)/2) stop(paste0("cors has incorrect length: Expected length = ", pp*(pp+1)/2, " input length = ", length(cors)))
### Check dimension parameters
if(!is.integer(pp) || !is.integer(nn)) stop("Both pp and nn must be integers!")
if(pp <= 0 || nn <= 0) stop("Both pp and nn must be positive!")
### Check betas
if(check_if_matrix(betas)){ # if the input is a matrix, convert to SBM object
betas <- SparseBlockMatrixR(betas) # if betas is non-numeric, SparseBlockMatrixR constructor should throw error
betas <- reIndexC(betas) # use C-friendly indexing
} else if(!is.SparseBlockMatrixR(betas)){ # otherwise check that it is an object of class SparseBlockMatrixR
stop("Incompatible data passed for betas parameter: Should be either matrix or list in SparseBlockMatrixR format.")
}
### Check lambda
if(!is.numeric(lambda)) stop("lambda must be numeric!")
if(lambda < 0) stop("lambda must be >= 0!")
### Check gamma
if(!is.numeric(gamma)) stop("gamma must be numeric!")
if(gamma < 0 && gamma != -1) stop("gamma must be >= 0 (MCP) or = -1 (Lasso)!")
### Check eps
if(!is.numeric(eps)) stop("eps must be numeric!")
if(eps <= 0){
if(eps < 0) stop("eps must be >= 0!")
if(eps == 0) warning("eps is set to zero: This may cause the algorithm to fail to converge, and maxIters will be used to terminate the algorithm.")
}
### Check maxIters
if(!is.integer(maxIters)) stop("maxIters must be an integer!")
if(maxIters <= 0) stop("maxIters must be > 0!")
### alpha check is in ccdr_gridR
# if(verbose) cat("Opening C++ connection...")
t1.ccdr <- proc.time()[3]
ccdr.out <- singleCCDr(cors,
betas,
nn,
lambda,
c(gamma, eps, maxIters, alpha),
verbose = verbose)
t2.ccdr <- proc.time()[3]
# if(verbose) cat("C++ connection closed. Total time in C++: ", t2.ccdr-t1.ccdr, "\n")
#
# Convert output back to SBM format
#
ccdr.out <- list(sbm = SparseBlockMatrixR(list(rows = ccdr.out$rows, vals = ccdr.out$vals, blocks = ccdr.out$blocks, sigmas = ccdr.out$sigmas, start = 0)),
lambda = ccdr.out$lambda,
nedge = ccdr.out$length,
pp = pp,
nn = nn,
time = t2.ccdr - t1.ccdr)
ccdr.out$sbm <- reIndexR(ccdr.out$sbm)
# ccdrFit(ccdr.out)
ccdr.out
} # END CCDR_SINGLER
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.