# R/gamBiCopSelect.R In gamCopula: Generalized Additive Models for Bivariate Conditional Dependence Structures and Vine Copulas

#### Documented in gamBiCopSelect

#' Selection and Maximum penalized likelihood estimation of a Generalized
#' Additive model (gam) for the copula parameter or Kendall's tau.
#'
#' This function selects an appropriate bivariate copula family for given
#' bivariate copula data using one of a range of methods. The corresponding
#' parameter estimates are obtained by maximum penalized  likelihood estimation,
#' where each Newton-Raphson iteration is reformulated as a generalized ridge
#' regression solved using the \code{\link[mgcv:mgcv-package]{mgcv}} package.
#'
#' @param udata A matrix or data frame containing the model responses, (u1,u2) in
#' [0,1]x[0,1]
#' @param lin.covs A matrix or data frame containing the parametric (i.e.,
#' linear) covariates.
#' @param smooth.covs A matrix or data frame containing the non-parametric
#' (i.e., smooth) covariates.
#' @param familyset (Similar to \code{\link{BiCopSelect}} from the
#' Vector of bivariate copula families to select from.
#' If \code{familyset = NA} (default), selection
#' among all possible families is performed.   Coding of bivariate copula
#' families:
#' \code{1} Gaussian,
#' \code{2} Student t,
#' \code{5} Frank,
#' \code{301} Double Clayton type I (standard and rotated 90 degrees),
#' \code{302} Double Clayton type II (standard and rotated 270 degrees),
#' \code{303} Double Clayton type III (survival and rotated 90 degrees),
#' \code{304} Double Clayton type IV (survival and rotated 270 degrees),
#' \code{401} Double Gumbel type I (standard and rotated 90 degrees),
#' \code{402} Double Gumbel type II (standard and rotated 270 degrees),
#' \code{403} Double Gumbel type III (survival and rotated 90 degrees),
#' \code{404} Double Gumbel type IV (survival and rotated 270 degrees).
#' @param rotations If \code{TRUE}, all rotations of the families in familyset
#' are included.
#' @param familycrit Character indicating the criterion for bivariate copula
#' selection. Possible choices: \code{familycrit = 'AIC'} (default) or
#' \code{'BIC'}, as in \code{\link{BiCopSelect}} from the
#' @param level Numerical; significance level of the test for removing individual
#' predictors (default: \code{level = 0.05}).
#' @param edf Numerical; if the estimated EDF for individual predictors is
#' smaller than \code{edf} but the predictor is still significant, then
#' it is set as linear (default: \code{edf = 1.5}).
#' @param tau \code{FALSE} for a calibration function specified for
#' the Copula parameter or \code{TRUE} (default) for a calibration function
#' specified for Kendall's tau.
#' @param method \code{'FS'} for Fisher-scoring (default) and
#' \code{'NR'} for Newton-Raphson.
#' @param tol.rel Relative tolerance for \code{'FS'}/\code{'NR'} algorithm.
#' @param n.iters Maximal number of iterations for
#' \code{'FS'}/\code{'NR'} algorithm.
#' @param parallel \code{TRUE} for a parallel estimation across copula families.
#' @param verbose \code{TRUE} prints informations during the estimation.
#' @param select.once if \code{TRUE} the GAM structure is only selected once,
#'   for the family that appears first in \code{familyset}.
#' @return \code{gamBiCopFit} returns a list consisting of
#' \item{method}{\code{'FS'} for Fisher-scoring and
#' \code{'NR'} for Newton-Raphson.}
#' \item{tol.rel}{relative tolerance for \code{'FS'}/\code{'NR'} algorithm.}
#' \item{n.iters}{maximal number of iterations for
#' \code{'FS'}/\code{'NR'} algorithm.}
#' \item{trace}{the estimation procedure's trace.}
#' \item{conv}{\code{0} if the algorithm converged and \code{1} otherwise.}
#' @examples
#' require(copula)
#' set.seed(0)
#'
#' ## Simulation parameters (sample size, correlation between covariates,
#' ## Student copula with 4 degrees of freedom)
#' n <- 5e2
#' rho <- 0.9
#' fam <- 2
#' par2 <- 4
#'
#' ## A calibration surface depending on four variables
#' eta0 <- 1
#' calib.surf <- list(calib.lin <- function(t, Ti = 0, Tf = 1, b = 2) {
#'     return(-2+4*t)},
#'   calib.quad <- function(t, Ti = 0, Tf = 1, b = 8) {
#'     Tm <- (Tf - Ti)/2
#'     a <- -(b/3) * (Tf^2 - 3 * Tf * Tm + 3 * Tm^2)
#'   return(a + b * (t - Tm)^2)},
#'   calib.sin <- function(t, Ti = 0, Tf = 1, b = 1, f = 1) {
#'     a <- b * (1 - 2 * Tf * pi/(f * Tf * pi +
#'                                  cos(2 * f * pi * (Tf - Ti))
#'                                - cos(2 * f * pi * Ti)))
#'     return((a + b)/2 + (b - a) * sin(2 * f * pi * (t - Ti))/2)},
#'   calib.exp <- function(t, Ti = 0, Tf = 1, b = 2, s = Tf/8) {
#'     Tm <- (Tf - Ti)/2
#'     a <- (b * s * sqrt(2 * pi)/Tf) * (pnorm(0, Tm, s) - pnorm(Tf, Tm, s))
#'     return(a + b * exp(-(t - Tm)^2/(2 * s^2)))})
#'
#' ## 6-dimensional matrix X of covariates
#' covariates.distr <- mvdc(normalCopula(rho, dim = 6),
#'                                  c("unif"), list(list(min = 0, max = 1)),
#'                                  marginsIdentical = TRUE)
#' X <- rMvdc(n, covariates.distr)
#' colnames(X) <- paste("x",1:6,sep="")
#'
#' ## U in [0,1]x[0,1] depending on the four first columns of X
#' U <- condBiCopSim(fam, function(x1,x2,x3,x4) {eta0+sum(mapply(function(f,x)
#'   f(x), calib.surf, c(x1,x2,x3,x4)))}, X[,1:4], par2 = 4, return.par = TRUE)
#'
#' \dontrun{
#' ## Selection using AIC (about 30sec on single core)
#' ## Use parallel = TRUE to speed-up....
#' system.time(best <- gamBiCopSelect(U$data, smooth.covs = X)) #' print(best$res)
#' EDF(best$res) ## The first function is linear #' ## Plot only the smooth component #' par(mfrow=c(2,2)) #' plot(best$res)}
#' @export
gamBiCopSelect <- function(udata, lin.covs = NULL, smooth.covs = NULL,
familyset = NA, rotations = TRUE,
familycrit = "AIC", level = 5e-2, edf = 1.5, tau = TRUE,
method = "FS", tol.rel = 1e-3, n.iters = 10,
parallel = FALSE, verbose = FALSE,
select.once = TRUE, ...) {

tmp <- valid.gamBiCopSelect(udata, lin.covs, smooth.covs, rotations, familyset,
familycrit, level, edf, tau, method, tol.rel,
n.iters, parallel, verbose, select.once)
if (tmp != TRUE)
stop(tmp)

if (length(familyset) == 1 && is.na(familyset)) {
familyset <- get.familyset()
}
if (rotations) {
familyset <- withRotations(familyset)
}
familyset <- unique(familyset)

parallel <- as.logical(parallel)
if (parallel) {
cl <- makeCluster(detectCores() - 1)
registerDoParallel(cl, cores = detectCores() - 1)
on.exit(stopCluster(cl))
}

x <- NULL
if (select.once) {
res <- vector("list", length(familyset))
# select GAM structure for first family in familyset
res[[1]] <- tryCatch(gamBiCopVarSel(udata, lin.covs, smooth.covs,
familyset[1], tau, method, tol.rel,
n.iters, level, edf, verbose,...),
error = function(e) e)
# fit with same structure for all other families
if (length(familyset) > 1) {
# combine data and covariates into one matrix
tmpdata <- udata
if (!is.null(lin.covs))
tmpdata <- cbind(tmpdata, lin.covs)
if (!is.null(smooth.covs))
tmpdata <- cbind(tmpdata, smooth.covs)

if (!parallel) {
res[-1] <- foreach(x=familyset[-1]) %do%
tryCatch(gamBiCopFit(tmpdata,
res[[1]]$res@model$formula,
x, tau, method, tol.rel, n.iters,
verbose, ...),
error = function(e) e)
} else {
res[-1] <- foreach(x=familyset[-1]) %dopar%
tryCatch(gamBiCopFit(tmpdata,
res[[1]]$res@model$formula,
x, tau, method, tol.rel, n.iters,
verbose, ...),
error = function(e) e)
}
}
} else {
# select GAM structure independently for all families
if (!parallel) {
res <- foreach(x=familyset) %do%
tryCatch(gamBiCopVarSel(udata, lin.covs, smooth.covs,
x, tau, method, tol.rel, n.iters, #max.knots,
level, edf, verbose, ...),
error = function(e) e)
} else {
res <- foreach(x=familyset) %dopar%
tryCatch(gamBiCopVarSel(udata, lin.covs, smooth.covs,
x, tau, method, tol.rel, n.iters,
level, edf, verbose, ...),
error = function(e) e)
}
}

res <- res[sapply(res,length) == 6]
if (length(res) == 0 ) { #|| sum(sapply(res, function(x) x$conv) == 0) == 0) { return(paste("No convergence of the estimation for any copula family.", "Try modifying the parameters (FS/NR, tol.rel, n.iters)...")) } if (familycrit == "AIC") { return(res[[which.min(sapply(res, function(x) AIC(x$res)))]])
} else {
return(res[[which.min(sapply(res, function(x) BIC(x$res)))]]) } } gamBiCopVarSel <- function(udata, lin.covs, smooth.covs, family, tau = TRUE, method = "FS", tol.rel = 1e-3, n.iters = 10, #max.knots, level = 5e-2, edf = 1.5, verbose = FALSE, ...) { udata <- as.data.frame(udata) colnames(udata) <- c("u1", "u2") data <- udata if (!is.null(lin.covs)) { if (is.null(colnames(lin.covs))) { lin.covs <- as.data.frame(lin.covs) colnames(lin.covs) <- paste0("l", 1:ncol(lin.covs)) } data <- cbind(data, lin.covs) } if (!is.null(smooth.covs)) { if (is.null(colnames(smooth.covs))) { smooth.covs <- as.data.frame(smooth.covs) colnames(smooth.covs) <- paste0("s", 1:ncol(smooth.covs)) } data <- cbind(data, smooth.covs) } if (verbose == TRUE) { cat(paste("Model selection for family", family, "\n")) } myGamBiCopEst <- function(formula) gamBiCopFit(data, formula, family, tau, method, tol.rel, n.iters) n <- nrow(data) d <- ncol(data) - 2 ## Create a list with formulas of smooth terms corresponding to the covariates lin.nms <- colnames(lin.covs) nn <- smooth.nms <- colnames(smooth.covs) if (!is.null(ncol(smooth.covs))) { # if (length(max.knots) == 1) { # basis <- rep(max.knots, ncol(smooth.covs)) # } else { # stopifnot(length(max.knots) == ncol(smooth.covs)) # basis <- max.knots # } basis <- rep(10, ncol(smooth.covs)) formula.expr <- mapply(get.smooth, smooth.nms, basis) } else { basis <- NULL formula.expr <- NULL } if (!is.null(ncol(lin.covs))) { formula.lin <- lin.nms #get.linear(lin.nms, lin.nms) } else { formula.lin <- NULL } ## Update the list by removing unsignificant predictors if (verbose == TRUE) { cat("Remove unsignificant covariates.......\n") t1 <- Sys.time() } sel.smooth <- smooth2lin <- FALSE sel.lin <- NULL while((!all(c(sel.lin,sel.smooth)) && length(basis) > 0) || any(smooth2lin)) { smooth2lin <- FALSE sel.lin <- NULL formula.tmp <- get.formula(c(formula.lin,formula.expr)) # if (verbose == TRUE) { # cat("Model formula:\n") # print(formula.tmp) # } tmp <- myGamBiCopEst(formula.tmp) ## Remove unsignificant parametric terms if (length(summary(tmp$res@model)$p.coeff) > 1) { sel.lin <- summary(tmp$res@model)$p.pv[-1] < level formula.lin <- formula.lin[sel.lin] } if (summary(tmp$res@model)$m > 0) { ## Remove unsignificant smooth terms sel.smooth <- summary(tmp$res@model)$s.pv < level nn <- nn[sel.smooth] basis <- rep(10,length(nn)) formula.expr <- mapply(get.smooth,nn,basis) ## Check whether some smooth need to be set to linear ## (and adjust the model if required) smooth2lin <- sel.smooth & summary(tmp$res@model)$edf < edf if (any(smooth2lin)) { sel.lin <- which(sel.smooth) sel.lin <- summary(tmp$res@model)$edf[sel.lin] < edf sel.smooth <- !sel.lin formula.lin <- c(formula.lin,nn[sel.lin]) formula.expr <- formula.expr[sel.smooth] basis <- basis[sel.smooth] nn <- nn[sel.smooth] } } } ## Check if we can output a constant or linear model directly ## (or re-estimate the model if not) if (length(formula.expr) == 0) { if ((is.null(formula.lin) || length(formula.lin) == 0)) { tmp <- myGamBiCopEst(~1) } else { formula.tmp <- get.formula(formula.lin) tmp <- myGamBiCopEst(formula.tmp) } return(tmp) } else { formula.tmp <- get.formula(c(formula.lin,formula.expr)) tmp <- myGamBiCopEst(formula.tmp) } ## Increasing the basis size appropriately sel <- summary(tmp$res@model)$edf > (basis-1)*0.8 if (verbose == TRUE && any(sel)) { t2 <- Sys.time() cat(paste0("Time elapsed: ", round(as.numeric(t2-t1), 2), " seconds.\n")) t1 <- Sys.time() cat(paste("Select the basis sizes .......\n")) } while (any(sel) && all(basis < n/30)) { ## Increase basis sizes basis[sel] <- 2*basis[sel] ## Update the smooth terms, formula and fit the new model if (any(sel)) { formula.expr[sel] <- mapply(get.smooth,nn[sel],basis[sel]) formula.tmp <- get.formula(c(formula.lin,formula.expr)) # if (verbose == TRUE) { # cat("Updated model formula:\n") # print(formula.tmp) # } tmp <- myGamBiCopEst(formula.tmp) sel[sel] <- summary(tmp$res@model)$edf[sel] > (basis[sel]-1)*0.8 } ## Check that the basis size is smaller than 1/2 the size of ## the corresponding covariate's support if (any(sel)) { if (sum(sel) == 1) { sel[sel] <- 2*basis[sel] < length(unique(data[,nn[sel]])) } else { sel[sel] <- 2*basis[sel] < apply(data[,nn[sel]], 2, function(x) length(unique(x))) } } } if (verbose == TRUE) { t2 <- Sys.time() cat(paste0("Time elapsed: ", round(as.numeric(t2-t1), 2), " seconds.\n")) } return(tmp) } get.smooth <- function(x,k,bs="cr") { paste("s(",x,", k=", k,", bs='", bs, "')",sep = "") } get.formula <- function(expr,y=FALSE) { if (!y) { as.formula(paste("~",paste(expr,collapse = " + "))) } else { as.formula(paste("y ~",paste(expr,collapse = " + "))) } } get.linear <- function(x,nn){ names(unlist(sapply(nn,function(z)grep(z,x)))) } valid.gamBiCopSelect <- function(udata, lin.covs, smooth.covs, rotations, familyset, familycrit, level, edf, tau, method, tol.rel, n.iters, parallel, verbose, select.once) { data <- tryCatch(as.data.frame(udata), error = function(e) e$message)
if (is.character(udata)) {
return(udata)
}

tmp <- valid.gamBiCopFit(udata, n.iters, tau, tol.rel, method, verbose, 1)
if (tmp != TRUE) {
return(tmp)
}
m <- dim(udata)[2]
n <- dim(udata)[1]
if (m != 2) {
return("udata should have only two columns.")
}

if (!is.null(lin.covs) && !(is.data.frame(lin.covs) || is.matrix(lin.covs))) {
return("lin.covs has to be either a data frame or a matrix.")
} else {
if(!is.null(lin.covs) && dim(lin.covs)[1] != n) {
return("lin.covs must have the same number of rows as udata.")
}
}
if (!is.null(smooth.covs) && !(is.data.frame(smooth.covs) ||
is.matrix(smooth.covs))) {
return("smooth.covs has to be either a data frame or a matrix.")
} else {
if(!is.null(smooth.covs) && dim(smooth.covs)[1] != n) {
return("smooth.covs must have the same number of rows as udata.")
}
}

if (!valid.familyset(familyset)) {
return(return(msg.familyset(var2char(familyset))))
}

if (!valid.logical(rotations)) {
return(msg.logical(var2char(rotations)))
}

if(is.null(familycrit) || length(familycrit) != 1 ||
(familycrit != "AIC" && familycrit != "BIC")) {
return("Selection criterion not implemented.")
}

if (!valid.logical(parallel)) {
return(msg.logical(var2char(parallel)))
}

if (!valid.unif(level)) {
return(msg.unif(var2char(level)))
}

if (!valid.real(edf)) {
return(msg.real(var2char(edf)))
}

if (!is.logical(select.once)) {
return("'select.once' must be logical")
}

return(TRUE)
}


## Try the gamCopula package in your browser

Any scripts or data that you put into this service are public.

gamCopula documentation built on Aug. 18, 2017, 1:01 a.m.