R/gamBiCopSelect.R

Defines functions gamBiCopSelect gamBiCopVarSel get.smooth get.formula get.linear valid.gamBiCopSelect

Documented in gamBiCopSelect

#' Selection and Maximum penalized likelihood estimation of a Generalized
#' Additive model (gam) for the copula parameter or Kendall's tau.
#'
#' This function selects an appropriate bivariate copula family for given
#' bivariate copula data using one of a range of methods. The corresponding
#' parameter estimates are obtained by maximum penalized  likelihood estimation,
#' where each Newton-Raphson iteration is reformulated as a generalized ridge
#' regression solved using the \code{\link[mgcv:mgcv-package]{mgcv}} package.
#'
#' @param udata A matrix or data frame containing the model responses, (u1,u2) in
#' [0,1]x[0,1]
#' @param lin.covs A matrix or data frame containing the parametric (i.e.,
#' linear) covariates.
#' @param smooth.covs A matrix or data frame containing the non-parametric
#' (i.e., smooth) covariates.
#' @param familyset (Similar to \code{\link{BiCopSelect}} from the
#' \code{\link[VineCopula:VineCopula-package]{VineCopula}} package)
#' Vector of bivariate copula families to select from.
#' If \code{familyset = NA} (default), selection
#' among all possible families is performed.   Coding of bivariate copula
#' families:
#' \code{1} Gaussian,
#' \code{2} Student t,
#' \code{5} Frank,
#' \code{301} Double Clayton type I (standard and rotated 90 degrees),
#' \code{302} Double Clayton type II (standard and rotated 270 degrees),
#' \code{303} Double Clayton type III (survival and rotated 90 degrees),
#' \code{304} Double Clayton type IV (survival and rotated 270 degrees),
#' \code{401} Double Gumbel type I (standard and rotated 90 degrees),
#' \code{402} Double Gumbel type II (standard and rotated 270 degrees),
#' \code{403} Double Gumbel type III (survival and rotated 90 degrees),
#' \code{404} Double Gumbel type IV (survival and rotated 270 degrees).
#' @param rotations If \code{TRUE}, all rotations of the families in familyset
#' are included.
#' @param familycrit Character indicating the criterion for bivariate copula
#' selection. Possible choices: \code{familycrit = 'AIC'} (default) or
#' \code{'BIC'}, as in \code{\link{BiCopSelect}} from the
#' \code{\link[VineCopula:VineCopula-package]{VineCopula}} package.
#' @param level Numerical; significance level of the test for removing individual
#' predictors (default: \code{level = 0.05}).
#' @param edf Numerical; if the estimated EDF for individual predictors is
#' smaller than \code{edf} but the predictor is still significant, then
#' it is set as linear (default: \code{edf = 1.5}).
#' @param tau \code{FALSE} for a calibration function specified for
#' the Copula parameter or \code{TRUE} (default) for a calibration function
#' specified for Kendall's tau.
#' @param method \code{'FS'} for Fisher-scoring (default) and
#' \code{'NR'} for Newton-Raphson.
#' @param tol.rel Relative tolerance for \code{'FS'}/\code{'NR'} algorithm.
#' @param n.iters Maximal number of iterations for
#' \code{'FS'}/\code{'NR'} algorithm.
#' @param parallel \code{TRUE} for a parallel estimation across copula families.
#' @param verbose \code{TRUE} prints informations during the estimation.
#' @param select.once if \code{TRUE} the GAM structure is only selected once,
#'   for the family that appears first in \code{familyset}.
#' @param ... Additional parameters to be passed to \code{\link{gam}}
#' @return \code{gamBiCopFit} returns a list consisting of
#' \item{res}{S4 \code{\link{gamBiCop-class}} object.}
#' \item{method}{\code{'FS'} for Fisher-scoring and
#' \code{'NR'} for Newton-Raphson.}
#' \item{tol.rel}{relative tolerance for \code{'FS'}/\code{'NR'} algorithm.}
#' \item{n.iters}{maximal number of iterations for
#' \code{'FS'}/\code{'NR'} algorithm.}
#' \item{trace}{the estimation procedure's trace.}
#' \item{conv}{\code{0} if the algorithm converged and \code{1} otherwise.}
#' @seealso \code{\link{gamBiCop}} and \code{\link{gamBiCopFit}}.
#' @examples
#' require(copula)
#' set.seed(0)
#'
#' ## Simulation parameters (sample size, correlation between covariates,
#' ## Student copula with 4 degrees of freedom)
#' n <- 5e2
#' rho <- 0.9
#' fam <- 2
#' par2 <- 4
#'
#' ## A calibration surface depending on four variables
#' eta0 <- 1
#' calib.surf <- list(
#'   calib.lin <- function(t, Ti = 0, Tf = 1, b = 2) {
#'     return(-2 + 4 * t)
#'   },
#'   calib.quad <- function(t, Ti = 0, Tf = 1, b = 8) {
#'     Tm <- (Tf - Ti) / 2
#'     a <- -(b / 3) * (Tf^2 - 3 * Tf * Tm + 3 * Tm^2)
#'     return(a + b * (t - Tm)^2)
#'   },
#'   calib.sin <- function(t, Ti = 0, Tf = 1, b = 1, f = 1) {
#'     a <- b * (1 - 2 * Tf * pi / (f * Tf * pi +
#'       cos(2 * f * pi * (Tf - Ti))
#'       - cos(2 * f * pi * Ti)))
#'     return((a + b) / 2 + (b - a) * sin(2 * f * pi * (t - Ti)) / 2)
#'   },
#'   calib.exp <- function(t, Ti = 0, Tf = 1, b = 2, s = Tf / 8) {
#'     Tm <- (Tf - Ti) / 2
#'     a <- (b * s * sqrt(2 * pi) / Tf) * (pnorm(0, Tm, s) - pnorm(Tf, Tm, s))
#'     return(a + b * exp(-(t - Tm)^2 / (2 * s^2)))
#'   }
#' )
#'
#' ## 6-dimensional matrix X of covariates
#' covariates.distr <- mvdc(normalCopula(rho, dim = 6),
#'   c("unif"), list(list(min = 0, max = 1)),
#'   marginsIdentical = TRUE
#' )
#' X <- rMvdc(n, covariates.distr)
#' colnames(X) <- paste("x", 1:6, sep = "")
#'
#' ## U in [0,1]x[0,1] depending on the four first columns of X
#' U <- condBiCopSim(fam, function(x1, x2, x3, x4) {
#'   eta0 + sum(mapply(function(f, x)
#'     f(x), calib.surf, c(x1, x2, x3, x4)))
#' }, X[, 1:4], par2 = 4, return.par = TRUE)
#' \dontrun{
#' ## Selection using AIC (about 30sec on single core)
#' ## Use parallel = TRUE to speed-up....
#' system.time(best <- gamBiCopSelect(U$data, smooth.covs = X))
#' print(best$res)
#' EDF(best$res) ## The first function is linear
#' ## Plot only the smooth component
#' par(mfrow = c(2, 2))
#' plot(best$res)
#' }
#' @export
gamBiCopSelect <- function(udata, lin.covs = NULL, smooth.covs = NULL,
                           familyset = NA, rotations = TRUE,
                           familycrit = "AIC", level = 5e-2, edf = 1.5, tau = TRUE,
                           method = "FS", tol.rel = 1e-3, n.iters = 10,
                           parallel = FALSE, verbose = FALSE,
                           select.once = TRUE, ...) {
  tmp <- valid.gamBiCopSelect(
    udata, lin.covs, smooth.covs, rotations, familyset,
    familycrit, level, edf, tau, method, tol.rel,
    n.iters, parallel, verbose, select.once
  )
  if (tmp != TRUE) {
    stop(tmp)
  }

  if (length(familyset) == 1 && is.na(familyset)) {
    familyset <- get.familyset()
  }
  if (rotations) {
    familyset <- withRotations(familyset)
  }
  familyset <- unique(familyset)

  parallel <- as.logical(parallel)
  if (parallel) {
    cl <- makeCluster(detectCores() - 1)
    registerDoParallel(cl, cores = detectCores() - 1)
    on.exit(stopCluster(cl))
  }

  x <- NULL
  if (select.once) {
    res <- vector("list", length(familyset))
    # select GAM structure for first family in familyset
    res[[1]] <- tryCatch(gamBiCopVarSel(
      udata, lin.covs, smooth.covs,
      familyset[1], tau, method, tol.rel,
      n.iters, level, edf, verbose, ...
    ),
    error = function(e) e
    )
    # fit with same structure for all other families
    if (length(familyset) > 1) {
      # combine data and covariates into one matrix
      tmpdata <- udata
      if (!is.null(lin.covs)) {
        tmpdata <- cbind(tmpdata, lin.covs)
      }
      if (!is.null(smooth.covs)) {
        tmpdata <- cbind(tmpdata, smooth.covs)
      }

      if (!parallel) {
        res[-1] <- foreach(x = familyset[-1]) %do%
          tryCatch(gamBiCopFit(
            tmpdata,
            res[[1]]$res@model$formula,
            x, tau, method, tol.rel, n.iters,
            verbose, ...
          ),
          error = function(e) e
          )
      } else {
        res[-1] <- foreach(x = familyset[-1]) %dopar%
          tryCatch(gamBiCopFit(
            tmpdata,
            res[[1]]$res@model$formula,
            x, tau, method, tol.rel, n.iters,
            verbose, ...
          ),
          error = function(e) e
          )
      }
    }
  } else {
    # select GAM structure independently for all families
    if (!parallel) {
      res <- foreach(x = familyset) %do%
        tryCatch(gamBiCopVarSel(
          udata, lin.covs, smooth.covs,
          x, tau, method, tol.rel, n.iters, # max.knots,
          level, edf, verbose, ...
        ),
        error = function(e) e
        )
    } else {
      res <- foreach(x = familyset) %dopar%
        tryCatch(gamBiCopVarSel(
          udata, lin.covs, smooth.covs,
          x, tau, method, tol.rel, n.iters,
          level, edf, verbose, ...
        ),
        error = function(e) e
        )
    }
  }

  res <- res[sapply(res, length) == 6]
  if (length(res) == 0) { #|| sum(sapply(res, function(x) x$conv) == 0) == 0) {
    return(paste(
      "No convergence of the estimation for any copula family.",
      "Try modifying the parameters (FS/NR, tol.rel, n.iters)..."
    ))
  }

  if (familycrit == "AIC") {
    return(res[[which.min(sapply(res, function(x) AIC(x$res)))]])
  } else {
    return(res[[which.min(sapply(res, function(x) BIC(x$res)))]])
  }
}

gamBiCopVarSel <- function(udata, lin.covs, smooth.covs,
                           family, tau = TRUE, method = "FS",
                           tol.rel = 1e-3, n.iters = 10, # max.knots,
                           level = 5e-2, edf = 1.5,
                           verbose = FALSE, ...) {
  udata <- as.data.frame(udata)
  colnames(udata) <- c("u1", "u2")
  data <- udata

  if (!is.null(lin.covs)) {
    if (is.null(colnames(lin.covs))) {
      lin.covs <- as.data.frame(lin.covs)
      colnames(lin.covs) <- paste0("l", 1:ncol(lin.covs))
    }
    data <- cbind(data, lin.covs)
  }
  if (!is.null(smooth.covs)) {
    if (is.null(colnames(smooth.covs))) {
      smooth.covs <- as.data.frame(smooth.covs)
      colnames(smooth.covs) <- paste0("s", 1:ncol(smooth.covs))
    }
    data <- cbind(data, smooth.covs)
  }

  if (verbose == TRUE) {
    cat(paste("Model selection for family", family, "\n"))
  }

  myGamBiCopEst <- function(formula) gamBiCopFit(
      data, formula, family, tau,
      method, tol.rel, n.iters
    )

  n <- nrow(data)
  d <- ncol(data) - 2
  ## Create a list with formulas of smooth terms corresponding to the covariates
  lin.nms <- colnames(lin.covs)

  nn <- smooth.nms <- colnames(smooth.covs)

  if (!is.null(ncol(smooth.covs))) {
    # if (length(max.knots) == 1) {
    #   basis <- rep(max.knots, ncol(smooth.covs))
    # } else {
    #   stopifnot(length(max.knots) == ncol(smooth.covs))
    #   basis <- max.knots
    # }
    basis <- rep(10, ncol(smooth.covs))
    formula.expr <- mapply(get.smooth, smooth.nms, basis)
  } else {
    basis <- NULL
    formula.expr <- NULL
  }
  if (!is.null(ncol(lin.covs))) {
    formula.lin <- lin.nms # get.linear(lin.nms, lin.nms)
  } else {
    formula.lin <- NULL
  }

  ## Update the list by removing unsignificant predictors
  if (verbose == TRUE) {
    cat("Remove unsignificant covariates.......\n")
    t1 <- Sys.time()
  }

  sel.smooth <- smooth2lin <- FALSE
  sel.lin <- NULL

  while ((!all(c(sel.lin, sel.smooth)) && length(basis) > 0) || any(smooth2lin)) {
    smooth2lin <- FALSE
    sel.lin <- NULL
    formula.tmp <- get.formula(c(formula.lin, formula.expr))
    # if (verbose == TRUE) {
    #   cat("Model formula:\n")
    #   print(formula.tmp)
    # }

    tmp <- myGamBiCopEst(formula.tmp)

    ## Remove unsignificant parametric terms
    if (length(summary(tmp$res@model)$p.coeff) > 1) {
      sel.lin <- summary(tmp$res@model)$p.pv[-1] < level
      formula.lin <- formula.lin[sel.lin]
    }

    if (summary(tmp$res@model)$m > 0) {
      ## Remove unsignificant smooth terms
      sel.smooth <- summary(tmp$res@model)$s.pv < level
      nn <- nn[sel.smooth]
      basis <- rep(10, length(nn))
      formula.expr <- mapply(get.smooth, nn, basis)

      ## Check whether some smooth need to be set to linear
      ## (and adjust the model if required)
      smooth2lin <- sel.smooth & summary(tmp$res@model)$edf < edf
      if (any(smooth2lin)) {
        sel.lin <- which(sel.smooth)
        sel.lin <- summary(tmp$res@model)$edf[sel.lin] < edf
        sel.smooth <- !sel.lin
        formula.lin <- c(formula.lin, nn[sel.lin])
        formula.expr <- formula.expr[sel.smooth]
        basis <- basis[sel.smooth]
        nn <- nn[sel.smooth]
      }
    }
  }

  ## Check if we can output a constant or linear model directly
  ## (or re-estimate the model if not)
  if (length(formula.expr) == 0) {
    if ((is.null(formula.lin) ||
      length(formula.lin) == 0)) {
      tmp <- myGamBiCopEst(~1)
    } else {
      formula.tmp <- get.formula(formula.lin)
      tmp <- myGamBiCopEst(formula.tmp)
    }
    return(tmp)
  } else {
    formula.tmp <- get.formula(c(formula.lin, formula.expr))
    tmp <- myGamBiCopEst(formula.tmp)
  }

  ## Increasing the basis size appropriately
  sel <- summary(tmp$res@model)$edf > (basis - 1) * 0.8
  if (verbose == TRUE && any(sel)) {
    t2 <- Sys.time()
    cat(paste0("Time elapsed: ", round(as.numeric(t2 - t1), 2), " seconds.\n"))
    t1 <- Sys.time()
    cat(paste("Select the basis sizes .......\n"))
  }

  while (any(sel) && all(basis < n / 30)) {

    ## Increase basis sizes
    basis[sel] <- 2 * basis[sel]

    ## Update the smooth terms, formula and fit the new model
    if (any(sel)) {
      formula.expr[sel] <- mapply(get.smooth, nn[sel], basis[sel])
      formula.tmp <- get.formula(c(formula.lin, formula.expr))
      # if (verbose == TRUE) {
      #   cat("Updated model formula:\n")
      #   print(formula.tmp)
      # }
      tmp <- myGamBiCopEst(formula.tmp)
      sel[sel] <- summary(tmp$res@model)$edf[sel] > (basis[sel] - 1) * 0.8
    }

    ## Check that the basis size is smaller than 1/2 the size of
    ## the corresponding covariate's support
    if (any(sel)) {
      if (sum(sel) == 1) {
        sel[sel] <- 2 * basis[sel] < length(unique(data[, nn[sel]]))
      } else {
        sel[sel] <- 2 * basis[sel] < apply(data[, nn[sel]], 2, function(x)
          length(unique(x)))
      }
    }
  }

  if (verbose == TRUE) {
    t2 <- Sys.time()
    cat(paste0("Time elapsed: ", round(as.numeric(t2 - t1), 2), " seconds.\n"))
  }
  return(tmp)
}

get.smooth <- function(x, k, bs = "cr") {
  paste("s(", x, ", k=", k, ", bs='", bs, "')", sep = "")
}
get.formula <- function(expr, y = FALSE) {
  if (!y) {
    as.formula(paste("~", paste(expr, collapse = " + ")))
  } else {
    as.formula(paste("y ~", paste(expr, collapse = " + ")))
  }
}

get.linear <- function(x, nn) {
  names(unlist(sapply(nn, function(z) grep(z, x))))
}

valid.gamBiCopSelect <- function(udata, lin.covs, smooth.covs, rotations,
                                 familyset, familycrit, level, edf,
                                 tau, method, tol.rel, n.iters, parallel,
                                 verbose, select.once) {
  data <- tryCatch(as.data.frame(udata), error = function(e) e$message)
  if (is.character(udata)) {
    return(udata)
  }

  tmp <- valid.gamBiCopFit(udata, n.iters, tau, tol.rel, method, verbose, 1)
  if (tmp != TRUE) {
    return(tmp)
  }
  m <- dim(udata)[2]
  n <- dim(udata)[1]
  if (m != 2) {
    return("udata should have only two columns.")
  }

  if (!is.null(lin.covs) && !(is.data.frame(lin.covs) || is.matrix(lin.covs))) {
    return("lin.covs has to be either a data frame or a matrix.")
  } else {
    if (!is.null(lin.covs) && dim(lin.covs)[1] != n) {
      return("lin.covs must have the same number of rows as udata.")
    }
  }
  if (!is.null(smooth.covs) && !(is.data.frame(smooth.covs) ||
    is.matrix(smooth.covs))) {
    return("smooth.covs has to be either a data frame or a matrix.")
  } else {
    if (!is.null(smooth.covs) && dim(smooth.covs)[1] != n) {
      return("smooth.covs must have the same number of rows as udata.")
    }
  }

  if (!valid.familyset(familyset)) {
    return(return(msg.familyset(var2char(familyset))))
  }

  if (!valid.logical(rotations)) {
    return(msg.logical(var2char(rotations)))
  }

  if (is.null(familycrit) || length(familycrit) != 1 ||
    (familycrit != "AIC" && familycrit != "BIC")) {
    return("Selection criterion not implemented.")
  }

  if (!valid.logical(parallel)) {
    return(msg.logical(var2char(parallel)))
  }

  if (!valid.unif(level)) {
    return(msg.unif(var2char(level)))
  }

  if (!valid.real(edf)) {
    return(msg.real(var2char(edf)))
  }

  if (!is.logical(select.once)) {
    return("'select.once' must be logical")
  }

  return(TRUE)
}

Try the gamCopula package in your browser

Any scripts or data that you put into this service are public.

gamCopula documentation built on Feb. 6, 2020, 5:12 p.m.