empiricial_copulas.R
In copula: Multivariate Dependence with Copulas

## ----prelim, echo=FALSE-------------------------------------------------------
## lower resolution - less size  (default dpi = 72):
knitr::opts_chunk$set(dpi = 48)

## ----message=FALSE------------------------------------------------------------
library(copula)

## ----lsum---------------------------------------------------------------------
##' @title Compute log(x_1 + .. + x_n) for given log(x_1),..,log(x_n)
##' @param lx n-vector containing log(x_1),..,log(x_n)
##' @author Marius Hofert
lsum <- function(lx) {
    l.off <- max(lx)
    l.off + log(sum(exp(lx - l.off)))
}

## ----emp_cop------------------------------------------------------------------
##' @title Evaluate Empirical Copulas
##' @param u evaluation points (in [0,1]^d)
##' @param U points (in [0,1]^d) based on which the empirical copula is built
##' @param smoothing character string indicating the smoothing method
##' @param offset shift when computing the outer sum (over i)
##' @param log logical indicating whether the log-density is to be returned
##'        (only applies to smoothing = "dbeta")
##' @param ... additional arguments passed to the underlying rank()
##' @return empirical copula values
##' @author Marius Hofert
##' @note See Hofert et al. (2018, "Elements of copula modeling with R")
empirical_copula <- function(u, U, smoothing = c("none", "pbeta", "dbeta", "checkerboard"),
                             offset = 0, log = FALSE, ...)
{
    stopifnot(0 <= u, u <= 1, 0 <= U, U <= 1)
    if(!is.matrix(u)) u <- rbind(u)
    if(!is.matrix(U)) U <- rbind(U)
    m <- nrow(u) # number of evaluation points
    n <- nrow(U) # number of points based on which the empirical copula is computed
    d <- ncol(U) # dimension
    stopifnot(ncol(u) == d)
    R <- apply(U, 2, rank, ...) # (n, d)-matrix of ranks
    switch(match.arg(smoothing),
    "none" = {
        R. <- t(R) / (n + 1) # (d, n)-matrix
        vapply(seq_len(m), function(k) # iterate over rows k of u
            sum(colSums(R. <= u[k,]) == d) / (n + offset), NA_real_)
    },
    "pbeta" = {
        ## Note: pbeta(q, shape1, shape2) is vectorized in the following sense:
        ##       1) pbeta(c(0.8, 0.6), shape1 = 1, shape2 = 1) => 2-vector (as expected)
        ##       2) pbeta(0.8, shape1 = 1:4, shape2 = 1:4) => 4-vector (as expected)
        ##       3) pbeta(c(0.8, 0.6), shape1 = 1:2, shape2 = 1:2) => 2-vector (as expected)
        ##       4) pbeta(c(0.8, 0.6), shape1 = 1:4, shape2 = 1:4) => This is
        ##          equal to the recycled pbeta(c(0.8, 0.6, 0.8, 0.6), shape1 = 1:4, shape2 = 1:4)
        vapply(seq_len(m), function(k) { # iterate over rows k of u
                sum( # sum() over i
                    vapply(seq_len(n), function(i)
                        prod( pbeta(u[k,], shape1 = R[i,], shape2 = n + 1 - R[i,]) ), # prod() over j
                        NA_real_)) / (n + offset)
        }, NA_real_)
    },
    "dbeta" = {
        if(log) {
            vapply(seq_len(m), function(k) { # iterate over rows k of u
                lsum( # lsum() over i
                    vapply(seq_len(n), function(i) {
                        ## k and i are fixed now
                        lx.k.i <- sum( dbeta(u[k,], shape1 = R[i,], shape2 = n + 1 - R[i,], log = TRUE) ) # log(prod()) = sum(log()) over j for fixed k and i
                    },
                    NA_real_)) - log(n + offset)
            }, NA_real_)
        } else { # as for 'pbeta', just with dbeta()
            vapply(seq_len(m), function(k) { # iterate over rows k of u
                sum( # sum() over i
                    vapply(seq_len(n), function(i)
                        prod( dbeta(u[k,], shape1 = R[i,], shape2 = n + 1 - R[i,]) ), # prod() over j
                        NA_real_)) / (n + offset)
            }, NA_real_)
        }
    },
    "checkerboard" = {
        R. <- t(R) # (d, n)-matrix
        vapply(seq_len(m), function(k) # iterate over rows k of u
            sum(apply(pmin(pmax(n * u[k,] - R. + 1, 0), 1), 2, prod)) / (n + offset),
            NA_real_) # pmin(...) = (d, n)-matrix
    },
    stop("Wrong 'smoothing'"))
}

## ----gener--------------------------------------------------------------------
## Generate copula data based on which to build empirical copula
n <- 1000 # sample size
d <- 2 # dimension
set.seed(271)
U <- rCopula(n, copula = gumbelCopula(iTau(gumbelCopula(), tau = 0.5), dim = d))

## ----eval-pts-----------------------------------------------------------------
## Evaluation points
n.grid <- 26
sq <- seq(0, 1, length.out = n.grid)
u <- as.matrix(expand.grid("u[1]" = sq, "u[2]" = sq, KEEP.OUT.ATTRS = FALSE))
## ... for the density of the empirical beta copula
delta <- 0.01
sq. <- seq(delta, 1-delta, length.out = n.grid)
u. <- as.matrix(expand.grid("u[1]" = sq., "u[2]" = sq., KEEP.OUT.ATTRS = FALSE))

## ----eval-cops----------------------------------------------------------------
## Evaluate empirical copulas
emp.cop.none   <- empirical_copula(u,  U = U)
emp.cop.pbeta  <- empirical_copula(u,  U = U, smoothing = "pbeta")
emp.cop.dbeta  <- empirical_copula(u., U = U, smoothing = "dbeta")
lemp.cop.dbeta <- empirical_copula(u., U = U, smoothing = "dbeta", log = TRUE)
stopifnot(all.equal(lemp.cop.dbeta, log(emp.cop.dbeta))) # sanity check
emp.cop.chck   <- empirical_copula(u,  U = U, smoothing = "checkerboard")

## ----gen-Bern-----------------------------------------------------------------
## Generate some data with Bernoulli margins
tau <- 0.5
gc <- normalCopula(iTau(normalCopula(), tau = tau))
n <- 1e4  ## <-- use this  for scientific reason
n <- 1000 # <<< prefer to decrease *.html and final package size
set.seed(271)
U <- rCopula(n, copula = gc)
p <- c(1/2, 3/4) # marginal probabilities p_1, p_2 of being 1
stopifnot(0 < p, p < 1)
X <- sapply(1:2, function(j) qbinom(U[,j], size = 1, prob = p[j]))
mean(rowSums(X) == 0) # p = P(X_1 = 0, X_2 = 0) = C(1-p_1, 1-p_2)

## ----fn-F---------------------------------------------------------------------
## Define generalized distributional transform (for one margin)
F <- function(X, Lambda, p)
{
    is0 <- X == 0
    res <- numeric(length(X))
    res[is0]  <- Lambda[is0] * (1-p)
    res[!is0] <- (1-p) + Lambda[!is0] * p
    res
}

## ----U.Pi.M-------------------------------------------------------------------
## Compute U's for two different Lambda
Lambda.Pi <- matrix(runif(n * 2), ncol = 2) # independence case for Lambda
Lambda.M <- cbind(Lambda.Pi[,1], Lambda.Pi[,1]) # comonotone case for Lambda
U.Pi <- sapply(1:2, function(j) F(X[,j], Lambda = Lambda.Pi[,j], p = p[j]))
U.M  <- sapply(1:2, function(j) F(X[,j], Lambda = Lambda.M [,j], p = p[j]))

## ----plot-margins, fig.align = "center", fig.width = 6, fig.height = 6, fig.show = "hide"----
## Check margins
opar <- par(pty = "s")
plot(U.Pi[,1], pch = ".", ylab = expression(U[1]~"under independent"~Lambda*"'s"))
plot(U.Pi[,2], pch = ".", ylab = expression(U[2]~"under independent"~Lambda*"'s"))
plot(U.M [,1], pch = ".", ylab = expression(U[1]~"under equal"~Lambda*"'s"))
plot(U.M [,2], pch = ".", ylab = expression(U[2]~"under equal"~Lambda*"'s"))
par(opar)

## ----check-mass---------------------------------------------------------------
## Check the mass distributions
check <- function(U, p)
    c(mean(U[,1] <= 1-p[1] & U[,2] <= 1-p[2]), # P(U_1 <= 1-p_1, U_2 <= 1-p_2)
      mean(U[,1] <= 1-p[1] & U[,2] >  1-p[2]), # P(U_1 <= 1-p_1, U_2 >  1-p_2)
      mean(U[,1] >  1-p[1] & U[,2] <= 1-p[2]), # P(U_1 >  1-p_1, U_2 <= 1-p_2)
      mean(U[,1] >  1-p[1] & U[,2] >  1-p[2])) # P(U_1 >  1-p_1, U_2 >  1-p_2)
probs.Pi <- check(U.Pi, p = p)
probs.M  <- check(U.M,  p = p)
stopifnot(all.equal(sum(probs.Pi), 1), all.equal(probs.Pi, probs.M))

## ----pl_mass-fn---------------------------------------------------------------
## Plot function for the mass distributions
plot_mass <- function(U, probs, p, text)
{
    plot(U, pch = ".", xlab = expression(U[1]), ylab = expression(U[2]))
    abline(v = 1-p[1], h = 1-p[2], lty = 2, lwd = 2, col = "royalblue3")
    mtext(text, side = 4, adj = 0, line = 0.5) # label
    text((1-p[1])/2, (1-p[2])/2, labels = sprintf("%1.4f", probs[1]), font = 2, col = "royalblue3")
    text((1-p[1])/2, 1-p[2] + p[2]/2, labels = sprintf("%1.4f", probs[2]), font = 2, col = "royalblue3")
    text(1-p[1] + p[1]/2, (1-p[2])/2, labels = sprintf("%1.4f", probs[3]), font = 2, col = "royalblue3")
    text(1-p[1] + p[1]/2, 1-p[2] + p[2]/2, labels = sprintf("%1.4f", probs[4]), font = 2, col = "royalblue3")
}

## ----plot-mass, fig.align = "center", fig.width = 6, fig.height = 6, fig.show = "hold"----
## Plot the mass distributions with probabilities of falling in the four regions
opar <- par(pty = "s")
plot_mass(U.Pi, probs = probs.Pi, p = p, text = expression("Independent"~Lambda*"'s"))
plot_mass(U.M,  probs = probs.M,  p = p, text = expression("Comonotone"~Lambda*"'s"))
par(opar)

## ----wireframes, fig.align = "center", fig.width = 6, fig.height = 6, fig.show = "hold"----
## Define and plot empirical copulas
ec.Pi <- empCopula(U.Pi)
ec.M  <- empCopula(U.M)
wireframe2(ec.Pi, FUN = pCopula, n.grid = 81, draw.4.pCoplines = FALSE)
wireframe2(ec.M,  FUN = pCopula, n.grid = 81, draw.4.pCoplines = FALSE)