Nothing
#' Calculate the Wasserstein Distance
#'
#' The Wasserstein distance is calculated based on the Euclidean distance
#' between two copula PDFs on a grid, or between a copula PDF and
#' pseudo-observations.
#'
#' @param copula \R object of class '\code{\linkS4class{cyl_copula}}'.
#' or '\code{\linkS4class{Copula}}' (package '\pkg{copula}', only 2-dimensional).
#' @param copula2 \R object of class '\code{\linkS4class{cyl_copula}}'.
#' or '\code{\linkS4class{Copula}}' (package '\pkg{copula}', only 2-dimensional).
#' @param theta (alternatively) \link[base]{numeric} \link[base]{vector} of angles
#' (measurements of a circular variable) or "circular" component of pseudo-observations.
#' @param x (alternatively) \link[base]{numeric} \link[base]{vector} of step lengths
#' (measurements of a linear variable) or "linear" component of pseudo-observations.
#' @param n_grid \link[base]{integer} number of grid cells at which the PDF of the copula(s) is calculated
#' Default is 2500
#' @param p \link[base]{integer} power (1 or 2) to which the Euclidean distance
#' between points is taken in order to compute transportation costs.
#'
#' @details Note that when comparing 2 copula PDFs (i.e. \code{theta = NULL} and \code{x = NULL}),
#' the calculated Wasserstein distance will depend on the number of grid cells
#' (\code{n_grid}) used to approximate the PDFs. The distance will converge to a certain
#' value with a higher number of grid cells, but the computational time will also increase.
#' The default of 2500 seems to be a good (empirically determined) compromise.
#' The same is true when calculating the Wasserstein distance between a copula
#' PDF and pseudo-observations. There, it is also important to only compare distances
#' that use the same number of observations.
#'
#' The code is based on the functions \code{transport::\link[transport]{wasserstein}()}
#' and \code{transport::\link[transport]{semidiscrete}()}.
#'
#' @return
#' \link[base]{numeric}, the pth Wasserstein distance
#'
#' @examples
#' set.seed(1234)
#' copula1 <- cyl_quadsec(0.1)
#' copula2 <- cyl_rect_combine(copula::frankCopula(2))
#' wasserstein(copula=copula1,copula2 = copula2,p=2,n_grid=20)
#' wasserstein(copula=copula1,copula2 = copula1,p=2,n_grid=20)
#' wasserstein(copula=copula1,copula2 = copula::frankCopula(2),p=2,n_grid=20)
#'
#' sample <- rjoint(10,
#' copula1,
#' marginal_1 = list(name = "vonmises", coef = list(0, 1)),
#' marginal_2 = list(name = "weibull", coef = list(3,4))
#' )
#'
#' wasserstein(copula=copula1, theta=sample[,1], x=sample[,2], n_grid=20)
#' @export
#'
wasserstein <-
function(copula,
copula2 = NULL,
theta = NULL,
x = NULL,
n_grid = 2500,
p = 2) {
#validate input
tryCatch({
check_arg_all(list(
check_argument_type(copula,
type = "Copula"),
check_argument_type(copula,
type = "cyl_copula")
)
, 2)
check_arg_all(list(
check_argument_type(copula2,
type = "Copula"),
check_argument_type(copula2,
type = "cyl_copula"),
check_argument_type(copula2,
type = "NULL")
)
,
3)
check_arg_all(list(
check_argument_type(theta,
type = "numeric"),
check_argument_type(theta,
type = "NULL")
)
, 2)
check_arg_all(list(
check_argument_type(x,
type = "numeric"),
check_argument_type(x,
type = "NULL")
)
, 2)
check_arg_all(check_argument_type(
n_grid,
type = "numeric",
integer = TRUE,
length = 1,
lower = 0
)
,
1)
check_arg_all(check_argument_type(
p,
type = "numeric",
length = 1,
integer = TRUE,
values = c(1, 2)
)
,
1)
},
error = function(e) {
error_sound()
rlang::abort(conditionMessage(e))
})
if ((is.null(theta) || is.null(x)) && is.null(copula2)) {
stop("provide either data (theta and x) or a second copula to compare copula to")
}
if ((!is.null(theta) || !is.null(x)) && !is.null(copula2)) {
stop("provide either data (theta and x) or a second copula to compare copula to")
}
if (is.null(copula2)) {
if (is.null(theta) || is.null(x)) {
stop("if copula2 is NULL, step lengths and turn angles must be provided")
}
}
p <- as.integer(p)
n_grid <- as.integer(sqrt(n_grid))
grid <- expand.grid(seq(1 / (2 * n_grid), 1 - 1 / (2 * n_grid), 1 / n_grid),
seq(1 / (2 * n_grid), 1 - 1 / (2 * n_grid), 1 / n_grid))
dens_cop1 <- matrix(dcylcop(as.matrix(grid), copula), ncol = n_grid)
dens_cop1 <- dens_cop1 / sum(dens_cop1)
dens_pgrid_cop1 <-
transport::pgrid(dens_cop1, boundary = c(0, 1, 0, 1))
if (is.null(copula2)) {
data <- cbind(theta, x) %>% na.omit()
n_sample <- nrow(data)
pseudo_obs <- copula::pobs(data, ties.method = "average")
sample_wpp <-
transport::wpp(matrix(c(pseudo_obs[, 2], 1 - pseudo_obs[, 1]), ncol = 2),
mass = rep(dens_pgrid_cop1$totcontmass /
n_sample, n_sample))
out_cop <-
transport::semidiscrete(a = dens_pgrid_cop1, b = sample_wpp, p = p)
distance <- out_cop$wasserstein_dist
} else{
dens_cop2 <- matrix(dcylcop(as.matrix(grid), copula2), ncol = n_grid)
dens_cop2 <- dens_cop2 / sum(dens_cop2)
dens_pgrid_cop2 <-
transport::pgrid(dens_cop2, boundary = c(0, 1, 0, 1))
if (p == 2) {
distance <-
transport::wasserstein(
a = dens_pgrid_cop1,
b = dens_pgrid_cop2,
p = p,
prob = TRUE,
method = "networkflow"
)
} else{
distance <-
transport::wasserstein(
a = dens_pgrid_cop1,
b = dens_pgrid_cop2,
p = p,
prob = TRUE
)
}
}
return(distance)
}
#' Cramér-von-Mises criterion
#'
#' Calculate the Cramér-von-Mises criterion
#' with a p-value (via parametric bootstrapping) to assess the goodness of fit
#' of a parametric copula compared to the empirical copula of the data.
#'
#' @param copula \R object of class '\code{\linkS4class{cyl_copula}}' or
#' '\code{\linkS4class{Copula}}' (package '\pkg{copula}'.
#' @param theta \link[base]{numeric} \link[base]{vector} of angles
#' (measurements of a circular variable) or "circular" component of pseudo-observations.
#' @param x \link[base]{numeric} \link[base]{vector} of step lengths
#' (measurements of a linear variable) or "linear" component of pseudo-observations.
#' @param n_bootstrap \link[base]{integer} number of bootstrap replicates. If
#' \code{n_bootstrap} is smaller than 1, no p-value is calculated.
#' @param parameters \link[base]{vector} of \link[base]{character} strings
#' holding the names of the parameters to be optimized when using the bootstrap
#' procedure.
#' These can be any parameters in \code{copula@@parameters}. Default is to
#' optimize the first 2 parameters. \code{parameters} has no effect if \code{copula}
#' is of class '\code{\linkS4class{Copula}}' (package '\pkg{copula}'
#' @param optim.method \link[base]{character} string, optimizer used in
#' \code{\link[stats]{optim}()}, can be
#' \code{"Nelder-Mead"}, \code{"BFGS"}, \code{"CG"}, \code{"L-BFGS-B"},
#' \code{"SANN"}, or \code{"Brent"}.
#' @param optim.control \link[base]{list} of additional controls passed to
#' \code{\link[stats]{optim}()}.
#'
#' @details The Cramér-von Misses criterion is calculated as the sum of the squared
#' differences between the empirical copula and the parametric copula, \code{copula},
#' evaluated at the pseudo-observations obtained from \code{theta} and \code{x}.
#' If the bootstrap procedure is used, a random sample is drawn from \code{copula}
#' and converted to pseudo-observations. A new (set of) copula parameter(s) is then
#' fit to those pseudo-observations using maximum likelihood (function
#' \code{cylcop::\link[cylcop]{fit_cylcop_ml}()}).
#'
#' @references \insertRef{Genest2008}{cylcop}
#'
#' @return
#' A list of length 2 containing the Cramér-von Mises criterion and the p-value.
#' @export
#'
#' @examples
#' set.seed(1234)
#' sample <- rcylcop(100,cyl_cubsec(0.1, 0.1))
#'
#' opt_cop <- fit_cylcop_ml(copula = cyl_quadsec(),
#' theta = sample[,1],
#' x = sample[,2],
#' parameters = "a",
#' start = 0
#' )$copula
#' cramer_vonmises(opt_cop,
#' theta = sample[,1],
#' x = sample[,2],
#' n_bootstrap=5)
#'
cramer_vonmises <- function(copula,
theta,
x,
n_bootstrap = 1000,
parameters = NULL,
optim.method = "L-BFGS-B",
optim.control = list(maxit = 100)) {
#validate input
tryCatch({
check_arg_all(list(
check_argument_type(copula,
type = "Copula"),
check_argument_type(copula,
type = "cyl_copula")
)
, 2)
check_arg_all(check_argument_type(theta,
type = "numeric")
, 1)
check_arg_all(check_argument_type(x,
type = "numeric")
, 1)
check_arg_all(check_argument_type(
n_bootstrap,
type = "numeric",
integer = TRUE,
length = 1
)
,
1)
check_arg_all(list(
check_argument_type(parameters,
type = "character"),
check_argument_type(parameters,
type = "NULL")
)
, 2)
},
error = function(e) {
error_sound()
rlang::abort(conditionMessage(e))
})
data <- cbind(theta, x) %>% na.omit()
#calculate empirical
points <- data
pseudo_obs <- pobs(points, ties.method = "average")
emp_cop <-
C.n(
pseudo_obs,
X = pseudo_obs,
smoothing = "none",
ties.method = "average"
)
CvM <- sum((emp_cop - pcylcop(pseudo_obs, copula)) ^ 2)
p_val <- NULL
if (n_bootstrap > 1) {
if (any(is(copula) %in% "cyl_copula")) {
if (is.null(parameters)) {
if (length(copula@param.names) == 1) {
parameters <- copula@param.names
} else{
parameters <- copula@param.names[1:2]
}
}
start_val <-
copula@parameters[match(parameters, copula@param.names)]
par_num <-
tryCatch(
param_num_checked(copula, param_val = start_val, param_name = parameters),
error = function(e) {
rlang::abort(conditionMessage(e))
}
)
} else{
start_val <- copula@parameters
}
#start timer
ptm <- proc.time()
time <- 0
CvM_vec <- rep(0, n_bootstrap)
for (i in seq_len(n_bootstrap)) {
curr_opt_cop <- 1
while (is.numeric(curr_opt_cop)) {
points <- rcylcop(nrow(data), copula)
if (any(is(copula) %in% "cyl_copula")) {
curr_opt_cop <-
tryCatch(
suppressMessages(
cylcop::fit_cylcop_ml(
copula = copula,
theta = points[, 1],
x = points[, 2],
parameters = parameters,
start = start_val,
optim.method = optim.method,
optim.control = optim.control
)
),
warning = function(w) {
curr_opt_cop + 1
}
)
} else{
curr_opt_cop <-
tryCatch(
suppressMessages(
copula::fitCopula(
copula = copula,
data = points,
start = start_val,
optim.method = optim.method,
optim.control = optim.control
)
),
warning = function(w) {
curr_opt_cop + 1
}
)
}
if (is.numeric(curr_opt_cop) && curr_opt_cop > 10)
stop("failed to converge 10 times in a row")
}
curr_opt_cop <- curr_opt_cop$copula
pseudo_obs <- pobs(points, ties.method = "average")
emp_cop <- C.n(
points,
X = pseudo_obs,
smoothing = "none",
ties.method = "average"
)
CvM_vec[i] <- sum((emp_cop - pcylcop(points, curr_opt_cop)) ^ 2)
if (cylcop.env$silent == FALSE) {
#get time for 10 steps, if n_bootstrap is at least that large, to set up the progress bar
if (i == 10) {
time <- (proc.time() - ptm)[3] %>% as.double()
time <- (time / 10 * n_bootstrap)
if (time > 20) {
message(
"Estimated time to generate bootstrap sample: ",
floor(time / 60),
" minutes, ",
round(time - 60 * floor(time / 60)),
" seconds"
)
pb <- utils::txtProgressBar(min = 1, max = n_bootstrap)
}
}
if (time > 20) {
utils::setTxtProgressBar(pb, i)
}
}
}
if (cylcop.env$silent == FALSE) {
#close progress bar
if (time > 20) {
close(pb)
done_sound()
}
}
p_val <- (1 / (n_bootstrap + 1)) * length(which(CvM_vec > CvM))
}
out <- list(CvM = CvM, p_val = p_val)
return(out)
}
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.