rm(list=ls()) ### To clear namespace library(knitr) # opts_chunk$set(fig.width=12, fig.height=8, fig.path='Figs/', # echo=TRUE, warning=FALSE, message=FALSE) opts_chunk$set(fig.width= 7, fig.height= 5, echo=TRUE, warning=FALSE, message=FALSE, eval=FALSE)
A well studied nonparametric estimator of probability density functions based on the i.i.d. observations $X_1, \ldots, X_n$, is the kernel density estimator. A natural generalization to the case where random variables have been contaminated with random additive noise, say $Y_1, \ldots, Y_n$, with $Y_i = X_i + \epsilon_i$, $i = 1, \ldots n$ is the so kernel deconvolution density estimator (kdde). The error distribution is usually assumed to be known.
There are two popular R
packages that provide functions to do this, decon
and deamer
(REFERENCES). The former package performs kdde with known error distribution allowing the suer to pick the bandwidth selection method between several methods summarized in Delaigle and Gijbels (2004)(REFERENCES). The latter package covers the univariate case and it allows known error case, repeated observations and pure error case, the bandwidth selection method follows Comte (2011???)
The package kerdec
provides functions to handle univariate and bivariate kdde with unknown error distribution, by using the empirical characteristic function, or known errors, which can be Laplace or Gaussian. Also, functions are provided to estimate the parameters based on a sample of errors or panel data.
A kernel deconvolution density estimator is a nonparametric approximation to the density of a random vector $\boldsymbol{X}$ based on samples that have been contaminated with additive noise, say, samples from $$\boldsymbol{Y} = \boldsymbol{X} + \boldsymbol{\epsilon},$$ where $\boldsymbol{\epsilon}$ has mean zero and it is independent of $\boldsymbol{X}$.
Let $\boldsymbol{Y}1, \ldots, \boldsymbol{Y}_n$ be a sample of independent and identically distributed (_iid) random vectors from the latter model. Assume (for the moment) that the error distribution ($\boldsymbol{\epsilon}$) is perfectly known. The kernel deconvolution density estimator (KDDE) is given by the KDDE formula: $$ \hat{f}{\boldsymbol{X}}(\boldsymbol{x}) = \frac{1}{(2\pi)^d} \int e^{-\imath \langle \boldsymbol{x}, \boldsymbol{t} \rangle} \hat{\phi}{\boldsymbol{Y}, n}(\boldsymbol{t}) \frac{K^{Ft}(H^{-1/2}\boldsymbol{t})} {\phi_{\boldsymbol{\epsilon}}(\boldsymbol{t})} d\boldsymbol{t}, $$ where $\langle \cdot, \cdot \rangle$ is the usual inner product in $\mathbb{R}^d$; $$\hat{\phi}{\boldsymbol{Y}, n}(\boldsymbol{t}) = n^{-1} \sum_j \exp \left( \imath \langle \boldsymbol{t}, \boldsymbol{Y}_j \rangle\right)$$ is the empirical characteristic function of the contaminated sample; $K:\mathbb{R}^d \rightarrow \mathbb{R}$ is a symmetric kernel and $$K^{Ft}(\boldsymbol{s}) = \int e^{\imath \langle \boldsymbol{s}, \boldsymbol{x} \rangle}K(\boldsymbol{x}) d\boldsymbol{x}$$ its Fourier transform; $$\phi{\boldsymbol{\epsilon}}(\boldsymbol{t}) = \int e^{\imath \langle \boldsymbol{t}, \boldsymbol{x} \rangle} f_\boldsymbol{\epsilon}(\boldsymbol{x}) d\boldsymbol{x}$$ is the characteristic function of the error, and $H$ is a positive definite bandwidth matrix.
At this point, we have assumed that the error distribution is perfectly known (so it is its characteristic function). This is, however, very rarely known. More realistic sampling scenarios that can allow to approximate $\phi_{\boldsymbol{\epsilon}}(\cdot)$ include (a) having an independent sample of errors besides the contaminated sample and (b) having repeated (contaminated) measurements per subject so differences can be obtained to approximate the characteristic function of the error. Both options are available in kerdec
and they are discussed with detail in "Sampling Scenarios" sections.
As it can be seen in the KDDE formula, a kernel function $K(\cdot)$ must be selected. For theoretical and numerical reasons, it is more convenient to select a kernel function whose Fourier transform, $K^{Ft}(\cdot)$, has compact support. Kernel functions used in traditional (i.e. error free) settings, like Gaussian, uniform and Epanechnikov kernels, do not usually meet this condition. The kerdec
package offers 5 choices for kernel, including the sinc kernel, which is perhaps the most popular kernel in deconvolution. Go to the section "Kernels" to learn the details and usage of these kernels in KDDE.
We will consider two approaches to provide bandwidth. The first of them consists on minimizing the AMISE by plugging in a normal reference estimator to R(f''). The second is my crossed-validation. We will only consider cases with one smoothing parameter.
We present examples of univariate kernel density deconvolution, in particular: - Sampling scenarios - Kernel functions - Bandwidth selection - Regular and nonregular grids
Kernel deconvolution density estimation (KDDE) arises in mainly three sampling scenarios.
We present in the following subsections the basic usage of 'kerdec' for each of these scenarios. We highlight that other features can be specified, such as the way the error distribution is approximated or the bandwidth selection method; look at the corresponding section for that matter.
The simplest case of kernel deconvolution density estimation is when the measurement error is perfectly known and the density estimator is computed based on a contaminated sample.
The example below corresponds to a sample that has been generated from a gamma distribution and contaminated with Laplacian noise.
library(kerdec) # Load package set.seed(666) # Set seed ## Signal parameters shape <- 5 # X distr. shape par. rate <- 5 # X distr. rate par. ## Settings and samples for all the cases n <- 150 # Sample size sd_error <- 0.2 # std. error of error distr. X <- rgamma(n, shape, rate) # Uncontaminated sample Y <- X + rlaplace(n, sd = sd_error) # Contaminated sample ## Estimate density out <- kerdec_dens(smp = Y, error_scale_par = sd_error) ## Show structure of out str(out)
The variable out
is a list that contains the output of the density. Now we put some of the element in a dataframe to plot it afterwards. Observe that kerdec_dens
only requires a vector with the contaminated sample and the standard deviation of the error. By default, the error is assumed to be Laplace; this can be modified by adding the parameter error_dist
to be equal to normal
for Gaussian errors.
library(dplyr) # To handle data frames library(tidyr) # To handle data frames library(ggplot2) # Load generate plot ## Create dataframe dat <- data_frame(x = out$x_eval, estimate = out$f_vals, true = dgamma(out$x_eval, shape, rate)) %>% gather(key = density, value = value, -x) ## Plot densities ggplot(dat, aes(x, value)) + geom_line(aes(color = density, linetype = density)) + # theme_bw() + theme(legend.position = c(0.9, 0.9))
Assume that we want to compute the KDDE based on a contaminated sample $Y_1, \ldots, Y_n$ and the distribution of the measurement error is unknown, but there is an extra sample of errors (independent of the contaminated sample), $\epsilon_1, \ldots, \epsilon_M$, that can used to approximate the error distribution. The size of the contaminated sample does not need to be the same size of the sample of errors.
Three approximations to the error distributions based on the sample of errors are implemented:
Laplace error. Set error_dist = Laplace
(Though, this is the default option). The scale parameter of the Laplace distribution is found via maximum likelihood estimation.
Gaussian error. Set error_dist = normal
. The scale parameter of the normal distribution is found via maximum likelihood estimation.
Nonparametric error. Set error_dist = none
. The empirical characteristic function is used to approximate the characteristic function of the error in the deconvolution formula.
library(kerdec) # Load package set.seed(666) # Set seed ## Signal parameters shape <- 5 # X distr. shape par. rate <- 5 # X distr. rate par. ## Settings and samples for all the cases n <- 150 # Sample size sd_error <- 0.2 # std. error of error distr. X <- rgamma(n, shape, rate) # Uncontaminated sample Y <- X + rlaplace(n, sd = sd_error) # Contaminated sample extra_errors <- rlaplace(2*n, sd = sd_error) # Extra errors ## Estimate density out <- kerdec_dens(smp = Y, error_smp = extra_errors) ## Show structure of out str(out)
We finally plot the estimated values of the density. Observe that the sample size of the extra sample of errors was actually different from the sample size of the contaminated sample.
library(dplyr) # To handle data frames library(tidyr) # To handle data frames library(ggplot2) # Load generate plot ## Create dataframe dat <- data_frame(x = out$x_eval, estimate = out$f_vals, true = dgamma(out$x_eval, shape, rate)) %>% gather(key = density, value = value, -x) ## Plot densities ggplot(dat, aes(x, value)) + geom_line(aes(color = density, linetype = density)) + # theme_bw() + theme(legend.position = c(0.9, 0.9))
Consider the case where one does not know the error distribution nor have an extra sample of errors to approximate it. Instead, one observes more than one contaminated replicates of $n$ individuals, say
| Individual | Replicate 1 | Replicate 2 | Replicate 3 | | :--------: | :---------: | :---------: | :---------: | | 1 | 2.12 | * | 1.98 | | 2 | 0.34 | 0.52 | 0.21 | | 3 | * | 0.98 | 1.12 | | $\vdots$ | $\vdots$ | $\vdots$ | $\vdots$ | | n | 1.34 | 1.56 | 1.61 |
Note that we are allowing some missing observations. The following example consists of 100 individuals, 5 contaminated observations per individual with some of these observations missing.
library(kerdec) # Load package set.seed(666) # Set seed ## Settings and samples for all the cases n <- 100 # Sample size l <- 5 # Number of columns m <- n + 10 # Error sample size shape <- 5 # X distr. shape par. rate <- 5 # X distr. rate par. sd_error <- .2 # std. error of error distr. X <- rgamma(n, shape, rate) # Uncontaminated sample # Panel of errors with # some NAs errors_aux <- rlaplace(n*l, sd = sd_error) nas_aux <- c(NA, 1)[sample(1:2, size = n*l, prob = c(0.1, 0.9), replace = TRUE)] eps_panel <- matrix(errors_aux*nas_aux, n, l) #eps_panel <- matrix(errors_aux, n, l) Y_panel <- sweep(x = eps_panel, MARGIN = 1, STATS = X, FUN = "+") # Contaminated in panel ## Estimate density out <- kerdec_dens(smp = Y_panel) ## Show structure of out str(out)
Finally, we generate the plot associated to the kernel deconvolution from repeated measurements.
library(dplyr) # To handle data frames library(tidyr) # To handle data frames library(ggplot2) # Load generate plot ## Create dataframe dat <- data_frame(x = out$x_eval, estimate = out$f_vals, true = dgamma(out$x_eval, shape, rate)) %>% gather(key = density, value = value, -x) ## Plot densities ggplot(dat, aes(x, value)) + geom_line(aes(color = density, linetype = density)) + # theme_bw() + theme(legend.position = c(0.9, 0.9))
library(kerdec) # Load package set.seed(666) ## Settings and samples for all the cases n <- 130 # Sample size l <- 5 # Number of columns m <- n + 10 # Error sample size shape <- 5 # X distr. shape par. rate <- 5 # X distr. rate par. sd_error <- .2 # std. error of error distr. X <- rgamma(n, shape, rate) # Uncontaminated sample eps_panel <- matrix(rlaplace(n*l, sd = sd_error), n, l) # Panel of errors eps <- rlaplace(m, sd = sd_error) # Pure errors Y <- X + eps_panel[, 1] # Contaminated sample Y_panel <- sweep(x = eps_panel, MARGIN = 1, STATS = X, FUN = "+") # Contaminated in panel plot(function(x) dgamma(x, shape, rate), 0, 4) ## ------------------------------------------------------------------- ## ------------------------------------------------------------------- ## Case 1: Normal error. Sigma known. ## ------------------------------------------------------------------- h <- select_bw(smp = Y, error_scale_par = sd_error, error_dist = "normal", bw_interval = c(0.12, 0.25)) dens <- kerdec_dens(smp = Y, error_scale_par = sd_error, error_dist = "normal", h = h$h, lower = min(Y) - sd(Y), upper = max(Y) + sd(Y)) with(dens, { true <- dgamma(x_eval, shape, rate) plot(range(x_eval), range(c(f_vals, true)), type = "n", xlab = "x", ylab = "values") lines(x_eval, f_vals, col = 3, lty = 2, lwd = 1.5) lines(x_eval, true, col = 4, lty = 3, lwd = 1.5) }) legend("topright", legend = c("approx", "true"), lwd = 1.5, col = 3:4, lty = 2:3) ## ------------------------------------------------------------------- ## Case 2: Known Laplace error ## ------------------------------------------------------------------- dens <- kerdec_dens(smp = Y, error_scale_par = sd_error, error_dist = "laplace", lower = min(Y) - sd(Y), upper = max(Y) + sd(Y)) with(dens, { true <- dgamma(x_eval, shape, rate) plot(range(x_eval), range(c(f_vals, true)), type = "n", xlab = "x", ylab = "values") lines(x_eval, f_vals, col = 3, lty = 2, lwd = 1.5) lines(x_eval, true, col = 4, lty = 3, lwd = 1.5) }) legend("topright", legend = c("approx", "true"), lwd = 1.5, col = 3:4, lty = 2:3) ## ------------------------------------------------------------------- ## Case 3: Normal error approximated with indep. sample of errors ## ------------------------------------------------------------------- dens <- kerdec_dens(smp = Y, error_smp = eps, error_dist = "normal", lower = min(Y) - sd(Y), upper = max(Y) + sd(Y)) with(dens, { true <- dgamma(x_eval, shape, rate) plot(range(x_eval), range(c(f_vals, true)), type = "n", xlab = "x", ylab = "values") lines(x_eval, f_vals, col = 3, lty = 2, lwd = 1.5) lines(x_eval, true, col = 4, lty = 3, lwd = 1.5) }) legend("topright", legend = c("approx", "true"), lwd = 1.5, col = 3:4, lty = 2:3) ## ------------------------------------------------------------------- ## Case 4: Laplace error approximated with indep. sample of errors ## ------------------------------------------------------------------- dens <- kerdec_dens(smp = Y, error_smp = eps, error_dist = "laplace", lower = min(Y) - sd(Y), upper = max(Y) + sd(Y)) with(dens, { true <- dgamma(x_eval, shape, rate) plot(range(x_eval), range(c(f_vals, true)), type = "n", xlab = "x", ylab = "values") lines(x_eval, f_vals, col = 3, lty = 2, lwd = 1.5) lines(x_eval, true, col = 4, lty = 3, lwd = 1.5) }) legend("topright", legend = c("approx", "true"), lwd = 1.5, col = 3:4, lty = 2:3) ## ------------------------------------------------------------------- ## Case 5: Unknown error approximated with indep. sample of errors ## ------------------------------------------------------------------- dens <- kerdec_dens(smp = Y, error_smp = eps, lower = min(Y) - sd(Y), upper = max(Y) + sd(Y)) with(dens, { true <- dgamma(x_eval, shape, rate) plot(range(x_eval), range(c(f_vals, true)), type = "n", xlab = "x", ylab = "values") lines(x_eval, f_vals, col = 3, lty = 2, lwd = 1.5) lines(x_eval, true, col = 4, lty = 3, lwd = 1.5) }) legend("topright", legend = c("approx", "true"), lwd = 1.5, col = 3:4, lty = 2:3) ## ------------------------------------------------------------------- ## Case 6: Panel data. Keep fist column. Normal errors. ## ------------------------------------------------------------------- dens <- kerdec_dens(smp = Y_panel, error_smp = eps_panel, error_dist = "normal", lower = min(Y) - sd(Y), upper = max(Y) + sd(Y)) with(dens, { true <- dgamma(x_eval, shape, rate) plot(range(x_eval), range(c(f_vals, true)), type = "n", xlab = "x", ylab = "values") lines(x_eval, f_vals, col = 3, lty = 2, lwd = 1.5) lines(x_eval, true, col = 4, lty = 3, lwd = 1.5) }) legend("topright", legend = c("approx", "true"), lwd = 1.5, col = 3:4, lty = 2:3) ## ------------------------------------------------------------------- ## Case 7: Panel data. Keep fist column. Laplace errors. ## ------------------------------------------------------------------- dens <- kerdec_dens(smp = Y_panel, error_smp = eps_panel, error_dist = "Laplace", lower = min(Y) - sd(Y), upper = max(Y) + sd(Y)) with(dens, { true <- dgamma(x_eval, shape, rate) plot(range(x_eval), range(c(f_vals, true)), type = "n", xlab = "x", ylab = "values") lines(x_eval, f_vals, col = 3, lty = 2, lwd = 1.5) lines(x_eval, true, col = 4, lty = 3, lwd = 1.5) }) legend("topright", legend = c("approx", "true"), lwd = 1.5, col = 3:4, lty = 2:3) ## ------------------------------------------------------------------- ## Case 8: Panel data. Keep fist column. Unknown errors. ## ------------------------------------------------------------------- dens <- kerdec_dens(smp = Y_panel, error_smp = eps_panel, lower = min(Y) - sd(Y), upper = max(Y) + sd(Y)) with(dens, { true <- dgamma(x_eval, shape, rate) plot(range(x_eval), range(c(f_vals, true)), type = "n", xlab = "x", ylab = "values") lines(x_eval, f_vals, col = 3, lty = 2, lwd = 1.5) lines(x_eval, true, col = 4, lty = 3, lwd = 1.5) }) legend("topright", legend = c("approx", "true"), lwd = 1.5, col = 3:4, lty = 2:3) ## ------------------------------------------------------------------- ## Case 9: Panel data. Average by individual. Normal errors. ## ------------------------------------------------------------------- dens <- kerdec_dens(smp = Y_panel, error_smp = eps_panel, error_dist = "normal", panel_proc = "take_aver", lower = min(Y) - sd(Y), upper = max(Y) + sd(Y)) with(dens, { true <- dgamma(x_eval, shape, rate) plot(range(x_eval), range(c(f_vals, true)), type = "n", xlab = "x", ylab = "values") lines(x_eval, f_vals, col = 3, lty = 2, lwd = 1.5) lines(x_eval, true, col = 4, lty = 3, lwd = 1.5) }) legend("topright", legend = c("approx", "true"), lwd = 1.5, col = 3:4, lty = 2:3) ## ------------------------------------------------------------------- ## Case 10: Panel data. Average by invidual. Laplace errors. ## ------------------------------------------------------------------- dens <- kerdec_dens(smp = Y_panel, error_smp = eps_panel, error_dist = "laplace", panel_proc = "take_aver", lower = min(Y) - sd(Y), upper = max(Y) + sd(Y)) with(dens, { true <- dgamma(x_eval, shape, rate) plot(range(x_eval), range(c(f_vals, true)), type = "n", xlab = "x", ylab = "values") lines(x_eval, f_vals, col = 3, lty = 2, lwd = 1.5) lines(x_eval, true, col = 4, lty = 3, lwd = 1.5) }) legend("topright", legend = c("approx", "true"), lwd = 1.5, col = 3:4, lty = 2:3) ## ------------------------------------------------------------------- ## Case 11: Panel data. Average by invidual. Unknown errors. ## ------------------------------------------------------------------- dens <- kerdec_dens(smp = Y_panel, error_smp = eps_panel, panel_proc = "take_aver", lower = min(Y) - sd(Y), upper = max(Y) + sd(Y)) with(dens, { true <- dgamma(x_eval, shape, rate) plot(range(x_eval), range(c(f_vals, true)), type = "n", xlab = "x", ylab = "values") lines(x_eval, f_vals, col = 3, lty = 2, lwd = 1.5) lines(x_eval, true, col = 4, lty = 3, lwd = 1.5) }) legend("topright", legend = c("approx", "true"), lwd = 1.5, col = 3:4, lty = 2:3)
Information about the error distribution can be extracted from panel data by taking differences or observations for the same individual, which produces differences of errors. For example, if $Y_{ij_1} = X_i + \epsilon_{ij_1}$ and $Y_{ij_2} = X_i + \epsilon_{ij_2}$ are two observations with measurement error for the same individual, then $Y_{ij_1} - Y_{ij_2} = \epsilon_{ij_1} - \epsilon_{ij_2}$.
If there are only two observations per individual, this process is as simple as taking the difference between these two. However, when there are more than two observations per individual, there is not a unique way to do this. It is possible to take all possible differences, resulting on a sample of non-independent differences; to obtain independent samples, although fewer differences would be available, or a compromise between them.
The function process_differences()
considers three approaches which are described below. For the examples, consider a panel data array of contaminated observations with $n$ individuals and every measurement taken $l$ times, that is,
$$
\begin{matrix}
Y_{11} & \cdots & Y_{1k} \
Y_{21} & \cdots & Y_{2k} \
\vdots & \ddots & \vdots \
Y_{n1} & \cdots & Y_{nk}
\end{matrix}.
$$
Consider the panel data $$ \begin{matrix} Y_{11} & \cdots & Y_{1k} \ Y_{21} & \cdots & Y_{2k} \ \vdots & \ddots & \vdots \ Y_{n1} & \cdots & Y_{nk} \end{matrix}, $$ where $Y_{ij} = X_i + \epsilon_{ij}$. We want to approximate the characteristic function of the error, $\phi_{\epsilon}(\cdot)$ based on these data.
Recall that $Y_{ij_1} - Y_{ij_2} = \epsilon_{ij_1} - \epsilon_{ij_2}$, that is, it is possible of extract differences of errors by taking differences of observations for the same individual. In vignette differences, we describe three possible ways to do this using the function process_differences()
. Let $L$ denote the number of differences (thus, $L = nk(k - 1)/2$, $n(k - 1)$ or $n\lfloor k\rfloor/2$). Let $\delta_1, \ldots, \delta_L$ be such differences.
Assume that the error distribution is symmetric and its characteristic function is never zero. The latter condition is true is $\boldsymbol{\epsilon}$ is Laplace or normal, for instance. Then $\phi_{\boldsymbol{\delta}}(\boldsymbol{t}) = \phi_{\boldsymbol{\epsilon}}^2(\boldsymbol{t})$
$$ \hat{\phi}{\epsilon}(t) = \sqrt{\lvert \hat{\phi}{\boldsymbol{\delta}} (\boldsymbol{t}) \rvert} $$
Therefore, $$ \hat{\phi}{\bar{\epsilon}}(t) = \left[\hat{\phi}{\epsilon}(t/k)\right]^k $$
library(kerdec) set.seed(1810) n <- 150 k <- 4 smp <- matrix(rnorm(n*k), n, k) t <- seq(-10, 10, .05) averaged_smp <- rowMeans(smp); ecf1 <- ecf_mod(t, averaged_smp) # ecf2 <- error_cf_approx(t = t, smp = smp, diff_method = 1) ecf3 <- exp(-t^2/2/k) plot(range(t), range(c(ecf1, ecf3)), type = "n", col = "green", ylab = "value", xlab = "t") lines(t, ecf1, col = "green") # lines(t, ecf2, col = "blue") lines(t, ecf3, col = "red")
Information about the error distribution can be extracted from panel data by taking differences or observations for the same individual, which produces differences of errors. For example, if $Y_{ij_1} = X_i + \epsilon_{ij_1}$ and $Y_{ij_2} = X_i + \epsilon_{ij_2}$ are two observations with measurement error for the same individual, then $Y_{ij_1} - Y_{ij_2} = \epsilon_{ij_1} - \epsilon_{ij_2}$.
If there are only two observations per individual, this process is as simple as taking the difference between these two. However, when there are more than two observations per individual, there is not a unique way to do this. It is possible to take all possible differences, resulting on a sample of non-independent differences; to obtain independent samples, although fewer differences would be available, or a compromise between them.
The function process_differences()
considers three approaches which are described below. For the examples, consider a panel data array of contaminated observations with $n$ individuals and every measurement taken $l$ times, that is,
$$
\begin{matrix}
Y_{11} & \cdots & Y_{1k} \
Y_{21} & \cdots & Y_{2k} \
\vdots & \ddots & \vdots \
Y_{n1} & \cdots & Y_{nk}
\end{matrix}.
$$
We consider all the differences $Y_{ij_1} - Y_{ij_2}$, for individual $i \in {1, \ldots, n}$ and $1 \leq j_2 < j_1 \leq l$. Therefore, it must be $nl(l-1)/2$ differences.
$$ \begin{bmatrix} 0 & 0 \ 1 & 1 \end{bmatrix} $$
library(kerdec) smp <- matrix(c(0, 1, 3, 6, 10, 1, 2, 4, 7, 11), 2, 5, byrow = TRUE) vec1 <- process_differences(smp, method = 1) vec2 <- process_differences(smp, method = 2) vec3 <- process_differences(smp, method = 3) c(vec1) c(vec2) c(vec3)
Panel data structure $Y_{ij}$ for individual $i = 1, \ldots, n$ and occasion $j = 1, \ldots, l$.
The empirical characteristic function (ecf) of the random sample $\boldsymbol{X}1, \ldots, \boldsymbol{X}_n$, where $\boldsymbol{X}_i$'s are random vectors of size $d$ is defined as $$ \frac{1}{n}\sum{j=1}^{n}\exp \left( \imath \langle \boldsymbol{X}_j, \boldsymbol{t}\rangle\right), $$ where $\langle \cdot, \cdot \rangle$ is the usual inner product in $\mathbb{R}^d$. We present here functions do to find empirical characteristic function or its module and imaginary and real parts.
Empirical characteristic functions are crucial for kernel deconvolution formulas and they are also used when the error distribution is not considered known. On the other hand, evaluate them can be time-consuming, thus, we have speeded up the evaluation time by implementing it in C++
and allowing to compute only the part that we might be interested in, that is, the real or imaginary part, the modulus or the empirical characteristic function itself.
All the functions here described calculate empirical characteristic functions of samples from univariate or multivariate random variables.
Having a function that computes the empirical characteristic function (ecf) is practical, but if what the user needs is only the real part of the empirical characteristic function it is more efficient to do exactly that. The same is true for the modulus and the imaginary part. Taking the real part of the ecf takes twice the time it takes to compute directly the real part.
Therefore, functions have been implemented in C++
for doing such operations directly, but only the wrappers in R
are available to users.
Here it is an example of three ways to find the imaginary part of the ecf.
ecf_im_cpp (way1): is the fastest, but it requires arguments to be matrices with the appropriate arguments.
ecf_imag (way2): is almost as fast as the function above, but it allows vectors when it applies (either when the random sample is unidimensional or the characteristic function is evaluated at only one point).
ecf_cpp (way3): takes twice the time, but it also computes the real part.
library(kerdec) ## Generate sample and define two points to evaluate the functions smp <- rpois(50, lambda = 100) t <- c(-1, 1) ## Evaluate the functions and arrange them in a dataframe. dat <- data.frame( way1 = kerdec:::ecf_im_cpp(matrix(t), matrix(smp)), way2 = ecf_imag(t, smp), way3 = Im(ecf(t, smp)) ) ## kable function is used for displaying the table nicely. knitr::kable(dat) ## We may also test the speed: if(require(microbenchmark)){ ctimes <- microbenchmark( kerdec:::ecf_im_cpp(matrix(t), matrix(smp)), ecf_imag(t, smp), Im(ecf(t, smp)) ) ctimes }
This is a brief demonstration of the use of univariate functions for empirical characteristic functions. Observe that they all can be evaluated in vectors.
library(kerdec) ## Parameters of Poisson distribution, sample size and grid size lambda <- 10 n <- 150 # Sample size m <- 200 # Grid size t <- seq(-2, 2, length.out = m) # Evaluation grid smp <- rpois(n, lambda) # Random sample ## Compute empirical characteristic values and characteristic function ## values real <- ecf_real(t, smp) imag <- ecf_imag(t, smp) modu <- ecf_mod(t, smp) true <- exp(lambda*(exp(1i*t) - 1)) ## Make plots # Real plot(t, real, type = "l", col = 3) lines(t, Re(true), col = 4) title("Real part of empirical and true characteristic functions") legend("topleft", legend = c("ecf", "cf"), col = 3:4, lwd = 2) # Imaginary plot(t, imag, type = "l", col = 3) lines(t, Im(true), col = 4) title("Imaginary part of empirical and true characteristic functions") legend("topleft", legend = c("ecf", "cf"), col = 3:4, lwd = 2) # Modulus plot(t, modu, type = "l", col = 3, ylim = c(-0.05, 1)) lines(t, Mod(true), col = 4) title("Modulus of empirical and true characteristic functions") legend("topleft", legend = c("ecf", "cf"), col = 3:4, lwd = 2)
The functions on this package work for ecf of random vectors. Since visualization of ecf of random vectors is not straightforward, we present an example of a random vector of size three and show a plot of how the absolute error (of the ecf evaluated in $10^3$ points) decreases as the sample size increases.
It is worth mentioning that the ecf functions receive a vector of size $m$ or a $m \times d$ matrix as evaluation grid and a $n \times d$ as sample, where $m$ is the number of points where the ecf will be evaluated, $d$ the size of the random vector and $n$ the sample size.
library(kerdec) ## Parameters of bivariate normal distribution mu <- c(-1, 0, 1) sig <- diag(1:3) ## Characteristic function ## s is n x d phi <- function(s) { complex(modulus = exp(- 0.5*rowSums(s*(s %*% sig))), argument = s %*% mu) } ## Random sample of dimension 3. rndm <- function(n) { cbind(rnorm(n, mu[1], sig[1, 1]), rnorm(n, mu[2], sig[2, 2]), rnorm(n, mu[3], sig[3, 3])) } ## Create evaluation grid. grid_1d <- seq(-3, 3, length.out = 10) grid <- as.matrix(expand.grid(t1 = grid_1d, t2 = grid_1d, t3 = grid_1d)) ## Compute absolute error of modules n <- seq(500, 5000, by = 500) abs_error <- sapply(n, function(nn) sum(abs(Mod(phi(grid)) - ecf_mod(t = grid, smp = rndm(nn))))) ## Generate plot plot(n, abs_error, type = "b", col = "magenta")
Kernel whose Fourier transform are preferred in kernel deconvolution methods due to their theoretical and numerical advantages. In fact, only those Fourier transforms are used, rather than the kernel functions themselves. Thus, we have a function that can calculates kernel and product kernels.
The explicit expressions of kernel functions are not required in kernel deconvolution methods, but rather their Fourier transform. This package provides formulas for those Fourier transforms. We show below how they look like and then we show how do kernel functions look like, but we do use the package fourierin
to obtain these values from their Fourier transforms.
The five kernel functions considered have boundedly supported Fourier transforms, specifically on $[-1, 1]$. We abuse on the nomenclature of the kernels, since "triangular", "tricube" and "triweight" do not correspond to the kernel functions with these names, but to the fact that their Fourier transform is proportional to these kernels. In the same manner, "flat-top" refers to a kernel whose Fourier transform is flat around zero.
## Load package library(kerdec) ## Define evaluation grid and kernels. t <- seq(-1.3, 1.7, by = 0.01) ker <- c("sinc", "triangular", "triweight", "tricube", "flat-top") ## Evaluate Ft of kernels. values <- sapply(1:5, function(idx) ft_kernel(t, ker[idx])) ## Generate plots. plot(t, rep(0, length(t)), type = "n", ylim = c(-0.01, 1), ylab = "value", main = "Fourier transform of kernels") for(idx in 1:5){ lines(t, values[, idx], col = idx + 1, lty = idx, lwd = 2) } legend("topright", ker, col = 2:6, lty = 1:5, lwd = 2)
Now we obtain the values of the kernel functions from their Fourier transforms.
## Load package library(kerdec) ## Write the name of all kernels available ker <- c("sinc", "triangular", "triweight", "tricube", "flat-top") names(ker) <- ker ## Find the actual kernel values from their Fourier transform vals <- lapply(ker, function(krnl) fourierin::fourierin(function(t) ft_kernel(t, krnl), -1, 1, -9, 9, -1, -1, resol = 256)) ## Obtain grid and extract the real part of every set of values. x <- vals$sinc$w vals <- lapply(vals, function(out) c(Re(out$values))) ## Generate plot plot(x, vals[["sinc"]], type = "n", ylab = "value", main = "Kernel functions") abline(h = 0, col = "lightgray") for(idx in 1:5){ lines(x, vals[[idx]], col = idx + 1, lty = idx, lwd = 2) } legend("topright", ker, col = 2:6, lty = 1:5, lwd = 2)
Although in the previous section we showed how to use the function that provides the Fourier transform of a particular set of kernel functions, we show a bare minimum example to illustrate how to use these function.
## Load package library(fourierin) library(kerdec) ## Define grid t <- seq(-1.3, 1.3, 0.01) ## Compute values ## We obtain the same result with ker = 2 values <- ft_kernel(t, ker = "triangular") ## Print plot plot(t, values, main = "Triangular kernel", lwd = 2, type = "l", col = "magenta")
Finally we show a minimal example of how to use the function ft_kernel
to find the Fourier transform of a product kernel. Observe that the grid has to be introduced as a matrix with $d$ columns and not as a $d$-dimensional array.
## Load package library(fourierin) ## Define grid t1 <- seq(-1.1, 1.1, length.out = 150) t2 <- t1 t <- expand.grid(t1 = t1, t2 = t1) ## Compute values ## We obtain the same result with ker = 2 values <- ft_kernel(t, ker = "triangular") values <- ft_kernel(t, ker = "flat-top") ## Convert dataframe to matrix values <- matrix(values, length(t1)) ## Print surface plot persp(x = t1, y = t2, z = values, theta = 120, lwd = .2, phi = 30, shade = .8) ## Now a contour plot contour(x = t1, y = t2, z = values, nlevels = 10)
library(kerdec) # Load package set.seed(666) ## Settings and samples for all the cases n <- 130 # Sample size l <- 5 # Number of columns m <- n + 10 # Error sample size shape <- 5 # X distr. shape par. rate <- 5 # X distr. rate par. sd_error1 <- .2 # std. error of error distr. sd_error2 <- .3 # std. error of error distr. ## Generate biv. samples. X1 <- rgamma(n, shape, rate) # Uncontaminated sample X2 <- rgamma(n, shape, rate) # Uncontaminated sample eps_panel1 <- matrix(rlaplace(n*l, sd = sd_error1), n, l) # Panel of errors eps_panel2 <- matrix(rlaplace(n*l, sd = sd_error2), n, l) # Panel of errors eps1 <- rlaplace(m, sd = sd_error1) # Pure errors eps2 <- rlaplace(m, sd = sd_error2) # Pure errors Y1 <- X1 + eps_panel1[, 1] # Contaminated sample Y2 <- X2 + eps_panel2[, 1] # Contaminated sample Y_panel1 <- sweep(x = eps_panel1, MARGIN = 1, STATS = X1, FUN = "+") # Contaminated in panel Y_panel2 <- sweep(x = eps_panel2, MARGIN = 1, STATS = X2, FUN = "+") # Contaminated in panel plot(function(x) dgamma(x, shape, rate), 0, 4) dens <- kerdec_dens2D(smp1 = Y1, smp2 = Y2, kernel = "sinc", error_smp1 = eps1, error_smp2 = eps2, error_scale_par1 = sd_error1, error_scale_par2 = sd_error2, error_dist = "normal", lower = c(min(Y1) - 2*sd(Y1), min(Y2) - 2*sd(Y2)), upper = c(max(Y1) + 2*sd(Y1), max(Y2) + 2*sd(Y2)), h = 0.18) persp(z = Re(dens$vals), theta = 0.5)
d
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.