# Local polynomial regression with the option for explicit boundary adjustment
# Effective Programming Practices for Economists
# Prof. von Gaudecker
# Winter 2021/22, M.Sc. Economics, Bonn University
# Sven Jacobs
################################################################################
#' lpreba: Local polynomial regression with option for explicit boundary adjustment
#'
#' The package implements the local polynomial regression estimator
#' of arbitrary degree. Estimates for the regression function (conditional mean),
#' its first and second derivative can be obtained. Moreover, computation of
#' effective kernels (i.e. effectively assigned weights in the kernel smoothing
#' process) is provided. Different compactly supported kernels (Uniform,
#' Epanechnikov, etc.) are available. For local constant regression
#' (Nadaraya-Watson), explicit boundary adjustment via boundary kernels can be
#' conducted. Different boundary kernels are available. Additional functionality
#' includes bandwidth selection via cross-validation and the computation of
#' asymptotic confidence intervals.
#'
#' @section Functions:
#' \tabular{ll}{
#' \code{LP} \tab Local polynomial estimator. \cr
#' \code{NW_boundary} \tab Boundary-adjusted Nadaraya-Watson estimator. \cr
#' \code{CV_error_fun} \tab Leave-one-out cross-validation (LOOCV) error. \cr
#' \code{bw_CV_fun} \tab CV optimal bandwidth. \cr
#' \code{confidence_intervals_LP} \tab Asymptotic confidence intervals for the local polynomial estimator.
#'}
#'
#' @author Sven Jacobs.
#'
#' @docType package
#' @name lpreba
NULL
#############################
# Auxiliary functions #
#############################
# Creating standard basis vectors
make_basis <- function(length, position) {
replace(numeric(length), position, 1)
}
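# e.g. make_basis(3L, 2L) returns c(0, 1, 0); used below to extract single
# rows of the hat-type matrices in the local polynomial estimator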
#################
# Kernels #
#################
# Uniform
#' Uniform kernel
#'
#' @param u Kernel argument.
#'
#' @export
uniform <- function(u) {ifelse(abs(u) <= 1, 0.5, 0)}
# Triangular
#' Triangular kernel
#'
#' @param u Kernel argument.
#'
#' @export
triangular <- function(u) {ifelse(abs(u) <= 1, 1 - abs(u), 0)}
# Epanechnikov
#' Epanechnikov kernel
#'
#' @param u Kernel argument.
#'
#' @export
epanechnikov <- function(u) {ifelse(abs(u) <= 1, 3/4 * (1 - u^2), 0)}
# Biweight
#' Biweight kernel
#'
#' @param u Kernel argument.
#'
#' @export
biweight <- function(u) {ifelse(abs(u) <= 1, 15/16 * (1 - u^2)^2, 0)}
# Triweight
#' Triweight kernel
#'
#' @param u Kernel argument.
#'
#' @export
triweight <- function(u) {ifelse(abs(u) <= 1, 35/32 * (1 - u^2)^3, 0)}
# Tricube
#' Tricube kernel
#'
#' @param u Kernel argument.
#'
#' @export
tricube <- function(u) {ifelse(abs(u) <= 1, 70/81 * (1 - abs(u)^3)^3, 0)}
# Cosine
#' Cosine kernel
#'
#' @param u Kernel argument.
#'
#' @export
cosine <- function(u) {ifelse(abs(u) <= 1, pi/4 * cos(pi/2 * u), 0)}
# List with all kernels
kernel_list <- list(uniform, triangular, epanechnikov, biweight,
triweight, tricube, cosine)
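# Sanity check (sketch, not run at build time): every kernel above integrates
# to one over its support [-1, 1], i.e. is a proper density:
# sapply(kernel_list, function(K) stats::integrate(K, -1, 1)$value)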
####################################
# Local polynomial estimator #
####################################
#' Local polynomial estimator
#'
#' Local polynomial regression estimator of arbitrary degree.
#'
#' The degree of the local polynomial estimator can be chosen. Two special
#' cases are the Nadaraya-Watson estimator (\code{degree = 0L}) and the
#' local linear estimator (\code{degree = 1L}).
#'
#' The most important compactly supported (on the interval \[-1, 1\]) kernels
#' are available: \code{uniform}, \code{triangular}, \code{epanechnikov},
#' \code{biweight}, \code{triweight}, \code{tricube}, \code{cosine}.
#'
#' The effective kernel at an evaluation point is the set of effectively
#' assigned weights in the smoothing process (see e.g. Hastie and Loader, 1993).
#'
#' To obtain estimates for the first derivative of the regression function,
#' the degree has to be at least \code{1L}.
#' To obtain estimates for the second derivative of the regression function,
#' the degree has to be at least \code{2L}.
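#'
#' At an evaluation point \eqn{x}, the estimator solves the locally weighted
#' least-squares problem
#' \deqn{\min_{\beta_0, \ldots, \beta_p} \sum_{i=1}^{n} \left( Y_i - \sum_{j=0}^{p} \beta_j (X_i - x)^j \right)^2 K\left( \frac{X_i - x}{bw} \right),}
#' and returns \eqn{\hat{m}(x) = \hat{\beta}_0}, \eqn{\hat{m}'(x) = \hat{\beta}_1}
#' and \eqn{\hat{m}''(x) = 2 \hat{\beta}_2}.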
#'
#' @param x Evaluation points (vector).
#' @param X Data for the regressor (vector).
#' @param Y Data for the regressand (vector).
#' @param kernel Kernel (function). Default is \code{epanechnikov}.
#' @param bw Bandwidth (scalar).
#' @param degree Degree of the locally fitted polynomial (integer). Default is \code{1L}.
#'
#' @return List containing:
#' \item{estimates}{Estimates for the regression function at the evaluation points (vector).}
#' \item{effective_kernels}{Effective kernels at the evaluation points (matrix).}
#' \item{slopes}{Estimates for the first derivative (slope) of the regression function at the evaluation points (vector).}
#' \item{curvatures}{Estimates for the second derivative (curvature) of the regression function at the evaluation points (vector).}
#' @export
#'
#' @references Hastie, T. and C. Loader (1993).
#' ``Local regression: Automatic kernel carpentry''.
#' \emph{Statistical Science} 8 (2), pp. 120--129.
#'
#' @examples
#' m_fun <- function(x) {sin(2*pi*x)} # True regression function
#' n <- 100 # Sample size
#' X <- seq(0, 1, length.out = n) # Data for the regressor
#' m_X <- m_fun(X) # True values of regression function
#' epsilon <- rnorm(n, sd = 0.25) # Error term
#' Y <- m_X + epsilon # Data for the regressand
#' bw <- 0.2 # Bandwidth
#' x <- seq(0, 1, length.out = n/2) # Evaluation points
#'
#' output_LP <- LP(x = x, X = X, Y = Y, kernel = epanechnikov, bw = bw, degree = 1L)
#' estimates_LP <- output_LP$estimates
#' effective_kernels_LP <- output_LP$effective_kernels
#' slopes_LP <- output_LP$slopes
#' curvatures_LP <- output_LP$curvatures # Yields only NAs since degree >= 2L is required
LP <- function(x, X, Y, kernel = epanechnikov, bw, degree = 1L) {
# input: - x: evaluation points (vector)
# - X: data for the regressor (vector)
# - Y: data for the regressand (vector)
# - kernel: kernel (function), with default
# - bw: bandwidth (scalar)
# - degree: degree of the locally fitted polynomial (integer),
# with default
# output: - list containing
# - estimates: estimates for the regression function
# at the evaluation points (vector)
# - effective_kernels: effective kernels
# at the evaluation points (matrix)
# - slopes: estimates for the first derivative (slope) of the
# regression function at the evaluation points (vector)
# - curvatures: estimates for the second derivative (curvature) of the
# regression function at the evaluation points (vector)
# Error messages
if (length(X) != length(Y)) {stop("X and Y have to be of equal length.")}
if (bw <= 0) {stop("Bandwidth has to be strictly positive.")}
if (!any(sapply(kernel_list, function(k) isTRUE(all.equal(k, kernel))))) {
stop("Only supported kernels can be chosen.")
}
if (typeof(degree) != "integer" || degree < 0L) {
stop("degree has to be of type integer and nonnegative.")
}
# Warning messages
if ((degree %% 2) == 0) {warning(paste(strwrap("It is strongly recommended to
use an odd degree, as odd-degree polynomial fits outperform even-degree ones.
In particular, odd-degree fits are boundary adaptive."), collapse = "\n"))}
# General messages
if (degree > 1) {message(paste(strwrap("It is recommended to use the lowest
suitable (odd) degree, since the bandwidth is used to control the model
complexity."), collapse = "\n"))}
X_matrix <- matrix(NA, length(X), degree + 1)
estimates <- rep(NA, length(x))
effective_kernels <- matrix(NA, length(x), length(Y))
slopes <- rep(NA, length(x))
curvatures <- rep(NA, length(x))
for (i in seq_along(x)) {
for (j in 1:(degree + 1)) {
X_matrix[, j] <- (X - x[i])^(j - 1)
}
W_matrix <- diag(kernel((X - x[i])/bw))
auxiliary_matrix <- solve(t(X_matrix)%*%W_matrix%*%X_matrix) %*%
t(X_matrix) %*% W_matrix
effective_kernels[i, ] <- t(make_basis(degree + 1, 1)) %*% auxiliary_matrix
estimates[i] <- effective_kernels[i, ] %*% Y
if (degree >= 1) {
slopes[i] <- t(make_basis(degree + 1, 2)) %*% auxiliary_matrix %*% Y
}
if (degree >= 2) {
curvatures[i] <- 2 * t(make_basis(degree + 1, 3)) %*% auxiliary_matrix %*% Y
}
}
list(estimates = estimates, effective_kernels = effective_kernels,
slopes = slopes, curvatures = curvatures)
}
##########################
# Boundary kernels #
##########################
# Note:
#
# The following boundary kernels rely on Müller (1991).
# However, Müller's left boundary kernels are right boundary kernels for us, and vice versa.
# The reason is a different convention for the kernel argument in the estimator:
# Müller uses K(x - X), whereas we define K(X - x).
# Right boundary kernels are therefore obtained by mirroring the argument,
# i.e. by evaluating a left boundary kernel at -u (see NW_boundary below).
# Left Uniform kernels
#' Left Uniform kernels
#'
#' Left Uniform kernels derived from Müller (1991, Table 1).
#'
#' @param u Kernel argument.
#' @param rho Determines the strength of the boundary adjustment (scalar).
#'
#' @export
#'
#' @references Müller, H.-G. (1991).
#' ``Smooth optimum kernel estimators near endpoints''.
#' \emph{Biometrika} 78 (3), pp. 521--530.
uniform_left <- function(u, rho) {
ifelse(u >= -rho & u <= 1, 1/(1+rho)*(1 + 3*((1-rho)/(1+rho))^2 - 6*((1-rho)/(1+rho)^2)*u), 0)
}
# Left Epanechnikov kernels
#' Left Epanechnikov kernels
#'
#' Left Epanechnikov kernels derived from Müller (1991, Table 1).
#'
#' @param u Kernel argument.
#' @param rho Determines the strength of the boundary adjustment (scalar).
#'
#' @export
#'
#' @references Müller, H.-G. (1991).
#' ``Smooth optimum kernel estimators near endpoints''.
#' \emph{Biometrika} 78 (3), pp. 521--530.
epanechnikov_left <- function(u, rho) {
ifelse(u >= -rho & u <= 1, 6*(1-u)*(rho+u)*(1/(1+rho)^3)*(1 + 5*((1-rho)/(1+rho))^2 - 10*((1-rho)/(1+rho)^2)*u), 0)
}
# Left Biweight kernels
#' Left Biweight kernels
#'
#' Left Biweight kernels derived from Müller (1991, Table 1).
#'
#' @param u Kernel argument.
#' @param rho Determines the strength of the boundary adjustment (scalar).
#'
#' @export
#'
#' @references Müller, H.-G. (1991).
#' ``Smooth optimum kernel estimators near endpoints''.
#' \emph{Biometrika} 78 (3), pp. 521--530.
biweight_left <- function(u, rho) {
ifelse(u >= -rho & u <= 1, 30*(1-u)^2*(rho+u)^2*(1/(1+rho)^5)*(1 + 7*((1-rho)/(1+rho))^2 - 14*((1-rho)/(1+rho)^2)*u), 0)
}
# Left Triweight kernels
#' Left Triweight kernels
#'
#' Left Triweight kernels derived from Müller (1991, Table 1).
#'
#' @param u Kernel argument.
#' @param rho Determines the strength of the boundary adjustment (scalar).
#'
#' @export
#'
#' @references Müller, H.-G. (1991).
#' ``Smooth optimum kernel estimators near endpoints''.
#' \emph{Biometrika} 78 (3), pp. 521--530.
triweight_left <- function(u, rho) {
ifelse(u >= -rho & u <= 1, 140*(1-u)^3*(rho+u)^3*(1/(1+rho)^7)*(1 + 9*((1-rho)/(1+rho))^2 - 18*((1-rho)/(1+rho)^2)*u), 0)
}
# Note:
#
# The following boundary kernels rely on Müller and Wang (1994).
# These alternative kernels have greater asymptotic efficiency
# at the cost of discontinuity at their endpoints.
# Left alternative Uniform kernels
#' Left alternative Uniform kernels
#'
#' Left alternative Uniform kernels derived from Müller and Wang (1994, Table 1).
#'
#' @param u Kernel argument.
#' @param rho Determines the strength of the boundary adjustment (scalar).
#'
#' @export
#'
#' @references Müller, H.-G. and J.-L. Wang (1994).
#' ``Hazard rate estimation under random censoring with varying kernels and bandwidths''.
#' \emph{Biometrics} 50 (1), pp. 61--76.
uniform_alt_left <- function(u, rho) {
ifelse(u >= -rho & u <= 1, 2/(1+rho)^3*(-3*(1-rho)*u + 2*(1-rho+rho^2)), 0)
}
# Left alternative Epanechnikov kernels
#' Left alternative Epanechnikov kernels
#'
#' Left alternative Epanechnikov kernels derived from Müller and Wang (1994, Table 1).
#'
#' @param u Kernel argument.
#' @param rho Determines the strength of the boundary adjustment (scalar).
#'
#' @export
#'
#' @references Müller, H.-G. and J.-L. Wang (1994).
#' ``Hazard rate estimation under random censoring with varying kernels and bandwidths''.
#' \emph{Biometrics} 50 (1), pp. 61--76.
epanechnikov_alt_left <- function(u, rho) {
ifelse(u >= -rho & u <= 1, 12/(1+rho)^4*(1-u)*(-(1-2*rho)*u + 0.5*(3*rho^2-2*rho+1)), 0)
}
# Left alternative Biweight kernels
#' Left alternative Biweight kernels
#'
#' Left alternative Biweight kernels derived from Müller and Wang (1994, Table 1).
#'
#' @param u Kernel argument.
#' @param rho Determines the strength of the boundary adjustment (scalar).
#'
#' @export
#'
#' @references Müller, H.-G. and J.-L. Wang (1994).
#' ``Hazard rate estimation under random censoring with varying kernels and bandwidths''.
#' \emph{Biometrics} 50 (1), pp. 61--76.
biweight_alt_left <- function(u, rho) {
ifelse(u >= -rho & u <= 1, 15/(1+rho)^5*(1-u)^2*(rho+u)*(-2*u*(5*(1-rho)/(1+rho)-1) + 3*rho-1 + 5*(1-rho)^2/(1+rho)), 0)
}
# Left alternative Triweight kernels
#' Left alternative Triweight kernels
#'
#' Left alternative Triweight kernels derived from Müller and Wang (1994, Table 1).
#'
#' @param u Kernel argument.
#' @param rho Determines the strength of the boundary adjustment (scalar).
#'
#' @export
#'
#' @references Müller, H.-G. and J.-L. Wang (1994).
#' ``Hazard rate estimation under random censoring with varying kernels and bandwidths''.
#' \emph{Biometrics} 50 (1), pp. 61--76.
triweight_alt_left <- function(u, rho) {
ifelse(u >= -rho & u <= 1, 70/(1+rho)^7*(1-u)^3*(rho+u)^2*(-2*u*(7*(1-rho)/(1+rho)-1) + 3*rho-1 + 7*(1-rho)^2/(1+rho)), 0)
}
# List with all boundary kernels
boundary_kernel_list <- list(uniform_left, epanechnikov_left, biweight_left, triweight_left,
uniform_alt_left, epanechnikov_alt_left, biweight_alt_left, triweight_alt_left)
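# Sanity check (sketch, not run at build time): for rho = 1, i.e. at interior
# points, the boundary kernels reduce to their interior counterparts:
# u <- seq(-1, 1, length.out = 101)
# all.equal(epanechnikov_left(u, rho = 1), epanechnikov(u))      # TRUE
# all.equal(epanechnikov_alt_left(u, rho = 1), epanechnikov(u))  # TRUE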
#####################################################
# Boundary-adjusted Nadaraya-Watson estimator #
#####################################################
#' Boundary-adjusted Nadaraya-Watson estimator
#'
#' Explicit boundary adjustment via boundary kernels for the
#' Nadaraya-Watson estimator (local constant regression).
#'
#' When applying the Nadaraya-Watson estimator with a compactly supported
#' (on the interval \[-1, 1\]) kernel to data of a regressor that is
#' compactly supported, boundary effects arise over the region
#' \[\code{boundary_left}, \code{boundary_left} + \code{bw}) \eqn{\cup}
#' (\code{boundary_right} - \code{bw}, \code{boundary_right}\].
#' That is, the bias is of larger order compared to the interior.
#' The original order can be preserved by using special so-called boundary
#' kernels when applying Nadaraya-Watson at the boundaries
#' (e.g. Gasser and Müller, 1979).
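#'
#' Concretely, the boundary-adjusted estimator remains a weighted average,
#' \deqn{\hat{m}(x) = \sum_{i=1}^{n} w_i(x) Y_i, \qquad w_i(x) = \frac{K_{\rho}((X_i - x)/bw)}{\sum_{j=1}^{n} K_{\rho}((X_j - x)/bw)},}
#' where \eqn{K_{\rho}} is a boundary kernel (with \eqn{\rho} the distance of
#' \eqn{x} to the boundary in bandwidth units) over the boundary region and
#' the interior kernel elsewhere.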
#'
#' ``Smooth optimum boundary kernels'' (Müller, 1991, Table 1) are available:
#' \code{uniform_left}, \code{epanechnikov_left}, \code{biweight_left},
#' \code{triweight_left}.
#' Moreover, an alternative version of these kernels with greater asymptotic
#' efficiency at the cost of discontinuity at their endpoints
#' (Müller and Wang, 1994, Table 1) is available:
#' \code{uniform_alt_left}, \code{epanechnikov_alt_left}, \code{biweight_alt_left},
#' \code{triweight_alt_left}.
#'
#' The effective kernel at an evaluation point is the set of effectively
#' assigned weights in the smoothing process (see Hastie and Loader, 1993).
#'
#' @param x Evaluation points (vector).
#' @param X Data for the regressor (vector).
#' @param Y Data for the regressand (vector).
#' @param bw Bandwidth (scalar).
#' @param kernel_interior Kernel used in the interior (function). Default is \code{epanechnikov}.
#' @param kernel_left Left boundary kernels (function). Default is \code{epanechnikov_left}.
#' @param boundary_left Lower boundary of the support of X (scalar).
#' @param boundary_right Upper boundary of the support of X (scalar).
#'
#' @return List containing:
#' \item{estimates}{Estimates for the regression function at the evaluation points (vector).}
#' \item{effective_kernels}{Effective kernels at the evaluation points (matrix).}
#' @export
#'
#' @references Gasser, T. and H.-G. Müller (1979).
#' ``Kernel estimation of regression functions''.
#' In: \emph{Smoothing Techniques for Curve Estimation}.
#' Ed. by T. Gasser and M. Rosenblatt.
#' Berlin: Springer, pp. 23--68.
#'
#' Müller, H.-G. (1991).
#' ``Smooth optimum kernel estimators near endpoints''.
#' \emph{Biometrika} 78 (3), pp. 521--530.
#'
#' Müller, H.-G. and J.-L. Wang (1994).
#' ``Hazard rate estimation under random censoring with varying kernels and bandwidths''.
#' \emph{Biometrics} 50 (1), pp. 61--76.
#'
#' Hastie, T. and C. Loader (1993).
#' ``Local regression: Automatic kernel carpentry''.
#' \emph{Statistical Science} 8 (2), pp. 120--129.
#'
#' @examples
#' m_fun <- function(x) {sin(2*pi*x)} # True regression function
#' n <- 100 # Sample size
#' X <- seq(0, 1, length.out = n) # Data for the regressor
#' m_X <- m_fun(X) # True values of regression function
#' epsilon <- rnorm(n, sd = 0.25) # Error term
#' Y <- m_X + epsilon # Data for the regressand
#' bw <- 0.2 # Bandwidth
#' x <- seq(0, 1, length.out = n/2) # Evaluation points
#'
#' output_NW_boundary <- NW_boundary(x = x, X = X, Y = Y, bw = bw,
#' kernel_interior = epanechnikov, kernel_left = epanechnikov_left,
#' boundary_left = 0, boundary_right = 1)
#' estimates_NW_boundary <- output_NW_boundary$estimates
#' effective_kernels_NW_boundary <- output_NW_boundary$effective_kernels
NW_boundary <- function(x, X, Y, bw,
kernel_interior = epanechnikov, kernel_left = epanechnikov_left,
boundary_left, boundary_right) {
# input: - x: evaluation points (vector)
# - X: data for the regressor (vector)
# - Y: data for the regressand (vector)
# - bw: bandwidth (scalar)
# - kernel_interior: kernel used in the interior (function), with default
# - kernel_left: left boundary kernels (function), with default
# - boundary_left: lower boundary of the support of X (scalar)
# - boundary_right: upper boundary of the support of X (scalar)
# output: - list containing
# - estimates: estimates for the regression function
# at the evaluation points (vector)
# - effective_kernels: effective kernels
# at the evaluation points (matrix)
# Error messages
if (length(X) != length(Y)) {stop("X and Y have to be of equal length.")}
if (bw <= 0) {stop("Bandwidth has to be strictly positive.")}
if (!any(sapply(kernel_list, function(k) isTRUE(all.equal(k, kernel_interior))))) {
stop("Only supported kernels can be chosen.")
}
if (!any(sapply(boundary_kernel_list, function(k) isTRUE(all.equal(k, kernel_left))))) {
stop("Only supported boundary kernels can be chosen.")
}
if (boundary_left > min(X)) {
stop("The lower boundary cannot be larger than the smallest observed value of the regressor X.")
}
if (boundary_right < max(X)) {
stop("The upper boundary cannot be smaller than the largest observed value of the regressor X.")
}
absolute_weights <- matrix(NA, length(x), length(Y))
for (i in seq_along(x)) {
if (x[i] >= boundary_left & x[i] < boundary_left + bw) {
absolute_weights[i, ] <- kernel_left(u = (X - x[i])/bw, rho = (x[i] - boundary_left)/bw)
} else if (x[i] >= boundary_left + bw & x[i] <= boundary_right - bw) {
absolute_weights[i, ] <- kernel_interior((X - x[i])/bw)
} else if (x[i] > boundary_right - bw & x[i] <= boundary_right) {
absolute_weights[i, ] <- kernel_left(u = -(X - x[i])/bw, rho = (boundary_right - x[i])/bw)
}
}
effective_kernels <- absolute_weights/rowSums(absolute_weights)
estimates <- as.vector(effective_kernels %*% Y)
list(estimates = estimates, effective_kernels = effective_kernels)
}
######################################################
# Leave-one-out cross-validation (LOOCV) error #
######################################################
#' Leave-one-out cross-validation (LOOCV) error
#'
#' Leave-one-out cross-validation (LOOCV) error for a given bandwidth, either
#' for the local polynomial estimator or the boundary-adjusted
#' Nadaraya-Watson estimator.
#'
#' The LOOCV error is the mean of the squared leave-one-out prediction errors.
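#' Using the effective kernels, it can be computed from a single fit on the
#' full sample via the shortcut formula
#' \deqn{CV(bw) = \frac{1}{n} \sum_{i=1}^{n} \left( \frac{Y_i - \hat{m}(X_i)}{1 - W_{ii}} \right)^2,}
#' where \eqn{W_{ii}} is the weight that the smoother at \eqn{X_i} assigns to
#' the own observation \eqn{Y_i}.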
#'
#' @param X Data for the regressor (vector).
#' @param Y Data for the regressand (vector).
#' @param kernel Kernel (function). Default is \code{epanechnikov}.
#' @param bw Bandwidth (scalar).
#' @param degree Degree of the locally fitted polynomial (integer). Default is \code{1L}.
#' @param kernel_left Left boundary kernels (function). Default is \code{epanechnikov_left}.
#' @param boundary_left Lower boundary of the support of X (scalar). Default is \code{NA}.
#' @param boundary_right Upper boundary of the support of X (scalar). Default is \code{NA}.
#' @param boundary_adjustment Explicit boundary adjustment (boolean). Default is \code{FALSE}.
#'
#' @return LOOCV error for bandwidth \code{bw} (scalar).
#' @export
#'
#' @examples
#' m_fun <- function(x) {sin(2*pi*x)} # True regression function
#' n <- 100 # Sample size
#' X <- seq(0, 1, length.out = n) # Data for the regressor
#' m_X <- m_fun(X) # True values of regression function
#' epsilon <- rnorm(n, sd = 0.25) # Error term
#' Y <- m_X + epsilon # Data for the regressand
#' bw <- 0.2 # Bandwidth
#'
#' # Local polynomial estimator
#' CV_error_fun(X = X, Y = Y, kernel = epanechnikov, bw = bw, degree = 1L,
#' kernel_left = epanechnikov_left,
#' boundary_left = NA, boundary_right = NA,
#' boundary_adjustment = FALSE)
#'
#' # Boundary-adjusted Nadaraya-Watson estimator
#' CV_error_fun(X = X, Y = Y, kernel = epanechnikov, bw = bw, degree = 0L,
#' kernel_left = epanechnikov_left,
#' boundary_left = 0, boundary_right = 1,
#' boundary_adjustment = TRUE)
CV_error_fun <- function(X, Y, kernel = epanechnikov, bw, degree = 1L,
kernel_left = epanechnikov_left,
boundary_left = NA, boundary_right = NA,
boundary_adjustment = FALSE) {
# input: - X: data for the regressor (vector)
# - Y: data for the regressand (vector)
# - kernel: kernel (function), with default
# - bw: bandwidth (scalar)
# - degree: degree of the locally fitted polynomial (integer), with default
# - kernel_left: left boundary kernels (function), with default
# - boundary_left: lower boundary of the support of X (scalar), with default
# - boundary_right: upper boundary of the support of X (scalar), with default
# - boundary_adjustment: explicit boundary adjustment (boolean), with default
# output: - CV_error: LOOCV error for bandwidth bw (scalar)
# Error messages
if (degree != 0L & boundary_adjustment == TRUE) {
stop("Explicit boundary adjustment can only be used for the Nadaraya-Watson estimator (degree = 0).")
}
if (boundary_adjustment == FALSE) {
LP_output <- LP(x = X, X = X, Y = Y, kernel = kernel, bw = bw, degree = degree)
estimates <- LP_output$estimates
effective_kernels <- LP_output$effective_kernels
} else if (boundary_adjustment == TRUE) {
NW_boundary_output <- NW_boundary(x = X, X = X, Y = Y, bw = bw,
kernel_interior = kernel, kernel_left = kernel_left,
boundary_left = boundary_left, boundary_right = boundary_right)
estimates <- NW_boundary_output$estimates
effective_kernels <- NW_boundary_output$effective_kernels
}
CV_error <- 1/length(Y) * sum(((Y - estimates)/(1 - diag(effective_kernels)))^2)
return(CV_error)
}
##############################
# CV optimal bandwidth #
##############################
#' CV optimal bandwidth
#'
#' Selection of the bandwidth that yields the smallest leave-one-out
#' cross-validation (LOOCV) error over a supplied grid.
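#'
#' Formally, \eqn{bw_{CV} = \arg\min_{bw} CV(bw)}, where the minimization is
#' over the supplied grid \code{bw_grid} and \eqn{CV} denotes the LOOCV error
#' computed by \code{\link{CV_error_fun}}.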
#'
#' @param X Data for the regressor (vector).
#' @param Y Data for the regressand (vector).
#' @param kernel Kernel (function). Default is \code{epanechnikov}.
#' @param bw_grid Bandwidth grid to find global minimum of CV error (vector).
#' @param degree Degree of the locally fitted polynomial (integer). Default is \code{1L}.
#' @param kernel_left Left boundary kernels (function). Default is \code{epanechnikov_left}.
#' @param boundary_left Lower boundary of the support of X (scalar). Default is \code{NA}.
#' @param boundary_right Upper boundary of the support of X (scalar). Default is \code{NA}.
#' @param boundary_adjustment Explicit boundary adjustment (boolean). Default is \code{FALSE}.
#' @param plot Plot CV error over bandwidth grid (boolean). Default is \code{TRUE}.
#'
#' @return CV optimal bandwidth (scalar) and, if \code{plot = TRUE}, a plot of the CV errors.
#' @export
#'
#' @importFrom graphics abline legend rug
#'
#' @examples
#' m_fun <- function(x) {sin(2*pi*x)} # True regression function
#' n <- 100 # Sample size
#' X <- seq(0, 1, length.out = n) # Data for the regressor
#' m_X <- m_fun(X) # True values of regression function
#' epsilon <- rnorm(n, sd = 0.25) # Error term
#' Y <- m_X + epsilon # Data for the regressand
#' bw_grid <- seq(0.05, 0.2, length.out = 200)
#'
#' # Local polynomial estimator
#' bw_CV_fun(X = X, Y = Y, kernel = epanechnikov, bw_grid = bw_grid, degree = 1L,
#' kernel_left = epanechnikov_left,
#' boundary_left = NA, boundary_right = NA,
#' boundary_adjustment = FALSE,
#' plot = TRUE)
#'
#' # Boundary-adjusted Nadaraya-Watson estimator
#' bw_CV_fun(X = X, Y = Y, kernel = epanechnikov, bw_grid = bw_grid, degree = 0L,
#' kernel_left = epanechnikov_left,
#' boundary_left = 0, boundary_right = 1,
#' boundary_adjustment = TRUE,
#' plot = TRUE)
#'
#' @seealso \code{\link{CV_error_fun}}
bw_CV_fun <- function(X, Y, kernel = epanechnikov, bw_grid, degree = 1L,
kernel_left = epanechnikov_left,
boundary_left = NA, boundary_right = NA,
boundary_adjustment = FALSE,
plot = TRUE) {
# input: - X: data for the regressor (vector)
# - Y: data for the regressand (vector)
# - kernel: kernel (function), with default
# - bw_grid: bandwidth grid to find global minimum of CV error (vector)
# - degree: degree of the locally fitted polynomial (integer), with default
# - kernel_left: left boundary kernels (function), with default
# - boundary_left: lower boundary of the support of X (scalar), with default
# - boundary_right: upper boundary of the support of X (scalar), with default
# - boundary_adjustment: explicit boundary adjustment (boolean), with default
# - plot: plot CV error over bandwidth grid (boolean), with default
# output: - bw_CV: CV optimal bandwidth (scalar)
# - plot of CV errors if plot == TRUE (plot)
CV_errors <- sapply(bw_grid, function(bw) {
CV_error_fun(X = X, Y = Y, kernel = kernel, bw = bw, degree = degree,
kernel_left = kernel_left,
boundary_left = boundary_left, boundary_right = boundary_right,
boundary_adjustment = boundary_adjustment)
})
bw_CV <- bw_grid[which.min(CV_errors)]
if (bw_CV == bw_grid[1] | bw_CV == bw_grid[length(bw_grid)]) {
warning(paste(strwrap("Selected bandwidth equals the smallest or largest grid
value. You may wish to expand the grid after consulting the plot of the CV
errors."), collapse = "\n"))
}
if (plot == TRUE) {
plot(bw_grid, CV_errors,
xlab = "bw", ylab = "CV error", cex.lab = 1.25, cex.axis = 1.25, cex = 0.75)
rug(bw_grid, ticksize = 0.015)
abline(v = bw_CV, col = "red")
legend("top", legend = bquote(paste("bw"["CV"]*" = ", .(round(bw_CV, 3)))), bty = "n", cex = 1.25)
}
return(bw_CV)
}
############################################################################
# Asymptotic confidence intervals for the local polynomial estimator #
############################################################################
#' Asymptotic confidence intervals for the local polynomial estimator
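#'
#' Pointwise asymptotic confidence intervals
#' \deqn{\hat{m}(x) \pm z_{1 - \alpha/2} \sqrt{\widehat{Var}(\hat{m}(x))},}
#' where the variance is estimated by the heteroskedasticity-robust sandwich
#' formula for the local polynomial estimator, with squared leave-one-out
#' prediction errors serving as estimates of the conditional error variances.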
#'
#' @param x Evaluation points (vector).
#' @param X Data for the regressor (vector).
#' @param Y Data for the regressand (vector).
#' @param kernel Kernel (function). Default is \code{epanechnikov}.
#' @param bw Bandwidth (scalar).
#' @param degree Degree of the locally fitted polynomial (integer). Default is \code{1L}.
#' @param alpha Significance level (scalar). Default is \code{0.05}.
#'
#' @return List containing:
#' \item{confidence_intervals_lower}{Lower endpoints of the (1 - alpha) asymptotic confidence intervals (vector).}
#' \item{confidence_intervals_upper}{Upper endpoints of the (1 - alpha) asymptotic confidence intervals (vector).}
#' @export
#'
#' @importFrom stats qnorm
#'
#' @examples
#' m_fun <- function(x) {sin(2*pi*x)} # True regression function
#' n <- 100 # Sample size
#' X <- seq(0, 1, length.out = n) # Data for the regressor
#' m_X <- m_fun(X) # True values of regression function
#' epsilon <- rnorm(n, sd = 0.25) # Error term
#' Y <- m_X + epsilon # Data for the regressand
#' bw <- 0.2 # Bandwidth
#' x <- seq(0, 1, length.out = n/2) # Evaluation points
#'
#' output_confidence_intervals_LP <- confidence_intervals_LP(x = x, X = X, Y = Y, bw = bw,
#' alpha = 0.05)
#' confidence_intervals_lower <- output_confidence_intervals_LP$confidence_intervals_lower
#' confidence_intervals_upper <- output_confidence_intervals_LP$confidence_intervals_upper
confidence_intervals_LP <- function(x, X, Y, kernel = epanechnikov, bw, degree = 1L, alpha = 0.05) {
# input: - x: evaluation points (vector)
# - X: data for the regressor (vector)
# - Y: data for the regressand (vector)
# - kernel: kernel (function), with default
# - bw: bandwidth (scalar)
# - degree: degree of the locally fitted polynomial (integer), with default
# - alpha: significance level (scalar), with default
# output: - list containing
# - confidence_intervals_lower:
# lower points for the (1 - alpha) asymptotic confidence intervals (vector)
# - confidence_intervals_upper:
# upper points for the (1 - alpha) asymptotic confidence intervals (vector)
# Error messages
if (alpha <= 0 || alpha >= 1) {
stop("The significance level has to be strictly between zero and one.")
}
LP_output <- LP(x = x, X = X, Y = Y, kernel = kernel, bw = bw, degree = degree)
estimates <- LP_output$estimates
# The LOOCV prediction errors used to estimate the conditional error variances
# have to be computed at the data points X, not at the evaluation points x
# (otherwise Y and the estimates would differ in length whenever x != X).
LP_output_data <- LP(x = X, X = X, Y = Y, kernel = kernel, bw = bw, degree = degree)
squared_prediction_errors <- ((Y - LP_output_data$estimates)/
(1 - diag(LP_output_data$effective_kernels)))^2
error_matrix <- diag(squared_prediction_errors)
X_matrix <- matrix(NA, length(X), degree + 1)
estimates_variance <- rep(NA, length(x))
for (i in seq_along(x)) {
for (j in 1:(degree + 1)) {
X_matrix[, j] <- (X - x[i])^(j - 1)
}
W_matrix <- diag(kernel((X - x[i])/bw))
auxiliary_matrix <- solve(t(X_matrix)%*%W_matrix%*%X_matrix)
estimate_covariance_matrix <- auxiliary_matrix %*%
t(X_matrix)%*%W_matrix%*%error_matrix%*%W_matrix%*%X_matrix %*%
auxiliary_matrix
estimates_variance[i] <- estimate_covariance_matrix[1, 1]
}
confidence_intervals_LP_lower <- estimates - qnorm(p = 1 - alpha/2) * sqrt(estimates_variance)
confidence_intervals_LP_upper <- estimates + qnorm(p = 1 - alpha/2) * sqrt(estimates_variance)
list(confidence_intervals_lower = confidence_intervals_LP_lower,
confidence_intervals_upper = confidence_intervals_LP_upper)
}