
# Local polynomial regression with the option for explicit boundary adjustment

# Effective Programming Practices for Economists
# Prof. von Gaudecker
# Winter 2021/22, M.Sc. Economics, Bonn University
# Sven Jacobs

################################################################################

#' lpreba: Local polynomial regression with the option for explicit boundary adjustment
#'
#' The package implements the local polynomial regression estimator
#' of arbitrary degree. Estimates for the regression function (conditional mean),
#' its first and second derivative can be obtained. Moreover, computation of
#' effective kernels (i.e. effectively assigned weights in the kernel smoothing
#' process) is provided. Different compactly supported kernels (Uniform,
#' Epanechnikov, etc.) are available. For local constant regression
#' (Nadaraya-Watson), explicit boundary adjustment via boundary kernels can be
#' conducted. Different boundary kernels are available. Additional functionality
#' includes bandwidth selection via cross-validation and the computation of
#' asymptotic confidence intervals.
#'
#' @section Functions:
#' \tabular{ll}{
#'     \code{LP} \tab Local polynomial estimator. \cr
#'     \code{NW_boundary} \tab Boundary-adjusted Nadaraya-Watson estimator. \cr
#'     \code{CV_error_fun} \tab Leave-one-out cross-validation (LOOCV) error. \cr
#'     \code{bw_CV_fun} \tab CV optimal bandwidth. \cr
#'     \code{confidence_intervals_LP} \tab Asymptotic confidence intervals for the local polynomial estimator.
#'}
#'
#' @author Sven Jacobs.
#'
#' @docType package
#' @name lpreba
NULL

#############################
#    Auxiliary functions    #
#############################

# Creating standard basis vectors
make_basis <- function(length, position) {

    replace(numeric(length), position, 1)

}
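
# Example: make_basis(3, 2) returns c(0, 1, 0), i.e. the second standard
# basis vector of length 3.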

#################
#    Kernels    #
#################

# Uniform
#' Uniform kernel
#'
#' @param u Kernel argument.
#'
#' @export
uniform <- function(u) {ifelse(abs(u) <= 1, 0.5, 0)}

# Triangular
#' Triangular kernel
#'
#' @param u Kernel argument.
#'
#' @export
triangular <- function(u) {ifelse(abs(u) <= 1, 1 - abs(u), 0)}

# Epanechnikov
#' Epanechnikov kernel
#'
#' @param u Kernel argument.
#'
#' @export
epanechnikov <- function(u) {ifelse(abs(u) <= 1, 3/4 * (1 - u^2), 0)}

# Biweight
#' Biweight kernel
#'
#' @param u Kernel argument.
#'
#' @export
biweight <- function(u) {ifelse(abs(u) <= 1, 15/16 * (1 - u^2)^2, 0)}

# Triweight
#' Triweight kernel
#'
#' @param u Kernel argument.
#'
#' @export
triweight <- function(u) {ifelse(abs(u) <= 1, 35/32 * (1 - u^2)^3, 0)}

# Tricube
#' Tricube kernel
#'
#' @param u Kernel argument.
#'
#' @export
tricube <- function(u) {ifelse(abs(u) <= 1, 70/81 * (1 - abs(u)^3)^3, 0)}

# Cosine
#' Cosine kernel
#'
#' @param u Kernel argument.
#'
#' @export
cosine <- function(u) {ifelse(abs(u) <= 1, pi/4 * cos(pi/2 * u), 0)}

# List with all kernels
kernel_list <- list(uniform, triangular, epanechnikov, biweight,
                    triweight, tricube, cosine)
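
# Quick sanity check (for interactive use): each kernel is a probability
# density on [-1, 1] and should integrate to one, e.g.
#   sapply(kernel_list, function(K) integrate(K, -1, 1)$value)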

####################################
#    Local polynomial estimator    #
####################################

#' Local polynomial estimator
#'
#' Local polynomial regression estimator of arbitrary degree.
#'
#' The degree of the local polynomial estimator can be chosen. Two special
#' cases are the Nadaraya-Watson estimator (\code{degree = 0L}) and the
#' local linear estimator (\code{degree = 1L}).
#'
#' The most important compactly supported (on the interval \[-1, 1\]) kernels
#' are available: \code{uniform}, \code{triangular}, \code{epanechnikov},
#' \code{biweight}, \code{triweight}, \code{tricube}, \code{cosine}.
#'
#' The effective kernel at an evaluation point is the set of effectively
#' assigned weights in the smoothing process (see e.g. Hastie and Loader, 1993).
#'
#' To obtain estimates for the first derivative of the regression function,
#' the degree has to be at least \code{1L}.
#' To obtain estimates for the second derivative of the regression function,
#' the degree has to be at least \code{2L}.
#'
#' @param x Evaluation points (vector).
#' @param X Data for the regressor (vector).
#' @param Y Data for the regressand (vector).
#' @param kernel Kernel (function). Default is \code{epanechnikov}.
#' @param bw Bandwidth (scalar).
#' @param degree Degree of the locally fitted polynomial (integer). Default is \code{1L}.
#'
#' @return List containing:
#'     \item{estimates}{Estimates for the regression function at the evaluation points (vector).}
#'     \item{effective_kernels}{Effective kernels at the evaluation points (matrix).}
#'     \item{slopes}{Estimates for the first derivative (slope) of the regression function at the evaluation points (vector).}
#'     \item{curvatures}{Estimates for the second derivative (curvature) of the regression function at the evaluation points (vector).}
#' @export
#'
#' @references Hastie, T. and C. Loader (1993).
#'     ``Local regression: Automatic kernel carpentry''.
#'     \emph{Statistical Science} 8 (2), pp. 120--129.
#'
#' @examples
#' m_fun <- function(x) {sin(2*pi*x)} # True regression function
#' n <- 100 # Sample size
#' X <- seq(0, 1, length.out = n) # Data for the regressor
#' m_X <- m_fun(X) # True values of regression function
#' epsilon <- rnorm(n, sd = 0.25) # Error term
#' Y <- m_X + epsilon # Data for the regressand
#' bw <- 0.2 # Bandwidth
#' x <- seq(0, 1, length.out = n/2) # Evaluation points
#'
#' output_LP <- LP(x = x, X = X, Y = Y, kernel = epanechnikov, bw = bw, degree = 1L)
#' estimates_LP <- output_LP$estimates
#' effective_kernels_LP <- output_LP$effective_kernels
#' slopes_LP <- output_LP$slopes
#' curvatures_LP <- output_LP$curvatures # Yields only NAs since degree >= 2L is required
LP <- function(x, X, Y, kernel = epanechnikov, bw, degree = 1L) {

    # input: - x: evaluation points (vector)
    #        - X: data for the regressor (vector)
    #        - Y: data for the regressand (vector)
    #        - kernel: kernel (function), with default
    #        - bw: bandwidth (scalar)
    #        - degree: degree of the locally fitted polynomial (integer),
    #                  with default

    # output: - list containing
    #               - estimates: estimates for the regression function
    #                            at the evaluation points (vector)
    #               - effective_kernels: effective kernels
    #                                    at the evaluation points (matrix)
    #               - slopes: estimates for the first derivative (slope) of the
    #                         regression function at the evaluation points (vector)
    #               - curvatures: estimates for the second derivative (curvature) of the
    #                             regression function at the evaluation points (vector)

    # Error messages
    if (length(X) != length(Y)) {stop("X and Y have to be of equal length.")}
    if (bw <= 0) {stop("Bandwidth has to be strictly positive.")}
    if (!(TRUE %in% lapply(kernel_list, all.equal, kernel))) {
        stop("Only supported kernels can be chosen.")
    }
    if (degree < 0 | typeof(degree) != "integer") {
        stop("degree has to be of type integer and nonnegative.")
    }

    # Warning messages
    if ((degree %% 2) == 0) {warning(strwrap("It is strongly recommended to use an
                                             odd degree, as odd-degree polynomial
                                             fits outperform even-degree ones.
                                             In particular, odd-degree fits are
                                             boundary adaptive.",
                                             prefix = " ", initial = ""))}

    # General messages
    if (degree > 1) {message(strwrap("It is recommended to use the lowest (odd)
                                     degree, since the bandwidth already controls
                                     the model complexity.",
                                     prefix = " ", initial = ""))}

    X_matrix <- matrix(NA, length(X), degree + 1)

    estimates <- rep(NA, length(x))
    effective_kernels <- matrix(NA, length(x), length(Y))
    slopes <- rep(NA, length(x))
    curvatures <- rep(NA, length(x))

    for (i in 1:length(x)) {

        for (j in 1:(degree + 1)) {

            X_matrix[, j] <- (X - x[i])^(j - 1)

        }

        W_matrix <- diag(kernel((X - x[i])/bw))

        auxiliary_matrix <- solve(t(X_matrix)%*%W_matrix%*%X_matrix) %*%
            t(X_matrix) %*% W_matrix

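        # e_1' (X'WX)^(-1) X'W is the row of smoother weights (the effective
        # kernel) at x[i]; e_2 extracts the slope and e_3 the quadratic
        # coefficient, which estimates m''(x[i])/2 and is hence doubled below.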
        effective_kernels[i, ] <- t(make_basis(degree + 1, 1)) %*% auxiliary_matrix
        estimates[i] <- effective_kernels[i, ] %*% Y

        if (degree >= 1) {
            slopes[i] <- t(make_basis(degree + 1, 2)) %*% auxiliary_matrix %*% Y
        }
        if (degree >= 2) {
            curvatures[i] <- 2 * t(make_basis(degree + 1, 3)) %*% auxiliary_matrix %*% Y
        }

    }

    list(estimates = estimates, effective_kernels = effective_kernels,
         slopes = slopes, curvatures = curvatures)

}
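
# Note: since the locally fitted polynomial includes an intercept, the smoother
# reproduces constants, so each row of the effective kernel matrix sums to one
# (up to numerical error), e.g. rowSums(output_LP$effective_kernels) in the
# example above is a vector of ones.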

##########################
#    Boundary kernels    #
##########################

# Note:
#
# The following boundary kernels rely on Müller (1991).
# However, Müller's left boundary kernels are right boundary kernels for us, and vice versa.
# The reason is a different sign convention for the kernel argument:
# Müller uses K(x - X), whereas we define K(X - x).
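#
# Concretely, a right boundary kernel is obtained by flipping the argument,
# K_right(u, rho) = K_left(-u, rho), which is exactly how NW_boundary() below
# applies kernel_left at the right boundary.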

# Left Uniform kernels
#' Left Uniform kernels
#'
#' Left Uniform kernels derived from Müller (1991, Table 1).
#'
#' @param u Kernel argument.
#' @param rho Determines the strength of the boundary adjustment (scalar).
#'
#' @export
#'
#' @references Müller, H.-G. (1991).
#'     ``Smooth optimum kernel estimators near endpoints''.
#'     \emph{Biometrika} 78 (3), pp. 521--530.
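#'
#' @examples
#' # At rho = 1 (the interior case) the boundary kernel reduces to the
#' # standard Uniform kernel on [-1, 1].
#' u <- seq(-1, 1, length.out = 5)
#' all.equal(uniform_left(u, rho = 1), uniform(u))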
uniform_left <- function(u, rho) {

    ifelse(u >= -rho & u <= 1, 1/(1+rho)*(1 + 3*((1-rho)/(1+rho))^2 - 6*((1-rho)/(1+rho)^2)*u), 0)

}

# Left Epanechnikov kernels
#' Left Epanechnikov kernels
#'
#' Left Epanechnikov kernels derived from Müller (1991, Table 1).
#'
#' @param u Kernel argument.
#' @param rho Determines the strength of the boundary adjustment (scalar).
#'
#' @export
#'
#' @references Müller, H.-G. (1991).
#'     ``Smooth optimum kernel estimators near endpoints''.
#'     \emph{Biometrika} 78 (3), pp. 521--530.
epanechnikov_left <- function(u, rho) {

    ifelse(u >= -rho & u <= 1, 6*(1-u)*(rho+u)*(1/(1+rho)^3)*(1 + 5*((1-rho)/(1+rho))^2 - 10*((1-rho)/(1+rho)^2)*u), 0)

}

# Left Biweight kernels
#' Left Biweight kernels
#'
#' Left Biweight kernels derived from Müller (1991, Table 1).
#'
#' @param u Kernel argument.
#' @param rho Determines the strength of the boundary adjustment (scalar).
#'
#' @export
#'
#' @references Müller, H.-G. (1991).
#'     ``Smooth optimum kernel estimators near endpoints''.
#'     \emph{Biometrika} 78 (3), pp. 521--530.
biweight_left <- function(u, rho) {

    ifelse(u >= -rho & u <= 1, 30*(1-u)^2*(rho+u)^2*(1/(1+rho)^5)*(1 + 7*((1-rho)/(1+rho))^2 - 14*((1-rho)/(1+rho)^2)*u), 0)

}

# Left Triweight kernels
#' Left Triweight kernels
#'
#' Left Triweight kernels derived from Müller (1991, Table 1).
#'
#' @param u Kernel argument.
#' @param rho Determines the strength of the boundary adjustment (scalar).
#'
#' @export
#'
#' @references Müller, H.-G. (1991).
#'     ``Smooth optimum kernel estimators near endpoints''.
#'     \emph{Biometrika} 78 (3), pp. 521--530.
triweight_left <- function(u, rho) {

    ifelse(u >= -rho & u <= 1, 140*(1-u)^3*(rho+u)^3*(1/(1+rho)^7)*(1 + 9*((1-rho)/(1+rho))^2 - 18*((1-rho)/(1+rho)^2)*u), 0)

}

# Note:
#
# The following boundary kernels rely on Müller and Wang (1994).
# These alternative kernels have greater asymptotic efficiency
# at the cost of discontinuity at their endpoints.

# Left alternative Uniform kernels
#' Left alternative Uniform kernels
#'
#' Left alternative Uniform kernels derived from Müller and Wang (1994, Table 1).
#'
#' @param u Kernel argument.
#' @param rho Determines the strength of the boundary adjustment (scalar).
#'
#' @export
#'
#' @references Müller, H.-G. and J.-L. Wang (1994).
#'     ``Hazard rate estimation under random censoring with varying kernels and bandwidths''.
#'     \emph{Biometrics} 50 (1), pp. 61--76.
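#'
#' @examples
#' # At rho = 1 (the interior case) the alternative boundary kernel also
#' # reduces to the standard Uniform kernel on [-1, 1].
#' u <- seq(-1, 1, length.out = 5)
#' all.equal(uniform_alt_left(u, rho = 1), uniform(u))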
uniform_alt_left <- function(u, rho) {

    ifelse(u >= -rho & u <= 1, 2/(1+rho)^3*(-3*(1-rho)*u + 2*(1-rho+rho^2)), 0)

}

# Left alternative Epanechnikov kernels
#' Left alternative Epanechnikov kernels
#'
#' Left alternative Epanechnikov kernels derived from Müller and Wang (1994, Table 1).
#'
#' @param u Kernel argument.
#' @param rho Determines the strength of the boundary adjustment (scalar).
#'
#' @export
#'
#' @references Müller, H.-G. and J.-L. Wang (1994).
#'     ``Hazard rate estimation under random censoring with varying kernels and bandwidths''.
#'     \emph{Biometrics} 50 (1), pp. 61--76.
epanechnikov_alt_left <- function(u, rho) {

    ifelse(u >= -rho & u <= 1, 12/(1+rho)^4*(1-u)*(-(1-2*rho)*u + 0.5*(3*rho^2-2*rho+1)), 0)

}

# Left alternative Biweight kernels
#' Left alternative Biweight kernels
#'
#' Left alternative Biweight kernels derived from Müller and Wang (1994, Table 1).
#'
#' @param u Kernel argument.
#' @param rho Determines the strength of the boundary adjustment (scalar).
#'
#' @export
#'
#' @references Müller, H.-G. and J.-L. Wang (1994).
#'     ``Hazard rate estimation under random censoring with varying kernels and bandwidths''.
#'     \emph{Biometrics} 50 (1), pp. 61--76.
biweight_alt_left <- function(u, rho) {

    ifelse(u >= -rho & u <= 1, 15/(1+rho)^5*(1-u)^2*(rho+u)*(-2*u*(5*(1-rho)/(1+rho)-1) + 3*rho-1 + 5*(1-rho)^2/(1+rho)), 0)

}

# Left alternative Triweight kernels
#' Left alternative Triweight kernels
#'
#' Left alternative Triweight kernels derived from Müller and Wang (1994, Table 1).
#'
#' @param u Kernel argument.
#' @param rho Determines the strength of the boundary adjustment (scalar).
#'
#' @export
#'
#' @references Müller, H.-G. and J.-L. Wang (1994).
#'     ``Hazard rate estimation under random censoring with varying kernels and bandwidths''.
#'     \emph{Biometrics} 50 (1), pp. 61--76.
triweight_alt_left <- function(u, rho) {

    ifelse(u >= -rho & u <= 1, 70/(1+rho)^7*(1-u)^3*(rho+u)^2*(-2*u*(7*(1-rho)/(1+rho)-1) + 3*rho-1 + 7*(1-rho)^2/(1+rho)), 0)

}

# List with all boundary kernels
boundary_kernel_list <- list(uniform_left, epanechnikov_left, biweight_left, triweight_left,
                             uniform_alt_left, epanechnikov_alt_left, biweight_alt_left, triweight_alt_left)

#####################################################
#    Boundary-adjusted Nadaraya-Watson estimator    #
#####################################################

#' Boundary-adjusted Nadaraya-Watson estimator
#'
#' Explicit boundary adjustment via boundary kernels for the
#' Nadaraya-Watson estimator (local constant regression).
#'
#' When applying the Nadaraya-Watson estimator with a compactly supported
#' (on the interval \[-1, 1\]) kernel to data of a regressor that is
#' compactly supported, boundary effects arise over the region
#' \[\code{boundary_left}, \code{boundary_left} + \code{bw}) \eqn{\cup}
#' (\code{boundary_right} - \code{bw}, \code{boundary_right}\].
#' That is, the bias is of larger order compared to the interior.
#' The original order can be preserved by using special so-called boundary
#' kernels when applying Nadaraya-Watson at the boundaries
#' (e.g. Gasser and Müller, 1979).
#'
#' ``Smooth optimum boundary kernels'' (Müller, 1991, Table 1) are available:
#' \code{uniform_left}, \code{epanechnikov_left}, \code{biweight_left},
#' \code{triweight_left}.
#' Moreover, an alternative version of these kernels with greater asymptotic
#' efficiency at the cost of discontinuity at their endpoints
#' (Müller and Wang, 1994, Table 1) is available:
#' \code{uniform_alt_left}, \code{epanechnikov_alt_left}, \code{biweight_alt_left},
#' \code{triweight_alt_left}.
#'
#' The effective kernel at an evaluation point is the set of effectively
#' assigned weights in the smoothing process (see Hastie and Loader, 1993).
#'
#' @param x Evaluation points (vector).
#' @param X Data for the regressor (vector).
#' @param Y Data for the regressand (vector).
#' @param bw Bandwidth (scalar).
#' @param kernel_interior Kernel used in the interior (function). Default is \code{epanechnikov}.
#' @param kernel_left Left boundary kernels (function). Default is \code{epanechnikov_left}.
#' @param boundary_left Lower boundary of the support of X (scalar).
#' @param boundary_right Upper boundary of the support of X (scalar).
#'
#' @return List containing:
#'     \item{estimates}{Estimates for the regression function at the evaluation points (vector).}
#'     \item{effective_kernels}{Effective kernels at the evaluation points (matrix).}
#' @export
#'
#' @references Gasser, T. and H.-G. Müller (1979).
#'     ``Kernel estimation of regression functions''.
#'     In: \emph{Smoothing Techniques for Curve Estimation}.
#'     Ed. by T. Gasser and M. Rosenblatt.
#'     Berlin: Springer, pp. 23--68.
#'
#'     Müller, H.-G. (1991).
#'     ``Smooth optimum kernel estimators near endpoints''.
#'     \emph{Biometrika} 78 (3), pp. 521--530.
#'
#'     Müller, H.-G. and J.-L. Wang (1994).
#'     ``Hazard rate estimation under random censoring with varying kernels and bandwidths''.
#'     \emph{Biometrics} 50 (1), pp. 61--76.
#'
#'     Hastie, T. and C. Loader (1993).
#'     ``Local regression: Automatic kernel carpentry''.
#'     \emph{Statistical Science} 8 (2), pp. 120--129.
#'
#' @examples
#' m_fun <- function(x) {sin(2*pi*x)} # True regression function
#' n <- 100 # Sample size
#' X <- seq(0, 1, length.out = n) # Data for the regressor
#' m_X <- m_fun(X) # True values of regression function
#' epsilon <- rnorm(n, sd = 0.25) # Error term
#' Y <- m_X + epsilon # Data for the regressand
#' bw <- 0.2 # Bandwidth
#' x <- seq(0, 1, length.out = n/2) # Evaluation points
#'
#' output_NW_boundary <- NW_boundary(x = x, X = X, Y = Y, bw = bw,
#'                                   kernel_interior = epanechnikov, kernel_left = epanechnikov_left,
#'                                   boundary_left = 0, boundary_right = 1)
#' estimates_NW_boundary <- output_NW_boundary$estimates
#' effective_kernels_NW_boundary <- output_NW_boundary$effective_kernels
NW_boundary <- function(x, X, Y, bw,
                        kernel_interior = epanechnikov, kernel_left = epanechnikov_left,
                        boundary_left, boundary_right) {

    # input: - x: evaluation points (vector)
    #        - X: data for the regressor (vector)
    #        - Y: data for the regressand (vector)
    #        - bw: bandwidth (scalar)
    #        - kernel_interior: kernel used in the interior (function), with default
    #        - kernel_left: left boundary kernels (function), with default
    #        - boundary_left: lower boundary of the support of X (scalar)
    #        - boundary_right: upper boundary of the support of X (scalar)

    # output: - list containing
    #               - estimates: estimates for the regression function
    #                            at the evaluation points (vector)
    #               - effective_kernels: effective kernels
    #                                    at the evaluation points (matrix)

    # Error messages
    if (length(X) != length(Y)) {stop("X and Y have to be of equal length.")}
    if (bw <= 0) {stop("Bandwidth has to be strictly positive.")}
    if (!(TRUE %in% lapply(kernel_list, all.equal, kernel_interior))) {
        stop("Only supported kernels can be chosen.")
    }
    if (!(TRUE %in% lapply(boundary_kernel_list, all.equal, kernel_left))) {
        stop("Only supported boundary kernels can be chosen.")
    }
    if (boundary_left > min(X)) {
        stop("The lower boundary cannot be larger than the smallest observed value of the regressor X.")
    }
    if (boundary_right < max(X)) {
        stop("The upper boundary cannot be smaller than the largest observed value of the regressor X.")
    }

    absolute_weights <- matrix(NA, length(x), length(Y))

    for (i in 1:length(x)) {

        if (x[i] >= boundary_left & x[i] < boundary_left + bw) {

            absolute_weights[i, ] <- kernel_left(u = (X - x[i])/bw, rho = (x[i] - boundary_left)/bw)

        } else if (x[i] >= boundary_left + bw & x[i] <= boundary_right - bw) {

            absolute_weights[i, ] <- kernel_interior((X - x[i])/bw)

        } else if (x[i] > boundary_right - bw & x[i] <= boundary_right) {

            absolute_weights[i, ] <- kernel_left(u = -(X - x[i])/bw, rho = (boundary_right - x[i])/bw)

        }

    }

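    # Normalizing the kernel weights by their row sums yields the
    # Nadaraya-Watson weights: each row of effective_kernels sums to one, so
    # every estimate is a weighted mean of Y.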
    effective_kernels <- absolute_weights/rowSums(absolute_weights)

    estimates <- as.vector(effective_kernels %*% Y)

    list(estimates = estimates, effective_kernels = effective_kernels)

}

######################################################
#    Leave-one-out cross-validation (LOOCV) error    #
######################################################

#' Leave-one-out cross-validation (LOOCV) error
#'
#' Leave-one-out cross-validation (LOOCV) error for a given bandwidth, either
#' for the local polynomial estimator or the boundary-adjusted
#' Nadaraya-Watson estimator.
#'
#' The LOOCV error is the mean of the squared leave-one-out prediction errors.
#'
#' @param X Data for the regressor (vector).
#' @param Y Data for the regressand (vector).
#' @param kernel Kernel (function). Default is \code{epanechnikov}.
#' @param bw Bandwidth (scalar).
#' @param degree Degree of the locally fitted polynomial (integer). Default is \code{1L}.
#' @param kernel_left Left boundary kernels (function). Default is \code{epanechnikov_left}.
#' @param boundary_left Lower boundary of the support of X (scalar). Default is \code{NA}.
#' @param boundary_right Upper boundary of the support of X (scalar). Default is \code{NA}.
#' @param boundary_adjustment Explicit boundary adjustment (boolean). Default is \code{FALSE}.
#'
#' @return LOOCV error for bandwidth \code{bw} (scalar).
#' @export
#'
#' @examples
#' m_fun <- function(x) {sin(2*pi*x)} # True regression function
#' n <- 100 # Sample size
#' X <- seq(0, 1, length.out = n) # Data for the regressor
#' m_X <- m_fun(X) # True values of regression function
#' epsilon <- rnorm(n, sd = 0.25) # Error term
#' Y <- m_X + epsilon # Data for the regressand
#' bw <- 0.2 # Bandwidth
#'
#' # Local polynomial estimator
#' CV_error_fun(X = X, Y = Y, kernel = epanechnikov, bw = bw, degree = 1L,
#'              kernel_left = epanechnikov_left,
#'              boundary_left = NA, boundary_right = NA,
#'              boundary_adjustment = FALSE)
#'
#' # Boundary-adjusted Nadaraya-Watson estimator
#' CV_error_fun(X = X, Y = Y, kernel = epanechnikov, bw = bw, degree = 0L,
#'              kernel_left = epanechnikov_left,
#'              boundary_left = 0, boundary_right = 1,
#'              boundary_adjustment = TRUE)
CV_error_fun <- function(X, Y, kernel = epanechnikov, bw, degree = 1L,
                         kernel_left = epanechnikov_left,
                         boundary_left = NA, boundary_right = NA,
                         boundary_adjustment = FALSE) {

    # input: - X: data for the regressor (vector)
    #        - Y: data for the regressand (vector)
    #        - kernel: kernel (function), with default
    #        - bw: bandwidth (scalar)
    #        - degree: degree of the locally fitted polynomial (integer), with default
    #        - kernel_left: left boundary kernels (function), with default
    #        - boundary_left: lower boundary of the support of X (scalar), with default
    #        - boundary_right: upper boundary of the support of X (scalar), with default
    #        - boundary_adjustment: explicit boundary adjustment (boolean), with default

    # output: - CV_error: LOOCV error for bandwidth bw (scalar)

    # Error messages
    if (degree != 0L & boundary_adjustment == TRUE) {
        stop("Explicit boundary adjustment can only be used for the Nadaraya-Watson estimator (degree = 0).")
    }

    if (boundary_adjustment == FALSE) {

        LP_output <- LP(x = X, X = X, Y = Y, kernel = kernel, bw = bw, degree = degree)
        estimates <- LP_output$estimates
        effective_kernels <- LP_output$effective_kernels

    } else if (boundary_adjustment == TRUE) {

        NW_boundary_output <- NW_boundary(x = X, X = X, Y = Y, bw = bw,
                                          kernel_interior = kernel, kernel_left = kernel_left,
                                          boundary_left = boundary_left, boundary_right = boundary_right)
        estimates <- NW_boundary_output$estimates
        effective_kernels <- NW_boundary_output$effective_kernels

    }

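    # LOOCV shortcut for linear smoothers: the prediction error at X_i with
    # observation i left out equals (Y_i - estimate_i)/(1 - W_ii), where W_ii
    # is the i-th diagonal element of the smoother (effective kernel) matrix.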
    CV_error <- 1/length(Y) * sum(((Y - estimates)/(1 - diag(effective_kernels)))^2)
    return(CV_error)

}

##############################
#    CV optimal bandwidth    #
##############################

#' CV optimal bandwidth
#'
#' Selection of the bandwidth that yields the smallest leave-one-out
#' cross-validation (LOOCV) error over a supplied grid.
#'
#' @param X Data for the regressor (vector).
#' @param Y Data for the regressand (vector).
#' @param kernel Kernel (function). Default is \code{epanechnikov}.
#' @param bw_grid Bandwidth grid to find global minimum of CV error (vector).
#' @param degree Degree of the locally fitted polynomial (integer). Default is \code{1L}.
#' @param kernel_left Left boundary kernels (function). Default is \code{epanechnikov_left}.
#' @param boundary_left Lower boundary of the support of X (scalar). Default is \code{NA}.
#' @param boundary_right Upper boundary of the support of X (scalar). Default is \code{NA}.
#' @param boundary_adjustment Explicit boundary adjustment (boolean). Default is \code{FALSE}.
#' @param plot Plot CV error over bandwidth grid (boolean). Default is \code{TRUE}.
#'
#' @return CV optimal bandwidth (scalar) and plot of CV errors if \code{plot == TRUE}.
#' @export
#'
#' @importFrom graphics abline legend rug
#'
#' @examples
#' m_fun <- function(x) {sin(2*pi*x)} # True regression function
#' n <- 100 # Sample size
#' X <- seq(0, 1, length.out = n) # Data for the regressor
#' m_X <- m_fun(X) # True values of regression function
#' epsilon <- rnorm(n, sd = 0.25) # Error term
#' Y <- m_X + epsilon # Data for the regressand
#' bw_grid <- seq(0.05, 0.2, length.out = 200)
#'
#' # Local polynomial estimator
#' bw_CV_fun(X = X, Y = Y, kernel = epanechnikov, bw_grid = bw_grid, degree = 1L,
#'           kernel_left = epanechnikov_left,
#'           boundary_left = NA, boundary_right = NA,
#'           boundary_adjustment = FALSE,
#'           plot = TRUE)
#'
#' # Boundary-adjusted Nadaraya-Watson estimator
#' bw_CV_fun(X = X, Y = Y, kernel = epanechnikov, bw_grid = bw_grid, degree = 0L,
#'           kernel_left = epanechnikov_left,
#'           boundary_left = 0, boundary_right = 1,
#'           boundary_adjustment = TRUE,
#'           plot = TRUE)
#'
#' @seealso \code{\link{CV_error_fun}}
bw_CV_fun <- function(X, Y, kernel = epanechnikov, bw_grid, degree = 1L,
                      kernel_left = epanechnikov_left,
                      boundary_left = NA, boundary_right = NA,
                      boundary_adjustment = FALSE,
                      plot = TRUE) {

    # input: - X: data for the regressor (vector)
    #        - Y: data for the regressand (vector)
    #        - kernel: kernel (function), with default
    #        - bw_grid: bandwidth grid to find global minimum of CV error (vector)
    #        - degree: degree of the locally fitted polynomial (integer), with default
    #        - kernel_left: left boundary kernels (function), with default
    #        - boundary_left: lower boundary of the support of X (scalar), with default
    #        - boundary_right: upper boundary of the support of X (scalar), with default
    #        - boundary_adjustment: explicit boundary adjustment (boolean), with default
    #        - plot: plot CV error over bandwidth grid (boolean), with default

    # output: - bw_CV: CV optimal bandwidth (scalar)
    #         - plot of CV errors if plot == TRUE (plot)

    CV_errors <- sapply(bw_grid, function(bw) {

        CV_error_fun(X = X, Y = Y, kernel = kernel, bw = bw, degree = degree,
                     kernel_left = kernel_left,
                     boundary_left = boundary_left, boundary_right = boundary_right,
                     boundary_adjustment = boundary_adjustment)

    })

    bw_CV <- bw_grid[which.min(CV_errors)]

    if (bw_CV == bw_grid[1] | bw_CV == bw_grid[length(bw_grid)]) {
        warning(strwrap("Selected bandwidth equals the smallest or largest grid
                        value. You may wish to expand the grid after consulting
                        the plot of the CV errors.",
                        prefix = " ", initial = ""))
    }

    if (plot == TRUE) {

        plot(bw_grid, CV_errors,
             xlab = "bw", ylab = "CV error", cex.lab = 1.25, cex.axis = 1.25, cex = 0.75)
        rug(bw_grid, ticksize = 0.015)
        abline(v = bw_CV, col = "red")
        legend("top", legend = bquote(paste("bw"["CV"]*" = ", .(round(bw_CV, 3)))), bty = "n", cex = 1.25)

    }

    return(bw_CV)

}

############################################################################
#    Asymptotic confidence intervals for the local polynomial estimator    #
############################################################################

#' Asymptotic confidence intervals for the local polynomial estimator
#'
#' @param x Evaluation points (vector).
#' @param X Data for the regressor (vector).
#' @param Y Data for the regressand (vector).
#' @param kernel Kernel (function). Default is \code{epanechnikov}.
#' @param bw Bandwidth (scalar).
#' @param degree Degree of the locally fitted polynomial (integer). Default is \code{1L}.
#' @param alpha Significance level (scalar). Default is \code{0.05}.
#'
#' @return List containing:
#'     \item{confidence_intervals_lower}{Lower bounds of the (1 - alpha) asymptotic confidence intervals (vector).}
#'     \item{confidence_intervals_upper}{Upper bounds of the (1 - alpha) asymptotic confidence intervals (vector).}
#' @export
#'
#' @importFrom stats qnorm
#'
#' @examples
#' m_fun <- function(x) {sin(2*pi*x)} # True regression function
#' n <- 100 # Sample size
#' X <- seq(0, 1, length.out = n) # Data for the regressor
#' m_X <- m_fun(X) # True values of regression function
#' epsilon <- rnorm(n, sd = 0.25) # Error term
#' Y <- m_X + epsilon # Data for the regressand
#' bw <- 0.2 # Bandwidth
#' x <- seq(0, 1, length.out = n/2) # Evaluation points
#'
#' output_confidence_intervals_LP <- confidence_intervals_LP(x = x, X = X, Y = Y, bw = bw,
#'                                                           alpha = 0.05)
#' confidence_intervals_lower <- output_confidence_intervals_LP$confidence_intervals_lower
#' confidence_intervals_upper <- output_confidence_intervals_LP$confidence_intervals_upper
confidence_intervals_LP <- function(x, X, Y, kernel = epanechnikov, bw, degree = 1L, alpha = 0.05) {

    # input: - x: evaluation points (vector)
    #        - X: data for the regressor (vector)
    #        - Y: data for the regressand (vector)
    #        - kernel: kernel (function), with default
    #        - bw: bandwidth (scalar)
    #        - degree: degree of the locally fitted polynomial (integer), with default
    #        - alpha: significance level (scalar), with default

    # output: - list containing
    #               - confidence_intervals_lower:
    #                 lower bounds of the (1 - alpha) asymptotic confidence intervals (vector)
    #               - confidence_intervals_upper:
    #                 upper bounds of the (1 - alpha) asymptotic confidence intervals (vector)

    # Error messages
    if (alpha <= 0 | alpha >= 1) {
        stop("The significance level has to be between zero and one.")
    }

    LP_output <- LP(x = x, X = X, Y = Y, kernel = kernel, bw = bw, degree = degree)
    estimates <- LP_output$estimates

    # The LOOCV residuals for the variance estimate have to be computed at the
    # data points X, not at the evaluation points x, so the estimator is run a
    # second time with x = X.
    LP_output_data <- LP(x = X, X = X, Y = Y, kernel = kernel, bw = bw, degree = degree)
    squared_prediction_errors <- ((Y - LP_output_data$estimates) /
                                      (1 - diag(LP_output_data$effective_kernels)))^2
    error_matrix <- diag(squared_prediction_errors)

    X_matrix <- matrix(NA, length(X), degree + 1)
    estimates_variance <- rep(NA, length(x))

    for (i in 1:length(x)) {

        for (j in 1:(degree + 1)) {

            X_matrix[, j] <- (X - x[i])^(j - 1)

        }

        W_matrix <- diag(kernel((X - x[i])/bw))

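        # Sandwich estimator of the coefficient covariance matrix,
        # (X'WX)^(-1) X'W Sigma W X (X'WX)^(-1), with Sigma approximated by the
        # diagonal matrix of squared LOOCV residuals; the (1, 1) element
        # estimates the variance of the regression function estimate.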
        auxiliary_matrix <- solve(t(X_matrix)%*%W_matrix%*%X_matrix)
        estimate_covariance_matrix <- auxiliary_matrix %*%
            t(X_matrix)%*%W_matrix%*%error_matrix%*%W_matrix%*%X_matrix %*%
            auxiliary_matrix
        estimates_variance[i] <- estimate_covariance_matrix[1, 1]

    }

    confidence_intervals_LP_lower <- estimates - qnorm(p = 1 - alpha/2) * sqrt(estimates_variance)
    confidence_intervals_LP_upper <- estimates + qnorm(p = 1 - alpha/2) * sqrt(estimates_variance)

    list(confidence_intervals_lower = confidence_intervals_LP_lower,
         confidence_intervals_upper = confidence_intervals_LP_upper)

}