R/lpdensity.R
In lpdensity: Local Polynomial Density Estimation and Inference

Documented in lpdensity

################################################################################
#' @title  Local Polynomial Density Estimation and Inference
#'
#' @description \code{\link{lpdensity}} implements the local polynomial regression based density (and derivatives)
#'   estimator proposed in Cattaneo, Jansson and Ma (2020).  Robust bias-corrected inference methods,
#'   both pointwise (confidence intervals) and uniform (confidence bands), are also implemented
#'   following the results in Cattaneo, Jansson and Ma (2020, 2023).
#'   See Cattaneo, Jansson and Ma (2022) for more implementation details and illustrations.
#'
#'   Companion command: \code{\link{lpbwdensity}} for bandwidth selection.
#'
#'   Related \code{Stata} and \code{R} packages useful for nonparametric estimation and inference are
#'   available at \url{https://nppackages.github.io/}.
#'
#' @param data Numeric vector or one dimensional matrix/data frame, the raw data.
#' @param grid Numeric, specifies the grid of evaluation points. When set to default, grid points
#'   will be chosen as 0.05-0.95 percentiles of the data, with a step size of 0.05.
#' @param bw Numeric, specifies the bandwidth
#'   used for estimation. Can be (1) a positive scalar (common
#'   bandwidth for all grid points); or (2) a positive numeric vector specifying bandwidths for
#'   each grid point (should be the same length as \code{grid}).
#' @param p Nonnegative integer, specifies the order of the local polynomial used to construct point
#'   estimates. (Default is \code{2}.)
#' @param q Nonnegative integer, specifies the order of the local polynomial used to construct
#'   confidence intervals/bands (a.k.a. the bias correction order). Default is \code{p+1}. When set to be
#'   the same as \code{p}, no bias correction will be performed. Otherwise it should be
#'   strictly larger than \code{p}.
#' @param v Nonnegative integer, specifies the derivative of the distribution function to be estimated. \code{0} for
#'   the distribution function, \code{1} (default) for the density funtion, etc.
#' @param kernel String, specifies the kernel function, should be one of \code{"triangular"}, \code{"uniform"}, and
#'   \code{"epanechnikov"}.
#' @param scale Numeric, specifies how
#'   estimates are scaled. For example, setting this parameter to 0.5 will scale down both the
#'   point estimates and standard errors by half. Default is \code{1}. This parameter is useful if only
#'   part of the sample is employed for estimation, and should not be confused with \code{Cweights}
#'   or \code{Pweights}.
#' @param massPoints \code{TRUE} (default) or \code{FALSE}, specifies whether point estimates and standard errors
#'   should be adjusted if there are mass points in the data.
#' @param bwselect String, specifies the method for data-driven bandwidth selection. This option will be
#'   ignored if \code{bw} is provided. Options are (1) \code{"mse-dpi"} (default, mean squared error-optimal
#'   bandwidth selected for each grid point); (2) \code{"imse-dpi"} (integrated MSE-optimal bandwidth,
#'   common for all grid points); (3) \code{"mse-rot"} (rule-of-thumb bandwidth with Gaussian
#'   reference model); and (4) \code{"imse-rot"} (integrated rule-of-thumb bandwidth with Gaussian
#'   reference model).
#' @param stdVar \code{TRUE} (default) or \code{FALSE}, specifies whether the data should be standardized for
#'   bandwidth selection.
#' @param regularize \code{TRUE} (default) or \code{FALSE}, specifies whether the bandwidth should be
#'   regularized. When set to \code{TRUE}, the bandwidth is chosen such that the local region includes
#'   at least \code{nLocalMin} observations and at least \code{nUniqueMin} unique observations.
#' @param nLocalMin Nonnegative integer, specifies the minimum number of observations in each local neighborhood. This option
#'   will be ignored if \code{regularize=FALSE}. Default is \code{20+p+1}.
#' @param nUniqueMin Nonnegative integer, specifies the minimum number of unique observations in each local neighborhood. This option
#'   will be ignored if \code{regularize=FALSE}. Default is \code{20+p+1}.
#' @param Cweights, Numeric, specifies the weights used
#'   for counterfactual distribution construction. Should have the same length as the data.
#' @param Pweights Numeric, specifies the weights used
#'   in sampling. Should have the same length as the data.
#'
#' @return
#' \item{Estimate}{A matrix containing (1) \code{grid} (grid points), (2) \code{bw} (bandwidths),
#'   (3) \code{nh} (number of observations in each local neighborhood),
#'   (4) \code{nhu} (number of unique observations in each local neighborhood),
#'   (5) \code{f_p} (point estimates with p-th order local polynomial),
#'   (6) \code{f_q} (point estimates with q-th order local polynomial, only if option \code{q} is nonzero),
#'   (7) \code{se_p} (standard error corresponding to \code{f_p}), and (8) \code{se_q} (standard error
#'   corresponding to \code{f_q}).}
#' \item{CovMat_p}{The variance-covariance matrix corresponding to \code{f_p}.}
#' \item{CovMat_q}{The variance-covariance matrix corresponding to \code{f_q}.}
#' \item{opt}{A list containing options passed to the function.}
#'
#' @details Bias correction is only used for the construction of confidence intervals/bands, but not for point
#'   estimation. The point estimates, denoted by \code{f_p}, are constructed using local polynomial estimates
#'   of order \code{p}, while the centering of the confidence intervals/bands, denoted by \code{f_q}, are constructed
#'   using local polynomial estimates of order \code{q}. The confidence intervals/bands take the form:
#'   \code{[f_q - cv * SE(f_q) , f_q + cv * SE(f_q)]}, where \code{cv} denotes the appropriate critical value and \code{SE(f_q)}
#'   denotes an standard error estimate for the centering of the confidence interval/band. As a result,
#'   the confidence intervals/bands may not be centered at the point estimates because they have been bias-corrected.
#'   Setting \code{q} and \code{p} to be equal results on centered at the point estimate confidence intervals/bands,
#'   but requires undersmoothing for valid inference (i.e., (I)MSE-optimal bandwdith for the density point estimator
#'   cannot be used). Hence the bandwidth would need to be specified manually when \code{q=p}, and the
#'   point estimates will not be (I)MSE optimal. See Cattaneo, Jansson and Ma (2020, 2023) for details, and also
#'   Calonico, Cattaneo, and Farrell (2018, 2022) for robust bias correction methods.
#'
#'   Sometimes the density point estimates may lie outside of the confidence intervals/bands, which can happen
#'   if the underlying distribution exhibits high curvature at some evaluation point(s). One possible solution
#'   in this case is to increase the polynomial order \code{p} or to employ a smaller bandwidth.
#'
#' @references
#'   Calonico, S., M. D. Cattaneo, and M. H. Farrell. 2018. On the Effect of Bias Estimation on Coverage Accuracy in Nonparametric Inference. \emph{Journal of the American Statistical Association}, 113(522): 767-779. \doi{10.1080/01621459.2017.1285776}
#'
#'   Calonico, S., M. D. Cattaneo, and M. H. Farrell. 2022. Coverage Error Optimal Confidence Intervals for Local Polynomial Regression. \emph{Bernoulli}, 28(4): 2998-3022. \doi{10.3150/21-BEJ1445}
#'
#'   Cattaneo, M. D., M. Jansson, and X. Ma. 2020. Simple Local Polynomial Density Estimators. \emph{Journal of the American Statistical Association}, 115(531): 1449-1455. \doi{10.1080/01621459.2019.1635480}
#'
#'   Cattaneo, M. D., M. Jansson, and X. Ma. 2022. lpdensity: Local Polynomial Density Estimation and Inference. \emph{Journal of Statistical Software}, 101(2): 1–25. \doi{10.18637/jss.v101.i02}
#'
#'   Cattaneo, M. D., M. Jansson, and X. Ma. 2023. Local Regression Distribution Estimators. \emph{Journal of Econometrics}, 240(2): 105074. \doi{10.1016/j.jeconom.2021.01.006}
#'
#' @author
#' Matias D. Cattaneo, Princeton University. \email{cattaneo@princeton.edu}.
#'
#' Michael Jansson, University of California Berkeley. \email{mjansson@econ.berkeley.edu}.
#'
#' Xinwei Ma (maintainer), University of California San Diego. \email{x1ma@ucsd.edu}.
#'
#' @seealso Supported methods: \code{\link{coef.lpdensity}}, \code{\link{confint.lpdensity}}, \code{\link{plot.lpdensity}}, \code{\link{print.lpdensity}}, \code{\link{summary.lpdensity}}, \code{\link{vcov.lpdensity}}.
#'
#' @examples
#' # Generate a random sample
#' set.seed(42); X <- rnorm(2000)
#'
#' # Estimate density and report results
#' est1 <- lpdensity(data = X, bwselect = "imse-dpi")
#' summary(est1)
#'
#' # Report results for a subset of grid points
#' summary(est1, grid=est1$Estimate[4:10, "grid"])
#' summary(est1, gridIndex=4:10)
#'
#' # Report the 99% uniform confidence band
#' set.seed(42) # fix the seed for simulating critical values
#' summary(est1, alpha=0.01, CIuniform=TRUE)
#'
#' # Plot the estimates and confidence intervals
#' plot(est1, legendTitle="My Plot", legendGroups=c("X"))
#'
#' # Plot the estimates and the 99% uniform confidence band
#' set.seed(42) # fix the seed for simulating critical values
#' plot(est1, alpha=0.01, CIuniform=TRUE, legendTitle="My Plot", legendGroups=c("X"))
#'
#' # Adding a histogram to the background
#' plot(est1, legendTitle="My Plot", legendGroups=c("X"),
#'   hist=TRUE, histData=X, histBreaks=seq(-1.5, 1.5, 0.25))
#'
#' @export
lpdensity <- function(data, grid=NULL, bw=NULL, p=NULL, q=NULL, v=NULL,
                      kernel=c("triangular", "uniform", "epanechnikov"),
                      scale=NULL, massPoints=TRUE,
                      bwselect=c("mse-dpi", "imse-dpi", "mse-rot", "imse-rot"),
                      stdVar=TRUE,
                      regularize=TRUE, nLocalMin=NULL, nUniqueMin=NULL,
                      Cweights=NULL, Pweights=NULL) {
  ################################################################################
  # Input Error Handling
  ################################################################################
  # data
  data <- as.vector(data)
  if (any(is.na(data))) {
    warning(paste(sum(is.na(data)), " missing ", switch((sum(is.na(data))>1)+1, "observation is", "observations are"), " ignored.\n", sep=""))
    data <- data[!is.na(data)]
  }

  n <- length(data)
  if (!is.numeric(data) | length(data)==0) {
    stop("Data should be numeric, and cannot be empty.\n")
  }

  # grid
  if (length(grid) == 0) {
    flag_no_grid <- TRUE
    grid <- quantile(data, seq(from=0.05, to=0.95, by=0.05))
    ng <- length(grid)
  } else {
    flag_no_grid <- FALSE
    grid <- as.vector(grid)
    ng <- length(grid)
    if(!is.numeric(grid)) {
      stop("Grid points should be numeric.\n")
    }
  }

  # bw
  if (length(bw) == 0) {
    flag_no_bw <- TRUE
    if (length(bwselect) == 0) {
      bwselect <- "mse-dpi"
    } else {
      bwselect <- tolower(bwselect[1])
      # for
    }
    ### BACKWARD COMPATIBILITY FOR VERSION 0.1
    if (bwselect == "mse" ) { bwselect <- "mse-dpi" }
    if (bwselect == "imse") { bwselect <- "imse-dpi" }
    if (bwselect == "rot" ) { bwselect <- "mse-rot" }
    if (bwselect == "irot") { bwselect <- "imse-rot" }
    ### END BACKWARD COMPATIBILITY FOR VERSION 0.1

    if (!bwselect%in%c("mse-dpi", "imse-dpi", "mse-rot", "imse-rot")) stop("Incorrect bandwidth selection method specified.\n")
  } else if (length(bw) == 1) {
    if (!is.numeric(bw) | bw <= 0) {
      stop("Bandwidth incorrectly specified.\n")
    } else {
      flag_no_bw <- FALSE
      bw <- rep(bw, ng)
      bwselect <- "user provided"
    }
  } else {
    bw <- as.vector(bw)
    if (!is.numeric(bw)) {
      stop("Bandwidth incorrectly specified.\n")
    } else if (length(bw) != ng) {
      stop("Bandwidth has to be the same length as grid.\n")
    } else {
      bwselect <- "user provided"
      flag_no_bw <- FALSE
    }
  }

  # p
  if (length(p) == 0) {
    flag_no_p <- TRUE
    p <- 2
  } else if ((length(p) != 1) | !(p[1]%in%0:20)) {
    stop("Polynomial order p incorrectly specified.\n")
  } else {
    flag_no_p <- FALSE
  }

  # q
  if (length(q) == 0) {
    flag_no_q <- TRUE
    q <- p + 1
  } else if ((length(q) != 1) | !(q[1]%in%c(0:20)) | (q[1]<p)) {
    stop("Polynomial order (for bias correction) q incorrectly specified.\n")
  } else {
    flag_no_q <- FALSE
  }

  # v
  if (length(v) == 0) {
    flag_no_v <- TRUE
    v <- min(1, p)
  } else if ((length(v) != 1) | !(v[1]%in%c(0:20)) | (v[1]>p)) {
    stop("Derivative order v incorrectly specified.\n")
  } else {
    flag_no_v <- FALSE
  }

  # kernel
  if (length(kernel) == 0) {
    flag_no_kernel <- TRUE
    kernel <- "triangular"
  } else {
    kernel <- tolower(kernel)
    kernel <- kernel[1]
    if (!kernel%in%c("triangular", "uniform", "epanechnikov")) {
      stop("Kernel function incorrectly specified.\n")
    } else {
      flag_no_kernel <- FALSE
    }
  }

  # Cweights
  if (length(Cweights) == 0) {
    flag_no_Cweights <- TRUE
    Cweights <- rep(1, n)
  } else if (!is.numeric(Cweights)) {
    stop("Counterfactual weights incorrectly specified.\n")
  } else if (length(Cweights) != n) {
    stop("Counterfactual weights should have the same length as sample.\n")
  } else {
    flag_no_Cweights <- FALSE
  }

  # Pweights
  if (length(Pweights) == 0) {
    flag_no_Pweights <- TRUE
    Pweights <- rep(1, n)
  } else if (!is.numeric(Pweights)) {
    stop("Probability weights incorrectly specified.\n")
  } else if (length(Pweights) != n) {
    stop("Probability weights should have the same length as sample.\n")
  } else if (any(Pweights < 0)) {
    stop("Probability weights should be nonnegative.\n")
  } else{
    flag_no_Pweights <- FALSE
  }

  # scale
  if (length(scale) == 0) {
    flag_no_scale <- TRUE
    scale <- 1
  } else if (length(scale) != 1 | !is.numeric(scale) | scale <= 0) {
    stop("Scale incorrectly specified.\n")
  } else {
    flag_no_scale <- FALSE
  }

  # regularize
  if (length(regularize) == 0) {
    regularize <- TRUE
  } else if (length(regularize) > 1 | !regularize[1]%in%c(TRUE, FALSE)) {
    stop("Regularization parameter incorrectly specified.\n")
  }

  # massPoints
  if (length(massPoints) == 0) {
    massPoints <- TRUE
  } else if (length(massPoints) > 1 | !massPoints[1]%in%c(TRUE, FALSE)) {
    stop("Option massPoints incorrectly specified.\n")
  }

  # stdVar
  if (length(stdVar) == 0) {
    stdVar <- TRUE
  } else if (length(stdVar) > 1 | !stdVar[1]%in%c(TRUE, FALSE)) {
    stop("Option stdVar incorrectly specified.\n")
  }

  # nLocalMin
  if (length(nLocalMin) == 0) { nLocalMin <- 20 + p + 1 }
  if (!is.numeric(nLocalMin) | is.na(nLocalMin)) {
    stop("Option nLocalMin incorrectly specified.\n")
  } else if (ceiling(nLocalMin) < 0) {
    stop("Option nLocalMin incorrectly specified.\n")
  } else {
    nLocalMin <- ceiling(nLocalMin)
  }

  # nUniqueMin
  if (length(nUniqueMin) == 0) { nUniqueMin <- 20 + p + 1 }
  if (!is.numeric(nUniqueMin) | is.na(nUniqueMin)) {
    stop("Option nUniqueMin incorrectly specified.\n")
  } else if (ceiling(nUniqueMin) < 0) {
    stop("Option nUniqueMin incorrectly specified.\n")
  } else {
    nUniqueMin <- ceiling(nUniqueMin)
  }

  ################################################################################
  # Sample trimming
  ################################################################################
  trim_index <- (Pweights == 0)
  if (all(trim_index)) {
    stop("All weights are zero.\n")
  } else {
    data <- data[!trim_index]
    Cweights <- Cweights[!trim_index]
    Pweights <- Pweights[!trim_index]
  }
  if (abs(sum(Cweights * Pweights)) <= .Machine$double.eps * 10) {
    stop("Composited weights (Cweights * Pweights) are numerically zero.\n")
  }

  ################################################################################
  # Bandwidth selection
  ################################################################################

  if (flag_no_bw & bwselect[1]=="mse-rot") {
    bw <- bw_ROT( data=data, grid=grid, p=p, v=v, kernel=kernel, Cweights=Cweights, Pweights=Pweights, massPoints=massPoints, stdVar=stdVar, regularize=regularize, nLocalMin=nLocalMin, nUniqueMin=nUniqueMin)
  }
  if (flag_no_bw & bwselect[1]=="imse-rot") {
    bw <- bw_IROT(data=data, grid=grid, p=p, v=v, kernel=kernel, Cweights=Cweights, Pweights=Pweights, massPoints=massPoints, stdVar=stdVar, regularize=regularize, nLocalMin=nLocalMin, nUniqueMin=nUniqueMin)
    bw <- rep(bw, length(grid))
  }
  if (flag_no_bw & bwselect[1]=="mse-dpi") {
    bw <- bw_MSE( data=data, grid=grid, p=p, v=v, kernel=kernel, Cweights=Cweights, Pweights=Pweights, massPoints=massPoints, stdVar=stdVar, regularize=regularize, nLocalMin=nLocalMin, nUniqueMin=nUniqueMin)
  }
  if (flag_no_bw & bwselect[1]=="imse-dpi") {
    bw <- bw_IMSE(data=data, grid=grid, p=p, v=v, kernel=kernel, Cweights=Cweights, Pweights=Pweights, massPoints=massPoints, stdVar=stdVar, regularize=regularize, nLocalMin=nLocalMin, nUniqueMin=nUniqueMin)
    bw <- rep(bw, length(grid))
  }

  ################################################################################
  # Point Estimation
  ################################################################################
  Temp_Result <- lpdensity_fn(data=data, grid=grid, bw=bw, p=p, q=q, v=v, kernel=kernel,
                       Cweights=Cweights, Pweights=Pweights, massPoints=massPoints, showSE=TRUE)

  Temp_Result$Estimate[, c("f_p", "f_q", "se_p", "se_q")] <- Temp_Result$Estimate[, c("f_p", "f_q", "se_p", "se_q")] * scale
  rownames(Temp_Result$Estimate) <- 1:ng

  Temp_Result$CovMat_p <- Temp_Result$CovMat_p * scale^2
  Temp_Result$CovMat_q <- Temp_Result$CovMat_q * scale^2
  rownames(Temp_Result$CovMat_p) <- colnames(Temp_Result$CovMat_p) <- 1:ng
  rownames(Temp_Result$CovMat_q) <- colnames(Temp_Result$CovMat_q) <- 1:ng

  Result <- list(Estimate=Temp_Result$Estimate,
                 CovMat_p=Temp_Result$CovMat_p,
                 CovMat_q=Temp_Result$CovMat_q,
                 opt=list(
                   p=p, q=q, v=v, kernel=kernel, scale=scale, massPoints=massPoints, n=n, ng=ng,
                   bwselect=bwselect, stdVar=stdVar,
                   regularize=regularize, nLocalMin=nLocalMin, nUniqueMin=nUniqueMin,
                   data_min=min(data), data_max=max(data),
                   grid_min=min(grid), grid_max=max(grid)
                 ))
  class(Result) <- c("lpdensity")

  ################################################################################
  # Return
  ################################################################################
  return(Result)
}