| crs | R Documentation | 
crs computes a regression spline estimate of a
one (1) dimensional dependent variable on an r-dimensional
vector of continuous and categorical
(factor/ordered) predictors (Ma and
Racine (2013), Ma, Racine and Yang (2015)).
crs(...)
## Default S3 method:
crs(xz,
    y,
    degree = NULL,
    segments = NULL,
    include = NULL,
    kernel = TRUE,
    lambda = NULL,
    complexity = c("degree-knots","degree","knots"),
    knots = c("quantiles","uniform","auto"),
    basis = c("auto","additive","tensor","glp"),
    deriv = 0,
    data.return = FALSE,
    prune = FALSE,
    model.return = FALSE,
    tau = NULL,
    weights = NULL,
    ...)
## S3 method for class 'formula'
crs(formula,
    data = list(),
    degree = NULL,
    segments = NULL,
    include = NULL,
    degree.max = 10, 
    segments.max = 10, 
    degree.min = 0, 
    segments.min = 1, 
    cv.df.min = 1,
    cv = c("nomad","exhaustive","none"),
    cv.threshold = 1000,
    cv.func = c("cv.ls","cv.gcv","cv.aic"),
    kernel = TRUE,
    lambda = NULL,
    lambda.discrete = FALSE,
    lambda.discrete.num = 100,
    complexity = c("degree-knots","degree","knots"),
    knots = c("quantiles","uniform","auto"),
    basis = c("auto","additive","tensor","glp"),
    deriv = 0,
    data.return = FALSE,
    prune = FALSE,
    model.return = FALSE,
    restarts = 0,
    random.seed = 42,
    max.bb.eval = 10000,
    initial.mesh.size.real = "r1.0e-01",
    initial.mesh.size.integer = "1",
    min.mesh.size.real = paste(sqrt(.Machine$double.eps)),
    min.mesh.size.integer = 1, 
    min.poll.size.real = 1,
    min.poll.size.integer = 1, 
    opts=list(),
    nmulti = 5,
    tau = NULL,
    weights = NULL,
    singular.ok = FALSE,
    ...)
xz | 
  numeric (  | 
y | 
 a numeric vector of responses.  | 
degree | 
  integer/vector specifying the polynomial degree of the
B-spline basis for each dimension of the continuous   | 
segments | 
  integer/vector specifying the number of segments of the
B-spline basis for each dimension of the continuous   | 
include | 
  integer/vector specifying whether each of the
nominal/ordinal (  | 
lambda | 
  a vector of bandwidths for each dimension of the
categorical   | 
lambda.discrete | 
  if   | 
lambda.discrete.num | 
 a positive integer indicating the number of
discrete values that lambda can assume - this parameter will only be
used when   | 
formula | 
 a symbolic description of the model to be fit  | 
data | 
 an optional data frame containing the variables in the model  | 
cv | 
  a character string (default   | 
cv.threshold | 
  an integer (default   | 
cv.func | 
 a character string (default   | 
kernel | 
  a logical value (default   | 
degree.max | 
  the maximum degree of the B-spline basis for
each of the continuous predictors (default   | 
segments.max | 
  the maximum segments of the B-spline basis for
each of the continuous predictors (default   | 
degree.min | 
  the minimum degree of the B-spline basis for
each of the continuous predictors (default   | 
segments.min | 
  the minimum segments of the B-spline basis for
each of the continuous predictors (default   | 
cv.df.min | 
  the minimum degrees of freedom to allow when
conducting NOMAD-based cross-validation (default
  | 
complexity | 
 a character string (default
 For the continuous predictors the regression spline model employs
either the additive or tensor product B-spline basis matrix for a
multivariate polynomial spline via the B-spline routines in the GNU
Scientific Library (https://www.gnu.org/software/gsl/) and the
  | 
knots | 
  a character string (default   | 
basis | 
  a character string (default   | 
deriv | 
  an integer   | 
data.return | 
  a logical value indicating whether to return
  | 
prune | 
  a logical value (default   | 
model.return | 
  a logical value indicating whether to return the
list of   | 
restarts | 
 integer specifying the number of times to restart the process of finding extrema of the cross-validation function (for the bandwidths only) from different (random) initial points  | 
random.seed | 
  when it is not missing and not equal to 0, the
initial points will be generated using this seed when using
  | 
max.bb.eval | 
 argument passed to the NOMAD solver (see   | 
initial.mesh.size.real | 
 argument passed to the NOMAD solver (see   | 
initial.mesh.size.integer | 
 argument passed to the NOMAD solver (see   | 
min.mesh.size.real | 
 argument passed to the NOMAD solver (see   | 
min.mesh.size.integer | 
 arguments passed to the NOMAD solver (see   | 
min.poll.size.real | 
 arguments passed to the NOMAD solver (see   | 
min.poll.size.integer | 
 arguments passed to the NOMAD solver (see   | 
opts | 
  list of optional arguments to be passed to
  | 
nmulti | 
 integer number of times to restart the process of finding extrema of
the cross-validation function from different (random) initial
points (default   | 
tau | 
 if non-null a number in (0,1) denoting the quantile for which a quantile
regression spline is to be estimated rather than estimating the
conditional mean (default   | 
weights | 
 an optional vector of weights to be used in the fitting process. Should be ‘NULL’ or a numeric vector. If non-NULL, weighted least squares is used with weights ‘weights’ (that is, minimizing ‘sum(w*e^2)’); otherwise ordinary least squares is used.  | 
singular.ok | 
 a logical value (default   | 
... | 
 optional arguments  | 
Typical usages are (see below for a list of options and also the examples at the end of this help file)
    ## Estimate the model and let the basis type be determined by
    ## cross-validation (i.e. cross-validation will determine whether to
    ## use the additive, generalized, or tensor product basis)
    
    model <- crs(y~x1+x2)
    
    ## Estimate the model for a specified degree/segment/bandwidth
    ## combination and do not run cross-validation (will use the
    ## additive basis by default)
    
    model <- crs(y~x1+factor(x2),cv="none",degree=3,segments=1,lambda=.1)
    
    ## Plot the mean and (asymptotic) error bounds
    plot(model,mean=TRUE,ci=TRUE)
    ## Plot the first partial derivative and (asymptotic) error bounds
    plot(model,deriv=1,ci=TRUE)    
    
crs computes a regression spline estimate of a one (1)
dimensional dependent variable on an r-dimensional vector of
continuous and categorical
(factor/ordered) predictors.
The regression spline model employs the tensor product B-spline basis
matrix for a multivariate polynomial spline via the B-spline routines
in the GNU Scientific Library (https://www.gnu.org/software/gsl/)
and the tensor.prod.model.matrix function.
When basis="additive" the model becomes additive in nature
(i.e. no interaction/tensor terms thus semiparametric not fully
nonparametric).
When basis="tensor" the model uses the multivariate tensor
product basis.
When kernel=FALSE the model uses indicator basis functions for
the nominal/ordinal (factor/ordered)
predictors rather than kernel weighting.
When kernel=TRUE the product kernel function for the discrete
predictors is of the ‘Li-Racine’ type (see Li and Racine (2007)
for details).
When cv="nomad", numerical search is undertaken using Nonsmooth
Optimization by Mesh Adaptive Direct Search (Abramson, Audet, Couture,
Dennis, Jr., and Le Digabel (2011)).
When kernel=TRUE and cv="exhaustive", numerical search
is undertaken using optim and the box-constrained
L-BFGS-B method (see optim for details). The user
may restart the algorithm as many times as desired via the
restarts argument (default restarts=0). The approach
ascends from degree=0 (or segments=0) through
degree.max and for each value of degree (or
segments) searches for the optimal bandwidths. After the most
complex model has been searched then the optimal
degree/segments/lambda combination is
selected. If any element of the optimal degree (or
segments) vector coincides with degree.max (or segments.max) a warning
is produced and the user ought to restart their search with a larger
value of degree.max (or segments.max).
Note that the default plot method for a crs object
provides some diagnostic measures, in particular, a) residuals versus
fitted values (useful for checking the assumption that
E(u|x)=0), b) a normal quantile-quantile plot which allows
residuals to be assessed for normality (qqnorm), c) a
scale-location plot that is useful for checking the assumption that
the errors are iid and, in particular, that the variance is
homogeneous, and d) ‘Cook's distance’ which computes the
single-case influence function. See below for other arguments for the
plot function for a crs object.
Note that setting prune=TRUE produces a final ‘pruning’
of the model via a stepwise cross-validation criterion achieved by
modifying stepAIC and replacing extractAIC
with extractCV throughout the function. This option may be
enabled to remove potentially superfluous bases thereby improving the
finite-sample efficiency of the resulting model.  Note that if the
cross-validation score for the pruned model is no better than that for
the original model then the original model is returned with a warning
to this effect. Note also that this option can only be used when
kernel=FALSE.
crs returns a crs object.  The generic functions
fitted and residuals extract (or
generate) estimated values and residuals. Furthermore, the functions
summary, predict, and plot
(options mean=FALSE, deriv=i where i is an
integer, ci=FALSE, persp.rgl=FALSE,
plot.behavior=c("plot","plot-data","data"),
xtrim=0.0,xq=0.5) support objects of this type. The
returned object has the following components:
fitted.values | 
 estimates of the regression function (conditional mean) at the sample points or evaluation points  | 
lwr, upr | 
  lower/upper bound for a 95% confidence interval for
the   | 
residuals | 
 residuals computed at the sample points or evaluation points  | 
degree | 
  integer/vector specifying the degree of the B-spline
basis for each dimension of the continuous   | 
segments | 
  integer/vector specifying the number of segments of
the B-spline basis for each dimension of the continuous   | 
include | 
  integer/vector specifying whether each of the
nominal/ordinal (  | 
kernel | 
  a logical value indicating whether kernel smoothing was
used (  | 
lambda | 
  vector of bandwidths used if   | 
call | 
 a symbolic description of the model  | 
r.squared | 
 coefficient of determination (Doksum and Samarov (1995))  | 
model.lm | 
  an object of ‘  | 
deriv.mat | 
  a matrix of derivatives (or differences in levels
for the categorical   | 
deriv.mat.lwr | 
  a matrix of 95% coverage lower bounds for
  | 
deriv.mat.upr | 
  a matrix of 95% coverage upper bounds for
  | 
hatvalues | 
  the   | 
P.hat | 
 the kernel probability estimates corresponding to the categorical predictors in the estimated model  | 
Note that when kernel=FALSE summary supports the
option sigtest=TRUE that conducts an F-test for significance
for each predictor.
Jeffrey S. Racine racinej@mcmaster.ca
Abramson, M.A. and C. Audet and G. Couture and J.E. Dennis Jr. and and S. Le Digabel (2011), “The NOMAD project”. Software available at https://www.gerad.ca/nomad.
Craven, P. and G. Wahba (1979), “Smoothing Noisy Data With Spline Functions,” Numerische Mathematik, 13, 377-403.
Doksum, K. and A. Samarov (1995), “Nonparametric Estimation of Global Functionals and a Measure of the Explanatory Power of Covariates in Regression,” The Annals of Statistics, 23 1443-1473.
Hurvich, C.M. and J.S. Simonoff and C.L. Tsai (1998), “Smoothing Parameter Selection in Nonparametric Regression Using an Improved Akaike Information Criterion,” Journal of the Royal Statistical Society B, 60, 271-293.
Le Digabel, S. (2011), “Algorithm 909: NOMAD: Nonlinear Optimization With The MADS Algorithm”. ACM Transactions on Mathematical Software, 37(4):44:1-44:15.
Li, Q. and J.S. Racine (2007), Nonparametric Econometrics: Theory and Practice, Princeton University Press.
Ma, S. and J.S. Racine and L. Yang (2015), “Spline Regression in the Presence of Categorical Predictors,” Journal of Applied Econometrics, Volume 30, 705-717.
Ma, S. and J.S. Racine (2013), “Additive Regression Splines with Irrelevant Categorical and Continuous Regressors,” Statistica Sinica, Volume 23, 515-541.
Racine, J.S. (2011), “Cross-Validated Quantile Regression Splines,” manuscript.
smooth.spline, loess, npreg
set.seed(42)
## Example - simulated data
n <- 1000
num.eval <- 50
x1 <- runif(n)
x2 <- runif(n)
z <- rbinom(n,1,.5)
dgp <- cos(2*pi*x1)+sin(2*pi*x2)+z
z <- factor(z)
y <- dgp + rnorm(n,sd=.5)
## Estimate a model with specified degree, segments, and bandwidth
model <- crs(y~x1+x2+z,degree=c(5,5),
                       segments=c(1,1),
                       lambda=0.1,
                       cv="none",
                       kernel=TRUE)
summary(model)
## Perspective plot
x1.seq <- seq(min(x1),max(x1),length=num.eval)
x2.seq <- seq(min(x2),max(x2),length=num.eval)
x.grid <- expand.grid(x1.seq,x2.seq)
newdata <- data.frame(x1=x.grid[,1],x2=x.grid[,2],
                      z=factor(rep(0,num.eval**2),levels=c(0,1)))
z0 <- matrix(predict(model,newdata=newdata),num.eval,num.eval)
newdata <- data.frame(x1=x.grid[,1],x2=x.grid[,2],
                      z=factor(rep(1,num.eval),levels=c(0,1)))
z1 <- matrix(predict(model,newdata=newdata),num.eval,num.eval)
zlim=c(min(z0,z1),max(z0,z1))
persp(x=x1.seq,y=x2.seq,z=z0,
      xlab="x1",ylab="x2",zlab="y",zlim=zlim,
      ticktype="detailed",      
      border="red",
      theta=45,phi=45)
par(new=TRUE)
persp(x=x1.seq,y=x2.seq,z=z1,
      xlab="x1",ylab="x2",zlab="y",zlim=zlim,
      theta=45,phi=45,
      ticktype="detailed",
      border="blue")
## Partial regression surface plot
plot(model,mean=TRUE,ci=TRUE)
## Not run: 
## A plot example where we extract the partial surfaces, confidence
## intervals etc. automatically generated by plot(mean=TRUE,...) but do
## not plot, rather save for separate use.
pdat <- plot(model,mean=TRUE,ci=TRUE,plot.behavior="data")
## Column 1 is the (evaluation) predictor ([,1]), 2-4 ([,-1]) the mean,
## lwr, and upr (note the returned value is a 'list' hence pdat[[1]] is
## data for the first predictor, pdat[[2]] the second etc). Note that
## matplot() can plot this nicely.
matplot(pdat[[1]][,1],pdat[[1]][,-1],
        xlab=names(pdat[[1]][1]),ylab=names(pdat[[1]][2]),
        lty=c(1,2,2),col=c(1,2,2),type="l")
## End(Not run)
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.