# R/np.singleindex.R -- np: Nonparametric Kernel Smoothing Methods for Mixed Data Types

#### Documented in npindex, npindex.formula

# We implement Ichimura's single index model and Klein and Spady's
# single index model using npksum() and the nlm() minimization
# routine in R. These semiparametric models are used to reduce
# dimensionality to a one-dimensional nonparametric estimator, though
# at the potential cost of misspecification.

# Note also that we will use the so-called scale normalization, i.e.,
# that beta_1=1 (no need to estimate) which reduces search by 1
# parameter (this is obviously restricted search subject to beta_1=1).

# Define the index function model... it is a simple local constant
# estimator of y on a linear index X\beta where beta_1 is presumed to
# be 1 by restriction though, at this stage, the user may feed in any
# value they so desire.

## S3 generic for the single index model.  Dispatch is driven by the
## class of the bandwidth object 'bws':
##   - a recursive 'bws' carrying a $formula (and no explicit txdat in
##     ...) dispatches on the formula (-> npindex.formula),
##   - otherwise a $call (and no explicit txdat) dispatches on the call
##     (-> npindex.call),
##   - a non-call recursive object dispatches on 'bws' itself
##     (-> npindex.sibandwidth via its class),
##   - anything else falls through to npindex.default.
npindex <-
  function(bws, ...){
    args <- list(...)

    if (!missing(bws)){
      if (is.recursive(bws)){
        if (!is.null(bws$formula) && is.null(args$txdat))
          UseMethod("npindex",bws$formula)
        else if (!is.null(bws$call) && is.null(args$txdat))
          UseMethod("npindex",bws$call)
        else if (!is.call(bws))
          UseMethod("npindex",bws)
        else
          UseMethod("npindex",NULL)
      } else {
        ## atomic 'bws' (e.g. a raw bandwidth vector) -> default method
        UseMethod("npindex", NULL)
      }
    } else {
      ## no bandwidth object supplied -> default method computes one
      UseMethod("npindex", NULL)
    }
  }

## Formula interface: rebuild the model frame from the call stored in
## the bandwidth object, split it into response/regressors, optionally
## construct evaluation data from 'newdata', then delegate to the
## matrix/vector interface.  NA actions recorded on the model frame are
## propagated back onto the fitted components with napredict/naresid.
npindex.formula <-
  function(bws, data = NULL, newdata = NULL, y.eval = FALSE, ...){

    tt <- terms(bws)
    m <- match(c("formula", "data", "subset", "na.action"),
               names(bws$call), nomatch = 0)
    tmf <- bws$call[c(1,m)]
    tmf[[1]] <- as.name("model.frame")
    tmf[["formula"]] <- tt
    umf <- tmf <- eval(tmf, envir = environment(tt))

    tydat <- model.response(tmf)
    txdat <- tmf[, attr(attr(tmf, "terms"),"term.labels"), drop = FALSE]

    if ((has.eval <- !is.null(newdata))) {
      if (!y.eval){
        tt <- delete.response(tt)

        orig.class <- sapply(eval(attr(tt, "variables"), newdata, environment(tt)), class)

        ## delete.response clobbers predvars, which is used for timeseries
        ## objects, so we need to reconstruct it

        if(all(orig.class == "ts")){
          args <- (as.list(attr(tt, "variables"))[-1])
          attr(tt, "predvars") <-
            as.call(c(quote(as.data.frame),
                      as.call(c(quote(ts.intersect), args))))
        } else if(any(orig.class == "ts")){
          arguments <- (as.list(attr(tt, "variables"))[-1])
          arguments.normal <- arguments[which(orig.class != "ts")]
          arguments.timeseries <- arguments[which(orig.class == "ts")]

          ## restore the original column ordering after cbind-ing the
          ## time-series and ordinary variables separately
          ix <- sort(c(which(orig.class == "ts"),
                       which(orig.class != "ts")), index.return = TRUE)$ix
          attr(tt, "predvars") <-
            bquote(.(as.call(c(quote(cbind),
                               as.call(c(quote(as.data.frame),
                                         as.call(c(quote(ts.intersect),
                                                   arguments.timeseries)))),
                               arguments.normal,
                               check.rows = TRUE)))[,.(ix)])
        } else {
          attr(tt, "predvars") <- attr(tt, "variables")
        }
      }

      umf <- emf <- model.frame(tt, data = newdata)

      if (y.eval)
        eydat <- model.response(emf)

      exdat <- emf[, attr(attr(emf, "terms"),"term.labels"), drop = FALSE]
    }

    ## dispatch to the data interface, passing evaluation data only when
    ## supplied (the call is assembled textually because the argument
    ## list varies)
    ev <- eval(parse(text=paste("npindex(txdat = txdat, tydat = tydat,",
                       ifelse(has.eval,
                              paste("exdat = exdat,",
                                    ifelse(y.eval,"eydat = eydat,","")),""),
                       "bws = bws, ...)")))

    ## record and re-apply the NA action so returned components line up
    ## with the rows of the original data
    ev$omit <- attr(umf,"na.action")
    ev$rows.omit <- as.vector(ev$omit)
    ev$nobs.omit <- length(ev$rows.omit)

    ev$mean <- napredict(ev$omit, ev$mean)
    ev$merr <- napredict(ev$omit, ev$merr)

    if(ev$gradients){
      ev$grad <- napredict(ev$omit, ev$grad)
      ev$gerr <- napredict(ev$omit, ev$gerr)
    }

    if(ev$residuals){
      ev$resid <- naresid(ev$omit, ev$resid)
    }

    return(ev)
  }

## Call interface: re-evaluate the xdat/ydat expressions recorded in the
## bandwidth object's call, in that call's environment, then delegate.
npindex.call <-
  function(bws, ...) {
    npindex(txdat = eval(bws$call[["xdat"]], environment(bws$call)),
            tydat = eval(bws$call[["ydat"]], environment(bws$call)),
            bws = bws, ...)
  }

## Default method: compute (or pass through) a bandwidth object via
## npindexbw, then re-dispatch npindex on the result.
npindex.default <- function(bws, txdat, tydat, ...){
  sc <- sys.call()
  sc.names <- names(sc)

  ## here we check to see if the function was called with tdat =
  ## if it was, we need to catch that and map it to dat =
  ## otherwise the call is passed unadulterated to npudensbw

  bws.named <- any(sc.names == "bws")
  txdat.named <- any(sc.names == "txdat")
  tydat.named <- any(sc.names == "tydat")

  no.bws <- missing(bws)
  no.txdat <- missing(txdat)
  no.tydat <- missing(tydat)

  ## if bws was passed in explicitly, do not compute bandwidths

  if(txdat.named)
    txdat <- toFrame(txdat)

  ## rewrite this call into an npindexbw() call, renaming the data
  ## arguments to the names npindexbw expects
  sc.bw <- sc
  sc.bw[[1]] <- quote(npindexbw)

  if(bws.named){
    sc.bw$bandwidth.compute <- FALSE
  }

  ostxy <- c('txdat','tydat')
  nstxy <- c('xdat','ydat')

  m.txy <- match(ostxy, names(sc.bw), nomatch = 0)

  if(any(m.txy > 0)) {
    names(sc.bw)[m.txy] <- nstxy[m.txy > 0]
  }

  tbw <- eval.parent(sc.bw)

  ## convention: drop 'bws' and up to two unnamed arguments (including bws)
  if(no.bws){
    tx.str <- ",txdat = txdat"
    ty.str <- ",tydat = tydat"
  } else {
    tx.str <- ifelse(txdat.named, ",txdat = txdat","")
    ty.str <- ifelse(tydat.named, ",tydat = tydat","")
    if((!bws.named) && (!txdat.named)){
      ty.str <- ifelse(tydat.named, ",tydat = tydat",
                       ifelse(no.tydat,"",",tydat"))
    }
  }

  eval(parse(text=paste("npindex(bws = tbw", tx.str, ty.str, ",...)")))
}

npindex.sibandwidth <-
function(bws,
txdat = stop("training data 'txdat' missing"),
tydat = stop("training data 'tydat' missing"),
exdat,
eydat,
residuals = FALSE,
errors = FALSE,
boot.num = 399, ...) {

no.ex = missing(exdat)
no.ey = missing(eydat)

## if no.ex then if !no.ey then ey and tx must match, to get
## oos errors alternatively if no.ey you get is errors if
## !no.ex then if !no.ey then ey and ex must match, to get
## oos errors alternatively if no.ey you get NO errors since we
## don't evaluate on the training data

txdat = toFrame(txdat)

if (!(is.vector(tydat) | is.factor(tydat)))
stop("'tydat' must be a vector or a factor")

tydat =
if (is.factor(tydat))
as.numeric(levels(tydat))[as.integer(tydat)]
else
as.double(tydat)

if (!no.ex){
exdat = toFrame(exdat)

if (! txdat %~% exdat )
stop("'txdat' and 'exdat' are not similar data frames!")

if (!no.ey){
if (dim(exdat)[1] != length(eydat))
stop("number of evaluation data 'exdat' and dependent data 'eydat' do not match")
}

} else if(!no.ey) {
if (dim(txdat)[1] != length(eydat))
stop("number of training data 'txdat' and dependent data 'eydat' do not match")
}

## catch and destroy NA's
goodrows = 1:dim(txdat)[1]
rows.omit = attr(na.omit(data.frame(txdat,tydat)), "na.action")
goodrows[rows.omit] = 0

if (all(goodrows==0))
stop("Training data has no rows without NAs")

txdat = txdat[goodrows,,drop = FALSE]
tydat = tydat[goodrows]

if (!no.ex){
goodrows = 1:dim(exdat)[1]
rows.omit = eval(parse(text=paste('attr(na.omit(data.frame(exdat',
ifelse(no.ey,"",",eydat"),')), "na.action")')))

goodrows[rows.omit] = 0

exdat = exdat[goodrows,,drop = FALSE]
if (!no.ey)
eydat = eydat[goodrows]

if (all(goodrows==0))
stop("Evaluation data has no rows without NAs")
}

## convert tydat, eydat to numeric, from a factor with levels from the y-data
## used during bandwidth selection.

if (is.factor(tydat)){
tydat <- adjustLevels(data.frame(tydat), bws$ydati)[,1] tydat <- (bws$ydati$all.dlev[[1]])[as.integer(tydat)] } else tydat <- as.double(tydat) if (no.ey) eydat <- double() else { if (is.factor(eydat)){ eydat <- adjustLevels(data.frame(eydat), bws$ydati)[,1]
eydat <- (bws$ydati$all.dlev[[1]])[as.integer(eydat)]
}
else
eydat <- as.double(eydat)
}

## re-assign levels in training and evaluation data to ensure correct
## conversion to numeric type.

txdat <- adjustLevels(txdat, bws$xdati) if (!no.ex) exdat <- adjustLevels(exdat, bws$xdati)

## grab the evaluation data before it is converted to numeric
if(no.ex)
teval <- txdat
else
teval <- exdat

## put the unordered, ordered, and continuous data in their own objects
## data that is not a factor is continuous.

txdat = toMatrix(txdat)

if (!no.ex){
exdat = toMatrix(exdat)
}

## from this point on txdat and exdat have been recast as matrices

## First, create the scalar index (n \times 1 vector)

index <- txdat %*% bws$beta if(no.ex) { index.eval <- index exdat <- txdat eydat <- tydat } else { index.eval <- exdat %*% bws$beta
}

## Next, if no gradients are requested, use (faster) npksum

tww <- npksum(txdat=as.matrix(txdat) %*% as.matrix(bws$beta), tydat=as.matrix(data.frame(tydat,1)), weights=as.matrix(data.frame(tydat,1)), exdat=as.matrix(exdat) %*% as.matrix(bws$beta),
bws=bws$bw, ckertype = bws$ckertype,
ckerorder = bws$ckerorder)$ksum

index.mean <- tww[1,2,]/NZD(tww[2,2,])

if(!no.ex & (no.ey | residuals)){

## want to evaluate on training data for in sample errors even
## if evaluation x's are different from training but no y's
## are specified

tww <- npksum(txdat=as.matrix(txdat) %*% as.matrix(bws$beta), tydat=as.matrix(data.frame(tydat,1)), weights=as.matrix(data.frame(tydat,1)), bws=bws$bw,
ckertype = bws$ckertype, ckerorder = bws$ckerorder)$ksum index.tmean <- tww[1,2,]/NZD(tww[2,2,]) } } else if(gradients==TRUE) { model <- npreg(txdat=index, tydat=tydat, exdat=index.eval, bws=bws$bw,
ckertype = bws$ckertype, ckerorder = bws$ckerorder,
regtype="lc",

index.mean <- model$mean ## index.grad is a matrix, one column for each variable, each ## equal to its coefficient beta_i times the first derivative of ## the local-constant model index.grad <- as.matrix(model$grad)%*%t(as.vector(bws$beta)) if(!no.ex & (no.ey | residuals)){ ## Want to evaluate on training data for in sample errors even ## if evaluation x's are different from training but no y's ## are specified. Also, needed for variance-covariance matrix ## (uses on ly the training data) model <- npreg(txdat=index, tydat=tydat, bws=bws$bw,
ckertype = bws$ckertype, ckerorder = bws$ckerorder,
regtype="lc",

index.tmean <- model$mean index.tgrad <- model$grad

}

}

if (no.ex) {
index.tmean <- index.mean
}

}

## 5/3/2010, jracine, added vcov methods... thanks to Juan Carlos
## Escanciano <jescanci@indiana.edu> for pushing me on this for
## (training X) - need gradients == TRUE in order for this to
## work.

if(bws$method == "ichimura" & gradients == TRUE) { ## First row & column of covariance matrix `Bvcov' are zero due ## to identification condition that beta_1=0. Note the n n^{-1} ## n in V^{-1}\Sigma V^{-1} and the \sqrt{n} in the ## normalization of \hat\beta will cancel. q <- ncol(txdat) Bvcov <- matrix(0,q,q) dimnames(Bvcov) <- list(bws$xnames,bws$xnames) ## Use the weight matrix so we can compute all expectations with ## only one call to npksum (the kernel arguments x\beta do not ## change, only the j for X_{ij} in E(X_{ij}|X_i'\beta) W <- txdat[,-1,drop=FALSE] tyindex <- npksum(txdat = index, tydat = rep(1,length(tydat)), weights = W, bws = bws$bw,
ckertype = bws$ckertype, ckerorder = bws$ckerorder)$ksum tindex <- npksum(txdat = index, bws = bws$bw,
ckertype = bws$ckertype, ckerorder = bws$ckerorder)$ksum ## Need to trap case where k-1=1... ksum will return a 1 D ## array, need a 1 x n matrix if(length(dim(tyindex))==1) tyindex <- matrix(tyindex,nrow=1,ncol=dim(tyindex)) ## xmex = X_i-\hat E(X_i|X_i'\beta), dimension k\times n. xmex <- sapply(1:length(tydat),function(i){W[i,]-tyindex[,i]/tindex[i]}) ## Need to trap case where k-1=1..., sapply will return a ## vector, need a 1 x n matrix if(is.vector(xmex)) { dg.db.xmex <- matrix(index.tgrad[,1]*xmex,nrow=1,ncol=length(xmex)) } else { dg.db.xmex <- index.tgrad[,1]*xmex } uhat <- tydat - index.tmean ## Training y and training mean Vinv <- chol2inv(chol(dg.db.xmex%*%t(dg.db.xmex))) Sigma <- (uhat*dg.db.xmex)%*%t(uhat*dg.db.xmex) Bvcov[-1,-1] <- Vinv %*% Sigma %*% Vinv dimnames(Bvcov) <- list(bws$xnames,bws$xnames) ## Now export this in an S3 method... } else if(bws$method == "kleinspady" & gradients == TRUE) {

## We divide by P(1-P) so test for P=0 or 1...

keep <- which(index.tmean < 1 & index.tmean > 0)

## First row & column of covariance matrix are zero due to
## identification condition that beta_1=0. Note the n^{-1} in
## the E and the \sqrt{n} in the normalization of \hat\beta will
## cancel.

q <- ncol(txdat)
Bvcov <- matrix(0,q,q)
Bvcov[-1,-1] <- chol2inv(chol(t(dg.db[keep,])%*%(dg.db[keep,]/(index.tmean[keep]*
(1-index.tmean[keep])))))

dimnames(Bvcov) <- list(bws$xnames,bws$xnames)

## Now export this in an S3 method...

}

boofun = function(data, indices){
rindex <- txdat[indices,] %*% bws$beta model <- npreg(regtype = 'lc', gradients = TRUE, txdat = rindex, tydat = tydat[indices], exdat = index.eval, bws = bws$bw,
ckertype = bws$ckertype, ckerorder = bws$ckerorder)[c('mean','grad')]

c(model$mean, model$grad, mean(model$grad)) } } else { boofun = function(data, indices){ rindex = txdat[indices,] %*% bws$beta
tww <- npksum(txdat = rindex,
tydat = cbind(tydat[indices],1),
weights = cbind(tydat[indices],1),
exdat = index.eval,
bws = bws$bw, ckertype = bws$ckertype,
ckerorder = bws$ckerorder)$ksum

tww[1,2,]/NZD(tww[2,2,])

}
}

if (errors){

boot.out = suppressWarnings(boot(data.frame(txdat,tydat), boofun, R = boot.num))

index.merr = matrix(data = 0, ncol = 1, nrow = length(index.eval))
index.merr[,] = sqrt(diag(cov(boot.out$,1:length(index.eval)]))) if (gradients) { index.gerr = matrix(data = 0, ncol = ncol(txdat), nrow = length(index.eval)) index.gerr[,] = sqrt(diag(cov(boot.out$,(length(index.eval)+1):(2*length(index.eval))])))

for (i in 1:ncol(txdat))
index.gerr[,i] = abs(bws$beta[i])*index.gerr[,i] index.mgerr = sd(boot.out$,2*length(index.eval)+1])
index.mgerr = abs(bws$beta)*index.mgerr } } ## goodness of fit if(bws$method == "ichimura") {
if (!no.ey) {
RSQ = RSQfunc(eydat,index.mean)
MSE = MSEfunc(eydat,index.mean)
MAE = MAEfunc(eydat,index.mean)
MAPE = MAPEfunc(eydat,index.mean)
CORR = CORRfunc(eydat,index.mean)
SIGN = SIGNfunc(eydat,index.mean)
} else {
RSQ = RSQfunc(tydat,index.tmean)
MSE = MSEfunc(tydat,index.tmean)
MAE = MAEfunc(tydat,index.tmean)
MAPE = MAPEfunc(tydat,index.tmean)
CORR = CORRfunc(tydat,index.tmean)
SIGN = SIGNfunc(tydat,index.tmean)
}
strgof = "xtra=c(RSQ,MSE,MAE,MAPE,CORR,SIGN),"
strres = ifelse(residuals, "resid = tydat - index.tmean,","")
} else if(bws\$method == "kleinspady") {
index.pred =
if (!no.ey) round(index.mean)
else round(index.tmean)

confusion.matrix =
table(if (!no.ey) eydat else tydat,
index.pred, dnn=c("Actual", "Predicted"))

CCR.overall <- sum(diag(confusion.matrix))/sum(confusion.matrix)
CCR.byoutcome <- diag(confusion.matrix)/rowSums(confusion.matrix)

strgof = "confusion.matrix = confusion.matrix, CCR.overall = CCR.overall,
strres = ""
}

eval(parse(text=paste(
"singleindex(bws = bws, index = index.eval, mean = index.mean,",
ifelse(errors,"merr = index.merr,",""),