# R/np.singleindex.R
#
# Defines functions: npindex, npindex.formula
# Documented in: npindex, npindex.formula

# We implement Ichimura's single index model and Klein and Spady's
# single index model using npksum() and the nlm() minimization
# routine in R. These semiparametric models are used to reduce
# dimensionality to a one-dimensional nonparametric estimator, though
# at the potential cost of misspecification.

# Note also that we will use the so-called scale normalization, i.e.,
# that beta_1=1 (no need to estimate) which reduces search by 1
# parameter (this is obviously restricted search subject to beta_1=1).

# Define the index function model... it is a simple local constant
# estimator of y on a linear index X\beta where beta_1 is presumed to
# be 1 by restriction though, at this stage, the user may feed in any
# value they so desire.

npindex <-
  function(bws, ...){
    args <- list(...)

    ## Choose the object S3 dispatch keys on.  A recursive bandwidth
    ## object supplied without explicit training data dispatches on its
    ## stored formula (-> npindex.formula) or call (-> npindex.call);
    ## otherwise we dispatch on the object itself unless it is a bare
    ## call.  In every remaining situation we dispatch on NULL, which
    ## routes to npindex.default.
    dispatch.object <- NULL

    if (!missing(bws) && is.recursive(bws)) {
      if (!is.null(bws$formula) && is.null(args$txdat)) {
        dispatch.object <- bws$formula
      } else if (!is.null(bws$call) && is.null(args$txdat)) {
        dispatch.object <- bws$call
      } else if (!is.call(bws)) {
        dispatch.object <- bws
      }
    }

    UseMethod("npindex", dispatch.object)
  }

npindex.formula <-
    function(bws, data = NULL, newdata = NULL, y.eval = FALSE, ...){

        ## Formula method: rebuild the model frame used at
        ## bandwidth-selection time by re-evaluating the relevant pieces
        ## of the original call (formula/data/subset/na.action) in the
        ## environment of the stored terms object.
        tt <- terms(bws)
        m <- match(c("formula", "data", "subset", "na.action"),
                   names(bws$call), nomatch = 0)
        tmf <- bws$call[c(1,m)]
        tmf[[1]] <- as.name("model.frame")
        tmf[["formula"]] <- tt
        umf <- tmf <- eval(tmf, envir = environment(tt))

        ## Training response and regressors extracted from the frame.
        tydat <- model.response(tmf)
        txdat <- tmf[, attr(attr(tmf, "terms"),"term.labels"), drop = FALSE]

        if ((has.eval <- !is.null(newdata))) {
          if (!y.eval){
            ## Evaluation data without a response: drop the response
            ## from the terms before building the evaluation frame.
            tt <- delete.response(tt)

            orig.class <- sapply(eval(attr(tt, "variables"), newdata, environment(tt)),class)
            
            ## delete.response clobbers predvars, which is used for timeseries objects
            ## so we need to reconstruct it

            if(all(orig.class == "ts")){
              ## All variables are time series: align them with
              ## ts.intersect and coerce to a data frame.
              args <- (as.list(attr(tt, "variables"))[-1])
              attr(tt, "predvars") <- as.call(c(quote(as.data.frame),as.call(c(quote(ts.intersect), args))))
            }else if(any(orig.class == "ts")){
              ## Mixed ts / non-ts variables: align the ts block, cbind
              ## the rest, then reorder columns back to the original
              ## variable order via the index vector ix.
              arguments <- (as.list(attr(tt, "variables"))[-1])
              arguments.normal <- arguments[which(orig.class != "ts")]
              arguments.timeseries <- arguments[which(orig.class == "ts")]

              ix <- sort(c(which(orig.class == "ts"),which(orig.class != "ts")),index.return = TRUE)$ix
              attr(tt, "predvars") <- bquote(.(as.call(c(quote(cbind),as.call(c(quote(as.data.frame),as.call(c(quote(ts.intersect), arguments.timeseries)))),arguments.normal,check.rows = TRUE)))[,.(ix)])
            }else{
              attr(tt, "predvars") <- attr(tt, "variables")
            }
          }
          
          umf <- emf <- model.frame(tt, data = newdata)

          if (y.eval)
            eydat <- model.response(emf)
          
          exdat <- emf[, attr(attr(emf, "terms"),"term.labels"), drop = FALSE]
        }

        ## Assemble the npindex() call textually so that exdat/eydat are
        ## only passed when evaluation data were actually supplied.
        ev <-
            eval(parse(text=paste("npindex(txdat = txdat, tydat = tydat,",
                           ifelse(has.eval,paste("exdat = exdat,",ifelse(y.eval,"eydat = eydat,","")),""),
                           "bws = bws, ...)")))

        ## Record NA-handling information and pad fitted quantities back
        ## to the length of the original (pre-na.action) data.
        ev$omit <- attr(umf,"na.action")
        ev$rows.omit <- as.vector(ev$omit)
        ev$nobs.omit <- length(ev$rows.omit)

        ev$mean <- napredict(ev$omit, ev$mean)
        ev$merr <- napredict(ev$omit, ev$merr)

        if(ev$gradients){
            ev$grad <- napredict(ev$omit, ev$grad)
            ev$gerr <- napredict(ev$omit, ev$gerr)
        }

        if(ev$residuals){
            ev$resid <- naresid(ev$omit, ev$resid)
        }    
        return(ev)
    }

npindex.call <-
  function(bws, ...) {
    ## Reconstruct the training data by re-evaluating the xdat/ydat
    ## expressions recorded in the bandwidth object's original call, in
    ## the environment where that call was made, then re-dispatch.
    call.env <- environment(bws$call)
    txdat <- eval(bws$call[["xdat"]], call.env)
    tydat <- eval(bws$call[["ydat"]], call.env)
    npindex(txdat = txdat, tydat = tydat, bws = bws, ...)
  }

npindex.default <- function(bws, txdat, tydat, ...){
  ## Default method: when no precomputed bandwidth object is supplied,
  ## rewrite this very call into an npindexbw() call to obtain one,
  ## then re-invoke npindex() with the resulting bandwidth object.
  sc <- sys.call()
  sc.names <- names(sc)

  ## here we check to see if the function was called with tdat =
  ## if it was, we need to catch that and map it to dat =
  ## otherwise the call is passed unadulterated to npindexbw

  bws.named <- any(sc.names == "bws")
  txdat.named <- any(sc.names == "txdat")
  tydat.named <- any(sc.names == "tydat")

  no.bws <- missing(bws)
  no.txdat <- missing(txdat)
  no.tydat <- missing(tydat)

  ## if bws was passed in explicitly, do not compute bandwidths

  if(txdat.named)
    txdat <- toFrame(txdat)

  ## Copy the call and retarget it at npindexbw.
  sc.bw <- sc
  
  sc.bw[[1]] <- quote(npindexbw)

  if(bws.named){
    sc.bw$bandwidth.compute <- FALSE
  }

  ## Rename txdat/tydat to the xdat/ydat argument names npindexbw expects.
  ostxy <- c('txdat','tydat')
  nstxy <- c('xdat','ydat')
  
  m.txy <- match(ostxy, names(sc.bw), nomatch = 0)

  if(any(m.txy > 0)) {
    names(sc.bw)[m.txy] <- nstxy[m.txy > 0]
  }
    
  ## Evaluate the rewritten call in the caller's frame so that the
  ## data expressions resolve where the user wrote them.
  tbw <- eval.parent(sc.bw)

  ## convention: drop 'bws' and up to two unnamed arguments (including bws)
  if(no.bws){
    tx.str <- ",txdat = txdat"
    ty.str <- ",tydat = tydat"
  } else {
    tx.str <- ifelse(txdat.named, ",txdat = txdat","")
    ty.str <- ifelse(tydat.named, ",tydat = tydat","")
    if((!bws.named) && (!txdat.named)){
      ## bws and txdat were positional: a positionally supplied tydat
      ## must be forwarded positionally as well.
      ty.str <- ifelse(tydat.named, ",tydat = tydat",
                       ifelse(no.tydat,"",",tydat"))
    }
  }

  ## Re-enter the generic with the freshly computed bandwidth object.
  eval(parse(text=paste("npindex(bws = tbw", tx.str, ty.str, ",...)")))
}

npindex.sibandwidth <-
  function(bws,
           txdat = stop("training data 'txdat' missing"),
           tydat = stop("training data 'tydat' missing"),
           exdat,
           eydat,
           gradients = FALSE,
           residuals = FALSE,
           errors = FALSE,
           boot.num = 399, ...) {

    ## Workhorse method: given a sibandwidth object (containing the
    ## index coefficients bws$beta and the scalar bandwidth bws$bw),
    ## compute the single-index fit as a local-constant regression of y
    ## on the scalar index x'beta.  Optionally computes gradients,
    ## residuals, and bootstrapped standard errors (boot.num
    ## replications).

    no.ex = missing(exdat)
    no.ey = missing(eydat)

    ## if no.ex then if !no.ey then ey and tx must match, to get
    ## oos errors alternatively if no.ey you get is errors if
    ## !no.ex then if !no.ey then ey and ex must match, to get
    ## oos errors alternatively if no.ey you get NO errors since we
    ## don't evaluate on the training data

    txdat = toFrame(txdat)

    if (!(is.vector(tydat) | is.factor(tydat)))
      stop("'tydat' must be a vector or a factor")

    ## Coerce the response to double (factor levels are interpreted as
    ## their numeric values, e.g. a 0/1 outcome for Klein-Spady).
    tydat =
      if (is.factor(tydat))
        as.numeric(levels(tydat))[as.integer(tydat)]
      else
        as.double(tydat)

    if (!no.ex){
      exdat = toFrame(exdat)

      ## %~% presumably tests that the two frames have compatible
      ## columns/types -- TODO confirm against the np internals.
      if (! txdat %~% exdat )
        stop("'txdat' and 'exdat' are not similar data frames!")

      if (!no.ey){
        if (dim(exdat)[1] != length(eydat))
          stop("number of evaluation data 'exdat' and dependent data 'eydat' do not match")
      }

    } else if(!no.ey) {
      if (dim(txdat)[1] != length(eydat))
        stop("number of training data 'txdat' and dependent data 'eydat' do not match")
    }

    ## catch and destroy NA's
    ## Rows with any NA in (txdat, tydat) are flagged by setting their
    ## index to 0; subsetting by goodrows then drops them (index 0
    ## selects nothing in R).
    goodrows = 1:dim(txdat)[1]
    rows.omit = attr(na.omit(data.frame(txdat,tydat)), "na.action")
    goodrows[rows.omit] = 0

    if (all(goodrows==0))
      stop("Training data has no rows without NAs")

    txdat = txdat[goodrows,,drop = FALSE]
    tydat = tydat[goodrows]

    if (!no.ex){
      ## Same NA sweep for the evaluation data (including eydat when
      ## present); the expression is assembled textually so eydat is
      ## only referenced when it exists.
      goodrows = 1:dim(exdat)[1]
      rows.omit = eval(parse(text=paste('attr(na.omit(data.frame(exdat',
                               ifelse(no.ey,"",",eydat"),')), "na.action")')))

      goodrows[rows.omit] = 0

      exdat = exdat[goodrows,,drop = FALSE]
      if (!no.ey)
        eydat = eydat[goodrows]

      if (all(goodrows==0))
        stop("Evaluation data has no rows without NAs")
    }

    ## convert tydat, eydat to numeric, from a factor with levels from the y-data
    ## used during bandwidth selection.

    ## NOTE(review): tydat was already coerced to double above, so this
    ## factor branch appears unreachable -- confirm before relying on it.
    if (is.factor(tydat)){
      tydat <- adjustLevels(data.frame(tydat), bws$ydati)[,1]
      tydat <- (bws$ydati$all.dlev[[1]])[as.integer(tydat)]
    }
    else
      tydat <- as.double(tydat)

    if (no.ey)
      eydat <- double()
    else {
      if (is.factor(eydat)){
        eydat <- adjustLevels(data.frame(eydat), bws$ydati)[,1]
        eydat <- (bws$ydati$all.dlev[[1]])[as.integer(eydat)]
      }
      else
        eydat <- as.double(eydat)
    }

    ## re-assign levels in training and evaluation data to ensure correct
    ## conversion to numeric type.

    txdat <- adjustLevels(txdat, bws$xdati)

    if (!no.ex)
      exdat <- adjustLevels(exdat, bws$xdati)

    ## grab the evaluation data before it is converted to numeric
    if(no.ex)
      teval <- txdat
    else
      teval <- exdat

    ## put the unordered, ordered, and continuous data in their own objects
    ## data that is not a factor is continuous.

    txdat = toMatrix(txdat)

    if (!no.ex){
      exdat = toMatrix(exdat)
    }

    ## from this point on txdat and exdat have been recast as matrices

    ## First, create the scalar index (n \times 1 vector)

    index <- txdat %*% bws$beta

    if(no.ex) {
      ## Evaluate on the training data.
      ## NOTE(review): eydat is overwritten with tydat here even when
      ## the caller supplied eydat -- verify this is intended, since the
      ## goodness-of-fit branch below then uses tydat in effect.
      index.eval <- index
      exdat <- txdat
      eydat <- tydat
    } else {
      index.eval <- exdat %*% bws$beta
    }

    ## Next, if no gradients are requested, use (faster) npksum

    if(gradients==FALSE) {

      ## Kernel sums of (y, 1) weighted by (y, 1); the ratio
      ## tww[1,2,]/tww[2,2,] is the local-constant (Nadaraya-Watson)
      ## estimate of E[y|index].  NZD guards against zero denominators.
      tww <- npksum(txdat=as.matrix(txdat) %*% as.matrix(bws$beta),
                    tydat=as.matrix(data.frame(tydat,1)),
                    weights=as.matrix(data.frame(tydat,1)),
                    exdat=as.matrix(exdat) %*% as.matrix(bws$beta),
                    bws=bws$bw,
                    ckertype = bws$ckertype,
                    ckerorder = bws$ckerorder)$ksum

      index.mean <- tww[1,2,]/NZD(tww[2,2,])

      if(!no.ex & (no.ey | residuals)){

        ## want to evaluate on training data for in sample errors even
        ## if evaluation x's are different from training but no y's
        ## are specified

        tww <- npksum(txdat=as.matrix(txdat) %*% as.matrix(bws$beta),
                      tydat=as.matrix(data.frame(tydat,1)),
                      weights=as.matrix(data.frame(tydat,1)),
                      bws=bws$bw,
                      ckertype = bws$ckertype,
                      ckerorder = bws$ckerorder)$ksum

        index.tmean <- tww[1,2,]/NZD(tww[2,2,])

      }

    } else if(gradients==TRUE) {

      ## Gradients requested: use npreg on the scalar index, which also
      ## returns the derivative of the conditional mean w.r.t. the index.
      model <- npreg(txdat=index,
                     tydat=tydat,
                     exdat=index.eval,
                     bws=bws$bw,
                     ckertype = bws$ckertype,
                     ckerorder = bws$ckerorder,
                     regtype="lc",
                     gradients=TRUE)

      index.mean <- model$mean

      ## index.grad is a matrix, one column for each variable, each
      ## equal to its coefficient beta_i times the first derivative of
      ## the local-constant model

      index.grad <- as.matrix(model$grad)%*%t(as.vector(bws$beta))

      if(!no.ex & (no.ey | residuals)){

        ## Want to evaluate on training data for in sample errors even
        ## if evaluation x's are different from training but no y's
        ## are specified. Also, needed for variance-covariance matrix
        ## (uses only the training data)

        model <- npreg(txdat=index,
                       tydat=tydat,
                       bws=bws$bw,
                       ckertype = bws$ckertype,
                       ckerorder = bws$ckerorder,
                       regtype="lc",
                       gradients=TRUE)

        index.tmean <- model$mean

        index.tgrad <- model$grad

      }

    }

    if (no.ex) {
      index.tmean <- index.mean
    }

    if (no.ex & gradients) {
      index.tgrad <- index.grad
    }

    ## 5/3/2010, jracine, added vcov methods... thanks to Juan Carlos
    ## Escanciano <jescanci@indiana.edu> for pushing me on this for
    ## the Klein and Spady estimator... use index.tmean, index.tgrad
    ## (training X) - need gradients == TRUE in order for this to
    ## work.

    if(bws$method == "ichimura" & gradients == TRUE) {

      ## First row & column of covariance matrix `Bvcov' are zero due
      ## to identification condition that beta_1=1 (fixed, not
      ## estimated). Note the n n^{-1} n in V^{-1}\Sigma V^{-1} and
      ## the \sqrt{n} in the normalization of \hat\beta will cancel.

      q <- ncol(txdat)
      Bvcov <- matrix(0,q,q)
      dimnames(Bvcov) <- list(bws$xnames,bws$xnames)

      ## Use the weight matrix so we can compute all expectations with
      ## only one call to npksum (the kernel arguments x\beta do not
      ## change, only the j for X_{ij} in E(X_{ij}|X_i'\beta)

      W <- txdat[,-1,drop=FALSE]

      tyindex <- npksum(txdat = index,
                        tydat = rep(1,length(tydat)),
                        weights = W,
                        bws = bws$bw,
                        ckertype = bws$ckertype,
                        ckerorder = bws$ckerorder)$ksum

      tindex <- npksum(txdat = index,
                       bws = bws$bw,
                       ckertype = bws$ckertype,
                       ckerorder = bws$ckerorder)$ksum

      ## Need to trap case where k-1=1... ksum will return a 1 D
      ## array, need a 1 x n matrix

      if(length(dim(tyindex))==1) tyindex <- matrix(tyindex,nrow=1,ncol=dim(tyindex))

      ## xmex = X_i-\hat E(X_i|X_i'\beta), dimension k\times n.

      xmex <- sapply(1:length(tydat),function(i){W[i,]-tyindex[,i]/tindex[i]})

      ## Need to trap case where k-1=1..., sapply will return a
      ## vector, need a 1 x n matrix

      if(is.vector(xmex)) {
        dg.db.xmex <- matrix(index.tgrad[,1]*xmex,nrow=1,ncol=length(xmex))
      } else {
        dg.db.xmex <- index.tgrad[,1]*xmex
      }

      uhat <- tydat - index.tmean ## Training y and training mean

      ## Sandwich estimator V^{-1} Sigma V^{-1} for the non-normalized
      ## coefficients (rows/cols 2..q).
      Vinv <- chol2inv(chol(dg.db.xmex%*%t(dg.db.xmex)))

      Sigma <- (uhat*dg.db.xmex)%*%t(uhat*dg.db.xmex)

      Bvcov[-1,-1] <- Vinv %*% Sigma %*% Vinv

      dimnames(Bvcov) <- list(bws$xnames,bws$xnames)

      ## Now export this in an S3 method...

    } else if(bws$method == "kleinspady" & gradients == TRUE) {

      ## We divide by P(1-P) so test for P=0 or 1...

      keep <- which(index.tmean < 1 & index.tmean > 0)
      dg.db <- txdat[,-1,drop=FALSE]*index.tgrad[,1]

      ## First row & column of covariance matrix are zero due to
      ## identification condition that beta_1=1 (fixed, not
      ## estimated). Note the n^{-1} in the E and the \sqrt{n} in the
      ## normalization of \hat\beta will cancel.

      q <- ncol(txdat)
      Bvcov <- matrix(0,q,q)
      Bvcov[-1,-1] <- chol2inv(chol(t(dg.db[keep,])%*%(dg.db[keep,]/(index.tmean[keep]*
        (1-index.tmean[keep])))))

      dimnames(Bvcov) <- list(bws$xnames,bws$xnames)

      ## Now export this in an S3 method...

    }

    ## Bootstrap statistic: refit the model on a resampled training set
    ## (beta held fixed at bws$beta) and return the fitted mean (and,
    ## when requested, gradients plus the average gradient).
    if (gradients){
      boofun = function(data, indices){
        rindex <- txdat[indices,] %*% bws$beta
        model <- npreg(regtype = 'lc',
                       gradients = TRUE,
                       txdat = rindex,
                       tydat = tydat[indices],
                       exdat = index.eval,
                       bws = bws$bw,
                       ckertype = bws$ckertype,
                       ckerorder = bws$ckerorder)[c('mean','grad')]
        
        c(model$mean, model$grad, mean(model$grad))
      }

    } else {
      boofun = function(data, indices){
        rindex = txdat[indices,] %*% bws$beta
        tww <- npksum(txdat = rindex,
                      tydat = cbind(tydat[indices],1),
                      weights = cbind(tydat[indices],1),
                      exdat = index.eval,
                      bws = bws$bw,
                      ckertype = bws$ckertype,
                      ckerorder = bws$ckerorder)$ksum

        tww[1,2,]/NZD(tww[2,2,])
        
      }
    }

    if (errors){

      ## Bootstrapped pointwise standard errors for the mean (and, when
      ## gradients are requested, for each scaled gradient column and
      ## for the average derivative).
      boot.out = suppressWarnings(boot(data.frame(txdat,tydat), boofun, R = boot.num))

      index.merr = matrix(data = 0, ncol = 1, nrow = length(index.eval))
      index.merr[,] = sqrt(diag(cov(boot.out$t[,1:length(index.eval)])))

      if (gradients) {
        index.gerr = matrix(data = 0, ncol = ncol(txdat), nrow = length(index.eval))
        index.gerr[,] = sqrt(diag(cov(boot.out$t[,(length(index.eval)+1):(2*length(index.eval))])))

        ## Scale the index-derivative errors by |beta_i| per variable.
        for (i in 1:ncol(txdat))
          index.gerr[,i] = abs(bws$beta[i])*index.gerr[,i]

        index.mgerr = sd(boot.out$t[,2*length(index.eval)+1])
        index.mgerr = abs(bws$beta)*index.mgerr
      }
    }
    ## goodness of fit

    if(bws$method == "ichimura") {
      ## Continuous-response fit statistics.
      if (!no.ey) {
        RSQ = RSQfunc(eydat,index.mean)
        MSE = MSEfunc(eydat,index.mean)
        MAE = MAEfunc(eydat,index.mean)
        MAPE = MAPEfunc(eydat,index.mean)
        CORR = CORRfunc(eydat,index.mean)
        SIGN = SIGNfunc(eydat,index.mean)
      } else {
        RSQ = RSQfunc(tydat,index.tmean)
        MSE = MSEfunc(tydat,index.tmean)
        MAE = MAEfunc(tydat,index.tmean)
        MAPE = MAPEfunc(tydat,index.tmean)
        CORR = CORRfunc(tydat,index.tmean)
        SIGN = SIGNfunc(tydat,index.tmean)
      }
      strgof = "xtra=c(RSQ,MSE,MAE,MAPE,CORR,SIGN),"
      strres = ifelse(residuals, "resid = tydat - index.tmean,","")
    } else if(bws$method == "kleinspady") {
      ## Binary-response fit statistics: confusion matrix, correct
      ## classification ratios, and McFadden et al.'s measure.
      index.pred =
        if (!no.ey) round(index.mean)
        else round(index.tmean)

      confusion.matrix =
        table(if (!no.ey) eydat else tydat,
              index.pred, dnn=c("Actual", "Predicted"))

      CCR.overall <- sum(diag(confusion.matrix))/sum(confusion.matrix)
      CCR.byoutcome <- diag(confusion.matrix)/rowSums(confusion.matrix)

      fit.mcfadden <- confusion.matrix/sum(confusion.matrix)

      fit.mcfadden <- sum(diag(fit.mcfadden)) -
        (sum(fit.mcfadden^2)-sum(diag(fit.mcfadden)^2))

      strgof = "confusion.matrix = confusion.matrix, CCR.overall = CCR.overall,
           CCR.byoutcome =  CCR.byoutcome, fit.mcfadden = fit.mcfadden,"
      strres = ""
    }

    ## Assemble the singleindex() constructor call textually so that
    ## optional components (errors, gradients, residuals, per-method
    ## goodness-of-fit) are only included when computed.
    eval(parse(text=paste(
                 "singleindex(bws = bws, index = index.eval, mean = index.mean,",
                 ifelse(errors,"merr = index.merr,",""),
                 ifelse(gradients,"grad = index.grad, mean.grad = colMeans(index.grad), betavcov = Bvcov,",""),
                 ifelse(errors & gradients,"gerr = index.gerr, mean.gerr = index.mgerr,",""),
                 strres,
                 "ntrain = nrow(txdat),", strgof,
                 "trainiseval = no.ex, residuals = residuals, gradients = gradients)")))

  }

# Try the np package in your browser
#
# Any scripts or data that you put into this service are public.
#
# np documentation built on March 31, 2023, 9:41 p.m.