#' Principal coordinate ridge regression
#'
#' Smooth constructor function for principal coordinate ridge regression fitted by \code{\link[mgcv]{gam}}. When the principal coordinates are defined by a relevant distance among functional predictors, this is a form of nonparametric scalar-on-function regression. Reiss et al. (2016) describe the approach and apply it to dynamic time warping distances among functional predictors.
#'
#' @aliases pco smooth.construct.pco.smooth.spec Predict.matrix.pco.smooth poridge
#' @export
#' @import mgcv
#' @importFrom stats cmdscale
#'
#' @param object a smooth specification object, usually generated by a term of the form \code{s(dummy, bs="pco", k, xt)}; see Details.
#' @param data a list containing just the data.
#' @param knots IGNORED!
#'
#' @return An object of class \code{pco.smooth}. The resulting object has an \code{xt} element which contains details of the multidimensional scaling, which may be interesting.
#'
#' @section Details:
#' The constructor is not normally called directly, but is rather used internally by \code{\link{gam}}.
#'
#' In a \code{\link[mgcv]{gam}} term of the above form \code{s(dummy, bs="pco", k, xt)},
#' \itemize{
#' \item \code{dummy} is an arbitrary vector (or name of a column in \code{data}) whose length is the number of observations. This is not actually used, but is required as part of the input to \code{\link[mgcv]{s}}. Note that if multiple \code{pco} terms are used in the model, there must be multiple unique term names (e.g., "\code{dummy1}", "\code{dummy2}", etc).
#' \item \code{k} is the number of principal coordinates (e.g., \code{k=9} will give a 9-dimensional projection of the data).
#' \item \code{xt} is a list supplying the distance information, in one of two ways. (i) A matrix \code{Dmat} of distances can be supplied directly via \code{xt=list(D=Dmat,\dots)}. (ii) Alternatively, one can use \code{xt=list(realdata=\dots, dist_fn=\dots, \dots)} to specify a data matrix \code{realdata} and distance function \code{dist_fn}, whereupon a distance matrix \code{dist_fn(realdata)} is created.
#'}
#' The list \code{xt} also has the following optional elements:
#' \itemize{
#' \item \code{add}: Passed to \code{\link{cmdscale}} when performing multidimensional scaling; for details, see the help for that function. (Default \code{FALSE}.)\cr
#' \item \code{fastcmd}: if \code{TRUE}, multidimensional scaling is performed by \code{\link{cmdscale_lanczos}}, which uses Lanczos iteration to eigendecompose the distance matrix; if \code{FALSE}, MDS is carried out by \code{\link{cmdscale}}. Default is \code{FALSE}, to use \code{cmdscale}.
#' }
#'
#' @author David L Miller, based on code from Lan Huo and Phil Reiss
#'
#' @references
#' Reiss, P. T., Miller, D. L., Wu, P.-S., and Wen-Yu Hua, W.-Y. Penalized nonparametric scalar-on-function regression via principal coordinates. Under revision. Available at \url{https://works.bepress.com/phil_reiss/42/}.
#'
#' @examples
#' # a simulated example
#'
#' require(dtw)
#'
#' ## First generate the data
#' Xnl <- matrix(0, 30, 101)
#' set.seed(813)
#' tt <- sort(sample(1:90, 30))
#' for(i in 1:30){
#' Xnl[i, tt[i]:(tt[i]+4)] <- -1
#' Xnl[i, (tt[i]+5):(tt[i]+9)] <- 1
#' }
#' X.toy <- Xnl + matrix(rnorm(30*101, ,0.05), 30)
#' y.toy <- tt + rnorm(30, 0.05)
#' y.rainbow <- rainbow(30, end=0.9)[(y.toy-min(y.toy))/
#' diff(range(y.toy))*29+1]
#'
#' ## Display the toy data
#' par(mfrow=c(2, 2))
#' matplot((0:100)/100, t(Xnl[c(4, 25), ]), type="l", xlab="t", ylab="",
#' ylim=range(X.toy), main="Noiseless functions")
#' matplot((0:100)/100, t(X.toy[c(4, 25), ]), type="l", xlab="t", ylab="",
#' ylim=range(X.toy), main="Observed functions")
#' matplot((0:100)/100, t(X.toy), type="l", lty=1, col=y.rainbow, xlab="t",
#' ylab="", main="Rainbow plot")
#'
#' ## Obtain DTW distances
#' D.dtw <- dist(X.toy, method="dtw", window.type="sakoechiba", window.size=5)
#'
#' ## Compare PC vs. PCo ridge regression
#'
#' # matrix to store results
#' GCVmat <- matrix(NA, 15, 2)
#' # dummy response variable
#' dummy <- rep(1,30)
#'
#' # loop over possible projection dimensions
#' for (k. in 1:15){
#' # fit PC (m1) and PCo (m2) ridge regression
#' m1 <- gam(y.toy ~ s(dummy, bs="pco", k=k.,
#' xt=list(realdata=X.toy, dist_fn=dist)), method="REML")
#' m2 <- gam(y.toy ~ s(dummy, bs="pco", k=k., xt=list(D=D.dtw)), method="REML")
#' # calculate and store GCV scores
#' GCVmat[k., ] <- length(y.toy) * c(sum(m1$residuals^2)/m1$df.residual^2,
#' sum(m2$residuals^2)/m2$df.residual^2)
#' }
#'
#' ## plot the GCV scores per dimension for each model
#' matplot(GCVmat, lty=1:2, col=1, pch=16:17, type="o", ylab="GCV",
#' xlab="Number of principal components / coordinates",
#' main="GCV score")
#' legend("right", c("PC ridge regression", "DTW-based PCoRR"), lty=1:2, pch=16:17)
#'
#' ## example of making a prediction
#'
#' # fit a model to the toy data
#' m <- gam(y.toy ~ s(dummy, bs="pco", k=2, xt=list(D=D.dtw)), method="REML")
#'
#' # first build the distance matrix
#' # in this case we just subsample the original matrix
#' # see ?pco_predict_preprocess for more information on formatting this data
#' dist_list <- list(dummy = as.matrix(D.dtw)[, c(1:5,10:15)])
#'
#' # preprocess the prediction data
#' pred_data <- pco_predict_preprocess(m, newdata=NULL, dist_list)
#'
#' # make the prediction
#' p <- predict(m, pred_data)
#'
#' # check that these are the same as the corresponding fitted values
#' print(cbind(fitted(m)[ c(1:5,10:15)],p))
smooth.construct.pco.smooth.spec <- function(object, data, knots){
## test what we got given
# all the extra stuff gets put in object$xt
xt <- object$xt
if(all(c("D","realdata","dist_fn") %in% names(xt)) |
all(c("D","dist_fn") %in% names(xt)) |
all(c("D","realdata") %in% names(xt)) |
!any(c("D", "realdata", "dist_fn") %in% names(xt))){
stop("Please supply either a distance matrix or data and distance function!")
}
# distance matrix
if(all(c("realdata","dist_fn") %in% names(xt))){
D <- xt$dist_fn(xt$realdata)
}else{
D <- xt$D
}
# projection dimension
pdim <- object$bs.dim
## do some input checking
# either K or D must be supplied
if(is.null(D)) {
stop("No distance matrix!")
}
# default to use regular cmdscale
if(is.null(xt$fastcmd)){
xt$fastcmd <- FALSE
}
# use the additive constant?
# default to FALSE
if(!is.null(xt$add)){
add <- xt$add
}else{
add <- FALSE
}
# what do cmdscale options mean?!
# k - dimension of MDS projection
# eig - return eigenvalues
# x.ret - return (double centred distance matrix)
# add - add a constant so D is Euclidean (cov() gives -ve
# values, which is not a property of a distance.
if(xt$fastcmd){
# use lanczos for the eigendecomposition
mds.obj <- cmdscale_lanczos(D, k=pdim, eig=TRUE, x.ret=TRUE, add=add)
}else{
mds.obj <- cmdscale(D, k=pdim, eig=TRUE, x.ret=TRUE, add=add)
}
if(sum(mds.obj$eig>0) < pdim){
stop("Only the first ",sum(mds.obj$eig>0)," eigenvalues are positive, this is the maximum projection dimension without setting 'add=TRUE', see ?smooth.construct.pco.smooth.spec for further information.")
}
## four required additions to the return object:
# model matrix
object$X <- mds.obj$points
colnames(object$X) <- paste0("pco_", 1:ncol(object$X))
# penalty matrix
object$S <- list(diag(nrow = pdim))
# penalty rank
object$rank <- array(pdim, dim=c(1,1))
# null space dimension
object$null.space.dim <- 0
# set the label (for plots and summary)
object$label <- object$term
# store dimension
object$dim <- pdim
## extra options
# don't allow tensor products
object$te.ok <- 0
# probably best not to plot this with plot.gam
# (what would the x axis be?)
object$plot.me <- FALSE
# see ?smooth.construct for what these mean!
object$C <- array(dim=c(0, 1))
object$side.constrain <- FALSE
object$no.rescale <- TRUE
# save mds object returned by cmdscale
object$xt$mds.obj <- mds.obj
# give it some class
class(object) <- "pco.smooth"
return(object)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.