Nothing
###################################
## Functions for crossvalidation ##
###################################
##Functions in this file:
## createCV EX:ok
## dropObservations EX:ok
## stCheckInternalCV EX:INTERNAL
## predictNaive EX:ok
## computeLTA EX:ok
##' Creates a matrix that specifies cross-validation schemes.
##'
##' The number of observations left out of each group can be rather uneven;
##' the main goal of \code{createCV} is to create CV-groups such that the groups
##' contain roughly the same \emph{number of locations} ignoring the number of
##' observations at each location. If there are large differences in the number
##' of observations at differnt locations one could use the \code{subset} option
##' to create different CV-groupings for different types of locations. If
##' \code{Icv.vector=FALSE}, the groups can then be combined as \cr
##' \code{I.final = I.1 | I.2 | I.3}.
##'
##' The \code{option} input determines which sites to include in the
##' cross-validation. Possible options are \code{"all"}, \code{"fixed"},
##' \code{"comco"}, \code{"snapshot"} and \code{"home"}.
##' \describe{
##' \item{all}{Uses all available sites, possibly subset according to
##' \code{subset}. The sites will be grouped with sites
##' seperated by less than \code{min.dist} being put in the
##' same CV-group.}
##' \item{fixed}{Uses only sites that have \cr
##' \code{STmodel$locations$type \%in\% c("AQS","FIXED")}.
##' Given the subsettting the sites will be grouped as for \code{all}.}
##' \item{home}{Uses only sites that have \cr
##' \code{STmodel$locations$type \%in\% c("HOME")}.
##' Given the subsettting the sites will be grouped as for \code{all}.}
##' \item{comco}{Uses only sites that have \cr
##' \code{STmodel$locations$type \%in\% c("COMCO")}.
##' The sites will be grouped together if they are from the same road
##' gradient. The road gradients are grouped by studying the name of the
##' sites. With "?" denoting one or more letters and "#" denoting one or
##' more digits the names are expected to follow "?-?#?#", for random sites,
##' and "?-?#?#?" for the gradients (with all but the last letter being the
##' same for the entire gradient).}
##' }
##'
##' @title Define Cross-Validation Groups
##'
##' @param STmodel Model object for which to determine cross-validation.
##' @param groups Number of cross-validation groups, zero gives leave-one-out
##' cross-validation.
##' @param min.dist Minimum distance between locations for them to end up in
##' separate groups. Points closer than \code{min.dist} will be forced into the
##' same group. A high value for \code{min.dist} can result in fewer
##' cross-validation groups than specified in \code{groups}.
##' @param random If \code{FALSE} repeated calls to the function will return the
##' same grouping, if \code{TRUE} repeated calls will give different
##' CV-groupings. Ensures that simulation studies are reproducable.
##' @param subset A subset of locations for which to define the cross-validation
##' setup. Only sites listed in \code{subset} are dropped from one of the
##' cross-validation groups; in other words sites \emph{not in} \code{subset}
##' are used for estimation and preidiction of \emph{all} cross-validation
##' groups. This option is \emph{ignored} if \code{option!="all"}.
##' @param option For internal MESA Air usage, see Details below.
##' @param Icv.vector Attempt to return a vector instead of a matrix. If the same
##' observation is in several groups a matrix will still be returned.
##'
##' @return Return a vector, with each element giving the CV-group (as an
##' integer) of each observation; Or a (number or observations) - by -
##' (groups) logical matrix; each column defines a cross-validation set with
##' the \code{TRUE} values marking the observations to be left out.
##'
##' @example Rd_examples/Ex_createCV.R
##'
##' @author Johan Lindstrom
##' @family STmodel functions
##' @family cross-validation functions
##' @export
createCV <- function(STmodel, groups=10, min.dist=.1, random=FALSE,
subset=NA,
option=c("all","fixed","comco","snapshot","home"),
Icv.vector=TRUE){
##check class belonging
stCheckClass(STmodel, "STmodel", name="STmodel")
##options - for internal MESA usage:
## all (just divide over all sites)
## fixed (only AQS and FIXED sites)
## comco (only comco, keeping gradients together, ignores min.dist)
## snapshot (same as comco)
## home (only home)
option <- match.arg( option )
if(option=="all"){
Ind <- 1:dim(STmodel$locations)[1]
if( all(!is.na(subset)) ){
##subset the data - first cast to character
subset <- as.character(subset)
##check that all requested names exist in the colnames
if( any(!(subset %in% STmodel$locations$ID)) )
warning( sprintf("Subset names not found: %s",
paste(subset[!(subset %in% STmodel$locations$ID)],
collapse=", ")) )
Ind <- which(STmodel$locations$ID %in% subset)
}
}else if(option=="fixed"){
Ind <- which(toupper(STmodel$locations$type) %in% c("AQS","FIXED"))
}else if(option=="home"){
Ind <- which(toupper(STmodel$locations$type) %in% c("HOME"))
}else{ ##remaining options are comco/snapshot
Ind <- which(toupper(STmodel$locations$type) %in% c("COMCO"))
}
if(length(Ind)==0)
stop( paste("No sites found for option:",option) )
if(!random){
##set the seed to ensure that repeated calls returns the same grouping
if( exists(".Random.seed") )
old.seed <- .Random.seed ##save current seed so it can be re-set
else
old.seed <- NULL
set.seed(0,kind="Mersenne-Twister")
}
if( option %in% c("all","fixed","home") ){
##extract distance matrix and find stations closer than min.dist
D <- STmodel$D.beta[Ind,Ind]
D[lower.tri(D,diag=TRUE)] <- NA
Ind.coloc <- apply(D, 1, function(x){which(x<min.dist)} )
Ind.G <- list()
if( any(sapply(Ind.coloc,length)>0) ){
##find Ind.coloc's that contain colocated sites
Ind.coloc.I <- unname(which(sapply(Ind.coloc,length)>0))
i <- 1
while( length(Ind.coloc.I)>0 ){
##extract the i:th colocated sites and add it to the grouping vector
Ind.G[[i]] <- unique(c(Ind.coloc.I[1],Ind.coloc[[Ind.coloc.I[1]]]))
##drop the site from Ind.coloc.I
Ind.coloc.I <- Ind.coloc.I[-1]
##see if this leads to additional matches
##find all Ind.coloc elemets that contain things from Ind.G[[i]]
##This is done both looking for matches
Ind.tmp <- which(sapply(Ind.coloc,function(x)
any(x %in% Ind.G[[i]])))
##and adding the Ind.G[[i]] elements since the "diagonal"
##isn't found thtough matches
Ind.tmp <- unique(c(Ind.tmp,Ind.G[[i]]))
##Finnaly let's only consider things not yet used.
Ind.tmp <- Ind.tmp[Ind.tmp %in% Ind.coloc.I]
##Add and repeat until no more new elements match
while( length(Ind.tmp)>0 ){
Ind.G[[i]] <- unique(c(Ind.G[[i]], Ind.tmp,
unlist(Ind.coloc[Ind.tmp])))
Ind.coloc.I <- Ind.coloc.I[!(Ind.coloc.I %in% Ind.tmp)]
Ind.tmp <- which(sapply(Ind.coloc,function(x)
any(x %in% Ind.G[[i]])))
Ind.tmp <- unique(c(Ind.tmp,Ind.G[[i]]))
Ind.tmp <- Ind.tmp[Ind.tmp %in% Ind.coloc.I]
}
i <- i+1
}
for(i in 1:length(Ind.G))
Ind.G[[i]] <- sort(Ind[Ind.G[[i]]])
}
##extract points that are not colocated
Ind.R <- Ind[!(Ind %in% unlist(Ind.G))]
}else{ ##the snapshot sites
##drop starting non-numerical and then numerical characters
##from site ID, if the first character of whats left=="R" then
##this is a random site. -> seprate indecies into random and gradient
##random sites are on the format [A-Z]-[A-Z][0-9]R[0-9]
##so remove the leading characters and see if we find an R.
Ind.R <- Ind[substr(sub("^[A-Z]+-[A-Z]+[0-9]+","",
STmodel$locations$ID[Ind]),1,1)=="R"]
##non random sites are the rest.
Ind <- Ind[!(Ind %in% Ind.R)]
##now lets group the gradient sites
grad.id <- unique(sub("[A-Z]+$","",
as.character(STmodel$locations$ID[Ind])))
Ind.G <- list()
for(i in 1:length(grad.id))
Ind.G[[i]] <- Ind[grep(grad.id[i],STmodel$locations$ID[Ind])]
}##if( option %in% c("all","fixed","home") ){...}else{...}
##we have no created an Ind.G and a Ind.R variable, lets permute and
##create the CV-indicator matrix
if(groups==0){ ##leave one out CV, place each group seperately
Ind.fin <- Ind.G
if( length(Ind.R)!=0 )
for(i in 1:length(Ind.R))
Ind.fin[[i+length(Ind.G)]] <- Ind.R[i]
groups <- length(Ind.fin)
}else{
groups <- min(c(groups,length(Ind.R)+length(Ind.G)))
##randomly permute the sites
Ind.R <- sample(Ind.R,length(Ind.R),replace=FALSE)
if(length(Ind.G)!=0){
Ind.G <- Ind.G[sample(length(Ind.G),length(Ind.G),replace=FALSE)]
}
##collect sites to be in each group, first random sites
Ind.R <- matrix(c(Ind.R,rep(NA,ceiling(length(Ind.R)/groups)*groups -
length(Ind.R))), nrow=groups)
##and then collect the grouped sites
if(length(Ind.G)==0){
Ind <- matrix(NA,groups,1)
}else{
Ind <- 1:length(Ind.G)
Ind <- matrix(c(Ind,rep(NA,ceiling(length(Ind)/groups)*groups -
length(Ind))), nrow=groups)
##flips the matrix so that we have more groups at the bottom, since
##there are more random sites at the top.
Ind <- Ind[seq(dim(Ind)[1],1,-1),,drop=FALSE]
}
##compose a list, each element containing one cv-group
Ind.fin <- list()
for(i in 1:groups){
Ind.fin[[i]] <- c(Ind.R[i,],unlist(Ind.G[Ind[i,]]))
Ind.fin[[i]] <- Ind.fin[[i]][!is.na(Ind.fin[[i]])]
}
}##if(groups==0) else
if(!random){
##re-set the random number generator
if( !is.null(old.seed) )
.Random.seed <- old.seed
}
if( any(sapply(Ind.fin,length)==0) )
stop("Some CV-groups contain NO sites, try reducing the number of requested groups.")
Ind.cv <- matrix(FALSE,dim(STmodel$obs)[1],groups)
for(i in 1:groups)
Ind.cv[,i] <- STmodel$obs$idx %in% Ind.fin[[i]]
##return just a vector?
Ind.cv <- stCheckInternalCV(Ind.cv, Icv.vector)
##create a suitable cross-validation matrix, each colum
##marks the elements to drop for a specific CV.
return(Ind.cv)
}##function createCV
##' Drops observations from \code{STmodel}, removing marked observations
##' along with the corresponding locations and recomputes a number of relevant
##' elements.
##'
##' @title Drop Observations from a STmodel
##'
##' @param STmodel Model object from which to drop observations.
##' @param Ind.cv A logical vector with one element per observation in \cr
##' \code{STmodel$obs}. Observations marked with the \code{TRUE}
##' will be dropped from the data structure. Use \code{\link{createCV}} to
##' create the logical vector.
##'
##' @return Returns the \code{STmodel} without the observations marked by
##' \code{Ind.cv}. Only observed locations are retained.
##'
##' @example Rd_examples/Ex_dropObservations.R
##'
##' @author Johan Lindstrom
##' @family STmodel functions
##' @family cross-validation functions
##' @export
dropObservations <- function(STmodel, Ind.cv){
##check class belonging
stCheckClass(STmodel, "STmodel", name="STmodel")
##check that the size of the indicator is consistent.
N <- dim(STmodel$obs)[1]
if(( is.vector(Ind.cv) && length(Ind.cv)!=N ) ||
( !is.vector(Ind.cv) && any(dim(Ind.cv) != c(N,1)) ) ){
stop("Length of Ind.cv must match dim(STmodel$obs)[1]")
}
##drop observations, temporal trends and spatio-temporal covars.
STmodel$obs <- STmodel$obs[!Ind.cv,,drop=FALSE]
STmodel$F <- STmodel$F[!Ind.cv,,drop=FALSE]
if( length(STmodel$ST)!=0 ){
STmodel$ST <- STmodel$ST[!Ind.cv,,drop=FALSE]
}
##find which locations that remain
IND <- STmodel$locations$ID %in% unique(STmodel$obs$ID)
##drop locations
STmodel$locations <- STmodel$locations[IND,,drop=FALSE]
##and spatio-temporal covariates
if( length(STmodel$ST.all)!=0 ){
STmodel$ST.all <- STmodel$ST.all[,IND,,drop=FALSE]
}
##find missing parts in LUR (repeat due to possible missing locations)
for(i in 1:length(STmodel$LUR)){
IND <- rownames(STmodel$LUR[[i]]) %in% unique(STmodel$obs$ID)
STmodel$LUR[[i]] <- STmodel$LUR[[i]][IND,,drop=FALSE]
}
STmodel$LUR.all <- STmodel$LUR
##and recompute the indecies
STmodel$obs$idx <- match(STmodel$obs$ID, STmodel$locations$ID)
##...and some other things
STmodel <- createSTmodelInternalDistance(STmodel)
##...and the covariance
STmodel <- updateCovf(STmodel)
##return stripped structure
return(STmodel)
}##function dropObservations
stCheckInternalCV <- function(Ind.cv, Icv.vector=TRUE){
if( is.null(Ind.cv) ){
stop("Ind.cv must be defined (is NULL)")
}else if( is.vector(Ind.cv) ){
##ensure integers
Ind.cv <- as.integer(Ind.cv)
uInd.cv <- sort(unique(Ind.cv))
##find missing/empty groups
I.rep <- match(1:max(uInd.cv),uInd.cv)
I.drop <- which(is.na(I.rep))
if( length(I.drop!=0) ){
##renumber CV groups
I.rep[ !is.na(I.rep) ] <- 1:sum(!is.na(I.rep))
I.rep <- c(I.rep,0)
Ind.cv[Ind.cv==0] <- length(I.rep)
Ind.cv <- I.rep[Ind.cv]
}
}else{
I.drop <- which( colSums(Ind.cv)==0 )
##renumber CV groups
if( length(I.drop!=0) ){
Ind.cv <- Ind.cv[, -I.drop, drop=FALSE]
}
if( Icv.vector && max(rowSums(Ind.cv))==1 ){
Ind.cv <- apply(Ind.cv, 1, function(x){ x=which(x); if(length(x)!=1){x=0};
return(x)})
}
}##if( is.vector(Ind.cv) ){...}else{...}
if( length(I.drop!=0) ){
warning( paste("Empty cross validation group(s) dropped:",
paste(I.drop, collapse=", ")) )
}
return(Ind.cv)
}##function stCheckInternalCV
##' Computes naive predictions that are based on a few sites. These predictions
##' can then be used, e.g. in \code{\link{summary.predCVSTmodel}}, to evaluate
##' how much better the spatial-temporal model performs compared to simple
##' (temporal) predictions.
##' \cr
##' The function requires one of \code{location} and \code{type} to be
##' specified, if both are given \code{location} \emph{will be used over}
##' \code{type}. If \code{type} is given locations such that \cr
##' \code{as.character(STmodel$locations$type) \%in\% type} \cr will be
##' used.
##'
##' Given a set of locations the function computes 4 sets of naive prediction
##' for the observations in \code{STmodel}:
##' \describe{
##' \item{smooth.fixed}{The smooth trend in \code{STmodel$trend} is fit to
##' \emph{all} observations at the sites in
##' \code{locations} using a linear regression. The
##' resulting smooth is used as a naive prediction for
##' all locations.}
##' \item{avg.fixed}{The temporal average over sites in \code{locations} is
##' used as a naive prediction.}
##' \item{smooth.closest.fixed}{This fits the smooth trend in
##' \code{STmodel$trend} to each site in
##' \code{locations}; using the smooth at the
##' closest fixed site as a naive prediction.}
##' \item{closest.fixed}{This uses the observations at the closest site in
##' \code{locations} to predict observations at all other
##' sites.}
##' }
##'
##' @title Naive Temporal Predictions
##'
##' @param STmodel \code{STmodel} object for which to compute simple
##' predictions.
##' @param locations Locations on which to base the naive predictions.
##' @param type The type of sites to base the predictions on, uses the
##' (optional) field \code{STmodel$locations$type}.
##' @return A list with items:
##' \item{pred}{A (number of observations) - by - (6) data.frame containing
##' the four naive predictions described under \code{details},
##' along with dates and IDs.}
##' \item{locations}{The locations used for the naive predictions.}
##'
##' @example Rd_examples/Ex_predictNaive.R
##'
##' @author Johan Lindstrom
##' @importFrom stats lm
##' @family STmodel functions
##' @family cross-validation functions
##' @export
predictNaive <- function(STmodel, locations=NULL, type=NULL){
##check class belonging
stCheckClass(STmodel, "STmodel", name="STmodel")
##matrix holding the predictions
pred <- data.frame(smooth.fixed=double( length(STmodel$obs$obs) ),
avg.fixed=double( length(STmodel$obs$obs) ),
smooth.closest.fixed=double( length(STmodel$obs$obs) ),
closest.fixed=double( length(STmodel$obs$obs) ))
pred$ID <- STmodel$obs$ID
pred$date <- STmodel$obs$date
##select sites to base the predictions on
if( !is.null(locations) ){
if( is.character(locations) ){
IND.loc <- locations
}else{
IND.loc <- STmodel$locations$ID[as.double(locations)]
}
}else if( !is.null(type) ){
IND.loc <- STmodel$locations$ID[which(STmodel$locations$type %in% type)]
}else{
stop("Both locations and type are NULL, you must specify one.")
}
##check for specified locations
I <- IND.loc %in% unique(STmodel$obs$ID)
if( any(!I) ){
warning( paste("unobserved locations", paste(IND.loc[!I], collapse=", ")) )
IND.loc <- IND.loc[I]
}
if( length(IND.loc)==0 ){
stop("No sites to use for naive prediction found")
}
##indicator picking out the sites to use for naive prediction.
IND <- (STmodel$obs$ID %in% IND.loc)
##indicator for which is the closest fixed site
IND.closest <- apply(STmodel$D.beta[IND.loc,,drop=FALSE],2,order)
##special case for only one site
if( length(IND.loc)==1 || !is.matrix(IND.closest)){
IND.closest <- matrix(IND.closest, nrow=1)
colnames(IND.closest) <- colnames(STmodel$D.beta)
}
##1) fit temporal smooths to all the fixed sites
pred[,1] <- c(STmodel$F %*% lm(STmodel$obs$obs[IND] ~
STmodel$F[IND,]-1)$coefficients)
##2) use average of the fixed sites to predict
date.tmp <- as.double(STmodel$obs$date)
for(t in unique(date.tmp)){
I <- date.tmp==t
pred[I,2] <- mean(STmodel$obs$obs[IND & I])
}
##3) fit temporal smooth to the closest fixed site to predict the data
##start by fitting the temporal to each of the fixed sites
pred.tmp <- matrix(NA,length(STmodel$obs$obs),length(IND.loc))
for(i in 1:length(IND.loc)){
IND.tmp <- STmodel$obs$ID==IND.loc[i]
pred.tmp[,i] <- STmodel$F %*%
lm(STmodel$obs$obs[IND.tmp] ~
STmodel$F[IND.tmp,]-1)$coefficients
}
for(i in unique(STmodel$obs$ID)){
pred[pred$ID==i,3] <- pred.tmp[STmodel$obs$ID==i,IND.closest[1,i]]
}
##4) use observations from the closest available site
pred.tmp <- matrix(NA,length(STmodel$obs$obs),length(IND.loc))
for(i in 1:length(IND.loc)){
IND.tmp <- STmodel$obs$ID==IND.loc[i]
obs.tmp <- STmodel$obs$obs[IND.tmp]
pred.tmp[,i] <- obs.tmp[match(STmodel$obs$date,
STmodel$obs$date[IND.tmp])]
}
for(i in unique(STmodel$obs$ID)){
pred[pred$ID==i,4] <-
apply(pred.tmp[pred$ID==i, IND.closest[,i], drop=FALSE], 1,
function(x) switch(all(is.na(x))+1,x[min(which(!is.na(x)))],NA))
}
return( list(pred=pred, locations=IND.loc) )
}##predictNaive
##' Computes the long term average of observations and cross-validated
##' predictions for each of the sites in \code{object}. The long term averages
##' are computed using \emph{only} timepoints that have observations, this
##' applies to both the observed and predicted. Also the function allows for a
##' transformation: if requested the transformation is applied \emph{before} the
##' averaging.
##'
##' @title Computes the Long Term Average for Each Sites.
##'
##' @param object A \code{predCVSTmodel} object, the result of
##' \code{\link{predictCV.STmodel}}.
##' @param transform Transform observations (\emph{without} bias correction) and
##' predictions \emph{before}
##' computing averages; e.g. \code{transform=exp} gives the long term averages
##' as \code{mean( exp(obs) )} and \code{mean( exp(pred) )}.
##' @return Returns a (number of locations) - by - 4 matrix with the observed
##' and predicted value (using the three different model parts) for each
##' location.
##'
##' @example Rd_examples/Ex_computeLTA.R
##'
##' @author Johan Lindstrom
##' @family predCVSTmodel functions
##' @family cross-validation functions
##' @export
computeLTA <- function(object, transform=function(x){return(x)}){
##check class belonging
stCheckClass(object, "predCVSTmodel", name="object")
##is data already transformed - LTA is a bad idea
if( !is.null(object$opts$transform) && !missing(transform)){
stop("Data already transformed by predictCV")
}
##check transformation
if( !is.function(transform) ){
stop("'transform' should be a function")
}
##apply transformation
I <- names(object$pred.obs) %in% c("obs","EX.mu","EX.mu.beta","EX","EX.pred")
out <- transform( object$pred.obs[,I,drop=FALSE] )
##split by site
out <- split(out, object$pred.obs$ID)
##drop NA:s
for(i in 1:length(out)){
I <- !apply( is.na(out[[i]]), 1, any)
out[[i]] <- out[[i]][I,,drop=FALSE]
}
##compute averages for each site, and convert to data.frame
out <- as.data.frame( t(sapply(out, colMeans)) )
return(out)
}##function computeLTA
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.