FLAM_canada.R
In FDboost: Boosting Functional Regression Models

## ----pkg-attach, echo = FALSE, warning=FALSE, message=FALSE-------------------
# Load FDboost package
library(FDboost)

## ----load-data, echo = TRUE, warning=FALSE, message=FALSE---------------------
library(fda)
data("CanadianWeather", package = "fda")

### use data on a monthly basis (c.f. Scheipl et. al. online supplement)
dataM <- with(CanadianWeather, 
        list(   temp = t(monthlyTemp),
                l10precip = t(log10(monthlyPrecip)),
                lat = coordinates[,"N.latitude"],
                lon = coordinates[,"W.longitude"],
                region = factor(region),
                place = factor(place)
        ))
# correct Prince George location (wrong at least until fda_2.2.7):
dataM$lon["Pr. George"] <- 122.75 
dataM$lat["Pr. George"] <- 53.9 

# center temperature curves:
dataM$tempRaw <- dataM$temp
dataM$temp <- sweep(dataM$temp, 2, colMeans(dataM$temp)) 

# define function indices 
dataM$month.t <- 1:12
dataM$month.s <- 1:12

## ----plot-data, echo=TRUE, fig.cap='Monthly average temperature and log-precipitation at 35 locations in Canada. Regions are coded by colors and different line types.', fig.height=5, fig.width=10, fig.pos='!htbp'----

par(mfrow=c(1,2))

# plot precipitation
with(dataM, {
  matplot(t(l10precip), type = "l", lty = as.numeric(region), 
          col = as.numeric(region),
          xlab = "", ylab = "",
          ylim = c(-1.2, 1))#, cex.axis = 1, cex.lab = 1)
  legend("bottom", col = 1:4, lty = 1:4, legend = levels(region),
         cex = 1, bty = "n")
})
mtext("time [month]", 1, line = 2)#, cex = 1.5)
mtext("log(precipitation) [mm]", 2, line = 2)#, cex = 1.5)

# plot temperature
with(dataM, {
  matplot(t(tempRaw), type = "l", lty = as.numeric(region), 
          col = as.numeric(region),
          xlab = "", ylab = "")#, 
#          cex.axis = 1, cex.lab = 1)
})
mtext("time [month]", 1, line = 2)#, cex = 1.5)
mtext("temperature [C]", 2, line = 2)#, cex = 1.5)

## ----setup-model, echo = TRUE, message=FALSE, warning=FALSE-------------------
locations <- cbind(dataM$lon, dataM$lat)
### fix location names s.t. they correspond to levels in places
rownames(locations) <- as.character(dataM$place)

library(fields) 
  ### get great circle distances between locations:
  dist <- rdist.earth(locations, miles = FALSE, R = 6371)
  
  ### construct Matern correlation matrices as
  ### marginal penalty for a GRF over the locations:
  ### find ranges for nu = .5, 1 and 10
  ### where the correlation drops to .2 at a distance of 500/1500/3000 km
  ### (about the 10%/40%/70% quantiles of distances here)
  r.5 <- Matern.cor.to.range(500, nu = 0.5, cor.target = .2)
  r1 <- Matern.cor.to.range(1500, nu = 1.0, cor.target = .2)
  r10 <- Matern.cor.to.range(3000, nu = 10.0, cor.target = .2)
  ### compute correlation matrices
  corr_nu.5 <- apply(dist, 1, Matern, nu = .5, range = r.5)
  corr_nu1 <- apply(dist, 1, Matern, nu = 1, range = r1)
  corr_nu10 <- apply(dist, 1, Matern, nu = 10, range = r10)
  ### invert to get precisions
  P_nu.5 <- solve(corr_nu.5)
  P_nu1 <- solve(corr_nu1)
  #P_nu10 <- solve(corr_nu10)
   
if(FALSE){
  curve(Matern(x, nu = .5, range = r.5), 0, 5000, ylab = "Correlation(d)", 
        xlab = "d [km]", lty = 2)
  curve(Matern(x, nu = 1, range = r1), 0, 5000, add = TRUE, lty = 3)
  curve(Matern(x, nu = 10, range = r10), 0, 5000, add = TRUE, lty = 4)
  legend("topright", inset = 0.2, lty = c(2, NA, 3, NA, 4), 
         legend = c(".5", "", "1", "", "10"), 
       title = expression(nu), cex = .8, bty = "n") 
}


## for uncorrelated residuals
#   P_nu.5 <- diag(35)
#   print("Residuals are uncorrelated!")


## ----fit-model, echo = TRUE, message=FALSE------------------------------------
# use bolsc() base-learner with precision matrix as penalty matrix
set.seed(210114)
mod3 <- FDboost(l10precip ~ bols(region, df = 2.5, contrasts.arg = "contr.dummy") 
                + bsignal(temp, month.s, knots = 11, cyclic = TRUE, 
                          df = 2.5, boundary.knots = c(0.5, 12.5), check.ident = FALSE)
                + bolsc(place, df = 2.5, K = P_nu.5, contrasts.arg = "contr.dummy"), 
                timeformula = ~ bbs(month.t, knots = 11, cyclic = TRUE, 
                                 df=3, boundary.knots = c(0.5, 12.5)),
                offset="scalar", offset_control = o_control(k_min = 5), 
                data=dataM)

## ----cv-bootstrap0, eval = TRUE-----------------------------------------------
mod3 <- mod3[47]

## ----cv-bootstrap, eval = FALSE-----------------------------------------------
#  set.seed(2303)
#  folds <- cvMa(ydim = mod3$ydim, type = "bootstrap", B = 25)
#  cvMod3 <- cvrisk(mod3, grid = seq(1, 1000, by=1), folds = folds, mc.cores = 1)
#  mod3 <- mod3[mstop(cvMod3)] # 47
#  # summary(mod3)

## ----plot-bootstrap-model1a, echo = TRUE, fig.cap='Coefficients for model with 49 boosting iterations (determinded by 25 fold bootstrap). The estimated coefficients for the four climatic regions are plotted with color coded regions (left panel).  The coefficient function for the functional effect of temperature is colored in red for positive values and in blue for negative values (right panel).', fig.pos='!htbp', fig.height=5, fig.width=10----

par(mfrow=c(1,2))#, mar = c(7,4,7,1))#, cex = 1.5, cex.main = 0.9)
predRegion <- predict(mod3, which = 1, 
                      newdata = list(region = factor(c("Arctic", "Atlantic",
                                                   "Continental", "Pacific")),
                                   month.t = seq(1, 12, l=20))) + mod3$offset
matplot(seq(1, 12, l = 20), t(predRegion), col = 1:4, 
        type = "l", lwd = 2, lty = 1:4,
        main = "region", ylab = "", xlab = "")
mtext("t, time [month]", 1, line = 2)#, cex = 1.5)

legend("bottom", lty = 1:4, legend = levels(dataM$region), col = 1:4, bty = "n", lwd = 2)

## plot the effect of temperature
# par(mar = c(4,4,7,1))
plot(mod3, which = 2, pers = TRUE, main = "temperature", zlab = "",
     xlab = "s, time [month]", ylab = "t, time [month]")

## ----cv-curves0, echo = TRUE--------------------------------------------------
mod3 <- mod3[750]

## ----cv-curves, eval = FALSE--------------------------------------------------
#  set.seed(143)
#  folds <- cvMa(ydim = mod3$ydim, type = "curves")
#  cvMod3curves <- cvrisk(mod3, grid = seq(1, 1000, by = 1), folds = folds, mc.cores = 1)
#  
#  ## optimal stopping iteration in terms of mean
#  mstop(cvMod3curves)
#  ## optimal stopping iteration in terms of median
#  (mStop <- which.min(apply(cvMod3curves, 2, median)) )
#  mod3 <- mod3[mStop]

## ----plot-bootstrap-model1b, echo = TRUE, fig.cap='Coefficients for model with 750 boosting iterations (determinded by leaving-one-curve-out cross-validation). The estimated coefficients for the four climatic regions are plotted with color coded regions (left panel).  The coefficient function for the functional effect of temperature is colored in red for positive values and in blue for negative values (right panel).', fig.pos='!htbp', fig.height=5, fig.width=10----
par(mfrow=c(1,2))
predRegion <- predict(mod3, which=1, 
                      newdata = list(region = factor(c("Arctic", "Atlantic",
                                                   "Continental", "Pacific")),
                                   month.t=seq(1, 12, l = 20))) + mod3$offset
matplot(seq(1, 12, l=20), t(predRegion), col = 1:4, 
        type = "l", lwd = 2, lty = 1:4,
        main = "region", ylab = "", xlab = "")
mtext("t, time [month]", 1, line = 2)#, cex = 1.5)
#plot(mod, which=1, lwd=2, lty=1, col=c(2,3,4,1))
legend("bottom", lty = 1:4, legend = levels(dataM$region), col = 1:4, bty = "n", lwd = 2)

## plot the effect of temperature
plot(mod3, which = 2, pers = TRUE, main = "temperature", zlab = "",
     xlab = "s, time [month]", ylab = "t, time [month]")

## ----plot-bootstrap-model2_prep, echo = TRUE----------------------------------
ord <- c("Dawson", "Whitehorse", "Yellowknife", "Uranium City", "Churchill",
         "Edmonton",  "Pr. Albert", "The Pas", "Calgary", "Regina", "Winnipeg",
         "Thunder Bay",
         "Pr. George", "Pr. Rupert", "Kamloops", "Vancouver", "Victoria",         
         "Scheffervll", "Bagottville", "Arvida", "St. Johns", "Quebec",
         "Fredericton", "Sydney", "Ottawa", "Montreal", "Sherbrooke", "Halifax",
         "Yarmouth", "Toronto", "London", "Charlottvl",
         "Inuvik",  "Resolute", "Iqaluit" )
ind <- sapply(1:35, function(s){ which(dataM$place == ord[s]) }) 
smoothRes <- predict(mod3, which=3)
if( is.null(dim(smoothRes)) ) smoothRes <- matrix(0, ncol = 12, nrow = 35)
smoothRes <- (smoothRes )[ind, ] 
# smoothRes <- ( predict(mod4, which=3) )[ind, ]
regionOrd <- dataM$region[ind]

fit3 <- (predict(mod3))[ind, ] 
response <- dataM$l10precip[ind, ]

## ----plot-bootstrap-model2, echo = TRUE, fig.cap='The estimated smooth spatially correlated residual curves (lines) and the residuals (circles) are plotted with regions color-coded. The locations of the weather stations are given in the map at the bottom.', warning=FALSE, message=FALSE, fig.height=9.5----
par(mar = c(2.55, 2.05, 2.05, 1.05), oma=c(0, 0, 0, 0))
layout(rbind(matrix(1:36, 6, 6), rep(37, 6), rep(37, 6)))
for(i in 1:35) {
  plot(1:12, smoothRes[i, ], col = as.numeric(regionOrd[i]), type = "l",
       ylim = range(smoothRes, response-fit3), 
       main = paste(ord[i], " (", i, ")", sep = ""), 
       cex = 1.2, cex.axis = .8, ylab = "", xlab = "")
  abline(h = 0, col = 8)
  lines(1:12, smoothRes[i, ], col = as.numeric(regionOrd[i]))
  points(1:12, response[i, ] - fit3[i, ], cex = 0.8)
}
plot(0, 0, col = "white", xaxt = "n", yaxt = "n", bty = "n")

if(require(maps) & require(mapdata)){
  mapcanada <- map(database="world", regions="can", plot=FALSE)
  plot(mapcanada, type = "l", xaxt = "n", yaxt = "n", ylab = "", xlab = "", bty = "n",
     xlim = c(-141, -50), ylim=c(43, 74),
     col = "grey", mar = c(0, 0, 0, 0))
  for(i in 1:35) {
  text(-dataM$lon[ind[i]], dataM$lat[ind[i]], col = as.numeric(regionOrd[i]), 
       labels = as.character(i), cex = 0.8)
 }
}