circforest_fit_models_ibk.R
In circtree: Regression Trees and Forests for Circular Responses

# -------------------------------------------------------------------
# - NAME:   circforest_fit_models_ibk.R
# - AUTHOR: Moritz N. Lang, Lisa Schlosser
# - DATE:   2019-06-03
# -------------------------------------------------------------------
# - PURPOSE:
# -------------------------------------------------------------------
# - L@ST MODIFIED: 2019-08-14 on thinkmoritz
# -------------------------------------------------------------------

## Model version for output files
my_vers <- "v4"

## Set time to utc (just in case)
Sys.setenv('TZ'='UTC')

## Load packages
library("zoo")
library("circtree")
library("verification") # Careful, version 1.35 needed!!

if (! dir.exists("results")) dir.create("results")

# -------------------------------------------------------------------
# Pre-process data
# -------------------------------------------------------------------

## Load data / Remove rows with NAs
tmp <- readRDS("data/circforest_prepared_data_ibk_lag6.rds")
tmp <- tmp[, -grep("obertauern|montana|mariazell|guetsch|altdorf", names(tmp))]

tmp <- na.omit(tmp)

## Subset data to full hours
d <- tmp[as.POSIXlt(index(tmp))$min == 0L, ]; rm(tmp); gc()

## Remove very low values of ff and zero wind direction
d <- d[!d[, "dd.response"] == 0,]
d <- d[!d[, "ff.response"] < 1,]

## Transform response.dd from 0-360 degree to [-pi, pi]
d$dd.response <- d$dd.response / 360 * 2*pi
d$dd.response[d$dd.response > pi] <- d$dd.response[d$dd.response > pi] - 2*pi

## -------------------------------------------------------------------
## Fit climatology
## -------------------------------------------------------------------
#
#d_range <- as.POSIXlt(range(index(d)))
#
#pred_clim <- data.frame(mu = rep(NA, nrow(d)), kappa = rep(NA, nrow(d)))
#for(i in seq(1:nrow(d))){
#  cat(sprintf("Fitting climatology %2d%%\n", (i/nrow(d) * 100)%/%5 * 5))
#
#  i_plt <- as.POSIXlt(index(d)[i], origin = "1970-01-01")
#  
#  if((i_plt$mon + 1 == 2) & (i_plt$mday == 29)){
#    pred_clim[i, ] <- c("mu" = NA, "kappa" = NA)
#    next
#  }
#
#  i_years <- seq.int(min(d_range$year), max(d_range$year))[!(seq.int(min(d_range$year), 
#    max(d_range$year)) %in% i_plt$year)] + 1900
#
#  idx <- lapply(i_years, function(x) 
#    as.POSIXct(sprintf("%04d-%02d-%02d %02d:%02d:00", x, i_plt$mon + 1, i_plt$mday, 
#    i_plt$hour, i_plt$min), origin = "1970-01-01") + 60 * 60 * 24 * seq(-3, 3))
#
#  idx <- do.call(c, idx)
#  
#  train <- subset(d, index(d) %in% idx)
#
#  if(nrow(train) > 2){
#    climfit <- distfit(train$dd.response,
#                         family = dist_vonmises())
#    pred_clim[i, ] <- coef(climfit, type = "parameter")
#  } else {
#    pred_clim[i, ] <- c("mu" = NA, "kappa" = NA)
#  }
#}
#
#save(pred_clim, file = "results/circforest_pred_clim_ibk_lag6_v3.rda")

# -------------------------------------------------------------------
# Fit persistence model (no CV)
# -------------------------------------------------------------------
#
## Calculate exponential weights
calc_expweights <- function(theta, n){rev(sapply(1:n, function(x) theta^(x -1) * (1 - theta)))}
exp_weights <- calc_expweights(0.4, 6)

pred_pers <- data.frame(mu = rep(NA, nrow(d)), kappa = rep(NA, nrow(d)))
for(i in seq(1:nrow(d))){
  cat(sprintf("Fitting persistence %2d%%\n", (i/nrow(d) * 100)%/%5 * 5))

  i_plt <- as.POSIXlt(index(d)[i], origin = "1970-01-01")
  
  ## Skip 29th of February 
  if((i_plt$mon + 1 == 2) & (i_plt$mday == 29)){
    pred_pers[i, ] <- c("mu" = NA, "kappa" = NA)
    next
  }

  ## Subset training data set
  idx <- as.POSIXct(sprintf("%04d-%02d-%02d %02d:%02d:00", i_plt$year + 1900, i_plt$mon + 1, i_plt$mday, 
    i_plt$hour, i_plt$min), origin = "1970-01-01") + 60 * 60 * seq(-6, -1)

  train <- subset(d, index(d) %in% idx)
  train_weights <- exp_weights[idx %in% index(train)]

  ## Fit persistency
  persfit <- try(distfit(as.numeric(train$dd.response),
                           family = dist_vonmises(), weights = train_weights))

  ## Predict parameters
  if(class(persfit) == "try-error") {
    pred_pers[i, ] <- c("mu" = NA, "kappa" = NA)
  } else {
    pred_pers[i, ] <- coef(persfit, type = "parameter")
  }
}

## Save predictions
saveRDS(pred_pers, file = sprintf("results/circforest_pred_pers_ibk_lag6_%s.rds", my_vers))
rm(pred_pers, persfit, train); gc()

# -------------------------------------------------------------------
# Fit climatology (with CV)
# -------------------------------------------------------------------

## Fit models with cross-validation
cvID <- sort(rep(1:5, ceiling(nrow(d) / 5)))[1:nrow(d)]

pred_clim <- lapply(unique(cvID), function(x) data.frame(mu = rep(NA, sum(cvID == x)), 
  kappa = rep(NA, sum(cvID == x))))

for(cv in unique(cvID)) { 
  cat(sprintf("Fitting models cv %s/%s\n", cv, max(cvID)))

  train <- d[cv != cvID, ]
  test <- d[cv == cvID, ]
 
  d_range <- as.POSIXlt(range(index(train)))

  for(i in seq(1:nrow(test))){
    cat(sprintf("Fitting climatology %2d%%\n", (i/nrow(test) * 100)%/%5 * 5))
  
    i_plt <- as.POSIXlt(index(test)[i], origin = "1970-01-01")
    
    ## Skip 29th of February 
    if((i_plt$mon + 1 == 2) & (i_plt$mday == 29)){
      pred_clim[[cv]][i, ] <- c("mu" = NA, "kappa" = NA)
      next
    }
  
    ## Subset training data set
    i_years <- seq.int(min(d_range$year), max(d_range$year))[!(seq.int(min(d_range$year), 
    max(d_range$year)) %in% i_plt$year)] + 1900

    idx <- lapply(i_years, function(x) 
      as.POSIXct(sprintf("%04d-%02d-%02d %02d:%02d:00", x, i_plt$mon + 1, i_plt$mday, 
      i_plt$hour, i_plt$min), origin = "1970-01-01") + 60 * 60 * 24 * seq(-3, 3))
  
    idx <- do.call(c, idx)
    
    train_subset <- subset(train, index(train) %in% idx)
  
    ## Fit climatology
    climfit <- try(distfit(train_subset$dd.response,
                             family = dist_vonmises()))

    ## Predict parameters
    if(class(climfit) == "try-error") {
      pred_clim[[cv]][i, ] <- c("mu" = NA, "kappa" = NA)
    } else {
      pred_clim[[cv]][i, ] <- coef(climfit, type = "parameter")
    }
  }
}
  
## Save predictions
saveRDS(pred_clim, file = sprintf("results/circforest_pred_clim_ibk_lag6_%s.rds", my_vers))
rm(pred_clim, climfit, train, test, train_subset); gc()


# -------------------------------------------------------------------
# Fit tree and forest
# -------------------------------------------------------------------

## Transform zoo object to data.frame
tmp_time <- as.POSIXlt(index(d))
d <- as.data.frame(d)

## Add day of the year and time
d$daytime <- cut(tmp_time$hour, seq(0,24,by=3), include.lowest = TRUE)
d$doy <- tmp_time$yday + 1

rm(tmp_time); gc()

## Set up formula
f <- as.formula(paste("dd.response ~ ", paste(names(d)[-grep("response", names(d))], collapse= "+")))

## Fit models with cross-validation
cvID <- sort(rep(1:5, ceiling(nrow(d) / 5)))[1:nrow(d)]

for(cv in unique(cvID)) { 
  cat(sprintf("Fitting models cv %s/%s\n", cv, max(cvID)))

  train <- d[cv != cvID, ]
  test <- d[cv == cvID, ]
  
  ## Fit tree
  m_ct <- circtree(formula = f,
                           data = train,
                           control = disttree_control(mincriterion = (1 - sqrt(.Machine$double.eps))))
  
  ## Fit forest
  m_cf <- circforest(formula = f,
                       data = train,
                       ntree = 100,
                       mtry = ceiling(1. * length(all.vars(f[[3]]))),
                       perturb = list(replace = FALSE, fraction = 0.3),
                       control = disttree_control(nmax = c("yx" = Inf, "z" = 50)))

  ## Predict models
  pred_ct.tmp <- predict(m_ct, newdata = test, type = "parameter")
  pred_cf.tmp <- predict(m_cf, newdata = test, type = "parameter")

  ## Save predictions (and models)
  saveRDS(m_ct, 
    file = sprintf("results/circforest_model_tree_ibk_lag6_%s_cv%s.rds", my_vers, cv))
  saveRDS(pred_ct.tmp, 
    file = sprintf("results/circforest_pred_tree_ibk_lag6_%s_cv%s.rds", my_vers, cv))
  saveRDS(pred_cf.tmp, 
    file = sprintf("results/circforest_pred_forest_ibk_lag6_%s_cv%s.rds", my_vers, cv))
  rm(pred_ct.tmp, pred_cf.tmp, m_cf, m_ct, train, test); gc()
}

# -------------------------------------------------------------------
# Combine predictions
# -------------------------------------------------------------------

## Load predictions
pred_ct <- pred_cf <- list()
for(cv in unique(cvID)) {
  pred_ct.tmp <- readRDS(file = sprintf("results/circforest_pred_tree_ibk_lag6_%s_cv%s.rds", my_vers, cv))
  pred_cf.tmp <- readRDS(file = sprintf("results/circforest_pred_forest_ibk_lag6_%s_cv%s.rds", my_vers, cv))
  pred_ct[[cv]] <- pred_ct.tmp
  pred_cf[[cv]] <- pred_cf.tmp
  rm(pred_cf.tmp, pred_ct.tmp); gc()
}

pred_pers <- readRDS(file = sprintf("results/circforest_pred_pers_ibk_lag6_%s.rds", my_vers))
pred_clim <- readRDS(file = sprintf("results/circforest_pred_clim_ibk_lag6_%s.rds", my_vers))

## Combine predictions
pred <- list(tree = do.call("rbind", pred_ct), forest = do.call("rbind", pred_cf), 
  climatology = do.call("rbind", pred_clim), persistence = pred_pers)
obs  <- d$dd.response

## Remove nans (if any)
idx <- do.call(rbind, sapply(pred, function(x) which(is.na(x), arr.ind=TRUE)))
idx <- unique(idx[,1]) 
if(length(idx) > 0){
  pred_naomit <- lapply(pred, function(x) x[-idx, ])
  obs_naomit  <- obs[-idx]
} else{
  pred_naomit <- pred
  obs_naomit  <- obs
}

## Save results
save(pred, pred_naomit, obs, obs_naomit, 
  file = sprintf("results/circforest_results_ibk_lag6_%s.rda", my_vers))

# -------------------------------------------------------------------
# Validate predictions
# -------------------------------------------------------------------

load(file = sprintf("results/circforest_results_ibk_lag6_%s.rda", my_vers))

### Validate models
#crps <- lapply(pred_naomit, function(x) crps_vonmises(mu = x$mu, kappa = x$kappa, 
#  y = obs_naomit, sum = FALSE)) 

crps_grimit <- lapply(pred_naomit, function(x) sapply(1:nrow(x), function(i) 
  as.numeric(crps.circ(x = obs_naomit[i], mu = x[i, "mu"], kappa = x[i, "kappa"]))))

## Save validation
save(crps_grimit, file = sprintf("results/circforest_validation_ibk_lag6_%s.rda", my_vers))

## -------------------------------------------------------------------
## Plotting
## -------------------------------------------------------------------
if(FALSE){
  my_vers <- "v4"

  ## Plot CRPS
  load(file = sprintf("results/circforest_validation_ibk_lag6_%s.rda", my_vers))

  X11(width = 8, height = 4.5)
  par(mar = c(3.1, 4.1, 2.1, 2.1))
  crps_grimit_skill <- data.frame(sapply(crps_grimit[c("climatology", "persistence", "tree", "forest")], 
                       function(x) (1 - x/crps_grimit[["climatology"]]) * 100))
  boxplot(crps_grimit_skill, ylim = c(-60, 110), col = gray(0.6), 
    ylab = "CRPS skill ccore [%]")
  text(c(mean(unlist(crps_grimit_skill["climatology"]), na.rm = TRUE),
         mean(unlist(crps_grimit_skill["persistence"]), na.rm = TRUE),
         mean(unlist(crps_grimit_skill["tree"]), na.rm = TRUE),
         mean(unlist(crps_grimit_skill["forest"]), na.rm = TRUE)), "*", cex=2.5 , col = "red")
  dev.print(pdf, sprintf("results/_plot_circforest_validation_crpsskill_ibk_lag6_%s.pdf", my_vers))
  
  X11(width = 8, height = 4.5)
  par(mar = c(3.1, 4.1, 2.1, 2.1))
  boxplot(crps_grimit[c("climatology", "persistence", "tree", "forest")], ylim = c(0, 1), col = gray(0.6), 
    ylab = "CRPS [m/s]")
  text(c(mean(unlist(crps_grimit["climatology"]), na.rm = TRUE),
         mean(unlist(crps_grimit["persistence"]), na.rm = TRUE),
         mean(unlist(crps_grimit["tree"]), na.rm = TRUE),
         mean(unlist(crps_grimit["forest"]), na.rm = TRUE)), "*", cex=2.5 , col = "red")
  dev.print(pdf, sprintf("results/_plot_circforest_validation_crpsraw_ibk_lag6_%s.pdf", my_vers))

  ## Plot single tree
  cv <- 1
  m_ct <- readRDS(file = sprintf("results/circforest_model_tree_ibk_lag6_%s_cv%s.rds", my_vers, cv))
  plot(m_ct, ep_args = list(justmin = 10), tp_args = list(type = "geographics"), 
    ip_args = list(pval = FALSE))
}