R/functions_Johanna.R

# These functions were written by Johanna Piipponen
#### 

# Data handling and splitting  ------------------------------------------------


FetchData <- function(station, utc, lead.time, season = 0, response = 1) {
  # Loads the data from the csv files.
  # 
  # Args:
  #   station: an index into STATIONIDS (station ids 5, 640, 2861)
  #   utc: an index into ANALYSISTIME (1-2)
  #   lead.time: an index into FORECASTS (1-65)
  #   season: the whole year (0) or a season (winter-autumn, 1-4)
  #   response: an index into RESPONSES, e.g. T2 (1)
  # 
  # Returns:
  #   data: a data frame in which the first column is the response
  #         variable and the rest are independent variables
  #   timevector: a POSIXct vector with UTC-based datetimes of the observations
  
  
  # creating the file path which relies on the constant variables in
  # importantvariables.rData
  filepath <- paste("data/station_", STATIONIDS[station],
                    "_", ANALYSISTIME[utc],
                    "_", FORECASTS[lead.time],
                    "_season", season,
                    "_", RESPONSES[response],
                    "_level", RESPONSELEVELS[response],
                    ".csv", sep="")
  
  # loads data from the file and removes the first, uninformative column
  alldata <- read.csv(file = filepath)
  alldata <- alldata[,-1]
  
  # extracts the first column and defines it as a datetime vector
  timevector <- as.POSIXct(alldata[,1], tz = "UTC")
  # the first column is then erased
  data <- alldata[,-1]
  
  # renames the first column so that the response variable follows the
  # naming convention
  colnames(data)[1] <- "HAVAINTO"
  
  # gathers the returned list
  result <- list("data" = data, "timevector" = timevector)
  
  return(result)
}
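
# A minimal usage sketch of FetchData (hypothetical index values; assumes the
# constants in importantvariables.rData are loaded and the csv file exists):
#   obj <- FetchData(station = 1, utc = 1, lead.time = 1)
#   head(obj$data)        # the response HAVAINTO plus the predictors
#   head(obj$timevector)  # POSIXct datetimes in UTC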



CleanData <- function(data.object, what.data = "all") {
  # Cleans a data object from bad observations and variables.
  #
  # Args:
  #   data.object: a data object from FetchData function
  #   what.data: all/oldandnew/new
  #
  # Returns:
  #   A cleaned data object of the same form as the input argument.
  
  # separate the data object into a data frame and a vector
  data <- data.object$data
  timevector <- data.object$timevector
  no.of.points <- length(timevector)
  
  # first we will delete faulty variables which are NA most of the time
  
  # counting the NAs in each variable (the response variable is skipped
  # because it cannot be removed)
  no.of.na <- apply(X = data[,-1], MARGIN = 2, FUN = function(x) sum(is.na(x)))
  
  # "normal" amount of NAs is one fourth of all points
  na.tolerance <- no.of.points/4
  
  # searching the variables which fulfill the conditions (number of NAs is lower
  # than the tolerance of NAs)
  nice.variables <- which(no.of.na < na.tolerance)
  
  # a small tweak because the response variable CANNOT be removed: shift the
  # indices by one and prepend the response column
  nice.variables <- c(1, nice.variables+1)
  
  # finally removing the variables which did not fulfill the earlier condition
  data <- data[,nice.variables]
  
  # secondly, we will delete all incomplete observations
  
  # searching the complete cases and keeping only them
  complete_obs <- complete.cases(data)
  data <- data[complete_obs,]
  timevector <- timevector[complete_obs]
  
  # third, we will remove snow variables because they are not important
  important.vars <- !(colnames(data) %in% c("SD", "RSN"))
  data <- data[,important.vars]
  
  # fourth, removing faulty U_100M and V_100M observations
  # both variables can only vary between -40 and 40
  good.u <- (data$U_100M > -40) & (data$U_100M < 40)
  good.v <- (data$V_100M > -40) & (data$V_100M < 40)
  good.indices <- good.u & good.v
  data <- data[good.indices,]
  timevector <- timevector[good.indices]
  
  # finally, we will remove constant variables
  data <- RemoveConstantVariables(data)
  
  # NOTE: other things to consider are
  #   * checking if standard deviation is "too low"
  #   * removing SD and RSN
  #   * removing highly correlated variables
  
  if (what.data != "all") {
    if (what.data == "oldandnew") {
      limit.for.truncated.data <- as.POSIXct("2015-11-06 23:59:00", tz = "UTC")
    } else if (what.data == "new") {
      limit.for.truncated.data <- as.POSIXct("2016-03-20 23:59:00", tz = "UTC")
    }
    index <- which.max(timevector >= limit.for.truncated.data)
    last.index <- length(timevector)
    timevector <- timevector[index:last.index]
    data <- data[index:last.index,]
  }
  # when what.data == "all", the full time range is kept
  
  # returning the data object in the same form but typically a smaller size
  results <- list("data" = data, "timevector" = timevector)
  
  return(results)
}


RemoveConstantVariables <- function(data) {
  # Removes variables that are constant everywhere.
  #
  # Args:
  #   data: a data frame (or matrix)
  #
  # Returns:
  #   A data frame with the constant variables removed.
  
  constants <- apply(X = data, MARGIN = 2, FUN = IsVariableConstant)
  data <- data[,!constants]
  
  return(data)
}


IsVariableConstant <- function(x) {
  # Tests whether a vector is constant.
  #
  # Args:
  #   x: a vector
  #
  # Returns:
  #   TRUE if a vector is constant, FALSE otherwise.
  
  # tests if the difference between the smallest and the largest value is
  # smaller than the precision
  result <- abs(max(x) - min(x)) < .Machine$double.eps^0.5
  return(result)
}
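
# Example of IsVariableConstant:
#   IsVariableConstant(rep(2, 5))   # TRUE
#   IsVariableConstant(c(1, 1, 2))  # FALSE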


LoadCleanedData <- function(..., what.data = "all") {
  # A wrapper function for FetchData and CleanData: performs both.
  #
  # Args:
  #   ...: passed on to FetchData
  #   what.data: all/oldandnew/new, passed on to CleanData
  #
  # Returns:
  #   A fetched and cleaned data object.
  
  data.object <- FetchData(...)
  cleaned.data.object <- CleanData(data.object = data.object,
                                   what.data = what.data)
  
  return(cleaned.data.object)
}
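
# A usage sketch of LoadCleanedData (same hypothetical arguments as the
# FetchData example above):
#   obj <- LoadCleanedData(station = 1, utc = 1, lead.time = 1,
#                          what.data = "new")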


SplitData <- function(timevector) {
  # Splits data into year-long groups.
  # DEPRECATED: superseded by `SplitDataEvenly`.
  #
  # Args:
  #   timevector: a vector of POSIXct elements
  # 
  # Returns:
  #   Indices that define the first element in each group. The last index is
  #   one past the end of the vector, which must be taken into account when
  #   using the indices in a loop.
  
  # the data starts at 2011-12-01 which leads to choosing these cut-off dates
  constants <- as.POSIXct(c("2011-11-30 23:59:00",
                            "2012-11-30 23:59:00",
                            "2013-11-30 23:59:00",
                            "2014-11-30 23:59:00",
                            "2015-11-30 23:59:00"), tz = "UTC")
  
  # find out the indices of datetimes that happen RIGHT after the above times
  indices <- sapply(X = constants, FUN = function(x) which.max(timevector >= x))
  
  # we will erase the duplicated values
  # the duplicated values might happen if there isn't ANY data on some interval
  # (I'm looking at you, Cairo Airport)
  indices <- indices[!duplicated(indices)]
  
  # forming the indices
  indices <- c(indices, length(timevector)+1)
  
  return(indices)
}


SplitDataEvenly <- function(timevector, folds = 4) {
  # Splits data evenly into `folds` groups, default is four groups.
  #
  # Args:
  #   timevector: a vector
  #   folds: the number of groups
  #
  # Returns:
  #   Indices that define the first element in each group. The last index is
  #   one past the end of the vector, which must be taken into account when
  #   using the indices in a loop.
  
  no.of.points <- length(timevector)
  min.points <- no.of.points %/% folds  # minimum fold size
  add.points <- no.of.points %% folds   # leftover points after an even split
  point.vector <- rep(min.points, times = folds)
  if (add.points > 0) {
    # distribute the leftover points among the first folds, one point each
    point.vector[1:add.points] <- point.vector[1:add.points] + 1
  }
  # cumulative start indices; the last element is one past the end
  indices <- rep(NA, times = (folds+1))
  indices[1] <- 1
  for (ii in 2:(folds+1)) {
    indices[ii] <- indices[ii-1] + point.vector[ii-1]
  }
  return(indices)
}
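
# A worked example of SplitDataEvenly: ten observations in three folds give
# fold sizes 4, 3 and 3, so the start indices are
#   SplitDataEvenly(timevector = 1:10, folds = 3)  # 1 5 8 11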


IndexVectorToFilter <- function(indices) {
  # Creates a vector based on indices that can be used as a filter in plotting.
  # 
  # Args:
  #   indices: an index vector from functions SplitData or SplitDataEvenly
  # 
  # Returns:
  #   A vector of the same length as the last element in indices minus one. For
  #   example, if the input vector is c(1,5,8,12), the output is 
  #   c(1,1,1,1,2,2,2,3,3,3,3).
  
  no.of.groups <- (length(indices) - 1)
  sizes.of.groups <- diff(indices)
  filter <- rep(1:no.of.groups, times = sizes.of.groups)
  return(filter)
}


FetchAndCombineSeasons <- function(...) {
  # Reads and combines all four seasons of the same data set.
  #
  # Args:
  #   ...: passed on to FetchData; must not include `season`, which is set
  #     internally
  #
  # Returns:
  #   A data object with timevector and data.
  
  winter <- FetchData(season = 1, ...)
  spring <- FetchData(season = 2, ...)
  summer <- FetchData(season = 3, ...)
  autumn <- FetchData(season = 4, ...)
  
  time.winter <- winter$timevector
  time.spring <- spring$timevector
  time.summer <- summer$timevector
  time.autumn <- autumn$timevector
  
  data.winter <- winter$data
  data.spring <- spring$data
  data.summer <- summer$data
  data.autumn <- autumn$data
  
  all.times <- .POSIXct(c(time.winter, time.spring, time.summer, time.autumn),
                        tz = "UTC")
  all.datas <- rbind(data.winter, data.spring, data.summer, data.autumn)
  
  duplicated.indices <- duplicated(all.times)
  
  times.trunc <- all.times[!duplicated.indices]
  datas.trunc <- all.datas[!duplicated.indices,]
  
  correct.indices <- sort.list(times.trunc)
  
  timevector <- times.trunc[correct.indices]
  data <- datas.trunc[correct.indices,]
  
  results <- list("data" = data, "timevector" = timevector)
  return(results)
}
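
# A usage sketch of FetchAndCombineSeasons (hypothetical arguments; `season`
# must not be given because the function sets it itself):
#   obj <- FetchAndCombineSeasons(station = 1, utc = 1, lead.time = 1)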


# Model fitting ---------------------------------------------------------------


FitWithStep <- function(training.set, test.set = NULL,
                        object.formula, upper.formula, lower.formula,
                        direction, steps = 1000) {
  # Chooses a linear model by a stepwise algorithm and predicts a fit in an
  # independent test set.
  #
  # Args:
  #   training.set: a data matrix
  #   test.set: a data matrix
  #   object.formula: a formula viable for a linear model
  #   upper.formula: a formula viable for a linear model
  #   lower.formula: a formula viable for a linear model
  #   direction: both/forward/backward
  #   steps: the number of steps the algorithm is allowed to take
  #
  # Returns:
  #   A list of coefficients, residuals and fitted values.
  
  # saving the training and test sets to the global environment
  # NOTE: step() re-evaluates the model calls while searching, so the data
  # must be visible outside this function's scope; some algorithms fail
  # without this workaround
  training.set <<- training.set
  test.set <<- test.set
  
  # generating the linear models
  # note that they are dependent on the data used
  object.lm <- lm(object.formula, data = training.set)
  upper.lm <- lm(upper.formula, data = training.set)
  lower.lm <- lm(lower.formula, data = training.set)
  
  # running the stepwise search for the best model
  step.model <- step(object = object.lm,
                     scope = list(upper = upper.lm, lower = lower.lm),
                     direction = direction, trace = 0, steps = steps)
  
  # if there is no test data, the fit is calculated from the training data
  if (is.null(test.set)) {
    test.set <- training.set
  }
  
  # calculating the fit
  fitted.values <- predict(object = step.model, newdata = test.set)
  
  fitted.values <- as.numeric(fitted.values)
  residuals <- as.numeric(test.set[,1] - fitted.values)
  coefficients <- step.model$coefficients
  
  results <- list("coefficients" = coefficients,
                  "residuals" = residuals,
                  "fitted.values" = fitted.values)
  
  class(results) <- "simplelm"
  
  return(results)
}
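
# A usage sketch of FitWithStep (hypothetical data frame `d` whose first
# column is the response):
#   resp <- colnames(d)[1]
#   fit <- FitWithStep(training.set = d,
#                      object.formula = as.formula(paste(resp, "~ 1")),
#                      upper.formula  = as.formula(paste(resp, "~ .")),
#                      lower.formula  = as.formula(paste(resp, "~ 1")),
#                      direction = "forward", steps = 10)
#   fit$coefficients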


FitWithGlmnet <- function(training.set, test.set = NULL,
                          alpha, standardize = TRUE,
                          pmax = NULL,
                          choosing.lambda = "1se") {
  # Chooses the best glmnet (elastic net) model and predicts a fit.
  #
  # Args:
  #   training.set: a data frame whose first column is the response
  #   test.set: a data frame of the same form
  #   alpha: the elastic net mixing parameter, from 0 (ridge) to 1 (lasso)
  #   standardize: TRUE/FALSE
  #   pmax: the maximum number of nonzero coefficients (defaults to the
  #     number of predictors)
  #   choosing.lambda: "1se"/"min", the rule for picking lambda in
  #     cross-validation
  #
  # Returns:
  #   A list of coefficients, residuals and fitted values.
  
  if (is.null(pmax)) {
    pmax <- ncol(training.set) - 1
  }
  
  # transforming the data to matrices
  training.set.x <- as.matrix(training.set[,-1])
  training.set.y <- training.set[,1]
  
  # if there is no test data, the fit is calculated from the training data
  if (is.null(test.set)) {
    test.set.x <- training.set.x
    test.set.y <- training.set.y
  } else {
    test.set.x <- as.matrix(test.set[,-1])
    test.set.y <- test.set[,1]
  }
  
  #grid <- 10^seq(10, -4, length = 200)
  
  # estimating the glmnet model
  # warnings are suppressed because the size limit (pmax) of the model
  # results in many unimportant warnings
  glmnet.model <- suppressWarnings(
    glmnet(training.set.x, training.set.y, family = "gaussian",
           alpha = alpha, 
           standardize = standardize, pmax = pmax)
  )
  
  # crossvalidating the glmnet model to find out the best value for lambda/s
  # NOTE: some sources say that the hyperparameter estimation should be done in
  # a separate data set and not in the same that was used when estimating the
  # coefficients. this is not implemented.
  filter.for.folds <- IndexVectorToFilter(SplitDataEvenly(training.set.y))
  
  cv.glmnet.model <- suppressWarnings(
    cv.glmnet(training.set.x, training.set.y, alpha = alpha,
              foldid = filter.for.folds, pmax = pmax)
  )
  
  if (choosing.lambda == "1se") {
    best.lambda <- cv.glmnet.model$lambda.1se
  } else {
    best.lambda <- cv.glmnet.model$lambda.min
  }
  
  # choosing the best coefficients
  all.coefficients <- coef(cv.glmnet.model, s = best.lambda)
  all.coef.names <- rownames(all.coefficients)
  nonzero.indices <- which(all.coefficients != 0)
  coefficients <- all.coefficients[nonzero.indices]
  names(coefficients) <- all.coef.names[nonzero.indices]
  
  # predicting the fit and residuals
  fitted.values <- predict.glmnet(object = glmnet.model, newx = test.set.x,
                                  s = best.lambda,
                                  type = "response")
  residuals <- test.set.y - fitted.values
  
  # combining results
  fitted.values <- as.numeric(fitted.values)
  residuals <- as.numeric(residuals)
  
  results <- list("coefficients" = coefficients,
                  "residuals" = residuals,
                  "fitted.values" = fitted.values)
  class(results) <- "simplelm"
  
  return(results)
}
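
# A usage sketch of FitWithGlmnet (hypothetical data frame `d`; requires the
# glmnet package):
#   fit <- FitWithGlmnet(training.set = d, alpha = 1, pmax = 11,
#                        choosing.lambda = "min")
#   fit$coefficients  # the nonzero coefficients of the chosen lasso model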


FitWithRegsubsets <- function(training.set, test.set = NULL,
                              x, force.in, method) {
  # Chooses a best subsets model of up to 10 variables and predicts a fit.
  #
  # Args:
  #   training.set: a data frame whose first column is the response
  #   test.set: a data frame of the same form
  #   x: a formula
  #   force.in: the index of the variable that is forced into the model
  #   method: forward/backward/seqrep
  #
  # Returns:
  #   A list of coefficients, residuals and fitted values.
  
  # estimating the best models
  regsubsets.model <- regsubsets(x, data = training.set, nbest = 1, nvmax = 10,
                                 force.in = force.in, method = method)
  
  # if there is no test data, the fit is calculated from the training data
  if (is.null(test.set)) {
    test.set <- training.set
  }
  
  # calculating the fitted values
  fitted.values <- PredictWithRegsubsets(object = regsubsets.model,
                                         newdata = test.set)
  residuals <- test.set[,1] - fitted.values
  
  # the estimated coefficients of the chosen model
  # coef() returns only the variables selected into the model
  id <- dim(summary(regsubsets.model)$which)[1]
  coefficients <- coef(regsubsets.model, id)
  
  fitted.values <- as.numeric(fitted.values)
  residuals <- as.numeric(residuals)
  
  results <- list("coefficients" = coefficients,
                  "residuals" = residuals,
                  "fitted.values" = fitted.values)
  
  class(results) <- "simplelm"
  return(results)
}
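
# A usage sketch of FitWithRegsubsets (hypothetical data frame `d`; requires
# the leaps package):
#   f <- as.formula(paste(colnames(d)[1], "~ ."))
#   fit <- FitWithRegsubsets(training.set = d, x = f, force.in = NULL,
#                            method = "forward")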


FitWithPCR <- function(training.set, test.set = NULL,
                       formula, ncomp = (length(training.set)-1), scale = TRUE,
                       regulation = "none") {
  # Performs a principal component regression and predicts with the model.
  #
  # Args:
  #   training.set: a data frame whose first column is the response
  #   test.set: a data frame of the same form
  #   formula: a formula (currently unused; the model formula is rebuilt
  #     internally)
  #   ncomp: the number of principal components (defaults to the number of
  #     predictors)
  #   scale: TRUE/FALSE
  #   regulation: which variable-pruning rule is used
  #     (none/eigenvalues/variance)
  #
  # Returns:
  #   A list of coefficients, residuals and fitted values.
  
  pca <- prcomp(~., data = training.set[,-1],
                center = TRUE, scale. = TRUE, tol = NULL)
  
  variables <- switch(regulation,
                      eigenvalues = RegulatePCRUsingEigenvalues(pca),
                      variance = RegulatePCRUsingVariance(pca),
                      none = c("."))
  no.of.variables <- length(variables)
  if (variables[1] == ".") {
    # no pruning was done, so use the requested number of components
    no.of.variables <- ncomp
  }
  
  pcr.formula <- as.formula(paste(colnames(training.set)[1], " ~ ",
                                  paste(variables, collapse = " + "),
                                  sep = ""))
  pcr.model <- pcr(pcr.formula, ncomp = no.of.variables, data = training.set,
                   scale = scale, validation = "none")
  coefficients.array <- coef(pcr.model, ncomp = no.of.variables,
                             intercept = TRUE)
  coefficients <- as.numeric(coefficients.array)
  names(coefficients) <- rownames(coefficients.array)
  
  # if there is no test data, the fit is calculated from the training data
  if (is.null(test.set)) {
    test.set <- training.set
  }
  
  # predicts with the model
  fitted.values <- predict(pcr.model, ncomp = no.of.variables,
                           newdata = test.set)
  residuals <- test.set[,1] - fitted.values
  
  fitted.values <- as.numeric(fitted.values)
  residuals <- as.numeric(residuals)
  
  results <- list("coefficients" = coefficients,
                  "residuals" = residuals,
                  "fitted.values" = fitted.values)
  
  class(results) <- "simplelm"
  
  return(results)
}
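
# A usage sketch of FitWithPCR (hypothetical data frame `d`; requires the
# pls package):
#   f <- as.formula(paste(colnames(d)[1], "~ ."))
#   fit <- FitWithPCR(training.set = d, formula = f, ncomp = 5,
#                     scale = TRUE, regulation = "none")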


RegulatePCRUsingEigenvalues <- function(prcomp.object) {
  # Removes the least important variables based on the eigenvalues in principal
  # component analysis.
  #
  # Args:
  #   prcomp.object: an object created by the function prcomp()
  #
  # Returns:
  #   A vector of the names of the variables that are left in the model.
  
  eigenvalues <- prcomp.object$sdev^2
  eigenvectors.abs <- abs(prcomp.object$rotation)
  
  # finding the eigenvalues that are smaller than 0.70 and saving their
  # indices in decreasing order (from the least important to the more
  # important)
  low.eigenvalues <- rev(which(eigenvalues < 0.70))
  
  # saving the variables which we are going to reduce
  variables <- rownames(eigenvectors.abs)
  
  for (ll in low.eigenvalues) {
    # finding the largest coefficient
    deleted.variable <- which.max(eigenvectors.abs[,ll])
    # deleting the variable corresponding to the largest coefficient from
    # the eigenvector matrix and the variable list
    eigenvectors.abs <- eigenvectors.abs[-deleted.variable,]
    variables <- variables[-deleted.variable]
  }
  
  return(variables)
}


RegulatePCRUsingVariance <- function(prcomp.object) {
  # Removes the least important variables based on the explained variation in
  # principal component analysis.
  #
  # Args:
  #   prcomp.object: an object created by the function prcomp()
  #
  # Returns:
  #   A vector of the names of the variables that are left in the model.
  
  cumulative.prop.of.var <- cumsum(prcomp.object$sdev^2 / sum(prcomp.object$sdev^2))
  eigenvectors.abs <- abs(prcomp.object$rotation)
  
  # finding how many PCs are needed to explain at least 80 % of the variance;
  # every component beyond the first one reaching that limit is excess
  excess.components <- which(cumulative.prop.of.var > 0.80)
  excess.components <- excess.components[-1]
  excess.components <- rev(excess.components)
  
  # saving the initial variables
  variables <- rownames(eigenvectors.abs)
  
  for (ee in excess.components) {
    deleted.variable <- which.max(eigenvectors.abs[,ee])
    eigenvectors.abs <- eigenvectors.abs[-deleted.variable,]
    variables <- variables[-deleted.variable]
  }
  
  return(variables)
}


FitWithStepPCA <- function(training.set, test.set = NULL,
                           direction, steps = 1000) {
  # Selects a principal component regression model via the stepwise algorithm.
  #
  # Args:
  #   training.set: a data frame whose first column is the response
  #   test.set: a data frame of the same form
  #   direction: forward/backward/both
  #   steps: the number of steps the algorithm is allowed to take
  #
  # Returns:
  #   A list of coefficients, residuals and fitted values.
  
  response.variable <- colnames(training.set)[1]
  
  pca <- prcomp(~., data = training.set[,-1],
                center = TRUE, scale. = TRUE, tol = NULL)
  
  transformed.training.set <- data.frame(training.set[,1], pca$x)
  colnames(transformed.training.set)[1] <- colnames(training.set)[1]
  
  formula.lm.full <- as.formula(paste(response.variable, " ~ .", sep = ""))
  formula.lm.constant <- as.formula(paste(response.variable, " ~ 1", sep = ""))
  
  lm.full <- lm(formula = formula.lm.full,
                data = transformed.training.set)
  lm.constant <- lm(formula = formula.lm.constant,
                    data = transformed.training.set)
  
  step.pca.model <- step(lm.constant,
                         scope = list(upper = lm.full, lower = lm.constant),
                         direction = direction, trace = 0, steps = steps)
  
  if (is.null(test.set)) {
    test.set <- training.set
  }
  
  transformed.test.set <- data.frame(test.set[,1],
                                     TransformToPCACoordinates(test.set[,-1], pca))
  colnames(transformed.test.set)[1] <- colnames(training.set)[1]
  
  fitted.values <- predict(object = step.pca.model,
                           newdata = transformed.test.set)
  residuals <- test.set[,1] - fitted.values
  
  fitted.values <- as.numeric(fitted.values)
  residuals <- as.numeric(residuals)
  
  results <- list("coefficients" = NULL,
                  "residuals" = residuals,
                  "fitted.values" = fitted.values)
  
  class(results) <- "simplelm"
  
  return(results)
}


TransformToPCACoordinates <- function(x, prcomp.object) {
  # Transforms a data matrix to principal coordinates.
  #
  # Args:
  #   x: a data frame or matrix of observations in the original coordinates
  #   prcomp.object: an object created by the function prcomp()
  #
  # Returns:
  #   A data frame in principal coordinate system.
  
  obs <- dim(x)[1]
  vars <- dim(x)[2]
  
  y <- x - matrix(rep(prcomp.object$center, each = obs), ncol = vars, nrow = obs)
  y <- y / matrix(rep(prcomp.object$scale, each = obs), ncol = vars, nrow = obs)
  y <- as.matrix(y) %*% prcomp.object$rotation
  y <- as.data.frame(y)
  
  colnames(y) <- paste("PC", 1:vars, sep="")
  
  return(y)
}
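
# A sanity check for TransformToPCACoordinates (hypothetical predictor data
# frame `d`): the transformed training data should reproduce the scores in
# pca$x:
#   pca <- prcomp(d, center = TRUE, scale. = TRUE)
#   all.equal(as.matrix(TransformToPCACoordinates(d, pca)), pca$x,
#             check.attributes = FALSE)  # TRUE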


FitWithPLSR <- function(training.set, test.set = NULL,
                        formula, ncomp, scale, method = "kernelpls") {
  # Performs a partial least squares regression and predicts with the model.
  #
  # Args:
  #   training.set: a data frame whose first column is the response
  #   test.set: a data frame of the same form
  #   formula: a formula viable for plsr()
  #   ncomp: the number of components
  #   scale: TRUE/FALSE
  #   method: the fitting algorithm passed on to plsr()
  #
  # Returns:
  #   A list of coefficients, residuals and fitted values.
  
  # estimates the PLS regression model
  plsrmodel <- plsr(formula = formula, data = training.set, scale = scale,
                    method = method, validation = "none")
  
  # if there is no test data, the fit is calculated from the training data
  if (is.null(test.set)) {
    test.set <- training.set
  }
  
  coefficients.array <- coef(plsrmodel, ncomp = ncomp, intercept = TRUE)
  coefficients <- as.numeric(coefficients.array)
  names(coefficients) <- rownames(coefficients.array)
  
  # predicts with the model
  fitted.values <- predict(plsrmodel, ncomp = ncomp, newdata = test.set)
  residuals <- test.set[,1] - fitted.values
  
  fitted.values <- as.numeric(fitted.values)
  residuals <- as.numeric(residuals)
  
  results <- list("coefficients" = coefficients,
                  "residuals" = residuals,
                  "fitted.values" = fitted.values)
  
  class(results) <- "simplelm"
  
  return(results)
}


FitWithBtSVD <- function(training.set, test.set = NULL, P) {
  # Fits a Bayesian truncated SVD regression and predicts with the model.
  # NOTE: THIS FUNCTION MAY BE BROKEN!!!
  # 
  # Args:
  #   training.set: a data frame whose first column is the response
  #   test.set: a data frame of the same form
  #   P: minimum explanation rate
  # 
  # Returns:
  #   A list of coefficients, residuals and fitted values.
  
  training.set.y <- as.matrix(training.set[,1], ncol = 1)
  training.set.x <- as.matrix(training.set[,-1])
  training.set.x <- scale(training.set.x, center = TRUE, scale = TRUE)
  
  N <- nrow(training.set.x)
  M <- ncol(training.set.x)
  
  decomposition <- svd(training.set.x, nu = M, nv = M)
  training.U <- decomposition$u
  singular.values <- decomposition$d
  V <- decomposition$v
  
  # estimates the Bayesian truncated SVD regression model
  btsvd.model <- BayesianTruncatedSVD(U = training.U, y = training.set.y,
                                      singular.values = singular.values, P = P)
  
  # if there is no test data, the fit is calculated from the training data
  if (is.null(test.set)) {
    test.set <- training.set
  }
  
  test.set.y <- as.matrix(test.set[,1], ncol = 1)
  test.set.x <- as.matrix(test.set[,-1])
  test.set.x <- scale(test.set.x, center = TRUE, scale = TRUE)
  
  test.U <- test.set.x %*% V %*% diag(1/singular.values)
  
  test.U.intercept <- cbind(rep(1, times = nrow(test.U)), test.U)
  test.Uq <- test.U.intercept[,1:ncol(btsvd.model$Uq)]
  
  # predicts with the model
  fitted.values <- test.Uq %*% btsvd.model$mu.U
  residuals <- test.set[,1] - fitted.values
  
  fitted.values <- as.numeric(fitted.values)
  residuals <- as.numeric(residuals)
  
  results <- list("coefficients" = NULL,
                  "residuals" = residuals,
                  "fitted.values" = fitted.values)
  
  class(results) <- "simplelm"
  
  return(results)
}


FitWithCorrPCA <- function(training.set, test.set = NULL, min.correlation) {
  # Fits a linear model on the principal components that correlate the most
  # with the response and predicts with the model.
  #
  # Args:
  #   training.set: a data frame whose first column is the response
  #   test.set: a data frame of the same form
  #   min.correlation: the minimum absolute correlation with the response
  #     that a component needs in order to be included
  #
  # Returns:
  #   A list of coefficients, residuals and fitted values.
  
  response.variable <- colnames(training.set)[1]
  
  # doing PCA
  pca <- prcomp(~., data = training.set[,-1],
                center = TRUE, scale. = TRUE, tol = NULL)
  
  # calculating the correlation between the response and principal components
  correlations <- cor(training.set[,1], pca$x)
  
  # choosing the components which correlate the most with the response variable
  chosen.components <- which(abs(correlations) >= min.correlation)
  
  names.of.chosen.components <- ""
  if (length(chosen.components) > 0) {
    names.of.chosen.components <- paste("PC", chosen.components, sep = "")
  }
  
  # gathering all model data
  transformed.training.set <- data.frame(training.set[,1], pca$x)
  colnames(transformed.training.set)[1] <- response.variable
  
  # writing the formula
  lm.formula <- as.formula(paste(response.variable, " ~ ",
                                 paste(names.of.chosen.components,
                                       collapse = " + "), " + 1", sep = ""))
  
  # estimating the linear model with the most correlated PCs
  lm.model <- lm(formula = lm.formula, data = transformed.training.set)
  
  if (is.null(test.set)) {
    test.set <- training.set
  }
  
  # transforming the test data
  transformed.test.set <- data.frame(test.set[,1],
                                    TransformToPCACoordinates(test.set[,-1], pca))
  colnames(transformed.test.set)[1] <- response.variable
  
  # predicting
  fitted.values <- predict(lm.model, transformed.test.set)
  residuals <- test.set[,1] - fitted.values
  
  fitted.values <- as.numeric(fitted.values)
  residuals <- as.numeric(residuals)
  
  results <- list("coefficients" = NULL,
                  "residuals" = residuals,
                  "fitted.values" = fitted.values)
  
  class(results) <- "simplelm"
  
  return(results)
}


FitWithLM <- function(training.set, test.set = NULL) {
  # Estimates a full linear model and predicts with the model.
  #
  # Args:
  #   training.set: a data frame whose first column is the response
  #   test.set: a data frame of the same form
  #
  # Returns:
  #   A list of coefficients, residuals and fitted values.
  
  response <- colnames(training.set)[1]
  formula <- paste(response, " ~ .", sep = "")
  model <- lm(formula = formula, data = training.set)
  
  if (is.null(test.set)) {
    test.set <- training.set
  }
  
  fitted.values <- as.vector(predict(object = model, newdata = test.set))
  residuals <- test.set[,1] - fitted.values
  
  fitted.values <- as.numeric(fitted.values)
  residuals <- as.numeric(residuals)
  
  results <- list("coefficients" = model$coefficients,
                  "residuals" = residuals,
                  "fitted.values" = fitted.values)
  
  class(results) <- "simplelm"
  
  return(results)
}


PredictWithRegsubsets <- function(object, newdata) {
  # Predicts with a model from a regsubsets object.
  #
  # Args:
  #   object: a regsubsets object created via the regsubsets() function
  #   newdata: to which (independent) data are the predictions based on
  #
  # Returns:
  #   A vector of fitted values.
  
  # https://lagunita.stanford.edu/c4x/HumanitiesScience/StatLearning/asset/ch6.html
  
  # which id corresponds to the largest model considered (up to 10 variables)
  id <- dim(summary(object)$which)[1]
  
  # the estimated coefficients of that model
  # coef() returns only the variables selected into the model
  coefficients <- coef(object, id)
  
  # creating a model matrix in which the first column is ones and the rest
  # are all the predictor variables
  variables <- names(newdata)
  model.formula <- as.formula(paste(variables[1], " ~ ",
                                   paste(variables[-1], collapse = " + "),
                                   sep = ""))
  model.matrix <- model.matrix(model.formula, data = newdata)
  
  # predicting the fitted values
  fitted.values <- model.matrix[, names(coefficients)] %*% coefficients
  
  return(fitted.values)
}


BayesianTruncatedSVD <- function(U, y, singular.values, P) {
  # THIS FUNCTION MAY BE BROKEN!!!
  #
  # Estimates parameters in a Bayesian linear model via the truncated SVD
  # method. The function is based on the equations presented in the paper
  # "Linear Models for Airborne-Laser-Scanning-Based Operational Forest
  # Inventory With Small Field Sample Size and Highly Correlated LiDAR Data" by
  # Junttila et al. (2015). Equations are referenced by their numbers where
  # applicable. The learning algorithm steps are based on the paper "Bayesian
  # Inference: An Introduction to Principles and Practice in Machine Learning"
  # by Tipping (2006), and the steps are referenced where applicable.
  # 
  # Args:
  #   U: left singular vector of the singular value decomposition of the
  #     predictors (matrix, NxM)
  #   y: predictand (column vector, Nx1) 
  #   singular.values: singular values of the SVD of the predictors (vector,
  #     length min(N, M))
  #   P: minimum explanation rate (scalar in interval (0,1))
  # 
  # Returns:
  #   A list with the following arguments:
  #   mu.U: mean vector
  #   sigma.lower.sq: error variance (in the code referenced as tau^(-1))
  #   alpha.vector: alpha vector with the first element being alpha0 and the
  #     rest alpha
  #   Uq: reduced left singular vector with an intercept
  
  # calculating the number of predictors needed (Eq. 2)
  explanation.rate <- cumsum(singular.values)/sum(singular.values)
  q <- which.max(explanation.rate > P)
  
  # adding an intercept to the predictor matrix (Eq. 3)
  U.with.intercept <- cbind(rep(1, times = nrow(U)), U)
  
  # saving only needed predictors
  Uq <- U.with.intercept[,1:(q+1)]
  singular.values.q <- singular.values[1:q]
  
  # calculating sizes
  N <- nrow(Uq)  # n
  M <- ncol(Uq)  # q+1
  K <- ncol(y)   # 1
  
  # defining bounds
  max.no.of.iterations <- 10000
  tolerance.of.alphas <- 1e8
  
  # vector of singular values including the intercept
  # (diag(Sq) in the paper in Eq. 3)
  lambda <- c(1, singular.values.q)
  lambda.minus.sq <- lambda^(-2)
  
  # Beginning the learning algorithm!
  # Step 1: Initialize all alphas and tau
  
  # initializing the inverse variances of the regression model parameters
  alpha.previous <- 0.1
  alpha.new <- alpha.previous
  alpha0.previous <- 0.1
  alpha0.new <- alpha0.previous
  alpha.vector <- c(alpha0.new, rep(alpha.new, times = M-1))
  
  # initializing the inverse error variance (tau is actually sigma.lower^(-2))
  tau <- 1
  
  # iteration loop (steps 2-4) begins
  for (ii in 1:max.no.of.iterations) {
    
    # Step 2: Compute weight posterior sufficient statistics mu.U and
    #         sigma.upper.U
    
    # covariance (Eq. 8 but only the "U part")
    sigma.upper.U <- ginv(tau * t(Uq) %*% Uq +
                            diag(lambda.minus.sq * alpha.vector))
    # the last term is simplified by results from multiplying diagonal matrices
    
    # mean (Eq. 9 but only the "U part")
    mu.U <- tau * sigma.upper.U %*% t(Uq) %*% y
    
    # Step 3: Compute all gammas, then re-estimate alphas and tau
    
    # under Eq. 13
    gamma <- 1 - alpha.vector * lambda.minus.sq * diag(sigma.upper.U)
    
    alpha0.new <- gamma[1] / mu.U[1]  # Eq. 11
    alpha.new <- sum(gamma[-1]) / sum(mu.U^2 * lambda.minus.sq)  # Eq. 12
    alpha.vector <- c(alpha0.new, rep(alpha.new, times = M-1))
    
    tau <- (N - sum(gamma)) / crossprod(y - Uq %*% mu.U)  # Eq. 13
    tau <- as.numeric(tau)
    
    # Step 4: Repeat until convergence (so we are checking the convergence
    #         conditions)
    
    # checking stopping conditions
    enough.iterations <- (ii > 50)
    alpha.too.large <- (alpha.new > tolerance.of.alphas)
    convergence.reached <- ((abs(alpha0.previous-alpha0.new) +
                               abs(alpha.previous-alpha.new)) < (0.0001/N))
    
    # possible scenarios which are (almost) mutually exclusive
    if (ii == max.no.of.iterations) {
      warning("Maximum number of iterations reached!")
    } else if (enough.iterations & alpha.too.large) {
      warning("Alphas tend towards infinity!")
      break
    } else if (enough.iterations & convergence.reached) {
      # cat("Convergence reached!\n")
      break
    } else {
      # Iterations continue...
      alpha.previous <- alpha.new
      alpha0.previous <- alpha0.new
      # alpha.vector already contains the new values
    }
    
  }
  
  # returning results
  results <- list(mu.U = mu.U, sigma.lower.sq = tau^(-1),
                  alpha.vector = alpha.vector, Uq = Uq)
  return(results)
  
}


# Wrappers --------------------------------------------------------------------


Crossvalidate <- function(type, data, timevector, ...) {
  # Crossvalidates a selected algorithm with selected parameters.
  #
  # Args:
  #   type: which model selection algorithm will be used
  #   data: a data frame whose first column is the response
  #   timevector: a POSIXct vector matching the rows of `data`
  #   ...: passed to the algorithm
  #
  # Returns:
  #   A list of residuals and fitted values.
  
  # check whether the type argument is valid
  types <- c("step", "regsubsets", "glmnet", "pcr", "plsr", "steppca", "btsvd",
             "corrpca", "lm")
  if (!(type %in% types)) {
    stop("Unknown type '", type, "'; choose one of: ",
         paste(types, collapse = ", "))
  }
  
  # the number of observations in the 'data' data frame
  no.of.points <- nrow(data)
  
  # calculating the indices where the data is split
  # NOTE: works only on yearly data! support for seasonal data has been
  # dropped!
  fold.indices <- SplitDataEvenly(timevector = timevector)
  
  # how many groups we get from splitting
  no.of.seasons <- length(fold.indices) - 1
  
  # saving the results
  fitted.values <- rep(NA, times = no.of.points)
  residuals <- rep(NA, times = no.of.points)
  
  for (ii in 1:no.of.seasons) {
    
    # defining the current test set indices
    test.indices <- fold.indices[ii]:(fold.indices[ii+1]-1)
    
    # observations outside the current fold form the training set and
    # observations inside it form the test set
    training.set <- data[-test.indices,]
    test.set <- data[test.indices,]
    
    # removing constants from the data
    training.set <- RemoveConstantVariables(training.set)
    test.set <- test.set[,names(test.set) %in% names(training.set)]
    
    # this variable will serve as a saved state for fitted values
    # the switch function chooses the correct fitting algorithm based on type
    cv.results <- switch(type,
                     step = FitWithStep(training.set = training.set,
                                        test.set = test.set, ...),
                     regsubsets = FitWithRegsubsets(training.set = training.set,
                                                    test.set = test.set, ...),
                     glmnet = FitWithGlmnet(training.set = training.set,
                                            test.set = test.set, ...),
                     pcr = FitWithPCR(training.set = training.set,
                                      test.set = test.set, ...),
                     plsr = FitWithPLSR(training.set = training.set,
                                        test.set = test.set, ...),
                     steppca = FitWithStepPCA(training.set = training.set,
                                              test.set = test.set, ...),
                     btsvd = FitWithBtSVD(training.set = training.set,
                                          test.set = test.set, ...),
                     corrpca = FitWithCorrPCA(training.set = training.set,
                                              test.set = test.set, ...),
                     lm = FitWithLM(training.set = training.set,
                                    test.set = test.set))
    
    fitted.values[test.indices] <- cv.results$fitted.values
    residuals[test.indices] <- cv.results$residuals
  }
  results <- list("residuals" = residuals, "fitted.values" = fitted.values)
  return(results)
}
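
# A usage sketch of Crossvalidate (hypothetical data object `obj` from
# LoadCleanedData):
#   cv <- Crossvalidate(type = "lm", data = obj$data,
#                       timevector = obj$timevector)
#   RMSE(cv$residuals)  # RMSE is defined in the plotting section below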


TestAlgorithms <- function(data, timevector) {
  # Tests all interesting algorithms with varying arguments.
  # 
  # Args: 
  #   data: a data frame whose first column is the response
  #   timevector: a POSIXct vector matching the rows of `data`
  # 
  # Returns:
  #   A list with elements `residuals` and `fitted.values` (one column per
  #   algorithm), `response` (the values of the response variable only), and
  #   `elapsed.time` (the time each algorithm took to run).
  
  # calculating the index for regsubsets (column id IN PREDICTOR SET)
  index.of.T2 <- (which(colnames(data) == "T2") - 1)
  
  # The first column of `data` defines the dependent response variable
  # `HAVAINTO` or `BIAS`. `BIAS` is defined as `HAVAINTO-T2`. The remaining 51
  # columns define the independent predictors, of which the most relevant
  # is `T2`, the temperature at two meters. The predictors are forecasts and 
  # `HAVAINTO` is the realised temperature.
  
  response <- colnames(data)[1]
  
  fInitial <- as.formula(paste(response, " ~ 1", sep = ""))
  fFull <- as.formula(paste(response, " ~ .", sep = ""))
  fT2 <- as.formula(paste(response, " ~ T2", sep = ""))
  
  # First, let us create a list to store our crossvalidation results:
  fitted.values.by.algorithm <- as.data.frame(matrix(NA, ncol = 0,
                                                     nrow = nrow(data)))
  residuals.by.algorithm <- as.data.frame(matrix(NA, ncol = 0,
                                                 nrow = nrow(data)))
  elapsed.time.by.algorithm <- NULL
  
  
  # Basic models --------------------------------------------------------------
  
  # for comparison purposes, we are going to input ECMWF's forecast as one model
  # and a linear model with ALL predictors as another model
  if (response == "HAVAINTO") {
    fitted.values.by.algorithm$T2 <- data$T2
    residuals.by.algorithm$T2 <- data[,1] - data$T2
  } else {
    fitted.values.by.algorithm$T2 <- rep(0, times = nrow(data))
    residuals.by.algorithm$T2 <- data[,1]
  }
  elapsed.time.by.algorithm <- c(elapsed.time.by.algorithm, T2 = 0)
  
  tmp.time.result <- system.time(
    tmp.cv.result <- Crossvalidate(type = "lm",
                                   data = data, timevector = timevector)
  )
  fitted.values.by.algorithm$FullLm <- tmp.cv.result$fitted.values
  residuals.by.algorithm$FullLm <- tmp.cv.result$residuals
  elapsed.time.by.algorithm <- c(elapsed.time.by.algorithm,
                                 FullLm = tmp.time.result[[3]])
  
  
  # Default model -------------------------------------------------------------
  
  tmp.time.result <- system.time(
    tmp.cv.result <- Crossvalidate(type = "step",
                                   data = data, timevector = timevector,
                                   object.formula = fInitial,
                                   upper.formula = fFull,
                                   lower.formula = fInitial,
                                   direction = "forward",
                                   steps = 10)
  )
  fitted.values.by.algorithm$Defaul <- tmp.cv.result$fitted.values
  residuals.by.algorithm$Defaul <- tmp.cv.result$residuals
  elapsed.time.by.algorithm <- c(elapsed.time.by.algorithm,
                                 Defaul = tmp.time.result[[3]])
  
  
  # Stepwise algorithm --------------------------------------------------------
  
  # discarded
  
  
  # Best subsets -------------------------------------------------------------
  
  # Regs01
  tmp.time.result <- system.time(
    tmp.cv.result <- Crossvalidate(type = "regsubsets",
                                   data = data, timevector = timevector,
                                   x = fFull, force.in = NULL,
                                   method = "forward")
  )
  fitted.values.by.algorithm$Regs01 <- tmp.cv.result$fitted.values
  residuals.by.algorithm$Regs01 <- tmp.cv.result$residuals
  elapsed.time.by.algorithm <- c(elapsed.time.by.algorithm,
                                 Regs01 = tmp.time.result[[3]])
  
  # Regs02
  tmp.time.result <- system.time(
    tmp.cv.result <- Crossvalidate(type = "regsubsets",
                                   data = data, timevector = timevector,
                                   x = fFull, force.in = index.of.T2,
                                   method = "forward")
  )
  fitted.values.by.algorithm$Regs02 <- tmp.cv.result$fitted.values
  residuals.by.algorithm$Regs02 <- tmp.cv.result$residuals
  elapsed.time.by.algorithm <- c(elapsed.time.by.algorithm,
                                 Regs02 = tmp.time.result[[3]])
  
  # Regs03
  tmp.time.result <- system.time(
    tmp.cv.result <- Crossvalidate(type = "regsubsets",
                                   data = data, timevector = timevector,
                                   x = fFull, force.in = NULL,
                                   method = "backward")
  )
  fitted.values.by.algorithm$Regs03 <- tmp.cv.result$fitted.values
  residuals.by.algorithm$Regs03 <- tmp.cv.result$residuals
  elapsed.time.by.algorithm <- c(elapsed.time.by.algorithm,
                                 Regs03 = tmp.time.result[[3]])
  
  # Regs04
  tmp.time.result <- system.time(
    tmp.cv.result <- Crossvalidate(type = "regsubsets",
                                   data = data, timevector = timevector,
                                   x = fFull, force.in = index.of.T2,
                                   method = "backward")
  )
  fitted.values.by.algorithm$Regs04 <- tmp.cv.result$fitted.values
  residuals.by.algorithm$Regs04 <- tmp.cv.result$residuals
  elapsed.time.by.algorithm <- c(elapsed.time.by.algorithm,
                                 Regs04 = tmp.time.result[[3]])
  
  # Regs05
  tmp.time.result <- system.time(
    tmp.cv.result <- Crossvalidate(type = "regsubsets",
                                   data = data, timevector = timevector,
                                   x = fFull, force.in = NULL,
                                   method = "seq")
  )
  fitted.values.by.algorithm$Regs05 <- tmp.cv.result$fitted.values
  residuals.by.algorithm$Regs05 <- tmp.cv.result$residuals
  elapsed.time.by.algorithm <- c(elapsed.time.by.algorithm,
                                 Regs05 = tmp.time.result[[3]])
  
  # Regs06
  tmp.time.result <- system.time(
    tmp.cv.result <- Crossvalidate(type = "regsubsets",
                                   data = data, timevector = timevector,
                                   x = fFull, force.in = index.of.T2,
                                   method = "seq")
  )
  fitted.values.by.algorithm$Regs06 <- tmp.cv.result$fitted.values
  residuals.by.algorithm$Regs06 <- tmp.cv.result$residuals
  elapsed.time.by.algorithm <- c(elapsed.time.by.algorithm,
                                 Regs06 = tmp.time.result[[3]])
  
  
  # Glmnet -------------------------------------------------------------------
  
  # Glmn01
  tmp.time.result <- system.time(
    tmp.cv.result <- Crossvalidate(type = "glmnet",
                                   data = data, timevector = timevector,
                                   alpha = 1, standardize = TRUE)
  )
  fitted.values.by.algorithm$Glmn01 <- tmp.cv.result$fitted.values
  residuals.by.algorithm$Glmn01 <- tmp.cv.result$residuals
  elapsed.time.by.algorithm <- c(elapsed.time.by.algorithm,
                                 Glmn01 = tmp.time.result[[3]])
  
  # Glmn02
  tmp.time.result <- system.time(
    tmp.cv.result <- Crossvalidate(type = "glmnet",
                                   data = data, timevector = timevector,
                                   alpha = 0.80, standardize = TRUE)
  )
  fitted.values.by.algorithm$Glmn02 <- tmp.cv.result$fitted.values
  residuals.by.algorithm$Glmn02 <- tmp.cv.result$residuals
  elapsed.time.by.algorithm <- c(elapsed.time.by.algorithm,
                                 Glmn02 = tmp.time.result[[3]])
  
  # Glmn03
  tmp.time.result <- system.time(
    tmp.cv.result <- Crossvalidate(type = "glmnet",
                                   data = data, timevector = timevector,
                                   alpha = 0.60, standardize = TRUE)
  )
  fitted.values.by.algorithm$Glmn03 <- tmp.cv.result$fitted.values
  residuals.by.algorithm$Glmn03 <- tmp.cv.result$residuals
  elapsed.time.by.algorithm <- c(elapsed.time.by.algorithm,
                                 Glmn03 = tmp.time.result[[3]])
  
  # GlmnR1
  tmp.time.result <- system.time(
    tmp.cv.result <- Crossvalidate(type = "glmnet",
                                   data = data, timevector = timevector,
                                   alpha = 1, pmax = 11,
                                   choosing.lambda = "min")
  )
  fitted.values.by.algorithm$GlmnR1 <- tmp.cv.result$fitted.values
  residuals.by.algorithm$GlmnR1 <- tmp.cv.result$residuals
  elapsed.time.by.algorithm <- c(elapsed.time.by.algorithm,
                                 GlmnR1 = tmp.time.result[[3]])
  
  # GlmnR2
  tmp.time.result <- system.time(
    tmp.cv.result <- Crossvalidate(type = "glmnet",
                                   data = data, timevector = timevector,
                                   alpha = 0.80, pmax = 11,
                                   choosing.lambda = "min")
  )
  fitted.values.by.algorithm$GlmnR2 <- tmp.cv.result$fitted.values
  residuals.by.algorithm$GlmnR2 <- tmp.cv.result$residuals
  elapsed.time.by.algorithm <- c(elapsed.time.by.algorithm,
                                 GlmnR2 = tmp.time.result[[3]])
  
  # GlmnR3
  tmp.time.result <- system.time(
    tmp.cv.result <- Crossvalidate(type = "glmnet",
                                   data = data, timevector = timevector,
                                   alpha = 0.60, pmax = 11,
                                   choosing.lambda = "min")
  )
  fitted.values.by.algorithm$GlmnR3 <- tmp.cv.result$fitted.values
  residuals.by.algorithm$GlmnR3 <- tmp.cv.result$residuals
  elapsed.time.by.algorithm <- c(elapsed.time.by.algorithm,
                                 GlmnR3 = tmp.time.result[[3]])
  
  # GlmnM1
  tmp.time.result <- system.time(
    tmp.cv.result <- Crossvalidate(type = "glmnet",
                                   data = data, timevector = timevector,
                                   alpha = 1, pmax = 11,
                                   choosing.lambda = "1se")
  )
  fitted.values.by.algorithm$GlmnM1 <- tmp.cv.result$fitted.values
  residuals.by.algorithm$GlmnM1 <- tmp.cv.result$residuals
  elapsed.time.by.algorithm <- c(elapsed.time.by.algorithm,
                                 GlmnM1 = tmp.time.result[[3]])
  
  # GlmnM2
  tmp.time.result <- system.time(
    tmp.cv.result <- Crossvalidate(type = "glmnet",
                                   data = data, timevector = timevector,
                                   alpha = 0.80, pmax = 11,
                                   choosing.lambda = "1se")
  )
  fitted.values.by.algorithm$GlmnM2 <- tmp.cv.result$fitted.values
  residuals.by.algorithm$GlmnM2 <- tmp.cv.result$residuals
  elapsed.time.by.algorithm <- c(elapsed.time.by.algorithm,
                                 GlmnM2 = tmp.time.result[[3]])
  
  # GlmnM3
  tmp.time.result <- system.time(
    tmp.cv.result <- Crossvalidate(type = "glmnet",
                                   data = data, timevector = timevector,
                                   alpha = 0.60, pmax = 11,
                                   choosing.lambda = "1se")
  )
  fitted.values.by.algorithm$GlmnM3 <- tmp.cv.result$fitted.values
  residuals.by.algorithm$GlmnM3 <- tmp.cv.result$residuals
  elapsed.time.by.algorithm <- c(elapsed.time.by.algorithm,
                                 GlmnM3 = tmp.time.result[[3]])
  
  
  
  ## Principal component regression -------------------------------------------
  
  tmp.time.result <- system.time(
    tmp.cv.result <- Crossvalidate(type = "pcr",
                                   data = data, timevector = timevector,
                                   formula = fFull, ncomp = 5, scale = FALSE)
  )
  fitted.values.by.algorithm$PCR001 <- tmp.cv.result$fitted.values
  residuals.by.algorithm$PCR001 <- tmp.cv.result$residuals
  elapsed.time.by.algorithm <- c(elapsed.time.by.algorithm,
                                 PCR001 = tmp.time.result[[3]])

  tmp.time.result <- system.time(
    tmp.cv.result <- Crossvalidate(type = "pcr",
                                   data = data, timevector = timevector,
                                   formula = fFull, ncomp = 5, scale = TRUE)
  )
  fitted.values.by.algorithm$PCR002 <- tmp.cv.result$fitted.values
  residuals.by.algorithm$PCR002 <- tmp.cv.result$residuals
  elapsed.time.by.algorithm <- c(elapsed.time.by.algorithm,
                                 PCR002 = tmp.time.result[[3]])

  tmp.time.result <- system.time(
    tmp.cv.result <- Crossvalidate(type = "pcr",
                                   data = data, timevector = timevector,
                                   formula = fFull, ncomp = 10, scale = FALSE)
  )
  fitted.values.by.algorithm$PCR003 <- tmp.cv.result$fitted.values
  residuals.by.algorithm$PCR003 <- tmp.cv.result$residuals
  elapsed.time.by.algorithm <- c(elapsed.time.by.algorithm,
                                 PCR003 = tmp.time.result[[3]])
  
  tmp.time.result <- system.time(
    tmp.cv.result <- Crossvalidate(type = "pcr",
                                   data = data, timevector = timevector,
                                   formula = fFull, ncomp = 10, scale = TRUE)
  )
  fitted.values.by.algorithm$PCR004 <- tmp.cv.result$fitted.values
  residuals.by.algorithm$PCR004 <- tmp.cv.result$residuals
  elapsed.time.by.algorithm <- c(elapsed.time.by.algorithm,
                                 PCR004 = tmp.time.result[[3]])
  
  
  # Regulated PCR -------------------------------------------------------------
  
  # discarded

  
  # Partial least squares regression -----------------------------------------
  
  # PLSR01
  tmp.time.result <- system.time(
    tmp.cv.result <- Crossvalidate(type = "plsr",
                                   data = data, timevector = timevector,
                                   formula = fFull, ncomp = 5, scale = FALSE,
                                   method = "kernelpls")
  )
  fitted.values.by.algorithm$PLSR01 <- tmp.cv.result$fitted.values
  residuals.by.algorithm$PLSR01 <- tmp.cv.result$residuals
  elapsed.time.by.algorithm <- c(elapsed.time.by.algorithm,
                                 PLSR01 = tmp.time.result[[3]])
  # PLSR02
  tmp.time.result <- system.time(
    tmp.cv.result <- Crossvalidate(type = "plsr",
                                   data = data, timevector = timevector,
                                   formula = fFull, ncomp = 5, scale = TRUE,
                                   method = "kernelpls")
  )
  fitted.values.by.algorithm$PLSR02 <- tmp.cv.result$fitted.values
  residuals.by.algorithm$PLSR02 <- tmp.cv.result$residuals
  elapsed.time.by.algorithm <- c(elapsed.time.by.algorithm,
                                 PLSR02 = tmp.time.result[[3]])
  
  # PLSR03
  tmp.time.result <- system.time(
    tmp.cv.result <- Crossvalidate(type = "plsr",
                                   data = data, timevector = timevector,
                                   formula = fFull, ncomp = 10, scale = FALSE,
                                   method = "kernelpls")
  )
  fitted.values.by.algorithm$PLSR03 <- tmp.cv.result$fitted.values
  residuals.by.algorithm$PLSR03 <- tmp.cv.result$residuals
  elapsed.time.by.algorithm <- c(elapsed.time.by.algorithm,
                                 PLSR03 = tmp.time.result[[3]])
  
  # PLSR04
  tmp.time.result <- system.time(
    tmp.cv.result <- Crossvalidate(type = "plsr",
                                   data = data, timevector = timevector,
                                   formula = fFull, ncomp = 10, scale = TRUE,
                                   method = "kernelpls")
  )
  fitted.values.by.algorithm$PLSR04 <- tmp.cv.result$fitted.values
  residuals.by.algorithm$PLSR04 <- tmp.cv.result$residuals
  elapsed.time.by.algorithm <- c(elapsed.time.by.algorithm,
                                 PLSR04 = tmp.time.result[[3]])
  
  
  # Bayesian model via truncated SVD ------------------------------------------
  
  # discarded
  
  results <- list(fitted.values = fitted.values.by.algorithm,
                  residuals = residuals.by.algorithm,
                  response = data[, 1, drop = FALSE],
                  elapsed.time = elapsed.time.by.algorithm)
  
  return(results)
  
}


LoopAlgorithmsThroughDatas <- function(stations, utcs, lead.times,
                                       seasons = c(0), responses = c(1),
                                       response.variable = "HAVAINTO",
                                       what.data = "all") {
  # Automates looping over multiple data sets in order to decide whether an
  # algorithm performs well or not.
  #
  # Args:
  #   stations: a vector of station indices
  #   utcs: a vector of analysis time indices
  #   lead.times: a vector of lead time indices
  #   seasons: a vector of season codes (0 = whole year)
  #   responses: a vector of response indices
  #   response.variable: "HAVAINTO" or "BIAS"
  #   what.data: all/oldandnew/new, passed on to CleanData
  #
  # Returns:
  #   A list with elements fitted.values, residuals, response, elapsed.time,
  #   no.of.points, and lead.times, analogous to TestAlgorithms. Results from
  #   multiple data sets are stacked on top of each other in the matrices.
  #   The element no.of.points records how many data points each dataset
  #   contained (its length is the number of datasets examined) and
  #   lead.times simply echoes the argument of the same name.
  
  fitted.values <- NULL
  residuals <- NULL
  response <- NULL
  elapsed.time <- NULL
  no.of.points <- NULL
  
  for (ss in stations) {
    for (uu in utcs) {
      for (ll in lead.times) {
        for (ee in seasons) {
          for (rr in responses) {
            print(paste("Lead time index", ll, "begins!", sep = " "))
            dataobject <- LoadCleanedData(station = ss, utc = uu,
                                          lead.time = ll, season = ee,
                                          response = rr,
                                          what.data = what.data)
            data <- dataobject$data
            timevector <- dataobject$timevector
            
            if (response.variable == "BIAS") {
              data[,1] <- data$HAVAINTO - data$T2
              colnames(data)[1] <- "BIAS"
            }
            
            results <- TestAlgorithms(data = data, timevector = timevector)
            
            fitted.values <- rbind(fitted.values, results$fitted.values)
            residuals <- rbind(residuals, results$residuals)
            response <- rbind(response, results$response)
            elapsed.time <- rbind(elapsed.time, results$elapsed.time)
            no.of.points <- c(no.of.points, length(timevector))
          }
        }
      }
    }
  }
  
  results <- list(fitted.values = fitted.values,
                  residuals = residuals,
                  response = response,
                  elapsed.time = elapsed.time,
                  no.of.points = no.of.points,
                  lead.times = lead.times)
  return(results)
  
}


# Analyzing and plotting ------------------------------------------------------


MAE <- function(x) {
  # Calculates MAE.
  result <- mean(abs(x))
  return(result)
}


MSE <- function(x) {
  # Calculates MSE.
  result <- mean(x^2)
  return(result)
}


RMSE <- function(x) {
  # Calculates RMSE.
  result <- sqrt(MSE(x))
  return(result)
}
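
# A quick illustration of the three measures with made-up residuals (an
# example sketch, not data from this project):
#
#   r <- c(-1, 0.5, 2, -0.5)
#   MAE(r)    # mean(abs(r))    = 1
#   MSE(r)    # mean(r^2)       = 1.375
#   RMSE(r)   # sqrt(mean(r^2)) = 1.1726...
#
# None of the measures handle NAs: the residuals are assumed to be complete.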


CalculateMeasure <- function(residuals, type,
                             no.of.points = nrow(residuals)) {
  # Calculates MAE, RMSE, or MSE of the results.
  #
  # Args:
  #   residuals: a residual matrix
  #   type: the error measure (mae/rmse/mse)
  #   no.of.points: a scalar or a vector (a scalar calculates a single measure
  #   over all residuals, a vector divides the residuals into groups of the
  #   given sizes)
  #
  # Returns:
  #   A data frame with one row per group and one column per model.
  
  # if no.of.points is a vector, the measure is calculated separately for
  # each group
  factors <- as.factor(rep(1:length(no.of.points), times = no.of.points))
  split.list <- split(x = residuals, f = factors)
  
  measures <- matrix(NA, ncol = ncol(residuals), nrow = length(no.of.points))
  colnames(measures) <- colnames(residuals)
  
  for (ii in 1:length(no.of.points)) {
    measures[ii,] <- switch(type,
                            rmse = apply(X = split.list[[ii]], MARGIN = 2,
                                         FUN = RMSE),
                            mse = apply(X = split.list[[ii]], MARGIN = 2,
                                        FUN = MSE),
                            mae = apply(X = split.list[[ii]], MARGIN = 2,
                                        FUN = MAE))
  }
  
  measures <- as.data.frame(measures)
  rownames(measures) <- NULL
  
  return(measures)
}
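
# Example (a sketch; res stands for a hypothetical residual data frame from
# TestAlgorithms or LoopAlgorithmsThroughDatas with 220 rows):
#
#   CalculateMeasure(residuals = res, type = "rmse")
#   # -> one row: a single RMSE per model over all 220 rows
#   CalculateMeasure(residuals = res, type = "rmse",
#                    no.of.points = c(100, 120))
#   # -> two rows: separate RMSEs for the first 100 and the last 120 rows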


CalculateSkill <- function(...) {
  # Calculates a skill score based on MAE, RMSE, or MSE. The skill is defined
  # as 1 - measure/measure(T2), so a positive skill means that a model beats
  # the raw T2 forecast.
  #
  # Args:
  #   ...: passed to the function CalculateMeasure
  #
  # Returns:
  #   A data frame with skill scores, one column per model.
  
  measure <- CalculateMeasure(...)
  skill <- 1 - measure / measure[,match("T2", colnames(measure))]
  
  return(skill)
}
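
# Interpretation example (made-up numbers): if a model's MAE is 1.2 K and the
# raw T2 forecast's MAE is 1.5 K, the model's MAE skill is
# 1 - 1.2/1.5 = 0.2, i.e. a 20 % improvement over T2. The skill of T2 itself
# is always 0 by construction, and a negative skill means that the model is
# worse than the raw forecast.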


PlotResidualDistribution <- function(residuals) {
  # Plots a box-and-whiskers plot of the residuals from TestAlgorithms and
  # LoopAlgorithmsThroughDatas data frames.
  #
  # Args:
  #   residuals: a residual matrix
  #
  # Returns:
  #   A lattice plot.
  #
  # Note: this also works on elapsed.time once it is transformed into a
  # data frame.
  
  # melt turns the wide residual table into long format for bwplot
  residuals.long <- melt(residuals)
  
  tplot <- bwplot(value ~ variable, data = residuals.long, pch = "|",
                  scales = list(x = list(rot = 45)),
                  ylab = "Residual")
  return(tplot)
}
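
# Example usage (a sketch; result is a hypothetical object from
# TestAlgorithms):
#
#   PlotResidualDistribution(result$residuals)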


PlotMeasure <- function(object, type, skill = FALSE) {
  # Plots an error measure (MAE/RMSE) in size order.
  #
  # Args:
  #   object: an object from TestAlgorithms
  #   type: mae/rmse/mse
  #   skill: TRUE/FALSE if we are interested in skill score instead of measure
  #
  # Returns:
  #   A plot.
  
  residuals <- object$residuals
  
  if (skill == FALSE) {
    measure <- CalculateMeasure(residuals = residuals, type = type)
    ylab <- toupper(type)
  } else if (skill == TRUE) {
    measure <- CalculateSkill(residuals = residuals, type = type)
    ylab <- paste(toupper(type), " Skill", sep = "")
  }
  
  statistics <- data.frame(model = colnames(measure),
                           measure = as.numeric(measure),
                           row.names = NULL)
  
  statistics$model <- factor(statistics$model,
                             levels = statistics[order(statistics$measure),
                                                 "model"])
  
  # a logical vector which is TRUE only for the first three models
  # (Defaul, T2, and FullLm) so that they are highlighted in the plot
  pos <- rep(FALSE, times = length(statistics$model))
  pos[1:3] <- TRUE
  
  col <- c("bisque", "aquamarine")
  default.index <- match("T2", colnames(measure))
  
  tplot <- barchart(measure ~ model, data = statistics,
                    scales = list(x = list(rot = 45)),
                    col = col[pos+1],
                    ylab = ylab,
                    origin = 0,
                    panel = function(x,y,...) {
                      panel.barchart(x,y,...)
                      panel.abline(h = measure[,default.index],
                                   col.line = "red", lty = 3)
                    })
  if (skill == FALSE) {
    tplot <- update(tplot, ylim = c(0, NA))
  }
  return(tplot)
}
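
# Example usage (a sketch, continuing with the hypothetical result object):
#
#   PlotMeasure(object = result, type = "rmse")
#   PlotMeasure(object = result, type = "mae", skill = TRUE)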


PlotMAEandRMSE <- function(object) {
  # Plots MAE and RMSE to the same plot.
  #
  # Args:
  #   object: from the function TestAlgorithms
  #
  # Returns:
  #   A plot.
  
  residuals <- object$residuals
  
  no.of.models <- ncol(residuals)
  names.of.models <- colnames(residuals)
  
  mae <- CalculateMeasure(residuals, type = "mae")
  rmse <- CalculateMeasure(residuals, type = "rmse")
  
  tmp <- data.frame(model = rep(names.of.models, times = 2),
                    value = c(t(mae), t(rmse)),
                    type = rep(c("MAE", "RMSE"), each = no.of.models))
  default.index <- match("T2", colnames(mae))
  
  tplot <- barchart(value ~ model, data = tmp, groups = factor(type),
                    auto.key = list(points = FALSE, rectangles = TRUE),
                    scales = list(x = list(rot = 45)),
                    ylab = "MAE/RMSE",
                    ylim = c(0, NA),
                    panel = function(x,y,...) {
                      panel.barchart(x,y,...)
                      panel.abline(h = rmse[1,default.index], col.line = "red",
                                   lty = 3)
                      panel.abline(h = mae[1,default.index], col.line = "red",
                                   lty = 3)
                    })
  return(tplot)
}


PlotMeasureAgainstLeadTime <- function(object, type, skill = FALSE,
                                       lead.time.indices = object$lead.times,
                                       model.names = c("T2", "FullLm",
                                                       "Defaul")) {
  # Plots an error measure (or skill score) of selected models as a function
  # of the forecast lead time.
  #
  # Args:
  #   object: an object from LoopAlgorithmsThroughDatas
  #   type: mae/rmse/mse
  #   skill: TRUE/FALSE if we are interested in skill score instead of measure
  #   lead.time.indices: the lead time indices to plot
  #   model.names: the names of the models to include in the plot
  #
  # Returns:
  #   A lattice plot.
  
  # let us first calculate ALL measures with ALL lead times
  if (skill == FALSE) {
    measure <- CalculateMeasure(residuals = object$residuals, type = type,
                                no.of.points = object$no.of.points)
    ylab <- toupper(type)
  } else if (skill == TRUE) {
    measure <- CalculateSkill(residuals = object$residuals, type = type,
                              no.of.points = object$no.of.points)
    ylab <- paste(toupper(type), " Skill", sep = "")
  }
  
  # then let us separate the lead times which we truly want to plot
  measure <- measure[lead.time.indices,]
  
  lead.times <- as.numeric(FORECASTS[lead.time.indices])
  
  left.hand.side <- paste("measure$", model.names, sep = "", collapse = " + ")
  formula <- as.formula(paste(left.hand.side, " ~ lead.times", sep = ""))
  
  tplot <- xyplot(formula,
                  type = "o", auto.key = list(points = TRUE),
                  ylab = ylab, xlab = "Lead time",
                  scales = list(x = list(at = seq(from = 0, to = 240, by = 6))))
  if (skill == FALSE) {
    tplot <- update(tplot, ylim = c(0, NA))
  }
  
  return(tplot)
}
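
# Example usage (a sketch, with the hypothetical loop.result object from
# above):
#
#   PlotMeasureAgainstLeadTime(object = loop.result, type = "rmse",
#                              model.names = c("T2", "FullLm"))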


PlotMeasureWithLimitations <- function(object, type, skill = FALSE,
                                       lead.time.indices = object$lead.times,
                                       limitation) {
  # Plots an error measure (or skill score) against the lead time, but only
  # for the models which are not worse than the raw T2 forecast at every
  # lead time.
  #
  # Args:
  #   object: an object from LoopAlgorithmsThroughDatas
  #   type: mae/rmse/mse
  #   skill: TRUE/FALSE if we are interested in skill score instead of measure
  #   lead.time.indices: the lead time indices to plot
  #   limitation: currently unused
  #
  # Returns:
  #   A lattice plot.
  # let us first calculate ALL measures with ALL lead times
  if (skill == FALSE) {
    measure <- CalculateMeasure(residuals = object$residuals, type = type,
                                no.of.points = object$no.of.points)
    ylab <- toupper(type)
  } else if (skill == TRUE) {
    measure <- CalculateSkill(residuals = object$residuals, type = type,
                              no.of.points = object$no.of.points)
    ylab <- paste(toupper(type), " Skill", sep = "")
  }
  
  # then let us separate the lead times which we truly want to plot
  measure <- measure[lead.time.indices,]
  # flagging the models which are worse than the raw T2 forecast at every
  # lead time: for an error measure that means a larger error than T2, for a
  # skill score a negative value
  if (skill == FALSE) {
    bad.models <- apply(X = measure, MARGIN = 2,
                        FUN = function(x) all(x > measure$T2))
  } else if (skill == TRUE) {
    bad.models <- apply(X = measure, MARGIN = 2, FUN = function(x) all(x < 0))
  }
  
  # only the models which beat T2 at least at one lead time are plotted
  model.names <- colnames(object$residuals)[!bad.models]
  tplot <- PlotMeasureAgainstLeadTime(object = object, type = type,
                                      skill = skill,
                                      lead.time.indices = lead.time.indices,
                                      model.names = model.names)
  return(tplot)
}
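
# Example usage (a sketch): plotting the MSE skill of only those models which
# beat the raw T2 forecast at least at one lead time. Because the limitation
# argument is never evaluated in the body, it can be omitted:
#
#   PlotMeasureWithLimitations(object = loop.result, type = "mse",
#                              skill = TRUE)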


PlotResidualsAgainstLeadTime <- function(object, model.name) {
  # Plots a box-and-whiskers plot of one model's residuals grouped by the
  # forecast lead time.
  #
  # Args:
  #   object: an object from LoopAlgorithmsThroughDatas
  #   model.name: the name of the model whose residuals are plotted
  #
  # Returns:
  #   A lattice plot.
  
  lead.times <- as.numeric(FORECASTS[object$lead.times])
  lead.time.factor <- as.factor(rep(lead.times, times = object$no.of.points))
  index <- match(model.name, colnames(object$residuals))
  residuals <- object$residuals[,index]
  
  tplot <- bwplot(residuals ~ lead.time.factor, pch = "|",
                  scales = list(x = list(rot = 90)),
                  ylab = "Residual in Kelvins",
                  xlab = "Lead time in hours")
  return(tplot)
}


PlotObservationAgainstFits <- function(object, model.name) {
  # Plots the observations against the fitted values of one model together
  # with the identity line (a perfect model would lie on the line).
  #
  # Args:
  #   object: an object from TestAlgorithms or LoopAlgorithmsThroughDatas
  #   model.name: the name of the model whose fitted values are plotted
  #
  # Returns:
  #   A lattice plot.
  
  fitted.values <- object$fitted.values
  index <- match(model.name, colnames(fitted.values))
  
  tplot <- xyplot(object$response[,1] ~ fitted.values[,index],
                  ylab = "Observation",
                  xlab = paste("Fitted value from ", model.name, sep = ""),
                  panel = function(x,y,...) {
                    panel.xyplot(x,y,...)
                    panel.abline(a = 0, b = 1, col.line = "red", lty = 3)
                  })
  return(tplot)
}