stAirPol: Spatio-temporal Modelling of Air Pollution Data

Documented in fit_subintervalls

#' fit_subintervalls
#'
#' Fit the spatio-temporal models on subinterval.
#'
#' @param data Modelling data.frame which contains information about the
#' covariables and the target variable
#' @param grid grid data where a prediction should be calculated or a
#' training_set
#' @param training_set an object generated by \link{get_test_and_training_set},
#' if those an object is delivered, the data argument will be reduced to the
#' in training_set specified training sensors.
#' @param unit for splitting the dataset, see \link{lubridate::floor_date}
#' for more informations
#' @param nItr Number of MCMC iterations. Default value is 5000.
#' @param nBurn Number of burn-in samples. This number of samples will be
#' discarded before making any inference. Default value is 1000.
#' @param return default is 'prediction', otherwise use 'parameters' to return
#' the parametervalues of each submodel in a data.frame.
#' @param tol Minimum separation distance between any two locations out of
#' those specified by coords, knots.coords and pred.coords. The default is
#' 0.005. The programm will exit if the minimum distance is less than the
#' non-zero specified value. This will ensure non-singularity of the
#' covariance matrices.
#' @param mc.cores how much cores should be used for parallelization, default is
#' one core less your maximum number of detected cores.
#' @param retry_count How much retries should be tried, if some strange behavior
#' appears, for example posteriori Mean above 500 or a RMSE of the prediction
#' above 2000. Both could be specified separately. Default is 5.
#' @param retry_reason_mean retry on posteriori Mean above that value,
#' default is 500
#' @param retry_reason_rmse retry on a RMSE above that value, default is 2000
#' @param ... Parameters passed to \link{fit_sp_model}
#'
#' @importFrom lubridate floor_date
#' @importFrom pbmcapply pbmclapply
#'
#' @seealso \link{fit_subintervalls}, \link{predict.stAirPol.model},
#' \link{predict_split} \link{fit_model}
#'
#' @return the grid with added columns of the prediction calculations, or the
#' crossvaldiation predictions
#' @export
#'
#' @examples
#' data("mini_dataset")
#' mini_dataset <- clean_model_data(mini_dataset)
#' formula = value ~ humi + temp + rainhist + windhist +
#'   trafficvol + log(sensor_age)
#' training_set <- get_test_and_training_set(mini_dataset, sampel_size = 0.75,
#'                                           random.seed = 220292)
#' prediction.gp <- fit_subintervalls(data = mini_dataset, formula = formula,
#'                             model = 'GP', training_set = training_set,
#'                             unit = '1 day')
fit_subintervalls <- function(data, grid = NULL, training_set = NULL,
                              unit = '1 week', nItr = 5000, nBurn = 1000,
                              return = 'prediction', tol = 0.01,
                              retry_count = 5, retry_reason_mean = 500,
                              retry_reason_rmse = 2000,
                              mc.cores = parallel::detectCores() - 1, ...) {
  if (!is.null(grid)) {
    data.fit <- data
    data.predict <- grid
  } else if (!is.null(training_set)) {
    data.fit <- data[sensor_id %in% training_set$train]
    data.predict <- data[sensor_id %in% training_set$test]
  } else {
    stop("grid or training_set must be not NULL")
  }
  #' split the data to subintervalls and perform a modelling on each intervall
  data.fit_list <-
    split(data.fit,
          lubridate::floor_date(data.fit$timestamp, unit = unit))
  data.predict_list <-
    split(
      data.predict,
      lubridate::floor_date(data.predict$timestamp, unit = unit)
    )
  #' parallel computation for speeding up the modelfitting
  pred_list <- pbmcapply::pbmclapply(seq_along(data.fit_list), function(i) {
    attempt <- 1
    #' retry model fitting and prediction in cause of a strange behavior
    #' sometimes (MCMC)
    while (attempt <= retry_count) {
      attempt <- attempt + 1
      t <- try({
        #' remove sensors which are always NA
        s <- data.fit_list[[i]][!is.na(value), .N, by = sensor_id][N > 0]
        d <- data.fit_list[[i]][sensor_id %in% s$sensor_id]
        model <- fit_sp_model(data = d, training_set = NULL, tol = tol,
                           nItr = nItr, nBurn = nBurn, ...)
        if (return == 'prediction') {
          p <- rbindlist(lapply(unique(data.predict_list[[i]]$sensor_id),
                                function(x) {
            predict(model, newdata  = data.predict_list[[i]][sensor_id %in% x],
                    training_set = NULL, ...)}))
          if (RMSE(p$value, p$prediction) < retry_reason_rmse) {
            return(p)
          } else {
            stop(paste0('No Covergence, RMSE above', retry_reason_rmse))
          }
          if (mean(p$prediction) > retry_reason_mean) {
            stop(paste0('No Covergence, prediction Mean above ', retry_reason_mean))
          }
          if (is.null(training_set) & !is.null(grid)) return(p)
        } else {
          m <- model$parameter
          if (any(abs(m$Mean) > retry_reason_mean)) {
            stop(paste0('No Covergence, Paramaters Mean above ', retry_reason_mean))
          }
          m$name <- row.names(model$parameter)
          m$split <- i
          return(m)
        }
      })
    }
    return(t)
  }, mc.cores = mc.cores )
  pred_list <- lapply(1:length(pred_list), function(x) {
    if ('try-error' %in% class(pred_list[[x]])) {
      print(paste0('ERROR IN TIMEINTERVAL', x))
      print(pred_list[[x]])
      return(NULL)
    } else {
      return(pred_list[[x]])
    }
  })
  newdata <- rbindlist(pred_list)
  if (return == 'prediction') {
    class(newdata) <- c('stAirPol.prediction', class(newdata))
  } else if (return == 'parameters') {
    class(newdata) <- c('stAirPol.parameters', class(newdata))
  }
  newdata
}