bonski.predict: Predict Where Ski Will Be Good

Documented in ModelData shrink_data.ModelData to_timeseries.ModelData

#' Data used in statistical models
#'
#' @return ModelData. object
#'
#' @export
ModelData <- function() {
  obj <- rlang::env(routes_stations = NULL,
                    past_horizon = NULL,
                    ski_nivo = NULL,
                    na_replacement = NULL,
                    Xy = NULL,
                    split = NULL,
                    X_train = NULL, y_train = NULL,
                    X_test = NULL, y_test = NULL)
  class(obj) <- "ModelData"
  obj
}

# -------
# METHODS
# -------

shrink_data <- function(obj, ...) UseMethod("shrink_data")

#' Assign non mandatory data to perform prediction to NULL
#'
#' @param obj ModelData
shrink_data.ModelData <- function(obj) {
  obj$ski_nivo <- NULL
  obj$Xy <- NULL
  obj$X_train <- NULL
  obj$y_train <- NULL
  obj$X_test <- NULL
  obj$y_test <- NULL
}

to_timeseries <- function(obj, ...) UseMethod("to_timeseries")

#' Take a grouped data.frame and group all variables to list variables
#' Also performed NA replacement
#'
#' @param obj ModelData
#' @param df data.frame
#'
#' @return data.frame
to_timeseries.ModelData <- function(obj, df) {
  df <- df %>%
    dplyr::arrange(dplyr::desc(date_nivo))
  if (obj$na_replacement == "mean") {
    df <- df %>%
      dplyr::mutate_at(dplyr::vars(-dplyr::group_cols()), list(~ifelse(is.na(.), mean(., na.rm = TRUE), .)))
  } else if (obj$na_replacement == "downup") {
    df <- df %>%
      tidyr::fill(.direction = "downup")
  }
  df <- df %>%
    dplyr::summarise_all(list) %>%
    dplyr::ungroup() %>%
    dplyr::select(!c(dir_vent, temp_min, temp_max, point_rose, humidite, nebulosite, precipitation, temp_neige,
                     grain_neige))
  variables <- dplyr::vars(c(force_vent, temperature, neige_totale, neige_fraiche))
  if (obj$na_replacement == "drop") {
    # remove rows missing information: any variables evaluates to a list containing a least one NA
    df <- df %>%
      dplyr::rowwise() %>%
      dplyr::filter_at(variables, ~ !any(is.na(.)))
  }
  # remove rows without any information: all variables evaluates to a list containing NA only
  df %>%
    dplyr::rowwise() %>%
    dplyr::filter_at(variables, dplyr::any_vars(!all(is.na(.))))
}

load_routes_stations <- function(obj, ...) UseMethod("load_routes_stations")

#' Join ski routes and nivo stations on position
#'
#' @param obj Model
load_routes_stations.ModelData <- function(obj) {
  # first we take the nearest station
  # then if the route start and the station are more than 50km away we discard
  custom_sf_join <- function(x, y) {
    nearest <- sf::st_nearest_feature(x, y)
    not_too_far <- furrr::future_map(
      seq_along(nearest),
      function(i) lengths(sf::st_is_within_distance(x[i,], y[nearest[i],], 50 * 1000)) > 0,
      .options = furrr::furrr_options(seed = NULL)
    ) %>%
      unlist()
    nearest[not_too_far == FALSE] <- NA
    nearest
  }

  obj$routes_stations <- sf::st_join(sf::st_sf(bonski.data::ski_routes),
                                 sf::st_sf(bonski.data::nivo_stations[, c("station", "geometry")]),
                                 join = custom_sf_join) %>%
    sf::st_drop_geometry() %>%
    tidyr::drop_na(station) %>%
    dplyr::select(c(route, orientation, station))
}

#' Set ski stations data to be used with the statistical model
#'
#' 1. Cast nivo observation datetime to date and group by station and date
#' 2. Group ski evaluations per route and date
#' 3. Join ski evaluations and nivo observations:
#'   1. join ski_routes and nivo_stations based on their position -> routes_stations
#'   2. join ski_evaluations_history and routes_stations using the route id -> ski
#'   3. join ski and nivo_observations_history using the station id and the dates: observation date and evaluation date.
#'      For an evaluation we are interested about observations of the previous days, up to past_horizon
#'
#' @param obj ModelData
#' @param past_horizon integer. Past horizon to consider in days
load_ski_nivo.ModelData <- function(obj, past_horizon) {
  obj$past_horizon <- past_horizon

  n_cores <- future::availableCores()
  future::plan(future::multisession, workers = n_cores)

  do_join <- function(i) {
    ski$date_nivo <- ski$date - lubridate::days(i)
    # 3rd join, performed past_horizon times
    dplyr::left_join(ski, nivo_observations_grouped, by = c("station" = "station", "date_nivo" = "date"))
  }

  nivo_observations_grouped <- bonski.data::group_nivo_observations(bonski.data::nivo_observations_history)

  ski_evaluations_grouped <- bonski.data::ski_evaluations_history %>%
    dplyr::group_by(route, date) %>%
    dplyr::summarise_all(mean, na.rm = TRUE) %>%
    tidyr::drop_na(skiabilite, date)

  # 1st join
  load_routes_stations(obj)

  # 2nd join
  ski <- dplyr::left_join(ski_evaluations_grouped, obj$routes_stations, by = "route")

  obj$ski_nivo <- furrr::future_map(1:obj$past_horizon, do_join, .options = furrr::furrr_options(seed = NULL)) %>%
    data.table::rbindlist()
}

restrict_orientations.ModelData <- function(obj, orientations, ski_nivo, routes_stations) {
  if (is.null(ski_nivo)) {
    ski_nivo <- obj$ski_nivo
  }
  if (is.null(routes_stations)) {
    routes_stations <- obj$routes_stations
  }
  obj$ski_nivo <- ski_nivo %>%
    dplyr::filter(orientation %in% orientations)
  obj$routes_stations <- routes_stations %>%
    dplyr::filter(orientation %in% orientations)
}

#' Set Xy data to be used with the statistical model
#'
#' 1. Group by route, date, station
#' 2. Apply a strategy to replace NA values and consider nivo observations as time series
#'
#' @param obj ModelData
#' @param na_replacement character. NA replacement strategy to apply
load_Xy.ModelData <- function(obj, na_replacement) {
  if (is.null(obj$ski_nivo)) {
    stop("ski_nivo data should be loaded first: load_ski_nivo(...)")
  }

  obj$na_replacement <- na_replacement
  obj$Xy <- to_timeseries(obj, dplyr::group_by(obj$ski_nivo, route, orientation, skiabilite, date, station))
}

#' Split train and test data
#'
#' Full dataset is divided into training ant testing following a split ratio
#'
#' @param obj Model.
#' @param split numeric. Split ratio between train and test data
split_Xy.ModelData <- function(obj, split) {
  if (is.null(obj$Xy)) {
    stop("Xy data should be loaded first: load_Xy(...)")
  }

  obj$split <- split

  Xy <- obj$Xy %>%
    dplyr::select(!c(route, orientation, date, station, date_nivo))
  sample <- caTools::sample.split(Xy$skiabilite, SplitRatio = obj$split)
  Xy_train <- subset(Xy, sample == TRUE)
  Xy_test <- subset(Xy, sample == FALSE)
  obj$X_train <- Xy_train %>%
    dplyr::select(-skiabilite)
  obj$y_train <- Xy_train$skiabilite
  obj$X_test <- Xy_test %>%
    dplyr::select(-skiabilite)
  obj$y_test <- Xy_test$skiabilite
}

fetch_fresh_X <- function(obj, ...) UseMethod("fetch_fresh_X")

#' Get last nivological observations from MeteoFrance
#'
#' @param obj Model.
#'
#' @return data.frame. fresh data
fetch_fresh_X.ModelData <- function(obj) {
  now <- lubridate::today(tzone = "Europe/Paris")
  url_base <- "https://donneespubliques.meteofrance.fr/donnees_libres/Txt/Nivo/nivo"
  url_suffix <- "csv"

  url_from_char <- function(d) {
    date <- now - lubridate::days(d)
    paste(url_base,
          paste0(lubridate::year(date), formatC(lubridate::month(date), width = 2, flag = 0),
                  formatC(lubridate::day(date), width = 2, flag = 0)),
          url_suffix,
          sep = ".")
  }

  urls <- lapply(1:obj$past_horizon, url_from_char)
  nivo_observations <- bonski.data::request_nivo_observations(urls) %>%
    bonski.data::group_nivo_observations() %>%
    dplyr::rename(date_nivo = date)

  # Apply NA replacement strat and transform col to timeseries
  to_timeseries(obj, nivo_observations)
}

attach_Xy <- function(obj, ...) UseMethod("attach_Xy")

#' Apply predicted skiabilite to routes
#'
#' @param obj ModelData
#' @param stations data.frame
#' @param skiabilite list
#'
#' @return data.frame
attach_Xy.ModelData <- function(obj, stations, skiabilite) {
  if (is.null(obj$routes_stations)) {
    stop("routes_stations data should be loaded first: load_routes_stations(...)")
  }

  stations$skiabilite <- skiabilite
  dplyr::left_join(obj$routes_stations, stations, by = "station") %>%
    dplyr::select(route, skiabilite)
}