#' Data used in statistical models
#'
#' @return ModelData. object
#'
#' @export
ModelData <- function() {
obj <- rlang::env(routes_stations = NULL,
past_horizon = NULL,
ski_nivo = NULL,
na_replacement = NULL,
Xy = NULL,
split = NULL,
X_train = NULL, y_train = NULL,
X_test = NULL, y_test = NULL)
class(obj) <- "ModelData"
obj
}
# -------
# METHODS
# -------
shrink_data <- function(obj, ...) UseMethod("shrink_data")
#' Assign non mandatory data to perform prediction to NULL
#'
#' @param obj ModelData
shrink_data.ModelData <- function(obj) {
obj$ski_nivo <- NULL
obj$Xy <- NULL
obj$X_train <- NULL
obj$y_train <- NULL
obj$X_test <- NULL
obj$y_test <- NULL
}
to_timeseries <- function(obj, ...) UseMethod("to_timeseries")
#' Take a grouped data.frame and group all variables to list variables
#' Also performed NA replacement
#'
#' @param obj ModelData
#' @param df data.frame
#'
#' @return data.frame
to_timeseries.ModelData <- function(obj, df) {
df <- df %>%
dplyr::arrange(dplyr::desc(date_nivo))
if (obj$na_replacement == "mean") {
df <- df %>%
dplyr::mutate_at(dplyr::vars(-dplyr::group_cols()), list(~ifelse(is.na(.), mean(., na.rm = TRUE), .)))
} else if (obj$na_replacement == "downup") {
df <- df %>%
tidyr::fill(.direction = "downup")
}
df <- df %>%
dplyr::summarise_all(list) %>%
dplyr::ungroup() %>%
dplyr::select(!c(dir_vent, temp_min, temp_max, point_rose, humidite, nebulosite, precipitation, temp_neige,
grain_neige))
variables <- dplyr::vars(c(force_vent, temperature, neige_totale, neige_fraiche))
if (obj$na_replacement == "drop") {
# remove rows missing information: any variables evaluates to a list containing a least one NA
df <- df %>%
dplyr::rowwise() %>%
dplyr::filter_at(variables, ~ !any(is.na(.)))
}
# remove rows without any information: all variables evaluates to a list containing NA only
df %>%
dplyr::rowwise() %>%
dplyr::filter_at(variables, dplyr::any_vars(!all(is.na(.))))
}
load_routes_stations <- function(obj, ...) UseMethod("load_routes_stations")
#' Join ski routes and nivo stations on position
#'
#' @param obj Model
load_routes_stations.ModelData <- function(obj) {
# first we take the nearest station
# then if the route start and the station are more than 50km away we discard
custom_sf_join <- function(x, y) {
nearest <- sf::st_nearest_feature(x, y)
not_too_far <- furrr::future_map(
seq_along(nearest),
function(i) lengths(sf::st_is_within_distance(x[i,], y[nearest[i],], 50 * 1000)) > 0,
.options = furrr::furrr_options(seed = NULL)
) %>%
unlist()
nearest[not_too_far == FALSE] <- NA
nearest
}
obj$routes_stations <- sf::st_join(sf::st_sf(bonski.data::ski_routes),
sf::st_sf(bonski.data::nivo_stations[, c("station", "geometry")]),
join = custom_sf_join) %>%
sf::st_drop_geometry() %>%
tidyr::drop_na(station) %>%
dplyr::select(c(route, orientation, station))
}
#' Set ski stations data to be used with the statistical model
#'
#' 1. Cast nivo observation datetime to date and group by station and date
#' 2. Group ski evaluations per route and date
#' 3. Join ski evaluations and nivo observations:
#' 1. join ski_routes and nivo_stations based on their position -> routes_stations
#' 2. join ski_evaluations_history and routes_stations using the route id -> ski
#' 3. join ski and nivo_observations_history using the station id and the dates: observation date and evaluation date.
#' For an evaluation we are interested about observations of the previous days, up to past_horizon
#'
#' @param obj ModelData
#' @param past_horizon integer. Past horizon to consider in days
load_ski_nivo.ModelData <- function(obj, past_horizon) {
obj$past_horizon <- past_horizon
n_cores <- future::availableCores()
future::plan(future::multisession, workers = n_cores)
do_join <- function(i) {
ski$date_nivo <- ski$date - lubridate::days(i)
# 3rd join, performed past_horizon times
dplyr::left_join(ski, nivo_observations_grouped, by = c("station" = "station", "date_nivo" = "date"))
}
nivo_observations_grouped <- bonski.data::group_nivo_observations(bonski.data::nivo_observations_history)
ski_evaluations_grouped <- bonski.data::ski_evaluations_history %>%
dplyr::group_by(route, date) %>%
dplyr::summarise_all(mean, na.rm = TRUE) %>%
tidyr::drop_na(skiabilite, date)
# 1st join
load_routes_stations(obj)
# 2nd join
ski <- dplyr::left_join(ski_evaluations_grouped, obj$routes_stations, by = "route")
obj$ski_nivo <- furrr::future_map(1:obj$past_horizon, do_join, .options = furrr::furrr_options(seed = NULL)) %>%
data.table::rbindlist()
}
restrict_orientations.ModelData <- function(obj, orientations, ski_nivo, routes_stations) {
if (is.null(ski_nivo)) {
ski_nivo <- obj$ski_nivo
}
if (is.null(routes_stations)) {
routes_stations <- obj$routes_stations
}
obj$ski_nivo <- ski_nivo %>%
dplyr::filter(orientation %in% orientations)
obj$routes_stations <- routes_stations %>%
dplyr::filter(orientation %in% orientations)
}
#' Set Xy data to be used with the statistical model
#'
#' 1. Group by route, date, station
#' 2. Apply a strategy to replace NA values and consider nivo observations as time series
#'
#' @param obj ModelData
#' @param na_replacement character. NA replacement strategy to apply
load_Xy.ModelData <- function(obj, na_replacement) {
if (is.null(obj$ski_nivo)) {
stop("ski_nivo data should be loaded first: load_ski_nivo(...)")
}
obj$na_replacement <- na_replacement
obj$Xy <- to_timeseries(obj, dplyr::group_by(obj$ski_nivo, route, orientation, skiabilite, date, station))
}
#' Split train and test data
#'
#' Full dataset is divided into training ant testing following a split ratio
#'
#' @param obj Model.
#' @param split numeric. Split ratio between train and test data
split_Xy.ModelData <- function(obj, split) {
if (is.null(obj$Xy)) {
stop("Xy data should be loaded first: load_Xy(...)")
}
obj$split <- split
Xy <- obj$Xy %>%
dplyr::select(!c(route, orientation, date, station, date_nivo))
sample <- caTools::sample.split(Xy$skiabilite, SplitRatio = obj$split)
Xy_train <- subset(Xy, sample == TRUE)
Xy_test <- subset(Xy, sample == FALSE)
obj$X_train <- Xy_train %>%
dplyr::select(-skiabilite)
obj$y_train <- Xy_train$skiabilite
obj$X_test <- Xy_test %>%
dplyr::select(-skiabilite)
obj$y_test <- Xy_test$skiabilite
}
fetch_fresh_X <- function(obj, ...) UseMethod("fetch_fresh_X")
#' Get last nivological observations from MeteoFrance
#'
#' @param obj Model.
#'
#' @return data.frame. fresh data
fetch_fresh_X.ModelData <- function(obj) {
now <- lubridate::today(tzone = "Europe/Paris")
url_base <- "https://donneespubliques.meteofrance.fr/donnees_libres/Txt/Nivo/nivo"
url_suffix <- "csv"
url_from_char <- function(d) {
date <- now - lubridate::days(d)
paste(url_base,
paste0(lubridate::year(date), formatC(lubridate::month(date), width = 2, flag = 0),
formatC(lubridate::day(date), width = 2, flag = 0)),
url_suffix,
sep = ".")
}
urls <- lapply(1:obj$past_horizon, url_from_char)
nivo_observations <- bonski.data::request_nivo_observations(urls) %>%
bonski.data::group_nivo_observations() %>%
dplyr::rename(date_nivo = date)
# Apply NA replacement strat and transform col to timeseries
to_timeseries(obj, nivo_observations)
}
attach_Xy <- function(obj, ...) UseMethod("attach_Xy")
#' Apply predicted skiabilite to routes
#'
#' @param obj ModelData
#' @param stations data.frame
#' @param skiabilite list
#'
#' @return data.frame
attach_Xy.ModelData <- function(obj, stations, skiabilite) {
if (is.null(obj$routes_stations)) {
stop("routes_stations data should be loaded first: load_routes_stations(...)")
}
stations$skiabilite <- skiabilite
dplyr::left_join(obj$routes_stations, stations, by = "station") %>%
dplyr::select(route, skiabilite)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.