Nothing
#' Dissimilarity Model Frame
#'
#' @description
#' This function generates a model frame for statistical or machine learning analysis from these objects:
#'
#' \itemize{
#' \item: Dissimilarity data frame generated by [distantia()], [distantia_ls()], [distantia_dtw()], or [distantia_time_delay()]. The output model frame will have as many rows as this data frame.
#' \item: Data frame with static descriptors of the time series. These descriptors are converted to distances between pairs of time series via [distance_matrix()].
#' \item: List defining composite predictors. This feature allows grouping together predictors that have a common meaning. For example, `composite_predictors = list(temperature = c("temperature_mean", "temperature_min", "temperature_max")` generates a new predictor named "temperature", which results from computing the multivariate distances between the vectors of temperature variables of each pair of time series. Predictors in one of such groups will be scaled before distance computation if their maximum standard deviations differ by a factor of 10 or more.
#' }
#'
#'
#' The resulting data frame contains the following columns:
#' \itemize{
#' \item `x` and `y`: names of the pair of time series represented in the row.
#' \item response columns in `response_df`.
#' \item predictors columns: representing the distance between the values of the given static predictor between `x` and `y`.
#' \item (optional) `geographic_distance`: If `predictors_df` is an sf `sf` data frame, then geographic distances are computed via [sf::st_distance()].
#' }
#'
#' This function supports a parallelization setup via [future::plan()].
#'
#' @param response_df (required, data frame) output of [distantia()], [distantia_ls()], [distantia_dtw()], or [distantia_time_delay()]. Default: NULL
#' @param predictors_df (required, data frame or sf data frame) data frame with numeric predictors for the the model frame. Must have a column with the time series names in `response_df$x` and `response_df$y`. If `sf` data frame, the column "geographic_distance" with distances between pairs of time series is added to the model frame. Default: NULL
#' @param composite_predictors (optional, list) list defining composite predictors. For example, `composite_predictors = list(a = c("b", "c"))` uses the columns `"b"` and `"c"` from `predictors_df` to generate the predictor `a` as the multivariate distance between `"b"` and `"c"` for each pair of time series in `response_df`. Default: NULL
#' @param scale (optional, logical) if TRUE, all predictors are scaled and centered with [scale()]. Default: TRUE
#' @param distance (optional, string) Method to compute the distance between predictor values for all pairs of time series in `response_df`. Default: "euclidean".
#'
#' @return data frame: with attributes "predictors", "response", and "formula".
#' @export
#' @autoglobal
#' @examples
#'
#' #covid prevalence in California counties
#' tsl <- tsl_initialize(
#' x = covid_prevalence,
#' name_column = "name",
#' time_column = "time"
#' ) |>
#' #subset to shorten example runtime
#' tsl_subset(
#' names = 1:5
#' )
#'
#' #dissimilarity analysis
#' df <- distantia_ls(tsl = tsl)
#'
#' #combine several predictors
#' #into a new one
#' composite_predictors <- list(
#' economy = c(
#' "poverty_percentage",
#' "median_income",
#' "domestic_product"
#' )
#' )
#'
#' #generate model frame
#' model_frame <- distantia_model_frame(
#' response_df = df,
#' predictors_df = covid_counties,
#' composite_predictors = composite_predictors,
#' scale = TRUE
#' )
#'
#' head(model_frame)
#'
#' #names of response and predictors
#' #and an additive formula
#' #are stored as attributes
#' attributes(model_frame)$predictors
#'
#' #if response_df is output of distantia():
#' attributes(model_frame)$response
#' attributes(model_frame)$formula
#'
#' #example of linear model
#' # model <- lm(
#' # formula = attributes(model_frame)$formula,
#' # data = model_frame
#' # )
#' #
#' # summary(model)
#'
#' @family distantia_support
distantia_model_frame <- function(
response_df = NULL,
predictors_df = NULL,
composite_predictors = NULL,
scale = TRUE,
distance = "euclidean"
){
distance <- utils_check_distance_args(
distance = distance
)
#df ----
if(is.null(response_df)){
stop(
"distantia::distantia/momentum_model_frame(): argument `response_df' must not be NULL.",
call. = FALSE
)
}
df_type_distantia <- c(
"distantia_df",
"time_delay_df"
)
df_type_momentum <- "momentum_df"
df_types <- c(
df_type_distantia,
df_type_momentum
)
df_type <- attributes(response_df)$type
if(
is.null(df_type) ||
!(df_type %in% df_types)
){
stop("distantia::distantia/momentum_model_frame(): argument 'response_df' must be the output of distantia::distantia(), distantia::distantia_time_delay(), or distantia::momentum().", call. = FALSE)
}
if(df_type %in% df_type_distantia){
f_name <- "distantia::distantia_model_frame(): "
if(df_type == "distantia_df"){
response_df <- distantia_aggregate(
df = response_df
)
}
}
if(df_type == df_type_momentum){
f_name <- "distantia::momentum_model_frame(): "
response_df <- response_df |>
momentum_aggregate() |>
momentum_to_wide()
}
#add ID
response_df$id <- seq_len(nrow(response_df))
#predictors_df ----
if(is.null(predictors_df)){
stop(
f_name, "argument `predictors_df' must not be NULL.",
call. = FALSE
)
}
if(!inherits(x = predictors_df, what = "data.frame")){
stop(
f_name, "argument `predictors_df' must be a data frame or an sf data frame.",
call. = FALSE
)
}
#identify names column in predictors response_df
df_names <- unique(
c(response_df$x, response_df$y)
)
predictors_name_column <- sapply(
X = predictors_df,
FUN = function(x){
sum(df_names %in% x) == length(df_names)
}
)
predictors_name_column <- names(predictors_name_column[which(predictors_name_column)])
if(length(predictors_name_column) == 0){
stop(
f_name, "argument 'predictors_df' must have a column with the names in 'response_df$x' and 'response_df$y'.",
call. = FALSE
)
}
#numeric columns
predictors_numeric_columns <- sapply(
X = predictors_df,
FUN = is.numeric
)
predictors_numeric_columns <- names(predictors_numeric_columns[which(predictors_numeric_columns)])
#if no predictors and no sf, stop
if(
length(predictors_numeric_columns) == 0 &&
is.null(attributes(predictors_df)$sf_column)
){
stop(
f_name, "argument 'predictors_df' must have at least one numeric column.",
call. = FALSE
)
}
#compute distances between predictors
#default predictors list
composite_predictors_default <- as.list(
predictors_numeric_columns
)
names(composite_predictors_default) <- predictors_numeric_columns
#predictors list
if(is.null(composite_predictors)){
composite_predictors <- composite_predictors_default
} else {
#add names if not named
if(is.null(names(composite_predictors))){
names(composite_predictors) <- sapply(
X = composite_predictors,
FUN = function(x){
paste(x, collapse = "_")
}
)
}
composite_predictors_default <- composite_predictors_default[!(names(composite_predictors_default) %in% names(composite_predictors))]
composite_predictors <- c(
composite_predictors,
composite_predictors_default
)
}
#remove geometry
predictors_df_no_geom <- utils_drop_geometry(
df = predictors_df
)
#id df
response_df.id <- response_df[, "id", drop = FALSE]
#compute distance matrices
model_frame_list <- future.apply::future_lapply(
X = seq_len(length(composite_predictors)),
FUN = function(x){
#extract predictors columns
df.i <- predictors_df_no_geom[
,
unlist(composite_predictors[[x]]),
drop = FALSE
]
#decide if scaling is needed
sd_ratios <- max(sapply(X = df.i, FUN = sd)) /
min(sapply(X = df.i, FUN = sd))
if(sd_ratios > 10){
df.i <- df.i |>
scale() |>
as.data.frame()
}
#add name column
df.i[[predictors_name_column]] <- predictors_df[[predictors_name_column]]
#compute distance matrix
m.i <- distantia::distance_matrix(
df = df.i,
name_column = predictors_name_column,
distance = distance
)
#prepare df
out.df.i <- response_df.id
#extract matrix as vector
variable.i <- mapply(
FUN = function(a, b) m.i[a, b],
response_df$x,
response_df$y
)
names(variable.i) <- NULL
out.df.i[[names(composite_predictors)[x]]] <- variable.i
out.df.i
},
future.seed = NULL
)
names(model_frame_list) <- names(composite_predictors)
#add geographic distance if predictors_df is an sf data frame
if(!is.null(attributes(predictors_df)$sf_column)){
if(
!any(
requireNamespace("sf", quietly = TRUE),
requireNamespace("lwgeom", quietly = TRUE)
)
){
stop(
f_name, "this function requires installing the packages 'sf' and 'lwgeom'.",
call. = FALSE
)
}
m.i <- sf::st_distance(
x = predictors_df
)
dimnames(m.i) <- list(
as.character(predictors_df[[predictors_name_column]]),
as.character(predictors_df[[predictors_name_column]])
)
variable.i <- mapply(
FUN = function(a, b) m.i[a, b],
response_df$x,
response_df$y
)
names(variable.i) <- NULL
new_distance_column <- "geographic_distance"
if(new_distance_column %in% c(colnames(response_df), colnames(predictors_df))){
new_distance_column <- paste0("_", new_distance_column, "_")
}
response_df.id[[new_distance_column]] <- variable.i
predictors_numeric_columns <- c(predictors_numeric_columns, new_distance_column)
#add to model frame list
if(exists("model_frame_list")){
model_frame_list[["geographic_distance"]] <- response_df.id
} else {
model_frame_list <- list(
geographic_distance = response_df.id
)
}
}
#add distantia df
model_frame_list <- c(
list(response = response_df),
model_frame_list
)
#join all data frames
model_frame <- Reduce(
f = function(x, y){
merge(x, y, by = "id")
},
x = model_frame_list
)
model_frame$id <- NULL
#scale
if(scale == TRUE){
model_frame <- cbind(
model_frame[, setdiff(colnames(model_frame), predictors_numeric_columns)],
model_frame[, predictors_numeric_columns, drop = FALSE] |>
base::scale() |>
as.data.frame()
)
}
attr(
x = model_frame,
which = "predictors"
) <- predictors_numeric_columns
if(df_type == "distantia_df"){
attr(
x = model_frame,
which = "response"
) <- "psi"
attr(
x = model_frame,
which = "formula"
) <- stats::as.formula(
paste(
"psi ~ ",
paste(predictors_numeric_columns, collapse = " + ")
)
)
}
model_frame
}
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.