R/distantia_model_frame.R
In distantia: Advanced Toolset for Efficient Time Series Dissimilarity Analysis

Documented in distantia_model_frame

#' Dissimilarity Model Frame
#'
#' @description
#' This function generates a model frame for statistical or machine learning analysis from these objects:
#'
#' \itemize{
#'   \item: Dissimilarity data frame generated by [distantia()], [distantia_ls()], [distantia_dtw()], or [distantia_time_delay()]. The output model frame will have as many rows as this data frame.
#'   \item: Data frame with static descriptors of the time series. These descriptors are converted to distances between pairs of time series via [distance_matrix()].
#'   \item: List defining composite predictors. This feature allows grouping together predictors that have a common meaning. For example, `composite_predictors = list(temperature = c("temperature_mean", "temperature_min", "temperature_max")` generates a new predictor named "temperature", which results from computing the multivariate distances between the vectors of temperature variables of each pair of time series. Predictors in one of such groups will be scaled before distance computation if their maximum standard deviations differ by a factor of 10 or more.
#' }
#'
#'
#' The resulting data frame contains the following columns:
#' \itemize{
#'   \item `x` and `y`: names of the pair of time series represented in the row.
#'   \item response columns in `response_df`.
#'   \item predictors columns: representing the distance between the values of the given static predictor between `x` and `y`.
#'   \item (optional) `geographic_distance`: If `predictors_df` is an sf `sf` data frame, then geographic distances are computed via [sf::st_distance()].
#' }
#'
#' This function supports a parallelization setup via [future::plan()].
#'
#' @param response_df (required, data frame) output of [distantia()], [distantia_ls()], [distantia_dtw()], or [distantia_time_delay()]. Default: NULL
#' @param predictors_df (required, data frame or sf data frame) data frame with numeric predictors for the the model frame. Must have a column with the time series names in `response_df$x` and `response_df$y`. If `sf` data frame, the column "geographic_distance" with distances between pairs of time series is added to the model frame. Default: NULL
#' @param composite_predictors (optional, list) list defining composite predictors. For example, `composite_predictors = list(a = c("b", "c"))` uses the columns `"b"` and `"c"` from `predictors_df` to generate the predictor `a` as the multivariate distance between `"b"` and `"c"` for each pair of time series in `response_df`. Default: NULL
#' @param scale (optional, logical) if TRUE, all predictors are scaled and centered with [scale()]. Default: TRUE
#' @param distance (optional, string) Method to compute the distance between predictor values for all pairs of time series in `response_df`. Default: "euclidean".
#'
#' @return data frame: with attributes "predictors", "response", and "formula".
#' @export
#' @autoglobal
#' @examples
#'
#' #covid prevalence in California counties
#' tsl <- tsl_initialize(
#'   x = covid_prevalence,
#'   name_column = "name",
#'   time_column = "time"
#' ) |>
#'   #subset to shorten example runtime
#'   tsl_subset(
#'     names = 1:5
#'   )
#'
#' #dissimilarity analysis
#' df <- distantia_ls(tsl = tsl)
#'
#' #combine several predictors
#' #into a new one
#' composite_predictors <- list(
#'   economy = c(
#'     "poverty_percentage",
#'     "median_income",
#'     "domestic_product"
#'     )
#' )
#'
#' #generate model frame
#' model_frame <- distantia_model_frame(
#'   response_df = df,
#'   predictors_df = covid_counties,
#'   composite_predictors = composite_predictors,
#'   scale = TRUE
#' )
#'
#' head(model_frame)
#'
#' #names of response and predictors
#' #and an additive formula
#' #are stored as attributes
#' attributes(model_frame)$predictors
#'
#' #if response_df is output of distantia():
#' attributes(model_frame)$response
#' attributes(model_frame)$formula
#'
#' #example of linear model
#' # model <- lm(
#' #   formula = attributes(model_frame)$formula,
#' #   data = model_frame
#' # )
#' #
#' # summary(model)
#'
#' @family distantia_support
distantia_model_frame <- function(
    response_df = NULL,
    predictors_df = NULL,
    composite_predictors = NULL,
    scale = TRUE,
    distance = "euclidean"
){

  distance <- utils_check_distance_args(
    distance = distance
  )

  #df ----
  if(is.null(response_df)){

    stop(
      "distantia::distantia/momentum_model_frame(): argument `response_df' must not be NULL.",
      call. = FALSE
    )

  }

  df_type_distantia <- c(
    "distantia_df",
    "time_delay_df"
  )

  df_type_momentum <- "momentum_df"

  df_types <- c(
    df_type_distantia,
    df_type_momentum
  )

  df_type <- attributes(response_df)$type

  if(
    is.null(df_type) ||
    !(df_type %in% df_types)
  ){
    stop("distantia::distantia/momentum_model_frame(): argument 'response_df' must be the output of distantia::distantia(), distantia::distantia_time_delay(), or distantia::momentum().", call. = FALSE)
  }


  if(df_type %in% df_type_distantia){

    f_name <- "distantia::distantia_model_frame(): "

    if(df_type == "distantia_df"){

      response_df <- distantia_aggregate(
        df = response_df
      )

    }
  }

  if(df_type == df_type_momentum){

    f_name <- "distantia::momentum_model_frame(): "

    response_df <- response_df |>
      momentum_aggregate() |>
      momentum_to_wide()

  }

  #add ID
  response_df$id <- seq_len(nrow(response_df))

  #predictors_df ----
  if(is.null(predictors_df)){

    stop(
      f_name, "argument `predictors_df' must not be NULL.",
      call. = FALSE
    )

  }

  if(!inherits(x = predictors_df, what = "data.frame")){

    stop(
      f_name, "argument `predictors_df' must be a data frame or an sf data frame.",
      call. = FALSE
    )

  }

  #identify names column in predictors response_df
  df_names <- unique(
    c(response_df$x, response_df$y)
  )

  predictors_name_column <- sapply(
    X = predictors_df,
    FUN = function(x){
      sum(df_names %in% x) == length(df_names)
    }
  )

  predictors_name_column <- names(predictors_name_column[which(predictors_name_column)])

  if(length(predictors_name_column) == 0){
    stop(
      f_name, "argument 'predictors_df' must have a column with the names in 'response_df$x' and 'response_df$y'.",
      call. = FALSE
    )
  }

  #numeric columns
  predictors_numeric_columns <- sapply(
    X = predictors_df,
    FUN = is.numeric
  )

  predictors_numeric_columns <- names(predictors_numeric_columns[which(predictors_numeric_columns)])

  #if no predictors and no sf, stop
  if(
    length(predictors_numeric_columns) == 0 &&
    is.null(attributes(predictors_df)$sf_column)
  ){

    stop(
      f_name, "argument 'predictors_df' must have at least one numeric column.",
      call. = FALSE
    )

  }

  #compute distances between predictors

  #default predictors list
  composite_predictors_default <- as.list(
    predictors_numeric_columns
  )

  names(composite_predictors_default) <- predictors_numeric_columns

  #predictors list
  if(is.null(composite_predictors)){

    composite_predictors <- composite_predictors_default

  } else {

    #add names if not named
    if(is.null(names(composite_predictors))){
      names(composite_predictors) <- sapply(
        X = composite_predictors,
        FUN = function(x){
          paste(x, collapse = "_")
        }
      )
    }

    composite_predictors_default <- composite_predictors_default[!(names(composite_predictors_default) %in% names(composite_predictors))]

    composite_predictors <- c(
      composite_predictors,
      composite_predictors_default
    )

  }

  #remove geometry
  predictors_df_no_geom <- utils_drop_geometry(
    df = predictors_df
  )

  #id df
  response_df.id <- response_df[, "id", drop = FALSE]

  #compute distance matrices
  model_frame_list <- future.apply::future_lapply(
    X = seq_len(length(composite_predictors)),
    FUN = function(x){

      #extract predictors columns
      df.i <- predictors_df_no_geom[
        ,
        unlist(composite_predictors[[x]]),
        drop = FALSE
      ]

      #decide if scaling is needed
      sd_ratios <- max(sapply(X = df.i, FUN = sd)) /
        min(sapply(X = df.i, FUN = sd))

      if(sd_ratios > 10){

        df.i <- df.i |>
          scale() |>
          as.data.frame()

      }

      #add name column
      df.i[[predictors_name_column]] <- predictors_df[[predictors_name_column]]

      #compute distance matrix
      m.i <- distantia::distance_matrix(
        df = df.i,
        name_column = predictors_name_column,
        distance = distance
      )

      #prepare df
      out.df.i <- response_df.id

      #extract matrix as vector
      variable.i <- mapply(
        FUN = function(a, b) m.i[a, b],
        response_df$x,
        response_df$y
      )

      names(variable.i) <- NULL

      out.df.i[[names(composite_predictors)[x]]] <- variable.i

      out.df.i

    },
    future.seed = NULL
  )

  names(model_frame_list) <- names(composite_predictors)



  #add geographic distance if predictors_df is an sf data frame
  if(!is.null(attributes(predictors_df)$sf_column)){

    if(
      !any(
        requireNamespace("sf", quietly = TRUE),
        requireNamespace("lwgeom", quietly = TRUE)
      )
    ){
      stop(
        f_name, "this function requires installing the packages 'sf' and 'lwgeom'.",
        call. = FALSE
      )
    }

    m.i <- sf::st_distance(
      x = predictors_df
    )

    dimnames(m.i) <- list(
      as.character(predictors_df[[predictors_name_column]]),
      as.character(predictors_df[[predictors_name_column]])
    )

    variable.i <- mapply(
      FUN = function(a, b) m.i[a, b],
      response_df$x,
      response_df$y
    )

    names(variable.i) <- NULL

    new_distance_column <- "geographic_distance"

    if(new_distance_column %in% c(colnames(response_df), colnames(predictors_df))){
      new_distance_column <- paste0("_", new_distance_column, "_")
    }

    response_df.id[[new_distance_column]] <- variable.i

    predictors_numeric_columns <- c(predictors_numeric_columns, new_distance_column)

    #add to model frame list
    if(exists("model_frame_list")){
      model_frame_list[["geographic_distance"]] <- response_df.id
    } else {
      model_frame_list <- list(
        geographic_distance = response_df.id
      )
    }


  }

  #add distantia df
  model_frame_list <- c(
    list(response = response_df),
    model_frame_list
  )

  #join all data frames
  model_frame <- Reduce(
    f = function(x, y){
      merge(x, y, by = "id")
    },
    x = model_frame_list
  )

  model_frame$id <- NULL

  #scale
  if(scale == TRUE){

    model_frame <- cbind(
      model_frame[, setdiff(colnames(model_frame), predictors_numeric_columns)],
      model_frame[, predictors_numeric_columns, drop = FALSE] |>
        base::scale() |>
        as.data.frame()
    )

  }

  attr(
    x = model_frame,
    which = "predictors"
  ) <- predictors_numeric_columns

  if(df_type == "distantia_df"){

    attr(
      x = model_frame,
      which = "response"
    ) <- "psi"

    attr(
      x = model_frame,
      which = "formula"
    ) <- stats::as.formula(
      paste(
        "psi ~ ",
        paste(predictors_numeric_columns, collapse = " + ")
      )
    )

  }

  model_frame

}