R/PipeOpDateFeatures.R

Defines functions get_days_per_month is_leap_year get_weeks_per_year compute_cyclic_date_features is_day compute_date_features

#' @title Preprocess Date Features
#'
#' @usage NULL
#' @name mlr_pipeops_datefeatures
#' @format [`R6Class`][R6::R6Class] object inheriting from [`PipeOpTaskPreprocSimple`]/[`PipeOpTaskPreproc`]/[`PipeOp`].
#'
#' @description
#' Based on `POSIXct`/`Date` columns of the data, a set of date related features is computed and
#' added to the feature set of the output task. If no `POSIXct` or `Date` column is found, the
#' original task is returned unaltered. This functionality is based on the `add_datepart()` and
#' `add_cyclic_datepart()` functions from the \CRANpkg{fastai} package. If operation on only
#' particular `POSIXct`/`Date` columns is requested, use the `affect_columns` parameter inherited
#' from [`PipeOpTaskPreprocSimple`].
#'
#' For `Date` columns, the features `"hour"`, `"minute"`, `"second"`, and `"is_day"` are skipped.
#'
#' If `cyclic = TRUE`, cyclic features are computed for the features `"month"`, `"week_of_year"`,
#' `"day_of_year"`, `"day_of_month"`, `"day_of_week"`, `"hour"`, `"minute"` and `"second"`. This
#' means that for each feature `x`, two additional features are computed, namely the sine and cosine
#' transformation of `2 * pi * x / max_x` (here `max_x` is the largest possible value the feature
#' could take on `+ 1`, assuming the lowest possible value is given by 0, e.g., for hours from 0 to
#' 23, this is 24). This is useful to respect the cyclical nature of features such as seconds, i.e.,
#' second 21 and second 22 are one second apart, but so are second 59 and second 0 of the next
#' minute.
#'
#' @section Construction:
#' ```
#' PipeOpDateFeatures$new(id = "datefeatures", param_vals = list())
#' ```
#'
#' * `id` :: `character(1)`\cr
#'   Identifier of resulting object, default `"datefeatures"`.
#' * `param_vals` :: named `list`\cr
#'   List of hyperparameter settings, overwriting the hyperparameter settings that would otherwise
#'   be set during construction. Default `list()`.
#'
#' @section Input and Output Channels:
#' Input and output channels are inherited from [`PipeOpTaskPreproc`].
#'
#' The output is the input [`Task`][mlr3::Task] with date-related features computed and added to the
#' feature set of the output task and the `POSIXct`/`Date` columns of the data removed from the
#' feature set (depending on the value of `keep_date_var`).
#'
#' @section State:
#' The `$state` is a named `list` with the `$state` elements inherited from
#' [`PipeOpTaskPreproc`].
#'
#' @section Parameters:
#' The parameters are the parameters inherited from [`PipeOpTaskPreproc`], as well as:
#' * `keep_date_var` :: `logical(1)`\cr
#'   Should the `POSIXct`/`Date` columns be kept as features? Default `FALSE`.
#' * `cyclic` :: `logical(1)`\cr
#'   Should cyclic features be computed? See Internals. Default `FALSE`.
#' * `year` :: `logical(1)`\cr
#'   Should the year be extracted as a feature? Default `TRUE`.
#' * `quarter` :: `logical(1)`\cr
#'   Should the quarter be extracted as a feature? Default `TRUE`.
#' * `month` :: `logical(1)`\cr
#'   Should the month be extracted as a feature? Default `TRUE`.
#' * `week_of_year` :: `logical(1)`\cr
#'   Should the week of the year be extracted as a feature? Default `TRUE`.
#' * `day_of_year` :: `logical(1)`\cr
#'   Should the day of the year be extracted as a feature? Default `TRUE`.
#' * `day_of_month` :: `logical(1)`\cr
#'   Should the day of the month be extracted as a feature? Default `TRUE`.
#' * `day_of_week` :: `logical(1)`\cr
#'   Should the day of the week (ISO 8601) be extracted as a feature? Default `TRUE`.
#' * `hour` :: `logical(1)`\cr
#'   Should the hour be extracted as a feature? Default `TRUE`.
#' * `minute` :: `logical(1)`\cr
#'   Should the minute be extracted as a feature? Default `TRUE`.
#' * `second` :: `logical(1)`\cr
#'   Should the second be extracted as a feature? Default `TRUE`.
#' * `is_day` :: `logical(1)`\cr
#'   Should a feature be extracted indicating whether it is day time (hour of the day from 6 to
#'   20 inclusive, i.e. 06:00 up to and including 20:59)? Default `TRUE`.
#'
#' @section Internals:
#' The cyclic feature transformation always assumes that values range from 0, so some values
#' (e.g. day of the month) are shifted before sine/cosine transform.
#'
#' @section Fields:
#' Only fields inherited from [`PipeOp`].
#'
#' @section Methods:
#' Only methods inherited from [`PipeOpTaskPreprocSimple`]/[`PipeOpTaskPreproc`]/[`PipeOp`].
#'
#' @examples
#' library("mlr3")
#' dat = iris
#' set.seed(1)
#' dat$date = sample(
#'   seq(as.POSIXct("2020-02-01"), to = as.POSIXct("2020-02-29"), by = "hour"), size = 150L
#' )
#' task = TaskClassif$new("iris_date", backend = dat, target = "Species")
#' pop = po("datefeatures", param_vals = list(cyclic = FALSE, minute = FALSE, second = FALSE))
#' pop$train(list(task))
#' pop$state
#' @family PipeOps
#' @template seealso_pipeopslist
#' @include PipeOpTaskPreproc.R
#' @export
PipeOpDateFeatures = R6Class("PipeOpDateFeatures",
  inherit = PipeOpTaskPreprocSimple,
  public = list(
    # Construct the PipeOp: declares one logical switch per extractable date part (tagged
    # "datepart") plus `keep_date_var` and `cyclic`, sets their defaults, and restricts the
    # operator to `POSIXct` and `Date` feature columns.
    initialize = function(id = "datefeatures", param_vals = list()) {
      base_tags = c("train", "predict", "required")
      datepart_tags = c("train", "predict", "datepart", "required")
      param_set = ps(
        keep_date_var = p_lgl(tags = base_tags),
        cyclic = p_lgl(tags = base_tags),
        year = p_lgl(tags = datepart_tags),
        quarter = p_lgl(tags = datepart_tags),
        month = p_lgl(tags = datepart_tags),
        week_of_year = p_lgl(tags = datepart_tags),
        day_of_year = p_lgl(tags = datepart_tags),
        day_of_month = p_lgl(tags = datepart_tags),
        day_of_week = p_lgl(tags = datepart_tags),
        hour = p_lgl(tags = datepart_tags),
        minute = p_lgl(tags = datepart_tags),
        second = p_lgl(tags = datepart_tags),
        is_day = p_lgl(tags = datepart_tags)
      )
      param_set$values = list(
        keep_date_var = FALSE,
        cyclic = FALSE,
        year = TRUE,
        quarter = TRUE,
        month = TRUE,
        week_of_year = TRUE,
        day_of_year = TRUE,
        day_of_month = TRUE,
        day_of_week = TRUE,
        hour = TRUE,
        minute = TRUE,
        second = TRUE,
        is_day = TRUE
      )
      super$initialize(
        id = id,
        param_set = param_set,
        param_vals = param_vals,
        feature_types = c("POSIXct", "Date")
      )
    }
  ),
  private = list(
    # Adds the requested date features (and, if `cyclic` is set, their sine/cosine transforms) for
    # every column of `dt` by reference and returns `dt`. `levels` is not used.
    # If no date part is requested at all, `dt` is returned unchanged.
    .transform_dt = function(dt, levels) {
      pv = self$param_set$get_values(tags = "train")
      requested = names(which(unlist(self$param_set$get_values(tags = "datepart"))))
      if (length(requested) == 0L) {
        return(dt)
      }

      known_features = c(
        "year",
        "quarter",
        "month",
        "hour",
        "minute",
        "second",
        "week_of_year",
        "day_of_year",
        "day_of_month",
        "day_of_week",
        "is_day"
      )
      cyclic_candidates = c(
        "month", "week_of_year", "day_of_year", "day_of_month", "day_of_week", "hour", "minute", "second"
      )
      # only features the user asked for get a cyclic transform; helper columns added below do not
      cyclic_features = requested[requested %in% cyclic_candidates]

      # Helper columns: `year` is always computed because the cyclic period of week_of_year,
      # day_of_year and day_of_month depends on it. `month` is additionally needed for the cyclic
      # period of day_of_month; without it the period lookup yields zero-length vectors and the
      # assignment of the cyclic columns fails. Helpers that were not requested are dropped again
      # at the end.
      drop_year = "year" %nin% requested
      drop_month = pv$cyclic && "day_of_month" %in% requested && "month" %nin% requested
      date_features = unique(c("year", if (drop_month) "month", requested))
      date_features = date_features[date_features %in% known_features]

      cols = copy(names(dt))
      # remember which features were actually produced per column; `Date` columns skip the
      # time-of-day features
      computed = list()
      for (j in cols) {
        x = compute_date_features(dt[[j]], date_features)
        set(dt, j = paste0(j, ".", names(x)), value = x)
        computed[[j]] = names(x)
      }

      # if cyclic = TRUE for month, week_of_year, day_of_year, day_of_month, day_of_week, hour,
      # minute and second, two columns are additionally added, each consisting of their sine and
      # cosine transformation of in general 2 * pi * x / max_x (x starting from 0)
      if (pv$cyclic && length(cyclic_features) > 0L) {
        for (j in cols) {
          # restrict to features that exist for this column, so that e.g. "hour" is not requested
          # for a `Date` column where it was never computed
          available = cyclic_features[cyclic_features %in% computed[[j]]]
          if (length(available) == 0L) {
            next
          }
          nm = paste0(j, ".", rep(available, each = 2L), "_", c("sin", "cos"))
          set(dt, j = nm, value = compute_cyclic_date_features(dt, available, j))
        }
      }

      if (!pv$keep_date_var) {
        set(dt, j = cols, value = NULL)
      }

      if (drop_year) {
        set(dt, j = paste0(cols, ".", "year"), value = NULL)
      }

      if (drop_month) {
        set(dt, j = paste0(cols, ".", "month"), value = NULL)
      }

      dt
    }
  )
)

# Extract the requested date parts from a single date/time vector.
#
# @param x vector of dates or date-times; for objects inheriting from "Date" the time-of-day
#   features ("hour", "minute", "second", "is_day") are silently skipped.
# @param features character vector of feature names to extract.
# @return list with one element per extracted feature, named by feature.
#
# NOTE(review): the parameter documentation describes `day_of_week` as ISO 8601; confirm that
# `wday()` as imported here numbers the days accordingly.
compute_date_features = function(x, features) {
  if (inherits(x, "Date")) {
    time_of_day = c("hour", "minute", "second", "is_day")
    features = features[features %nin% time_of_day]
  }
  extract_feature = function(feature) {
    switch(feature,
      year = year(x),
      quarter = quarter(x),
      month = month(x),
      week_of_year = isoweek(x),
      day_of_year = yday(x),
      day_of_month = mday(x),
      day_of_week = wday(x),
      hour = hour(x),
      minute = minute(x),
      second = second(x),
      is_day = is_day(x)
    )
  }
  extracted = map(features, extract_feature)
  set_names(extracted, features)
}

# Flag day time: TRUE where the hour of `x` lies between 6 and 20 (both inclusive).
#
# @param x date-time vector accepted by `hour()`.
# @return logical vector of the same length as `hour(x)`.
is_day = function(x) {
  hour_of_day = hour(x)
  hour_of_day >= 6L & hour_of_day <= 20L
}

# Compute the cyclic encoding of already extracted date features, i.e. the sine and cosine of
# 2 * pi * x / max_x for every requested feature.
#
# @param date_features named list-like object holding the extracted features; names are expected
#   to carry the prefix "<date_var>." which is removed before lookup (the first name is kept
#   as is).
# @param features character vector of features to transform.
# @param date_var name of the date column the features belong to.
# @return flat list with two elements (sine, then cosine) per entry of `features`, in order.
compute_cyclic_date_features = function(date_features, features, date_var) {
  # strip the date_var-specific prefix so features can be looked up by their plain name
  original_names = names(date_features)
  prefix = paste0(date_var, ".")
  names(date_features) = c(original_names[1L], gsub(prefix, "", original_names[-1L], fixed = TRUE))

  # features whose smallest value is 1 and which are shifted so that they start at 0
  one_based = c("month", "week_of_year", "day_of_year", "day_of_month")

  transform_one = function(feature) {
    raw = date_features[[feature]]
    value = if (feature %in% one_based) raw - 1L else raw
    # length of one full cycle; may vary by year (and month) for calendar-dependent features
    period = switch(feature,
      month = 12L,
      week_of_year = get_weeks_per_year(date_features[["year"]]),
      day_of_year = 365L + as.integer(is_leap_year(date_features[["year"]])),
      day_of_month = get_days_per_month(date_features[["year"]], month = date_features[["month"]]),
      day_of_week = 7L,
      hour = 24L,
      minute = 60L,
      second = 60L
    )
    angle = 2L * pi * value / period
    list(sin(angle), cos(angle))
  }

  unlist(lapply(features, transform_one), recursive = FALSE)
}

# Number of ISO 8601 weeks (52 or 53) in each of the given years.
#
# The week of the year is extracted with `isoweek()`, so the cycle length used for its cyclic
# transform must be the ISO week count of the year. Formatting December 31 with "%U" yields the
# Sunday-based week number instead, which e.g. is 52 for 2020 although 2020 has an ISO week 53;
# week 53 would then be mapped onto the same angle as week 1.
#
# @param year integer vector of calendar years.
# @return integer vector of the same length holding 52L or 53L (NA for NA input).
get_weeks_per_year = function(year) {
  # weekday of December 31 of year y, with 0 = Sunday, ..., 6 = Saturday
  dec31_weekday = function(y) (y + y %/% 4L - y %/% 100L + y %/% 400L) %% 7L
  # a year has 53 ISO weeks iff it ends on a Thursday or the previous year ended on a Wednesday
  has_week_53 = dec31_weekday(year) == 4L | dec31_weekday(year - 1L) == 3L
  52L + as.integer(has_week_53)
}

# Gregorian leap year test, vectorized over `year`.
#
# @param year integer vector of calendar years.
# @return logical vector: TRUE for years divisible by 4 but not by 100, or divisible by 400.
is_leap_year = function(year) {
  divisible_by = function(divisor) year %% divisor == 0L
  (divisible_by(4L) & !divisible_by(100L)) | divisible_by(400L)
}

# Number of days in the given month(s) of the given year(s), vectorized over both arguments.
#
# @param year integer vector of calendar years.
# @param month integer vector of month numbers (1 = January, ..., 12 = December).
# @return integer vector of month lengths; February gets one extra day in leap years.
get_days_per_month = function(year, month) {
  common_year_lengths = c(31L, 28L, 31L, 30L, 31L, 30L, 31L, 31L, 30L, 31L, 30L, 31L)
  leap_day = month == 2L & is_leap_year(year)
  common_year_lengths[month] + leap_day
}

# Register the class with `mlr_pipeops` under the key "datefeatures" (the key passed to `po()` in
# the roxygen example of this file).
mlr_pipeops$add("datefeatures", PipeOpDateFeatures)

Try the mlr3pipelines package in your browser

Any scripts or data that you put into this service are public.

mlr3pipelines documentation built on Nov. 7, 2025, 9:06 a.m.