Nothing
#' @title Preprocess Date Features
#'
#' @usage NULL
#' @name mlr_pipeops_datefeatures
#' @format [`R6Class`][R6::R6Class] object inheriting from [`PipeOpTaskPreprocSimple`]/[`PipeOpTaskPreproc`]/[`PipeOp`].
#'
#' @description
#' Based on `POSIXct`/`Date` columns of the data, a set of date related features is computed and
#' added to the feature set of the output task. If no `POSIXct` or `Date` column is found, the
#' original task is returned unaltered. This functionality is based on the `add_datepart()` and
#' `add_cyclic_datepart()` functions from the \CRANpkg{fastai} package. If operation on only
#' particular `POSIXct`/`Date` columns is requested, use the `affect_columns` parameter inherited
#' from [`PipeOpTaskPreprocSimple`].
#'
#' For `Date` columns, the features `"hour"`, `"minute"`, `"second"`, and `"is_day"` are skipped.
#'
#' If `cyclic = TRUE`, cyclic features are computed for the features `"month"`, `"week_of_year"`,
#' `"day_of_year"`, `"day_of_month"`, `"day_of_week"`, `"hour"`, `"minute"` and `"second"`. This
#' means that for each feature `x`, two additional features are computed, namely the sine and cosine
#' transformation of `2 * pi * x / max_x` (here `max_x` is the largest possible value the feature
#' could take on `+ 1`, assuming the lowest possible value is given by 0, e.g., for hours from 0 to
#' 23, this is 24). This is useful to respect the cyclical nature of features such as seconds, i.e.,
#' second 21 and second 22 are one second apart, but so are second 59 and second 0 of the next
#' minute.
#'
#' @section Construction:
#' ```
#' PipeOpDateFeatures$new(id = "datefeatures", param_vals = list())
#' ```
#'
#' * `id` :: `character(1)`\cr
#' Identifier of resulting object, default `"datefeatures"`.
#' * `param_vals` :: named `list`\cr
#' List of hyperparameter settings, overwriting the hyperparameter settings that would otherwise
#' be set during construction. Default `list()`.
#'
#' @section Input and Output Channels:
#' Input and output channels are inherited from [`PipeOpTaskPreproc`].
#'
#' The output is the input [`Task`][mlr3::Task] with date-related features computed and added to the
#' feature set of the output task and the `POSIXct`/`Date` columns of the data removed from the
#' feature set (depending on the value of `keep_date_var`).
#'
#' @section State:
#' The `$state` is a named `list` with the `$state` elements inherited from
#' [`PipeOpTaskPreproc`].
#'
#' @section Parameters:
#' The parameters are the parameters inherited from [`PipeOpTaskPreproc`], as well as:
#' * `keep_date_var` :: `logical(1)`\cr
#' Should the `POSIXct`/`Date` columns be kept as features? Default `FALSE`.
#' * `cyclic` :: `logical(1)`\cr
#' Should cyclic features be computed? See Internals. Default `FALSE`.
#' * `year` :: `logical(1)`\cr
#' Should the year be extracted as a feature? Default `TRUE`.
#' * `quarter` :: `logical(1)`\cr
#' Should the quarter be extracted as a feature? Default `TRUE`.
#' * `month` :: `logical(1)`\cr
#' Should the month be extracted as a feature? Default `TRUE`.
#' * `week_of_year` :: `logical(1)`\cr
#' Should the week of the year be extracted as a feature? Default `TRUE`.
#' * `day_of_year` :: `logical(1)`\cr
#' Should the day of the year be extracted as a feature? Default `TRUE`.
#' * `day_of_month` :: `logical(1)`\cr
#' Should the day of the month be extracted as a feature? Default `TRUE`.
#' * `day_of_week` :: `logical(1)`\cr
#' Should the day of the week (ISO 8601) be extracted as a feature? Default `TRUE`.
#' * `hour` :: `logical(1)`\cr
#' Should the hour be extracted as a feature? Default `TRUE`.
#' * `minute` :: `logical(1)`\cr
#' Should the minute be extracted as a feature? Default `TRUE`.
#' * `second` :: `logical(1)`\cr
#' Should the second be extracted as a feature? Default `TRUE`.
#' * `is_day` :: `logical(1)`\cr
#' Should a feature be extracted indicating whether it is day time (06:00am - 08:00pm)?
#' Default `TRUE`.
#'
#' @section Internals:
#' The cyclic feature transformation always assumes that values range from 0, so some values
#' (e.g. day of the month) are shifted before sine/cosine transform.
#'
#' @section Fields:
#' Only fields inherited from [`PipeOp`].
#'
#' @section Methods:
#' Only methods inherited from [`PipeOpTaskPreprocSimple`]/[`PipeOpTaskPreproc`]/[`PipeOp`].
#'
#' @examples
#' library("mlr3")
#' dat = iris
#' set.seed(1)
#' dat$date = sample(
#' seq(as.POSIXct("2020-02-01"), to = as.POSIXct("2020-02-29"), by = "hour"), size = 150L
#' )
#' task = TaskClassif$new("iris_date", backend = dat, target = "Species")
#' pop = po("datefeatures", param_vals = list(cyclic = FALSE, minute = FALSE, second = FALSE))
#' pop$train(list(task))
#' pop$state
#' @family PipeOps
#' @template seealso_pipeopslist
#' @include PipeOpTaskPreproc.R
#' @export
PipeOpDateFeatures = R6Class("PipeOpDateFeatures",
  inherit = PipeOpTaskPreprocSimple,
  public = list(
    # Builds the parameter set (one required logical switch per date part, plus `keep_date_var`
    # and `cyclic`), installs the default values and restricts the PipeOp to `POSIXct`/`Date`
    # feature columns.
    initialize = function(id = "datefeatures", param_vals = list()) {
      ps = ps(
        keep_date_var = p_lgl(tags = c("train", "predict", "required")),
        cyclic = p_lgl(tags = c("train", "predict", "required")),
        year = p_lgl(tags = c("train", "predict", "datepart", "required")),
        quarter = p_lgl(tags = c("train", "predict", "datepart", "required")),
        month = p_lgl(tags = c("train", "predict", "datepart", "required")),
        week_of_year = p_lgl(tags = c("train", "predict", "datepart", "required")),
        day_of_year = p_lgl(tags = c("train", "predict", "datepart", "required")),
        day_of_month = p_lgl(tags = c("train", "predict", "datepart", "required")),
        day_of_week = p_lgl(tags = c("train", "predict", "datepart", "required")),
        hour = p_lgl(tags = c("train", "predict", "datepart", "required")),
        minute = p_lgl(tags = c("train", "predict", "datepart", "required")),
        second = p_lgl(tags = c("train", "predict", "datepart", "required")),
        is_day = p_lgl(tags = c("train", "predict", "datepart", "required"))
      )
      ps$values = list(
        keep_date_var = FALSE,
        cyclic = FALSE,
        year = TRUE,
        quarter = TRUE,
        month = TRUE,
        week_of_year = TRUE,
        day_of_year = TRUE,
        day_of_month = TRUE,
        day_of_week = TRUE,
        hour = TRUE,
        minute = TRUE,
        second = TRUE,
        is_day = TRUE
      )
      super$initialize(
        id = id,
        param_set = ps,
        param_vals = param_vals,
        feature_types = c("POSIXct", "Date")
      )
    }
  ),
  private = list(
    # Adds the requested date features (and, if `cyclic` is set, their sine/cosine encodings)
    # for every column of `dt`, modifying `dt` by reference via `set()`, and returns it.
    # `levels` is part of the inherited signature and is not used here.
    .transform_dt = function(dt, levels) {
      pv = self$param_set$get_values(tags = "train")
      requested = names(which(unlist(self$param_set$get_values(tags = "datepart"))))
      if (length(requested) == 0L) {
        return(dt)
      }

      # cyclic encodings are only produced for features the user actually asked for
      cyclic_features = if (pv$cyclic) {
        requested[
          requested %in%
            c("month", "week_of_year", "day_of_year", "day_of_month", "day_of_week", "hour", "minute", "second")
        ]
      } else {
        character(0L)
      }

      # year is always computed because the cyclic week_of_year, day_of_year and day_of_month
      # encodings need it; it is dropped again below if it was not requested
      drop_year = "year" %nin% requested
      features = unique(c("year", requested))
      # month is needed for the cyclic day_of_month encoding (days per month); compute it as a
      # helper column if it was not requested and drop it again below. It is appended last so the
      # order of the remaining output columns does not change.
      drop_month = "month" %nin% requested && "day_of_month" %in% cyclic_features
      if (drop_month) {
        features = c(features, "month")
      }

      date_features = features[
        features %in% c(
          "year", "quarter", "month", "hour", "minute", "second",
          "week_of_year", "day_of_year", "day_of_month", "day_of_week", "is_day"
        )
      ]

      # names(dt) is copied because the columns of dt are changed by reference below
      cols = copy(names(dt))
      # per date column: names of the features that were actually computed
      # (`Date` columns get no hour / minute / second / is_day)
      computed = list()
      for (j in cols) {
        x = compute_date_features(dt[[j]], date_features)
        computed[[j]] = names(x)
        set(dt, j = paste0(j, ".", names(x)), value = x)
      }

      # if cyclic = TRUE for month, week_of_year, day_of_year, day_of_month, day_of_week, hour,
      # minute and second, two columns are additionally added, each consisting of their sine and
      # cosine transformation of in general 2 * pi * x / max_x (x starting from 0)
      if (length(cyclic_features) > 0L) {
        for (j in cols) {
          # only transform features that exist for this column; asking for e.g. the cyclic hour
          # of a `Date` column would otherwise hand zero-length vectors to set()
          available = cyclic_features[cyclic_features %in% computed[[j]]]
          if (length(available) == 0L) {
            next
          }
          nm = paste0(j, ".", rep(available, each = 2L), "_", c("sin", "cos"))
          set(dt, j = nm, value = compute_cyclic_date_features(dt, available, j))
        }
      }

      if (!pv$keep_date_var) {
        set(dt, j = cols, value = NULL)
      }
      if (drop_year) {
        set(dt, j = paste0(cols, ".", "year"), value = NULL)
      }
      if (drop_month) {
        set(dt, j = paste0(cols, ".", "month"), value = NULL)
      }
      dt
    }
  )
)
# Extract the requested date parts from the date(-time) vector `x`.
#
# `features` is a character vector of date part names. For `Date` input the time-of-day parts
# ("hour", "minute", "second", "is_day") are removed before extraction. Returns a list with one
# element per remaining feature, named by feature.
compute_date_features = function(x, features) {
  time_of_day_parts = c("hour", "minute", "second", "is_day")
  if (inherits(x, "Date")) {
    features = features[!(features %in% time_of_day_parts)]
  }

  # dispatch a single feature name to its extractor
  extract_part = function(feature) {
    switch(feature,
      year = year(x),
      quarter = quarter(x),
      month = month(x),
      week_of_year = isoweek(x),
      day_of_year = yday(x),
      day_of_month = mday(x),
      day_of_week = wday(x),
      hour = hour(x),
      minute = minute(x),
      second = second(x),
      is_day = is_day(x)
    )
  }

  parts = map(features, extract_part)
  set_names(parts, features)
}
# Flag date-times whose hour lies between 6 and 20 (both inclusive) as day time.
# Returns a logical vector as long as `x`.
is_day = function(x) {
  hr = hour(x)
  hr >= 6L & hr <= 20L
}
# helper function to compute cyclic date features of date features, i.e.,
# sine and cosine transformations of 2 * pi * x / max_x
#
# Arguments:
#   date_features  list-like table (data.table / data.frame / list) that holds the already
#                  computed date features in columns named "<date_var>.<feature>"
#   features       character vector of the features to encode cyclically
#   date_var       name of the date column the derived columns belong to
#
# Returns an unnamed list with two numeric vectors per entry of `features` (first the sine, then
# the cosine), in the order of `features`; `NULL` if `features` is empty.
# Stops with an error if a column needed for the computation is missing.
compute_cyclic_date_features = function(date_features, features, date_var) {
  # Columns are looked up by their full "<date_var>.<feature>" name. Renaming the columns by
  # stripping "<date_var>." is fragile: it also alters names that merely contain that string, and
  # a date column that is itself called e.g. "hour" would shadow the derived "hour" column.
  get_column = function(feature) {
    column = date_features[[paste0(date_var, ".", feature)]]
    if (is.null(column)) {
      stop(sprintf(
        "Column '%s.%s' is required to compute cyclic date features but is missing.",
        date_var, feature
      ))
    }
    column
  }

  unlist(
    lapply(features, function(feature) {
      raw = get_column(feature)
      # all values are expected to start at 0 and therefore may be shifted by - 1
      # NOTE(review): day_of_week is not shifted; if it ranges over 1..7, the value 7 maps to the
      # angle 2 * pi, which coincides with 0, so the encoding stays cyclic (merely rotated).
      value = if (feature %in% c("month", "week_of_year", "day_of_year", "day_of_month")) {
        raw - 1L
      } else {
        raw
      }
      # number of distinct values the feature can take; year / month are only fetched for the
      # features that need them because switch() evaluates just the selected branch
      maximum = switch(feature,
        month = 12L,
        week_of_year = get_weeks_per_year(get_column("year")),
        day_of_year = 365L + as.integer(is_leap_year(get_column("year"))),
        day_of_month = get_days_per_month(get_column("year"), month = get_column("month")),
        day_of_week = 7L,
        hour = 24L,
        minute = 60L,
        second = 60L
      )
      value_scaled = 2L * pi * value / maximum
      list(sin(value_scaled), cos(value_scaled))
    }),
    recursive = FALSE
  )
}
# Number of ISO 8601 weeks (52 or 53) in each of the given calendar years.
#
# December 28 always falls into the last ISO week of its year, so its ISO week number ("%V")
# equals the number of ISO weeks of that year. This matches the ISO week numbering used for the
# "week_of_year" feature; the Sunday-based "%U" week of December 31 does not (e.g., it yields 52
# for 2020, although 2020 has 53 ISO weeks), which made week 53 indistinguishable from week 1 in
# the cyclic encoding.
#
# `year` is a vector of years; returns an integer vector of the same length.
get_weeks_per_year = function(year) {
  as.integer(format(as.Date(paste0(year, "-12-28"), format = "%Y-%m-%d"), "%V"))
}
# Vectorized Gregorian leap year test: a year is a leap year if it is divisible by 400, or
# divisible by 4 but not by 100. Returns a logical vector as long as `year`.
is_leap_year = function(year) {
  by_4 = year %% 4L == 0L
  by_100 = year %% 100L == 0L
  by_400 = year %% 400L == 0L
  (by_4 & !by_100) | by_400
}
# Number of days of the given month(s) in the given year(s), vectorized over both arguments.
# `month` indexes the months 1..12; February gets one extra day in leap years.
get_days_per_month = function(year, month) {
  days_in_common_year = c(31L, 28L, 31L, 30L, 31L, 30L, 31L, 31L, 30L, 31L, 30L, 31L)
  leap_day = month == 2L & is_leap_year(year)
  days_in_common_year[month] + leap_day
}
# Register the PipeOp in `mlr_pipeops` under the key "datefeatures" (the key used by
# `po("datefeatures")` in the example above).
mlr_pipeops$add("datefeatures", PipeOpDateFeatures)
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.