R/date.R

Defines functions tidy.step_date print.step_date bake.step_date get_date_features ord2fac prep.step_date step_date_new step_date

Documented in step_date tidy.step_date

#' Date feature generator
#'
#' `step_date()` creates a *specification* of a recipe step that will convert
#' date data into one or more factor or numeric variables.
#'
#' @inheritParams step_pca
#' @inheritParams step_center
#' @param ... One or more selector functions to choose variables
#'  for this step. The selected variables should have class `Date` or
#'  `POSIXct`. See [selections()] for more details.
#' @param features A character string that includes at least one
#'  of the following values: `month`, `dow` (day of week),
#'  `doy` (day of year), `week`, `month`,
#'  `decimal` (decimal date, e.g. 2002.197), `quarter`,
#'  `semester`, `year`.
#' @param label A logical. Only available for features
#'  `month` or `dow`. `TRUE` will display the day of
#'  the week as an ordered factor of character strings, such as
#'  "Sunday." `FALSE` will display the day of the week as a
#'  number.
#' @param abbr A logical. Only available for features `month`
#'  or `dow`. `FALSE` will display the day of the week as
#'  an ordered factor of character strings, such as "Sunday".
#'  `TRUE` will display an abbreviated version of the label,
#'  such as "Sun". `abbr` is disregarded if `label =
#'  FALSE`.
#' @param ordinal A logical: should factors be ordered? Only
#'  available for features `month` or `dow`.
#' @param locale Locale to be used for `month` and `dow`, see [locales].
#'  On Linux systems you can use `system("locale -a")` to list all the
#'  installed locales. Can be a locales string, or a [clock::clock_labels()]
#'  object. Defaults to `clock::clock_locale()$labels`.
#' @param keep_original_cols A logical to keep the original variables in the
#'  output. Defaults to `TRUE`.
#' @template step-return
#' @family dummy variable and encoding steps
#' @export
#' @details Unlike some other steps, `step_date` does *not*
#'  remove the original date variables by default. Set `keep_original_cols`
#'  to `FALSE` to remove them.
#'
#'  See [step_time()] if you want to calculate features that are smaller than
#'  days.
#'
#'  # Tidying
#'
#'  When you [`tidy()`][tidy.recipe()] this step, a tibble with columns
#'  `terms` (the selectors or variables selected), `value` (the feature
#'  names), and `ordinal` (a logical) is returned.
#'
#' # Tidying
#'
#' When you [`tidy()`][tidy.recipe()] this step, a tibble is returned with
#' columns `terms`, `value`, `ordinal` , and `id`:
#'
#' \describe{
#'   \item{terms}{character, the selectors or variables selected}
#'   \item{value}{character, the feature names}
#'   \item{ordinal}{logical, are factors ordered}
#'   \item{id}{character, id of this step}
#' }
#'
#' @template case-weights-not-supported
#'
#' @examples
#' library(lubridate)
#'
#' examples <- data.frame(
#'   Dan = ymd("2002-03-04") + days(1:10),
#'   Stefan = ymd("2006-01-13") + days(1:10)
#' )
#' date_rec <- recipe(~ Dan + Stefan, examples) %>%
#'   step_date(all_predictors())
#'
#' tidy(date_rec, number = 1)
#'
#' date_rec <- prep(date_rec, training = examples)
#'
#' date_values <- bake(date_rec, new_data = examples)
#' date_values
#'
#' tidy(date_rec, number = 1)
step_date <-
  function(recipe,
           ...,
           role = "predictor",
           trained = FALSE,
           features = c("dow", "month", "year"),
           abbr = TRUE,
           label = TRUE,
           ordinal = FALSE,
           locale = clock::clock_locale()$labels,
           columns = NULL,
           keep_original_cols = TRUE,
           skip = FALSE,
           id = rand_id("date")) {
    feat <-
      c(
        "year",
        "doy",
        "week",
        "decimal",
        "semester",
        "quarter",
        "dow",
        "month"
      )
    if (!is_tune(features)) {
      if (!all(features %in% feat)) {
        offenders <- features[!features %in% feat]

        cli::cli_abort(c(
          x = "Possible values of {.arg features} should include:",
          "*" = "{.or {.val {feat}}}.",
          i = "Invalid values were: {.val {offenders}}."
        ))
      }
    }
    add_step(
      recipe,
      step_date_new(
        terms = enquos(...),
        role = role,
        trained = trained,
        features = features,
        abbr = abbr,
        label = label,
        ordinal = ordinal,
        locale = locale,
        columns = columns,
        keep_original_cols = keep_original_cols,
        skip = skip,
        id = id
      )
    )
  }

step_date_new <-
  function(terms, role, trained, features, abbr, label, ordinal, locale,
           columns, keep_original_cols, skip, id) {
    step(
      subclass = "date",
      terms = terms,
      role = role,
      trained = trained,
      features = features,
      abbr = abbr,
      label = label,
      ordinal = ordinal,
      locale = locale,
      columns = columns,
      keep_original_cols = keep_original_cols,
      skip = skip,
      id = id
    )
  }


#' @export
prep.step_date <- function(x, training, info = NULL, ...) {
  col_names <- recipes_eval_select(x$terms, training, info)
  check_type(training[, col_names], types = c("date", "datetime"))

  step_date_new(
    terms = x$terms,
    role = x$role,
    trained = TRUE,
    features = x$features,
    abbr = x$abbr,
    label = x$label,
    ordinal = x$ordinal,
    locale = x$locale,
    columns = col_names,
    keep_original_cols = get_keep_original_cols(x),
    skip = x$skip,
    id = x$id
  )
}


ord2fac <- function(x, what) {
  x <- x[[what]]
  factor(as.character(x), levels = levels(x), ordered = FALSE)
}


get_date_features <-
  function(dt,
           feats,
           locale,
           abbr = TRUE,
           label = TRUE,
           ord = FALSE) {
    ## pre-allocate values
    res <- matrix(NA_integer_, nrow = length(dt), ncol = length(feats))
    colnames(res) <- feats
    res <- as_tibble(res)

    if ("year" %in% feats) {
      res[, grepl("year$", names(res))] <- vec_cast(year(dt), integer())
    }
    if ("doy" %in% feats) {
      res[, grepl("doy$", names(res))] <- vec_cast(yday(dt), integer())
    }
    if ("week" %in% feats) {
      res[, grepl("week$", names(res))] <- vec_cast(week(dt), integer())
    }
    if ("decimal" %in% feats) {
      res[, grepl("decimal$", names(res))] <- decimal_date(dt)
    }
    if ("quarter" %in% feats) {
      res[, grepl("quarter$", names(res))] <- vec_cast(quarter(dt), integer())
    }
    if ("semester" %in% feats) {
      res[, grepl("semester$", names(res))] <- vec_cast(semester(dt), integer())
    }
    if ("dow" %in% feats) {
      if (inherits(locale, "clock_labels")) {
        dow <- clock::date_weekday_factor(
          x = dt, abbreviate = abbr, labels = locale
        )
        if (!label) {
          dow <- as.integer(dow)
        }
        res[, grepl("dow$", names(res))] <- dow
      } else {
        res[, grepl("dow$", names(res))] <-
          wday(dt, abbr = abbr, label = label, locale = locale)
      }
      if (!ord & label == TRUE) {
        res[, grepl("dow$", names(res))] <-
          ord2fac(res, grep("dow$", names(res), value = TRUE))
      }
    }
    if ("month" %in% feats) {
      if (inherits(locale, "clock_labels")) {
        month <- clock::date_month_factor(
          dt, abbreviate = abbr, labels = locale
        )
        if (!label) {
          month <- as.integer(month)
        }
        res[, grepl("month$", names(res))] <- month
      } else {
        res[, grepl("month$", names(res))] <-
          month(dt, abbr = abbr, label = label, locale = locale)
      }
      if (!ord & label == TRUE) {
        res[, grepl("month$", names(res))] <-
          ord2fac(res, grep("month$", names(res), value = TRUE))
      }
    }
    res
  }

#' @export
bake.step_date <- function(object, new_data, ...) {
  col_names <- names(object$columns)
  check_new_data(col_names, object, new_data)

  new_cols <- rep(
    length(object$features),
    each = length(col_names)
  )
  names(new_cols) <- col_names

  date_values <- matrix(NA, nrow = nrow(new_data), ncol = sum(new_cols))

  # Dummy column names to avoid tibble warning
  colnames(date_values) <- as.character(seq_len(sum(new_cols)))

  date_values <- as_tibble(date_values)

  new_names <- vector("character", length = ncol(date_values))

  strt <- 1
  for (col_name in col_names) {
    cols <- (strt):(strt + new_cols[col_name] - 1)

    tmp <- get_date_features(
      dt = new_data[[col_name]],
      feats = object$features,
      locale = object$locale %||% Sys.getlocale("LC_TIME"),
      abbr = object$abbr,
      label = object$label,
      ord = object$ordinal
    )

    date_values[, cols] <- tmp

    new_names[cols] <- paste(col_name, names(tmp), sep = "_")

    strt <- max(cols) + 1
  }

  names(date_values) <- new_names

  date_values <- check_name(date_values, new_data, object, names(date_values))

  new_data <- vec_cbind(new_data, date_values)

  new_data <- remove_original_cols(new_data, object, col_names)
  new_data
}

#' @export
print.step_date <-
  function(x, width = max(20, options()$width - 29), ...) {
    title <- "Date features from "
    print_step(x$columns, x$terms, x$trained, title, width)
    invisible(x)
  }

#' @rdname tidy.recipe
#' @export
tidy.step_date <- function(x, ...) {
  if (is_trained(x)) {
    res <- tidyr::crossing(
      terms = unname(x$columns),
      value = x$features,
      ordinal = x$ordinal
    )
  } else {
    term_names <- sel2char(x$terms)
    res <- tidyr::crossing(
      terms = term_names,
      value = x$features,
      ordinal = x$ordinal
    )
  }
  tibble::add_column(res, id = x$id)
}
topepo/recipes documentation built on April 10, 2024, 10:30 p.m.