#' Pivot data from wide to long
#'
#' @description
#' This is a method for the tidyr `pivot_longer()` generic. It is translated to
#' [data.table::melt()]
#'
#' @param data A [lazy_dt()].
#' @inheritParams tidyr::pivot_longer
#' @param names_ptypes,names_transform,values_ptypes,values_transform
#' Not currently supported by dtplyr.
#' @examples
#' library(tidyr)
#'
#' # Simplest case where column names are character data
#' relig_income_dt <- lazy_dt(relig_income)
#' relig_income_dt %>%
#' pivot_longer(!religion, names_to = "income", values_to = "count")
#'
#' # Slightly more complex case where columns have common prefix,
#' # and missing missings are structural so should be dropped.
#' billboard_dt <- lazy_dt(billboard)
#' billboard %>%
#' pivot_longer(
#' cols = starts_with("wk"),
#' names_to = "week",
#' names_prefix = "wk",
#' values_to = "rank",
#' values_drop_na = TRUE
#' )
#'
#' # Multiple variables stored in column names
#' lazy_dt(who) %>%
#' pivot_longer(
#' cols = new_sp_m014:newrel_f65,
#' names_to = c("diagnosis", "gender", "age"),
#' names_pattern = "new_?(.*)_(.)(.*)",
#' values_to = "count"
#' )
#'
#' # Multiple observations per row
#' anscombe_dt <- lazy_dt(anscombe)
#' anscombe_dt %>%
#' pivot_longer(
#' everything(),
#' names_to = c(".value", "set"),
#' names_pattern = "(.)(.)"
#' )
# exported onLoad
pivot_longer.dtplyr_step <- function(data,
cols,
names_to = "name",
names_prefix = NULL,
names_sep = NULL,
names_pattern = NULL,
names_ptypes = NULL,
names_transform = NULL,
names_repair = "check_unique",
values_to = "value",
values_drop_na = FALSE,
values_ptypes = NULL,
values_transform = NULL,
...) {
if (!is.null(names_ptypes)) {
abort("`names_ptypes` is not supported by dtplyr")
}
if (!is.null(names_transform)) {
abort("`names_transform` is not supported by dtplyr")
}
if (!is.null(values_ptypes)) {
abort("`values_ptypes` is not supported by dtplyr")
}
if (!is.null(values_transform)) {
abort("`values_transform` is not supported by dtplyr")
}
measure_vars <- names(tidyselect::eval_select(enquo(cols), data))
if (length(measure_vars) == 0) {
abort("`cols` must select at least one column.")
}
multiple_names_to <- length(names_to) > 1
uses_dot_value <- ".value" %in% names_to
variable_name <- "variable"
if (uses_dot_value) {
if (!is.null(names_sep)) {
names_to_setup <- str_separate(measure_vars, into = names_to, sep = names_sep)
} else if (!is.null(names_pattern)) {
names_to_setup <- str_extract(measure_vars, into = names_to, names_pattern)
} else {
abort("If you use '.value' in `names_to` you must also supply
`names_sep' or `names_pattern")
}
.value <- names_to_setup$.value
v_fct <- factor(.value, levels = unique(.value))
measure_vars <- split(measure_vars, v_fct)
values_to <- names(measure_vars)
names(measure_vars) <- NULL
if (multiple_names_to) {
variable_name <- names_to[!names_to == ".value"]
.value_ids <- split(names_to_setup[[variable_name]], v_fct)
.value_id <- .value_ids[[1]]
# Make sure data is "balanced"
# https://github.com/Rdatatable/data.table/issues/2575
# The list passed to measure.vars also needs the same number of column names per element
equal_ids <- map_lgl(
.value_ids[-1],
function(.x) isTRUE(all.equal(.value_id, .x))
)
if (all(equal_ids)) {
.value_id <- vctrs::vec_rep_each(.value_id, length(pull(data)))
} else {
abort("`data.table::melt()` doesn't currently support melting of unbalanced datasets.")
}
}
} else if (multiple_names_to) {
if (is.null(names_sep) && is.null(names_pattern)) {
abort("If you supply multiple names in `names_to` you must also
supply `names_sep` or `names_pattern`")
} else if (!is.null(names_sep) && !is.null(names_pattern)) {
abort("only one of names_sep or names_pattern should be provided")
}
} else {
variable_name <- names_to
}
args <- list(
measure.vars = measure_vars,
variable.name = variable_name,
value.name = values_to,
na.rm = values_drop_na,
variable.factor = FALSE
)
# Clean up call args if defaults are used
if (variable_name == "variable") {
args$variable.name <- NULL
}
if (identical(values_to, "value")) {
args$value.name <- NULL
}
if (is_false(values_drop_na)) {
args$na.rm <- NULL
}
id_vars <- setdiff(data$vars, unlist(measure_vars))
out <- step_call(
data,
"melt",
args = args,
vars = c(id_vars, variable_name, values_to)
)
if (!is.null(names_prefix)) {
out <- mutate(out, !!variable_name := gsub(paste0("^", names_prefix), "", !!sym(variable_name)))
}
if (multiple_names_to && uses_dot_value) {
out <- mutate(out, !!variable_name := !!.value_id)
} else if (multiple_names_to && !uses_dot_value) {
if (!is.null(names_sep)) {
into_cols <- str_separate(pull(out, !!sym(variable_name)), names_to, sep = names_sep)
} else {
into_cols <- str_extract(pull(out, !!sym(variable_name)), into = names_to, regex = names_pattern)
}
out <- mutate(out, !!!into_cols)
# Need to drop variable_name and move names_to vars to correct position
# Recreates relocate logic so only select is necessary, not relocate + select
out_vars <- out$vars
var_idx <- which(out_vars == variable_name)
before_vars <- out_vars[seq_along(out_vars) < var_idx]
after_vars <- out_vars[seq_along(out_vars) > var_idx]
out <- select(out, !!!syms(before_vars), !!!syms(names_to), !!!syms(after_vars))
} else if (!multiple_names_to && uses_dot_value) {
out <- mutate(out, variable = NULL)
}
step_repair(out, repair = names_repair)
}
# ==============================================================================
# inlined from tidyr
# https://github.com/tidyverse/tidyr/issues/1103
# ==============================================================================
# nocov start
# str_extract() -----------------------------------------------------------------
str_extract <- function(x, into, regex, convert = FALSE) {
stopifnot(
is_string(regex),
is_character(into)
)
out <- str_match_first(x, regex)
if (length(out) != length(into)) {
stop(
"`regex` should define ", length(into), " groups; ", ncol(out), " found.",
call. = FALSE
)
}
# Handle duplicated names
if (anyDuplicated(into)) {
pieces <- split(out, into)
into <- names(pieces)
out <- lapply(pieces, pmap_chr, paste0, sep = "")
}
into <- as_utf8_character(into)
non_na_into <- !is.na(into)
out <- out[non_na_into]
names(out) <- into[non_na_into]
if (convert) {
out[] <- lapply(out, utils::type.convert, as.is = TRUE)
}
out
}
str_match_first <- function(string, regex) {
loc <- regexpr(regex, string, perl = TRUE)
loc <- group_loc(loc)
out <- lapply(
seq_len(loc$matches),
function(i) substr(string, loc$start[, i], loc$end[, i])
)
out[-1]
}
group_loc <- function(x) {
start <- cbind(as.vector(x), attr(x, "capture.start"))
end <- start + cbind(attr(x, "match.length"), attr(x, "capture.length")) - 1L
no_match <- start == -1L
start[no_match] <- NA
end[no_match] <- NA
list(matches = ncol(start), start = start, end = end)
}
# str_separate() -----------------------------------------------------------------
str_separate <- function(x, into, sep, convert = FALSE, extra = "warn", fill = "warn") {
if (!is.character(into)) {
abort("`into` must be a character vector")
}
if (is.numeric(sep)) {
out <- strsep(x, sep)
} else if (is_character(sep)) {
out <- data.table::tstrsplit(x, sep, fixed = TRUE, names = TRUE)
out <- as_tibble(out)
} else {
abort("`sep` must be either numeric or character")
}
names(out) <- as_utf8_character(into)
out <- out[!is.na(names(out))]
if (convert) {
out[] <- lapply(out, utils::type.convert, as.is = TRUE)
}
out
}
strsep <- function(x, sep) {
nchar <- nchar(x)
pos <- lapply(sep, function(i) {
if (i >= 0) return(i)
pmax(0, nchar + i)
})
pos <- c(list(0), pos, list(nchar))
lapply(1:(length(pos) - 1), function(i) {
substr(x, pos[[i]] + 1, pos[[i + 1]])
})
}
str_split_n <- function(x, pattern, n_max = -1) {
if (is.factor(x)) {
x <- as.character(x)
}
m <- gregexpr(pattern, x, perl = TRUE)
if (n_max > 0) {
m <- lapply(m, function(x) slice_match(x, seq_along(x) < n_max))
}
regmatches(x, m, invert = TRUE)
}
slice_match <- function(x, i) {
structure(
x[i],
match.length = attr(x, "match.length")[i],
index.type = attr(x, "index.type"),
useBytes = attr(x, "useBytes")
)
}
list_indices <- function(x, max = 20) {
if (length(x) > max) {
x <- c(x[seq_len(max)], "...")
}
paste(x, collapse = ", ")
}
# pmap_chr() -----------------------------------------------------------------
pmap_chr <- function(.l, .f, ...) {
as.character(pmap(.l, .f, ...))
}
# nocov end
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.