#' Split a string into columns
#'
#' @description
#' `r lifecycle::badge("experimental")`
#'
#' Each of these functions takes a string column and splits it into multiple
#' new columns:
#'
#' * `separate_wider_delim()` splits by delimiter.
#' * `separate_wider_position()` splits at fixed widths.
#' * `separate_wider_regex()` splits with regular expression matches.
#'
#' These functions are equivalent to [separate()] and [extract()], but use
#' [stringr](https://stringr.tidyverse.org/) as the underlying string
#' manipulation engine, and their interfaces reflect what we've learned from
#' [unnest_wider()] and [unnest_longer()].
#'
#' @inheritParams unnest_longer
#' @param cols <[`tidy-select`][tidyr_tidy_select]> Columns to separate.
#' @param names_sep If supplied, output names will be composed
#' of the input column name followed by the separator followed by the
#' new column name. Required when `cols` selects multiple columns.
#'
#' For `separate_wider_delim()` you can specify instead of `names`, in which
#' case the names will be generated from the source column name, `names_sep`,
#' and a numeric suffix.
#' @param names For `separate_wider_delim()`, a character vector of output
#' column names. Use `NA` if there are components that you don't want
#' to appear in the output; the number of non-`NA` elements determines the
#' number of new columns in the result.
#' @param delim For `separate_wider_delim()`, a string giving the delimiter
#' between values. By default, it is interpreted as a fixed string; use
#' [stringr::regex()] and friends to split in other ways.
#' @inheritParams rlang::args_dots_empty
#' @param too_few What should happen if a value separates into too few
#' pieces?
#'
#' * `"error"`, the default, will throw an error.
#' * `"debug"` adds additional columns to the output to help you
#' locate and resolve the underlying problem. This option is intended to
#' help you debug the issue and address and should not generally remain in
#' your final code.
#' * `"align_start"` aligns starts of short matches, adding `NA` on the end
#' to pad to the correct length.
#' * `"align_end"` (`separate_wider_delim()` only) aligns the ends of short
#' matches, adding `NA` at the start to pad to the correct length.
#' @param too_many What should happen if a value separates into too many
#' pieces?
#'
#' * `"error"`, the default, will throw an error.
#' * `"debug"` will add additional columns to the output to help you
#' locate and resolve the underlying problem.
#' * `"drop"` will silently drop any extra pieces.
#' * `"merge"` (`separate_wider_delim()` only) will merge together any
#' additional pieces.
#' @param cols_remove Should the input `cols` be removed from the output?
#' Always `FALSE` if `too_few` or `too_many` are set to `"debug"`.
#' @returns A data frame based on `data`. It has the same rows, but different
#' columns:
#'
#' * The primary purpose of the functions are to create new columns from
#' components of the string.
#' For `separate_wider_delim()` the names of new columns come from `names`.
#' For `separate_wider_position()` the names come from the names of `widths`.
#' For `separate_wider_regex()` the names come from the names of
#' `patterns`.
#'
#' * If `too_few` or `too_many` is `"debug"`, the output will contain additional
#' columns useful for debugging:
#'
#' * `{col}_ok`: a logical vector which tells you if the input was ok or
#' not. Use to quickly find the problematic rows.
#' * `{col}_remainder`: any text remaining after separation.
#' * `{col}_pieces`, `{col}_width`, `{col}_matches`: number of pieces,
#' number of characters, and number of matches for `separate_wider_delim()`,
#' `separate_wider_position()` and `separate_regexp_wider()` respectively.
#'
#' * If `cols_remove = TRUE` (the default), the input `cols` will be removed
#' from the output.
#'
#' @export
#' @examples
#' df <- tibble(id = 1:3, x = c("m-123", "f-455", "f-123"))
#' # There are three basic ways to split up a string into pieces:
#' # 1. with a delimiter
#' df %>% separate_wider_delim(x, delim = "-", names = c("gender", "unit"))
#' # 2. by length
#' df %>% separate_wider_position(x, c(gender = 1, 1, unit = 3))
#' # 3. defining each component with a regular expression
#' df %>% separate_wider_regex(x, c(gender = ".", ".", unit = "\\d+"))
#'
#' # Sometimes you split on the "last" delimiter
#' df <- tibble(var = c("race_1", "race_2", "age_bucket_1", "age_bucket_2"))
#' # _delim won't help because it always splits on the first delimiter
#' try(df %>% separate_wider_delim(var, "_", names = c("var1", "var2")))
#' df %>% separate_wider_delim(var, "_", names = c("var1", "var2"), too_many = "merge")
#' # Instead, you can use _regex
#' df %>% separate_wider_regex(var, c(var1 = ".*", "_", var2 = ".*"))
#' # this works because * is greedy; you can mimic the _delim behaviour with .*?
#' df %>% separate_wider_regex(var, c(var1 = ".*?", "_", var2 = ".*"))
#'
#' # If the number of components varies, it's most natural to split into rows
#' df <- tibble(id = 1:4, x = c("x", "x y", "x y z", NA))
#' df %>% separate_longer_delim(x, delim = " ")
#' # But separate_wider_delim() provides some tools to deal with the problem
#' # The default behaviour tells you that there's a problem
#' try(df %>% separate_wider_delim(x, delim = " ", names = c("a", "b")))
#' # You can get additional insight by using the debug options
#' df %>%
#' separate_wider_delim(
#' x,
#' delim = " ",
#' names = c("a", "b"),
#' too_few = "debug",
#' too_many = "debug"
#' )
#'
#' # But you can suppress the warnings
#' df %>%
#' separate_wider_delim(
#' x,
#' delim = " ",
#' names = c("a", "b"),
#' too_few = "align_start",
#' too_many = "merge"
#' )
#'
#' # Or choose to automatically name the columns, producing as many as needed
#' df %>% separate_wider_delim(x, delim = " ", names_sep = "", too_few = "align_start")
separate_wider_delim <- function(
data,
cols,
delim,
...,
names = NULL,
names_sep = NULL,
names_repair = "check_unique",
too_few = c("error", "debug", "align_start", "align_end"),
too_many = c("error", "debug", "drop", "merge"),
cols_remove = TRUE
) {
check_installed("stringr")
check_data_frame(data)
check_required(cols)
check_dots_empty()
check_string(delim, allow_empty = FALSE)
if (is.null(names) && is.null(names_sep)) {
cli::cli_abort("Must specify at least one of {.arg names} or {.arg names_sep}.")
}
check_character(names, allow_null = TRUE)
if (is_named(names)) {
cli::cli_abort("{.arg names} must be an unnamed character vector.")
}
too_few <- arg_match(too_few)
too_many <- arg_match(too_many)
check_bool(cols_remove)
error_call %<~% current_env()
map_unpack(
data, {{ cols }},
function(x, col) str_separate_wider_delim(x, col,
names = names,
delim = delim,
names_sep = names_sep,
too_few = too_few,
too_many = too_many,
cols_remove = cols_remove,
error_call = error_call
),
names_sep = names_sep,
names_repair = names_repair
)
}
str_separate_wider_delim <- function(
x,
col,
names,
delim,
names_sep = NULL,
too_few = "error",
too_many = "error",
cols_remove = TRUE,
error_call = caller_env()
) {
if (is_bare_string(delim)) {
delim <- stringr::fixed(delim)
}
if (too_many == "merge") {
if (is.null(names)) {
cli::cli_abort(
'Must provide {.arg names} when {.code too_many = "merge"}.',
call = error_call
)
}
n <- length(names)
} else {
n <- Inf
}
pieces <- stringr::str_split(x, delim, n = n)
lengths <- lengths(pieces)
n_pieces <- ifelse(is.na(x), NA, lengths)
names <- names %||% as.character(seq_len(int_max(lengths, 0)))
p <- length(names)
check_df_alignment(col, p, "pieces", n_pieces,
too_few = too_few,
too_many = too_many,
advice_short = c(
i = 'Use `too_few = "debug"` to diagnose the problem.',
i = 'Use `too_few = "align_start"/"align_end"` to silence this message.'
),
advice_long = c(
i = 'Use `too_many = "debug"` to diagnose the problem.',
i = 'Use `too_many = "drop"/"merge"` to silence this message.'
),
call = error_call
)
out <- df_align(
x = pieces,
names = names,
align_direction = if (too_few == "align_end") "end" else "start"
)
if (!cols_remove || too_few == "debug" || too_many == "debug") {
out[[col]] <- x
}
if (too_few == "debug" || too_many == "debug") {
separate_warn_debug(col, names_sep, c("ok", "pieces", "remainder"))
sep_loc <- stringr::str_locate_all(x, delim)
sep_last <- map(sep_loc, function(x) if (nrow(x) < p) NA else x[p, "start"])
remainder <- stringr::str_sub(x, sep_last)
remainder[is.na(remainder) & !is.na(x)] <- ""
problem <- !is.na(x) & (
(too_few == "debug" & n_pieces < p) |
(too_many == "debug" & n_pieces > p)
)
out[[debug_name(col, names_sep, "ok")]] <- !problem
out[[debug_name(col, names_sep, "pieces")]] <- n_pieces
out[[debug_name(col, names_sep, "remainder")]] <- remainder
}
out
}
#' @rdname separate_wider_delim
#' @param widths A named numeric vector where the names become column names,
#' and the values specify the column width. Unnamed components will match,
#' but not be included in the output.
#' @export
separate_wider_position <- function(
data,
cols,
widths,
...,
names_sep = NULL,
names_repair = "check_unique",
too_few = c("error", "debug", "align_start"),
too_many = c("error", "debug", "drop"),
cols_remove = TRUE
) {
check_installed("stringr")
check_data_frame(data)
check_required(cols)
check_required(widths)
if (!is_integerish(widths) || !any(have_name(widths))) {
cli::cli_abort("{.arg widths} must be a (partially) named integer vector.")
}
if (any(widths <= 0)) {
cli::cli_abort("All values of {.arg widths} must be positive.")
}
check_dots_empty()
too_few <- arg_match(too_few)
too_many <- arg_match(too_many)
check_bool(cols_remove)
error_call %<~% current_env()
map_unpack(
data, {{ cols }},
function(x, col) str_separate_wider_position(x, col,
widths = widths,
names_sep = names_sep,
too_few = too_few,
too_many = too_many,
cols_remove = cols_remove,
error_call = error_call
),
names_sep = names_sep,
names_repair = names_repair
)
}
str_separate_wider_position <- function(x,
col,
widths,
names_sep = NULL,
too_few = "error",
too_many = "error",
cols_remove = TRUE,
error_call = caller_env()
) {
breaks <- cumsum(c(1L, unname(widths)))[-(length(widths) + 1L)]
expected_width <- sum(widths)
width <- stringr::str_length(x)
check_df_alignment(col, expected_width, "characters", width,
too_few = too_few,
too_many = too_many,
advice_short = c(
i = 'Use `too_few = "debug"` to diagnose the problem.',
i = 'Use `too_few = "align_start"` to silence this message.'
),
advice_long = c(
i = 'Use `too_many = "debug"` to diagnose the problem.',
i = 'Use `too_many = "drop"` to silence this message.'
),
call = error_call
)
skip <- names(widths) == ""
from <- cbind(start = breaks[!skip], length = widths[!skip])
names <- names(widths)[!skip]
pieces <- stringr::str_sub_all(x, from)
pieces <- map(pieces, function(x) x[x != ""])
out <- df_align(
x = pieces,
names = names,
align_direction = if (too_few == "end") "end" else "start"
)
if (!cols_remove || too_few == "debug" || too_many == "debug") {
out[[col]] <- x
}
if (too_few == "debug" || too_many == "debug") {
separate_warn_debug(col, names_sep, c("ok", "width", "remainder"))
problem <- !is.na(x) & (
(too_few == "debug" & width < expected_width) |
(too_many == "debug" & width > expected_width)
)
out[[debug_name(col, names_sep, "width")]] <- width
out[[debug_name(col, names_sep, "remainder")]] <- stringr::str_sub(x, expected_width + 1, width)
out[[debug_name(col, names_sep, "ok")]] <- !problem
}
out
}
#' @rdname separate_wider_delim
#' @param patterns A named character vector where the names become column names
#' and the values are regular expressions that match the contents of the
#' vector. Unnamed components will match, but not be included in the output.
#' @export
separate_wider_regex <- function(
data,
cols,
patterns,
...,
names_sep = NULL,
names_repair = "check_unique",
too_few = c("error", "debug", "align_start"),
cols_remove = TRUE
) {
check_installed("stringr")
check_data_frame(data)
check_required(cols)
check_character(patterns)
if (length(patterns) > 0 && all(!have_name(patterns))) {
cli::cli_abort("{.arg patterns} must be a named character vector.")
}
check_dots_empty()
check_string(names_sep, allow_null = TRUE)
too_few <- arg_match(too_few)
check_bool(cols_remove)
error_call %<~% current_env()
map_unpack(
data, {{ cols }},
function(x, col) str_separate_wider_regex(x, col,
patterns = patterns,
names_sep = names_sep,
too_few = too_few,
cols_remove = cols_remove,
error_call = error_call
),
names_sep = names_sep,
names_repair = names_repair
)
}
str_separate_wider_regex <- function(x,
col,
patterns,
names_sep = NULL,
too_few = "error",
cols_remove = TRUE,
error_call = caller_env()) {
has_name <- names2(patterns) != ""
groups <- stringr::str_c("(", ifelse(has_name, "", "?:"), patterns, ")")
full_match <- stringr::str_c("^", stringr::str_flatten(groups, collapse = ""), "$")
match <- stringr::str_match(x, full_match)
if (ncol(match) != sum(has_name) + 1L) {
cli::cli_abort(c(
"Invalid number of groups.",
i = 'Did you use "()" instead of "(?:)" inside {.arg patterns}?'
), call = error_call)
}
matches <- match[, -1, drop = FALSE]
out <- as_tibble(matches, .name_repair = "none")
colnames(out) <- names2(patterns)[has_name]
if (!cols_remove || too_few == "debug") {
out[[col]] <- x
}
match_count <- ifelse(is.na(x), NA, length(patterns))
remainder <- ifelse(is.na(x), NA, "")
problems <- !is.na(x) & is.na(match[, 1])
no_match <- which(problems)
if (length(no_match) > 0) {
if (too_few == "error") {
cli::cli_abort(c(
"Expected each value of {.var {col}} to match the pattern, the whole pattern, and nothing but the pattern.",
"!" = "{length(no_match)} value{?s} {?has/have} problem{?s}.",
i = 'Use {.code too_few = "debug"} to diagnose the problem.',
i = 'Use {.code too_few = "align_start"} to silence this message.'
), call = error_call)
}
# Progressively relax the matches
for (i in rev(seq_along(groups))) {
partial <- paste0("^", paste(groups[1:i], collapse = ""), "(.*)$")
match <- stringr::str_match(x[no_match], partial)
has_match <- !is.na(match[, 1])
match_idx <- no_match[has_match]
if (length(match_idx) == 0) {
next
}
matches <- as_tibble(match[has_match, -1, drop = FALSE], .name_repair = "none")
cols <- names2(patterns)[has_name][1:(ncol(matches) - 1)]
out[match_idx, cols] <- matches[1:(ncol(matches) - 1)]
remainder[match_idx] <- matches[[ncol(matches)]]
match_count[match_idx] <- i
no_match <- no_match[!has_match]
if (length(no_match) == 0) {
break
}
}
if (length(no_match) > 0) {
match_count[no_match] <- 0L
remainder[no_match] <- x[no_match]
}
}
if (too_few == "debug") {
separate_warn_debug(col, names_sep, c("ok", "matches", "remainder"))
out[debug_name(col, names_sep, "ok")] <- !problems
out[debug_name(col, names_sep, "matches")] <- match_count
out[debug_name(col, names_sep, "remainder")] <- remainder
}
out
}
# helpers -----------------------------------------------------------------
map_unpack <- function(data, cols, fun, names_sep, names_repair, error_call = caller_env()) {
cols <- tidyselect::eval_select(
enquo(cols),
data = data,
allow_rename = FALSE,
allow_empty = FALSE,
error_call = error_call
)
col_names <- names(cols)
for (col in col_names) {
data[[col]] <- fun(data[[col]], col)
}
unpack(
data = data,
cols = all_of(col_names),
names_sep = names_sep,
names_repair = names_repair,
error_call = error_call
)
}
# cf. df_simplify
df_align <- function(
x,
names,
align_direction = c("start", "end")
) {
vec_check_list(x)
if (length(x) == 0) {
return(new_data_frame(rep_named(names, list(character())), size = 0))
}
align_direction <- arg_match(align_direction)
n <- length(x)
p <- length(names)
out <- df_align_transpose(x, p, align_direction = align_direction)
out <- out[!is.na(names)]
names(out) <- names[!is.na(names)]
new_data_frame(out, size = n)
}
df_align_transpose <- function(x, p, align_direction = "start") {
sizes <- list_sizes(x)
# Find the start location of each piece
starts <- cumsum(c(1L, sizes[-length(sizes)]))
# Combine all pieces sequentially, zapping names for performance.
x <- list_unchop(x, ptype = character(), name_spec = zap())
indices <- vector("list", p)
if (align_direction == "start") {
# General idea is to use the `starts` to slice with, replacing them with
# `NA` when we are too short, and then advancing the `starts` by 1 at each
# iteration.
for (i in seq_len(p)) {
small <- sizes < i
index <- starts
index[small] <- NA_integer_
indices[[i]] <- index
starts <- starts + 1L
}
} else {
# `"end"` is a little tricky because we have to hold the start
# location constant if we couldn't use it because the piece was too small.
# Pieces that are too large are automatically ignored.
gap <- sizes - p
for (i in seq_len(p)) {
small <- gap < 0
not_small <- !small
index <- starts
index[small] <- NA_integer_
indices[[i]] <- index
starts[not_small] <- starts[not_small] + 1L
gap <- gap + 1L
}
}
vec_chop(x, indices)
}
check_df_alignment <- function(
col,
p,
type,
sizes,
too_few,
too_many,
advice_short = NULL,
advice_long = NULL,
call = caller_env()) {
n_short <- sum(sizes < p, na.rm = TRUE)
n_long <- sum(sizes > p, na.rm = TRUE)
error_short <- too_few == "error" && n_short > 0
error_long <- too_many == "error" && n_long > 0
if (!error_short && !error_long) {
return()
}
cli::cli_abort(c(
"Expected {p} {type} in each element of {.var {col}}.",
"!" = if (error_short) "{n_short} value{?s} {?was/were} too short.",
if (error_short) advice_short,
"!" = if (error_long) "{n_long} value{?s} {?was/were} too long.",
if (error_long) advice_long
), call = call)
}
separate_warn_debug <- function(col, names_sep, vars) {
vars <- debug_name(col, names_sep, vars)
cli::cli_warn("Debug mode activated: adding variables {.var {vars}}.")
}
debug_name <- function(col, names_sep, var) {
paste0(col, names_sep %||% "_", var)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.