Nothing
#' Concatenate Multiple Variables With Padding
#'
#' @description
#' Concatenate multiple variables inside a data frame into a new variable. An
#' automatic or individual padding can be applied. The padding character can be
#' chosen freely.
#'
#' The function can also be used to give a single variable a padding.
#'
#' @param data_frame A data frame which contains the the variables to concatenate.
#' @param ... The names of the variables to concatenate.
#' @param padding_char A single character which will be used to fill up the empty places.
#' @param padding_length A numeric vector containing the individual padding length per variable.
#' @param padding_right FALSE by default. If TRUE insert padding characters on the right side
#' instead of the left side.
#'
#' @return
#' Returns a character vector.
#'
#' @seealso
#' Other character manipulating functions: [sub_string()], [remove_blanks()]
#'
#' @examples
#' # Example data frame
#' my_data <- dummy_data(100)
#'
#' # Concatenate variables as provided
#' my_data[["id1"]] <- my_data |> concat(household_id, state, age)
#'
#' # Concatenate variables with leading zeros. Each variable will
#' # receive an individual padding length according to their
#' # longest value.
#' my_data[["id2"]] <- my_data |> concat(household_id, state, age,
#' padding_char = "0")
#'
#' # Concatenate variables with individual character and lengths.
#' my_data[["id2"]] <- my_data |> concat(household_id, state, age,
#' padding_char = "_",
#' padding_length = c(5, 3, 4))
#'
#' # Padding a single variable in place
#' my_data[["state"]] <- my_data |> concat(state, padding_char = "0")
#'
#' @export
concat <- function(data_frame,
...,
padding_char = NULL,
padding_length = NULL,
padding_right = FALSE){
variables <- dots_to_char(...)
# If no padding is defined just concatenate provided variables as they are
if (is.null(padding_char) && is.null(padding_length)){
return(do.call(paste0, data_frame[variables]))
}
# Padding character may only be of length one
if (!is.null(padding_char) && collapse::vlengths(padding_char) != 1){
message(" ! WARNING: <Padding chararacter> must be a single character. Concat will be done without <padding character>.")
return(do.call(paste0, data_frame[variables]))
}
# If no padding character is given use a blank
if (is.null(padding_char)){
padding_char <- " "
}
# Determine padding lengths. If NULL all given variables will be checked individually
# for their maximum length.
if (is.null(padding_length)){
padding_length <- vapply(data_frame[variables],
function(variable){
collapse::fmax(nchar(variable), na.rm = TRUE)
},
integer(1))
}
# With given padding length it has to be checked whether the provided vector
# is to long or to short in comparison to the number of provided variables.
# Either way it has to be matched to the number of provided variables. Only if
# it is of equal length nothing happens here.
else{
number_of_columns <- length(variables)
number_of_paddings <- length(padding_length)
# If the padding vector is shorter than the number of provided variables,
# the remaining variables will receive the individual maximum length as
# padding length like above.
if (number_of_paddings < number_of_columns){
message(" ! WARNING: <Padding length> is shorter than the number of variables.\n",
" Missing lengths will be filled up using maximum individual variable length.")
missing_length <- vapply(data_frame[variables[(number_of_paddings + 1):number_of_columns]],
function(variable){
collapse::fmax(nchar(variable), na.rm = TRUE)
},
integer(1))
padding_length <- c(padding_length, missing_length)
}
# If the padding vector is longer than the number of provided variables,
# juts trim it down.
else if (number_of_paddings > number_of_columns){
message(" ! WARNING: <Padding length> is longer than the number of variables.\n",
" Extra lengths will be ignored.")
padding_length <- padding_length[seq_len(number_of_columns)]
}
}
# Apply padding to all variables individually
padded_variables <- Map(function(variable, padding_length){
# NA values are just filled up with padding character
variable[is.na(variable)] <- strrep(padding_char, padding_length)
# Get the number of places to fill up with padding character per observation
pad_per_observation <- pmax(padding_length - collapse::vlengths(variable), 0)
# Concatenate padding and variable values
if (!padding_right){
paste0(strrep(padding_char, pad_per_observation), variable)
}
else{
paste0(variable, strrep(padding_char, pad_per_observation))
}
},
data_frame[variables],
padding_length
)
# Concatenate all padded variables together to one single variable
do.call(paste0, padded_variables)
}
#' Retrieve A Substring From A Character
#'
#' @description
#' [sub_string()] can extract parts of a character from the left side, right side
#' or from the middle. It is also able to start or end at specific letter sequences
#' instead of positions.
#'
#' @param data_frame A data frame which contains the variables to concatenate.
#' @param variable A character variable to extract parts from.
#' @param from The names of the variables to concatenate.
#' @param to A single character which will be used to fill up the empty places.
#' @param case_sensitive TRUE by default. When a character expression is passed as
#' from or to it makes a difference whether a letter is written in upper or lower case.
#' Pass FALSE to handle upper and lower case equaly.
#'
#' @return
#' Returns parts of a character vector.
#'
#' @seealso
#' Other character manipulating functions: [concat()], [remove_blanks()]
#'
#' @examples
#' # Example data frame
#' my_data <- dummy_data(100)
#'
#' # Extract text from the left
#' my_data[["left"]] <- my_data |> sub_string(education, to = 2)
#'
#' # Extract text from the right
#' my_data[["right"]] <- my_data |> sub_string(education, from = 2)
#'
#' # Extract text from the middle
#' my_data[["middle"]] <- my_data |> sub_string(education, from = 2, to = 3)
#'
#' # Find text and extract from the left
#' my_data[["left2"]] <- my_data |> sub_string(education, to = "l")
#'
#' # Find text and extract from the right
#' my_data[["right2"]] <- my_data |> sub_string(education, from = "l")
#'
#' # Find text and extract from the middle
#' my_data[["middle2"]] <- my_data |> sub_string(education, from = "i", to = "l")
#'
#' @export
sub_string <- function(data_frame,
variable,
from = NULL,
to = NULL,
case_sensitive = TRUE){
variable <- get_origin_as_char(variable, substitute(variable))
# If no value variables are provided abort
if (length(variable) <= 1){
if (length(variable) == 0 || variable == ""){
message(" X ERROR: No <variables> provided. Blank removal will be aborted.")
return(invisible(NULL))
}
}
# Adjust variable
if (length(variable) > 1){
message(" ! WARNING: <Variable> may only be of length one. The first Element will be used.")
variable <- variable[[1]]
}
# Make sure that the variable provided is part of the data frame
variable <- data_frame |> part_of_df(variable, check_only = TRUE)
if (is.list(variable)){
message(" X ERROR: The provided <variable> '", variable[[1]], "' is not part of\n",
" the data frame. Substring will be aborted.")
return(invisible(NULL))
}
if (!is.character(data_frame[[variable]])){
message(" X ERROR: <Variable> type must be character. Substring will be aborted.")
return(invisible(NULL))
}
if (is.null(from) && is.null(to)){
message(" X ERROR: Neither <from> nor <to> is provided. Substring will be aborted.")
return(invisible(NULL))
}
# Adjust from and length
if (length(to) > 1){
message(" ! WARNING: <To> may only be of length one. The first Element will be used.")
to <- to[[1]]
}
if (length(from) > 1){
message(" ! WARNING: <From> may only be of length one. The first Element will be used.")
from <- from[[1]]
}
variable_vector <- data_frame[[variable]]
# If to is a character, extract the text up until the first match of to
if (is.character(to)){
# If no match is found, to is adjusted to retrieve the whole text
if (case_sensitive){
to <- regexpr(to, variable_vector)
}
else{
to <- regexpr(tolower(to), tolower(variable_vector))
}
to <- data.table::fifelse(to == -1, 9999, to)
}
# If from is a character, extract the text up until the first match of from
if (is.character(from)){
# If no match is found, from is adjusted to retrieve the whole text
if (case_sensitive){
from <- regexpr(from, variable_vector)
}
else{
from <- regexpr(tolower(from), tolower(variable_vector))
}
from <- data.table::fifelse(from == -1, 1, from)
}
# In case no from position is given, substring starts at position 1 up to to-position
if (is.null(from)){
sub_variable <- substring(variable_vector, 1, to)
}
# In case no to position is given, substring starts at from and goes to the end
else if (is.null(to)){
sub_variable <- substring(variable_vector, from, nchar(variable_vector))
}
# In case both positions are given, extract text in between these two points
else{
sub_variable <- substring(variable_vector, from, to)
}
sub_variable
}
#' Remove Blanks
#'
#' @description
#' Removes leading and trailing blanks or both. Can also remove all blanks from a
#' character or normalize multiple blanks to single ones.
#'
#' @param data_frame A data frame which contains the character variables from which blanks
#' should be removed.
#' @param variable Variable name of the one from which to remove blanks.
#' @param which "all" by default. Can be "leading", "trailing", "trim", "normalize" or "all".
#' Determines which blanks should be removed
#'
#' @return
#' Returns a character vector with removed blanks.
#'
#' @seealso
#' Other character manipulating functions: [concat()], [sub_string()]
#'
#' @examples
#' # Example data frame
#' my_data <- dummy_data(100)
#' my_data[["blanks"]] <- " This is a test "
#'
#' # Remove blanks
#' my_data[["leading"]] <- my_data |> remove_blanks(blanks, which = "leading")
#' my_data[["trailing"]] <- my_data |> remove_blanks(blanks, which = "trailing")
#' my_data[["trim"]] <- my_data |> remove_blanks(blanks, which = "trim")
#' my_data[["all"]] <- my_data |> remove_blanks(blanks, which = "all")
#' my_data[["normalize"]] <- my_data |> remove_blanks(blanks, which = "normalize")
#'
#' @export
remove_blanks <- function(data_frame,
variable,
which = "all"){
# Convert to character vectors
variable <- get_origin_as_char(variable, substitute(variable))
# If no value variable are provided abort
if (length(variable) <= 1){
if (length(variable) == 0 || variable == ""){
message(" X ERROR: No <variable> provided. Blank removal will be aborted.")
return(invisible(NULL))
}
}
# Adjust variable
if (length(variable) > 1){
message(" ! WARNING: <Variable> may only be of length one. The first Element will be used.")
variable <- variable[[1]]
}
# Make sure that the provided variable is part of the data frame.
variable <- data_frame |> part_of_df(variable)
if (length(variable) == 0){
message(" X ERROR: No valid <variable> provided. Blank removal will be aborted.")
return(invisible(NULL))
}
# Abort if non-character variable are selected
variable_vector <- unlist(data_frame[variable])
if (!is.character(variable_vector[[1]])){
message(" X ERROR: Blank removal only works with a character <variable>. Blank removal will be aborted.")
return(invisible(NULL))
}
# Check if which is a valid option
which <- tolower(which)
if (!which %in% c("trim", "leading", "trailing", "all", "normalize")){
message(" ! WARNING: Invalid option for <which> provided. Allowed are 'trim', 'leading', 'trailing', 'all' and\n",
" 'normalize'. 'all' will be used.")
which <- "all"
}
# Remove blanks according to selected option
variable_vector <- switch(which,
trim = trimws(variable_vector, which = "both"),
leading = trimws(variable_vector, which = "left"),
trailing = trimws(variable_vector, which = "right"),
all = gsub(" +", "", variable_vector),
normalize = {
variable_vector <- trimws(variable_vector)
gsub(" +", " ", variable_vector)
})
variable_vector
}
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.