# R/text_util_fun.R
# 
# Defines functions chars_to_text words_to_text text_to_chars text_to_words_regex text_to_words text_to_sentences collapse_chars
# 
# Documented in chars_to_text collapse_chars text_to_chars text_to_sentences text_to_words words_to_text

## text_util_fun.R | ds4psy
## hn | uni.kn | 2022 06 28
## ---------------------------

## (0) Utility functions for string manipulation and text/character objects. ------ 

## (A) Defining character vectors and strings of text: ---------- 

# Umlaute / German umlauts: ------ 

# Sources: For Unicode characters, see:
# <https://home.unicode.org/>
# <https://www.unicode.org/charts/>
# <https://en.wikipedia.org/wiki/List_of_Unicode_characters>

# All 7 letters as a named vector (Unicode escapes keep this file ASCII-only). 
# Names are the corresponding base letters ("s" denotes the sharp s): 
umlaut <- c(a = "\u00E4",  # ä
            o = "\u00F6",  # ö
            u = "\u00FC",  # ü
            A = "\u00C4",  # Ä
            O = "\u00D6",  # Ö
            U = "\u00DC",  # Ü
            s = "\u00DF")  # ß

# Individual letters (as unnamed strings): 
uml_a <- umlaut[["a"]]
uml_o <- umlaut[["o"]]
uml_u <- umlaut[["u"]]

uml_A <- umlaut[["A"]]
uml_O <- umlaut[["O"]]
uml_U <- umlaut[["U"]]

uml_s <- umlaut[["s"]]

## Check:
# umlaut
# names(umlaut)

# paste(umlaut, collapse = " ")
# paste0("Hansj", umlaut["o"], "rg i", umlaut["s"], "t gern s", umlaut["u"], "sse ", umlaut["A"], "pfel.")


#' Umlaut provides German Umlaut letters (as Unicode characters). 
#' 
#' \code{Umlaut} provides the German Umlaut letters (aka. diaeresis/diacritic) 
#' as a named character vector. 
#' 
#' The elements of \code{Umlaut} are named by their base letters 
#' (\code{"a"}, \code{"o"}, \code{"u"}, \code{"A"}, \code{"O"}, \code{"U"}), 
#' with \code{"s"} denoting the sharp s. 
#' 
#' For Unicode details, see 
#' \url{https://home.unicode.org/}, 
#' \url{https://www.unicode.org/charts/}, and 
#' \url{https://en.wikipedia.org/wiki/List_of_Unicode_characters}. 
#' 
#' For details on German Umlaut letters (aka. diaeresis/diacritic), see 
#' \url{https://en.wikipedia.org/wiki/Diaeresis_(diacritic)} and 
#' \url{https://en.wikipedia.org/wiki/Germanic_umlaut}. 
#' 
#' @examples
#' Umlaut
#' names(Umlaut)
#' 
#' paste0("Hansj", Umlaut["o"], "rg i", Umlaut["s"], "t s", Umlaut["u"], "sse ", Umlaut["A"], "pfel.")
#' paste0("Das d", Umlaut["u"], "nne M", Umlaut["a"], "dchen l", Umlaut["a"], "chelt.")
#' paste0("Der b", Umlaut["o"], "se Mann macht ", Umlaut["u"], "blen ", Umlaut["A"], "rger.")
#' paste0("Das ", Umlaut["U"], "ber-Ich ist ", Umlaut["a"], "rgerlich.")
#' 
#' @family text objects and functions
#' 
#' @export

Umlaut <- umlaut 

## Check:
# Umlaut
# names(Umlaut)

## Apply:
# paste(Umlaut, collapse = " ")
# paste0("Hansj", Umlaut["o"], "rg i", Umlaut["s"], "t gern s", Umlaut["u"], "sse ", Umlaut["A"], "pfel.")
# paste0("Das d", Umlaut["u"], "nne M", Umlaut["a"], "dchen l", Umlaut["a"], "chelt sch", Umlaut["o"], "n.")
# paste0("Der b", Umlaut["o"], "se Mann macht ", Umlaut["u"], "blen ", Umlaut["A"], "rger.")


# metachar: Meta-characters of extended regular expressions (in R): ------ 

# metachar provides the metacharacters of extended regular expressions (as a character vector)
# See documentation to ?regex 

# All 12 metacharacters in 1 string (separated by single spaces). 
# Note: The backslash must be written as "\\" in R source code 
# (a lone backslash followed by a space is an unrecognized escape and fails to parse): 
metas <- c(". \\ | ( ) [ { ^ $ * + ?")

# as vector (splitting at the separating spaces):
mv <- unlist(strsplit(metas, split = " "))
# mv

mv[2] <- "\\"  # ensure that element 2 is exactly 1 backslash
# mv
# mv

## Check:
# writeLines(mv)
# nchar(paste0(mv, collapse = ""))  # 12


#' metachar provides metacharacters (as a character vector). 
#' 
#' \code{metachar} provides the metacharacters of extended regular expressions 
#' (as a character vector).
#' 
#' Each of the 12 elements of \code{metachar} is a single character 
#' (i.e., one metacharacter per element). 
#' 
#' \code{metachar} allows illustrating the notion of 
#' meta-characters in regular expressions 
#' (and provides corresponding exemplars). 
#' 
#' See \code{?base::regex} for details on regular expressions 
#' and \code{?"'"} for a list of character constants/quotes in R.
#' 
#' @examples
#' metachar
#' length(metachar)  # 12
#' nchar(paste0(metachar, collapse = ""))  # 12
#' 
#' @family text objects and functions
#' 
#' @seealso
#' \code{\link{cclass}} for a vector of character classes. 
#' 
#' @export

metachar <- mv  # exported copy of the internal vector mv (defined above)

# ## Check:
# metachar
# length(metachar)  # 12
# nchar(paste0(metachar, collapse = ""))  # 12
# # writeLines(metachar)


# cclass: A (named) vector of different character classes (in R): ------ 

# letters:
ltr <- paste(letters, collapse = "")  # lowercase: "abc...xyz"
LTR <- paste(LETTERS, collapse = "")  # uppercase: "ABC...XYZ"

# digits: "0123456789"
dig <- paste(0:9, collapse = "")
# hex: "0123456789ABCDEFabcdef"
hex <- paste(c(0:9, LETTERS[1:6], letters[1:6]), collapse = "")

# punctuation:
# pun <- "! # $ % & ' ( ) * + , - . / : ; < = > ? @ [ \\ ] ^ _ ` { | } ~"  # with space
pun <- "!#$%&'()*+,-./:;<=>?@[\\]^_`{|}~"  # w/o space

# spaces (4 different kinds):
sp1 <- " "   # space
sp2 <- "\t"  # tab
sp3 <- "\n"  # new line
sp4 <- "\r"  # carriage return
# spc <- paste(sp1, sp2, sp3, sp4)  # with space (as paste() inserts its default sep = " ")
# Note: paste(sp1, sp2, sp3, sp4, collapse = "") would still insert sep = " " between 
# its 4 arguments (as collapse only joins the elements of a multi-element result). 
spc <- paste0(sp1, sp2, sp3, sp4)  # w/o space: exactly the 4 characters

# Combine (to ccv):
ccv <- c(ltr, LTR, dig, hex, pun, spc)
names(ccv) <- c("ltr", "LTR", "dig", "hex", "pun", "spc")

## Check: 
# ccv
# ccv["hex"]  # select by name
# writeLines(ccv["pun"])
# stringr::str_view(ccv, "\\.", match = TRUE)
# stringr::str_view(ccv, "\\\\", match = TRUE)
# stringr::str_view(ccv, "[:punct:]", match = TRUE)
# stringr::str_view(ccv, "[:space:]", match = TRUE)
# stringr::str_view(ccv, "[:blank:]", match = TRUE)
# stringr::str_view(ccv, "\t", match = TRUE)
# grep("\r", ccv, value = TRUE)
#
## Note: See ?"'" for character constants in R.


#' cclass provides character classes (as a named vector).
#' 
#' \code{cclass} provides different character classes  
#' (as a named character vector).
#' 
#' Each element of \code{cclass} is a single string 
#' containing all characters of a class: 
#' 
#' \itemize{
#'   \item \code{ltr}: lowercase letters; 
#'   \item \code{LTR}: uppercase letters; 
#'   \item \code{dig}: digits (0 to 9); 
#'   \item \code{hex}: hexadecimal digits (0 to 9, A to F, a to f); 
#'   \item \code{pun}: punctuation characters; 
#'   \item \code{spc}: space characters (space, tab, new line, carriage return). 
#' }
#' 
#' \code{cclass} allows illustrating matching 
#' character classes via regular expressions. 
#' 
#' See \code{?base::regex} for details on regular expressions 
#' and \code{?"'"} for a list of character constants/quotes in R. 
#' 
#' @examples
#' cclass["hex"]  # select by name
#' writeLines(cclass["pun"])
#' grep("[[:alpha:]]", cclass, value = TRUE)
#' 
#' @family text objects and functions
#' 
#' @seealso
#' \code{\link{metachar}} for a vector of metacharacters. 
#' 
#' @export

cclass <- ccv  # exported copy of the internal vector ccv (defined above)

## Check:
# cclass
# cclass["hex"]  # select by name
# writeLines(cclass["pun"])
# grep("[[:alpha:]]", cclass, value = TRUE)
# grep("[[:space:]]", cclass, value = TRUE)


# Other/specific text elements (for ds4psy course materials): ------ 

# course_title     <- paste0("Data science for psychologists")
# course_title_abb <- paste0("ds4psy")
# # psi <- expression(psi)
# name_hn <- "Hansjoerg Neth"
# name_course <- paste0(course_title, " (", course_title_abb, "), by ", name_hn, "")

# # Table of contents (ToC) [Spring 2020]: 
# toc <- tibble::tribble(
#   ~nr, ~lbl,                 ~val,  ~part,   
#    0,  "Introduction",          2,  0,
#    1,  "R basics",             10,  0, # was: "Basic R concepts and commands",
#    2,  "Visualizing data",      8,  1, 
#    3,  "Transforming data",     9,  1, 
#    4,  "Exploring data",       10,  1, # was: "Exploring data (EDA)"
#    5,  "Tibbles",               6,  2, 
#    6,  "Importing data",        5,  2,  
#    7,  "Tidying data",          7,  2,  
#    8,  "Joining data",          6,  2,  
#    9,  "Text data",             9,  2, # increased value 
#   10,  "Time data",             8,  2, # increased value 
#   11,  "Functions",            10,  3, 
#   12,  "Iteration",             8,  3)
# 
# # toc  # used in plot_tbar() and plot_tclock()




## (B) Converting text strings (between units, e.g., sentences, words, characters): ---------- 


# collapse_chars: Turn a multi-element character string into a 1-element character string: ------ 

# Goal: A utility function to ensure that multi-element text inputs are handled consistently.
# Note: sep is ONLY used when collapsing multi-element strings and inserted BETWEEN elements. 

#' Collapse character inputs \code{x} into a single string. 
#' 
#' \code{collapse_chars} converts multi-element character inputs \code{x} 
#' into a single string of text (i.e., a character object of length 1), 
#' separating its elements by \code{sep}. 
#' 
#' As \code{collapse_chars} is a wrapper around 
#' \code{paste(x, collapse = sep)}, 
#' it preserves spaces within the elements of \code{x}. 
#' 
#' The separator \code{sep} is only used when collapsing multi-element vectors 
#' and inserted between elements. 
#' Inputs with fewer than two elements are only converted to 
#' character (by \code{as.character}) and returned without using \code{sep}. 
#' 
#' See \code{\link{chars_to_text}} for combining character vectors into text. 
#' 
#' @param x A vector (required), typically a character vector. 
#' 
#' @param sep A character inserted as separator/delimiter 
#' between elements when collapsing multi-element strings of \code{x}.  
#' Default: \code{sep = " "} (i.e., insert 1 space between elements). 
#' 
#' @return A character vector (of length 1, 
#' or of length 0 if \code{x} is empty). 
#' 
#' @examples
#' collapse_chars(c("Hello", "world", "!"))
#' collapse_chars(c("_", " _ ", "  _  "), sep = "|")  # preserves spaces
#' writeLines(collapse_chars(c("Hello", "world", "!"), sep = "\n"))
#' collapse_chars(1:3, sep = "")
#' 
#' @family text objects and functions
#'
#' @seealso
#' \code{\link{chars_to_text}} for combining character vectors into text; 
#' \code{\link{text_to_chars}} for splitting text into a vector of characters; 
#' \code{\link{text_to_words}} for splitting text into a vector of words; 
#' \code{\link{strsplit}} for splitting strings. 
#' 
#' @export

collapse_chars <- function(x, sep = " "){
  
  # Coerce inputs (e.g., numbers) into character strings: 
  chars <- as.character(x)
  
  # Fewer than 2 elements: Nothing to collapse (and NO use of sep): 
  if (length(chars) <= 1){
    return(chars)
  }
  
  # 2+ elements: Join all elements (ADDING sep between neighbors): 
  # (Use "" OR " " OR "\n" "\r" "\t" as sep; see ?"'" for character constants in R.)
  return(paste(chars, collapse = sep))
  
} # collapse_chars(). 

## Check:
# collapse_chars(c("Hello", "world", "!"))
# collapse_chars(c("_", " _ ", "  _  "), sep = "|")  # preserves spaces
# writeLines(collapse_chars(c("Hello", "world", "!"), sep = "\n"))  # new line sep
# collapse_chars(1:3, sep = " ")  # works for numeric vectors!
# collapse_chars("Hello there")
# collapse_chars(123)
# # Special cases:
# collapse_chars(NA)
# collapse_chars("")



# text_to_sentences: Turn a text (consisting of one or more strings) into a vector of all its sentences: ------ 

#' Split strings of text \code{x} into sentences. 
#' 
#' \code{text_to_sentences} splits text \code{x} 
#' (consisting of one or more character strings) 
#' into a vector of its constituting sentences. 
#' 
#' The splits of \code{x} will occur at given punctuation marks 
#' (provided as a regular expression, default: \code{split_delim = "\\.|\\?|!"}).   
#' Leading and trailing spaces of each part are removed 
#' (and empty parts are dropped) before returning 
#' a vector of the remaining character sequences (i.e., the sentences).
#' 
#' The Boolean argument \code{force_delim} distinguishes between 
#' two splitting modes: 
#' 
#' \enumerate{
#' 
#'   \item If \code{force_delim = FALSE} (as per default), 
#'   a standard sentence-splitting pattern is assumed: 
#'   A sentence delimiter in \code{split_delim} must be followed by 
#'   one or more blank spaces and a capital letter starting the next sentence. 
#'   Sentence delimiters in \code{split_delim} are not removed 
#'   from the output.
#'   
#'   \item If \code{force_delim = TRUE}, 
#'   the function enforces splits at each delimiter in \code{split_delim}. 
#'   For instance, any dot (i.e., the metacharacter \code{"\\."}) is  
#'   interpreted as a full stop, so that sentences containing dots 
#'   mid-sentence (e.g., for abbreviations, etc.) are split into parts. 
#'   Sentence delimiters in \code{split_delim} are removed 
#'   from the output.
#'   
#'   }
#' 
#' Internally, \code{text_to_sentences} first uses \code{\link{collapse_chars}} 
#' to collapse strings (adding \code{sep} between elements) and then 
#' \code{\link{strsplit}} to split strings at \code{split_delim}.
#' 
#' @param x A string of text (required), 
#' typically a character vector. 
#' 
#' @param sep A character inserted as separator/delimiter 
#' between elements when collapsing multi-element strings of \code{x}.  
#' Default: \code{sep = " "} (i.e., insert 1 space between elements). 
#' 
#' @param split_delim Sentence delimiters (as regex) 
#' used to split the collapsed string of \code{x} into substrings. 
#' Default: \code{split_delim = "\\.|\\?|!"} (rather than \code{"[[:punct:]]"}).  
#' 
#' @param force_delim Boolean: Enforce splitting at \code{split_delim}? 
#' If \code{force_delim = FALSE} (as per default), 
#' a standard sentence-splitting pattern is assumed: 
#' \code{split_delim} is followed by one or more blank spaces and a capital letter. 
#' If \code{force_delim = TRUE}, splits at \code{split_delim} are 
#' enforced (without considering spacing or capitalization).
#' 
#' @return A character vector (of sentences). 
#' 
#' @examples
#' x <- c("A first sentence. Exclamation sentence!", 
#'        "Any questions? But etc. can be tricky. A fourth --- and final --- sentence.")
#' text_to_sentences(x)
#' text_to_sentences(x, force_delim = TRUE)
#' 
#' # Changing split delimiters:
#' text_to_sentences(x, split_delim = "\\.")  # only split at "."
#' 
#' text_to_sentences("Buy apples, berries, and coconuts.")
#' text_to_sentences("Buy apples, berries; and coconuts.", 
#'                   split_delim = ",|;|\\.", force_delim = TRUE)
#'                   
#' text_to_sentences(c("123. 456? 789! 007 etc."), force_delim = TRUE)
#' 
#' # Split multi-element strings (w/o punctuation):
#' e3 <- c("12", "34", "56")
#' text_to_sentences(e3, sep = " ")  # Default: Collapse strings adding 1 space, but: 
#' text_to_sentences(e3, sep = ".", force_delim = TRUE)  # insert sep and force split.
#' 
#' # Punctuation within sentences:
#' text_to_sentences("Dr. who is left intact.")
#' text_to_sentences("Dr. Who is problematic.")
#' 
#' @family text objects and functions
#'
#' @seealso
#' \code{\link{text_to_words}} for splitting text into a vector of words; 
#' \code{\link{text_to_chars}} for splitting text into a vector of characters; 
#' \code{\link{count_words}} for counting the frequency of words; 
#' \code{\link{strsplit}} for splitting strings. 
#' 
#' @export

text_to_sentences <- function(x,  # string(s) of text
                              sep = " ",  # separator/delimiter inserted between multi-element strings of x 
                              split_delim = "\\.|\\?|!",  # sentence delimiters used (as regex). ToDo: Consider "[[:punct:]]".
                              force_delim = FALSE         # force split at delimiters
){
  
  # 1. Handle inputs: Paste all into one string (ADDING sep between elements): 
  x1 <- as.character(x)
  x2 <- collapse_chars(x = x1, sep = sep)
  
  # 2. Main: Define the regex for splitting at SENTENCE punctuation: 
  if (!force_delim){ # more specific: 
    
    # (A) Smart splitting: Expect well-formatted pattern: 
    #     Sentence delimiter (kept, due to look-behind), 1 or more spaces (removed), 
    #     and a capital letter to start the next sentence (kept, due to look-ahead): 
    split_regex <- paste0("(?<=(", split_delim, "))\\s{1,}(?=[A-Z])")
    
  } else { # more general: 
    
    # (B) Force split at (and remove) any delimiter provided by split_delim 
    #     (e.g., if multiple spaces, or no capitalization of first letter in sentence): 
    split_regex <- split_delim
    
  }
  
  x3 <- unlist(strsplit(x2, split = split_regex, perl = TRUE))
  
  # 3. Post-process split sentences:
  #    a. Remove LEADING and TRAILING spaces. 
  #       Note: A pattern like "$( ){1,}" (i.e., spaces AFTER the end of a string) 
  #       can never match, so that the end anchor $ must FOLLOW the spaces: 
  x4 <- gsub(pattern = "^( ){1,}|( ){1,}$", replacement = "", x = x3)
  
  #    b. Remove all instances of "": 
  st <- x4[x4 != ""]
  
  # 4. Output: 
  return(st)
  
} # text_to_sentences(). 

## Check:
# x <- c("A first sentence. Exclamation sentence!",
#        "Any questions? But etc. can be tricky. A fourth --- and final --- sentence.")
# text_to_sentences(x)
# text_to_sentences(x, force_delim = TRUE)
# 
# # Number of spaces between sentences:
# s1 <- c("One space! Between sentences.", "Split ok?")
# text_to_sentences(s1)
# s2 <- c("Two or more spaces!  Between sentences. ", " Split ok?")
# text_to_sentences(s2)
# 
# # Changing split delimiters:
# text_to_sentences(x, split_delim = "\\.")  # only split at "."
# 
# text_to_sentences("Buy apples, berries, and coconuts.")
# text_to_sentences("Buy apples, berries; and coconuts.",
#                   split_delim = ",|;|\\.", force_delim = TRUE)
# 
# text_to_sentences(c("123. 456? 789! 007 etc."), force_delim = TRUE)
# 
# # Splitting multi-element strings (w/o punctuation):
# text_to_sentences(c("123", "456", "789"), sep = " ")  # Default: collapse strings with 1 added space, but:
# text_to_sentences(c("123", "456", "789"), sep = ".", force_delim = TRUE)  # inserts sep and forces split.
# 
# # Punctuation within sentences:
# text_to_sentences("Dr. who is left intact.")
# text_to_sentences("Dr. Who is problematic.")


# text_to_words: Turn a text (consisting of one or more strings) into a vector of its words: ------ 

#' Split string(s) of text \code{x} into words. 
#' 
#' \code{text_to_words} splits a string of text \code{x} 
#' (consisting of one or more character strings) 
#' into a vector of its constituting words. 
#' 
#' \code{text_to_words} removes all (standard) punctuation marks 
#' and empty spaces in the resulting text parts, 
#' before returning a vector of the remaining character symbols 
#' (as its words).
#' 
#' Internally, \code{text_to_words} uses \code{\link{strsplit}} to 
#' split strings at punctuation marks (\code{split = "[[:punct:]]"}) 
#' and blank spaces (\code{split = "( ){1,}"}), 
#' and then drops all empty strings.
#'
#' @param x A string of text (required), 
#' typically a character vector. 
#' 
#' @return A character vector (of words). 
#'
#' @examples
#' # Default: 
#' x <- c("Hello!", "This is a 1st sentence.", "This is the 2nd sentence.", "The end.")
#' text_to_words(x)
#' 
#' @family text objects and functions
#'
#' @seealso
#' \code{\link{words_to_text}} for combining words into a text; 
#' \code{\link{text_to_sentences}} for splitting text into a vector of sentences;  
#' \code{\link{text_to_chars}} for splitting text into a vector of characters;  
#' \code{\link{count_words}} for counting the frequency of words; 
#' \code{\link{strsplit}} for splitting strings. 
#' 
#' @export

text_to_words <- function(x){
  
  # Inputs (as character strings): 
  txt <- as.character(x)
  
  # Split twice: First at any punctuation mark, then at any run of 1+ spaces 
  # (so that both punctuation and spaces are removed): 
  no_punct <- unlist(strsplit(txt, split = "[[:punct:]]"))
  tokens   <- unlist(strsplit(no_punct, split = "( ){1,}"))
  
  # Output (dropping the empty strings left over from splitting): 
  return(tokens[tokens != ""])
  
} # text_to_words(). 

## Check:
# s3 <- c("A first sentence.", "The second sentence.",
#         "A third --- and also the final --- sentence.")
# (wv <- text_to_words(s3))


# text_to_words_regex: Alternative to text_to_words (using 1 regex): -------- 

# (Note: Currently not exported, and not used.)

text_to_words_regex <- function(x){
  
  # Locate all runs of 1+ word characters in each element of x: 
  word_pos <- gregexpr(pattern = "\\w+", text = x)
  
  # Extract the located matches (as 1 flat vector of words): 
  unlist(regmatches(x, word_pos))
  
}

## Check:
# s2 <- c("This is  a  test.", "Does this work?")
# text_to_words_regex(s2)
# text_to_words_regex(s3)


# text_to_chars: Turn a text (consisting of one or more strings) into a vector of its characters: ------ 

#' Split string(s) of text \code{x} into its characters. 
#' 
#' \code{text_to_chars} splits a string of text \code{x} 
#' (consisting of one or more character strings) 
#' into a vector of its individual characters.  
#' 
#' If \code{rm_specials = TRUE}, 
#' most special (or non-word) characters are 
#' removed (i.e., spaces, hyphens, the punctuation marks 
#' comma, semicolon, colon, dot, exclamation and question mark, 
#' as well as parentheses, brackets, braces, and angle brackets). 
#' (Note that this currently works 
#' without using regular expressions.)
#' 
#' If all elements of \code{x} are \code{NA}, 
#' \code{x} is returned unaltered. 
#' 
#' \code{text_to_chars} is an inverse function of \code{\link{chars_to_text}}. 
#' 
#' @param x A string of text (required).
#' 
#' @param rm_specials Boolean: Remove special characters? 
#' Default: \code{rm_specials = FALSE}. 
#' 
#' @param sep Character to insert between the elements 
#' of a multi-element character vector as input \code{x}? 
#' Default: \code{sep = ""} (i.e., add nothing).
#'
#' @return A character vector (containing individual characters). 
#'
#' @examples
#' s3 <- c("A 1st sentence.", "The 2nd sentence.",
#'         "A 3rd --- and  FINAL --- sentence.")
#' text_to_chars(s3)
#' text_to_chars(s3, sep = "\n")
#' text_to_chars(s3, rm_specials = TRUE) 
#'
#' @family text objects and functions
#' 
#' @aliases str2vec
#'
#' @seealso
#' \code{\link{chars_to_text}} for combining character vectors into text; 
#' \code{\link{text_to_sentences}} for splitting text into a vector of sentences; 
#' \code{\link{text_to_words}} for splitting text into a vector of words; 
#' \code{\link{count_chars}} for counting the frequency of characters; 
#' \code{\link{count_words}} for counting the frequency of words; 
#' \code{\link{strsplit}} for splitting strings. 
#' 
#' @export

text_to_chars <- function(x, rm_specials = FALSE, sep = ""){
  
  # 1. Inputs: 
  if (all(is.na(x))){ return(x) }  # handle NAs (by returning x unaltered)
  
  # Collapse multi-element strings into 1 string (ADDING sep between elements): 
  txt <- collapse_chars(as.character(x), sep = sep)
  
  # 2. Main: Split the string into its individual characters: 
  chars <- unlist(strsplit(txt, split = ""))
  
  # 3. Remove special characters (by set membership, NOT by regex): 
  if (rm_specials){
    
    # Punctuation, spaces, hyphens, and parentheses: 
    # (Note: cclass includes additional symbols.)
    spec_char <- c(",", ";", ":", ".", "!", "?", 
                   "", " ", 
                   "-", "--", "---", 
                   "(", ")", "[", "]", "{", "}", "<", ">")
    
    chars <- chars[!(chars %in% spec_char)]
    
  } # if (rm_specials). 
  
  # 4. Output: 
  return(chars)
  
} # text_to_chars(). 

## Check:
# s3 <- c("A first sentence.", "The second sentence.",
#       "A third --- and also THE   FINAL --- sentence.")
# (wv <- text_to_chars(s3))
# (wv_2 <- text_to_chars(s3, sep = "\n"))
# (wv_3 <- text_to_chars(s3, rm_specials = TRUE))
# 
# text_to_chars(c("See 3 spaces:   ?"))
# # Note:
# text_to_chars(c(1:3))
# text_to_chars(c(NA, NA))
# text_to_chars(c(NA))


# words_to_text: Turn a vector of words x into a (single) string of text: ------ 

#' Paste or collapse words \code{x} into a text. 
#' 
#' \code{words_to_text} pastes or collapses 
#' a character vector \code{x} into a single text string. 
#' 
#' \code{words_to_text} is essentially identical to 
#' \code{\link{collapse_chars}}. 
#' Internally, both functions are wrappers around  
#' \code{\link{paste}} with a \code{collapse} argument. 
#' 
#' @param x A string of text (required), typically a character vector. 
#' 
#' @param collapse A character string to separate the elements of \code{x} 
#' in the resulting text. 
#' Default: \code{collapse = " "}. 
#' 
#' @return A text (as a collapsed character vector). 
#'
#' @examples
#' s <- c("Hello world!", "A 1st sentence.", "A 2nd sentence.", "The end.")
#' words_to_text(s)
#' cat(words_to_text(s, collapse = "\n"))
#' 
#' @family text objects and functions
#'
#' @seealso
#' \code{\link{text_to_words}} for splitting a text into its words; 
#' \code{\link{text_to_sentences}} for splitting text into a vector of sentences;  
#' \code{\link{text_to_chars}} for splitting text into a vector of characters;  
#' \code{\link{count_words}} for counting the frequency of words; 
#' \code{\link{collapse_chars}} for collapsing character vectors; 
#' \code{\link{strsplit}} for splitting strings. 
#' 
#' @export

words_to_text <- function(x, collapse = " "){
  
  # Join all elements of x (inserting collapse between neighbors): 
  txt <- paste(x, collapse = collapse)
  
  return(txt)
  
} # words_to_text(). 

## Check:
# words_to_text(c("This", "is only", "a test"))
# cat(words_to_text(wv, collapse = "\n"))


# chars_to_text: Turn a character vector x into a (single) string of text (preserving punctuation and spaces): ------

# Inverse of text_to_chars() above:  
# x typically consists of individual characters, but may contain spaces or longer strings. 
# Goal: Exactly preserve all characters (e.g., punctuation and spaces).
# (Note: paste(x, collapse = sep) only joins the elements of x and never changes 
#  their contents, so that all spaces and punctuation marks are preserved.) 

#' Combine character inputs \code{x} into a single string of text. 
#' 
#' \code{chars_to_text} combines multi-element character inputs \code{x} 
#' into a single string of text (i.e., a character object of length 1), 
#' while preserving punctuation and spaces. 
#' 
#' \code{chars_to_text} is an inverse function of \code{\link{text_to_chars}}. 
#' 
#' Internally, \code{chars_to_text} converts \code{x} by \code{as.character} 
#' and joins its elements by \code{paste(x, collapse = sep)}. 
#' Thus, all characters within the elements of \code{x} are left intact, 
#' missing values are converted to the string \code{"NA"}, 
#' and an empty input yields an empty string. 
#' See \code{\link{collapse_chars}} for a related function 
#' (which only uses \code{paste} for inputs with multiple elements). 
#' 
#' @param x A vector (required), typically a character vector. 
#'
#' @param sep Character to insert between the elements 
#' of a multi-element character vector as input \code{x}? 
#' Default: \code{sep = ""} (i.e., add nothing). 
#'   
#' @return A character vector (of length 1). 
#' 
#' @examples
#' # (a) One string (with spaces and punctuation):
#' t1 <- "Hello world! This is _A   TEST_. Does this work?"
#' (cv <- unlist(strsplit(t1, split = "")))
#' (t2 <- chars_to_text(cv))
#' t1 == t2
#' 
#' # (b) Multiple strings (nchar from 0 to >1):
#' s <- c("Hi", " ", "", "there!", " ", "", "Does  THIS  work?")
#' chars_to_text(s)
#' 
#' # Note: Using sep argument: 
#' chars_to_text(c("Hi there!", "How are you today?"), sep = "  ")
#' chars_to_text(1:3, sep = " | ")
#'  
#' @aliases vec2str 
#' 
#' @family text objects and functions
#'
#' @seealso
#' \code{\link{collapse_chars}} for collapsing character vectors; 
#' \code{\link{text_to_chars}} for splitting text into a vector of characters; 
#' \code{\link{text_to_words}} for splitting text into a vector of words; 
#' \code{\link{strsplit}} for splitting strings. 
#' 
#' @export

chars_to_text <- function(x, sep = ""){
  
  # Inputs (as character strings): 
  x0 <- as.character(x)
  
  # Main: Join all elements (ADDING sep between elements). 
  # Note: No detour via individual characters or space placeholders is needed, 
  #       as paste() does not alter the contents of its elements. Any placeholder 
  #       would corrupt inputs that happen to contain the placeholder string. 
  char_t <- paste(x0, collapse = sep)
  
  # Output: 
  return(char_t)
  
} # chars_to_text().

## Check:
# # (a) One string (with spaces and punctuation):
# t1 <- "Hello world! This is _A   TEST_. Does this work?"
# (cv <- unlist(strsplit(t1, split = "")))
# (t2 <- chars_to_text(cv))
# t1 == t2
# 
# # (b) Multiple strings (nchar from 0 to >1):
# s <- c("", "Hi", " ", "there!", " ", "Does  THIS  work?", "")
# chars_to_text(s)
# 
# # Using sep argument: 
# chars_to_text(c("Hi there!", "How are you today?"), sep = " ")
# chars_to_text(1:3, sep = " | ")



# Verify that chars_to_text() and text_to_chars() complement each other: ------ 

# s_1 <- c("This is some text.", " ", "Note that 2 sentences can occur in the same character object. As is the case here!")
# t_1 <- chars_to_text(s_1)
# (crs <- text_to_chars(t_1))  # individual characters (including spaces)
# t_2 <- chars_to_text(crs)
# all.equal(t_1, t_2)


# Verify that sentences can be recreated (after splitting into chars and combine words to text): ------ 

# s_3 <- c("A first sentence.", "The second sentence.",
#          "A third --- and also THE   FINAL --- sentence.")
# (crs <- text_to_chars(s_3, sep = " "))
# (txt <- words_to_text(crs, collapse = ""))
# # OR: 
# # (txt <- chars_to_text(crs, sep = ""))
# (snt <- text_to_sentences(txt))
# all.equal(snt, s_3)



## (C) Miscellaneous text/string utility functions: ------ 

# Source: From string_fun.R  | i2ds  | 2022 06 26
# Redundant functions for manipulating/transforming character strings: ------ 

# vec2str: Turn a vector of symbols into a character string: ------
#          (Redundant to chars_to_text() and collapse_chars() above.)

# vec2str <- function(v) {
#   
#   paste(v, collapse = "")
#   
#   # Note: Simply using paste(v, collapse = "") loses all spaces.
#   # Better: chars_to_text() preserves spaces (see above). 
#   
# } # vec2str(). 


# str2vec: Turn a character string into a vector (of 1-symbol character elements): ------ 
#          (Redundant to text_to_chars() above.)

# str2vec <- function(s){
#   
#   unlist(strsplit(s, split = ""))  # assumes ONLY 1-symbol elements/digits
#   
# } # str2vec(). 


## Done: ----------

# - Document and export collapse_chars() AND chars_to_text().  
# - Replace vec2str() and str2vec() by superior functions in text_fun.R 


## ToDo: ----------

# - etc.

## eof. ----------------------
# hneth/ds4psy documentation built on May 1, 2024, 4:26 a.m.