R/string.R

Defines functions alnum_id pair_chars bump_version valid_syntax split_source split_lines query_params numbers_to_words is_blank

Documented in alnum_id bump_version is_blank numbers_to_words split_lines split_source valid_syntax

#' Test if a character vector consists of blank strings
#'
#' Return a logical vector indicating if elements of a character vector are
#' blank (white spaces or empty strings).
#' @param x A character vector.
#' @return `TRUE` for blank elements, or `FALSE` otherwise.
#' @export
#' @examples
#' xfun::is_blank('')
#' xfun::is_blank('abc')
#' xfun::is_blank(c('', '  ', '\n\t'))
#' xfun::is_blank(c('', ' ', 'abc'))
is_blank = function(x) grepl('^\\s*$', x)

#' Convert numbers to English words
#'
#' This can be helpful when writing reports with \pkg{knitr}/\pkg{rmarkdown} if
#' we want to print numbers as English words in the output. The function `n2w()`
#' is an alias of `numbers_to_words()`.
#' @param x A numeric vector. The absolute values should be less than `1e15`.
#' @param cap Whether to capitalize the first letter of the word. This can be
#'   useful when the word is at the beginning of a sentence. Default is `FALSE`.
#' @param hyphen Whether to insert hyphen (-) when the number is between 21 and
#'   99 (except 30, 40, etc.).
#' @param and Whether to insert `and` between hundreds and tens, e.g., write 110
#'   as \dQuote{one hundred and ten} if `TRUE` instead of \dQuote{one hundred
#'   ten}.
#' @return A character vector.
#' @author Daijiang Li
#' @export
#' @examples library(xfun)
#' n2w(0, cap = TRUE)
#' n2w(0:121, and = TRUE)
#' n2w(1e6)
#' n2w(1e11+12345678)
#' n2w(-987654321)
#' n2w(1e15-1)
#' n2w(123.456)
#' n2w(123.45678901)
#' n2w(123.456789098765)
numbers_to_words = function(x, cap = FALSE, hyphen = TRUE, and = FALSE) {
  if (!is.numeric(x)) stop('The input is not numeric.')
  if (any(abs(x) >= 1e15)) stop('The absolute value must be less than 1e15.')
  opts = options(scipen = 15, OutDec = '.')  # avoid scientific notation
  on.exit(options(opts), add = TRUE)

  zero_to_19 = c(
    'zero', 'one', 'two', 'three', 'four', 'five', 'six', 'seven', 'eight', 'nine', 'ten',
    'eleven', 'twelve', paste0(c('thir', 'four', 'fif', 'six', 'seven', 'eigh', 'nine'), 'teen')
  )
  names(zero_to_19) = as.character(0:19)
  tens = c('twenty', 'thirty', 'forty', 'fifty', 'sixty', 'seventy', 'eighty', 'ninety')
  names(tens) = as.character(seq(20, 90, 10))
  marks = c('', 'thousand,', 'million,', 'billion,', 'trillion,')

  convert_1 = function(x_c) zero_to_19[x_c]  # 0 - 9

  # 10 - 99
  convert_2 = function(x_c) {
    x_cs = strsplit(x_c, split = '')[[1]]
    if (x_cs[1] == 1) return(zero_to_19[x_c])  # 10 - 19
    if (x_cs[2] == 0) return(tens[x_c])  # 20, 30, 40, ...
    # 21, 22, etc.
    paste(tens[as.integer(x_cs[1]) - 1], convert_1(x_cs[2]), sep = if (hyphen) '-' else ' ')
  }

  # 100 - 999
  convert_3 = function(x_c) {
    x_cs = strsplit(x_c, split = '')[[1]]
    n_hundreds = paste(convert_1(x_cs[1]), 'hundred', sep = ' ')
    out = if (x_cs[2] == '0') {
      if (x_cs[3] == '0') return(n_hundreds)  # x00
      convert_1(x_cs[3])  # x0x
    } else {
      convert_2(paste(x_cs[2:3], collapse = ''))  # xxx
    }
    paste(n_hundreds, out, sep = if (and) ' and ' else ' ')
  }

  convert_le3 = function(x_c) {
    x_c = gsub('^0+', '', x_c) # avoid something like 000, 001, 010; but also remove 0
    n = nchar(x_c)
    if (n == 0) return('')
    if (n == 1) return(convert_1(x_c))
    if (n == 2) return(convert_2(x_c))
    if (n == 3) return(convert_3(x_c))
  }

  convert_one = function(x) {
    minus = if (x >= 0) '' else {
      x = abs(x); 'minus '
    }
    if (x == 0) {
      out = 'zero'  # because convert_le3 removed all 0s
    } else {
      x_marks = strsplit(format(floor(x), big.mark = ','), split = ',')[[1]]  # e.g. 123,456,789
      out = vapply(x_marks, convert_le3, character(1))  # group by 3 digits
      x_marks2 = marks[length(x_marks):1]  # units?
      x_marks2[which(out == '')] = ''  # e.g. 4,000,123, 000, remove millions
      out = paste(out, x_marks2, sep = ' ', collapse = ' ')  # zip together
    }
    out = paste0(minus, out)
    out = gsub('^ *|,? *$', '', out)  # trim heading/trailing space
    out = gsub(' {2,}', ' ', out)  # remove multiple spaces
    if (cap) out = sub('^([a-z])', '\\U\\1', out, perl = TRUE)
    if (x - floor(x) > 0) {
      frac = sub('^[0-9]+[.]', '', as.character(x))
      frac = convert_1(strsplit(frac, '')[[1]])
      out = paste(c(out, 'point', frac), collapse = ' ')
    }
    out
  }

  if (length(x) > 1) vapply(x, convert_one, character(1)) else convert_one(x)
}

#' @export
#' @rdname numbers_to_words
n2w = numbers_to_words

# create a URL query string from named parameters
query_params = function(..., .list = list()) {
  x = if (missing(.list)) list(...) else .list
  x = paste(names(x), x, sep = '=', collapse = '&')
  if (x != '') paste0('?', x) else x
}

#' Split a character vector by line breaks
#'
#' Call `unlist(strsplit(x, '\n'))` on the character vector `x` and
#' make sure it works in a few edge cases: `split_lines('')` returns
#' `''` instead of `character(0)` (which is the returned value of
#' `strsplit('', '\n')`); `split_lines('a\n')` returns `c('a',
#' '')` instead of `c('a')` (which is the returned value of
#' `strsplit('a\n', '\n')`.
#' @param x A character vector.
#' @return All elements of the character vector are split by `'\n'` into
#'   lines.
#' @export
#' @examples xfun::split_lines(c('a', 'b\nc'))
split_lines = function(x) {
  if (length(grep('\n', x)) == 0L) return(x)
  x = gsub('\n$', '\n\n', x)
  x[x == ''] = '\n'
  unlist(strsplit(x, '\n'))
}

#' Split source lines into complete expressions
#'
#' Parse the lines of code one by one to find complete expressions in the code,
#' and put them in a list.
#' @param x A character vector of R source code.
#' @return A list of character vectors, and each vector contains a complete R
#'   expression.
#' @export
#' @examples xfun::split_source(c('if (TRUE) {', '1 + 1', '}', 'print(1:5)'))
split_source = function(x) {
  if ((n <- length(x)) < 1) return(list(x))
  i = i1 = i2 = 1
  res = list()
  while (i2 <= n) {
    piece = x[i1:i2]
    if (valid_syntax(piece)) {
      res[[i]] = piece; i = i + 1
      i1 = i2 + 1 # start from the next line
    }
    i2 = i2 + 1
  }
  if (i1 <= n) parse(text = piece)  # must be an error there
  res
}

#' Check if the syntax of the code is valid
#'
#' Try to [parse()] the code and see if an error occurs.
#' @param code A character vector of R source code.
#' @param silent Whether to suppress the error message when the code is not
#'   valid.
#' @return `TRUE` if the code could be parsed, otherwise `FALSE`.
#' @export
#' @examples xfun::valid_syntax('1+1')
#' xfun::valid_syntax('1+')
#' xfun::valid_syntax(c('if(T){1+1}', 'else {2+2}'), silent = FALSE)
valid_syntax = function(code, silent = TRUE) {
  !inherits(try(parse_only(code), silent = silent), 'try-error')
}

#' Bump version numbers
#'
#' Increase the last digit of version numbers, e.g., from `0.1` to
#' `0.2`, or `7.23.9` to `7.23.10`.
#' @param x A vector of version numbers (of the class `"numeric_version"`),
#'   or values that can be coerced to version numbers via
#'   `as.numeric_version()`.
#' @return A vector of new version numbers.
#' @export
#' @examples xfun::bump_version(c('0.1', '91.2.14'))
bump_version = function(x) {
  x = as.numeric_version(x)
  for (i in seq_along(x)) {
    v = x[i]
    n = length(unclass(v)[[1]])
    v[[1, n]] = v[[1, n]] + 1  # bump the last digit
    x[i] = v
  }
  x
}

#' Fix pairs of characters in a file
#'
#' For example, the curly braces may be wrong (the opening and closing braces
#' are swapped for some reason).
#' @param x A character vector (by default, read from `file`).
#' @param file Path to a text file.
#' @param chars A vector of characters of length 2. By default, it is a pair of
#'   curly double quotes.
#' @references <https://d.cosx.org/d/420794/5>
#' @noRd
#' @examples
#' files = list.files('.', '[.]R?md$', recursive = TRUE, full.names = TRUE)
#' for (f in files) {
#'   pair_chars(file = f)
#'   # curly single quotes
#'   pair_chars(file = f, chars = c('\U2018', '\U2019'))
#' }
pair_chars = function(x = read_utf8(file), file, chars = c('\U201c', '\U201d')) {
  if (length(chars) != 2) stop("'chars' must be of length 2 (i.e., a pair of characters)")
  is_file = !missing(file)
  r = paste(c('[', chars, ']'), collapse = '')
  k = gregexpr(r, x)
  m = regmatches(x, k)
  for (i in seq_along(m)) {
    n = length(m[[i]])
    if (n %% 2 != 0) {
      warning(
        'The characters do not appear in pairs in the text (',
        'line: ', i, if (is_file) c('; file: ', file), '):\n', x[i], '\n'
      )
      next
    }
    m[[i]] = rep(chars, length.out = n)
  }
  x2 = x
  regmatches(x, k) = m
  if (is_file) {
    if (!identical(x, x2)) xfun::write_utf8(x, file)
    invisible(x)
  } else x
}

#' Generate ID strings
#'
#' Substitute certain (by default, non-alphanumeric) characters with dashes and
#' remove extra dashes at both ends to generate ID strings. This function is
#' intended for generating IDs for HTML elements, so HTML tags in the input text
#' will be removed first.
#' @param x A character vector.
#' @param exclude A (Perl) regular expression to detect characters to be
#'   replaced by dashes. By default, non-alphanumeric characters are replaced.
#' @return A character vector of IDs.
#' @export
#' @examples
#' x = c('Hello world 123!', 'a  &b*^##c 456')
#' xfun::alnum_id(x)
#' xfun::alnum_id(x, '[^[:alpha:]]+')  # only keep alphabetical chars
#' # when text contains HTML tags
#' xfun::alnum_id('<h1>Hello <strong>world</strong>!')
alnum_id = function(x, exclude = '[^[:alnum:]]+') {
  x = strip_html(x)
  tolower(gsub('^-+|-+$', '', gsub(exclude, '-', x, perl = TRUE)))
}

#' Strip HTML tags
#'
#' Remove HTML tags and comments from text.
#' @param x A character vector.
#' @return A character vector with HTML tags and comments stripped off.
#' @export
#' @examples
#' xfun::strip_html('<a href="#">Hello <!-- comment -->world!</a>')
strip_html = function (x) {
  x = gsub('<!--.*?-->', '', x)
  x = gsub('<[^>]+>', '', x)
  x
}

Try the xfun package in your browser

Any scripts or data that you put into this service are public.

xfun documentation built on Nov. 2, 2023, 6 p.m.