R/substr.R

Defines functions `substring<-` substring `gsubstrl<-` `gsubstr<-` gsubstrl gsubstr `substrl<-` `substr<-` substrl substr

Documented in gsubstr gsubstrl substr substring substrl

# kate: default-dictionary en_AU

## stringx package for R
## Copyleft (C) 2021-2025, Marek Gagolewski <https://www.gagolewski.com/>
##
## This program is free software; you can redistribute it and/or modify
## it under the terms of the GNU General Public License as published by
## the Free Software Foundation; either version 2 of the License, or
## (at your option) any later version.
##
## This program is distributed in the hope that it will be useful,
## but WITHOUT ANY WARRANTY; without even the implied warranty of
## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
## GNU General Public License for more details. You have received
## a copy of the GNU General Public License along with this program.


#' @title
#' Extract or Replace Substrings
#'
#' @description
#' \code{substr} and \code{substrl} extract
#' contiguous parts of given character strings.
#' The former operates based on start and end positions
#' while the latter is fed with substring lengths.
#'
#' Their replacement versions allow for substituting parts of strings
#' with new content.
#'
#' \code{gsubstr} and \code{gsubstrl} allow for extracting or replacing
#' multiple chunks from each string.
#'
#' @details
#' Not to be confused with \code{\link{sub}}.
#'
#' \code{substring} is a [DEPRECATED] synonym for \code{substr}.
#'
#' Note that these functions can break some meaningful Unicode code point
#' sequences, e.g., when inputs are not normalised. For extracting
#' initial parts of strings based on character width, see \code{\link{strtrim}}.
#'
#' Note that \code{gsubstr} (and related functions) expect
#' \code{start}, \code{stop}, \code{length}, and \code{value}
#' to be lists. Non-list arguments will be converted by calling
#' \code{\link[base]{as.list}}. This is different from the default policy
#' applied by \code{\link[stringi]{stri_sub_all}}, which calls
#' \code{\link[base]{list}}.
#'
#' Note that \code{substrl} and \code{gsubstrl} are
#' interoperable with \code{\link{regexpr2}} and \code{\link{gregexpr2}},
#' respectively, and hence can be considered as substituted for the
#' [DEPRECATED] \code{\link[base]{regmatches}} (which is more specialised).
#'
#'
#' @section Differences from Base R:
#' Replacements for and enhancements of base \code{\link[base]{substr}}
#' and \code{\link[base]{substring}}
#' implemented with \code{\link[stringi]{stri_sub}} and
#' \code{\link[stringi]{stri_sub_all}},
#'
#' \itemize{
#' \item \code{substring} is "for compatibility with S", but this should
#'     no longer matter
#'     \bold{[here, \code{substring} is equivalent to \code{substr}; in a
#'     future version, using the former may result in a warning]}
#' \item \code{substr} is not vectorised with respect to all the arguments
#'     (and \code{substring} is not fully vectorised wrt \code{value})
#'     \bold{[fixed here]}
#' \item not all attributes are taken from the longest of the inputs
#'     \bold{[fixed here]}
#' \item partial recycling with no warning
#'     \bold{[fixed here]}
#' \item the replacement must be of the same length as the chunk
#'     being substituted \bold{[fixed here]}
#' \item negative indexes are silently treated as 1
#'     \bold{[changed here: negative indexes count from the end of the string]}
#' \item replacement of different length than the extracted substring
#'     never changes the length of the string
#'     \bold{[changed here -- output length is input length minus
#'     length of extracted plus length of replacement]}
#' \item \code{\link{regexpr}} (amongst others) return start positions
#'     and lengths of matches, but base \code{substr} only uses
#'     start and end
#'     \bold{[fixed by introducing \code{substrl}]}
#' \item there is no function to extract or replace multiple
#'     chunks in each string (other than \code{\link{regmatches}}
#'     that works on outputs generated by \code{\link[base]{gregexpr}} et al.)
#'     \bold{[fixed by introducing \code{gsubstrl}]}
#' }
#'
#'
#' @param x,text character vector
#'     whose parts are to be extracted/replaced
#'
#' @param start,first numeric vector (for \code{substr})
#'     or list of numeric vectors (for \code{gsubstr})
#'     giving the start indexes;
#'     e.g., 1 denotes the first code point;
#'     negative indexes
#'     count from the end of a string, i.e., -1 is the last character
#'
#' @param stop,last numeric vector (for \code{substr})
#'     or list of numeric vectors (for \code{gsubstr})
#'     giving the end indexes (inclusive);
#'     note that if the start position is farther than the
#'     end position, this indicates an empty substring therein (see Examples)
#'
#' @param length numeric vector  (for \code{substr})
#'     or list of numeric vectors (for \code{gsubstr})
#'     giving the substring lengths;
#'     negative lengths result in a missing value or empty vector
#'     (see \code{ignore_negative_length}) or the corresponding
#'     substring being unchanged
#'
#' @param value character vector  (for \code{substr})
#'     or list of character vectors  (for \code{gsubstr})
#'     defining the replacements strings
#'
#' @param ignore_negative_length single logical value;
#'     whether negative lengths should be ignored or yield missing values
#'
#'
#' @return
#' \code{substr} and \code{substrl} return a character vector (in UTF-8).
#' \code{gsubstr} and \code{gsubstrl} return a list of character vectors.
#'
#' Their replacement versions modify \code{x} 'in-place' (see Examples).
#'
#' The attributes are copied from the longest arguments (similar to
#' binary operators).
#'
#'
#'
#' @examples
#' x <- "spam, spam, bacon, and spam"
#' base::substr(x, c(1, 13), c(4, 17))
#' base::substring(x, c(1, 13), c(4, 17))
#' substr(x, c(1, 13), c(4, 17))
#' substrl(x, c(1, 13), c(4, 5))
#'
#' # replacement function used as an ordinary one - return a copy of x:
#' base::`substr<-`(x, 1, 4, value="jam")
#' `substr<-`(x, 1, 4, value="jam")
#' base::`substr<-`(x, 1, 4, value="porridge")
#' `substr<-`(x, 1, 4, value="porridge")
#'
#' # interoperability with gregexpr2:
#' p <- "[\\w&&[^a]][\\w&&[^n]][\\w&&[^d]]\\w+"  # regex: all words but 'and'
#' gsubstrl(x, gregexpr2(x, p))
#' `gsubstrl<-`(x, gregexpr2(x, p), value=list(c("a", "b", "c", "d")))
#'
#' # replacement function modifying x in-place:
#' substr(x, 1, 4) <- "eggs"
#' substr(x, 1, 0) <- "porridge, "        # prepend (start<stop)
#' substr(x, nchar(x)+1) <- " every day"  # append (start<stop)
#' print(x)
#'
#'
#'
#' @seealso
#' Related function(s): \code{\link{strtrim}}, \code{\link{nchar}},
#'    \code{\link{startsWith}}, \code{\link{endsWith}},
#'    \code{\link{gregexpr}}
#'
#' @rdname substr
substr <- function(x, start=1L, stop=-1L)
{
    if (!is.character(x))   x <- as.character(x)
    if (!is.numeric(start)) start <- as.numeric(start)
    if (!is.numeric(stop))  stop  <- as.numeric(stop)

    ret <- stringi::stri_sub(x, from=start, to=stop, use_matrix=FALSE)
    .attribs_propagate_nary(ret, x, start, stop)
}


#' @rdname substr
substrl <- function(x, start=1L, length=attr(start, "match.length"), ignore_negative_length=FALSE)
{
    if (!is.character(x))    x <- as.character(x)
    if (!is.numeric(start))  start <- as.numeric(start)
    if (!is.numeric(length)) length <- as.numeric(length)

    if (missing(length)) attr(start, "match.length") <- NULL

    ret <- stringi::stri_sub(x, from=start, length=length, use_matrix=FALSE, ignore_negative_length=ignore_negative_length)
    .attribs_propagate_nary(ret, x, start, length)
}


#' @rdname substr
`substr<-` <- function(x, start=1L, stop=-1L, value)
{
    if (!is.character(x))     x     <- as.character(x)
    if (!is.numeric(start))   start <- as.numeric(start)
    if (!is.numeric(stop))    stop  <- as.numeric(stop)
    if (!is.character(value)) value <- as.character(value)

    ret <- stringi::`stri_sub<-`(x, from=start, to=stop, omit_na=FALSE, use_matrix=FALSE, value=value)
    .attribs_propagate_nary(ret, x, start, stop, value)
}


#' @rdname substr
`substrl<-` <- function(x, start=1L, length=attr(start, "match.length"), value)
{
    if (!is.character(x))     x      <- as.character(x)
    if (!is.numeric(start))   start  <- as.numeric(start)
    if (!is.numeric(length))  length <- as.numeric(length)
    if (!is.character(value)) value  <- as.character(value)

    if (missing(length)) attr(start, "match.length") <- NULL

    ret <- stringi::`stri_sub<-`(x, from=start, length=length, omit_na=FALSE, use_matrix=FALSE, value=value)
    .attribs_propagate_nary(ret, x, start, length, value)
}


#' @rdname substr
gsubstr <- function(x, start=list(1L), stop=list(-1L))
{
    if (!is.character(x)) x <- as.character(x)
    if (!is.list(start))  start <- as.list(start)
    if (!is.list(stop))   stop  <- as.list(stop)

    ret <- stringi::stri_sub_all(x, from=start, to=stop, use_matrix=FALSE)
    .attribs_propagate_nary(ret, x, start, stop)
}


#' @rdname substr
gsubstrl <- function(x, start=list(1L), length=lapply(start, attr, "match.length"), ignore_negative_length=TRUE)
{
    if (!is.character(x)) x <- as.character(x)
    if (!is.list(start))  start <- as.list(start)
    if (!is.list(length)) length  <- as.list(length)

    ret <- stringi::stri_sub_all(x, from=start, length=length, use_matrix=FALSE, ignore_negative_length=ignore_negative_length)
    .attribs_propagate_nary(ret, x, start, length)
}


#' @rdname substr
`gsubstr<-` <- function(x, start=list(1L), stop=list(-1L), value)
{
    if (!is.character(x))     x     <- as.character(x)
    if (!is.list(start))      start <- as.list(start)
    if (!is.list(stop))       stop  <- as.list(stop)
    if (!is.list(value))      value <- as.list(value)

    ret <- stringi::`stri_sub_all<-`(x, from=start, to=stop, omit_na=FALSE, use_matrix=FALSE, value=value)
    .attribs_propagate_nary(ret, x, start, stop, value)
}


#' @rdname substr
`gsubstrl<-` <- function(x, start=list(1L), length=lapply(start, attr, "match.length"), value)
{
    if (!is.character(x))     x      <- as.character(x)
    if (!is.list(start))      start  <- as.list(start)
    if (!is.list(length))     length <- as.list(length)
    if (!is.list(value))      value  <- as.list(value)

    ret <- stringi::`stri_sub_all<-`(x, from=start, length=length, omit_na=FALSE, use_matrix=FALSE, value=value)
    .attribs_propagate_nary(ret, x, start, length, value)
}


#' @rdname substr
substring <- function(text, first=1L, last=-1L)
{
    substr(x=text, start=first, stop=last)
}


#' @rdname substr
`substring<-` <- function(text, first=1L, last=-1L, value)
{
    `substr<-`(x=text, start=first, stop=last, value=value)
}
gagolews/stringx documentation built on Jan. 15, 2025, 9:46 p.m.