# kate: default-dictionary en_AU
## stringx package for R
## Copyleft (C) 2021-2025, Marek Gagolewski <https://www.gagolewski.com/>
##
## This program is free software; you can redistribute it and/or modify
## it under the terms of the GNU General Public License as published by
## the Free Software Foundation; either version 2 of the License, or
## (at your option) any later version.
##
## This program is distributed in the hope that it will be useful,
## but WITHOUT ANY WARRANTY; without even the implied warranty of
## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
## GNU General Public License for more details. You have received
## a copy of the GNU General Public License along with this program.
#' @title
#' Extract or Replace Substrings
#'
#' @description
#' \code{substr} and \code{substrl} extract
#' contiguous parts of given character strings.
#' The former operates based on start and end positions
#' while the latter is fed with substring lengths.
#'
#' Their replacement versions allow for substituting parts of strings
#' with new content.
#'
#' \code{gsubstr} and \code{gsubstrl} allow for extracting or replacing
#' multiple chunks from each string.
#'
#' @details
#' Not to be confused with \code{\link{sub}}.
#'
#' \code{substring} is a [DEPRECATED] synonym for \code{substr}.
#'
#' Note that these functions can break some meaningful Unicode code point
#' sequences, e.g., when inputs are not normalised. For extracting
#' initial parts of strings based on character width, see \code{\link{strtrim}}.
#'
#' Note that \code{gsubstr} (and related functions) expect
#' \code{start}, \code{stop}, \code{length}, and \code{value}
#' to be lists. Non-list arguments will be converted by calling
#' \code{\link[base]{as.list}}. This is different from the default policy
#' applied by \code{\link[stringi]{stri_sub_all}}, which calls
#' \code{\link[base]{list}}.
#'
#' Note that \code{substrl} and \code{gsubstrl} are
#' interoperable with \code{\link{regexpr2}} and \code{\link{gregexpr2}},
#' respectively, and hence can be considered as substituted for the
#' [DEPRECATED] \code{\link[base]{regmatches}} (which is more specialised).
#'
#'
#' @section Differences from Base R:
#' Replacements for and enhancements of base \code{\link[base]{substr}}
#' and \code{\link[base]{substring}}
#' implemented with \code{\link[stringi]{stri_sub}} and
#' \code{\link[stringi]{stri_sub_all}},
#'
#' \itemize{
#' \item \code{substring} is "for compatibility with S", but this should
#' no longer matter
#' \bold{[here, \code{substring} is equivalent to \code{substr}; in a
#' future version, using the former may result in a warning]}
#' \item \code{substr} is not vectorised with respect to all the arguments
#' (and \code{substring} is not fully vectorised wrt \code{value})
#' \bold{[fixed here]}
#' \item not all attributes are taken from the longest of the inputs
#' \bold{[fixed here]}
#' \item partial recycling with no warning
#' \bold{[fixed here]}
#' \item the replacement must be of the same length as the chunk
#' being substituted \bold{[fixed here]}
#' \item negative indexes are silently treated as 1
#' \bold{[changed here: negative indexes count from the end of the string]}
#' \item replacement of different length than the extracted substring
#' never changes the length of the string
#' \bold{[changed here -- output length is input length minus
#' length of extracted plus length of replacement]}
#' \item \code{\link{regexpr}} (amongst others) return start positions
#' and lengths of matches, but base \code{substr} only uses
#' start and end
#' \bold{[fixed by introducing \code{substrl}]}
#' \item there is no function to extract or replace multiple
#' chunks in each string (other than \code{\link{regmatches}}
#' that works on outputs generated by \code{\link[base]{gregexpr}} et al.)
#' \bold{[fixed by introducing \code{gsubstrl}]}
#' }
#'
#'
#' @param x,text character vector
#' whose parts are to be extracted/replaced
#'
#' @param start,first numeric vector (for \code{substr})
#' or list of numeric vectors (for \code{gsubstr})
#' giving the start indexes;
#' e.g., 1 denotes the first code point;
#' negative indexes
#' count from the end of a string, i.e., -1 is the last character
#'
#' @param stop,last numeric vector (for \code{substr})
#' or list of numeric vectors (for \code{gsubstr})
#' giving the end indexes (inclusive);
#' note that if the start position is farther than the
#' end position, this indicates an empty substring therein (see Examples)
#'
#' @param length numeric vector (for \code{substr})
#' or list of numeric vectors (for \code{gsubstr})
#' giving the substring lengths;
#' negative lengths result in a missing value or empty vector
#' (see \code{ignore_negative_length}) or the corresponding
#' substring being unchanged
#'
#' @param value character vector (for \code{substr})
#' or list of character vectors (for \code{gsubstr})
#' defining the replacements strings
#'
#' @param ignore_negative_length single logical value;
#' whether negative lengths should be ignored or yield missing values
#'
#'
#' @return
#' \code{substr} and \code{substrl} return a character vector (in UTF-8).
#' \code{gsubstr} and \code{gsubstrl} return a list of character vectors.
#'
#' Their replacement versions modify \code{x} 'in-place' (see Examples).
#'
#' The attributes are copied from the longest arguments (similar to
#' binary operators).
#'
#'
#'
#' @examples
#' x <- "spam, spam, bacon, and spam"
#' base::substr(x, c(1, 13), c(4, 17))
#' base::substring(x, c(1, 13), c(4, 17))
#' substr(x, c(1, 13), c(4, 17))
#' substrl(x, c(1, 13), c(4, 5))
#'
#' # replacement function used as an ordinary one - return a copy of x:
#' base::`substr<-`(x, 1, 4, value="jam")
#' `substr<-`(x, 1, 4, value="jam")
#' base::`substr<-`(x, 1, 4, value="porridge")
#' `substr<-`(x, 1, 4, value="porridge")
#'
#' # interoperability with gregexpr2:
#' p <- "[\\w&&[^a]][\\w&&[^n]][\\w&&[^d]]\\w+" # regex: all words but 'and'
#' gsubstrl(x, gregexpr2(x, p))
#' `gsubstrl<-`(x, gregexpr2(x, p), value=list(c("a", "b", "c", "d")))
#'
#' # replacement function modifying x in-place:
#' substr(x, 1, 4) <- "eggs"
#' substr(x, 1, 0) <- "porridge, " # prepend (start<stop)
#' substr(x, nchar(x)+1) <- " every day" # append (start<stop)
#' print(x)
#'
#'
#'
#' @seealso
#' Related function(s): \code{\link{strtrim}}, \code{\link{nchar}},
#' \code{\link{startsWith}}, \code{\link{endsWith}},
#' \code{\link{gregexpr}}
#'
#' @rdname substr
substr <- function(x, start=1L, stop=-1L)
{
if (!is.character(x)) x <- as.character(x)
if (!is.numeric(start)) start <- as.numeric(start)
if (!is.numeric(stop)) stop <- as.numeric(stop)
ret <- stringi::stri_sub(x, from=start, to=stop, use_matrix=FALSE)
.attribs_propagate_nary(ret, x, start, stop)
}
#' @rdname substr
substrl <- function(x, start=1L, length=attr(start, "match.length"), ignore_negative_length=FALSE)
{
if (!is.character(x)) x <- as.character(x)
if (!is.numeric(start)) start <- as.numeric(start)
if (!is.numeric(length)) length <- as.numeric(length)
if (missing(length)) attr(start, "match.length") <- NULL
ret <- stringi::stri_sub(x, from=start, length=length, use_matrix=FALSE, ignore_negative_length=ignore_negative_length)
.attribs_propagate_nary(ret, x, start, length)
}
#' @rdname substr
`substr<-` <- function(x, start=1L, stop=-1L, value)
{
if (!is.character(x)) x <- as.character(x)
if (!is.numeric(start)) start <- as.numeric(start)
if (!is.numeric(stop)) stop <- as.numeric(stop)
if (!is.character(value)) value <- as.character(value)
ret <- stringi::`stri_sub<-`(x, from=start, to=stop, omit_na=FALSE, use_matrix=FALSE, value=value)
.attribs_propagate_nary(ret, x, start, stop, value)
}
#' @rdname substr
`substrl<-` <- function(x, start=1L, length=attr(start, "match.length"), value)
{
if (!is.character(x)) x <- as.character(x)
if (!is.numeric(start)) start <- as.numeric(start)
if (!is.numeric(length)) length <- as.numeric(length)
if (!is.character(value)) value <- as.character(value)
if (missing(length)) attr(start, "match.length") <- NULL
ret <- stringi::`stri_sub<-`(x, from=start, length=length, omit_na=FALSE, use_matrix=FALSE, value=value)
.attribs_propagate_nary(ret, x, start, length, value)
}
#' @rdname substr
gsubstr <- function(x, start=list(1L), stop=list(-1L))
{
if (!is.character(x)) x <- as.character(x)
if (!is.list(start)) start <- as.list(start)
if (!is.list(stop)) stop <- as.list(stop)
ret <- stringi::stri_sub_all(x, from=start, to=stop, use_matrix=FALSE)
.attribs_propagate_nary(ret, x, start, stop)
}
#' @rdname substr
gsubstrl <- function(x, start=list(1L), length=lapply(start, attr, "match.length"), ignore_negative_length=TRUE)
{
if (!is.character(x)) x <- as.character(x)
if (!is.list(start)) start <- as.list(start)
if (!is.list(length)) length <- as.list(length)
ret <- stringi::stri_sub_all(x, from=start, length=length, use_matrix=FALSE, ignore_negative_length=ignore_negative_length)
.attribs_propagate_nary(ret, x, start, length)
}
#' @rdname substr
`gsubstr<-` <- function(x, start=list(1L), stop=list(-1L), value)
{
if (!is.character(x)) x <- as.character(x)
if (!is.list(start)) start <- as.list(start)
if (!is.list(stop)) stop <- as.list(stop)
if (!is.list(value)) value <- as.list(value)
ret <- stringi::`stri_sub_all<-`(x, from=start, to=stop, omit_na=FALSE, use_matrix=FALSE, value=value)
.attribs_propagate_nary(ret, x, start, stop, value)
}
#' @rdname substr
`gsubstrl<-` <- function(x, start=list(1L), length=lapply(start, attr, "match.length"), value)
{
if (!is.character(x)) x <- as.character(x)
if (!is.list(start)) start <- as.list(start)
if (!is.list(length)) length <- as.list(length)
if (!is.list(value)) value <- as.list(value)
ret <- stringi::`stri_sub_all<-`(x, from=start, length=length, omit_na=FALSE, use_matrix=FALSE, value=value)
.attribs_propagate_nary(ret, x, start, length, value)
}
#' @rdname substr
substring <- function(text, first=1L, last=-1L)
{
substr(x=text, start=first, stop=last)
}
#' @rdname substr
`substring<-` <- function(text, first=1L, last=-1L, value)
{
`substr<-`(x=text, start=first, stop=last, value=value)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.