R/sub.R

Defines functions stri_sub_replace_all `stri_sub_all<-` stri_sub_all stri_sub_replace `stri_sub<-` stri_sub

Documented in stri_sub stri_sub_all stri_sub_replace stri_sub_replace_all

# kate: default-dictionary en_US

## This file is part of the 'stringi' package for R.
## Copyright (c) 2013-2023, Marek Gagolewski <https://www.gagolewski.com/>
## All rights reserved.
##
## Redistribution and use in source and binary forms, with or without
## modification, are permitted provided that the following conditions are met:
##
## 1. Redistributions of source code must retain the above copyright notice,
## this list of conditions and the following disclaimer.
##
## 2. Redistributions in binary form must reproduce the above copyright notice,
## this list of conditions and the following disclaimer in the documentation
## and/or other materials provided with the distribution.
##
## 3. Neither the name of the copyright holder nor the names of its
## contributors may be used to endorse or promote products derived from
## this software without specific prior written permission.
##
## THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
## 'AS IS' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING,
## BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
## FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
## HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
## SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
## PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
## OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
## WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE
## OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
## EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.


#' @title
#' Extract a Substring From or Replace a Substring In a Character Vector
#'
#' @description
#' \code{stri_sub} extracts particular substrings at code point-based
#' index ranges provided. Its replacement version allows to substitute
#' (in-place) parts of
#' a string with given replacement strings. \code{stri_sub_replace}
#' is its forward pipe operator-friendly variant that returns
#' a copy of the input vector.
#'
#' For extracting/replacing multiple substrings from/within each string, see
#' \code{\link{stri_sub_all}}.
#'
#' @details
#' Vectorized over \code{str}, [\code{value}], \code{from} and
#' (\code{to} or \code{length}). Parameters
#' \code{to} and \code{length} are mutually exclusive.
#'
#' Indexes are 1-based, i.e., the start of a string is at index 1.
#' For negative indexes in \code{from} or \code{to},
#' counting starts at the end of the string.
#' For instance, index -1 denotes the last code point in the string.
#' Non-positive \code{length} gives an empty string.
#'
#' Argument \code{from} gives the start of a substring to extract.
#' Argument \code{to} defines the last index of a substring, inclusive.
#' Alternatively, its \code{length} may be provided.
#'
#' If \code{from} is a two-column matrix, then these two columns are
#' used as \code{from} and \code{to}, respectively,
#' unless the second column is named \code{length}.
#' In such a case anything passed
#' explicitly as \code{to} or \code{length} is ignored.
#' Such types of index matrices are generated by \code{\link{stri_locate_first}}
#' and \code{\link{stri_locate_last}}. If extraction based on
#' \code{\link{stri_locate_all}} is needed, see
#' \code{\link{stri_sub_all}}.
#'
#' In \code{stri_sub}, out-of-bound indexes are silently
#' corrected. If \code{from} > \code{to}, then an empty string is returned.
#' By default, negative \code{length} results in the corresponding output being
#' \code{NA}, see \code{ignore_negative_length}, though.
#'
#' In \code{stri_sub<-}, some configurations of indexes may work as
#' substring 'injection' at the front, back, or in middle.
#' Negative \code{length} does not alter the corresponding input string.
#'
#' If both \code{to} and \code{length} are provided,
#' \code{length} has priority over \code{to}.
#'
#' Note that for some Unicode strings, the extracted substrings might not
#' be well-formed, especially if input strings are not normalized
#' (see \code{\link{stri_trans_nfc}}),
#' include byte order marks, Bidirectional text marks, and so on.
#' Handle with care.
#'
#'
#'
#'
#' @param str character vector
#'
#' @param from integer vector giving the start indexes; alternatively,
#'     if \code{use_matrix=TRUE},
#'     a two-column matrix of type \code{cbind(from, to)}
#'     (unnamed columns or the 2nd column named other than \code{length})
#'     or \code{cbind(from, length=length)} (2nd column named \code{length})
#'
#' @param to integer vector giving the end indexes; mutually exclusive with
#'     \code{length} and \code{from} being a matrix
#'
#' @param length integer vector giving the substring lengths;
#'     mutually exclusive with \code{to} and \code{from} being a matrix
#'
#' @param omit_na single logical value; indicates whether missing values
#'     in any of the indexes or in \code{value} leave the corresponding input string
#'     unchanged [replacement function only]
#'
#' @param use_matrix single logical value; see \code{from}
#'
#' @param replacement alias of \code{value} [wherever applicable]
#'
#' @param value a character vector defining the replacement strings
#'     [replacement function only]
#'
#' @param ignore_negative_length single logical value; whether
#'     negative lengths should be ignored or result in missing values
#'
#' @param ... arguments to be passed to \code{stri_sub<-}
#'
#'
#' @return
#' \code{stri_sub} and \code{stri_sub_replace} return a character vector.
#' \code{stri_sub<-} changes the \code{str} object 'in-place'.
#'
#' @examples
#' s <- c("spam, spam, bacon, and spam", "eggs and spam")
#' stri_sub(s, from=-4)
#' stri_sub(s, from=1, length=c(10, 4))
#' (stri_sub(s, 1, 4) <- 'stringi')
#'
#' x <- c('12 3456 789', 'abc', '', NA, '667')
#' stri_sub(x, stri_locate_first_regex(x, '[0-9]+')) # see stri_extract_first
#' stri_sub(x, stri_locate_last_regex(x, '[0-9]+'))  # see stri_extract_last
#'
#' stri_sub_replace(x, stri_locate_first_regex(x, '[0-9]+'),
#'     omit_na=TRUE, replacement='***') # see stri_replace_first
#' stri_sub_replace(x, stri_locate_last_regex(x, '[0-9]+'),
#'     omit_na=TRUE, replacement='***') # see stri_replace_last
#'
#'
#' \dontrun{x |> stri_sub_replace(1, 5, replacement='new_substring')}
#' @family indexing
#' @rdname stri_sub
#' @export
stri_sub <- function(
    str, from = 1L, to = -1L, length,
    use_matrix=TRUE, ignore_negative_length=FALSE
) {
    use_matrix <- (is.logical(use_matrix) && base::length(use_matrix) == 1L && !is.na(use_matrix) && use_matrix) # isTRUE(use_matrix)
    if (missing(length)) {
        if (use_matrix && is.matrix(from) && !missing(to)) {
            warning("argument `to` is ignored in the current context")
            to <- NULL
        }
        .Call(C_stri_sub, str, from, to, NULL, use_matrix, ignore_negative_length)
    } else {
        if (!missing(to))
            warning("argument `to` is ignored in the current context")
        if (use_matrix && is.matrix(from)) {
            warning("argument `length` is ignored in the current context")
            length <- NULL
        }
        .Call(C_stri_sub, str, from, NULL, length, use_matrix, ignore_negative_length)
    }
}


#' @rdname stri_sub
#' @export
`stri_sub<-` <- function(
    str, from = 1L, to = -1L, length, omit_na=FALSE, use_matrix=TRUE, value
) {
    use_matrix <- (is.logical(use_matrix) && base::length(use_matrix) == 1L && !is.na(use_matrix) && use_matrix) # isTRUE(use_matrix)
    if (missing(length)) {
        if (use_matrix && is.matrix(from) && !missing(to)) {
            warning("argument `to` is ignored in this context")
            to <- NULL
        }
        .Call(C_stri_sub_replacement, str, from, to, NULL, omit_na, value, use_matrix)
    } else {
        if (!missing(to))
            warning("argument `to` is ignored in this context")
        if (use_matrix && is.matrix(from)) {
            warning("argument `length` is ignored in this context")
            length <- NULL
        }
        .Call(C_stri_sub_replacement, str, from, NULL, length, omit_na, value, use_matrix)
    }
}


#' @rdname stri_sub
#' @export
stri_sub_replace <- function(..., replacement, value = replacement)
    `stri_sub<-`(...,  value = value)



#' @title
#' Extract or Replace Multiple Substrings
#'
#' @description
#' \code{stri_sub_all} extracts multiple substrings from each string.
#' Its replacement version substitutes (in-place) multiple substrings with the
#' corresponding replacement strings.
#' \code{stri_sub_replace_all} (alias \code{stri_sub_all_replace})
#' is its forward pipe operator-friendly variant, returning
#' a copy of the input vector.
#'
#' For extracting/replacing single substrings from/within each string, see
#' \code{\link{stri_sub}}.
#'
#' @details
#' Vectorized over \code{str}, [\code{value}], \code{from} and
#' (\code{to} or \code{length}). Just like in \code{\link{stri_sub}}, parameters
#' \code{to} and \code{length} are mutually exclusive.
#'
#' In one of the simplest scenarios, \code{stri_sub_all(str, from, to)},
#' the i-th element of the resulting list
#' generated like \code{stri_sub(str[i], from[[i]], to[[i]])}.
#' As usual, if one of the inputs is shorter than the others,
#' recycling rule is applied.
#'
#'
#' If any of \code{from}, \code{to}, \code{length},
#' or \code{value} is not a list,
#' it is wrapped into a list.
#'
#' If \code{from} consists of a two-column matrix, then these two columns are
#' used as \code{from} and \code{to}, respectively,
#' unless the second column is named \code{length}.
#' Such types of index matrices are generated by
#' \code{\link{stri_locate_all}}.
#' If extraction or replacement based on \code{\link{stri_locate_first}}
#' or \code{\link{stri_locate_last}} is needed, see \code{\link{stri_sub}}.
#'
#' In the replacement function, the index ranges must be sorted
#' with respect to \code{from} and must be mutually disjoint.
#' Negative \code{length} does not result in any altering of the
#' corresponding input string. On the other hand, in \code{stri_sub_all},
#' this make the corresponding chunk be ignored,
#' see \code{ignore_negative_length}, though.
#'
#' @param str character vector
#'
#' @param from list of integer vector giving the start indexes; alternatively,
#'     if \code{use_matrix=TRUE}, a list of two-column matrices of type
#'     \code{cbind(from, to)}
#'     (unnamed columns or the 2nd column named other than \code{length})
#'     or \code{cbind(from, length=length)} (2nd column named \code{length})
#'
#' @param to list of integer vectors giving the end indexes
#'
#' @param length list of integer vectors giving the substring lengths
#'
#' @param omit_na single logical value; indicates whether missing values
#'     in any of the indexes or in \code{value} leave the part of the
#'     corresponding input string
#'     unchanged [replacement function only]
#'
#' @param use_matrix single logical value; see \code{from}
#'
#' @param replacement alias of \code{value} [wherever applicable]
#'
#' @param value a list of character vectors defining the replacement strings
#'     [replacement function only]
#'
#' @param ignore_negative_length single logical value; whether
#'     negative lengths should be ignored or result in missing values
#'
#' @param ... arguments to be passed to \code{stri_sub_all<-}
#'
#'
#' @return
#' \code{stri_sub_all} returns a list of character vectors.
#' Its replacement versions modify the input 'in-place'.
#'
#' @examples
#' x <- c('12 3456 789', 'abc', '', NA, '667')
#' stri_sub_all(x, stri_locate_all_regex(x, '[0-9]+')) # see stri_extract_all
#' stri_sub_all(x, stri_locate_all_regex(x, '[0-9]+', omit_no_match=TRUE))
#'
#' stri_sub_all(x, stri_locate_all_regex(x, '[0-9]+', omit_no_match=TRUE)) <- '***'
#' print(x)
#'
#' stri_sub_replace_all('a b c', c(1, 3, 5), c(1, 3, 5), replacement=c('A', 'B', 'C'))
#'
#'
#' @family indexing
#' @rdname stri_sub_all
#' @export
stri_sub_all <- function(
    str, from = list(1L), to = list(-1L), length,
    use_matrix=TRUE, ignore_negative_length=TRUE
) {
    if (!is.list(from))
        from <- list(from)

    if (missing(length)) {
        if (!missing(to) && !is.list(to)) {
            to <- list(to)
        }

        .Call(C_stri_sub_all, str, from, to, NULL, use_matrix, ignore_negative_length)
    } else {
        if (!missing(to))
            warning("argument `to` is ignored in this context")

        if (!is.list(length)) {
            length <- list(length)
        }

        .Call(C_stri_sub_all, str, from, NULL, length, use_matrix, ignore_negative_length)
    }
}


#' @rdname stri_sub_all
#' @export
`stri_sub_all<-` <- function(
    str, from = list(1L), to = list(-1L), length,
    omit_na=FALSE, use_matrix=TRUE, value
) {
    if (!is.list(from))
        from <- list(from)
    if (!is.list(value))
        value <- list(value)

    if (missing(length)) {
        if (!missing(to) && !is.list(to)) {
            to <- list(to)
        }

        .Call(C_stri_sub_replacement_all, str, from, to, NULL, omit_na, value, use_matrix)
    } else {
        if (!missing(to))
            warning("argument `to` is ignored in this context")

        if (!is.list(length)) {
            length <- list(length)
        }

        .Call(C_stri_sub_replacement_all, str, from, NULL, length, omit_na, value, use_matrix)
    }
}


#' @rdname stri_sub_all
#' @export
stri_sub_replace_all <- function(..., replacement, value=replacement)
    `stri_sub_all<-`(...,  value=value)


#' @rdname stri_sub_all
#' @export
stri_sub_all_replace <- stri_sub_replace_all

Try the stringi package in your browser

Any scripts or data that you put into this service are public.

stringi documentation built on Nov. 23, 2023, 5:07 p.m.