R/length.R
In stringi: Fast and Portable Character String Processing Facilities

Documented in stri_isempty stri_length stri_numbytes stri_width

# kate: default-dictionary en_US

## This file is part of the 'stringi' package for R.
## Copyright (c) 2013-2024, Marek Gagolewski <https://www.gagolewski.com/>
## All rights reserved.
##
## Redistribution and use in source and binary forms, with or without
## modification, are permitted provided that the following conditions are met:
##
## 1. Redistributions of source code must retain the above copyright notice,
## this list of conditions and the following disclaimer.
##
## 2. Redistributions in binary form must reproduce the above copyright notice,
## this list of conditions and the following disclaimer in the documentation
## and/or other materials provided with the distribution.
##
## 3. Neither the name of the copyright holder nor the names of its
## contributors may be used to endorse or promote products derived from
## this software without specific prior written permission.
##
## THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
## 'AS IS' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING,
## BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
## FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
## HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
## SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
## PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
## OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
## WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE
## OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
## EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.


#' @title
#' Count the Number of Bytes
#'
#' @description
#' Counts the number of bytes needed to store
#' each string in the computer's memory.
#'
#' @details
#' Often, this is not the function you would normally use
#' in your string processing activities. See \code{\link{stri_length}} instead.
#'
#' For 8-bit encoded strings, this is the same as \code{\link{stri_length}}.
#' For UTF-8 strings, the returned values may be greater
#' than the number of code points, as UTF-8 is not a fixed-byte encoding:
#' one code point may be encoded by 1-4 bytes
#' (according to the current Unicode standard).
#'
#' Missing values are handled properly.
#'
#' The strings do not need to be re-encoded to perform this operation.
#'
#' The returned values do not include the trailing NUL bytes,
#' which are used internally to mark the end of string data (in C).
#'
#' @param str character vector or an object coercible to
#'
#' @return Returns an integer vector of the same length as \code{str}.
#'
#' @examples
#' stri_numbytes(letters)
#' stri_numbytes(c('abc', '123', '\u0105\u0104'))
#'
#' \dontrun{
#' # this used to fail on Windows, where there were no native support
#' # for 4-bytes Unicode characters; see, however, stri_unescape_unicode():
#' stri_numbytes('\U001F600') # compare stri_length('\U001F600')
#' }
#'
#' @export
#' @family length
stri_numbytes <- function(str) {
    .Call(C_stri_numbytes, str)
}


#' @title
#' Count the Number of Code Points
#'
#' @description
#' This function returns the number of code points
#' in each string.
#'
#' @details
#' Note that the number of code points is
#' not the same as the `width` of the string when
#' printed on the console.
#'
#' If a given string is in UTF-8 and has not been properly normalized
#' (e.g., by \code{\link{stri_trans_nfc}}), the returned counts may sometimes be
#' misleading. See \code{\link{stri_count_boundaries}} for a method to count
#' \emph{Unicode characters}. Moreover, if an incorrect UTF-8 byte sequence
#' is detected, then a warning is generated and the corresponding output element
#' is set to \code{NA}, see also \code{\link{stri_enc_toutf8}} for a method
#' to deal with such cases.
#'
#' Missing values are handled properly.
#' For `byte` encodings we get, as usual, an error.
#'
#' @param str character vector or an object coercible to
#' @return Returns an integer vector of the same length as \code{str}.
#'
#' @examples
#' stri_length(LETTERS)
#' stri_length(c('abc', '123', '\u0105\u0104'))
#' stri_length('\u0105') # length is one, but...
#' stri_numbytes('\u0105') # 2 bytes are used
#' stri_numbytes(stri_trans_nfkd('\u0105')) # 3 bytes here but...
#' stri_length(stri_trans_nfkd('\u0105')) # ...two code points (!)
#' stri_count_boundaries(stri_trans_nfkd('\u0105'), type='character') # ...and one Unicode character
#'
#' @export
#' @family length
stri_length <- function(str)
{
    .Call(C_stri_length, str)
}


#' @title
#' Determine if a String is of Length Zero
#'
#' @description
#' This is the fastest way to find out
#' whether the elements of a character vector are empty strings.
#'
#' @details
#' Missing values are handled properly.
#'
#' @param str character vector or an object coercible to
#' @return Returns a logical vector of the same length as \code{str}.
#'
#' @examples
#' stri_isempty(letters[1:3])
#' stri_isempty(c(',', '', 'abc', '123', '\u0105\u0104'))
#' stri_isempty(character(1))
#'
#' @export
#' @family length
stri_isempty <- function(str)
{
    .Call(C_stri_isempty, str)
}


#' @title
#' Determine the Width of Code Points
#'
#' @description
#' Approximates the number of text columns the `cat()` function
#' might use to print a string using a mono-spaced font.
#'
#' @details
#' The Unicode standard does not formalize the notion of a character
#' width. Roughly based on \url{http://www.cl.cam.ac.uk/~mgk25/ucs/wcwidth.c},
#' \url{https://github.com/nodejs/node/blob/master/src/node_i18n.cc},
#' and UAX #11 we proceed as follows.
#' The following code points are of width 0:
#' \itemize{
#' \item code points with general category (see \link{stringi-search-charclass})
#' \code{Me}, \code{Mn}, and \code{Cf}),
#' \item \code{C0} and \code{C1} control codes (general category \code{Cc})
#' - for compatibility with the \code{\link{nchar}} function,
#' \item Hangul Jamo medial vowels and final consonants
#' (code points with enumerable property \code{UCHAR_HANGUL_SYLLABLE_TYPE}
#' equal to \code{U_HST_VOWEL_JAMO} or \code{U_HST_TRAILING_JAMO};
#' note that applying the NFC normalization with \code{\link{stri_trans_nfc}}
#' is encouraged),
#' \item ZERO WIDTH SPACE (U+200B),
#' }
#'
#' Characters with the \code{UCHAR_EAST_ASIAN_WIDTH} enumerable property
#' equal to \code{U_EA_FULLWIDTH} or \code{U_EA_WIDE} are
#' of width 2.
#'
#' Most emojis and characters with general category So (other symbols)
#' are of width 2.
#'
#' SOFT HYPHEN (U+00AD) (for compatibility with \code{\link{nchar}})
#' as well as any other characters have width 1.
#'
#' @param str character vector or an object coercible to
#' @return Returns an integer vector of the same length as \code{str}.
#'
#' @examples
#' stri_width(LETTERS[1:5])
#' stri_width(stri_trans_nfkd('\u0105'))
#' stri_width(stri_trans_nfkd('\U0001F606'))
#' stri_width( # Full-width equivalents of ASCII characters:
#'    stri_enc_fromutf32(as.list(c(0x3000, 0xFF01:0xFF5E)))
#' )
#' stri_width(stri_trans_nfkd('\ubc1f')) # includes Hangul Jamo medial vowels and final consonants
#' @export
#' @family length
#'
#' @references
#' \emph{East Asian Width} -- Unicode Standard Annex #11,
#' \url{https://www.unicode.org/reports/tr11/}
stri_width <- function(str)
{
    .Call(C_stri_width, str)
}