# R/tokenizers.R

# man page ----------

#' quanteda tokenizers
#'
#' Internal methods for tokenization providing default and legacy methods for
#' text segmentation.
#' @name tokenize_internal
#' @rdname tokenize_internal
#' @aliases tokenize
#' @param x (named) character; input texts
#' @details
#' Each of the word tokenizers corresponds to a major version of \pkg{quanteda},
#' kept here for backward compatibility and comparison.  `tokenize_word3()` is
#' identical to `tokenize_word2()`.
#' 
#' @return a list of characters corresponding to the (most conservative)
#'   tokenization, including whitespace where applicable; except for
#'   `tokenize_word1()`, which is a special tokenizer for Internet language that
#'   includes URLs, #hashtags, @usernames, and email addresses.
#' @keywords tokens internal
#' @examples
#' \dontrun{
#' txt <- c(doc1 = "Tweet https://quanteda.io using @quantedainit and #rstats.",
#'          doc2 = "The £1,000,000 question.",
#'          doc4 = "Line 1.\nLine2\n\nLine3.",
#'          doc5 = "?",
#'          doc6 = "Self-aware machines! \U0001f600",
#'          doc7 = "Qu'est-ce que c'est?")
#' tokenize_word2(txt)
#' tokenize_word2(txt, split_hyphens = FALSE)
#' tokenize_word1(txt, split_hyphens = FALSE)
#' tokenize_word4(txt, split_hyphens = FALSE, split_elisions = TRUE)
#' tokenize_fasterword(txt)
#' tokenize_fastestword(txt)
#' tokenize_sentence(txt)
#' tokenize_character(txt[2])
#' }
NULL

# default v2 and v3 tokenizer ----------

#' @rdname tokenize_internal
#' @importFrom stringi stri_replace_all_regex stri_detect_fixed stri_split_boundaries
#' @export
tokenize_word2 <- function(x, split_hyphens = FALSE, verbose = quanteda_options("verbose"), ...) {

    if (verbose) catm(" ...segmenting into words\n")
    m <- names(x)
    x[is.na(x)] <- "" # make NAs ""

    # this will not be needed if we can modify the ICU type rules to protect them
    # remove variant selector & whitespace with diacritical marks
    x <- stri_replace_all_regex(x, c("[\uFE00-\uFE0F]", "\\s[\u0300-\u036F]"), "",
                                vectorize_all = FALSE)

    structure(stri_split_boundaries(x, type = "word"), names = m)
}

#' @rdname tokenize_internal
#' @export
tokenize_word3 <- tokenize_word2

preserve_special <- function(x, split_hyphens = TRUE, split_tags = TRUE, verbose = FALSE) {

    name <- names(x)
    x <- as.character(x)

    hyphen <- "[\\p{Pd}]"
    username <- quanteda_options("pattern_username")
    hashtag <- quanteda_options("pattern_hashtag")
    # preserve web addresses and email addresses
    address <- "(https?:\\/\\/(www\\.)?|@)[-a-zA-Z0-9@:%._\\+~#=]{1,256}\\.[a-z]{2,4}\\b([-a-zA-Z0-9@:%_\\+.~#?&//=]*)"

    regex <- address
    if (!split_hyphens) {
        if (verbose) catm(" ...preserving hyphens\n")
        regex <- c(regex, hyphen)
    }
    if (!split_tags) {
        if (verbose) catm(" ...preserving social media tags (#, @)\n")
        regex <- c(regex, username, hashtag)
    }

    s <- stri_extract_all_regex(x, paste(regex, collapse = "|"),  omit_no_match = TRUE)
    r <- lengths(s)
    s <- unlist_character(s, use.names = FALSE)

    # index specials
    u <- unique(s)
    u <- u[order(stri_length(u), decreasing = TRUE)] # substitute longer match first
    index <- split(rep(seq_along(x), r), factor(s, levels = u))
    if (length(index)) {
        special <- paste0("\u100000", seq_along(index), "\u100001")
        names(special) <- names(index)
        for (i in seq_along(index)) {
            x[index[[i]]] <- stri_replace_all_fixed(
                x[index[[i]]],
                names(special)[i],
                special[i],
                vectorize_all = FALSE
            )
        }
    } else {
        special <- character()
    }
    structure(x, names = name, special = special)
}
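
# Illustrative sketch of the placeholder substitution above (not run; assumes
# the quanteda namespace so that quanteda_options() and unlist_character() are
# available; the input text is hypothetical):
if (FALSE) {
    txt <- c(d1 = "Visit https://quanteda.io and ask @quantedainit about #rstats")
    out <- preserve_special(txt, split_hyphens = FALSE, split_tags = FALSE)
    # the URL, @username, and #hashtag in the text are replaced by numbered
    # placeholder strings delimited by the "\u100000"/"\u100001" markers
    attr(out, "special")  # named vector mapping each original string to its placeholder
}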

restore_special <- function(x, special, recompile = TRUE) {

    if (!length(special))
        return(x)

    type <- attr(x, "types")
    # extract all placeholders
    d <- stri_extract_all_regex(type, "\u100000\\d+\u100001", omit_no_match = TRUE)
    r <- lengths(d)
    d <- unlist_character(d, use.names = FALSE)

    # index placeholders
    index <- split(rep(seq_along(type), r), factor(d, levels = unique(d)))
    if (length(index)) {
        pos <- fastmatch::fmatch(names(index), special)
        for (i in seq_along(index)) {
            type[index[[i]]] <- stri_replace_all_fixed(
                type[index[[i]]],
                special[pos[i]],
                names(special)[pos[i]],
                vectorize_all = FALSE
            )
        }
    }
    if (!identical(type, attr(x, "types"))) {
        attr(x, "types") <- type
    }
    return(x)
}
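
# Minimal round-trip sketch for the two helpers above (not run; uses a stand-in
# list with a "types" attribute in place of a real tokens object, and a
# hypothetical input URL):
if (FALSE) {
    pre <- preserve_special(c(d1 = "see https://quanteda.io"),
                            split_hyphens = FALSE, split_tags = FALSE)
    sp <- attr(pre, "special")                       # "https://quanteda.io" -> placeholder
    toks <- structure(list(1L), types = unname(sp))  # types still hold the placeholder
    attr(restore_special(toks, sp), "types")         # the original URL is restored
}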

# v4 tokenizer ---------

#' @rdname tokenize_internal
#' @export
tokenize_word4 <- function(x, split_hyphens = FALSE, split_tags = FALSE, split_elisions = FALSE,
                           verbose = quanteda_options("verbose"), ...) {
    
    rules <- breakrules_get("word")
    if (!split_hyphens) {
        if (verbose) catm(" ...preserving hyphens\n")
    } else {
        rules[["keep_hyphens"]] <- NULL
    }
    if (!split_elisions) {
        # NOTE: consider splitting in tokens.tokens()
        if (verbose) catm(" ...preserving elisions\n")
        rules[["split_elisions"]] <- NULL
    }

    username <- quanteda_options("pattern_username")
    hashtag <- quanteda_options("pattern_hashtag")
    
    ftp <- "s?ftp://[-+a-zA-Z0-9@#:.%~=_&/]+"
    http <- "(https?://)?(www.)?[-a-zA-Z0-9]+(\\.[-a-zA-Z0-9]+)+([/?#][-+a-zA-Z0-9@#:.%~=_&]+)*[/?#]?"
    email <- "[-+a-zA-Z0-9_.]+@[-a-zA-Z0-9]+(\\.[-a-zA-Z0-9]+)*\\.[a-z]+"
    regex <- c(email, ftp, http)
    if (!split_tags) {
        if (verbose) catm(" ...preserving social media tags (#, @)\n")
        regex <- c(regex, username, hashtag)
        x <- stri_replace_all_regex(x, paste(regex, collapse = "|"), "\uE001$0\uE002")
        rules[["split_tags"]] <- NULL
    }
    # if (!split_tags) {
    #     if (verbose) catm(" ...preserving social media tags (#, @)\n")
    #     # NOTE: default patterns are protected by keep_tags
    #     rules[["keep_usernames"]] <- paste0(stri_replace_all_fixed(quanteda_options("pattern_username"), "@", "\\@"), ";")
    #     rules[["keep_hashtags"]] <- paste0(stri_replace_all_fixed(quanteda_options("pattern_hashtag"), "#", "\\#"), ";")
    # } else {
    #     rules[["keep_tags"]] <- NULL
    # }
    tokenize_custom(x, rules)
}
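
# Quick illustration (not run; requires the package's break rules to have been
# initialised, e.g. via breakrules_reset("word"); the input is hypothetical):
if (FALSE) {
    tokenize_word4(c(d1 = "Email info@example.com about #rstats"), split_tags = FALSE)
    # the e-mail address and the hashtag come through as single spans, wrapped
    # in the \uE001/\uE002 markers used above for downstream handling
}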

#' Customizable tokenizer
#'
#' Allows users to tokenize texts using customized boundary rules. See the [ICU
#' website](https://unicode-org.github.io/icu/userguide/boundaryanalysis/break-rules.html)
#' for how to define boundary rules.
#' @param x character vector for texts to tokenize
#' @param rules a list of rules for rule-based boundary detection
#' @details The package contains internal sets of break rules for word and
#'   sentence boundary detection.  The `base` rules are copied from the ICU
#'   library; the other rules are defined by the package maintainers in
#'   `system.file("breakrules/breakrules_custom.yml")`.
#'   
#'   This function allows modification of those rules, and applies them as a new
#'   tokenizer.
#'   
#'   Custom word rules:
#'   \describe{
#'   \item{`base`}{ICU's rules for detecting word/sentence boundaries}
#'   \item{`keep_hyphens`}{quanteda's rule for preserving hyphens}
#'   \item{`keep_url`}{quanteda's rule for preserving URLs}
#'   \item{`keep_email`}{quanteda's rule for preserving emails}
#'   \item{`keep_tags`}{quanteda's rule for preserving tags}
#'   \item{`split_elisions`}{quanteda's rule for splitting elisions}
#'   \item{`split_tags`}{quanteda's rule for splitting tags}
#'   }
#'
#' @return `tokenize_custom()` returns a list of characters containing tokens.
#' @importFrom stringi stri_split_boundaries
#' @source
#' <https://raw.githubusercontent.com/unicode-org/icu/main/icu4c/source/data/brkitr/rules/word.txt>
#' 
#' <https://raw.githubusercontent.com/unicode-org/icu/main/icu4c/source/data/brkitr/rules/sent.txt>
#' @examples
#' lis <- tokenize_custom("a well-known http://example.com", rules = breakrules_get("word"))
#' tokens(lis, remove_separators = TRUE)
#' @export
tokenize_custom <- function(x, rules) {
    x[is.na(x)] <- ""
    rule <- paste0(unlist(rules), collapse = "\n")
    structure(stri_split_boundaries(x, type = rule), names = names(x))
}
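
# Sketch of rule customisation using the rule names documented above (not run):
if (FALSE) {
    rules <- breakrules_get("word")
    rules$keep_hyphens <- NULL   # drop the rule so that hyphenated words are split
    tokenize_custom("a well-known example", rules)
}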

#' @rdname tokenize_custom
#' @description
#' Tools for custom word and sentence breakrules, to retrieve, set, or reset
#' them to package defaults.
#' @keywords internal
#' @export
#' @param what character; which set of rules to return, one of `"word"` or
#'   `"sentence"`
#' @returns `breakrules_get()` returns the existing break rules as a list.
#' @examples
#' breakrules_get("word")
#' breakrules_get("sentence")
#' 
breakrules_get <- function(what = c("word", "sentence")) {
    what <- match.arg(what)
    if (what == "word") {
        global$breakrules_word
    } else if (what == "sentence") {
        global$breakrules_sentence
    }
}

#' @rdname tokenize_custom
#' @keywords internal
#' @export
#' @returns `breakrules_set()` returns nothing but reassigns the global
#'   breakrules to `x`.
#' @examples
#' brw <- breakrules_get("word")
#' brw$keep_email <- "@[a-zA-Z0-9_]+"
#' breakrules_set(brw, what = "word")
breakrules_set <- function(x, what = c("word", "sentence")) {
    what <- match.arg(what)
    if (what == "word") {
        global$breakrules_word <- x
    } else if (what == "sentence") {
        global$breakrules_sentence <- x
    }
}

#' @rdname tokenize_custom
#' @keywords internal
#' @export
#' @returns `breakrules_reset()` returns nothing but reassigns the global
#'   breakrules to the system defaults.  These rules are defined in
#'   `system.file("breakrules/")`.
#' @examples
#' breakrules_reset("sentence")
#' breakrules_reset("word")
breakrules_reset <- function(what = c("word", "sentence")) {
    what <- match.arg(what)
    if (what == "word") {
        global$breakrules_word <-
            c(list(base = paste0(readLines(system.file("breakrules/breakrules_word.txt", package = "quanteda")),
                                 collapse = "\n")),
              yaml::read_yaml(system.file("breakrules/breakrules_custom.yml", package = "quanteda")))
    } else if (what == "sentence") {
        global$breakrules_sentence <-
            list(base = paste0(readLines(system.file("breakrules/breakrules_sent.txt", package = "quanteda")),
                               collapse = "\n"))
    }
}
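
# Workflow sketch combining the three helpers above (not run): modify the word
# rules, use them, then restore the packaged defaults.
if (FALSE) {
    rules <- breakrules_get("word")
    rules$keep_url <- NULL                 # e.g. drop URL preservation
    breakrules_set(rules, what = "word")   # subsequent tokenization uses the modified rules
    breakrules_reset("word")               # restore the defaults shipped with the package
}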


# legacy tokenizers ----------

#' @rdname tokenize_internal
#' @inheritParams tokens
#' @importFrom stringi stri_detect_regex stri_detect_charclass
#'   stri_replace_all_regex stri_detect_fixed stri_replace_all_fixed
#' @export
tokenize_word1 <- function(x, split_hyphens = FALSE, verbose = quanteda_options("verbose"), ...) {

    m <- names(x)
    x[is.na(x)] <- "" # make NAs ""

    # remove variant selector & whitespace with diacritical marks
    x <- stri_replace_all_regex(x, c("[\uFE00-\uFE0F]", "\\s[\u0300-\u036F]"), "",
                                vectorize_all = FALSE)

    if (verbose) catm(" ...segmenting into words\n")
    x <- stri_split_boundaries(x, type = "word", locale = quanteda_options("tokens_locale"))
    structure(x, names = m)
}

# substitutions to preserve hyphens and tags
preserve_special1 <- function(x, split_hyphens = TRUE, split_tags = TRUE, verbose = FALSE) {
    if (!split_hyphens) {
        if (verbose) catm(" ...preserving hyphens\n")
        x <- stri_replace_all_regex(x, "(\\w)\\p{Pd}+", "$1_hy_")
    }
    if (!split_tags) {
        if (verbose) catm(" ...preserving social media tags (#, @)\n")
        x <- stri_replace_all_fixed(x, c("#", "@"), c("_ht_", "_as_"), vectorize_all = FALSE)
    }
    return(x)
}

# re-substitute the replacement hyphens and tags
restore_special1 <- function(x, split_hyphens = TRUE, split_tags = TRUE) {
    type <- attr(x, "types")
    if (!split_hyphens)
        type <- stri_replace_all_fixed(type, "_hy_", "-")
    if (!split_tags)
        type <- stri_replace_all_fixed(type, c("_ht_", "_as_"), c("#", "@"),
                                       vectorize_all = FALSE)
    if (!identical(type, attr(x, "types"))) {
        attr(x, "types") <- type
    }
    return(x)
}
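
# Behaviour sketch for the legacy substitutions above (not run; hypothetical input):
if (FALSE) {
    preserve_special1("a well-known @user and #tag",
                      split_hyphens = FALSE, split_tags = FALSE)
    # returns "a well_hy_known _as_user and _ht_tag"; restore_special1() later
    # maps the _hy_/_ht_/_as_ markers in the token types back to "-", "#", and "@"
}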

#' @rdname tokenize_internal
#' @importFrom stringi stri_split_boundaries
#' @export
tokenize_character <- function(x, ...) {
    stri_split_boundaries(x, type = "character", simplify = FALSE)
}

#' @rdname tokenize_internal
#' @importFrom stringi stri_replace_all_regex stri_replace_all_fixed
#'   stri_split_boundaries stri_trim_right
#' @export
tokenize_sentence <- function(x, verbose = FALSE, ...) {
    if (verbose) catm(" ...segmenting into sentences.\n")
    m <- names(x)
    x <- stri_replace_all_fixed(x, "\n", " ") # TODO consider removing
    x <- stri_split_boundaries(x, type = "sentence", locale = quanteda_options("tokens_locale"))
    x <- lapply(x, function(y) if (length(y)) stri_trim_right(y) else "")
    structure(x, names = m)
}

#' @rdname tokenize_internal
#' @importFrom stringi stri_split_regex
#' @export
tokenize_fasterword <- function(x, ...) {
    stri_split_regex(x, "[\\p{Z}\\p{C}]+")
}

#' @rdname tokenize_internal
#' @importFrom stringi stri_split_regex
#' @export
tokenize_fastestword <- function(x, ...) {
    stri_split_regex(x, " ")
}


normalize_characters <- function(x) {
    # convert the dreaded "curly quotes" to ASCII equivalents
    x <- stri_replace_all_fixed(x,
                                c("\u201C", "\u201D", "\u201F",
                                  "\u2018", "\u201B", "\u2019"),
                                c("\"", "\"", "\"",
                                  "\'", "\'", "\'"), vectorize_all = FALSE)

    # normalize dashes: convert em/en dashes and similar characters to double
    # hyphens, then reduce single dash characters to "-" and set off runs of
    # two or more hyphens as " - "
    x <- stri_replace_all_fixed(x, c("\u2012", "\u2013", "\u2014", "\u2015", "\u2053"), "--",
                                vectorize_all = FALSE)
    x <- stri_replace_all_regex(x, c("\\p{Pd}", "\\p{Pd}{2,}"), c("-", " - "),
                                vectorize_all = FALSE)

    return(x)
}
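
# Quick illustration (not run; hypothetical input):
if (FALSE) {
    normalize_characters("\u201Cfancy quotes\u201D and a dash\u2014here")
    # the curly quotes become straight ASCII quotes, and the em dash is
    # replaced by a simple hyphen set off by spaces
}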
