Nothing
#' Match quanteda objects against token types
#'
#' Developer function to match patterns in quanteda objects against token types.
#' @param x a list of character vectors, [dictionary] or collocations object
#' @inheritParams valuetype
#' @inheritParams pattern2id
#' @param concatenator the concatenation character that joins multi-word
#' expression in `types`
#' @inheritParams tokens_lookup
#' @param match_pattern select only single-word patterns or multi-word patterns
#' should be matched. If "any", it matches both single-word and multi-word patterns.
#' @return `object2fixed()` returns a list of character vectors of matched
#' types. `object2id()` returns a list of indices of matched types with
#' attributes. The "pattern" attribute records the indices of the matched patterns
#' in `x`; the "key" attribute records the keys of the matched patterns when `x` is
#' [dictionary].
#' @seealso [pattern2id()]
#' @keywords development internal
#' @export
object2id <- function(x, types, valuetype = c("glob", "fixed", "regex"),
case_insensitive = TRUE,
concatenator = "_", levels = 1,
match_pattern = c("any", "single", "multi"),
keep_nomatch = FALSE) {
if (is.dfm(x))
stop("dfm cannot be used as pattern")
types <- check_character(types, min_len = 0, max_len = Inf, strict = TRUE)
valuetype <- match.arg(valuetype)
case_insensitive <- check_logical(case_insensitive)
concatenator <- check_character(concatenator)
levels <- check_integer(levels, min = 1, max_len = Inf)
match_pattern <- match.arg(match_pattern)
keep_nomatch <- check_logical(keep_nomatch)
if (is.collocations(x)) {
if (nrow(x) == 0) return(list())
temp <- stri_split_charclass(x$collocation, "\\p{Z}")
names(temp) <- x$collocation
if (case_insensitive) {
result <- pattern2id(temp, types, valuetype = "fixed",
case_insensitive = TRUE)
} else {
temp <- lapply(temp, function(x) fastmatch::fmatch(x, types))
result <- temp[unlist(lapply(temp, function(x) all(!is.na(x))), use.names = FALSE)]
}
attr(result, "pattern") <- match(names(result), names(temp))
} else {
if (length(x) == 0) return(list())
if (is.dictionary(x)) {
x <- as.dictionary(x)
temp <- flatten_list(x, levels)
key <- names(temp)
temp <- split_values(temp, " ", concatenator)
} else if (is.list(x)) {
x <- lapply(x, function(x) check_character(x, min_len = 0, max_len = Inf, strict = TRUE))
temp <- x
names(temp) <- stri_c_list(x, " ")
} else {
x <- check_character(x, min_len = 0, max_len = Inf, strict = TRUE)
temp <- as.list(x)
names(temp) <- x
}
if (identical(match_pattern, "single")) {
temp <- temp[lengths(temp) == 1] # only single-word patterns
} else if (identical(match_pattern, "multi")) {
temp <- temp[lengths(temp) > 1] # only multi-word patterns
}
result <- pattern2id(temp, types, valuetype, case_insensitive, keep_nomatch)
# NOTE: need to return index in x?
attr(result, "pattern") <- match(names(result), names(temp))
if (is.dictionary(x))
attr(result, "key") <- key
}
return(result)
}
#' @rdname object2id
#' @keywords development internal
#' @export
#' @examples
#' types <- c("A", "AA", "B", "BB", "B_B", "C", "C-C")
#'
#' # dictionary
#' dict <- dictionary(list(A = c("a", "aa"),
#' B = c("BB", "B B"),
#' C = c("C", "C-C")))
#' object2fixed(dict, types)
#' object2fixed(dict, types, match_pattern = "single")
#' object2fixed(dict, types, match_pattern = "multi")
#'
#' # phrase
#' pats <- phrase(c("a", "aa", "zz", "bb", "b b"))
#' object2fixed(pats, types)
#' object2fixed(pats, types, keep_nomatch = TRUE)
object2fixed <- function(x, types, valuetype = c("glob", "fixed", "regex"),
case_insensitive = TRUE,
concatenator = "_", levels = 1,
match_pattern = c("any", "single", "multi"),
keep_nomatch = FALSE) {
temp <- object2id(x, types, valuetype, case_insensitive, concatenator,
levels, match_pattern, keep_nomatch)
result <- lapply(temp, function(x) types[x])
return(result)
}
# temporary duplication to avoid breaking existing packages (quanteda.textstats)
pattern2list <- object2id
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.