#' Tokenize sentences using 'MeCab'
#'
#' @param x A data.frame-like object or a character vector to be tokenized.
#' @param text_field <[`data-masked`][rlang::args_data_masking]>
#' String or symbol; column containing texts to be tokenized.
#' @param docid_field <[`data-masked`][rlang::args_data_masking]>
#' String or symbol; column containing document IDs.
#' @param sys_dic Character scalar; path to the system dictionary for 'MeCab'.
#' Note that the system dictionary is expected to be compiled with UTF-8,
#' not Shift-JIS or other encodings.
#' @param user_dic Character scalar; path to the user dictionary for 'MeCab'.
#' @param split Logical. When passed as `TRUE`, the function
#' internally splits each text into sub-sentences
#' using `stringi::stri_split_boundaries(type = "sentence")`.
#' @param partial Logical. When passed as `TRUE`, activates
#' partial parsing mode.
#' When using this feature, note that all leading and trailing spaces
#' of the input chunks are squashed beforehand, since trailing spaces
#' in particular can cause errors during parsing.
#' @param grain_size Positive integer.
#' This argument is internally passed to the `RcppParallel::parallelFor`
#' function. Setting a larger grain size can improve the performance
#' in some cases.
#' @param mode Character scalar to switch the output format;
#' `"parse"` returns a tibble and `"wakati"` returns a named list of tokens.
#' @returns A tibble or a named list of tokens.
#' @export
#' @examples
#' \dontrun{
#' df <- tokenize(
#'   data.frame(
#'     doc_id = seq_along(5:8),
#'     text = ginga[5:8]
#'   )
#' )
#' head(df)
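#'
#' # A sketch of the other output format: with `mode = "wakati"`,
#' # the same call returns a named list of tokens instead of a tibble.
#' res <- tokenize(
#'   data.frame(
#'     doc_id = seq_along(5:8),
#'     text = ginga[5:8]
#'   ),
#'   mode = "wakati"
#' )
#'
#' # With `split = TRUE`, each text is split into sentences beforehand
#' # and the result keeps sentence IDs that restart per document.
#' df_split <- tokenize(
#'   data.frame(
#'     doc_id = seq_along(5:8),
#'     text = ginga[5:8]
#'   ),
#'   split = TRUE
#' )
#' head(df_split)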
#' }
tokenize <- function(x,
                     text_field = "text",
                     docid_field = "doc_id",
                     sys_dic = "",
                     user_dic = "",
                     split = FALSE,
                     partial = FALSE,
                     grain_size = 1L,
                     mode = c("parse", "wakati")) {
  UseMethod("tokenize")
}
#' @export
tokenize.default <- function(x,
                             text_field = "text",
                             docid_field = "doc_id",
                             sys_dic = "",
                             user_dic = "",
                             split = FALSE,
                             partial = FALSE,
                             grain_size = 1L,
                             mode = c("parse", "wakati")) {
  mode <- rlang::arg_match(mode, c("parse", "wakati"))
  x <- dplyr::as_tibble(x)
  text_field <- enquo(text_field)
  docid_field <- enquo(docid_field)
  result <- tagger_impl(
    dplyr::pull(x, {{ text_field }}),
    dplyr::pull(x, {{ docid_field }}),
    sys_dic, user_dic, split, partial, grain_size
  )
  # if the document ID column is a factor, preserve its level ordering
  col_names <- as_name(docid_field)
  if (is.factor(x[[col_names]])) {
    col_u <- levels(x[[col_names]])
  } else {
    col_u <- unique(x[[col_names]])
  }
  # join the tokenized result back onto the remaining metadata columns
  tbl <- x %>%
    dplyr::select(-!!text_field) %>%
    dplyr::mutate(dplyr::across(!!docid_field, ~ factor(., col_u))) %>%
    dplyr::rename(doc_id = {{ docid_field }}) %>%
    dplyr::left_join(
      result,
      by = "doc_id"
    )
  if (!identical(mode, "wakati")) {
    return(tbl)
  }
  as_tokens(tbl, pos_field = NULL, nm = col_u)
}
#' @export
tokenize.character <- function(x,
                               text_field = "text",
                               docid_field = "doc_id",
                               sys_dic = "",
                               user_dic = "",
                               split = FALSE,
                               partial = FALSE,
                               grain_size = 1L,
                               mode = c("parse", "wakati")) {
  mode <- rlang::arg_match(mode, c("parse", "wakati"))
  # use the names of the vector as document IDs, falling back to positions
  nm <- names(x)
  if (is.null(nm)) {
    nm <- seq_along(x)
  }
  tbl <- tagger_impl(x, nm, sys_dic, user_dic, split, partial, grain_size)
  if (!identical(mode, "wakati")) {
    return(tbl)
  }
  as_tokens(tbl, pos_field = NULL, nm = nm)
}
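
# A minimal usage sketch for the character method (kept as a comment,
# since running it requires a compiled dictionary): the names of the
# input vector, when present, are used as the document IDs.
#
#   tokenize(c(doc1 = "こんにちは、世界。"), mode = "wakati")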
#' @noRd
tagger_impl <- function(sentences,
                        docnames,
                        sys_dic,
                        user_dic,
                        split,
                        partial,
                        grain_size) {
  # check if dictionaries are available.
  if (rlang::is_empty(dictionary_info(sys_dic, user_dic))) {
    rlang::abort("Can't find dictionaries.", class = "gbs_missing_dict")
  }
  # coerce grain_size to a positive integer, falling back to 1
  grain_size <- ifelse(grain_size > 0L, as.integer(grain_size), 1L)
  if (isTRUE(split)) {
    res <-
      stringi::stri_split_boundaries(sentences, type = "sentence") %>%
      rlang::as_function(~ {
        # map each sub-sentence back to its source document
        sizes <- lengths(.)
        posParallelRcpp(
          unlist(., use.names = FALSE),
          sys_dic,
          user_dic,
          partial,
          grain_size
        ) %>%
          dplyr::left_join(
            data.frame(
              doc_id = rep(docnames, sizes),
              sentence_id = seq_len(sum(sizes))
            ),
            by = "sentence_id"
          )
      })() %>%
      dplyr::mutate(
        # renumber sentence IDs so that they restart from 1 in each document
        sentence_id = dplyr::consecutive_id(.data$sentence_id),
        .by = "doc_id"
      ) %>%
      dplyr::relocate("doc_id", dplyr::everything())
  } else {
    res <-
      posParallelRcpp(sentences, sys_dic, user_dic, partial, grain_size) %>%
      dplyr::left_join(
        data.frame(
          sentence_id = seq_along(sentences),
          doc_id = docnames
        ),
        by = "sentence_id"
      ) %>%
      dplyr::relocate("doc_id", dplyr::everything())
  }
  res %>%
    dplyr::mutate(doc_id = factor(.data$doc_id, unique(.data$doc_id))) %>%
    dplyr::as_tibble()
}
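
# Illustration of the `split = TRUE` bookkeeping above, with assumed toy
# values (kept as a comment; no MeCab involved): if `docnames` is
# c("a", "b") and the texts split into 2 and 1 sub-sentences, then
# `sizes` is c(2L, 1L), `rep(docnames, sizes)` yields c("a", "a", "b"),
# and `dplyr::consecutive_id()` renumbers the sentence IDs within each
# document as 1, 2 and 1.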