R/kuromoji.R

Defines functions kuromoji

Documented in kuromoji

#' Call kuromoji tokenizer
#'
#' @param chr Character vector to be tokenized.
#' @return A data.frame
#'
#' @export
kuromoji <- function(chr) {
  stopifnot(rlang::is_character(chr))

  # keep names
  nm <- names(chr)
  if (identical(nm, NULL)) {
    nm <- seq_along(chr)
  }

  purrr::imap_dfr(purrr::set_names(chr, nm), function(str, id) {
    str <- stringi::stri_replace_na(stringi::stri_enc_toutf8(str), "")
    tokens <- rJava::.jcall(
      Tokenizer(),
      "Ljava/util/List;",
      "tokenize",
      str
    )
    res <- lapply(tokens, function(elem) {
      surface <- elem$getSurfaceForm()
      feature <- elem$getAllFeatures()
      Encoding(surface) <- "UTF-8"
      Encoding(feature) <- "UTF-8"
      data.frame(
        doc_id = id,
        token = surface,
        feature = feature,
        # is_known = elem$isKnown(),
        is_unk = elem$isUnknown(),
        is_user = elem$isUser()
      )
    })
    purrr::map_dfr(res, ~.)
  })
}
paithiov909/tangela documentation built on Dec. 1, 2023, 2:52 a.m.