R/utils.R

Defines functions get_transition_cost is.blank.character is.blank.default is.blank is_blank as_tokens

Documented in as_tokens get_transition_cost is_blank

#' Create a list of tokens
#'
#' @param tbl A tibble of tokens out of \code{tokenize()}.
#' @param token_field <[`data-masked`][rlang::args_data_masking]>
#' Column containing tokens.
#' @param pos_field Column containing features
#' that will be kept as the names of tokens.
#' If you don't need them, give a `NULL` for this argument.
#' @param nm Names of returned list.
#' If left with `NULL`, "doc_id" field of `tbl` is used instead.
#' @returns A named list of tokens.
#' @export
#' @examples
#' \dontrun{
#' tokenize(
#'   data.frame(
#'     doc_id = seq_along(5:8),
#'     text = ginga[5:8]
#'   )
#' ) |>
#'   prettify(col_select = "POS1") |>
#'   as_tokens()
#' }
as_tokens <- function(tbl,
                      token_field = "token",
                      pos_field = get_dict_features()[1],
                      nm = NULL) {
  token_field <- as_name(enquo(token_field))
  col_names <- as_name("doc_id")

  if (is.null(nm)) {
    if (is.factor(tbl[[col_names]])) {
      nm <- levels(tbl[[col_names]])
    } else {
      nm <- unique(tbl[[col_names]])
    }
  }

  if (is.null(pos_field)) {
    tbl[[token_field]] %>%
      split(tbl[[col_names]]) %>%
      rlang::set_names(nm)
  } else {
    rlang::set_names(tbl[[token_field]], tbl[[pos_field]]) %>%
      split(tbl[[col_names]]) %>%
      rlang::set_names(nm)
  }
}

#' Check if scalars are blank
#'
#' @param x Object to check its emptiness.
#' @param trim Logical. If passed as `TRUE`
#' and the object is a character vector,
#' `stringi::stri_trim()` is applied before checking.
#' @param ... Additional arguments for \code{base::sapply()}.
#' @returns Logicals.
#' @export
#' @examples
#' is_blank(list(c(a = "", b = NA_character_), NULL))
is_blank <- function(x, trim = TRUE, ...) {
  if (!is.list(x)) {
    if (is.null(x)) {
      return(TRUE)
    }
    if (is.character(x) && trim) x <- stringi::stri_trim(x)
    is.blank(x)
  } else {
    if (length(x) == 0) {
      return(TRUE)
    }
    sapply(x, is_blank, trim = trim, ...)
  }
}

is.blank <- function(x) {
  UseMethod("is.blank", x)
}

#' @export
is.blank.default <- function(x) {
  is.na(x) | is.nan(x)
}

#' @export
is.blank.character <- function(x) {
  is.na(x) | stringi::stri_isempty(x)
}

#' Get transition cost between pos attributes
#'
#' @inherit transition_cost
#' @inheritParams transition_cost
#' @export
get_transition_cost <- function(rcAttr,
                                lcAttr,
                                sys_dic = "",
                                user_dic = "") {
  dict <- dictionary_info(sys_dic, user_dic)
  if (rlang::is_empty(dict)) {
    rlang::abort("Can't find dictionaries.", class = "gbs_missing_dict")
  }
  rcAttr <- as.integer(rcAttr)
  lcAttr <- as.integer(lcAttr)
  size <- min(dict[["lsize"]], dict[["rsize"]])
  if (min(rcAttr, lcAttr) < 0 || rcAttr > size || lcAttr > size) {
    rlang::abort("rcAttr and/or lcAttr is invalid.")
  }
  transition_cost(rcAttr, lcAttr, sys_dic, user_dic)
}
paithiov909/gibasa documentation built on June 14, 2025, 4:31 p.m.