audubon: Japanese Text Processing Tools

Documented in collapse_tokens

#' Collapse sequences of tokens by condition
#'
#' Joins every run of consecutive tokens that satisfy `condition` into a
#' single token. Runs are detected separately within each combination of
#' 'doc_id' and 'sentence_id', and 'token_id' is renumbered from 1 within
#' each of those combinations.
#'
#' Note that this function drops all columns except 'token' and the
#' columns used for grouping sequences, so the returned data.frame
#' has only 'doc_id', 'sentence_id', 'token_id', and 'token' columns.
#'
#' @param tbl A tidy text dataset with 'doc_id', 'sentence_id',
#' 'token_id', and 'token' columns.
#' @param condition <[`data-masked`][rlang::args_data_masking]>
#' A logical expression evaluated against `tbl`; consecutive tokens
#' for which it is `TRUE` are merged.
#' @param .collapse String placed between tokens when they are
#' concatenated.
#' @returns A data.frame.
#' @export
#' @examples
#' \donttest{
#' df <- prettify(head(hiroba), col_select = "POS1")
#' collapse_tokens(df, POS1 == "\u540d\u8a5e")
#' }
collapse_tokens <- function(tbl,
                            condition,
                            .collapse = "") {
  # Number consecutive runs of equal values: c(5, 5, 2, 5) -> c(1, 1, 2, 3).
  run_index <- function(x) {
    runs <- rle(x)
    rep(seq_along(runs$values), runs$lengths)
  }

  by_sentence <- dplyr::group_by(tbl, .data$doc_id, .data$sentence_id)

  # Tokens matching `condition` all receive the key 0L, while every other
  # token keeps its own token_id, so only stretches of adjacent matches
  # form runs longer than one element.
  # NOTE(review): assumes token_id is never 0 and does not repeat on
  # adjacent rows of a sentence -- confirm against the tokenizer output.
  keyed <- dplyr::mutate(
    by_sentence,
    run_key = dplyr::if_else({{ condition }}, 0L, .data$token_id)
  )

  # Replace token_id with the run number inside each sentence.
  renumbered <- dplyr::mutate(
    keyed,
    token_id = run_index(.data$run_key)
  )

  by_run <- dplyr::group_by(
    renumbered,
    .data$doc_id,
    .data$sentence_id,
    .data$token_id
  )

  # Empty and missing tokens are discarded before the remaining tokens of
  # a run are pasted together with `.collapse`.
  collapsed <- dplyr::reframe(
    by_run,
    token = stringi::stri_c(
      stringi::stri_remove_empty_na(.data$token),
      collapse = .collapse
    )
  )

  # A run that ends up as an empty string is reported as NA instead.
  dplyr::mutate(collapsed, token = dplyr::na_if(.data$token, ""))
}