# R/tokenizer.R

#' Spark NLP Tokenizer approach
#' 
#' Spark ML estimator that identifies tokens using open tokenization standards. A few rules help customize
#' it if the defaults do not fit the user's needs. See \url{https://nlp.johnsnowlabs.com/docs/en/annotators#tokenizer}
#' 
#' @template roxlate-nlp-algo
#' @template roxlate-inputs-output-params
#' @param exceptions String array. List of tokens not to alter at all. Allows composite tokens, such as two-word tokens, that the user may not want to split.
#' @param exceptions_path NOTE: NOT IMPLEMENTED. String. Path to a txt file with a list of token exceptions.
#' @param exceptions_path_read_as String. How the exceptions file should be read: either LINE_BY_LINE or SPARK_DATASET. Defaults to LINE_BY_LINE.
#' @param exceptions_path_options Named list of options to pass to the Spark reader. Defaults to \code{list("format" = "text")}.
#' @param case_sensitive_exceptions Boolean. Whether matching of exceptions in the text should be case sensitive.
#' @param context_chars String array. List of 1-character strings used to separate tokens from their surrounding context, such as parentheses or question marks.
#' @param split_chars String array. List of 1-character strings to strip off from tokens, such as parentheses or question marks. Ignored if prefix, infix or suffix patterns are used.
#' @param split_pattern String. Pattern used to separate from the inside of tokens. Takes priority over \code{split_chars}.
#' @param target_pattern String. Basic regex rule to identify a candidate for tokenization. Defaults to `\\S+`, which matches anything that is not whitespace.
#' @param suffix_pattern String. Regex to identify subtokens that are at the end of the token. The regex has to end with `\\z` and must contain groups (). Each group becomes a separate token within the suffix. Defaults to non-letter characters, e.g. quotes or parentheses.
#' @param prefix_pattern String. Regex to identify subtokens that come at the beginning of the token. The regex has to start with `\\A` and must contain groups (). Each group becomes a separate token within the prefix. Defaults to non-letter characters, e.g. quotes or parentheses.
#' @param infix_patterns String array. Regex patterns with groups that are added to the top of the rules (matched first, from the more specific to the more general).
#' 
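#' @examples
#' \dontrun{
#' # A minimal usage sketch (not run). It assumes an active Spark connection `sc`
#' # and that upstream annotators such as nlp_document_assembler() and
#' # nlp_sentence_detector() are available to produce the "sentence" annotation
#' # column expected as input; adjust stage names and arguments to your pipeline.
#' pipeline <- ml_pipeline(sc) %>%
#'   nlp_document_assembler(input_col = "text", output_col = "document") %>%
#'   nlp_sentence_detector(input_cols = c("document"), output_col = "sentence") %>%
#'   nlp_tokenizer(
#'     input_cols = c("sentence"),
#'     output_col = "token",
#'     split_chars = c("-"),
#'     case_sensitive_exceptions = FALSE
#'   )
#' }
#' 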
#' @export
nlp_tokenizer <- function(x, input_cols, output_col,
                 exceptions = NULL, exceptions_path = NULL, exceptions_path_read_as = "LINE_BY_LINE", 
                 exceptions_path_options = list("format" = "text"),
                 case_sensitive_exceptions = NULL, context_chars = NULL,
                 split_chars = NULL, split_pattern = NULL, target_pattern = NULL, suffix_pattern = NULL, prefix_pattern = NULL, 
                 infix_patterns = NULL,
                 uid = random_string("tokenizer_")) {
  UseMethod("nlp_tokenizer")
}

#' @export
nlp_tokenizer.spark_connection <- function(x, input_cols, output_col,
                                           exceptions = NULL, exceptions_path = NULL, exceptions_path_read_as = "LINE_BY_LINE", 
                                           exceptions_path_options = list("format" = "text"),
                                           case_sensitive_exceptions = NULL, context_chars = NULL,
                                           split_chars = NULL, split_pattern = NULL, target_pattern = NULL, suffix_pattern = NULL, prefix_pattern = NULL, 
                                           infix_patterns = NULL,
                                           uid = random_string("tokenizer_")) {
  args <- list(
    input_cols = input_cols,
    output_col = output_col,
    exceptions = exceptions,
    exceptions_path = exceptions_path,
    exceptions_path_read_as = exceptions_path_read_as,
    exceptions_path_options = exceptions_path_options,
    case_sensitive_exceptions = case_sensitive_exceptions,
    context_chars = context_chars,
    split_chars = split_chars,
    split_pattern = split_pattern,
    target_pattern = target_pattern,
    suffix_pattern = suffix_pattern,
    prefix_pattern = prefix_pattern,
    infix_patterns = infix_patterns,
    uid = uid
  ) %>% 
   validator_nlp_tokenizer()

  
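  # The Spark reader options arrive as an R list; converting the list to an
  # environment lets sparklyr pass it to the JVM as a map when setExceptionsPath
  # is invoked below.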
  if (!is.null(args[["exceptions_path_options"]])) {
    args[["exceptions_path_options"]] <- list2env(args[["exceptions_path_options"]])
  }
  
  jobj <- sparklyr::spark_pipeline_stage(
    x, "com.johnsnowlabs.nlp.annotators.Tokenizer",
    input_cols = args[["input_cols"]],
    output_col = args[["output_col"]],
    uid = args[["uid"]]
  ) %>%
    sparklyr::jobj_set_param("setExceptions", args[["exceptions"]]) %>%
    sparklyr::jobj_set_param("setCaseSensitiveExceptions", args[["case_sensitive_exceptions"]]) %>%
    sparklyr::jobj_set_param("setContextChars", args[["context_chars"]]) %>%
    sparklyr::jobj_set_param("setSplitChars", args[["split_chars"]]) %>%
    sparklyr::jobj_set_param("setSplitPattern", args[["split_pattern"]]) %>%
    sparklyr::jobj_set_param("setTargetPattern", args[["target_pattern"]]) %>%
    sparklyr::jobj_set_param("setSuffixPattern", args[["suffix_pattern"]]) %>%
    sparklyr::jobj_set_param("setPrefixPattern", args[["prefix_pattern"]]) %>%
    sparklyr::jobj_set_param("setInfixPatterns", args[["infix_patterns"]])
  
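  # setExceptionsPath takes extra arguments (a ReadAs value and reader options),
  # so it is invoked directly rather than through jobj_set_param; read_as() maps
  # the string to the corresponding Spark NLP ReadAs value.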
  if (!is.null(args[["exceptions_path"]])) {
    jobj <- sparklyr::invoke(jobj, "setExceptionsPath", args[["exceptions_path"]],
                             read_as(x, args[["exceptions_path_read_as"]]),
                             args[["exceptions_path_options"]])
  }

  new_nlp_tokenizer(jobj)
}

#' @export
nlp_tokenizer.ml_pipeline <- function(x, input_cols, output_col,
                             exceptions = NULL, exceptions_path = NULL, exceptions_path_read_as = "LINE_BY_LINE",
                             exceptions_path_options = list("format" = "text"),
                             case_sensitive_exceptions = NULL, context_chars = NULL,
                             split_chars = NULL, split_pattern = NULL, target_pattern = NULL, suffix_pattern = NULL, prefix_pattern = NULL, 
                             infix_patterns = NULL,
                             uid = random_string("tokenizer_")) {

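  # Delegate to the spark_connection method to build the stage, then append it
  # to the pipeline.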
  stage <- nlp_tokenizer.spark_connection(
    x = sparklyr::spark_connection(x),
    input_cols = input_cols,
    output_col = output_col,
    exceptions = exceptions,
    exceptions_path = exceptions_path,
    exceptions_path_read_as = exceptions_path_read_as,
    exceptions_path_options = exceptions_path_options,
    case_sensitive_exceptions = case_sensitive_exceptions,
    context_chars = context_chars,
    split_chars = split_chars,
    split_pattern = split_pattern,
    target_pattern = target_pattern,
    suffix_pattern = suffix_pattern,
    prefix_pattern = prefix_pattern,
    infix_patterns = infix_patterns,
    uid = uid
  )
  
  sparklyr::ml_add_stage(x, stage)
}

#' @export
nlp_tokenizer.tbl_spark <- function(x, input_cols, output_col,
                           exceptions = NULL, exceptions_path = NULL, exceptions_path_read_as = "LINE_BY_LINE", 
                           exceptions_path_options = list("format" = "text"),
                           case_sensitive_exceptions = NULL, context_chars = NULL,
                           split_chars = NULL, split_pattern = NULL, target_pattern = NULL, suffix_pattern = NULL, prefix_pattern = NULL, 
                           infix_patterns = NULL,
                           uid = random_string("tokenizer_")) {
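  # Build the estimator against the underlying connection, then fit it
  # immediately on the supplied table, returning a tokenizer model.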
  stage <- nlp_tokenizer.spark_connection(
    x = sparklyr::spark_connection(x),
    input_cols = input_cols,
    output_col = output_col,
    exceptions = exceptions,
    exceptions_path = exceptions_path,
    exceptions_path_read_as = exceptions_path_read_as,
    exceptions_path_options = exceptions_path_options,
    case_sensitive_exceptions = case_sensitive_exceptions,
    context_chars = context_chars,
    split_chars = split_chars,
    split_pattern = split_pattern,
    target_pattern = target_pattern,
    suffix_pattern = suffix_pattern,
    prefix_pattern = prefix_pattern,
    infix_patterns = infix_patterns,
    uid = uid
  )
  
  stage %>% 
    sparklyr::ml_fit(x)
}

#' @import forge
validator_nlp_tokenizer <- function(args) {
  # Input checking. Much of this can be factored out and composed with other
  # input checkers to avoid redundancy.
  args[["input_cols"]] <- cast_string_list(args[["input_cols"]])
  args[["output_col"]] <- cast_string(args[["output_col"]])
  args[["exceptions"]] <- cast_nullable_string_list(args[["exceptions"]])
  args[["exceptions_path"]] <- cast_nullable_string(args[["exceptions_path"]])
  args[["exceptions_path_read_as"]] <- cast_choice(args[["exceptions_path_read_as"]], choices = c("LINE_BY_LINE", "SPARK_DATASET"))
  args[["case_sensitive_exceptions"]] <- cast_nullable_logical(args[["case_sensitive_exceptions"]])
  args[["context_chars"]] <- cast_nullable_string_list(args[["context_chars"]])
  args[["split_chars"]] <- cast_nullable_string_list(args[["split_chars"]])
  args[["split_pattern"]] <- cast_nullable_string(args[["split_pattern"]])
  args[["target_pattern"]] <- cast_nullable_string(args[["target_pattern"]])
  args[["suffix_pattern"]] <- cast_nullable_string(args[["suffix_pattern"]])
  args[["prefix_pattern"]] <- cast_nullable_string(args[["prefix_pattern"]])
  args[["infix_patterns"]] <- cast_nullable_string_list(args[["infix_patterns"]])
  args
}

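# Constructors wrapping a Spark jobj: the estimator returned by nlp_tokenizer()
# and the fitted model produced when the estimator is fit to data.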
new_nlp_tokenizer <- function(jobj) {
  sparklyr::new_ml_estimator(jobj, class = "nlp_tokenizer")
}

new_nlp_tokenizer_model <- function(jobj) {
  sparklyr::new_ml_transformer(jobj, class = "nlp_tokenizer_model")
}