R/RcppExports.R

# Generated by using Rcpp::compileAttributes() -> do not edit by hand
# Generator token: 10BE3573-1514-4C36-9D1C-5A225CD40393
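
# The unexported wrappers below are thin .Call() bindings to the compiled
# C++ routines registered by Rcpp; they are presumably reached through the
# package's exported functions rather than called directly by users.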

get_kgram_prefix <- function(line, N, dict, EOS) {
    .Call(`_sbo_get_kgram_prefix`, line, N, dict, EOS)
}

get_pc_ptr <- function(object) {
    .Call(`_sbo_get_pc_ptr`, object)
}

get_word_freqsC <- function(text) {
    .Call(`_sbo_get_word_freqsC`, text)
}

kgram_freqs_cpp <- function(sentences, N, dict) {
    .Call(`_sbo_kgram_freqs_cpp`, sentences, N, dict)
}

kgram_freqs_fast_cpp <- function(input, N, dict, erase = "[^.?!:;'\\w\\s]", lower_case = TRUE, EOS = ".?!:;") {
    .Call(`_sbo_kgram_freqs_fast_cpp`, input, N, dict, erase, lower_case, EOS)
}
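
# Hedged sketch, for illustration only (not part of the generated bindings):
# calling the unexported k-gram frequency builder directly via `:::`. The toy
# corpus and dictionary are assumptions, and kgram_freqs_fast_cpp is assumed
# to accept raw text, since it carries its own erase/lower_case/EOS arguments.
corpus <- "Hello world. Hello again!"
dict <- c("hello", "world", "again")
freqs <- sbo:::kgram_freqs_fast_cpp(corpus, N = 2, dict = dict)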

predict_sbo_predictor <- function(ptr_sexp, input) {
    .Call(`_sbo_predict_sbo_predictor`, ptr_sexp, input)
}

#' Preprocess text corpus
#'
#' A simple text preprocessing utility.
#'
#' @export
#'
#' @author Valerio Gherardi
#' @md
#'
#' @param input a character vector.
#' @param erase a length one character vector. Regular expression matching the
#' parts of text to be erased from the input. The default removes anything that
#' is not alphanumeric, white space, an apostrophe, or one of the punctuation
#' characters ".?!:;".
#' @param lower_case a length one logical vector. If TRUE, the output is
#' converted to lower case.
#' @return a character vector containing the processed output.
#' @examples
#' preprocess("Hi @@ there! I'm using `sbo`.")
preprocess <- function(input, erase = "[^.?!:;'\\w\\s]", lower_case = TRUE) {
    .Call(`_sbo_preprocess`, input, erase, lower_case)
}
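
# Illustrative usage (a minimal sketch, not part of the generated bindings),
# showing the documented defaults and the two optional arguments:
preprocess("Hi @ there! I'm using `sbo`.")           # strip "@" and backticks, lower-case
preprocess("Keep CASE intact.", lower_case = FALSE)  # leave casing untouched
preprocess("Strip ALL punctuation!", erase = "[^\\w\\s]")  # custom erase pattern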

#' Sentence tokenizer
#'
#' Extract sentence tokens from text.
#'
#' @export
#'
#' @author Valerio Gherardi
#' @md
#'
#' @param input a character vector.
#' @param EOS a length one character vector listing all (single character)
#' end-of-sentence tokens.
#' @return a character vector, each entry of which corresponds to a single
#' sentence.
#' @examples
#' tokenize_sentences("Hi there! I'm using `sbo`.")
tokenize_sentences <- function(input, EOS = ".?!:;") {
    .Call(`_sbo_tokenize_sentences`, input, EOS)
}
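
# Illustrative usage (a minimal sketch, not part of the generated bindings):
tokenize_sentences("Hi there! I'm using sbo. Are you?")  # split on the default ".?!:;"
tokenize_sentences("alpha; beta; gamma", EOS = ";")      # custom end-of-sentence set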
