R/parsing.R

Defines functions strip_multiple_spaces.data.frame strip_multiple_spaces.list strip_multiple_spaces.character strip_multiple_spaces strip_non_alphanum.data.frame strip_non_alphanum.list strip_non_alphanum.character strip_non_alphanum strip_numeric.data.frame strip_numeric.list strip_numeric.character strip_numeric strip_tags.data.frame strip_tags.list strip_tags.character strip_tags strip_punctuation.data.frame strip_punctuation.list strip_punctuation.character strip_punctuation stem_text.data.frame stem_text.list stem_text.character stem_text split_alphanum.data.frame split_alphanum.list split_alphanum.character split_alphanum .custom_filters .availabale_filters filter_rare.character filter_rare preprocess.data.frame preprocess.character preprocess strip_short.data.frame strip_short.list strip_short.character strip_short remove_stopwords.data.frame remove_stopwords.list remove_stopwords.character remove_stopwords stem_porter.porter_stemmer_model stem_porter porter_stemmer

Documented in filter_rare filter_rare.character porter_stemmer preprocess preprocess.character preprocess.data.frame remove_stopwords remove_stopwords.character remove_stopwords.data.frame remove_stopwords.list split_alphanum split_alphanum.character split_alphanum.data.frame split_alphanum.list stem_porter stem_porter.porter_stemmer_model stem_text stem_text.character stem_text.data.frame stem_text.list strip_multiple_spaces strip_multiple_spaces.character strip_multiple_spaces.data.frame strip_multiple_spaces.list strip_non_alphanum strip_non_alphanum.character strip_non_alphanum.data.frame strip_non_alphanum.list strip_numeric strip_numeric.character strip_numeric.data.frame strip_numeric.list strip_punctuation strip_punctuation.character strip_punctuation.data.frame strip_punctuation.list strip_short strip_short.character strip_short.data.frame strip_short.list strip_tags strip_tags.character strip_tags.data.frame strip_tags.list

#' Porter Stemmer
#' 
#' The Porter stemming algorithm (or ‘Porter stemmer’) is a process 
#' for removing the commoner morphological and inflexional endings 
#' from words in English. Its main use is as part of a term normalisation 
#' process that is usually done when setting up Information Retrieval systems.
#' 
#' @param stemmer A porter stemmer as returned by \code{\link{porter_stemmer}}.
#' @param text Text to parse.
#' 
#' @examples
#' \dontrun{
#' # create model
#' stemmer <- porter_stemmer()
#'
#' # stem
#' stemmer$stem("survey")
#' # or convenience method
#' stem_porter(stemmer, "survey")
#' }
#' 
#' @name porter_stemmer
#' @export
porter_stemmer <- function() {
  model <- gensim$parsing$porter$PorterStemmer()
  model <- structure(model, class = c("porter_stemmer_model", class(model)))
  invisible(model)
}

#' @rdname porter_stemmer
#' @export
stem_porter <- function(stemmer, text) UseMethod("stem_porter")

#' @rdname porter_stemmer
#' @method stem_porter porter_stemmer_model
#' @export
stem_porter.porter_stemmer_model <- function(stemmer, text){
  assert_that(!missing(text), msg = "Missing `text`.")

  purrr::map(text, stemmer$stem) %>% 
    purrr::map(reticulate::py_to_r) %>% 
    unlist()
}

#' Remove stopwords
#' 
#' Remove stopwords from a character string.
#' 
#' @param s A Character string or data.frame.
#' @param text bare name of text column.
#' @param ... Any other options.
#' 
#' @name remove_stopwords
#' 
#' @export
remove_stopwords <- function(s, ...) UseMethod("remove_stopwords")

#' @rdname remove_stopwords
#' @method remove_stopwords character
#' @export
remove_stopwords.character <- function(s, ...){
  s %>% 
    purrr::map(gensim$parsing$preprocessing$remove_stopwords) %>% 
    purrr::map(reticulate::py_to_r) %>% 
    unlist()
}

#' @rdname remove_stopwords
#' @method remove_stopwords list
#' @export
remove_stopwords.list <- function(s, ...){
  s %>% 
    purrr::map(gensim$parsing$preprocessing$remove_stopwords) %>% 
    purrr::map(reticulate::py_to_r) %>% 
    unlist()
}

#' @rdname remove_stopwords
#' @method remove_stopwords data.frame
#' @export
remove_stopwords.data.frame <- function(s, text, ...){
  assert_that(!missing(text), msg = "Missing `text`")
  s %>% 
    dplyr::pull(!!dplyr::enquo(text)) %>% 
    purrr::map(gensim$parsing$preprocessing$remove_stopwords) %>% 
    purrr::map(reticulate::py_to_r) %>% 
    unlist()
}

#' Strip Short Words
#' 
#' Remove words less than a certain length.
#' 
#' @param s A Character string or data.frame.
#' @param text bare name of text column.
#' @param min_len Minimum word length.
#' @param ... Any other options.
#' 
#' @name strip_short
#' 
#' @export
strip_short <- function(s, min_len = 5, ...) UseMethod("strip_short")

#' @rdname strip_short
#' @method strip_short character
#' @export
strip_short.character <- function(s, min_len = 5, ...){
  s %>% 
    purrr::map(gensim$parsing$preprocessing$strip_short, minsize = min_len) %>% 
    purrr::map(reticulate::py_to_r) %>% 
    unlist()
}

#' @rdname strip_short
#' @method strip_short list
#' @export
strip_short.list <- function(s, min_len = 5, ...){
  s %>% 
    purrr::map(gensim$parsing$preprocessing$strip_short, minsize = min_len) %>% 
    purrr::map(reticulate::py_to_r) %>% 
    unlist()
}

#' @rdname strip_short
#' @method strip_short data.frame
#' @export
strip_short.data.frame <- function(s, min_len = 5, text, ...){
  assert_that(!missing(text), msg = "Missing `text`")
  s %>% 
    dplyr::pull(!!dplyr::enquo(text)) %>% 
    purrr::map(gensim$parsing$preprocessing$strip_short, minsize = min_len) %>% 
    purrr::map(reticulate::py_to_r) %>% 
    unlist()
}

#' Preprocess text
#' 
#' Remove stopwords from a character string.
#' 
#' @param s A Character string or data.frame.
#' @param text bare name of text column.
#' @param filters Filters to apply, see filter section.
#' @param to_lower Whether to convert to lowercase before processing.
#' @param ... Any other options.
#' 
#' @section Filters:
#' \itemize{
#'   \item{\code{strip_tags}}
#'   \item{\code{strip_punctuation}}
#'   \item{\code{strip_multiple_spaces}}
#'   \item{\code{strip_numeric}}
#'   \item{\code{remove_stopwords}}
#'   \item{\code{strip_short}}
#'   \item{\code{stem_text}}
#' }
#' 
#' @name preprocess
#' 
#' @export
preprocess <- function(s, ..., 
  filters = c("strip_tags", "strip_punctuation", "strip_multiple_spaces", "strip_numeric",
    "remove_stopwords", "strip_short", "stem_text"), to_lower = TRUE) UseMethod("preprocess")

#' @rdname preprocess
#' @method preprocess character
#' @export
preprocess.character <- function(s, ...,
  filters = c("strip_tags", "strip_punctuation", "strip_multiple_spaces", "strip_numeric",
    "remove_stopwords", "strip_short", "stem_text"), to_lower = TRUE){

  custom_filters <- .custom_filters(filters) 

  if(to_lower)
    s <- tolower(s)

  s %>% 
    purrr::map(gensim$parsing$preprocessing$preprocess_string, custom_filters) %>% 
    purrr::map(reticulate::py_to_r)
}

#' @rdname preprocess
#' @method preprocess list
#' @export
preprocess.list <- preprocess.character

#' @rdname preprocess
#' @method preprocess data.frame
#' @export
preprocess.data.frame <- function(s, text, ...,
  filters = c("strip_tags", "strip_punctuation", "strip_multiple_spaces", "strip_numeric",
    "remove_stopwords", "strip_short", "stem_text"), to_lower = TRUE){
  assert_that(!missing(text), msg = "Missing `text`")

  custom_filters <- .custom_filters(filters) 

  s <- s %>% 
    dplyr::pull(!!dplyr::enquo(text))
  
  if(to_lower)
    s <- tolower(s)

  s %>% 
    purrr::map(gensim$parsing$preprocessing$preprocess_string, custom_filters) %>% 
    purrr::map(reticulate::py_to_r)
}

#' Filter Rarely
#' 
#' Filter rarely appearing keywords from documents.
#' 
#' @param s A Character string or a list.
#' @param min_freq Minimum term frequency to keep terms in. 
#' 
#' @name filter_rare
#' 
#' @export
filter_rare <- function(s, min_freq = 1) UseMethod("filter_rare")

#' @rdname filter_rare
#' @method filter_rare character
#' @export
filter_rare.character <- function(s, min_freq = 1){
  assert_that(min_freq >= 0)

  # count
  tf <- tibble::tibble(
    word = unlist(s)
  ) %>% 
    dplyr::count(word) %>% 
    dplyr::filter(n > min_freq) %>% 
    dplyr::pull(word)

  # predicate
  .keep <- function(x){
    x %in% tf
  }

  # map over lists
  purrr::map(
    s,
    function(x){
      purrr::keep(x, .keep)
    }
  )
}

#' @rdname filter_rare
#' @method filter_rare list
#' @export
filter_rare.list <- filter_rare.character

.availabale_filters <- function(){
  tibble::tibble(
    args = c("strip_tags", "strip_punctuation", "strip_multiple_spaces", "strip_numeric",
      "remove_stopwords", "strip_short", "stem_text"),
    func = c(
      gensim$parsing$preprocessing$strip_tags,
      gensim$parsing$preprocessing$strip_punctuation,
      gensim$parsing$preprocessing$strip_multiple_whitespaces,
      gensim$parsing$preprocessing$strip_numeric,
      gensim$parsing$preprocessing$remove_stopwords,
      gensim$parsing$preprocessing$strip_short,
      gensim$parsing$preprocessing$stem_text
    )
  )
}

.custom_filters <- function(filters){
  .availabale_filters() %>% 
    dplyr::filter(args %in% filters) %>%
    dplyr::pull(func) %>% 
    as.list()
}

#' Split Alphanumerics
#' 
#' Split Alphanumerics from a character string.
#' 
#' @param s A Character string or data.frame.
#' @param text bare name of text column.
#' @param ... Any other options.
#' 
#' @name split_alphanum
#' 
#' @export
split_alphanum <- function(s, ...) UseMethod("split_alphanum")

#' @rdname split_alphanum
#' @method split_alphanum character
#' @export
split_alphanum.character <- function(s, ...){
  s %>% 
    purrr::map(gensim$parsing$preprocessing$split_alphanum) %>% 
    purrr::map(reticulate::py_to_r) %>% 
    unlist()
}

#' @rdname split_alphanum
#' @method split_alphanum list
#' @export
split_alphanum.list <- function(s, ...){
  s %>% 
    purrr::map(gensim$parsing$preprocessing$split_alphanum) %>% 
    purrr::map(reticulate::py_to_r) %>% 
    unlist()
}

#' @rdname split_alphanum
#' @method split_alphanum data.frame
#' @export
split_alphanum.data.frame <- function(s, text, ...){
  assert_that(!missing(text), msg = "Missing `text`")
  s %>% 
    dplyr::pull(!!dplyr::enquo(text)) %>% 
    purrr::map(gensim$parsing$preprocessing$split_alphanum) %>% 
    purrr::map(reticulate::py_to_r) %>% 
    unlist()
}

#' Stem
#' 
#' Transform into lowercase and stem a character string.
#' 
#' @param s A Character string or data.frame.
#' @param text bare name of text column.
#' @param ... Any other options.
#' 
#' @name stem_text
#' 
#' @export
stem_text <- function(s, ...) UseMethod("stem_text")

#' @rdname stem_text
#' @method stem_text character
#' @export
stem_text.character <- function(s, ...){
  s %>% 
    purrr::map(gensim$parsing$preprocessing$stem_text) %>% 
    purrr::map(reticulate::py_to_r) %>% 
    unlist()
}

#' @rdname stem_text
#' @method stem_text list
#' @export
stem_text.list <- function(s, ...){
  s %>% 
    purrr::map(gensim$parsing$preprocessing$stem_text) %>% 
    purrr::map(reticulate::py_to_r) %>% 
    unlist()
}

#' @rdname stem_text
#' @method stem_text data.frame
#' @export
stem_text.data.frame <- function(s, text, ...){
  assert_that(!missing(text), msg = "Missing `text`")
  s %>% 
    dplyr::pull(!!dplyr::enquo(text)) %>% 
    purrr::map(gensim$parsing$preprocessing$stem_text) %>% 
    purrr::map(reticulate::py_to_r) %>% 
    unlist()
}

#' Strip Punctuation
#' 
#' Replace punctuation characters with spaces.
#' 
#' @param s A Character string or data.frame.
#' @param text bare name of text column.
#' @param ... Any other options.
#' 
#' @name strip_punctuation
#' 
#' @export
strip_punctuation <- function(s, ...) UseMethod("strip_punctuation")

#' @rdname strip_punctuation
#' @method strip_punctuation character
#' @export
strip_punctuation.character <- function(s, ...){
  s %>% 
    purrr::map(gensim$parsing$preprocessing$strip_punctuation) %>% 
    purrr::map(reticulate::py_to_r) %>% 
    unlist()
}

#' @rdname strip_punctuation
#' @method strip_punctuation list
#' @export
strip_punctuation.list <- function(s, ...){
  s %>% 
    purrr::map(gensim$parsing$preprocessing$strip_punctuation) %>% 
    purrr::map(reticulate::py_to_r) %>% 
    unlist()
}

#' @rdname strip_punctuation
#' @method strip_punctuation data.frame
#' @export
strip_punctuation.data.frame <- function(s, text, ...){
  assert_that(!missing(text), msg = "Missing `text`")
  s %>% 
    dplyr::pull(!!dplyr::enquo(text)) %>% 
    purrr::map(gensim$parsing$preprocessing$strip_punctuation) %>% 
    purrr::map(reticulate::py_to_r) %>% 
    unlist()
}

#' Strip Tags
#' 
#' Remove tags from character string.
#' 
#' @param s A Character string or data.frame.
#' @param text bare name of text column.
#' @param ... Any other options.
#' 
#' @name strip_tags
#' 
#' @export
strip_tags <- function(s, ...) UseMethod("strip_tags")

#' @rdname strip_tags
#' @method strip_tags character
#' @export
strip_tags.character <- function(s, ...){
  s %>% 
    purrr::map(gensim$parsing$preprocessing$strip_tags) %>% 
    purrr::map(reticulate::py_to_r) %>% 
    unlist()
}

#' @rdname strip_tags
#' @method strip_tags list
#' @export
strip_tags.list <- function(s, ...){
  s %>% 
    purrr::map(gensim$parsing$preprocessing$strip_tags) %>% 
    purrr::map(reticulate::py_to_r) %>% 
    unlist()
}

#' @rdname strip_tags
#' @method strip_tags data.frame
#' @export
strip_tags.data.frame <- function(s, text, ...){
  assert_that(!missing(text), msg = "Missing `text`")
  s %>% 
    dplyr::pull(!!dplyr::enquo(text)) %>% 
    purrr::map(gensim$parsing$preprocessing$strip_tags) %>% 
    purrr::map(reticulate::py_to_r) %>% 
    unlist()
}

#' Strip Numerics
#' 
#' Remove digits from character string.
#' 
#' @param s A Character string or data.frame.
#' @param text bare name of text column.
#' @param ... Any other options.
#' 
#' @name strip_numeric
#' 
#' @export
strip_numeric <- function(s, ...) UseMethod("strip_numeric")

#' @rdname strip_numeric
#' @method strip_numeric character
#' @export
strip_numeric.character <- function(s, ...){
  s %>% 
    purrr::map(gensim$parsing$preprocessing$strip_numeric) %>% 
    purrr::map(reticulate::py_to_r) %>% 
    unlist()
}

#' @rdname strip_numeric
#' @method strip_numeric list
#' @export
strip_numeric.list <- function(s, ...){
  s %>% 
    purrr::map(gensim$parsing$preprocessing$strip_numeric) %>% 
    purrr::map(reticulate::py_to_r) %>% 
    unlist()
}

#' @rdname strip_numeric
#' @method strip_numeric data.frame
#' @export
strip_numeric.data.frame <- function(s, text, ...){
  assert_that(!missing(text), msg = "Missing `text`")
  s %>% 
    dplyr::pull(!!dplyr::enquo(text)) %>% 
    purrr::map(gensim$parsing$preprocessing$strip_numeric) %>% 
    purrr::map(reticulate::py_to_r) %>% 
    unlist()
}

#' Strip Non Alphanumerics
#' 
#' Remove non-alphabetic characters from string.
#' 
#' @param s A Character string or data.frame.
#' @param text bare name of text column.
#' @param ... Any other options.
#' 
#' @name strip_non_alphanum
#' 
#' @export
strip_non_alphanum <- function(s, ...) UseMethod("strip_non_alphanum")

#' @rdname strip_non_alphanum
#' @method strip_non_alphanum character
#' @export
strip_non_alphanum.character <- function(s, ...){
  s %>% 
    purrr::map(gensim$parsing$preprocessing$strip_non_alphanum) %>% 
    purrr::map(reticulate::py_to_r) %>% 
    unlist()
}

#' @rdname strip_non_alphanum
#' @method strip_non_alphanum list
#' @export
strip_non_alphanum.list <- function(s, ...){
  s %>% 
    purrr::map(gensim$parsing$preprocessing$strip_non_alphanum) %>% 
    purrr::map(reticulate::py_to_r) %>% 
    unlist()
}

#' @rdname strip_non_alphanum
#' @method strip_non_alphanum data.frame
#' @export
strip_non_alphanum.data.frame <- function(s, text, ...){
  assert_that(!missing(text), msg = "Missing `text`")
  s %>% 
    dplyr::pull(!!dplyr::enquo(text)) %>% 
    purrr::map(gensim$parsing$preprocessing$strip_non_alphanum) %>% 
    purrr::map(reticulate::py_to_r) %>% 
    unlist()
}

#' Strip Multiple space
#' 
#' Remove repeating whitespace characters (spaces, tabs, line breaks) from s and turns tabs & line breaks into spaces.
#' 
#' @param s A Character string or data.frame.
#' @param text bare name of text column.
#' @param ... Any other options.
#' 
#' @name strip_multiple_spaces
#' 
#' @export
strip_multiple_spaces <- function(s, ...) UseMethod("strip_multiple_spaces")

#' @rdname strip_multiple_spaces
#' @method strip_multiple_spaces character
#' @export
strip_multiple_spaces.character <- function(s, ...){
  s %>% 
    purrr::map(gensim$parsing$preprocessing$strip_multiple_whitespaces) %>% 
    purrr::map(reticulate::py_to_r) %>% 
    unlist()
}

#' @rdname strip_multiple_spaces
#' @method strip_multiple_spaces list
#' @export
strip_multiple_spaces.list <- function(s, ...){
  s %>% 
    purrr::map(gensim$parsing$preprocessing$strip_multiple_whitespaces) %>% 
    purrr::map(reticulate::py_to_r) %>% 
    unlist()
}

#' @rdname strip_multiple_spaces
#' @method strip_multiple_spaces data.frame
#' @export
strip_multiple_spaces.data.frame <- function(s, text, ...){
  assert_that(!missing(text), msg = "Missing `text`")
  s %>% 
    dplyr::pull(!!dplyr::enquo(text)) %>% 
    purrr::map(gensim$parsing$preprocessing$strip_multiple_whitespaces) %>% 
    purrr::map(reticulate::py_to_r) %>% 
    unlist()
}
news-r/gensimr documentation built on Jan. 9, 2021, 5:55 a.m.