#' Install nltk if you don't have it
#'
#' While reticulate is capable of binding to any Python environment available on
#' a system, it’s much more straightforward for users if there is a common
#' environment used by R packages with convenient high-level functions provided
#' for installation. We therefore strongly recommend that R package developers
#' use the approach described here.
#'
#' https://rstudio.github.io/reticulate/articles/package.html#installing-python-dependencies
#'
#' @param method installation method, passed to `reticulate::py_install()`. Defaults to "auto"
#' @param conda path to a conda executable, passed to `reticulate::py_install()`. Defaults to "auto"
#'
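#' @examples
#' \dontrun{
#' # a minimal sketch: install nltk into the default reticulate environment
#' install_nltk()
#' }
#'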
#' @export
#'
install_nltk <- function(method = "auto", conda = "auto") {
reticulate::py_install("nltk", method = method, conda = conda)
}
#' Download nltk data
#'
#' By default this downloads the Open Multilingual Wordnet 1.4 ("omw-1.4") and
#' WordNet 2022 ("wordnet2022") datasets via `nltk.download`. You can
#' optionally use the GUI downloader interface to select whichever datasets
#' you want instead.
#'
#' @param method installation method; refer to the reticulate documentation. Defaults to "auto"
#' @param conda conda environment; refer to the reticulate documentation. Defaults to "auto"
#' @param gui logical; optionally run the GUI downloader to select specific datasets.
#'   Note that running the GUI does _not_ install the omw-1.4 or wordnet2022 datasets.
#'
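#' @examples
#' \dontrun{
#' # a minimal sketch; assumes install_nltk() has already been run
#' # download the default omw-1.4 and wordnet2022 datasets
#' get_nltk_data()
#' # or pick datasets interactively
#' get_nltk_data(gui = TRUE)
#' }
#'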
#' @export
#'
get_nltk_data <- function(method = "auto", conda = "auto", gui = FALSE) {
  # import once and reuse, rather than importing for each download
  nltk <- reticulate::import("nltk")
  if (gui) {
    nltk$download()
  } else {
    nltk$download("omw-1.4")
    nltk$download("wordnet2022")
  }
}
#' Synsets function from WordNet
#'
#' Returns NLTK's WordNet `synsets` lookup function, imported lazily via
#' reticulate. The returned function can be called as `synsets("word", pos = "n")`.
#'
#' @return the `synsets` function from `nltk.corpus.wordnet`
#'
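#' @examples
#' \dontrun{
#' # a minimal sketch; assumes install_nltk() and get_nltk_data() have been run
#' wn_synsets <- synsets()
#' wn_synsets("dog", pos = "n")
#' }
#'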
#' @export
synsets <- function() {
  # import lazily so Python is only initialised on first use;
  # the synsets function lives on the wordnet corpus reader in nltk.corpus
  reticulate::import("nltk.corpus", delay_load = TRUE)$wordnet$synsets
}
#' Get translations of individual words from nltk
#'
#' This will match synsets where the requested word and part of speech match.
#' The first two lemmas in the target language are taken from each matching
#' synset, then the first four unique translations are selected. This means
#' there is a fairly strong tendency towards the most common two or three
#' synsets. For more information on synsets, see the
#' Open Multilingual Wordnet documentation.
#'
#' @param token an English word to pass to nltk's synset search
#' @param part_of_speech one of "v" (verb), "a" (adjective), "n" (noun), or "r" (adverb)
#' @param language a three-letter language code; check the list of options with
#'   `word.lists::nltk_languages`. Defaults to "jpn" (Japanese)
#'
#' @return a character vector of length one, with translations separated by " || "
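#' @examples
#' \dontrun{
#' # a minimal sketch; assumes nltk and its wordnet data are installed
#' get_translation("dog", "n", language = "jpn")
#' }
#'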
#' @export
#'
get_translation <- function(token, part_of_speech, language = "jpn") {
  # get the first two lemmas of a given language
  # for every synset that contains the token and matches the part_of_speech
  synsets <- synsets()
  if (!language %in% word.lists::nltk_languages$lang) {
    stop(
      glue::glue("Language not available.
                 Available languages are:
                 {stringr::str_flatten(word.lists::nltk_languages$lang, collapse = ' || ')}"))
  }
  if (!is.character(token)) {
    token <- as.character(token)
  }
  # look the token up once and reuse the result
  matches <- synsets(token, pos = part_of_speech)
  if (length(matches) == 0) {
    NA_character_
  } else {
    purrr::map(
      seq_along(matches),
      ~matches[[.x]]$lemma_names(lang = language)[1:2]
    ) %>%
      # drop anything that did not return a value
      # drop results containing capitals (they are probably unwanted proper nouns)
      # return the first four unique entries
      purrr::discard(is.null) %>%
      unlist() %>%
      unique() %>%
      stringr::str_remove("[A-Z].+") %>%
      dplyr::na_if("") %>%
      purrr::discard(is.na) %>%
      utils::head(4) %>%
      stringr::str_flatten(collapse = " || ") %>%
      stringr::str_replace_all("_", " ")
  }
}
#' Get English definitions of individual words from nltk
#'
#' This will match up to three meanings where the requested word and part of
#' speech match. Each definition is truncated at its first semicolon, and the
#' first three unique definitions are returned. For more information on
#' synsets, see the Open Multilingual Wordnet documentation.
#'
#' @param token an English word to pass to nltk's synset search
#' @param part_of_speech one of "v" (verb), "a" (adjective), "n" (noun), or "r" (adverb)
#'
#' @return a character vector of length one, with definitions separated by ";\\n"
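#' @examples
#' \dontrun{
#' # a minimal sketch; assumes nltk and its wordnet data are installed
#' get_definition("dog", "n")
#' }
#'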
#' @export
#'
get_definition <- function(token, part_of_speech) {
  synsets <- synsets()
  # look the token up once and reuse the result
  matches <- synsets(token, pos = part_of_speech)
  if (length(matches) == 0) {
    NA_character_
  } else {
    purrr::map(
      seq_along(matches),
      ~matches[[.x]]$definition()
    ) %>%
      # drop anything that did not return a value
      # truncate each definition at its first semicolon
      # return the first three unique entries
      purrr::discard(is.null) %>%
      unlist() %>%
      unique() %>%
      stringr::str_remove("(?=;).+") %>%
      dplyr::na_if("") %>%
      purrr::discard(is.na) %>%
      utils::head(3) %>%
      stringr::str_flatten(collapse = ";\n ") %>%
      stringr::str_replace_all("_", " ")
  }
}
#' Make wordlists from udpipe-annotated dataframes
#'
#' This will add the first four unique translations that match the words and
#' parts of speech provided. Anything that is not a verb, adjective, noun, or
#' adverb will return a blank, as will anything that does not have a
#' translation in OMW for the language requested. Proper nouns are not looked
#' up; udpipe does a great job of parsing parts of speech, with the exception
#' of Title Case, so consider pre-processing if you have a lot of Title Case.
#' This is intended to be used in conjunction with the word frequency lists to
#' aid in building wordlists for ESL teachers.
#'
#' @param data a dataframe containing doc_id, sentence_id, token_id, token, lemma,
#'   upos, and sentence columns. Should be piped from udpipe.
#' @param language three-character abbreviation of the language to find translations for.
#'   Refer to `word.lists::nltk_languages`.
#' @param def if TRUE (the default), add a definition column; anything else will leave it out
#'
#' @return a dataframe
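#' @examples
#' \dontrun{
#' # a sketch assuming the udpipe package and an English model are available;
#' # udpipe::udpipe() will download the model on first use
#' anno <- udpipe::udpipe("The quick brown fox jumps over the lazy dog.",
#'                        object = "english")
#' get_wordlist(anno, language = "jpn")
#' }
#'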
#' @export
#'
get_wordlist <- function(data, language, def = TRUE) {
  # silence R CMD check notes about non-standard evaluation
  doc_id <- lemma <- pos <- sentence <- sentence_id <- token <- token_id <- upos <- NULL
  # map udpipe universal POS tags to wordnet part-of-speech codes
  pos_trans <- tibble::tribble(
    ~upos, ~pos,
    "AUX", "v",
    "ADJ", "a",
    "NOUN", "n",
    "VERB", "v",
    "ADV", "r"
  )
  base <- data %>%
    dplyr::select(doc_id, sentence_id, token_id, token, lemma, upos, sentence) %>%
    dplyr::left_join(pos_trans, by = "upos")
  # split off rows with no wordnet-compatible part of speech
  nope <- base %>%
    dplyr::filter(is.na(pos))
  yep <- base %>%
    dplyr::filter(!is.na(pos))
  res <- yep %>%
    dplyr::mutate(translation = purrr::map2_chr(token, pos, ~get_translation(.x, .y, language = language)))
  if (isTRUE(def)) {
    res <- res %>%
      dplyr::mutate(definition = purrr::map2_chr(token, pos, ~get_definition(.x, .y)))
  }
  res %>%
    dplyr::bind_rows(nope) %>%
    dplyr::mutate(token_id = as.numeric(token_id)) %>%
    dplyr::arrange(doc_id, sentence_id, token_id) %>%
    dplyr::filter(upos != "PUNCT") %>%
    # move the sentence column to the end
    dplyr::relocate(sentence, .after = tidyselect::last_col())
}