# R/get_dict.R

#' Get dictionary information for word
#'
#' Requests the Merriam-Webster dictionary page for `word` and scrapes the
#' part of speech, the first definition and the origin text from it.
#'
#' @param word Word to lookup in dictionary
#' @param delay If `TRUE`, pause for five seconds before returning so that
#'   repeated lookups are less likely to be blocked by the website. Defaults
#'   to `FALSE`.
#'
#' @return A one-row data frame with columns `word`, `type`, `definition` and
#'   `origin`. A field that is not found on the page is `NA`; all three
#'   scraped fields are `NA` when the response status is not 200.
#' @export
#'
get_dict <- function(word, delay = FALSE) {
  # throttle on every exit path: a failed lookup is still a query to the site
  if (delay) {
    on.exit(Sys.sleep(5), add = TRUE)
  }

  # percent-encode the word so that spaces and other reserved characters
  # still produce a valid URL path segment
  url <- paste0(
    "http://www.merriam-webster.com/dictionary/",
    utils::URLencode(enc2utf8(as.character(word)), reserved = TRUE)
  )
  ua_string <- "Googlebot/2.1 (+http://www.google.com/bot.html)"
  html <- rvest::html_session(url, httr::user_agent(ua_string))

  # if page doesn't load, skip; typed NAs keep the column types identical to
  # the successful case so rows can be bound together safely
  if (httr::status_code(html) != 200) {
    return(dplyr::tibble(
      word = word,
      type = NA_character_,
      definition = NA_character_,
      origin = NA_character_
    ))
  }

  # first scraped value, or a character NA when nothing was found
  first_or_na <- function(x) {
    if (length(x) > 0) x[[1]] else NA_character_
  }

  type_speech <- html %>%
    rvest::html_nodes(".main-attr em") %>%
    rvest::html_text() %>%
    gsub("[^[:alnum:] ]", "", .) %>%
    unique()

  definition <- html %>%
    rvest::html_node(".definition-inner-item span") %>%
    rvest::html_text() %>%
    # remove first colon if present and clean up extra white space
    stringr::str_replace(":", "") %>%
    stringr::str_trim() %>%
    gsub("\\s+", " ", .)

  origin <- html %>%
    rvest::html_nodes("div[class^=card-box]") %>%
    # keep only origin field
    .[grepl("origin", ., ignore.case = TRUE)] %>%
    rvest::html_node("p") %>%
    rvest::html_text()

  # combine info into data frame
  dplyr::tibble(
    word = word,
    type = first_or_na(type_speech),
    definition = first_or_na(definition),
    origin = first_or_na(origin)
  )
}

#' Get list of competition words
#'
#' Takes the `word_correct` column of `results`, drops duplicate entries and
#' sorts what remains.
#'
#' @param results Results from previous competitions, generated by [get_seasons]
#'
#' @return The unique values of `results$word_correct`, in the order produced
#'   by [dplyr::arrange()].
#' @export
#'
get_words <- function(results) {
  # select and rename in one step, leaving a single `word` column
  word_col <- dplyr::select(results, word = word_correct)
  distinct_words <- unique(word_col)
  sorted_words <- dplyr::arrange(distinct_words, word)

  sorted_words$word
}

#' Get dictionary info for a list of competition words
#'
#' Calls [get_dict] once per element of `words` with `delay = TRUE` and
#' row-binds the per-word results into a single data frame.
#'
#' @param words A vector produced by [get_words]
#'
#' @return A data frame made of the rows returned by [get_dict] for each word.
#' @export
#'
get_words_dict <- function(words) {
  # delay = TRUE asks get_dict to throttle its queries to the website
  lookup_with_delay <- function(word) get_dict(word, delay = TRUE)

  purrr::map_df(words, lookup_with_delay)
}
# bensoltoff/rspellingbee documentation built on May 12, 2019, 2:09 p.m.