#' Get n-gram frequencies
#'
#' `ngram` downloads data from the Google Ngram Viewer website and
#' returns it in a tibble.
#'
#' @param phrases vector of phrases, with a maximum of 12 items
#' @param corpus Google corpus to search (see Details for possible values)
#' @param year_start start year, default is 1800. Data available back to 1500.
#' @param year_end end year, default is 2022
#' @param smoothing smoothing parameter, default is 3
#' @param case_ins Logical indicating whether to force a case insensitive search.
#' Default is `FALSE`.
#' @param aggregate Sum up the frequencies for ngrams associated with wildcard
#' or case-insensitive searches. Default is `FALSE`.
#' @param count Default is `FALSE`.
#' @param drop_parent Drop the parent phrase associated with a wildcard
#' or case-insensitive search. Default is `FALSE`.
#' @param drop_all Delete the suffix "(All)" from aggregated case-insensitive
#' searches. Default is `FALSE`.
#' @param type Include the Google return type (e.g. NGRAM, NGRAM_COLLECTION,
#' EXPANSION) in the result set. Default is `FALSE`.
#' @return `ngram` returns an object of class "`ngram`",
#' which is a tidyverse `tibble` enriched with attributes reflecting
#' some of the parameters used in the Ngram Viewer query.
#' @details
#' Google has generated three datasets drawn from digitised books in the
#' Google Books collection: the first in July 2009, the second in July 2012
#' and the third in 2019. Google is expected to update these datasets as book
#' scanning continues.
#'
#' This function provides the annual frequency of words or phrases, known
#' as n-grams, in a sub-collection or "corpus" taken from the Google Books
#' collection. The search across the corpus is case-sensitive.
#'
#' If the function is unable to retrieve data from the Google Ngram Viewer
#' site (either because of access issues or because the format of Google's
#' site has changed), a NULL result is returned and messages are printed to
#' the console, but no errors or warnings are raised (this is to comply
#' with CRAN package policies).
#'
#' Below is a list of available corpora. Note that the data for the 2012
#' corpora only extend to 2009.
#' \tabular{ll}{
#' \bold{Corpus} \tab \bold{Corpus Name}\cr
#' en-US-2019\tab American English 2019\cr
#' en-US-2012\tab American English 2012\cr
#' en-US-2009\tab American English 2009\cr
#' en-GB-2019\tab British English 2019\cr
#' en-GB-2012\tab British English 2012\cr
#' en-GB-2009\tab British English 2009\cr
#' zh-Hans-2019\tab Chinese 2019\cr
#' zh-Hans-2012\tab Chinese 2012\cr
#' zh-Hans-2009\tab Chinese 2009\cr
#' en-2019\tab English 2019\cr
#' en-2012\tab English 2012\cr
#' en-2009\tab English 2009\cr
#' en-fiction-2019\tab English Fiction 2019\cr
#' en-fiction-2012\tab English Fiction 2012\cr
#' en-fiction-2009\tab English Fiction 2009\cr
#' en-1M-2009\tab English One Million\cr
#' fr-2019\tab French 2019\cr
#' fr-2012\tab French 2012\cr
#' fr-2009\tab French 2009\cr
#' de-2019\tab German 2019\cr
#' de-2012\tab German 2012\cr
#' de-2009\tab German 2009\cr
#' iw-2019\tab Hebrew 2019\cr
#' iw-2012\tab Hebrew 2012\cr
#' iw-2009\tab Hebrew 2009\cr
#' es-2019\tab Spanish 2019\cr
#' es-2012\tab Spanish 2012\cr
#' es-2009\tab Spanish 2009\cr
#' ru-2019\tab Russian 2019\cr
#' ru-2012\tab Russian 2012\cr
#' ru-2009\tab Russian 2009\cr
#' it-2019\tab Italian 2019\cr
#' it-2012\tab Italian 2012\cr
#' }
#'
#' The Google Million is a sub-collection of Google Books. All of its books
#' are in English, with dates ranging from 1500 to 2008.
#' No more than about 6,000 books were chosen from any one year, which
#' means that all of the scanned books from early years are present,
#' and books from later years are randomly sampled. The random samplings
#' reflect the subject distributions for the year (so there are more
#' computer books in 2000 than in 1980).
#'
#' See \url{http://books.google.com/ngrams/info} for the full Ngram syntax.
#' @examples
#' \donttest{ngram(c("mouse", "rat"), year_start = 1950)
#' ngram(c("blue_ADJ", "red_ADJ"))
#' ngram(c("_START_ President Roosevelt", "_START_ President Truman"), year_start = 1920)
#' }
#' @export
ngram <- function(phrases, corpus = "en", year_start = 1800,
                  year_end = 2022, smoothing = 3, case_ins = FALSE,
                  aggregate = FALSE, count = FALSE,
                  drop_parent = FALSE, drop_all = FALSE, type = FALSE) {
  phrases <- ngram_check_phrases(phrases)
  # Loop over corpora, fetching each separately
  dfs <- lapply(corpus, function(corp) ngram_single(phrases, corpus = corp,
                                                    year_start = year_start,
                                                    year_end = year_end,
                                                    smoothing = smoothing,
                                                    case_ins = case_ins))
  ng <- bind_rows(dfs)
  if (length(ng) == 0) return(NULL)
  class(ng) <- c("ngram", class(ng))
  ng <- truncate_years(ng)
  if (aggregate) {
    ng <- filter(ng, .data$type != "EXPANSION")
  } else {
    ng <- filter(ng, .data$type %in% c("NGRAM", "EXPANSION"))
  }
  if (drop_parent || all(ng$Parent == "")) ng$Parent <- NULL
  if (drop_all) {
    ng <- mutate(ng,
                 Phrase = if_else(.data$type == "CASE_INSENSITIVE",
                                  stringr::str_replace(.data$Phrase,
                                                       "\\s*\\(All\\)\\z", ""),
                                  .data$Phrase))
  }
  attr(ng, "smoothing") <- smoothing
  attr(ng, "case_sensitive") <- !case_ins
  ng$Corpus <- as.factor(ng$Corpus)
  ng$Phrase <- as.factor(ng$Phrase)
  if (type) ng$Type <- ng$type
  ng$type <- NULL
  return(ng)
}
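
# Usage sketch (illustrative, not run): because of the lapply() above,
# `corpus` may be a vector, so
#   ngram("liberty", corpus = c("en-US-2019", "en-GB-2019"), year_start = 1900)
# returns a single stacked tibble covering both corpora.

# Fetch and tidy the frequency data for a single corpus.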
ngram_single <- function(phrases, corpus, year_start, year_end,
                         smoothing, case_ins) {
  if (!(corpus %in% corpuses$Shorthand)) {
    warning(paste(corpus, "is not a valid corpus. Defaulting to en-2019."))
    corpus <- "en-2019"
  }
  query <- as.list(environment())
  if (case_ins) query["case_insensitive"] <- "true"
  query$phrases <- NULL
  query$case_ins <- NULL
  ng_url <- ngram_url(phrases, query)
  html <- ngram_fetch_xml(ng_url)
  if (is.null(html)) {
    ng <- NULL
  } else {
    ng <- ngram_fetch_data(html)
    warnings <- ngram_check_warnings(html)
    show_warnings(warnings)
  }
  return(ng)
}
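
# Validate the phrases argument: drop empty strings, require balanced
# parentheses, and cap the query at Google's limit of 12 phrases.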
ngram_check_phrases <- function(phrases) {
  stopifnot(is.character(phrases))
  phrases <- phrases[phrases != ""]
  if (length(phrases) == 0) stop("No valid phrases provided.")
  if (!all(check_balanced(phrases))) stop("mis-matched parentheses")
  if (length(phrases) > 12) {
    phrases <- phrases[1:12]
    warning("Maximum number of phrases exceeded: only using first 12.")
  }
  return(phrases)
}
ngram_fetch_xml <- function(url) {
  # Retrieve data from the Google Ngram Viewer site.
  # No errors or warnings are generated on failure, only messages.
  try_get <- function(x, ...) {
    tryCatch(
      httr::GET(url = x, httr::timeout(3), ...),
      error = function(e) conditionMessage(e),
      warning = function(w) conditionMessage(w)
    )
  }
  is_response <- function(x) {
    inherits(x, "response")
  }
  # First check the internet connection
  if (!curl::has_internet()) {
    message("No internet connection.")
    return(invisible(NULL))
  }
  # Then try for timeout problems
  resp <- try_get(url)
  if (!is_response(resp)) {
    message("Please check Google's Ngram Viewer site is up.")
    message(resp)
    return(invisible(NULL))
  }
  # Then stop if the status code is >= 400
  if (httr::http_error(resp)) {
    message("Please check Google's Ngram Viewer site is up.")
    httr::message_for_status(resp)
    return(invisible(NULL))
  }
  return(xml2::read_html(resp))
}
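
# Scrape any warnings Google displays in its "warning-area" div. Each element
# of the returned list is a list(type = icon label, message = cleaned text),
# re-raised as R warnings by show_warnings() below.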
ngram_check_warnings <- function(html) {
  node <- xml2::xml_find_first(html, "//div[@id='warning-area']")
  warnings <- list()
  if (length(node) > 0) {
    for (n in xml2::xml_find_all(node, "div")) {
      type <- xml2::xml_text(xml2::xml_find_first(n, "mwc-icon"))
      msg <- stringr::str_trim(xml2::xml_text(xml2::xml_find_first(n, "span")))
      msg <- stringr::str_replace_all(msg, "\\s+", " ")
      msg <- stringr::str_replace(msg, "No valid ngrams to plot!",
                                  "No valid ngrams retrieved!")
      warnings <- c(warnings, list(list(type = type, message = msg)))
    }
  }
  return(warnings)
}
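
# Extract the n-gram timeseries embedded as JSON in the <script> tags
# following the chart div, along with the year range passed to drawD3Chart.
# Returns a tibble of Year/Phrase/Frequency rows, or NULL if parsing fails.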
ngram_fetch_data <- function(html) {
  data <- tryCatch(
    {
      if (is.null(html)) {
        NULL
      } else {
        corpus <- xml2::xml_find_first(html,
                                       "//select[@id='form-corpus']/option")
        corpus <- xml2::xml_attr(corpus, "value")
        if (grepl("^[0-9]+$", corpus, perl = TRUE)) {
          corpus <- get_corpus_text(as.numeric(corpus))
        }
        script <- xml2::xml_find_all(html,
                                     "//div[@id='chart']/following::script")[1]
        json <- xml2::xml_text(script)
        json <- stringr::str_split(json, "\n")[[1]]
        json <- json[json != ""]
        json <- stringr::str_squish(json)
        years <- xml2::xml_find_all(html,
                                    "//div[@id='chart']/following::script")[2]
        years <- xml2::xml_text(years)
        years <- stringr::str_split(years, "\n")[[1]]
        years <- grep("drawD3Chart", years, value = TRUE)
        years <- as.integer(stringr::str_split(years, ",")[[1]][2:3])
        data <- rjson::fromJSON(json)
        if (length(data) == 0) return(NULL)
        data <- lapply(data,
                       function(x) tibble::add_column(tibble::as_tibble(x),
                                                      Year = seq.int(years[1],
                                                                     years[2])))
        data <- bind_rows(data)
        data <- mutate(data,
                       ngram = textutils::HTMLdecode(.data$ngram),
                       Corpus = corpus)
        data <- relocate(data, "Year", "ngram", "timeseries", "Corpus")
        data <- rename(data, Phrase = "ngram", Frequency = "timeseries",
                       Parent = "parent")
        data
      }
    },
    error = function(cond) {
      message("Error parsing ngram data, please contact the package maintainer.")
      message("Here's the original error message:")
      message(cond)
      message("\nThe error occurred in the following call:")
      message(conditionCall(cond))
      return(NULL)
    },
    warning = function(cond) {
      message("Warning generated when parsing ngram data.")
      message("Here's the original warning message:")
      message(cond)
      return(NULL)
    }
  )
  return(data)
}
ngram_url <- function(phrases, query = character()) {
  url <- "https://books.google.com/ngrams/graph"
  for (i in seq_along(phrases)) {
    # Wrap phrases containing "+" or "/" in parentheses
    if (grepl("\\+|/", phrases[i])) phrases[i] <- paste0("(", phrases[i], ")")
    p <- phrases[i]
    if (!(Encoding(p) %in% c("unknown", "UTF-8"))) {
      phrases[i] <- iconv(p, Encoding(p), "UTF-8")
    }
  }
  phrases <- paste(curl::curl_escape(stringr::str_trim(phrases)),
                   collapse = "%2c")
  if (phrases == "") stop("No valid phrases provided.")
  url <- paste0(url, "?content=", phrases)
  if (length(query) > 0) url <- httr::modify_url(url, query = query)
  # Restore characters the Ngram Viewer expects unescaped
  url <- gsub("%28", "(", url)
  url <- gsub("%29", ")", url)
  url <- gsub("%20", "+", url)
  return(url)
}
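
# For example (illustrative, derived from the escaping rules above):
#   ngram_url(c("dog", "cat"))
#   #> "https://books.google.com/ngrams/graph?content=dog%2ccat"
#   ngram_url("dog house")
#   #> "https://books.google.com/ngrams/graph?content=dog+house"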
check_balanced <- function(x) {
  # Check parentheses are balanced (i.e. every open is closed, in order)
  sapply(x, function(str) {
    str <- gsub("[^\\(\\)]", "", str)
    str <- strsplit(str, "")[[1]]
    str <- ifelse(str == "(", 1, -1)
    all(cumsum(str) >= 0) && sum(str) == 0
  })
}
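
# e.g. check_balanced(c("(dog + cat)", "(dog")) returns c(TRUE, FALSE):
# every "(" must be closed, and no ")" may appear before its matching "(".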
show_warnings <- function(warnings) {
  for (w in warnings) {
    warning(w$message, call. = FALSE)
  }
}
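
# Map corpus shorthand names (e.g. "en-2019") to the numeric ids used in
# Google's URLs via the internal `corpuses` lookup table; unknown names fall
# back to `default` with a warning.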
get_corpus_n <- function(corpus, default = "en-2019") {
  stopifnot(is.character(corpus))
  n <- corpuses[corpus, "Number"]
  if (any(is.na(n)) && !is.na(default)) {
    if (is.character(default)) default <- get_corpus_n(default)
    stopifnot(default %in% corpuses$Number)
    invalid <- paste(corpus[is.na(n)], collapse = ", ")
    warning(paste0("Unknown corpus ", invalid,
                   ". Using default corpus instead."), call. = FALSE)
    n[is.na(n)] <- default
  }
  return(n)
}
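
# Inverse of get_corpus_n(): map numeric corpus ids back to shorthand names.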
get_corpus_text <- function(n, default = NA) {
  stopifnot(is.numeric(n))
  text <- row.names(corpuses)[match(n, corpuses$Number)]
  if (any(is.na(text)) && !is.na(default)) {
    if (is.numeric(default)) default <- get_corpus_text(default)
    stopifnot(default %in% row.names(corpuses))
    invalid <- paste(n[is.na(text)], collapse = ", ")
    warning(paste0("Unknown corpus ", invalid,
                   ". Using default corpus instead."), call. = FALSE)
    text[is.na(text)] <- default
  }
  return(text)
}
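
# Drop rows dated after the last year covered by each corpus (the 2012
# corpora, for example, stop at 2009), using the Last.Year column of the
# internal `corpuses` table.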
truncate_years <- function(ngram) {
  stopifnot(inherits(ngram, "ngram"))
  ngram$Corpus <- as.character(ngram$Corpus)
  ngram <- left_join(ngram,
                     select(corpuses, "Shorthand", "Last.Year"),
                     by = c("Corpus" = "Shorthand"))
  ngram <- filter(ngram, .data$Year <= .data$Last.Year)
  ngram$Last.Year <- NULL
  return(ngram)
}