langdata: Practice Language Datasets

# NOTE(review): the counts in this block look inconsistent and should be
# verified against the shipped data: the description cites 1,155,866 tokenized
# words while @format reports 223,506 rows, and @format reports 11 variables
# while only 5 are described in the list below.
#' Brown Corpus
#'
#' A dataset containing 1,155,866 tokenized words from a sample of American
#' English, spanning 15 genre categories.
#'
#' @format A data frame with 223,506 rows and 11 variables:
#' \describe{
#'   \item{document_id}{ID for each corpus document}
#'   \item{category}{Label code for each of the 15 corpus categories}
#'   \item{category_description}{Description label for each corpus category}
#'   \item{words}{Tokenized words from the corpus}
#'   \item{pos}{Part-of-speech label for each word in the corpus}
#' }
#' @source \url{http://www.nltk.org/nltk_data/}
"brown"