# ABOUT -------------------------------------------------------------------
# Description: Acquire and curate the Brown Corpus
# Usage: Internet connection required
# Author: Jerid Francom
# Date: 2018-01-28
# SETUP -------------------------------------------------------------------
pacman::p_load(tidyverse, usethis, tadr)
# _ Functions -------------------------------------------------------------
parse_xml_doc <- function(file) {
  # Read one Brown Corpus TEI-XML document and return a tidy dataset with
  # the columns `document_id`, `category`, `words`, and `pos` — one row per
  # word or punctuation token.
  #
  # Args:
  #   file: path to a single Brown TEI-XML document (e.g. "a01.xml")
  # Returns:
  #   A tibble with columns `document_id`, `category`, `words`, `pos`.
  library(xml2) # error immediately if xml2 is missing (require() only returns FALSE)
  cat("Reading", basename(file), "... ") # status update
  doc <- read_xml(file) # read xml document
  xml_ns_strip(doc) # strips the namespace in place, simplifying XPath matching
  # Document-level attributes live on the single <text> element
  text_node <- xml_find_all(doc, "//text")
  document_id <-
    text_node %>%
    xml_attr("id") %>% # extract attribute (e.g. "A01")
    str_extract("\\d+") # keep only the digits
  category <-
    text_node %>%
    xml_attr("decls") # category code, joined to descriptions later
  # Token-level data: <w> (words) and <c> (punctuation) elements
  tokens <- xml_find_all(doc, "//w|//c")
  words <- xml_text(tokens) # token text
  pos <- xml_attr(tokens, "type") # part-of-speech tag
  data <- tibble(document_id, category, words, pos) # data_frame() is deprecated
  cat("Done.\n") # status update
  data # last expression is the return value
}
# RUN ---------------------------------------------------------------------
# Download Brown Corpus (BROWN) -------------------------------------------
# Download tei version (xml) http://www.nltk.org/nltk_data/
# NOTE(review): `get_compressed_data()` comes from the `tadr` package loaded
# in SETUP; it downloads the zip archive and extracts it into `target_dir`.
# Presumably it is a no-op when the target already exists — verify before
# re-running on a metered connection.
get_compressed_data(url = "https://raw.githubusercontent.com/nltk/nltk_data/gh-pages/packages/corpora/brown_tei.zip", target_dir = "data-raw/original/brown/")
# Tidy Brown language data ------------------------------------------------
# Collect the corpus file paths: corpus documents are named with a letter
# followed by digits (e.g. "a01"), which distinguishes them from Corpus.xml
files <- list.files(
  path = "data-raw/original/brown/", # directory holding the extracted files
  pattern = "^\\w\\d+",              # letter + digits filenames only
  full.names = TRUE                  # keep the full path for read_xml()
)
# Parse every document and stack the per-file tibbles into one dataset
brown_data <- bind_rows(map(files, parse_xml_doc))
# Tidy Brown meta-data ----------------------------------------------------
# Get category description information from the `Corpus.xml` file.
# NOTE: namespaces are NOT stripped here (unlike in parse_xml_doc), so
# nodes are addressed via the default `d1:` prefix that xml2 assigns.
doc <- read_xml(x = "data-raw/original/brown/Corpus.xml") # read xml
category_nodes <- xml_find_all(doc, "//d1:category") # one node per category
category <-
  xml_attr(category_nodes, "id") # category code (join key)
category_description <-
  xml_text(category_nodes, trim = TRUE) # human-readable label, whitespace trimmed
brown_categories <-
  tibble(category, category_description) # data_frame() is deprecated; use tibble()
# Join brown_data and brown_categories
brown <-
  left_join(brown_data, brown_categories, by = "category") %>% # explicit key avoids the implicit-join message and guards against schema drift
  select(document_id, category, category_description, words, pos) %>% # arrange column order
  as_tibble() # ensure tibble class for printing/packaging
# Write data to disk ------------------------------------------------------
# Write the curated dataset to the `data-raw/derived/` directory
write_csv(x = brown, file = "data-raw/derived/brown.csv") # `path` is deprecated in readr >= 1.4; use `file`
# Write the curated dataset to the `inst/extdata/` directory
write_csv(x = brown, file = "inst/extdata/brown.csv")
# Write the curated dataset to the `data/` directory as package data
use_data(brown, overwrite = TRUE) # overwrite so the script can be re-run to refresh data/brown.rda
# CLEANUP -----------------------------------------------------------------
# NOTE(review): `rm(list = ls())` only clears the global environment; it does
# not detach packages or reset options. When this script is run in a fresh
# session (e.g. via Rscript) it is unnecessary — consider removing.
rm(list = ls()) # clean up objects
# (Scraped page boilerplate, commented out — it is not R code and would
# otherwise be a parse error.)
# Add the following code to your website.
# For more information on customizing the embed code, read Embedding Snippets.