# data-raw/build_sotu.R

# Get the State of the Union (SOTU) addresses from "The Programming Historian".

library(tidyverse)
library(stringr)
library(data.table)

# Base URL of the Programming Historian assets read by this script.
base_url <- "https://programminghistorian.org/assets/basic-text-processing-in-r"

# Zero-padded file stems "001" ... "236" (through Obama's last).
ns <- str_pad(1:236, width = 3, side = "left", pad = "0")

# Download every address and collapse its lines into one newline-separated
# string. `sotu` is an unnamed list with one character string per address, in
# the same order as `ns`. lapply() replaces the index loop so the list is not
# grown element by element and no loop variables are left behind.
sotu <- lapply(ns, function(stem) {
  txt_url <- paste0(base_url, "/sotu_text/", stem, ".txt")
  paste(readLines(txt_url), collapse = "\n")
})

# Metadata table for the addresses, read from metadata.csv under `base_url`.
cdr_sotu_meta <- read_csv(sprintf("%s/%s", base_url, "metadata.csv"))

# doc_id is the row position stored as a character string.
cdr_sotu_meta <- mutate(cdr_sotu_meta, doc_id = as.character(row_number()))

# Keep doc_id first, followed by the president-through-sotu_type columns.
cdr_sotu_meta <- select(cdr_sotu_meta, doc_id, president:sotu_type)


# Corpus table: one row per address, metadata columns followed by the text.
#
# Texts and metadata are combined by position, so both must describe the same
# number of addresses; fail early with a clear message if they do not.
stopifnot(length(sotu) == nrow(cdr_sotu_meta))

# Build the text column directly as a character data.frame. Calling melt() on
# a plain character vector only works through data.table's deprecated
# redirect to reshape2, which warns and needs reshape2 to be installed.
cdr_sotu_corpus <- data.frame(
  text = as.character(unlist(sotu)),
  stringsAsFactors = FALSE
) %>%
  bind_cols(cdr_sotu_meta) %>%
  select(doc_id:sotu_type, text)


#mutate_if(is.factor,as.character)

library(spacyr)
spacy_initialize()

# Problem is that we lose text structure here.

# Prepare the corpus with corpuslingr (hyphenate = TRUE), then tag and parse
# it with spaCy.
# NOTE(review): presumably clr_prep_corpus() encodes hyphens as "qq" so that
# hyphenated words survive tokenisation -- confirm against corpuslingr.
cdr_sotu_ann <- spacyr::spacy_parse(
  corpuslingr::clr_prep_corpus(cdr_sotu_corpus, hyphenate = TRUE),
  tag = TRUE
)

# Replace "qq"s before pushing: every "qq" in the token and lemma columns
# becomes "-".
cdr_sotu_ann[["token"]] <- gsub("qq", "-", cdr_sotu_ann[["token"]])
cdr_sotu_ann[["lemma"]] <- gsub("qq", "-", cdr_sotu_ann[["lemma"]])

#cdr_sotu_ann <- corpuslingr::clr_set_corpus (cdr_sotu_ann)
# Set the class vector explicitly to spacyr_parsed + data.frame.
cdr_sotu_ann <- structure(cdr_sotu_ann, class = c("spacyr_parsed", "data.frame"))

#Output
#devtools::use_data_raw()

# Save both objects as package data from inside the package directory.
# NOTE(review): this path is specific to one machine; adjust before running
# elsewhere.
# NOTE(review): use_data() now lives in usethis; switching would add a new
# package dependency, so the devtools call is kept here.
pkg_dir <- "C:\\Users\\jason\\Google Drive\\GitHub\\packages\\sotuAnn"

# setwd() returns the previous directory; restore it afterwards (also on
# error) so the script does not leave the session's working directory changed.
old_wd <- setwd(pkg_dir)
invisible(tryCatch(
  {
    devtools::use_data(cdr_sotu_ann, overwrite = TRUE)
    devtools::use_data(cdr_sotu_meta, overwrite = TRUE)
  },
  finally = setwd(old_wd)
))
# jaytimm/sotuAnn documentation built on May 30, 2019, 11:40 p.m.