# Get SOTU addresses from "The Programming Historian"
library(tidyverse)
library(stringr)
base_url <- "https://programminghistorian.org/assets/basic-text-processing-in-r"
ns <- 1:236 %>%   # 236 addresses, through Obama's last
  str_pad(width = 3, side = "left", pad = "0")
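# Quick check (optional): ids should be zero-padded to three characters,
# matching the numbered file names in the lesson's sotu_text folder.
head(ns, 3)   # "001" "002" "003"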
sotu <- vector("list", length(ns))
for (i in seq_along(ns)) {
  url <- paste0(base_url, "/sotu_text/", ns[i], ".txt")
  sotu[[i]] <- paste(readLines(url), collapse = "\n")
}
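# Optional check: all 236 addresses should now be in memory, one string each.
length(sotu)   # 236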
cdr_sotu_meta <- read_csv(sprintf("%s/%s", base_url, "metadata.csv")) %>%
  mutate(doc_id = as.character(row_number())) %>%
  select(doc_id, president:sotu_type)
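# The lesson's metadata.csv is assumed to list one row per address, in the same
# order as the numbered text files (the bind_cols() below relies on this), with
# president, year, party and sotu_type fields; a quick look:
dplyr::glimpse(cdr_sotu_meta)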
# Combine texts and metadata into a TIF-style corpus (doc_id + text columns).
cdr_sotu_corpus <- tibble(text = unlist(sotu)) %>%
  bind_cols(cdr_sotu_meta) %>%
  select(doc_id:sotu_type, text)
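# spacyr::spacy_parse() accepts a TIF-compliant data frame (doc_id and text
# columns), which cdr_sotu_corpus now provides; confirm one row per address.
nrow(cdr_sotu_corpus)   # 236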
library(spacyr)
spacy_initialize()
# Problem: the original text structure (line and paragraph breaks) is lost here.
cdr_sotu_ann <- cdr_sotu_corpus %>%
  corpuslingr::clr_prep_corpus(hyphenate = TRUE) %>%
  spacyr::spacy_parse(tag = TRUE)
#Replace "qq"s before pushing.
cdr_sotu_ann$lemma <- gsub("qq", "-", cdr_sotu_ann$lemma)
cdr_sotu_ann$token <- gsub("qq", "-", cdr_sotu_ann$token)
#cdr_sotu_ann <- corpuslingr::clr_set_corpus (cdr_sotu_ann)
class(cdr_sotu_ann) <- c("spacyr_parsed", "data.frame")
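# With tag = TRUE, spacy_parse() returns a token-level data frame (doc_id,
# sentence_id, token_id, token, lemma, pos, tag and entity columns); a quick look:
dplyr::glimpse(cdr_sotu_ann)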
setwd("C:\\Users\\jason\\Google Drive\\GitHub\\packages\\sotuAnn")
#Output
# usethis::use_data_raw()
usethis::use_data(cdr_sotu_ann, overwrite = TRUE)
usethis::use_data(cdr_sotu_meta, overwrite = TRUE)
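# The saved data sets can later be loaded from the installed package (assumed
# to be named "sotuAnn", per the path above), e.g.:
# data("cdr_sotu_ann", package = "sotuAnn")
# data("cdr_sotu_meta", package = "sotuAnn")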