# data-raw/slate.R

#devtools::install_github("jaytimm/corpuslingr")
library(sf)
library(corpuslingr)
library(tidyverse)
library(data.table)
library(spacyr)
spacy_initialize()

##Get texts/titles/etc. from Slate.

# Set file paths to raw data.
# The working directory is moved to the corpus root so that the relative
# paths returned by list.files() below can be handed straight to scan().
setwd("C:\\Users\\jason\\Google Drive\\GitHub\\packages\\corpusdatr\\slate")

# Recursively list the corpus text files. `pattern` is a regular expression,
# so the dot is escaped and the match is anchored at the end of the name;
# the previous ".txt" also matched any name containing "txt" preceded by an
# arbitrary character.
txts <- list.files(
  path = "C:\\Users\\jason\\Google Drive\\GitHub\\packages\\corpusdatr\\slate",
  recursive = TRUE,
  pattern = "\\.txt$"
)


# Get titles: read at most the first six lines of each file (whitespace
# stripped) and paste whatever was read into a single string.
# NOTE(review): the original note says the title is on line 5; presumably the
# other header lines contribute nothing -- confirm against the corpus files.
titles <- lapply(txts, function(x) {
  paste(
    scan(x, what = "char", sep = "\n", nlines = 6, strip.white = TRUE,
         quiet = TRUE),
    collapse = ""
  )
})


# Get approximate text length.
#
# x: path to a text file.
#
# The first six lines of the file are skipped and the remaining lines are
# read as UTF-8 and joined with single spaces. Any whitespace that directly
# follows another whitespace character is removed, as is leading and trailing
# whitespace, and the result is split on the space character.
#
# Returns the number of pieces produced by that split.
GetLength <- function(x) {
  body_lines <- scan(x, what = "char", sep = "\n", skip = 6,
                     encoding = "UTF-8", quiet = TRUE)
  body_text <- paste(body_lines, collapse = " ")
  squeezed <- gsub("(?<=[\\s])\\s*|^\\s+|\\s+$", "", body_text, perl = TRUE)
  pieces <- strsplit(squeezed, " ")[[1]]
  length(pieces)
}

# Approximate length of every listed text. melt() reshapes the list into the
# data frame whose `L1` and `value` columns are used in the join and filter
# below.
lengths <- lapply(txts, GetLength) %>%
  melt()

# Filter titles.
# Pair each title with its file path and approximate length, then keep only
# texts with a usable title (non-empty, more than five characters, and not
# matching "No.", "Headline" or "Chatter") whose length lies strictly
# between 850 and 1500.
titlesClean <- melt(titles) %>%
  bind_cols(data.frame(txts)) %>%
  rename(title = value) %>%
  left_join(lengths, by = "L1") %>%
  filter(
    title != "",
    !grepl("No\\.|Headline|Chatter", title),
    nchar(as.character(title)) > 5,
    value < 1500,
    value > 850
  )

# Select random 1,000 texts.
# Stop early if the filter left too few candidates; sampling without
# replacement would otherwise fail with a less specific error.
stopifnot(nrow(titlesClean) >= 1000)
set.seed(99)
randSlate <- titlesClean[
  sample(seq_len(nrow(titlesClean)), 1000, replace = FALSE),
] %>%
  select(title, txts) %>%
  mutate(doc_id = as.character(row_number()))

# Get texts from random list. Set encoding.
# For each sampled file, skip the first six lines, read the rest as UTF-8
# and join the lines with single spaces. No further whitespace cleaning is
# applied here.
slate <- lapply(as.character(randSlate$txts), function(x) {
  paste(
    scan(x, what = "char", sep = "\n", skip = 6, encoding = "UTF-8",
         quiet = TRUE),
    collapse = " "
  )
})


## Create tif.
# One row per sampled text with columns doc_id, title, oancID (the file
# path) and text. doc_id is the row number as a string, numbered the same
# way as in randSlate, so the join key is stated explicitly instead of being
# inferred from whichever columns the two tables share.
cdr_slate_corpus <- unlist(slate) %>%
  melt() %>%
  mutate(doc_id = as.character(row_number())) %>%
  left_join(randSlate, by = "doc_id") %>%
  select(doc_id, title, txts, value) %>%
  rename(oancID = txts, text = value) %>%
  mutate_if(is.factor, as.character)


########################




# Annotate texts via `spacyr`.
# The corpus is passed through corpuslingr's clr_prep_corpus() with
# hyphenate = TRUE and then parsed by spacy_parse() with tags requested.
# clr_set_corpus() is not applied to this object; it is applied only to the
# Shiny copy created further down.
cdr_slate_ann <- spacyr::spacy_parse(
  corpuslingr::clr_prep_corpus(cdr_slate_corpus, hyphenate = TRUE),
  tag = TRUE
)

# Turn every "qq" sequence in the lemma and token columns into a hyphen.
# NOTE(review): presumably "qq" is the hyphen placeholder introduced by
# clr_prep_corpus(hyphenate = TRUE) -- confirm against corpuslingr.
cdr_slate_ann[["lemma"]] <- gsub("qq", "-", cdr_slate_ann[["lemma"]])
cdr_slate_ann[["token"]] <- gsub("qq", "-", cdr_slate_ann[["token"]])

# Use the surface token in place of the lemma for rows tagged PROPN or
# ENTITY and for rows whose lemma is "-PRON-".
cdr_slate_ann[["lemma"]] <- with(
  cdr_slate_ann,
  ifelse(pos == "PROPN" | pos == "ENTITY" | lemma == "-PRON-", token, lemma)
)


# Re-assert the class vector after the column edits above.
class(cdr_slate_ann) <- c("spacyr_parsed", "data.frame")


# Copy destined for the Shiny app, run through clr_set_corpus().
forShiny <- corpuslingr::clr_set_corpus(cdr_slate_ann, ent_as_tag = FALSE)

# Text descriptives and meta.
# The `text` element of clr_desc_corpus() is joined to the sampled titles and
# paths, the path column is renamed to oancID, factors become character, and
# rows are ordered by numeric doc_id.
# NOTE(review): the original author marked this step "FIX" and recorded that
# an error occurs at the left_join(); the cause is not visible from this
# script, so the join is left exactly as it was.
cdr_slate_meta <- corpuslingr::clr_desc_corpus(cdr_slate_ann)$text %>%
  left_join(randSlate) %>%
  rename(oancID = txts) %>%
  mutate_if(is.factor, as.character) %>%
  arrange(as.numeric(doc_id))

# Add locations.
# Read the location table (slateGPEs.csv) and convert it to an sf object
# using its lon/lat columns as coordinates in CRS 4326; factor columns are
# then converted to character.
cdr_slate_gpe <- read.csv(
  "C:\\Users\\jason\\Google Drive\\GitHub\\packages\\corpusdatr\\data-raw\\slateGPEs.csv"
) %>%
  st_as_sf(coords = c("lon", "lat"), crs = 4326, agr = "constant") %>%
  mutate_if(is.factor, as.character)

# Output.
# NOTE(review): the working directory is moved to the package root before
# use_data(), presumably because use_data() resolves the target package from
# the working directory -- confirm against the installed devtools version.
setwd("C:\\Users\\jason\\Google Drive\\GitHub\\packages\\corpusdatr")
devtools::use_data(cdr_slate_corpus, overwrite = TRUE)
devtools::use_data(cdr_slate_ann, overwrite = TRUE)
devtools::use_data(cdr_slate_meta, overwrite = TRUE)
devtools::use_data(cdr_slate_gpe, overwrite = TRUE)
# devtools::use_data_raw()


# Write the Shiny copy by full path rather than changing the working
# directory a second time just to save one file.
saveRDS(
  forShiny,
  file.path(
    "C:\\Users\\jason\\Google Drive\\GitHub\\Shiny\\shinyCorpusSearch\\data",
    "slate_ann.rds"
  )
)

# Shut down the spaCy session started by spacy_initialize() at the top of the
# script so the background Python process does not keep running.
spacyr::spacy_finalize()
# jaytimm/corpusdatr documentation built on Aug. 5, 2020, 11:48 a.m.