# data-raw/googlenews.R

library(tidyverse)
library(data.table)

# Article metadata: parse publication dates, keep only rows published strictly
# between 2017-11-26 and 2017-12-21, and store doc_id and any factor columns
# as character.
meta <- "C:\\Users\\jtimm\\Google Drive\\GitHub\\getNewsCorpus\\meta\\runningMeta.csv" %>%
  read.csv() %>%
  mutate(
    pubdates = as.Date(pubdates, "%d-%b-%y"),
    doc_id = as.character(doc_id)
  ) %>%
  filter(
    pubdates > as.Date("2017-11-26"),
    pubdates < as.Date("2017-12-21")
  ) %>%
  mutate_if(is.factor, as.character)


# Read every annotation CSV found (recursively) under the annotations
# directory and stack them into a single table.
#
# - The pattern is anchored ("\\.csv$") so only files whose names end in
#   ".csv" are read; an unanchored ".csv" is a regex in which "." matches any
#   character, so it would also pick up names such as "mycsv.txt".
# - full.names = TRUE makes list.files() return paths that can be read
#   directly, so the working directory no longer has to be changed first.
#
# doc_id is stored as character, and rows whose lemma is a lone space or an
# underscore are dropped.
annotations <- list.files(
  path = "C:\\Users\\jtimm\\Google Drive\\GitHub\\getNewsCorpus\\annotations",
  pattern = "\\.csv$",
  recursive = TRUE,
  full.names = TRUE
) %>%
  lapply(read.csv) %>%
  rbindlist() %>%
  mutate(doc_id = as.character(doc_id)) %>%
  filter(!lemma %in% c(" ", "_"))


# Token frequency table: one row per distinct
# (doc_id, token, lemma, pos, tag) combination, with its row count in `freq`.
cdr_gnews_historical <- annotations %>%
  # Only documents that also appear in `meta` are kept. doc_id is a grouping
  # key, so filtering before aggregating gives the same rows as filtering after.
  filter(doc_id %in% meta$doc_id) %>%
  group_by(doc_id, token, lemma, pos, tag) %>%
  summarise(freq = n()) %>%
  ungroup() %>%
  mutate_if(is.factor, as.character) %>%
  # Sort by numeric doc_id, then replace each id with the 1-based position of
  # its first appearance, stored as character.
  arrange(as.numeric(doc_id)) %>%
  mutate(doc_id = as.character(match(doc_id, unique(doc_id))))


# Metadata table that accompanies cdr_gnews_historical.
#
# doc_id is re-indexed to 1..k using the same rule as cdr_gnews_historical so
# that an id refers to the same document in both tables: keep only documents
# that have annotation rows, order them by numeric doc_id, then number them by
# first appearance. Re-indexing in raw `meta` row order would only line up
# with the token table if `meta` happened to be sorted already and every
# document had annotations.
cdr_gnews_meta <- meta %>%
  filter(doc_id %in% annotations$doc_id) %>%
  arrange(as.numeric(doc_id)) %>%
  select(doc_id:titles, docN:docSent) %>%
  mutate(doc_id = as.character(match(doc_id, unique(doc_id))))

# Switch to the corpusdatr package directory and save both datasets as
# package data, overwriting any existing copies.
# NOTE(review): newer devtools releases appear to have moved use_data() to the
# usethis package -- confirm against the installed devtools version.
setwd("C:\\Users\\jtimm\\Google Drive\\GitHub\\packages\\corpusdatr")
devtools::use_data(
  cdr_gnews_historical,
  cdr_gnews_meta,
  overwrite = TRUE
)
# jaytimm/corpusdatr documentation built on Aug. 5, 2020, 11:48 a.m.