#### 1. Preliminary steps ####

## Install castarter (devtools is required for installing from GitHub)
# install.packages("devtools")
devtools::install_github(repo = "giocomai/castarter")

## Load castarter
library("castarter")

## Set project and website name.
# They will remain in memory in the current R session,
# so there is no need to include them in each function call.
SetCastarter(project = "Presidents", website = "Kremlin_en")

## Creates the folder structure where files will be saved within the
## current working directory
CreateFolders()
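
## Quick check (base R): list the folders that CreateFolders() just created
## in the current working directory.
list.dirs(path = ".", recursive = TRUE)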

#### 2. Download index/archive pages ####

# Check out the archive, e.g. http://en.kremlin.ru/events/president/news/
# Find the number of the last archive page; this works when pages are numbered consecutively

indexLinks <- CreateLinks(linkFirstChunk = "http://en.kremlin.ru/events/president/news/page/",
                          startPage = 1,
                          endPage = 1082)
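
## Sanity check before downloading: the generated links should follow the
## expected pattern, e.g. "http://en.kremlin.ru/events/president/news/page/1"
length(indexLinks)  # should equal endPage, i.e. 1082
head(indexLinks, 3)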

## Downloads all html files of the archive pages
# This can be interrupted: by default, if the command is issued again it will
# download only missing pages (i.e. pages that have not been downloaded previously)
DownloadContents(links = indexLinks, type = "index")
# Download again files with an unusually small size (usually failed downloads), if any
DownloadContents(links = indexLinks, type = "index", missingPages = FALSE)

#### 3. Extract links to individual pages ####

# Find criteria to filter only direct links to news items
# Open an archive page at random.
# browseURL(url = sample(x = indexLinks, size = 1))

links <- ExtractLinks(domain = "http://en.kremlin.ru/",
                      partOfLink = "president/news/",
                      partOfLinkToExclude = c("page",
                                              "photos",
                                              "videos",
                                              "special"))
# Explore the links: check that they look right and that their number is realistic
# length(links)/length(indexLinks)
# View(links)
# head(links)
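
## Extra sanity checks (base R): links should be unique, and the excluded
## patterns should not appear among them.
length(unique(links)) == length(links)      # TRUE if there are no duplicates
any(grepl("photos|videos|special", links))  # should be FALSE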

#### 4. Download pages ####

# Download the html files of all individual pages (interruptible, like the index download)
DownloadContents(links = links, wait = 3)
# Download again files with an unusually small size, if any
DownloadContents(links = links, missingPages = FALSE, wait = 3)
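
## Rough lower bound on download time: with wait = 3 seconds between requests,
## downloading takes at least length(links) * 3 seconds overall.
round(length(links) * 3 / 3600, 1)  # hours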

#### 5. Extract metadata and text ####

# Open an article at random to find out where metadata are located
# browseURL(url = sample(x = links, size = 1))

id <- ExtractId()

titles <- ExtractTitles(removeEverythingAfter = " •")

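## Spot-check the extracted titles.
head(titles)
sum(is.na(titles) | titles == "")  # how many titles are missing
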
dates <- ExtractDates(container = "time",
                      attribute = "datetime",
                      dateFormat = "Ymd",
                      keepAllString = TRUE)

# check how many dates were not captured
sum(is.na(dates))
links[is.na(dates)]
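
## Quick plausibility check: number of items per year (assuming ExtractDates()
## returned objects of class Date).
table(format(dates, "%Y"))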

language <- "english"

metadata <- ExportMetadata(dates = dates,
                           id = id,
                           titles = titles,
                           language = language,
                           links = links)
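
## Inspect the resulting metadata before extracting the text.
str(metadata)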

## Extract text
text <- ExtractText(container = "div",
                    containerClass = "read__content",
                    subElement = "p",
                    removeEverythingAfter = "\nPublished in")
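
## Check for items where no text was extracted: these usually point to pages
## with a different layout.
sum(is.na(text) | nchar(text) == 0)
links[is.na(text) | nchar(text) == 0]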

# Read one item in full as a final spot-check
i <- sample(x = 1:length(text), size = 1)
titles[i]
dates[i]
text[i]

#### 6. Save and export ####

SaveWebsite(dataset = TRUE)
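
## Optionally, export a plain csv copy as well (a minimal base-R sketch: the
## exact columns depend on what ExportMetadata() returned, and the file name
## is just an example).
write.csv(cbind(metadata, text = text),
          file = "Kremlin_en_dataset.csv",
          row.names = FALSE)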