# wikipedia Preparation Script
# The dataset was imported from:
# https://github.com/zonination/emperors
# It was assembled from the following wikipedia page:
# https://en.wikipedia.org/wiki/List_of_Roman_emperors
# Stage one: Collecting data
wikipedia <- readr::read_csv("data-raw/emperors/wikipedia/emperors.csv")
# Stage two: Correcting data
# In this stage you will want to correct the variable names and
# formats of the 'Wikipedia' object until the object created
# below (in stage three) passes all the tests.
# First, the dataset contains a notes column about some errors in dates.
wikipedia$notes
# Some of the first few dates are supposed to be negative (BC).
# While for other dates, only the year is accurate...
# However, due to issues in dealing with dates with most R packages,
# these dates became positive or were completed arbitrarily.
# Let's change these back and treat dates with `{messydates}`.
# Get these columns into character to be able to modify them.
wikipedia$birth <- as.character(wikipedia$birth)
wikipedia$death <- as.character(wikipedia$death)
wikipedia$reign.start <- as.character(wikipedia$reign.start)
wikipedia$reign.end <- as.character(wikipedia$reign.end)
# Now let's correct negative columns, according to the notes,
# by adding a BC to them.
for(i in c(1, 2, 4, 6)) {
wikipedia$birth[i] <- paste0(wikipedia$birth[i], " BC")
}
wikipedia$reign.start[1] <- paste0(wikipedia$reign.start[1], " BC")
# Some dates are estimates, while others only the year is correct,
# according to the notes.
# How could we be more specific about that?
# Let's look into how `{messydates}` indicates uncertainty
?messydates::class
# We should assign a "~" to dates which are estimates.
for(i in c(25, 28, 29, 30, 31, 51, 55, 63, 65, 67)) {
# birth is estimate
wikipedia$birth[i] <- paste0(wikipedia$birth[i], "~")
}
# death is estimate
wikipedia$death[61] <- paste0(wikipedia$death[61], "~")
# reign end is estimate
wikipedia$reign.end[33] <- paste0(wikipedia$reign.end[33], "~")
for(i in c(50, 57, 58)) { # birth and death are estimates
wikipedia$birth[i] <- paste0(wikipedia$birth[i], "~")
wikipedia$death[i] <- paste0(wikipedia$death[i], "~")
}
for(i in c(17, 56)) { # death and reign end are estimates
wikipedia$death[i] <- paste0(wikipedia$death[i], "~")
wikipedia$reign.end[i] <- paste0(wikipedia$reign.end[i], "~")
}
for(i in 62) {
# birth and reign start are estimates
wikipedia$birth[i] <- paste0(wikipedia$birth[i], "~")
wikipedia$reign.start[i] <- paste0(wikipedia$reign.start[i], "~")
}
for(i in c(27, 43, 52)) {
# birth, death and reign end estimates
wikipedia$birth[i] <- paste0(wikipedia$birth[i], "~")
wikipedia$death[i] <- paste0(wikipedia$death[i], "~")
wikipedia$reign.end[i] <- paste0(wikipedia$reign.end[i], "~")
}
for(i in c(44, 47, 48)){
# death and reign start and reign end estimates
wikipedia$death[i] <- paste0(wikipedia$death[i], "~")
wikipedia$reign.end[i] <- paste0(wikipedia$reign.end[i], "~")
wikipedia$reign.start[i] <- paste0(wikipedia$reign.start[i], "~")
}
for(i in c(34, 35, 36, 37, 38, 39, 40, 41, 45, 46, 60)){
# birth, death and reign start and reign end estimates
wikipedia$birth[i] <- paste0(wikipedia$birth[i], "~")
wikipedia$death[i] <- paste0(wikipedia$death[i], "~")
wikipedia$reign.end[i] <- paste0(wikipedia$reign.end[i], "~")
wikipedia$reign.start[i] <- paste0(wikipedia$reign.start[i], "~")
}
# Let's also keep the year only for those dates which
# the notes detail only year in certain.
for(i in c(18, 22, 23)){ # reign start year only
wikipedia$reign.start[i] <- stringr::str_extract(wikipedia$reign.start[i],
"^[0-9]{3}")
}
# birth year only
wikipedia$birth[24] <- stringr::str_extract(wikipedia$birth[24], "^[0-9]{3}")
# Finally, some dates appear to be ranges.
# `{messydates}` deals with ranges with a ".." separator.
wikipedia$birth[20] <- paste0(wikipedia$birth[20], "..", "0137-02-02")
wikipedia$birth[66] <- paste0(wikipedia$birth[66], "..", "0359-05-23")
# Remove non-ASCII characters
wikipedia <- purrr::map(wikipedia, stringi::stri_enc_toascii)
# Let's standardise dates and variable names
wikipedia <- as_tibble(wikipedia) %>%
transmutate(ID = name,
Begin = messydates::as_messydate(reign.start),
End = messydates::as_messydate(reign.end)) %>%
dplyr::rename(FullName = name.full,
Birth = birth,
Death = death,
CityBirth = birth.cty,
ProvinceBirth = birth.prv,
Rise = rise,
Cause = cause,
Killer = killer,
Dynasty = dynasty,
Era = era,
Notes = notes,
Verif = verif.who) %>%
dplyr::select(-index) %>%
dplyr::relocate(ID, Begin, End)
# manydata includes several functions that should help cleaning
# and standardising your data.
# Please see the vignettes or website for more details.
# Stage three: Connecting data
# Next run the following line to make Wikipedia available
# within the qPackage.
manypkgs::export_data(wikipedia, datacube = "emperors", URL = "https://github.com/zonination/emperors")
# This function also does two additional things.
# First, it creates a set of tests for this object to ensure adherence
# to certain standards.You can hit Cmd-Shift-T (Mac) or Ctrl-Shift-T (Windows)
# to run these tests locally at any point.
# Any test failures should be pretty self-explanatory and may require
# you to return to stage two and further clean, standardise, or wrangle
# your data into the expected format.
# Second, it also creates a documentation file for you to fill in.
# Please make sure that you cite any sources appropriately and fill in as
# much detail about the variables etc as possible.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.