library(fs)
library(dplyr)
library(gutenbergr)
library(here)
library(purrr)
library(stringr)
library(tibble)
library(xml2)
source(here::here("data-raw", "parsers.R"))
# Record the UTC date on which this refresh *began*. The data should not be
# refreshed again until the source data is newer than this date. For now the
# value is informational only: it is not used for anything beyond telling users
# when the data was last updated.
updated <- lubridate::now(tzone = "UTC") |>
  lubridate::date()

# download_raw_data() returns the directory that is searched below. Collect
# every *.rdf file beneath it (recursively) as a plain, unnamed vector of paths.
cache_dir <- download_raw_data()
rdf_paths <- cache_dir |>
  fs::dir_ls(recurse = TRUE, glob = "*.rdf") |>
  unname()

# Run the parser over each RDF file; the result has one list element per file.
all_metadata <- purrr::map(rdf_paths, parse_all_metadata)
# Assemble the four tables from the per-file parse results. Each element of
# `all_metadata` is read for its `authors`, `languages`, `metadata`, and
# `subjects` components; the per-file pieces are row-bound into a single table
# and then sorted. All four tables are built the same way, with
# purrr::map() + purrr::list_rbind() (not the superseded purrr::map_dfr()).

# Authors: keep only the first row seen for each gutenberg_author_id.
new_gutenberg_authors <- all_metadata |>
  purrr::map(\(parsed) parsed$authors) |>
  purrr::list_rbind() |>
  dplyr::distinct(gutenberg_author_id, .keep_all = TRUE) |>
  dplyr::arrange(gutenberg_author_id)

# Languages: drop fully duplicated rows.
new_gutenberg_languages <- all_metadata |>
  purrr::map(\(parsed) parsed$languages) |>
  purrr::list_rbind() |>
  dplyr::distinct() |>
  dplyr::arrange(gutenberg_id, language)

# Metadata: rows are only sorted here; no de-duplication is applied.
new_gutenberg_metadata <- all_metadata |>
  purrr::map(\(parsed) parsed$metadata) |>
  purrr::list_rbind() |>
  dplyr::arrange(gutenberg_id, gutenberg_author_id)

# Subjects: drop fully duplicated rows.
new_gutenberg_subjects <- all_metadata |>
  purrr::map(\(parsed) parsed$subjects) |>
  purrr::list_rbind() |>
  dplyr::distinct() |>
  dplyr::arrange(gutenberg_id)
# waldo::compare(nrow(gutenberg_authors), nrow(new_gutenberg_authors))
# waldo::compare(nrow(gutenberg_subjects), nrow(new_gutenberg_subjects))
# waldo::compare(nrow(gutenberg_languages), nrow(new_gutenberg_languages))
# waldo::compare(nrow(gutenberg_metadata), nrow(new_gutenberg_metadata))
# dplyr::distinct(new_gutenberg_metadata, gutenberg_id, has_text) |>
# dplyr::left_join(
# dplyr::distinct(gutenberg_metadata, gutenberg_id, has_text),
# by = "gutenberg_id"
# ) |>
# dplyr::filter(has_text.x != has_text.y) |>
# dplyr::filter(!has_text.x)
# Promote each freshly built table to its published name and stamp it with the
# date this refresh started (stored in the "date_updated" attribute).
gutenberg_authors <- new_gutenberg_authors
attr(gutenberg_authors, "date_updated") <- updated

gutenberg_languages <- new_gutenberg_languages
attr(gutenberg_languages, "date_updated") <- updated

gutenberg_metadata <- new_gutenberg_metadata
attr(gutenberg_metadata, "date_updated") <- updated

gutenberg_subjects <- new_gutenberg_subjects
attr(gutenberg_subjects, "date_updated") <- updated

# Save each table with usethis::use_data(), replacing any existing copy.
# gutenberg_languages is the only table saved with bzip2 compression; the
# other three use xz.
usethis::use_data(gutenberg_authors, compress = "xz", overwrite = TRUE)
usethis::use_data(gutenberg_languages, compress = "bzip2", overwrite = TRUE)
usethis::use_data(gutenberg_metadata, compress = "xz", overwrite = TRUE)
usethis::use_data(gutenberg_subjects, compress = "xz", overwrite = TRUE)
# Clean up.
# Delete the cache directory and everything inside it.
unlink(cache_dir, recursive = TRUE)

# Remove the objects and helper functions this script worked with, so that
# none of them linger in the session afterwards.
rm(
  list = c(
    "all_metadata",
    "gutenberg_authors",
    "gutenberg_languages",
    "gutenberg_metadata",
    "gutenberg_subjects",
    "new_gutenberg_authors",
    "new_gutenberg_languages",
    "new_gutenberg_metadata",
    "new_gutenberg_subjects",
    "cache_dir",
    "rdf_paths",
    "download_raw_data",
    "parse_all_metadata",
    "parse_author",
    "parse_subject",
    "updated"
  )
)
# Add the following code to your website.
# For more information on customizing the embed code, read Embedding Snippets.