library(tidyverse)
library(secretary)
library(chariot)
library(pg13)
library(skyscraper)
library(police)
# Collect the registry-number URLs that appear in the registry number log but
# have no row yet in the RN URL validity table. The query is run with
# `override_cache = TRUE`.
# The SQL literal is kept flush-left so the text sent to the database is
# exactly the text written here.
rn_urls <-
  chariot::queryAthena(
    "SELECT DISTINCT
rn_url
FROM chemidplus.registry_number_log
WHERE rn_url IS NOT NULL AND rn_url NOT IN (
SELECT rn_url
FROM chemidplus.rn_url_validity
);",
    override_cache = TRUE
  )

# Drop duplicate rows, then flatten the result to a bare, unnamed vector.
rn_urls <- dplyr::distinct(rn_urls)
rn_urls <- unname(unlist(rn_urls))

# Shuffle so the URLs are processed in random order.
rn_urls <- sample(rn_urls)
# Non-interactive runs log their progress to a dated text file on the Desktop
# instead of the console.
if (!interactive()) {
  report_filename <-
    paste0(
      "~/Desktop/registry_number_log_to_tables_",
      as.character(Sys.Date()),
      ".txt"
    )
  # cat() with nothing to print and append left at its default just opens the
  # file for writing, so the run starts from an empty report.
  cat(file = report_filename)
}
# ---- Scrape new RN URLs into the chemidplus tables ---------------------------
# Each URL in `rn_urls` is fetched (two attempts), its validity is recorded and,
# when a page came back, the page is handed to the skyscraper table loaders.
# URLs that could not be fetched, or whose validity step errored, are collected
# in `errors` and retried in up to two further passes. Every pass pauses longer
# after a request and exports the schema more often than the pass before it.
#
# Everything deliberately stays at top level (no wrapper function): `conn` must
# remain a global because it is handed to `chariot::dcAthena(remove = TRUE)`.
# NOTE(review): `remove = TRUE` presumably removes the connection object from
# the global environment - confirm before moving this code into a function.
if (length(rn_urls) > 0) {
  # One entry per pass:
  #   label        - word used in the report header for the pass
  #   sleep_time   - seconds to pause after every page request
  #   export_every - the schema is exported whenever the number of URLs still
  #                  queued in the pass is a multiple of this value
  passes <- list(
    list(label = "First", sleep_time = 5, export_every = 50),
    list(label = "Second", sleep_time = 10, export_every = 25),
    list(label = "Third", sleep_time = 20, export_every = 10)
  )

  # Loaders run, in this order, for every page that was fetched. Each one is
  # called with its own connection, the URL and the parsed page.
  # (A `get_classification_code` step was already commented out in this script
  # and remains disabled.)
  table_loaders <- list(
    skyscraper::get_names_and_synonyms,
    skyscraper::get_classification,
    skyscraper::get_registry_numbers,
    skyscraper::get_links_to_resources
  )

  # Seed the retry queue: the first pass works through every new URL, later
  # passes only through what the previous pass recorded as an error.
  errors <- rn_urls

  for (pass in passes) {
    if (!interactive()) {
      cat("########### ", pass$label, " Iteration\n",
        sep = "", file = report_filename, append = TRUE
      )
    }

    rn_urls <- errors
    errors <- vector()
    total <- length(rn_urls)

    while (length(rn_urls) > 0) {
      rn_url <- rn_urls[1]

      # Fetch the page, trying at most twice and pausing after each request.
      # `response` is NULL when no attempt produced a page.
      response <- NULL
      attempts_left <- 2
      while (is.null(response) && attempts_left > 0) {
        response <-
          police::try_catch_error_as_null(
            xml2::read_html(
              rn_url,
              options = c("RECOVER", "NOERROR", "NOBLANKS", "HUGE")
            )
          )
        Sys.sleep(pass$sleep_time)
        attempts_left <- attempts_left - 1
      }

      # Record the validity of the URL. The call is guarded in every pass and
      # in both branches, so a failure here marks the URL for retry instead of
      # aborting the run with the connection still open.
      # NOTE(review): `get_rn_url_validity` is called unqualified, so it must
      # come from an attached package (presumably skyscraper) - confirm.
      conn <- chariot::connectAthena()
      output <-
        tryCatch(
          if (is.null(response)) {
            get_rn_url_validity(
              conn = conn,
              rn_url = rn_url
            )
          } else {
            get_rn_url_validity(
              conn = conn,
              rn_url = rn_url,
              response = response
            )
          },
          error = function(e) "Error"
        )
      chariot::dcAthena(
        conn = conn,
        remove = TRUE
      )

      # A URL is retried when no page could be fetched or the validity step
      # errored. identical() keeps the check safe for NULL or non-scalar
      # return values, where `output == "Error"` inside if() would fail.
      if (is.null(response) || identical(output, "Error")) {
        errors <- c(errors, rn_url)
      }

      if (!is.null(response)) {
        # Every loader gets a fresh connection that is closed right after it.
        for (load_table in table_loaders) {
          conn <- chariot::connectAthena()
          load_table(
            conn = conn,
            rn_url = rn_url,
            response = response,
            sleep_time = 0
          )
          chariot::dcAthena(
            conn = conn,
            remove = TRUE
          )
        }
      }

      # Dequeue the URL and clear the per-URL objects.
      rn_urls <- rn_urls[-1]
      rm(rn_url, response, output)

      if (nrow(showConnections()) > 0) {
        closeAllConnections()
      }

      # Metrics. Despite its name, `completed_ct` holds the number of URLs
      # still queued in this pass.
      completed_ct <- length(rn_urls)
      pct_done <- signif(100 * ((total - completed_ct) / total), digits = 2)

      if (interactive()) {
        secretary::typewrite(
          secretary::italicize(pct_done, "percent completed.")
        )
        secretary::typewrite(
          secretary::cyanTxt(completed_ct, "out of", total, "to go.")
        )
        secretary::typewrite(
          secretary::redTxt(length(errors), "errors.")
        )
      } else {
        cat("[", as.character(Sys.time()), "]",
          sep = "", file = report_filename, append = TRUE
        )
        cat("\t", completed_ct, "/", total,
          " (", pct_done, " percent completed)\n",
          sep = "", file = report_filename, append = TRUE
        )
        cat("[", as.character(Sys.time()), "]",
          sep = "", file = report_filename, append = TRUE
        )
        cat("\t", length(errors), " errors\n",
          sep = "", file = report_filename, append = TRUE
        )
      }

      # Periodic export; this also fires when the queue reaches zero, i.e. at
      # the end of every pass that processed at least one URL.
      if ((completed_ct %% pass$export_every) == 0) {
        skyscraper::export_schema_to_data_repo(
          target_dir = "~/GitHub/chemidplusData/",
          schema = "chemidplus"
        )
      }
    }
  }

  # Final summary: the URLs that still failed in the last pass.
  if (!interactive()) {
    cat("########### COMPLETE\n", file = report_filename, append = TRUE)
    cat("\n########### ERRORS\n", file = report_filename, append = TRUE)
    cat(errors, sep = "\n", file = report_filename, append = TRUE)
  } else {
    secretary::typewrite_bold("ERRORS:")
    errors %>%
      purrr::map(~ secretary::typewrite(., tabs = 1))
  }
} else {
  # Nothing new to scrape.
  if (interactive()) {
    secretary::typewrite_italic("No new RN urls.")
  } else {
    cat("[", as.character(Sys.time()), "]",
      sep = "", file = report_filename, append = TRUE
    )
    cat("\t", "All RN urls in in the Registry Number Log are in the RN URL Validity Table. There are no new RN URLs to parse into tables.", sep = "", file = report_filename, append = TRUE)
  }
}
# Add the following code to your website.
# For more information on customizing the embed code, read Embedding Snippets.