sos_scrape <- function(id_nums) {
for (id_num in id_nums) {
url <- paste0("https://www.sos.ok.gov/gov/pardons/pardonInformation.aspx?id=", id_num)
ht <- read_html(url)
txt <- ht %>%
html_text() %>%
str_remove_all("[\n\r]") %>%
str_squish %>%
str_extract("(?<=Summary DOC).*(?=Business Services)")
doc_num_tmp = str_extract(txt, "(?<=Number: )\\d*")
first_name_tmp = str_extract(txt, "(?<=First Name: ).*(?= Middle)")
last_name_tmp = str_extract(txt, "(?<=Last Name: ).*?(?= County)")
county_tmp = str_extract(txt, "(?<=County: ).*?(?= Name)")
docs <- html_table(ht)[[2]]
names(docs) <- c("document", "filing_date", "filing_type", "action", "document_county")
docs <- docs %>%
filter(!is.na(document))
## Track missing records to cut off scraping ####
if (nrow(docs) == 0 | str_detect(docs[1,1], "No records")) {
if (!exists("no_data")) {
no_data <- tibble()
}
no_data <- no_data %>%
bind_rows(tibble(id_num = id_num))
message("No data for ID ", id_num)
if (nrow(no_data) == 5) {
connect_ojo()
dbWriteTable(ojo_db, "sos_certificates", d, append = TRUE, row.names = FALSE)
ojo_disconnect_all()
break
}
} else {
tmp <- docs %>%
filter(!is.na(document)) %>%
mutate(ppb_id = id_num,
doc_num = doc_num_tmp,
first_name = first_name_tmp,
last_name = last_name_tmp,
filing_date = dmy(filing_date),
county = as.character(county_tmp),
document_county = as.character(document_county),
filing_type = as.character(filing_type),
action = as.character(action)) %>%
mutate_if(is.character, str_to_upper) %>%
select(ppb_id, doc_num, first_name, last_name, county, filing_date, filing_type, action, document_county)
if (exists("no_data")) {rm(no_data)}
message(paste("Information gathered for Pardon ID", id_num, ":\n", tmp[1,"first_name"],
tmp[1,"last_name"], "\n",
"Filed: ",
tmp[nrow(tmp),"filing_date"] %>%
as.numeric %>%
as_date(origin = ymd("1970-01-01"))))
if (exists("d")) {
d <<- bind_rows(d, tmp) %>%
distinct %>%
filter(!is.na(first_name))
} else {
d <<- tmp
}
if (nrow(d) >= 20) {
connect_ojo()
dbWriteTable(ojo_db, "sos_certificates", d, append = TRUE, row.names = FALSE)
ojo_disconnect_all()
message("Upload successful")
rm(d, pos = ".GlobalEnv")
}
}
}
connect_ojo()
dbWriteTable(ojo_db, "sos_certificates", d, append = TRUE, row.names = FALSE)
ojo_disconnect_all()
}
sos_scrape_old <- function() {
ojo_disconnect_all()
connect_ojo()
sos_scrape((dbGetQuery(ojo_db, "SELECT min(ppb_id) FROM ojo.sos_certificates;") %>% as.numeric):1)
ojo_disconnect_all()
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.