R/sos_scrape.R

Defines functions sos_scrape_old sos_scrape

sos_scrape <- function(id_nums) {
  
  for (id_num in id_nums) {
    url <- paste0("https://www.sos.ok.gov/gov/pardons/pardonInformation.aspx?id=", id_num)
    
    ht <- read_html(url) 
    
    txt <- ht %>% 
      html_text() %>% 
      str_remove_all("[\n\r]") %>% 
      str_squish %>% 
      str_extract("(?<=Summary DOC).*(?=Business Services)")
    
    doc_num_tmp = str_extract(txt, "(?<=Number: )\\d*")
    first_name_tmp = str_extract(txt, "(?<=First Name: ).*(?= Middle)")
    last_name_tmp = str_extract(txt, "(?<=Last Name: ).*?(?= County)")
    county_tmp = str_extract(txt, "(?<=County: ).*?(?= Name)")
    
    docs <- html_table(ht)[[2]]
    
    names(docs) <- c("document", "filing_date", "filing_type", "action", "document_county")
    
    docs <- docs %>% 
      filter(!is.na(document))
    
    ## Track missing records to cut off scraping ####
    
    if (nrow(docs) == 0 | str_detect(docs[1,1], "No records")) {
      
      if (!exists("no_data")) {
        no_data <- tibble()
      }
      
      no_data <- no_data %>% 
        bind_rows(tibble(id_num = id_num))
      
      message("No data for ID ", id_num)
      
      if (nrow(no_data) == 5) {
        connect_ojo()
        dbWriteTable(ojo_db, "sos_certificates", d, append = TRUE, row.names = FALSE)
        ojo_disconnect_all()
        
        break
      }
      
    } else {
      
      tmp <- docs %>% 
        filter(!is.na(document)) %>% 
        mutate(ppb_id = id_num, 
               doc_num = doc_num_tmp,
               first_name = first_name_tmp,
               last_name = last_name_tmp,
               filing_date = dmy(filing_date),
               county = as.character(county_tmp),
               document_county = as.character(document_county),
               filing_type = as.character(filing_type),
               action = as.character(action)) %>% 
        mutate_if(is.character, str_to_upper) %>% 
        select(ppb_id, doc_num, first_name, last_name, county, filing_date, filing_type, action, document_county)
      
      if (exists("no_data")) {rm(no_data)}
      
      message(paste("Information gathered for Pardon ID", id_num, ":\n", tmp[1,"first_name"],
                    tmp[1,"last_name"], "\n",
                    "Filed: ", 
                    tmp[nrow(tmp),"filing_date"] %>% 
                      as.numeric %>% 
                      as_date(origin = ymd("1970-01-01"))))
      
      if (exists("d")) {
        d <<- bind_rows(d, tmp) %>%
          distinct %>% 
          filter(!is.na(first_name))
      } else {
        d <<- tmp
      }
      
      if (nrow(d) >= 20) {
        connect_ojo()
        dbWriteTable(ojo_db, "sos_certificates", d, append = TRUE, row.names = FALSE)
        ojo_disconnect_all()
        
        message("Upload successful")
        
        rm(d, pos = ".GlobalEnv")
      }
    }
    
  }
  connect_ojo()
  dbWriteTable(ojo_db, "sos_certificates", d, append = TRUE, row.names = FALSE)
  ojo_disconnect_all()
}

sos_scrape_old <- function() {
  ojo_disconnect_all()
  connect_ojo()
  sos_scrape((dbGetQuery(ojo_db, "SELECT min(ppb_id) FROM ojo.sos_certificates;") %>% as.numeric):1)
  ojo_disconnect_all()
}
openjusticeok/ojo documentation built on Feb. 2, 2021, 5:47 a.m.