R/odcr_scrape.R

Defines functions odcr_scrape

#' Scrape ODCR case pages into global staging tables
#'
#' Loops over every combination of court, case type, filing year and case
#' sequence number, downloads the matching ODCR case page and parses its HTML
#' tables. Parsed rows are appended to staging tables in the global
#' environment: `odcr_updates`, `odcr_lastupdate`, `odcr_party_profile`,
#' `odcr_caseinfo`, `odcr_parties`, `odcr_mins`, `odcr_events`, `odcr_pays`
#' and `odcr_disps`. Each table is created on first use.
#'
#' Pages that cannot be downloaded are skipped (the error is printed by
#' `try()`). Pages with fewer than two HTML tables are logged as unavailable.
#'
#' @param courts Court identifiers; each is matched against `court_ref$court`
#'   to obtain the URL pattern and used as a key into `courtlist`.
#' @param casetypes Case type codes, used in the case number and the URL.
#' @param years Filing years; characters 3-4 are substituted into the URL.
#' @param case_seqs Case sequence numbers; left-padded with zeros to width 5
#'   for the case number and width 4 for the URL.
#' @param updatedb If `TRUE`, `odcr_updatedb()` and `oscn_reset()` are called
#'   whenever `odcr_updates` reaches `update_freq` rows after an available
#'   case, and once more when all loops have finished.
#' @param update_freq Row count of `odcr_updates` that triggers an
#'   intermediate database update.
#' @return The value returned by `disconnect_ojo()`.
odcr_scrape <- function(courts, casetypes, years, case_seqs, updatedb = TRUE, update_freq = 20) {

  # Append `rows` to the global staging table `name`, creating it if absent.
  append_global <- function(name, rows) {
    env <- globalenv()
    if (exists(name, envir = env)) {
      rows <- bind_rows(get(name, envir = env), rows)
    }
    assign(name, rows, envir = env)
  }

  # Log one scrape attempt in `odcr_updates` and `odcr_lastupdate`.
  # NOTE(review): the court code is looked up here with the upper-cased court
  # name, while the case tables below use `courtlist[[court_tmp]]` unchanged;
  # the two only agree if `courts` is already upper case -- confirm.
  record_update <- function(court, casenum, casetype, file_year, available) {
    now <- Sys.time()
    court_uc <- str_to_upper(court)
    update_id <- paste0(courtlist[[court_uc]], casenum, "-", now)

    updates_tmp <- tibble(odcr_update_id = update_id,
                          court = court_uc,
                          casenum = casenum,
                          casetype = casetype,
                          file_year = file_year,
                          updated = now,
                          available = available)

    lastupdate_tmp <- updates_tmp %>%
      mutate(odcr_lastupdate_id = str_sub(odcr_update_id, 1, 17)) %>%
      select(odcr_lastupdate_id, court, casenum, casetype, file_year,
             last_update = updated, available)

    append_global("odcr_updates", updates_tmp)
    append_global("odcr_lastupdate", lastupdate_tmp)
  }

  # Seconds elapsed since `since`; units are forced so the log text is right
  # even when a request takes longer than a minute.
  elapsed_secs <- function(since) {
    as.numeric(difftime(Sys.time(), since, units = "secs"))
  }

  do_update <- isTRUE(as.logical(updatedb))

  for (court_tmp in courts) {
    for (casetype_tmp in casetypes) {
      for (caseyear_tmp in years) {
        for (caseseq_tmp in case_seqs) {
          start <- Sys.time()

          casenum_tmp <- paste(casetype_tmp, caseyear_tmp,
                               str_pad(caseseq_tmp, side = "left", width = 5, pad = "0"),
                               sep = "-")

          u <- court_ref[court_ref$court == court_tmp, "url_pattern"] %>%
            str_replace("XX", paste0(casetype_tmp, "+")) %>%
            str_replace("YY", str_sub(caseyear_tmp, 3, 4)) %>%
            str_replace("ZZZZ", str_pad(caseseq_tmp, side = "left", pad = "0", width = 4))

          # NOTE(review): TLS peer verification is disabled for this request.
          ht <- try(read_html(httr::GET(u,
                                        config = httr::config(ssl_verifypeer = FALSE))))

          # A parsed page has several classes, so test with inherits() rather
          # than comparing class() to a string.
          if (inherits(ht, "try-error")) {
            next
          }

          d <- ht %>%
            html_nodes("table") %>%
            html_table()

          # Fewer than two tables: no case data on the page.
          if (length(d) < 2) {
            record_update(court_tmp, casenum_tmp, casetype_tmp, caseyear_tmp,
                          available = FALSE)
            message(paste(court_tmp, casenum_tmp, "scraped in", elapsed_secs(start), "seconds (data not available)."))
            next
          }

          cid <- tibble(court = str_to_upper(court_tmp),
                        court_code = courtlist[[court_tmp]],
                        casenum = casenum_tmp,
                        casetype = casetype_tmp,
                        file_year = caseyear_tmp)

          #### Party profiles ####
          pl <- ht %>%
            html_nodes("a") %>%
            html_attr("href")

          # Anchors without an href come back as NA; drop them so they are not
          # requested as ".../NA".
          pl <- pl[!is.na(pl) & str_detect(pl, "person")]

          profile_rows <- vector("list", length(pl))

          for (i in seq_along(pl)) {

            pp <- try(read_html(httr::GET(paste0("https://www1.odcr.com/", pl[i]),
                                          config = httr::config(ssl_verifypeer = FALSE))) %>%
                        html_nodes("span") %>%
                        html_text)

            if (inherits(pp, "try-error")) {
              next
            }

            profile_rows[[i]] <- tibble(court = str_to_upper(court_tmp),
                                        court_code = courtlist[[court_tmp]],
                                        casenum = casenum_tmp,
                                        casetype = casetype_tmp,
                                        file_year = caseyear_tmp,
                                        defname = str_squish(pp[2]),
                                        def_mob = paste0("1/", str_extract(pp[4], "\\d{1,2}/\\d{2,4}")) %>%
                                          dmy,
                                        def_address = pp[6] %>% str_squish,
                                        def_zip = def_address %>%
                                          str_extract("\\d{5}")) %>%
              mutate(odcr_pp_id = paste(court_code, casenum,
                                        str_pad(i, 2, "left", "0"), sep = "-")) %>%
              select(odcr_pp_id, court, casenum, casetype, file_year, defname, def_mob, def_address, def_zip)
          }

          # Append once per case; failed fetches leave NULL slots that
          # bind_rows() ignores.
          if (length(pl) > 0) {
            append_global("odcr_party_profile", bind_rows(profile_rows))
          }

          #### Case tables ####
          for (i in seq_along(d)) {

            tbl <- as.data.frame(d[[i]])

            # Nothing to parse in an empty table.
            if (nrow(tbl) == 0 || ncol(tbl) == 0) {
              next
            }

            if (isTRUE(tbl[1, 1] == "Case Identifier")) {
              # First "Judge" entry of the table, if it has one.
              judge_tmp <- if ("Judge" %in% tbl$X1) {
                tbl[which(str_detect(tbl$X1, "Judge")), "X2"][1]
              } else {
                NA_character_
              }

              odcr_caseinfo_tmp <- cid %>%
                mutate(odcr_ci_id = paste0(court_code, casenum),
                       file_date = mdy(tbl[3, 2]),
                       judge = judge_tmp) %>%
                select(odcr_ci_id, court, casenum, casetype, file_year, file_date, judge)

              append_global("odcr_caseinfo", odcr_caseinfo_tmp)
            }

            if (any(c("Judge", "Agency", "Attorney", "Defendant") %in% tbl[, 1])) {
              odcr_parties_tmp <- tbl %>%
                mutate(casenum = casenum_tmp) %>%
                right_join(cid, by = "casenum") %>%
                mutate(party = X2 %>%
                         str_extract(".*?(?=\t|\n|$)") %>%
                         str_squish,
                       party_loc = X2 %>%
                         str_extract("(?<=\\sof ).*$") %>%
                         str_squish) %>%
                mutate(odcr_pn_id = paste0(court_code, casenum, "-",
                                           str_pad(row_number(), side = "left", width = 2, pad = "0"))) %>%
                select(odcr_pn_id, court, casenum, casetype, file_year, party_type = X1, party, party_loc)

              append_global("odcr_parties", odcr_parties_tmp)
            }

            has_amount <- "Amount" %in% names(tbl)

            # An "Amount" table among the first three tables is parsed as
            # minutes/fees.
            if (has_amount && i < 4) {

              odcr_mins_tmp <- cid %>%
                left_join(tbl %>%
                            mutate(casenum = casenum_tmp), by = "casenum") %>%
                filter(Date != "Grand Total") %>%
                mutate(min_date = mdy(Date),
                       fee_amt = Amount %>%
                         str_remove_all("\\$|,") %>%
                         as.numeric) %>%
                fill(min_date) %>%
                select(court, court_code, casenum, casetype, file_year, min_date, min_desc = Description, fee_amt) %>%
                group_by(court, court_code, casenum, casetype, file_year, min_date, min_desc) %>%
                summarize(fee_amt = sum(fee_amt, na.rm = TRUE)) %>%
                ungroup %>%
                mutate(odcr_min_id = paste0(court_code, casenum, "-",
                                            str_pad(row_number(), width = 3, side = "left", pad = "0"))) %>%
                select(odcr_min_id, court, casenum, casetype, file_year, min_date, min_desc, fee_amt)

              append_global("odcr_mins", odcr_mins_tmp)
            }

            if ("Time" %in% names(tbl)) {

              odcr_events_tmp <- cid %>%
                left_join(tbl %>%
                            mutate(casenum = casenum_tmp), by = "casenum") %>%
                mutate(event_date = mdy(Date),
                       event_desc = Description %>%
                         str_extract(".*?(?=\n|\t|Completed|$)"),
                       complete_date = str_extract(Description, "\\d{1,2}/\\d{1,2}/\\d{2,4}") %>%
                         mdy,
                       event_code = str_extract(Description, "(?<=Code:).*") %>%
                         str_squish) %>%
                mutate(odcr_ev_id = paste0(court_code, casenum, "-",
                                           str_pad(row_number(), width = 3, side = "left", pad = "0"))) %>%
                select(odcr_ev_id, court, casenum, casetype, file_year, event_date, event_time = Time, event_desc, complete_date, event_code)

              append_global("odcr_events", odcr_events_tmp)
            }

            # An "Amount" table after the third table is parsed as payments.
            if (has_amount && i > 3) {

              odcr_pays_tmp <- cid %>%
                left_join(tbl %>%
                            mutate(casenum = casenum_tmp), by = "casenum") %>%
                filter(Date != "Grand Total") %>%
                mutate(pay_date = mdy(Date),
                       pay_amt = Amount %>%
                         str_remove_all("\\$|,") %>%
                         as.numeric) %>%
                mutate(odcr_pay_id = paste0(court_code, casenum, "-",
                                            str_pad(row_number(), width = 3, side = "left", pad = "0"))) %>%
                select(odcr_pay_id, court, casenum, casetype, file_year, pay_date, pay_desc = Description, pay_amt)

              append_global("odcr_pays", odcr_pays_tmp)
            }
          }

          #### Counts and dispositions ####
          # Read from the page's <li> items, not from the tables, so this runs
          # once per case rather than once per table.
          count_text <- ht %>%
            html_nodes("li") %>%
            html_text()

          count_text <- count_text[!str_detect(count_text, "Pricing|Login|Sign Up|New search|Contact Us|Advertise|Facebook")]

          counts <- tibble(ct_desc = str_to_upper(count_text),
                           casenum = casenum_tmp)

          odcr_disps_tmp <- cid %>%
            left_join(counts, by = "casenum") %>%
            mutate(ct_no = row_number(),
                   ct_desc = str_remove(ct_desc, "\\d{1,2}\\.") %>%
                     str_squish,
                   odcr_disp_id = paste0(court_code, casenum, "-",
                                         str_pad(ct_no, width = 3, side = "left", pad = "0"), "-",
                                         str_pad(row_number(), width = 3, side = "left", pad = "0"))) %>%
            select(odcr_disp_id, court, casenum, casetype, file_year, ct_no, ct_desc)

          append_global("odcr_disps", odcr_disps_tmp)

          message(paste(court_tmp, casenum_tmp, "scraped in", elapsed_secs(start), "seconds."))

          record_update(court_tmp, casenum_tmp, casetype_tmp, caseyear_tmp,
                        available = length(d) >= 3)

          # Flush the staging tables once enough cases have accumulated.
          if (do_update &&
              nrow(get("odcr_updates", envir = globalenv())) >= update_freq) {
            odcr_updatedb()
            oscn_reset()
            disconnect_ojo()
          }
        }
      }
    }
  }

  if (do_update) {
    odcr_updatedb()
    oscn_reset()
  }
  disconnect_ojo()
}
openjusticeok/ojo documentation built on Feb. 2, 2021, 5:47 a.m.