#' Scrape ODCR case pages into global staging tables
#'
#' Loops over every combination of court, case type, filing year and case
#' sequence number, downloads the matching case page, and appends the parsed
#' rows to global staging tables via `<<-`: `odcr_updates`, `odcr_lastupdate`,
#' `odcr_party_profile`, `odcr_caseinfo`, `odcr_parties`, `odcr_mins`,
#' `odcr_events`, `odcr_pays` and `odcr_disps`.
#'
#' Relies on objects defined outside this function: `court_ref` (columns
#' `court` and `url_pattern`), `courtlist` (court code lookup), and the
#' helpers `odcr_updatedb()`, `oscn_reset()` and `disconnect_ojo()`.
#'
#' @param courts Courts to scrape; each value is matched against
#'   `court_ref$court` and used as a key into `courtlist`.
#' @param casetypes Case type prefixes; substituted for "XX" in the URL
#'   pattern and used as the first part of the case number.
#' @param years Filing years; characters 3-4 are substituted for "YY" in the
#'   URL pattern.
#' @param case_seqs Case sequence numbers; zero-padded to width 4 for the URL
#'   and to width 5 for the case number.
#' @param updatedb If `TRUE`, `odcr_updatedb()` and `oscn_reset()` are called
#'   periodically and once more when the loops finish.
#' @param update_freq Flush once `odcr_updates` holds at least this many rows
#'   (checked after each case for which data was available).
#' @return Called for its side effects; returns the value of
#'   `disconnect_ojo()`.
odcr_scrape <- function(courts, casetypes, years, case_seqs, updatedb = TRUE, update_freq = 20) {
  do_update <- isTRUE(as.logical(updatedb))

  # Seconds elapsed since `since`. Forcing the unit keeps the "seconds" label
  # in the progress messages truthful for slow requests (difftime would
  # otherwise switch to minutes on its own).
  elapsed_secs <- function(since) {
    as.numeric(difftime(Sys.time(), since, units = "secs"))
  }

  # Appends one row to the global update logs `odcr_updates` and
  # `odcr_lastupdate`, creating them if they do not exist yet.
  log_update <- function(court_tmp, casenum_tmp, casetype_tmp, caseyear_tmp, available) {
    now <- Sys.time()
    updates_tmp <- tibble(
      odcr_update_id = paste0(courtlist[[str_to_upper(court_tmp)]], casenum_tmp, "-", now),
      court = str_to_upper(court_tmp),
      casenum = casenum_tmp,
      casetype = casetype_tmp,
      file_year = caseyear_tmp,
      updated = now,
      available = available
    )
    lastupdate_tmp <- updates_tmp %>%
      mutate(odcr_lastupdate_id = str_sub(odcr_update_id, 1, 17)) %>%
      select(odcr_lastupdate_id, court, casenum, casetype, file_year,
             last_update = updated, available)
    if (exists("odcr_updates")) {
      odcr_updates <<- bind_rows(odcr_updates, updates_tmp)
    } else {
      odcr_updates <<- updates_tmp
    }
    if (exists("odcr_lastupdate")) {
      odcr_lastupdate <<- bind_rows(odcr_lastupdate, lastupdate_tmp)
    } else {
      odcr_lastupdate <<- lastupdate_tmp
    }
    invisible(NULL)
  }

  for (court_tmp in courts) {
    for (casetype_tmp in casetypes) {
      for (caseyear_tmp in years) {
        for (caseseq_tmp in case_seqs) {
          start <- Sys.time()
          casenum_tmp <- paste(casetype_tmp, caseyear_tmp,
                               str_pad(caseseq_tmp, side = "left", width = 5, pad = "0"),
                               sep = "-")
          u <- court_ref[court_ref$court == court_tmp, "url_pattern"] %>%
            str_replace("XX", paste0(casetype_tmp, "+")) %>%
            str_replace("YY", str_sub(caseyear_tmp, 3, 4)) %>%
            str_replace("ZZZZ", str_pad(caseseq_tmp, side = "left", pad = "0", width = 4))

          # NOTE(review): ssl_verifypeer = FALSE disables TLS certificate
          # verification. Kept as in the original; confirm it is still needed.
          ht <- try(read_html(httr::GET(u,
                                        config = httr::config(ssl_verifypeer = FALSE))))
          # A parsed page has a multi-element class vector, so compare with
          # inherits(); `class(ht) != "try-error"` is a length-2 condition.
          if (inherits(ht, "try-error")) {
            next
          }

          d <- ht %>%
            html_nodes("table") %>%
            html_table()

          # Fewer than two tables: the page carries no case data.
          if (length(d) < 2) {
            log_update(court_tmp, casenum_tmp, casetype_tmp, caseyear_tmp, available = FALSE)
            message(paste(court_tmp, casenum_tmp, "scraped in", elapsed_secs(start),
                          "seconds (data not available)."))
            next
          }

          # One-row case identity that every table below is joined to.
          cid <- tibble(court = str_to_upper(court_tmp),
                        court_code = courtlist[[court_tmp]],
                        casenum = casenum_tmp,
                        casetype = casetype_tmp,
                        file_year = caseyear_tmp)

          #### Party profiles ####
          pl <- ht %>%
            html_nodes("a") %>%
            html_attr("href")
          # Anchors without an href yield NA; drop them before matching.
          pl <- pl[!is.na(pl) & str_detect(pl, "person")]

          party_rows <- vector("list", length(pl))
          for (i in seq_along(pl)) {
            # Fetch the i-th person page (the original always requested pl[1]).
            pp <- try(read_html(httr::GET(paste0("https://www1.odcr.com/", pl[i]),
                                          config = httr::config(ssl_verifypeer = FALSE))) %>%
                        html_nodes("span") %>%
                        html_text())
            if (inherits(pp, "try-error")) {
              next
            }
            party_rows[[i]] <- tibble(
              court = str_to_upper(court_tmp),
              court_code = courtlist[[court_tmp]],
              casenum = casenum_tmp,
              casetype = casetype_tmp,
              file_year = caseyear_tmp,
              defname = str_squish(pp[2]),
              # Month/year text is parsed as the first day of that month.
              def_mob = dmy(paste0("1/", str_extract(pp[4], "\\d{1,2}/\\d{2,4}"))),
              def_address = str_squish(pp[6]),
              def_zip = str_extract(def_address, "\\d{5}")
            ) %>%
              mutate(odcr_pp_id = paste(court_code, casenum,
                                        str_pad(i, 2, "left", "0"), sep = "-")) %>%
              select(odcr_pp_id, court, casenum, casetype, file_year,
                     defname, def_mob, def_address, def_zip)
          }
          # Append once per case; appending the running total inside the loop
          # duplicated earlier parties in the global table.
          definfo <- bind_rows(party_rows)
          if (nrow(definfo) > 0) {
            if (exists("odcr_party_profile")) {
              odcr_party_profile <<- bind_rows(odcr_party_profile, definfo)
            } else {
              odcr_party_profile <<- definfo
            }
          }

          #### Per-table parsing ####
          for (i in seq_along(d)) {
            tbl <- as.data.frame(d[[i]])
            has_cells <- nrow(tbl) > 0 && ncol(tbl) > 0
            has_amount <- "Amount" %in% names(tbl)

            # Case header table.
            if (has_cells && isTRUE(tbl[1, 1] == "Case Identifier")) {
              judge_val <- if ("Judge" %in% tbl$X1) {
                tbl[which(str_detect(tbl$X1, "Judge"))[1], "X2"]
              } else {
                NA_character_
              }
              odcr_caseinfo_tmp <- cid %>%
                mutate(file_date = mdy(tbl[3, 2]),
                       judge = judge_val,
                       odcr_ci_id = paste0(court_code, casenum)) %>%
                select(odcr_ci_id, court, casenum, casetype, file_year, file_date, judge)
              if (exists("odcr_caseinfo")) {
                odcr_caseinfo <<- bind_rows(odcr_caseinfo, odcr_caseinfo_tmp)
              } else {
                odcr_caseinfo <<- odcr_caseinfo_tmp
              }
            }

            # Party table: first column holds the party type.
            if (ncol(tbl) > 0 &&
                any(c("Judge", "Agency", "Attorney", "Defendant") %in% tbl[, 1])) {
              odcr_parties_tmp <- tbl %>%
                mutate(casenum = casenum_tmp) %>%
                right_join(cid, by = "casenum") %>%
                mutate(party = X2 %>%
                         str_extract(".*?(?=\t|\n|$)") %>%
                         str_squish(),
                       party_loc = X2 %>%
                         str_extract("(?<=\\sof ).*$") %>%
                         str_squish()) %>%
                mutate(odcr_pn_id = paste0(court_code, casenum, "-",
                                           str_pad(row_number(), side = "left", width = 2, pad = "0"))) %>%
                select(odcr_pn_id, court, casenum, casetype, file_year,
                       party_type = X1, party, party_loc)
              if (exists("odcr_parties")) {
                odcr_parties <<- bind_rows(odcr_parties, odcr_parties_tmp)
              } else {
                odcr_parties <<- odcr_parties_tmp
              }
            }

            # An "Amount" table among the first three tables is treated as
            # minutes/fees; a later one is treated as payments (see below).
            if (has_amount && i < 4) {
              odcr_mins_tmp <- cid %>%
                left_join(tbl %>%
                            mutate(casenum = casenum_tmp), by = "casenum") %>%
                filter(Date != "Grand Total") %>%
                mutate(min_date = mdy(Date),
                       fee_amt = Amount %>%
                         str_remove_all("\\$|,") %>%
                         as.numeric()) %>%
                fill(min_date) %>%
                select(court, court_code, casenum, casetype, file_year,
                       min_date, min_desc = Description, fee_amt) %>%
                group_by(court, court_code, casenum, casetype, file_year, min_date, min_desc) %>%
                summarize(fee_amt = sum(fee_amt, na.rm = TRUE)) %>%
                ungroup() %>%
                mutate(odcr_min_id = paste0(court_code, casenum, "-",
                                            str_pad(row_number(), width = 3, side = "left", pad = "0"))) %>%
                select(odcr_min_id, court, casenum, casetype, file_year,
                       min_date, min_desc, fee_amt)
              if (exists("odcr_mins")) {
                odcr_mins <<- bind_rows(odcr_mins, odcr_mins_tmp)
              } else {
                odcr_mins <<- odcr_mins_tmp
              }
            }

            # Events table: identified by its "Time" column.
            if ("Time" %in% names(tbl)) {
              odcr_events_tmp <- cid %>%
                left_join(tbl %>%
                            mutate(casenum = casenum_tmp), by = "casenum") %>%
                mutate(event_date = mdy(Date),
                       event_desc = Description %>%
                         str_extract(".*?(?=\n|\t|Completed|$)"),
                       complete_date = str_extract(Description, "\\d{1,2}/\\d{1,2}/\\d{2,4}") %>%
                         mdy(),
                       event_code = str_extract(Description, "(?<=Code:).*") %>%
                         str_squish()) %>%
                mutate(odcr_ev_id = paste0(court_code, casenum, "-",
                                           str_pad(row_number(), width = 3, side = "left", pad = "0"))) %>%
                select(odcr_ev_id, court, casenum, casetype, file_year,
                       event_date, event_time = Time, event_desc, complete_date, event_code)
              if (exists("odcr_events")) {
                odcr_events <<- bind_rows(odcr_events, odcr_events_tmp)
              } else {
                odcr_events <<- odcr_events_tmp
              }
            }

            # Payments: an "Amount" table after the third table.
            if (has_amount && i > 3) {
              odcr_pays_tmp <- cid %>%
                left_join(tbl %>%
                            mutate(casenum = casenum_tmp), by = "casenum") %>%
                filter(Date != "Grand Total") %>%
                mutate(pay_date = mdy(Date),
                       pay_amt = Amount %>%
                         str_remove_all("\\$|,") %>%
                         as.numeric()) %>%
                mutate(odcr_pay_id = paste0(court_code, casenum, "-",
                                            str_pad(row_number(), width = 3, side = "left", pad = "0"))) %>%
                select(odcr_pay_id, court, casenum, casetype, file_year,
                       pay_date, pay_desc = Description, pay_amt)
              if (exists("odcr_pays")) {
                odcr_pays <<- bind_rows(odcr_pays, odcr_pays_tmp)
              } else {
                odcr_pays <<- odcr_pays_tmp
              }
            }
          }

          #### Counts and dispositions ####
          # Built from the page's <li> items, not from any single table, so it
          # runs once per case. Inside the table loop it appended the same
          # rows once per table.
          li_text <- ht %>%
            html_nodes("li") %>%
            html_text()
          li_text <- li_text[!str_detect(li_text, "Pricing|Login|Sign Up|New search|Contact Us|Advertise|Facebook")]
          counts <- tibble(ct_desc = str_to_upper(li_text),
                           casenum = casenum_tmp)
          # Disabled in the original (case-level disposition derivation):
          # disp_dates <- odcr_events_tmp[str_detect(odcr_events_tmp$event_desc, "DISPO|DEFER|CONVICT|DISMISS") & !str_detect(odcr_events_tmp$event_desc, "COSTS"), "event_date"] %>%
          #   as.list %>%
          #   unlist %>%
          #   as_date %>%
          #   unique
          #
          # disp_mins_tmp <<- odcr_mins %>%
          #   filter(min_date %in% disp_dates)
          # cd <- case_when(any(str_detect(disp_mins_tmp$min_desc, "CONVICT")) ~ "CONVICTION",
          #                 any(str_detect(disp_mins_tmp$min_desc, "DEFER")) ~ "DEFERRED",
          #                 any(str_detect(disp_mins_tmp$min_desc, "DISMISS")) ~ "DISMISSED")
          odcr_disps_tmp <- cid %>%
            left_join(counts, by = "casenum") %>%
            # mutate(case_disp = cd) %>%
            mutate(ct_no = row_number(),
                   ct_desc = str_remove(ct_desc, "\\d{1,2}\\.") %>%
                     str_squish(),
                   odcr_disp_id = paste0(court_code, casenum, "-",
                                         str_pad(ct_no, width = 3, side = "left", pad = "0"), "-",
                                         str_pad(row_number(), width = 3, side = "left", pad = "0"))) %>%
            select(odcr_disp_id, court, casenum, casetype, file_year, ct_no, ct_desc)
          if (exists("odcr_disps")) {
            odcr_disps <<- bind_rows(odcr_disps, odcr_disps_tmp)
          } else {
            odcr_disps <<- odcr_disps_tmp
          }

          message(paste(court_tmp, casenum_tmp, "scraped in", elapsed_secs(start), "seconds."))
          # Pages with fewer than three tables are logged as not available
          # even though they were parsed above (threshold as in the original).
          log_update(court_tmp, casenum_tmp, casetype_tmp, caseyear_tmp,
                     available = length(d) >= 3)

          # Periodic flush once enough update rows have accumulated.
          if (do_update && nrow(odcr_updates) >= update_freq) {
            odcr_updatedb()
            oscn_reset()
            disconnect_ojo()
          }
        }
      }
    }
  }

  # Final flush for whatever is left after the last periodic flush.
  if (do_update) {
    odcr_updatedb()
    oscn_reset()
  }
  disconnect_ojo()
}
# Add the following code to your website.
# For more information on customizing the embed code, read Embedding Snippets.