#' Parse lawsuits extracted from CJSG query
#'
#' Files that are empty or that cannot be found are skipped; a warning is
#' raised when any of the supplied paths does not exist.
#'
#' @param file Character vector with the paths to one or more files
#' @param cores Number of cores to use when parsing. With `cores = 1` the
#' files are parsed sequentially with a progress bar; any other value
#' dispatches the work through [parallel::mclapply()]
#'
#' @return A tibble with the columns
#' \itemize{
#'   \item `file` Name of the file
#'   \item `id_page` ID found in the page
#'   \item `id_decision` Unique ID of the ruling
#'   \item `id_lawsuit` Number of the lawsuit (doesn't have to be unique)
#'   \item `class_subject` Class/subject, separated by slashes
#'   \item `district` Name of the district
#'   \item `court` Body responsible for the appeal
#'   \item `date_decision` Date of the judgement (\%d/\%m/\%Y)
#'   \item `date_publication` Date of the publication (\%d/\%m/\%Y)
#'   \item `date_registration` Date of registration in the system (\%d/\%m/\%Y)
#'   \item `rapporteur` Name of the rapporteur
#'   \item `summary` Summary of the ruling
#'   \item `txt_summary` Text of the summary with no formatting
#' }
#' @export
parse_cjsg <- function(file, cores = 1) {

  # Set names for .id
  names(file) <- file

  # file.size() returns NA for paths that don't exist; indexing with NA would
  # insert NA elements into the vector, so missing files are dropped explicitly
  sizes <- file.size(file)
  not_found <- is.na(sizes)
  if (any(not_found)) {
    warning(
      sum(not_found), " file(s) could not be found and were skipped",
      call. = FALSE)
  }

  # Keep only files that exist and have some content
  file <- file[!not_found & sizes > 0]

  # Run either with progress bar or on parallel
  if (cores == 1) {
    pb <- progress::progress_bar$new(total = length(file))
    purrr::map_dfr(file, parse_cjsg_, pb, .id = "file")
  } else {
    file %>%
      parallel::mclapply(parse_cjsg_, mc.cores = cores) %>%
      dplyr::bind_rows(.id = "file")
  }
}
#' Parse a single page of CJSG results
#'
#' @param file Path to the file that should be parsed
#' @param pb Progress bar created by [parse_cjsg()]; when `NULL` no progress
#' is reported
#' @return A tibble with the information parsed from the page
parse_cjsg_ <- function(file, pb = NULL) {

  # Wrap the lawsuit parser so a failure yields an empty tibble (the error
  # is still printed because quiet = FALSE)
  safe_parse <- purrr::possibly(
    parse_cjsg_lawsuit, otherwise = tibble::tibble(), quiet = FALSE)

  # Read the page and collect the nodes that hold each lawsuit
  page <- xml2::read_html(file, encoding = "UTF-8")
  lawsuits <- rvest::html_nodes(page, ".fundocinza1")

  # Parse every node and stack the resulting rows
  parsed <- purrr::map_dfr(lawsuits, safe_parse)

  # Advance the progress bar if one was supplied
  if (!is.null(pb)) {
    pb$tick()
  }

  parsed
}
#' Parse one lawsuit from a CJSG page
#'
#' Fields that are absent from the node are returned as `NA`, so a lawsuit
#' is not discarded just because one of its key/value entries is missing.
#'
#' @param node A `.fundocinza1` node extracted from the page
#' @return One row with the data concerning the lawsuit
parse_cjsg_lawsuit <- function(node) {

  # Normalized keys that the final select() relies on
  expected_keys <- c(
    "comarca", "orgao_julgador", "data_julgamento", "data_publicacao",
    "data_registro", "relatora", "ementa")

  # Auxiliary function to create the columns that don't exist
  fill_in_columns <- function(data, cnames) {
    absent <- setdiff(cnames, names(data))
    if (length(absent) > 0) {
      data[absent] <- NA_character_
    }
    data
  }

  # Get information from lawsuit
  tmp <- rvest::html_node(node, ".downloadEmenta")
  infos <- tibble::tibble(
    id_lawsuit = stringr::str_trim(rvest::html_text(tmp)),
    id_decision = rvest::html_attr(tmp, "cdacordao"))

  # Get complicated variables
  id <- node %>%
    rvest::html_node(".ementaClass") %>%
    rvest::html_text() %>%
    stringr::str_trim() %>%
    stringr::str_replace_all("[^0-9]", "")
  cs <- node %>%
    rvest::html_node(".assuntoClasse") %>%
    rvest::html_text() %>%
    stringr::str_trim()
  ts <- node %>%
    rvest::html_node("textarea") %>%
    rvest::html_text()

  # Split every "key: value" entry at the first colon; the matrix columns are
  # named up front because converting an unnamed matrix to a tibble is
  # deprecated in recent versions of tibble
  key_val <- node %>%
    rvest::html_nodes(".ementaClass2") %>%
    rvest::html_text() %>%
    stringr::str_split_fixed(":", 2)
  colnames(key_val) <- c("key", "val")

  # Create final table
  key_val %>%
    tibble::as_tibble() %>%
    dplyr::mutate_all(stringr::str_trim) %>%
    dplyr::mutate(
      # Normalize keys into snake_case column names without accents
      key = key %>%
        rm_accent() %>%
        stringr::str_to_lower() %>%
        stringr::str_replace_all(" +", "_") %>%
        stringr::str_replace_all("[^a-z_]", "") %>%
        stringr::str_replace_all("_d[eo]_", "_")) %>%
    tidyr::spread(key, val) %>%
    dplyr::bind_cols(infos) %>%
    fill_in_columns(expected_keys) %>%
    dplyr::mutate(id = id, cs = cs, ts = ts) %>%
    dplyr::select(
      id_page = id, id_decision, id_lawsuit, class_subject = cs,
      district = comarca, court = orgao_julgador,
      date_decision = data_julgamento, date_publication = data_publicacao,
      date_registration = data_registro, rapporteur = relatora,
      summary = ementa, txt_summary = ts)
}
# Add the following code to your website.
# For more information on customizing the embed code, read Embedding Snippets.