#' @title Download results of a query on second degree lawsuits filed in
#' Brazilian Justice Courts
#'
#' @description Downloads an HTML file with the results obtained from querying
#' a dataset of all second degree lawsuits, and then one HTML file for each
#' page of results (at most `max_page` pages). `query` should be the string to
#' look for in the lawsuits, and `classes`, `courts`, etc. should be the
#' filtering parameters (make sure to use [cjsg_table()] to get lists of all
#' valid codes for these arguments).
#'
#' @param query Character vector with search query
#' @param path Path to directory where to save HTMLs
#' @param classes Character vector with class IDs (see [cjsg_table()])
#' @param subjects Character vector with subject IDs (see [cjsg_table()])
#' @param courts Character vector with court IDs (see [cjsg_table()])
#' @param trial_start Lower bound for trial date
#' @param trial_end Upper bound for trial date
#' @param registration_start Lower bound for registration date
#' @param registration_end Upper bound for registration date
#' @param min_page First page of results to download
#' @param max_page Last page of results to download. If it is \code{NA} or
#' \code{Inf}, the total number of pages is determined automatically (see
#' \code{\link{peek_cjsg}}).
#' @param cores The number of cores to use when downloading. If you use more
#' than one core and are downloading more than 15 pages, you will probably have
#' your IP blocked.
#' @param wait Seconds to wait between downloads. Does not work properly if
#' \code{cores} is greater than one, in which case you will probably have your
#' IP blocked anyway.
#' @param tj TJ from which to get data (only works with TJSP for now)
#' @param ... Additional arguments (currently only `rapporteurs`) passed on to
#' [download_cjsg_tjmg()]
#' @return A character vector with the paths to the downloaded files
#'
#' @seealso [cjsg_table()], [browse_table()]
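#'
#' @examples
#' \dontrun{
#' # Illustrative sketch only: the query string and page range below are
#' # placeholders, not values from a real study
#' files <- download_cjsg("adocao", path = tempdir(), max_page = 2)
#' }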
#' @export
download_cjsg <- function(query, path = ".", classes = "", subjects = "",
courts = "", trial_start = "", trial_end = "",
registration_start = "", registration_end = "",
min_page = 1, max_page = 1, cores = 1,
wait = .5, tj = "tjsp", ...) {
if (tj == "tjmg") { return(download_cjsg_tjmg(query, path, classes, subjects,
courts, trial_start, trial_end,
registration_start, registration_end,
min_page, max_page, ...)) }
# Convert parameters to expected format
strings <- list(classes, subjects, courts) %>%
purrr::modify(stringr::str_c, collapse = ",")
dates <- list(
trial_start, trial_end,
registration_start, registration_end) %>%
purrr::modify(date_pt)
  # These lines are no longer necessary, although the original request
  # replaces spaces with '+'
  # if (stringr::str_detect(query, "\"")) {
  #   query <- stringr::str_replace_all(query, " ", "+")
  # }
# Query for POST request
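  # (field names mirror the esaj search form: dados.buscaInteiroTeor carries
  # the free-text query, and the *TreeSelection.values fields carry the
  # comma-separated filter codes built above)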
query_post <- list(
"conversationId" = "",
"dados.buscaInteiroTeor" = query,
"dados.pesquisarComSinonimos" = "N",
"contadoragente" = 0,
"contadorMaioragente" = 0,
"contadorjuizProlator" = 0,
"contadorMaiorjuizProlator" = 0,
"classesTreeSelection.values" = strings[[1]],
"assuntosTreeSelection.values" = strings[[2]],
"contadorcomarca" = 0,
"contadorMaiorcomarca" = 0,
"secoesTreeSelection.values" = strings[[3]],
"dados.dtJulgamentoInicio" = dates[[1]],
"dados.dtJulgamentoFim" = dates[[2]],
"dados.dtRegistroInicio" = dates[[3]],
"dados.dtRegistroFim" = dates[[4]],
"dados.origensSelecionadas" = "T",
"tipoDecisaoSelecionados" = "A",
"dados.ordenarPor" = "dtPublicacao")
# Create directory if necessary
  dir.create(path, showWarnings = FALSE, recursive = TRUE)
path <- normalizePath(path)
file <- stringr::str_c(path, "/search.html")
# Execute post request
httr::POST(
"https://esaj.tjsp.jus.br/cjsg/resultadoCompleta.do",
body = query_post, httr::config(ssl_verifypeer = FALSE),
httr::write_disk(file, TRUE))
if (is.na(max_page) || is.infinite(max_page)) {
# Get number of pages
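    # (the paginator lists the page size first and the total result count
    # second, so dividing the second number by the first gives the page
    # count; this assumes the paginator keeps that order)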
max_page <- dirname(file) %>%
list.files("search", full.names = TRUE) %>%
xml2::read_html() %>%
xml2::xml_find_all("//*[@id='paginacaoSuperior-A']") %>%
rvest::html_text() %>%
stringr::str_extract_all(" [0-9]+") %>%
purrr::pluck(1) %>%
stringr::str_trim() %>%
as.numeric() %>%
magrittr::divide_by(.[1]) %>%
purrr::pluck(2) %>%
`%||%`(0) %>%
ceiling()
message("A total of ", max_page - min_page, " pages will be downloaded")
}
stopifnot(min_page <= max_page)
  # Function to download a single page into a directory
download_pages <- function(page, path, wait) {
Sys.sleep(wait)
# Query for GET request
query_get <- list(
"tipoDeDecisao" = "A",
"pagina" = page,
"conversationId" = "")
# Protect GET in case there are no pages
GET <- purrr::possibly(httr::GET, "")
# Download page
    out <- NULL
    file <- stringr::str_c(
      path, "/page_", stringr::str_pad(page, 4, "left", "0"), ".html")
if (!file.exists(file)) {
out <- GET(
"https://esaj.tjsp.jus.br/cjsg/trocaDePagina.do",
query = query_get, httr::config(ssl_verifypeer = FALSE),
httr::write_disk(file, TRUE))
}
    # Return the fallback if the GET failed, the normalized path otherwise
    if (is.character(out)) {
      file <- out
    } else {
      file <- normalizePath(file)
    }
    return(file)
}
# Download all pages
files <- parallel::mcmapply(
download_pages, min_page:max_page,
path = path, wait = wait,
SIMPLIFY = FALSE, mc.cores = cores)
return(c(file, purrr::flatten_chr(files)))
}
#' Check how long a call to [download_cjsg()] will probably take
#' @param ... Arguments passed on to [download_cjsg()]
#' (`path` will be ignored)
#' @seealso [download_cjsg()], [cjsg_table()]
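#' @examples
#' \dontrun{
#' # Illustrative sketch only: estimate the size of a query before
#' # committing to the full download (the query string is a placeholder)
#' peek_cjsg(query = "adocao", max_page = 10)
#' }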
#' @export
peek_cjsg <- function(...) {
# Special treatment to some arguments
dots <- rlang::dots_list(...)
path <- tempdir()
dots$path <- path
min_p <- dots$min_page
max_p <- dots$max_page
dots$min_page <- 1
dots$max_page <- 1
dots$wait <- 0
# Call download_cjsg
do.call(download_cjsg, dots)
# Fix pages
dots$min_page <- min_p %||% 1
dots$max_page <- max_p %||% 1
# Get number of pages
pages <- path %>%
list.files("search", full.names = TRUE) %>%
xml2::read_html() %>%
xml2::xml_find_all("//*[@id='paginacaoSuperior-A']") %>%
rvest::html_text() %>%
stringr::str_extract_all(" [0-9]+") %>%
purrr::pluck(1) %>%
stringr::str_trim() %>%
as.numeric()
n_pages <- pages %>%
magrittr::divide_by(.[1]) %>%
purrr::pluck(2) %>%
`%||%`(0) %>%
ceiling()
# Print message
if (n_pages == 0) {
message("There are no pages to download")
invisible(pages)
  } else {
dots$max_page <- min(dots$max_page, n_pages)
n_pages <- dots$max_page - dots$min_page + 1
if (n_pages > 1000) {
      message(
        "There are ",
        pages[1] * n_pages, " lawsuits to download ",
        "(for a total of ", n_pages, " pages)\n",
        "This should take around ",
        how_long(n_pages * 1.3988),
        "\nNote that this estimate is only reliable for fewer than 1000 pages")
    } else {
      message(
        "There are ",
        pages[2], " lawsuits to download ",
        "(for a total of ", n_pages, " pages)\n",
        "This should take around ",
        how_long(n_pages * 1.3988))
}
invisible(pages)
}
}
#' Temporary function for downloading TJMG's CJSG queries
#'
#' @param query Character vector with search query
#' @param path Path to directory where to save HTMLs
#' @param classes Character vector with class IDs (e.g. `c(175, 43, 259, 263)`)
#' @param subjects Character vector with subject IDs (e.g. `c(10207, 10008, 10199)`)
#' @param courts Character vector with court IDs (e.g. `c("1-7", "1-9", "2-3", "1-1")`)
#' @param rapporteurs Character vector with rapporteur IDs (e.g. `c("2-1528561", "2-2345361")`)
#' @param trial_start Lower bound for trial date
#' @param trial_end Upper bound for trial date
#' @param registration_start Lower bound for registration date
#' @param registration_end Upper bound for registration date
#' @param min_page First page of results to download
#' @param max_page Last page of results to download. If it is \code{NA} or
#' \code{Inf}, the total number of pages is determined automatically (see
#' \code{\link{peek_cjsg}}).
#' @return A character vector with the paths to the downloaded files
#'
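#' @examples
#' \dontrun{
#' # Illustrative sketch only: this helper is normally reached through
#' # download_cjsg(..., tj = "tjmg"); the query string is a placeholder
#' download_cjsg("dano moral", path = tempdir(), tj = "tjmg", max_page = 1)
#' }
#'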
download_cjsg_tjmg <- function(query, path = ".", classes = "", subjects = "",
courts = "", trial_start = "", trial_end = "",
registration_start = "", registration_end = "",
min_page = 1, max_page = 1, rapporteurs = "") {
# Require V8 and decryptr
require_pkg("V8")
require_pkg("decryptr")
# Create directory if necessary
  dir.create(path, showWarnings = FALSE, recursive = TRUE)
path <- normalizePath(path)
  # Turn a vector into a named list, repeating the same parameter name for
  # each element (the TJMG form repeats names for multi-valued filters)
  replicate_over <- function(vec, name) {
    vec %>%
      as.character() %>%
      as.list() %>%
      purrr::set_names(rep(name, length(.)))
  }
names <- c("listaClasse", "listaAssunto", "listaOrgaoJulgador", "listaRelator")
# Convert dates to expected format
dates <- list(
trial_start, trial_end,
registration_start, registration_end) %>%
purrr::modify(date_pt)
# Create part of query with lists of filters
lists <- list(classes, subjects, courts, rapporteurs) %>%
purrr::map2(names, replicate_over) %>%
purrr::flatten() %>%
purrr::discard(~.x == "")
# Query for GET request
query_get <- c(list(
dataPublicacaoInicial = dates[[3]],
dataPublicacaoFinal = dates[[4]],
dataJulgamentoInicial = dates[[1]],
dataJulgamentoFinal = dates[[2]],
numeroRegistro = "1",
totalLinhas = "1",
palavras = query,
pesquisarPor = "ementa",
pesquisaTesauro = "true",
orderByData = "1",
linhasPorPagina = "10",
pesquisaPalavras = "Pesquisar",
classe = "",
codigoAssunto = "",
codigoOrgaoJulgador = "",
codigoCompostoRelator = ""
), lists)
# Base URL
base <- "http://www5.tjmg.jus.br/jurisprudencia/"
# Run search query on website's home
u_search <- stringr::str_c(base, "pesquisaPalavrasEspelhoAcordao.do")
r_search <- httr::GET(u_search, query = query_get)
# Collect captcha
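  # (the random suffix appended to the URL presumably acts as a cache-buster
  # so that a fresh captcha image is served)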
  v8 <- V8::v8()
  captcha <- tempfile(fileext = ".jpeg")
u_captcha <- stringr::str_c(base, "captcha.svl?", v8$eval("Math.random()*5"))
r_captcha <- httr::GET(u_captcha, httr::write_disk(captcha, overwrite = TRUE))
# Query for POST request
query_post <- list(
"callCount" = "1",
"page" = "link_busca",
"httpSessionId" = r_search$cookies$value[1],
"scriptSessionId" = "",
"c0-scriptName" = "ValidacaoCaptchaAction",
"c0-methodName" = "isCaptchaValid",
"c0-id" = "0",
"c0-param0" = stringr::str_c("string:", decryptr::decrypt(captcha, "tjmg")),
"batchId" = "0")
file.remove(captcha)
# Validate captcha's answer
u_validate <- stringr::str_c(base, "dwr/call/plaincall/ValidacaoCaptchaAction.isCaptchaValid.dwr")
r_validate <- httr::POST(u_validate, body = query_post, encode = "form")
# Iterate over pages of results
files <- c()
for (i in min_page:max_page) {
# Update page number
query_get["paginaNumero"] = i
# Rerun search, now with captcha broken
file <- stringr::str_c(path, "/page_", stringr::str_pad(i, 4, "left", "0"), ".html")
httr::GET(u_search, query = query_get, httr::write_disk(file, overwrite = TRUE))
    # Keep the file only if it contains a paginator (i.e. the page has results)
    paginator <- file %>%
      xml2::read_html() %>%
      as.character() %>%
      stringr::str_detect("p.gina [0-9]* de [0-9]*") # "p.gina" matches "página"
    if (!paginator) {
      file.remove(file)
      break
    }
files <- append(files, file)
}
return(files)
}