download_arq <- function(u, a, verbose = FALSE) {
if (file.exists(a)) {
if (verbose)
cat("\narquivo ", a, " ja existe!\n")
return(dplyr::data_frame(result = "exists"))
}
if (verbose)
cat("\nbaixando ", a, "...", sep = "")
res <- tryCatch({
r <- suppressWarnings({
httr::GET(u, httr::write_disk(a, overwrite = TRUE))
})
ct <- httr::headers(r)[["content-type"]]
ct <- ifelse(is.null(ct), "application", ct)
}, error = function(e) as.character(e))
if (stringr::str_detect(res, "Timeout")) {
if (verbose)
cat("ERRO!\n")
return(dplyr::data_frame(result = "timeout"))
}
if (httr::status_code(r) == 200 && stringr::str_detect(ct, "application")) {
if (verbose)
cat("OK!\n")
return(dplyr::data_frame(result = "ok"))
}
if (verbose)
cat("ERRO!\n")
return(dplyr::data_frame(result = "nao tem dje"))
}
dje_stj <- function(dates, path = "data-raw/dje_pdf", verbose = FALSE) {
u <- "http://dj.stj.jus.br/"
pastas <- sprintf("%s/stj_dje_%s", path, dates)
invisible(sapply(pastas, dir.create, showWarnings = FALSE, recursive = TRUE))
f <- dplyr::failwith(dplyr::data_frame(result = "erro"), download_arq)
d <- expand.grid(date = dates, caderno = as.character(1),
KEEP.OUT.ATTRS = FALSE, stringsAsFactors = FALSE) %>%
dplyr::tbl_df() %>%
dplyr::mutate(date_link = format(as.Date(date), "%Y%m%d"),
link = sprintf("%s%s.pdf", u, date_link),
arq = sprintf("%s/stj_dje_%s.pdf", pastas, date)) %>%
dplyr::arrange(desc(date)) %>%
dplyr::group_by(date, caderno, date_link, link, arq) %>%
dplyr::do(f(.$link, .$arq, verbose)) %>%
dplyr::ungroup() %>%
dplyr::select(date, caderno, link, arq, result)
return(d)
}
pdf2text <- function(a, first_pg = NA, last_pg = NA, r = F, keep_file = F, new_file = 'repo.txt'){
if(!file.exists(a)){return('')}
if(file.size(a) > 5000 & stringi::stri_detect(a, fixed = '.pdf')){
sprintf('pdftotext %s %s%s%s%s',
a,
ifelse(r,'-raw ',' '),
ifelse(!is.na(first_pg),paste('-f',first_pg,''),' '),
ifelse(!is.na(last_pg),paste('-l',last_pg,''),' '),
new_file) %>%
system()
texto = readr::read_file(new_file)
ifelse(keep_file,NA,file.remove(new_file))
} else {
texto = ''
}
return(texto)
}
parse_dje_stj <- function(arq) {
txt <- pdf2text(arq, r = TRUE)
re_cabecalho <- '(\npág\\. [0-9].*)|(\f|^)(Sup(.|\n)*?pub.*?[0-9]{4}\\.\n)'
# spl <- stringr::str_split(txt, '\f')
# stringr::str_replace_all(spl[[1]][], re_cabecalho, '')
txt <- stringr::str_replace_all(txt, re_cabecalho, '')
re_nproc <- '[\n\f](\\([0-9]+\\))[\n\f]+((.|\n)*?)\\(([0-9]{4}/[0-9]+-[0-9])\\)'
processos <- stringr::str_match_all(txt, re_nproc)[[1]]
id_processo <- tidyr::extract_numeric(processos[, 2])
txt_processo <- processos[, 3]
n_processo <- processos[, 5]
j <- 1
ok <- logical(length(id_processo))
for (i in seq_along(id_processo)) {
ok[i] <- (id_processo[i] == j)
if (ok[i]) j <- j + 1
}
re_processo <- '([A-Za-z ]+?) +[nN]\\? ?([0-9.]+)[- /]+?([A-Z]{2})'
dados <- txt_processo[ok] %>%
desacentuar() %>%
stringr::str_replace_all('\n', ' ') %>%
stringr::str_match(re_processo) %>%
as.data.frame(stringsAsFactors = FALSE) %>%
setNames(c('original', 'tipo', 'num', 'uf')) %>%
dplyr::mutate(original = txt_processo[ok],
id = id_processo[ok],
n_stj = n_processo) %>%
dplyr::tbl_df()
dados
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.