# Build the e-SAJ (TJSP) first-instance search URL for one lawsuit number.
#
# p: a single lawsuit number (character or coercible to character). Every
#    non-digit character is stripped first, so punctuated and digits-only
#    numbers produce the same URL.
#
# Returns a length-one character vector: the search.do endpoint followed by
# the query string.
build_url_cpo_pg <- function(p) {
  p <- gsub("[^0-9]", "", as.character(p))
  n_digitos <- nchar(p)
  # The slices below operate on the digits-only number. Positions 1-13 are
  # the part that precedes the three middle segments and the last four digits
  # are the trailing segment.
  # NOTE(review): assumes the 20-digit unified number layout (7 + 2 + 4 + 1 +
  # 2 + 4 digits) and that the server accepts unpunctuated values -- confirm
  # against the e-SAJ search form.
  dados_url <- list(
    'conversationId' = '',
    'dadosConsulta.localPesquisa.cdLocal' = '-1',
    'cbPesquisa' = 'NUMPROC',
    'dadosConsulta.tipoNuProcesso' = 'UNIFICADO',
    'numeroDigitoAnoUnificado' = substr(p, 1, 13),
    'foroNumeroUnificado' = substr(p, n_digitos - 3, n_digitos),
    'dadosConsulta.valorConsultaNuUnificado' = p,
    'dadosConsulta.valorConsulta' = ''
  )
  url1 <- "https://esaj.tjsp.jus.br/cpopg/search.do"
  parametros <- paste(names(dados_url), unlist(dados_url), sep = "=")
  paste(url1, paste0(parametros, collapse = "&"), sep = "?")
}
# Download the CPO-PG search page for each lawsuit number in `p`.
#
# p: vector of lawsuit numbers; non-digit characters are stripped before use.
#
# Each page is written to 'data-raw/cpo-pg/<digits>.html'. Numbers whose file
# already exists are skipped. A text progress bar is shown while iterating.
# Called for its side effects only.
download_cpopg <- function(p) {
  numeros <- gsub('[^0-9]', '', p)
  baixar_um <- function(numero) {
    destino <- sprintf('data-raw/cpo-pg/%s.html', numero)
    if (file.exists(destino)) {
      return(invisible(NULL))
    }
    # NOTE(review): TLS peer verification is disabled for this request.
    httr::GET(
      build_url_cpo_pg(numero),
      httr::write_disk(destino),
      httr::config(ssl_verifypeer = FALSE)
    )
  }
  plyr::l_ply(numeros, baixar_um, .progress = 'text')
}
#' Parse the docket movements table of a saved CPO-PG page
#'
#' Reads the HTML in `a`, converts the node `#tabelaTodasMovimentacoes` to a
#' table, keeps its first column as `data_mov`, and splits its third column
#' at the first `"\r\n\t"` into `titulo` and `mov`.
#'
#' @param a Input passed to `xml2::read_html()` (e.g. a file path).
#' @return A tibble with columns `data_mov`, `titulo` and `mov`.
#' @export
parse_cpopg_movs_ <- function(a) {
  pagina <- xml2::read_html(a)
  no_tabela <- rvest::html_node(pagina, '#tabelaTodasMovimentacoes')
  tabela <- rvest::html_table(no_tabela)
  colunas <- dplyr::select(tabela, data_mov = X1, X3)
  # extra = 'merge' keeps everything after the first separator in `mov`;
  # fill = 'right' leaves `mov` missing when there is no separator.
  separado <- tidyr::separate(
    colunas, X3, c('titulo', 'mov'),
    sep = '\r\n\t', extra = 'merge', fill = 'right'
  )
  dplyr::tbl_df(separado)
}
#' Normalise a label into a key
#'
#' Lower-cases `x`, replaces each run of spaces with a single underscore, and
#' passes the result through `desacentuar()`.
#'
#' @param x Character vector of labels.
#' @return Character vector of normalised keys.
#' @export
arrumar_key <- function(x) {
  minusculo <- tolower(x)
  com_sublinhado <- stringr::str_replace_all(minusculo, ' +', '_')
  desacentuar(com_sublinhado)
}
#' Normalise a party-role label
#'
#' Applies the same lower-casing, space-to-underscore and `desacentuar()`
#' steps as `arrumar_key()`, then drops every character that is not a
#' lowercase ASCII letter (which also removes the underscores).
#'
#' @param x Character vector of labels.
#' @return Character vector containing only the characters `a` to `z`.
#' @export
arrumar_forma <- function(x) {
  gsub('[^a-z]', '', arrumar_key(x))
}
#' Strip accents from text
#'
#' Transliterates `x` to ASCII with `iconv(to = "ASCII//TRANSLIT")` and then
#' removes every backtick and apostrophe from the result.
#'
#' @param x Character vector.
#' @return Character vector of the same length as `x`.
#' @export
desacentuar <- function(x) {
  ascii <- iconv(x, to = "ASCII//TRANSLIT")
  # Presumably these two characters are removed because some iconv
  # implementations emit them as stand-ins for accents -- TODO confirm.
  gsub("`|\\'", "", ascii)
}
#' Parse the case-information section of a saved CPO-PG page
#'
#' Reads the HTML in `a` and stacks four pieces into one key/value table:
#' \itemize{
#'   \item the text of each `tr` inside the last `.secaoFormBody` node, split
#'     at the first `:` into `key` and `value` (keys normalised with
#'     `arrumar_key()`; rows without a `:` get the key `lugar`);
#'   \item the `processo` row split at the first space into `n_processo` and
#'     `status`;
#'   \item `cdprocesso`, captured from `processoPK.cdProcesso=...&` in the
#'     page text;
#'   \item `digital`, `"TRUE"`/`"FALSE"` depending on whether the first
#'     `.linkPasta` node contains the text 'Este processo é digital'.
#' }
#'
#' @param a Input passed to `xml2::read_html()` (e.g. a file path).
#' @return A tibble with columns `key` and `value`.
#' @export
parse_cpopg_info_ <- function(a) {
  # a <- sample(arqs[arqs_tem], 1)
  # visualize(a)
  html <- xml2::read_html(a)
  infos <- html %>%
    rvest::html_nodes('.secaoFormBody') %>%
    dplyr::last() %>%
    rvest::html_nodes('tr') %>%
    rvest::html_text() %>%
    stringr::str_replace_all('[\n\r\t]+', ' ') %>%
    stringr::str_replace_all(' +', ' ') %>%
    stringr::str_trim() %>%
    unique() %>%
    { dplyr::data_frame(info = .) } %>%
    # fill = 'left' puts text without a ':' into `value` and leaves `key` NA.
    tidyr::separate(info, c('key', 'value'), sep = '\\:',
                    extra = 'merge', fill = 'left') %>%
    dplyr::mutate(key = stringr::str_trim(key),
                  value = stringr::str_trim(value)) %>%
    # Keep the first row of each distinct `value` together with its `key`.
    # `dplyr::distinct(value)` would return only the `value` column on
    # dplyr >= 0.5, breaking the `key` transformations below.
    dplyr::filter(!duplicated(value)) %>%
    dplyr::mutate(key = stringr::str_replace_na(key, 'Lugar')) %>%
    dplyr::mutate(key = arrumar_key(key))
  infos_cdp <- html %>%
    rvest::html_text() %>%
    stringr::str_match('processoPK\\.cdProcesso=([^&]+)&') %>%
    as.character() %>%
    # The match matrix flattens to c(full match, capture group); keep the
    # capture group.
    dplyr::last() %>%
    { dplyr::data_frame(key = 'cdprocesso', value = .) }
  infos_p <- infos %>%
    dplyr::filter(key == 'processo') %>%
    tidyr::separate(value, c('n_processo', 'status'), sep = ' ',
                    extra = 'merge', fill = 'right') %>%
    dplyr::select(-key) %>%
    # With no names given, gather() produces columns `key` and `value`.
    tidyr::gather(convert = TRUE)
  infos_digital <- html %>%
    rvest::html_nodes('.linkPasta') %>%
    { if (length(.) == 0) '' else rvest::html_text(dplyr::first(.)) } %>% {
      digital <- stringr::str_detect(., 'Este processo é digital')
      dplyr::data_frame(key = 'digital', value = as.character(digital))
    }
  dplyr::bind_rows(infos, infos_p, infos_cdp, infos_digital) %>%
    dplyr::tbl_df()
}
#' Parse the parties table of a saved CPO-PG page
#'
#' Uses the nodes matching `#tableTodasPartes`, falling back to
#' `#tablePartesPrincipais` when there are none, and works on the first
#' match. If that node's text is empty once newlines, carriage returns and
#' tabs are removed, an empty data frame is returned. Otherwise the table's
#' first column becomes `forma` (normalised with `arrumar_forma()`) and its
#' second column is split at the first `"\r\n\t"` into `parte` and `adv`.
#'
#' @param a Input passed to `xml2::read_html()` (e.g. a file path).
#' @return A tibble with columns `forma`, `parte` and `adv`, or an empty
#'   data frame when the table has no text.
#' @export
parse_cpopg_partes_ <- function(a) {
  html <- xml2::read_html(a)
  tabelas <- rvest::html_nodes(html, "#tableTodasPartes")
  if (length(tabelas) == 0) {
    tabelas <- rvest::html_nodes(html, "#tablePartesPrincipais")
  }
  tabela <- dplyr::first(tabelas)

  texto <- gsub('[\n\r\t]', '', rvest::html_text(tabela))
  if (texto == '') {
    return(dplyr::data_frame())
  }

  partes <- rvest::html_table(tabela)
  partes <- tidyr::separate(
    partes, X2, c("parte", "adv"),
    sep = "\r\n\t", extra = "merge", fill = "right"
  )
  partes <- dplyr::mutate(partes, adv = stringr::str_trim(adv))
  partes <- dplyr::rename(partes, forma = X1)
  partes <- dplyr::mutate(partes, forma = arrumar_forma(forma))
  # Collapse each whitespace run that contains a carriage return into a
  # single newline.
  partes <- dplyr::mutate(
    partes,
    adv = gsub(" *\r[ \r\t\n]+ *", "\n", adv),
    adv = gsub("\\ ", " ", adv)
  )
  dplyr::tbl_df(partes)
}
#' Parse a set of saved CPO-PG pages
#'
#' Runs `parse_cpopg_info_()`, `parse_cpopg_partes_()` and
#' `parse_cpopg_movs_()` on every file and collects the results, one row per
#' file.
#'
#' With `.parallel = TRUE` a cluster with `parallel::detectCores()` workers
#' is created and registered for the duration of the call; a file whose
#' parsing fails yields a placeholder row (`arq = NA`, each list column
#' holding a one-cell 'error' table) instead of aborting the run. With
#' `.parallel = FALSE` the files are de-duplicated first and a parsing error
#' propagates to the caller.
#'
#' @param arqs Character vector of file paths.
#' @param .parallel Parse on a local cluster? Defaults to `TRUE`.
#' @return A tibble with column `arq` and list columns `infos`, `partes` and
#'   `movs`.
#' @export
#'
#' @import magrittr
parse_cpopg <- function(arqs, .parallel = TRUE) {
  if (.parallel) {
    fun <- function(i, a, len) {
      # Print progress for roughly 1% of the files.
      if (runif(1) < 0.01) cat(i, 'de', len, '\n')
      x <- a[i]
      dplyr::data_frame(
        arq = x,
        infos = list(brunoSalama::parse_cpopg_info_(x)),
        partes = list(brunoSalama::parse_cpopg_partes_(x)),
        movs = list(brunoSalama::parse_cpopg_movs_(x))
      )
    }
    d_fail_tbl <- list(dplyr::data_frame('error'))
    d_fail <- dplyr::data_frame(arq = NA, infos = d_fail_tbl,
                                partes = d_fail_tbl, movs = d_fail_tbl)
    f <- dplyr::failwith(d_fail, fun)
    cl <- parallel::makeCluster(parallel::detectCores(), outfile = '')
    # Stop the workers even if the parallel run errors, so no processes leak.
    on.exit(parallel::stopCluster(cl), add = TRUE)
    doParallel::registerDoParallel(cl)
    n <- length(arqs)
    d <- dplyr::tbl_df(plyr::ldply(seq_len(n), f, a = arqs, len = n,
                                   .parallel = TRUE))
  } else {
    d <- dplyr::data_frame(arq = arqs) %>%
      dplyr::distinct(arq) %>%
      dplyr::group_by(arq) %>%
      dplyr::do(infos = parse_cpopg_info_(.$arq),
                partes = parse_cpopg_partes_(.$arq),
                movs = parse_cpopg_movs_(.$arq)) %>%
      dplyr::ungroup()
  }
  d
}
# Add the following code to your website.
# For more information on customizing the embed code, read Embedding Snippets.