R/parse-cpopg.R

Defines functions build_url_cpo_pg download_cpopg parse_cpopg_movs_ arrumar_key arrumar_forma desacentuar parse_cpopg_info_ parse_cpopg_partes_ parse_cpopg

# Build the search URL of the TJSP e-SAJ "cpopg" lookup for one lawsuit number.
#
# p: lawsuit number (character or coercible to character); every character
#    that is not a digit is stripped before the number is used.
#
# Returns a single string: the search.do endpoint followed by "?" and the
# query parameters joined with "&". Values are not URL-encoded, which is safe
# here because they are either fixed tokens or digits only.
build_url_cpo_pg <- function(p) {
  p <- gsub("[^0-9]", "", as.character(p))
  # The parameter names must match the form fields exactly: the previous
  # version filled a misspelled key ("numeri..."), which left the real
  # "numeroDigitoAnoUnificado" empty and appended a bogus extra parameter.
  # NOTE(review): offsets 15 and 22 look like positions in the punctuated
  # number, but `p` is digits-only at this point -- confirm which slices the
  # server expects before changing them.
  dados_url <- list(
    "conversationId" = "",
    "dadosConsulta.localPesquisa.cdLocal" = "-1",
    "cbPesquisa" = "NUMPROC",
    "dadosConsulta.tipoNuProcesso" = "UNIFICADO",
    "numeroDigitoAnoUnificado" = substr(p, 1, 15),
    "foroNumeroUnificado" = substr(p, 22, nchar(p)),
    "dadosConsulta.valorConsultaNuUnificado" = p,
    "dadosConsulta.valorConsulta" = ""
  )
  url1 <- "https://esaj.tjsp.jus.br/cpopg/search.do"
  parametros <- paste(names(dados_url), unlist(dados_url), sep = "=")
  paste(url1, paste0(parametros, collapse = "&"), sep = "?")
}

# Download the cpopg search page of each lawsuit number in `p`.
#
# p: vector of lawsuit numbers; non-digit characters are stripped first.
#
# Each page is written to data-raw/cpo-pg/<digits>.html (relative to the
# working directory). Numbers whose file already exists are skipped, so the
# function can be re-run to resume an interrupted batch. Called for its side
# effects only.
download_cpopg <- function(p) {
  numbers <- gsub("[^0-9]", "", p)
  fetch_one <- function(number) {
    target <- sprintf("data-raw/cpo-pg/%s.html", number)
    if (!file.exists(target)) {
      # NOTE: certificate verification is switched off for this request.
      httr::GET(
        build_url_cpo_pg(number),
        httr::write_disk(target),
        httr::config(ssl_verifypeer = FALSE)
      )
    }
  }
  plyr::l_ply(numbers, fetch_one, .progress = "text")
}

#' Parse the docket entries table of a cpopg page
#'
#' Reads the HTML table with id `tabelaTodasMovimentacoes`, keeps its first
#' and third columns and splits the third one on the first `"\r\n\t"` into a
#' title and the remaining text.
#'
#' @param a Input handed to `xml2::read_html()` (presumably the path of a
#'   saved cpopg page).
#'
#' @return A tibble with columns `data_mov`, `titulo` and `mov`.
#'
#' @export
parse_cpopg_movs_ <- function(a) {
  page <- xml2::read_html(a)
  table_node <- rvest::html_node(page, '#tabelaTodasMovimentacoes')
  raw_table <- rvest::html_table(table_node)
  kept <- dplyr::select(raw_table, data_mov = X1, X3)
  # Text after the first separator stays together in `mov`; rows without a
  # separator get NA there.
  split_up <- tidyr::separate(kept, X3, c('titulo', 'mov'),
                              sep = '\r\n\t', extra = 'merge', fill = 'right')
  dplyr::tbl_df(split_up)
}

#' Turn a label into a lower-case key
#'
#' Lower-cases `x`, replaces every run of spaces with a single underscore and
#' passes the result through `desacentuar()`.
#'
#' @param x Character vector of labels.
#'
#' @return Character vector of the same length as `x`.
#'
#' @export
arrumar_key <- function(x) {
  lowered <- tolower(x)
  underscored <- stringr::str_replace_all(lowered, ' +', '_')
  desacentuar(underscored)
}

#' Reduce a label to lower-case letters only
#'
#' Applies the same steps as `arrumar_key()` (lower-case, spaces to
#' underscores, `desacentuar()`) and then deletes every character outside
#' `a-z`, underscores included.
#'
#' @param x Character vector of labels.
#'
#' @return Character vector of the same length as `x`.
#'
#' @export
arrumar_forma <- function(x) {
  lowered <- tolower(x)
  underscored <- stringr::str_replace_all(lowered, ' +', '_')
  plain <- desacentuar(underscored)
  gsub('[^a-z]', '', plain)
}

#' Transliterate text to ASCII
#'
#' Converts `x` with `iconv(to = "ASCII//TRANSLIT")` and then applies a
#' clean-up pattern to the result (the pattern targets backticks and
#' apostrophes, which some iconv implementations emit as accent markers).
#'
#' @param x Character vector.
#'
#' @return Character vector of the same length as `x`.
#'
#' @export
desacentuar <- function(x) {
  ascii <- iconv(x, to = "ASCII//TRANSLIT")
  gsub("`|\\'", "", ascii)
}

#' Parse the header information of a cpopg page
#'
#' Collects, as key/value pairs:
#' \itemize{
#'   \item the rows of the last `.secaoFormBody` element, split on the first
#'     `:` (rows without a colon get the key `lugar`);
#'   \item `n_processo` and `status`, obtained by splitting the `processo`
#'     value on its first space;
#'   \item `cdprocesso`, taken from the `processoPK.cdProcesso=` parameter
#'     found in the page text;
#'   \item `digital`, `"TRUE"`/`"FALSE"` depending on whether the first
#'     `.linkPasta` element contains "Este processo é digital".
#' }
#'
#' @param a Input handed to `xml2::read_html()` (presumably the path of a
#'   saved cpopg page).
#'
#' @return A tibble with the character columns `key` and `value`.
#'
#' @export
parse_cpopg_info_ <- function(a) {
  html <- xml2::read_html(a)
  infos <- html %>%
    rvest::html_nodes('.secaoFormBody') %>%
    dplyr::last() %>%
    rvest::html_nodes('tr') %>%
    rvest::html_text() %>%
    stringr::str_replace_all('[\n\r\t]+', ' ') %>%
    stringr::str_replace_all(' +', ' ') %>%
    stringr::str_trim() %>%
    unique() %>%
    { dplyr::data_frame(info = .) } %>%
    # fill = 'left': a row without ':' keeps its text in `value`, key is NA.
    tidyr::separate(info, c('key', 'value'), sep = '\\:',
                    extra = 'merge', fill = 'left') %>%
    dplyr::mutate(key = stringr::str_trim(key),
                  value = stringr::str_trim(value)) %>%
    # Keep the first row of each distinct value WITHOUT dropping `key`.
    # `distinct(value)` discards every other column on dplyr >= 0.5, which
    # made the following mutate() fail; duplicated() works on any version.
    dplyr::filter(!duplicated(value)) %>%
    dplyr::mutate(key = stringr::str_replace_na(key, 'Lugar')) %>%
    dplyr::mutate(key = arrumar_key(key))
  infos_cdp <- html %>%
    rvest::html_text() %>%
    stringr::str_match('processoPK\\.cdProcesso=([^&]+)&') %>%
    as.character() %>%
    # str_match() yields (full match, group 1); the last element is the group.
    dplyr::last() %>%
    { dplyr::data_frame(key = 'cdprocesso', value = .) }
  infos_p <- infos %>%
    dplyr::filter(key == 'processo') %>%
    tidyr::separate(value, c('n_processo', 'status'), sep = ' ',
                    extra = 'merge', fill = 'right') %>%
    dplyr::select(-key) %>%
    tidyr::gather(convert = TRUE)
  infos_digital <- html %>%
    rvest::html_nodes('.linkPasta') %>%
    { if(length(.) == 0) '' else rvest::html_text(dplyr::first(.)) } %>%  {
      digital <- stringr::str_detect(., 'Este processo é digital')
      dplyr::data_frame(key = 'digital', value = as.character(digital))
    }
  dplyr::bind_rows(infos, infos_p, infos_cdp, infos_digital) %>%
    dplyr::tbl_df()
}

#' Parse the parties table of a cpopg page
#'
#' Uses the element with id `tableTodasPartes` when the page has one and falls
#' back to `tablePartesPrincipais` otherwise. The second table column is split
#' on the first `"\r\n\t"` into the party (`parte`) and the remaining text
#' (`adv`); the first column becomes `forma` after `arrumar_forma()`.
#'
#' @param a Input handed to `xml2::read_html()` (presumably the path of a
#'   saved cpopg page).
#'
#' @return A tibble with columns `forma`, `parte` and `adv`, or an empty
#'   tibble when the selected table holds nothing but line breaks and tabs.
#'
#' @export
parse_cpopg_partes_ <- function(a) {
  html <- xml2::read_html(a)

  candidates <- rvest::html_nodes(html, "#tableTodasPartes")
  if (length(candidates) == 0) {
    candidates <- rvest::html_nodes(html, "#tablePartesPrincipais")
  }
  table_node <- dplyr::first(candidates)

  # Nothing to parse when the table text is only line breaks and tabs.
  if (gsub('[\n\r\t]', '', rvest::html_text(table_node)) == '') {
    return(dplyr::data_frame())
  }

  parties <- rvest::html_table(table_node)
  parties <- tidyr::separate(parties, X2, c("parte", "adv"),
                             sep = "\r\n\t", extra = "merge", fill = "right")
  parties <- dplyr::mutate(parties, adv = stringr::str_trim(adv))
  parties <- dplyr::rename(parties, forma = X1)
  parties <- dplyr::mutate(parties, forma = arrumar_forma(forma))
  # Collapse the whitespace runs between entries of `adv` into one newline
  # each, then replace leftover "&nbsp" tokens with a space.
  parties <- dplyr::mutate(parties,
                           adv = gsub(" *\r[ \r\t\n]+ *", "\n", adv),
                           adv = gsub("\\&nbsp", " ", adv))
  dplyr::tbl_df(parties)
}

#' Parse a batch of saved cpopg pages
#'
#' Runs `parse_cpopg_info_()`, `parse_cpopg_partes_()` and
#' `parse_cpopg_movs_()` on every element of `arqs` and stores the three
#' results as list-columns.
#'
#' @param arqs Vector of inputs for the parsers (presumably paths of saved
#'   cpopg pages).
#' @param .parallel If `TRUE` (default), a cluster with one worker per
#'   detected core is started and the inputs are parsed through
#'   `plyr::ldply(.parallel = TRUE)`; an input whose parsing fails yields a
#'   placeholder row (`arq = NA`, each list-column holding a one-cell
#'   `'error'` table) instead of aborting the batch. If `FALSE`, the distinct
#'   inputs are parsed sequentially and errors propagate.
#'
#' @return A tibble with columns `arq`, `infos`, `partes` and `movs`.
#'
#' @export
#'
#' @import magrittr
parse_cpopg <- function(arqs, .parallel = TRUE) {
  if (.parallel) {
    fun <- function(i, a, len) {
      # Print progress for roughly 1% of the items to keep the log short.
      if (runif(1) < 0.01) cat(i, 'de', len, '\n')
      x <- a[i]
      # Fully qualified calls so they resolve inside the worker processes.
      dplyr::data_frame(
        arq = x,
        infos = list(brunoSalama::parse_cpopg_info_(x)),
        partes = list(brunoSalama::parse_cpopg_partes_(x)),
        movs = list(brunoSalama::parse_cpopg_movs_(x))
      )
    }
    d_fail_tbl <- list(dplyr::data_frame('error'))
    d_fail <- dplyr::data_frame(arq = NA, infos = d_fail_tbl,
                                partes = d_fail_tbl, movs = d_fail_tbl)
    f <- dplyr::failwith(d_fail, fun)
    cl <- parallel::makeCluster(parallel::detectCores(), outfile = '')
    # Guarantee the workers are shut down even if the parallel run errors;
    # previously an error before stopCluster() leaked the whole cluster.
    on.exit(parallel::stopCluster(cl), add = TRUE)
    doParallel::registerDoParallel(cl)
    n <- length(arqs)
    d <- dplyr::tbl_df(plyr::ldply(seq_len(n), f, a = arqs, len = n,
                                   .parallel = TRUE))
  } else {
    d <- dplyr::data_frame(arq = arqs) %>%
      dplyr::distinct(arq) %>%
      dplyr::group_by(arq) %>%
      dplyr::do(infos = parse_cpopg_info_(.$arq),
                partes = parse_cpopg_partes_(.$arq),
                movs = parse_cpopg_movs_(.$arq)) %>%
      dplyr::ungroup()
  }
  d
}
jtrecenti/brunoSalama documentation built on May 20, 2019, 3:16 a.m.