# R/aux_captcha.R

# Remove captcha's background and lines
#
# Keeps only the pixels whose colour is frequent enough to be part of the
# captcha's text, judging every colour by the number of pixels that share it.
#
# @param img Data frame with one row per pixel and at least the columns
#   `y` and `color`.
# @return The rows of `img` that survive the filters below, plus a column `n`
#   with the number of (kept) pixels that have the row's colour.
#
# NOTE(review): when fewer than three distinct counts remain, the threshold
# in the last step is NA and every row is dropped.
rm_bg_and_lines <- function(img) {
  img %>%
    # Discard every pixel with y <= 15
    dplyr::filter(y > 15) %>%
    # Count the pixels of each colour; `n()` is namespace-qualified so that
    # this also works when dplyr is not attached
    dplyr::group_by(color) %>%
    dplyr::mutate(n = dplyr::n()) %>%
    dplyr::ungroup() %>%
    # Drop the most frequent colour (taken to be the background)
    dplyr::filter(n < max(n)) %>%
    # Keep only colours more frequent than the third largest remaining count
    dplyr::filter(n > sort(unique(n), decreasing = TRUE)[3])
}

# Detect whether a response contains a captcha
#
# Reads the response body as ISO-8859-1 text, parses it as HTML and checks
# for at least one node matching `#captchaCodigo`.
#
# @param file Response object accepted by `httr::content()`.
# @return `TRUE` when a `#captchaCodigo` node is found, `FALSE` otherwise.
#   Any error along the way is swallowed by `purrr::possibly()` and reported
#   as `TRUE`.
has_captcha <- purrr::possibly(
  function(file) {
    page_text <- httr::content(file, "text", encoding = "ISO-8859-1")
    page <- xml2::read_html(page_text)
    captcha_nodes <- rvest::html_nodes(page, "#captchaCodigo")
    length(captcha_nodes) > 0
  },
  TRUE
)

# Get captcha's UUID
#
# @param file JSON input accepted by `jsonlite::fromJSON()`.
# @return The value of the `uuidCaptcha` field of the parsed JSON. If parsing
#   or extraction raises an error, `purrr::possibly()` returns "xxxx" instead.
captcha_uuid <- purrr::possibly(
  function(file) {
    parsed <- jsonlite::fromJSON(file)
    parsed$uuidCaptcha
  },
  "xxxx"
)

# Create query to download lawsuit
#
# @param id Lawsuit number as a string. Its first 13 characters and its last
#   4 characters are sent as separate fields, and the whole string is sent as
#   `dadosConsulta.valorConsultaNuUnificado`.
# @return Named list of query fields; the captcha-related fields are left
#   empty.
cpopg_query <- function(id) {
  # Pieces of the number that go into their own fields
  leading_part <- stringr::str_sub(id, 1, 13)
  trailing_part <- stringr::str_sub(id, -4, -1)

  list(
    conversationId = "",
    dadosConsulta.localPesquisa.cdLocal = "-1",
    cbPesquisa = "NUMPROC",
    dadosConsulta.tipoNuProcesso = "UNIFICADO",
    numeroDigitoAnoUnificado = leading_part,
    foroNumeroUnificado = trailing_part,
    dadosConsulta.valorConsultaNuUnificado = id,
    dadosConsulta.valorConsulta = "",
    uuidCaptcha = "",
    vlCaptcha = "",
    novoVlCaptcha = ""
  )
}

# Create query to download 2nd degree lawsuit
#
# @param id Lawsuit number as a string. Its first 11 characters and its last
#   4 characters are sent as separate fields, and the whole string is sent as
#   `dePesquisaNuUnificado`.
# @return Named list of query fields.
#
# NOTE(review): `cpopg_query()` takes the first 13 characters for
# `numeroDigitoAnoUnificado`, while this function takes 11 - confirm that the
# difference is intended.
cposg_query <- function(id) {
  # Pieces of the number that go into their own fields
  leading_part <- stringr::str_sub(id, 1, 11)
  trailing_part <- stringr::str_sub(id, -4, -1)

  list(
    "conversationId" = "",
    "paginaConsulta" = 1,
    "localPesquisa.cdLocal" = -1,
    "cbPesquisa" = "NUMPROC",
    "tipoNuProcesso" = "UNIFICADO",
    "numeroDigitoAnoUnificado" = leading_part,
    "foroNumeroUnificado" = trailing_part,
    "dePesquisaNuUnificado" = id,
    "dePesquisaNuAntigo" = ""
  )
}

# Break RGB captcha
#
# Parses the captcha JSON, keeps only the pixels of the colour the captcha
# asks for, renders them as a black-on-white PNG and runs OCR on the result.
#
# @param file JSON input accepted by `jsonlite::fromJSON()`. The parsed JSON
#   must have the fields `imagem` (base64 PNG data after the first comma) and
#   `labelValorCaptcha` (text with the colour name right after a `<b>` tag).
# @return The OCR guess in lower case with everything except a-z removed.
#   Any error is swallowed by `purrr::possibly()` and reported as "xxxx".
break_rgb_captcha <- function(file) {

  # Require magick and tesseract
  require_pkg("magick")
  require_pkg("tesseract")

  # Get file's JSON
  json <- jsonlite::fromJSON(file)

  # Collect file's image: everything after the first comma is base64 data
  image <- json %>%
    with(imagem) %>%
    stringr::str_split_fixed(",", 2) %>%
    magrittr::extract(TRUE, 2) %>%
    base64enc::base64decode()

  # Collect file's colors: the word that follows "<b>" in the label
  color_json <- json %>%
    with(labelValorCaptcha) %>%
    stringr::str_match("<b>([A-Za-z]+)") %>%
    magrittr::extract(TRUE, 2)

  # Create image data frame with one row per pixel; y runs from the number
  # of rows down to 1, so y = 1 is the last row of the PNG array
  img_png <- png::readPNG(image)
  img_dim <- dim(img_png)
  img_df <- tibble::tibble(
      x = rep(1:img_dim[2], each = img_dim[1]),
      y = rep(img_dim[1]:1, img_dim[2]),
      r = as.vector(img_png[,,1]),
      g = as.vector(img_png[,,2]),
      b = as.vector(img_png[,,3])) %>%
    # Namespace-qualified so this works without grDevices/dplyr attached
    dplyr::mutate(
      color = grDevices::rgb(r, g, b),
      id = seq_len(dplyr::n())) %>%
    rm_bg_and_lines()

  # Temporary PNG for the cleaned image. Registering the clean-up here makes
  # sure the file is removed even when a later step fails (the wrapper below
  # would otherwise hide the error and leave the file behind).
  tmp <- tempfile(fileext = ".png")
  on.exit(unlink(tmp), add = TRUE)

  # Fill in data frame: every (x, y) position inside the bounding box of the
  # remaining pixels
  complete_df <- purrr::cross_df(list(
    x = min(img_df$x):max(img_df$x),
    y = min(img_df$y):max(img_df$y)))

  # Save image to temporary file: pixels of the requested colour become
  # 0 (black), every other position becomes 1 (white)
  img_df %>%
    filter_color(color_json) %>%
    dplyr::mutate(black = 0) %>%
    dplyr::arrange(x, y) %>%
    dplyr::right_join(complete_df, c("x", "y")) %>%
    tidyr::replace_na(list(black = 1)) %>%
    dplyr::select(x, y, black) %>%
    tidyr::spread(x, black, fill = 1) %>%
    dplyr::select(-y) %>%
    as.matrix() %>%
    # Reverse the row order so that the largest y ends up in the first row
    magrittr::extract(nrow(.):1, TRUE) %>%
    png::writePNG(tmp)

  # Guess captcha's solution
  tmp %>%
    magick::image_read() %>%
    magick::image_trim() %>%
    magick::image_scale("x50") %>%
    tesseract::ocr() %>%
    stringr::str_trim() %>%
    stringr::str_to_lower() %>%
    stringr::str_replace_all("[^a-z]", "")
}
break_rgb_captcha <- purrr::possibly(break_rgb_captcha, "xxxx")

# Download an RGB captcha to a temporary file
#
# @param u_captcha URL the captcha is requested from.
# @param ts Value sent in the `timestamp` form field (empty by default).
# @return Path of the temporary file the response body is written to.
download_rgb_captcha <- function(u_captcha, ts = "") {

  # Temporary file that receives the response body
  captcha_path <- tempfile()

  # Form fields of the request; only the time stamp carries a value
  form_fields <- list(timestamp = ts, uuidCaptcha = "", conversationId = "")

  # Download captcha using time stamp.
  # NOTE(review): peer certificate verification is switched off for this
  # request.
  httr::POST(
    u_captcha,
    config = httr::config(ssl_verifypeer = FALSE),
    httr::write_disk(captcha_path, overwrite = TRUE),
    body = form_fields
  )

  captcha_path
}
# jtrecenti/esaj documentation built on June 20, 2019, 7:13 p.m.