# R/transparencia.R

# Defines functions getPublicServantInformation listPublicServants getIdServant listSubPublicAgencies listPublicAgencies getTableContent getNumberOfPages getHTML createPath

# Make sure the cache directory for downloaded pages exists and return its path.
#
# Everything is stored under './raw', relative to the current working
# directory. Missing directories are created, including the intermediate levels
# of a nested strPath such as "a/b".
#
# @param strPath - sub-directory of './raw' to create; "" means './raw' itself
# @return the string './raw/<strPath>' ('./raw/' when strPath is "")
createPath <- function (strPath = "") {

  root <- './raw'

  if (!dir.exists(root)) {
    dir.create(root)
  }

  path <- sprintf('%s/%s', root, strPath)

  # recursive = TRUE: a nested strPath must work even when its parent
  # directories have not been created by an earlier call.
  if (strPath != "" && !dir.exists(path)) {
    dir.create(path, recursive = TRUE)
  }

  path
}

# Fetch a page as a parsed HTML document, reusing the on-disk cache if present.
#
# The cache file is '<createPath(folder)>/<name>.html', where <name> is the
# text after the last '=' in url. An existing cache file is parsed directly.
# Otherwise the page is downloaded once, decoded as latin1 and parsed; the raw
# response body is written to the cache file only when folder is not "".
#
# @param url - the location of the page on the internet
# @param folder - sub-directory of './raw' used as cache; "" disables writing
# @return the parsed document, as returned by xml2::read_html
getHTML <- function (url, folder = "") {

  path <- createPath(folder)

  archive_name <- stringr::str_extract(url, '[^=]*$')
  arq <- sprintf('%s/%s.html', path, archive_name)

  if (file.exists(arq)) {
    return(xml2::read_html(arq))
  }

  # A single request serves both the cache file and the parsed result.
  response <- httr::GET(url)

  if (folder != "") {
    # Store the untouched response bytes, as httr::write_disk would.
    writeBin(httr::content(response, 'raw'), arq)
  }

  xml2::read_html(httr::content(response, 'text', encoding = 'latin1'))
}

# Read the pagination label of a listing page.
#
# @param url - the location of the page on the internet
# @return the text of the first 'p.paginaAtual' node of the page; the callers
#   in this file take whatever follows the last '/' as the number of pages
getNumberOfPages <- function(url) {

  # Returned directly (not through an assignment) so the value is visible.
  getHTML(url) %>%
    rvest::html_node('p.paginaAtual') %>%
    xml2::xml_text()

}

# Download a page (through getHTML) and return one of its tables as a data frame.
#
# @param url - the location of the page on the internet
# @param eq - Reduce the set of matched elements to the one at the specified index (eq)
# @param folder - cache sub-directory, passed on to getHTML
# @return the eq-th <table> of the page parsed by rvest::html_table, with its
#   first row used as header
getTableContent <- function(url, eq, folder){

  tables <- rvest::html_nodes(getHTML(url, folder), 'table')

  # Fail with a readable message instead of "subscript out of bounds".
  if (length(tables) < eq) {
    stop("page has ", length(tables), " table(s), cannot extract table ", eq,
         ": ", url, call. = FALSE)
  }

  rvest::html_table(tables[[eq]], header = TRUE)

}

# Access the website and obtain the listing of superior public agencies.
#
# Reads the number of pages from the first listing page, fetches every page
# (cached under './raw/public_agencies') and stacks the first table of each.
#
# @return the stacked tables, with the first three columns renamed to
#   codigoRH, orgaoSuperior and servidoresEmExercicio
listPublicAgencies <- function () {

  urlTemplate <- "http://www.portaltransparencia.gov.br/servidores/OrgaoExercicio-ListaOrgaosSuperiores.asp?Pagina=%d"

  mainPageURL <- sprintf(urlTemplate, 1)

  # The page count is whatever follows the last '/' of the pagination label.
  pageLabel <- getNumberOfPages(mainPageURL)
  numPages <- suppressWarnings(
    as.integer(trimws(stringr::str_extract(pageLabel, '[^/]*$')))
  )
  if (length(numPages) != 1L || is.na(numPages) || numPages < 1L) {
    stop("could not read the page count from ", mainPageURL, call. = FALSE)
  }

  allPagesURL <- sprintf(urlTemplate, seq_len(numPages))

  folder_name <- 'public_agencies'

  pages <- lapply(allPagesURL, getTableContent, 1, folder_name)

  orgaosSuperiores <- do.call(rbind, pages)

  names(orgaosSuperiores)[1:3] <- c("codigoRH",
                                    "orgaoSuperior",
                                    "servidoresEmExercicio")

  orgaosSuperiores
}

# List the agencies subordinated to one superior agency.
#
# Reads the number of pages from the first listing page, fetches every page
# (cached under './raw/public_sub_agencies/<codigoRH>') and stacks the second
# table of each.
#
# @param codigoRH - Agency code obtained by the function listPublicAgencies.
# @return the stacked tables, with the first three columns renamed to
#   codigoRH, orgao and servidoresEmExercicio
listSubPublicAgencies <- function (codigoRH) {

  mainPageURL <- sprintf("http://www.portaltransparencia.gov.br/servidores/OrgaoExercicio-ListaOrgaos.asp?codOS=%d" , codigoRH)

  # The page count is whatever follows the last '/' of the pagination label.
  pageLabel <- getNumberOfPages(mainPageURL)
  numPages <- suppressWarnings(
    as.integer(trimws(stringr::str_extract(pageLabel, '[^/]*$')))
  )
  if (length(numPages) != 1L || is.na(numPages) || numPages < 1L) {
    stop("could not read the page count from ", mainPageURL, call. = FALSE)
  }

  allPagesURL <- sprintf("http://www.portaltransparencia.gov.br/servidores/OrgaoExercicio-ListaOrgaos.asp?codOS=%d&Pagina=%d", codigoRH, seq_len(numPages))

  parentFolder <- 'public_sub_agencies'

  # Create the parent level first; the per-agency level is created by getHTML.
  createPath(parentFolder)

  folder_name <- sprintf("%s/%s", parentFolder, codigoRH)

  pages <- lapply(allPagesURL, getTableContent, 2, folder_name)

  orgaosSubordinados <- do.call(rbind, pages)

  names(orgaosSubordinados)[1:3] <- c("codigoRH",
                                      "orgao",
                                      "servidoresEmExercicio")

  orgaosSubordinados
}


# Read one page of a servants listing and attach each servant's id to its row.
#
# The ids are taken from the href of every link inside the selected table: the
# text between the first '=' and the last '&' of the href.
#
# @param url - the location of the page on the internet
# @param eq - index of the <table> to read among the tables of the page
# @param folder - cache sub-directory, passed on to getHTML
# @param codigoRH - agency code; accepted for the callers' convenience, unused
# @return the table parsed by rvest::html_table (first row as header) with an
#   extra column 'idServidor'
getIdServant <- function(url, eq,  folder, codigoRH) {

  tableNode <- rvest::html_nodes(getHTML(url, folder), 'table')[[eq]]

  refs <- tableNode %>%
    rvest::html_nodes('a') %>%
    rvest::html_attr('href') %>%
    stringr::str_extract('(?<==)(.*\n?)(?=&)')

  # Skip the first two links (presumably header links, not servants - confirm).
  # Negative indexing stays correct when fewer than three links exist, where
  # 3:length(refs) would count backwards.
  refs <- refs[-seq_len(2)]

  servants <- rvest::html_table(tableNode, header = TRUE)

  # Refuse to recycle ids over rows when links and rows do not line up.
  if (length(refs) != nrow(servants)) {
    stop("found ", length(refs), " servant id(s) for ", nrow(servants),
         " table row(s): ", url, call. = FALSE)
  }

  servants[, 'idServidor'] <- refs

  servants
}

# List the public servants of one agency.
#
# Reads the number of pages from the first listing page, fetches every page
# (cached under './raw/public_servants/<codigoRH>') and stacks the second table
# of each, including the id column added by getIdServant.
#
# @param codigoRH - Agency code obtained by the function listSubPublicAgencies.
# @return the stacked tables, with the first four columns renamed to cpf,
#   nomeServidor, orgaoLotacao and idServidor
listPublicServants <- function (codigoRH) {

  mainPageURL <- sprintf("http://www.portaltransparencia.gov.br/servidores/OrgaoExercicio-ListaServidores.asp?CodOrg=%d" , codigoRH)

  # The page count is whatever follows the last '/' of the pagination label.
  pageLabel <- getNumberOfPages(mainPageURL)
  numPages <- suppressWarnings(
    as.integer(trimws(stringr::str_extract(pageLabel, '[^/]*$')))
  )
  if (length(numPages) != 1L || is.na(numPages) || numPages < 1L) {
    stop("could not read the page count from ", mainPageURL, call. = FALSE)
  }

  allPagesURL <- sprintf("http://www.portaltransparencia.gov.br/servidores/OrgaoExercicio-ListaServidores.asp?CodOrg=%d&Pagina=%d", codigoRH, seq_len(numPages))

  # No leading '/': same directory as before ('./raw//public_servants'), now
  # spelled like the path getPublicServantInformation uses.
  parentFolder <- 'public_servants'

  # Create the parent level first; the per-agency level is created by getHTML.
  createPath(parentFolder)

  folder_name <- sprintf("%s/%s", parentFolder, codigoRH)

  pages <- lapply(allPagesURL, getIdServant, 2, folder_name, codigoRH)

  servidores <- do.call(rbind, pages)

  names(servidores)[1:4] <- c("cpf",
                              "nomeServidor",
                              "orgaoLotacao",
                              "idServidor")

  servidores

}

# Fetch the detail page of one public servant and return its third table.
#
# The page is cached under
# './raw/public_servants/<codigoRH>/servidores/<idServidor>'.
#
# @param idServidor - servant id, e.g. the idServidor column built by
#   getIdServant (a character string) or a number
# @param codigoRH - code of the agency the servant is listed under
# @return the third <table> of the detail page, as returned by getTableContent
getPublicServantInformation <- function (idServidor, codigoRH) {

  # '%d' rejects character input, and the ids built by getIdServant are
  # character strings, so both values are converted to numbers for the URL.
  mainPageURL <- sprintf("http://www.portaltransparencia.gov.br/servidores/OrgaoExercicio-DetalhaServidor.asp?IdServidor=%d&CodOrgao=%d" , as.numeric(idServidor), as.numeric(codigoRH))

  rootFolder <- 'public_servants'
  agencyFolder <- sprintf("%s/%s", rootFolder, codigoRH)
  servantsFolder <- sprintf("%s/servidores", agencyFolder)

  # Create the levels one at a time so this works even when no listing for
  # this agency has been downloaded before.
  createPath(rootFolder)
  createPath(agencyFolder)
  createPath(servantsFolder)

  folder_name <- sprintf("%s/%s", servantsFolder, idServidor)

  servantInformation <- getTableContent(mainPageURL, 3, folder_name)

  servantInformation
}
# ThiagoInocencio/TransparenciaFederal documentation built on June 10, 2017, 1:14 p.m.