R/utilGetLayout.R

Defines functions utilGetLayout

#' Extract record layouts from a layout-description PDF
#'
#' Reads the text of the PDF, normalises it (ASCII transliteration, lower
#' case, with the type keywords ALFA / NUMERICO / DATA kept in upper case)
#' and scans it with two regular expressions: one for layout headers
#' ("estrutura do arquivo de ... campo") and one for field rows
#' (order, name, TYPE, description). A field whose order is 1 starts a new
#' layout; the n-th layout is stored under the n-th distinct header name.
#'
#' @param layoutFile Path to the PDF file. When `NULL`, the first file in the
#'   working directory whose name ends in ".pdf" is used.
#' @return An environment holding one data frame per layout, with columns
#'   `Order` (integer), `Name`, `Type` and `Description` (character). The
#'   type keywords ALFA, NUMERICO and DATA are mapped to "character",
#'   "numeric" and "br.date"; any other type token is kept as found.
utilGetLayout <- function(layoutFile = NULL){

  if (is.null(layoutFile)){
    # list.files() expects a regular expression, not a shell glob.
    pdfFiles <- list.files(pattern = "\\.pdf$")
    if (length(pdfFiles) == 0L) {
      stop("no PDF file found in the working directory", call. = FALSE)
    }
    layoutFile <- pdfFiles[1]
  }

  readLayoutPdf <- tm::readPDF(control = list(text = "-layout"))
  doc <- readLayoutPdf(elem = list(uri = layoutFile),
                       language = "en",
                       id = "id1")
  # Work on the whole document as one string so patterns can span lines.
  pdf <- paste(doc[[1]], collapse = " ")

  Encoding(pdf) <- "UTF-8"
  pdf <- iconv(enc2native(pdf), to = "ASCII//TRANSLIT")
  if (is.na(pdf)) {
    stop("could not convert the text of '", layoutFile, "' to ASCII",
         call. = FALSE)
  }

  # Lower-case everything except the type keywords: wrap them in "_", lower
  # the text, upper-case the "_..._" runs again, then turn "_" into spaces.
  pdf <- gsub(x = pdf,
              pattern = "(ALFA|NUMERICO|DATA)",
              replacement = "_\\1_", perl = TRUE)
  pdf <- stringr::str_to_lower(pdf)
  pdf <- gsub(x = pdf,
              pattern = "(_\\w*_)",
              replacement = "\\U\\1", perl = TRUE)
  pdf <- stringr::str_replace_all(pdf, "_", " ")

  layoutFieldPattern <- "(\\d{1,2})((\\s+[a-z/()]+)+)(\\s+[A-Z][A-Z]*)((\\s+[a-z/()]+)+)"
  # NOTE(review): "(.*)" is greedy, so one match may run up to the last
  # "campo" it can reach -- confirm this is intended for multi-layout files.
  layoutNamePattern <- "(estrutura do arquivo de)(.*)(campo)"

  # Locate every occurrence of both token patterns in the whole text.
  posTokens <- stringr::str_locate_all(pdf, c(layoutNamePattern,
                                              layoutFieldPattern))

  # Text of each located match; empty when the pattern never matched.
  matchedText <- function(pos) {
    if (nrow(pos) == 0L) {
      return(character(0))
    }
    stringr::str_sub(pdf, pos[, 1], pos[, 2])
  }

  # Layout names: text between the header words, blanks turned into "_".
  nameLines <- matchedText(posTokens[[1]])
  layoutsNames <- stringr::str_trim(gsub(x = nameLines,
                                         pattern = layoutNamePattern,
                                         replacement = "\\2"))
  layoutsNames <- unique(stringr::str_replace_all(layoutsNames, "\\s+", "_"))

  # Field rows: pull each capture group out of every matched row at once.
  fieldLines <- matchedText(posTokens[[2]])
  fieldPart <- function(group) {
    stringr::str_trim(gsub(x = fieldLines,
                           pattern = layoutFieldPattern,
                           replacement = group))
  }
  fields <- data.frame(
    Order = as.integer(fieldPart("\\1")),
    Name = fieldPart("\\2"),
    Type = fieldPart("\\4"),
    Description = fieldPart("\\5"),
    stringsAsFactors = FALSE
  )

  # Translate the type keywords. Indexing the column vector (rather than a
  # row subset of the data frame) is safe when a keyword never occurs.
  typeMap <- c(ALFA = "character", NUMERICO = "numeric", DATA = "br.date")
  isKnownType <- fields$Type %in% names(typeMap)
  fields$Type[isKnownType] <- unname(typeMap[fields$Type[isKnownType]])

  # A field with order 1 opens a new layout. Rows found before the first
  # such field cannot be attributed to any layout and are dropped.
  layoutId <- cumsum(fields$Order == 1L)
  if (any(layoutId == 0L)) {
    warning("ignoring ", sum(layoutId == 0L),
            " field(s) found before the first field with order 1",
            call. = FALSE)
    fields <- fields[layoutId > 0L, , drop = FALSE]
    layoutId <- layoutId[layoutId > 0L]
  }

  layouts <- new.env(parent = emptyenv())
  fieldsByLayout <- split(fields, layoutId)
  for (k in seq_along(fieldsByLayout)) {
    layoutName <- if (k <= length(layoutsNames)) {
      layoutsNames[k]
    } else {
      NA_character_
    }
    # More layouts than header names (or an empty name): use a positional one.
    if (is.na(layoutName) || !nzchar(layoutName)) {
      layoutName <- paste0("layout_", k)
    }

    dfLayout <- fieldsByLayout[[k]]
    rownames(dfLayout) <- NULL
    # assign() stores under any name without building code from PDF text.
    assign(layoutName, dfLayout, envir = layouts)
  }

  layouts
}
brunomssmelo/TseWrangler documentation built on May 11, 2017, 5:43 p.m.