#' Extract record layouts from a layout-specification PDF
#'
#' Reads the text of a PDF, normalises it and pulls out two kinds of tokens:
#' layout headers of the form "estrutura do arquivo de <name> campo" and
#' field lines of the form "<order> <name> <TYPE> <description>", where
#' <TYPE> is one of ALFA, NUMERICO or DATA. A field whose order is 1 starts a
#' new layout; the n-th layout is stored under the n-th distinct header name.
#'
#' @param layoutFile Path to the PDF file. When `NULL`, the first file in the
#'   working directory whose name ends in ".pdf" is used.
#' @return An environment (parent `emptyenv()`) holding one data frame per
#'   layout, with columns `Order` (integer), `Name`, `Type` and `Description`
#'   (character). `Type` is mapped ALFA -> "character",
#'   NUMERICO -> "numeric", DATA -> "br.date"; other values are kept as read.
#'   The environment is empty when no field line is found.
utilGetLayout <- function(layoutFile = NULL) {
  # NOTE(review): nothing in this function uses magrittr; the attach is kept
  # only because callers may rely on it as a side effect - confirm and drop.
  library(magrittr)

  if (is.null(layoutFile)) {
    # list.files() takes a regular expression, not a glob.
    candidates <- list.files(pattern = "\\.pdf$")
    if (length(candidates) == 0L) {
      stop("no PDF file found in the working directory", call. = FALSE)
    }
    layoutFile <- candidates[1]
  }

  # presumably "-layout" is forwarded to the text extractor so that columns
  # keep their physical position - TODO confirm against the tm version in use
  pdf <- tm::readPDF(control = list(text = "-layout"))(
    elem = list(uri = layoutFile),
    language = "en",
    id = "id1"
  )
  pdf <- paste(pdf[[1]], collapse = " ")
  Encoding(pdf) <- "UTF-8"
  pdf <- iconv(enc2native(pdf), to = "ASCII//TRANSLIT")
  if (is.na(pdf)) {
    stop("could not convert the text of '", layoutFile, "' to ASCII",
         call. = FALSE)
  }

  # Lower-case everything except the type keywords: wrap them in underscores,
  # lower-case the whole text, upper-case the wrapped tokens again, then turn
  # the underscores into spaces.
  pdf <- gsub("(ALFA|NUMERICO|DATA)", "_\\1_", pdf, perl = TRUE)
  pdf <- stringr::str_to_lower(pdf)
  pdf <- gsub("(_\\w*_)", "\\U\\1", pdf, perl = TRUE)
  pdf <- stringr::str_replace_all(pdf, "_", " ")

  # Capture groups: 1 = order, 2 = field name, 4 = type, 5 = description.
  layoutFieldPattern <-
    "(\\d{1,2})((\\s+[a-z/()]+)+)(\\s+[A-Z][A-Z]*)((\\s+[a-z/()]+)+)"
  # NOTE(review): "(.*)" is greedy, so a header runs up to the LAST "campo"
  # on the same line - confirm this is intended for the PDFs in use.
  layoutNamePattern <- "(estrutura do arquivo de)(.*)(campo)"

  # Locate every layout header in the text and keep the distinct names.
  layoutsNames <- character(0)
  nameMatches <- stringr::str_match_all(pdf, layoutNamePattern)[[1]]
  if (nrow(nameMatches) > 0L) {
    layoutsNames <- stringr::str_trim(nameMatches[, 3])
    layoutsNames <- stringr::str_replace_all(layoutsNames, "\\s+", "_")
    layoutsNames <- unique(layoutsNames)
  }

  layouts <- new.env(parent = emptyenv())

  # Locate every field line in the text.
  fieldMatches <- stringr::str_match_all(pdf, layoutFieldPattern)[[1]]
  if (nrow(fieldMatches) == 0L) {
    return(layouts)
  }

  fields <- data.frame(
    Order = as.integer(stringr::str_trim(fieldMatches[, 2])),
    Name = stringr::str_trim(fieldMatches[, 3]),
    Type = stringr::str_trim(fieldMatches[, 5]),
    Description = stringr::str_trim(fieldMatches[, 6]),
    stringsAsFactors = FALSE
  )

  # Vector-level replacement: unlike `df[cond, ]$Type <- value`, this does not
  # fail when a type is absent.
  typeMap <- c(ALFA = "character", NUMERICO = "numeric", DATA = "br.date")
  known <- fields$Type %in% names(typeMap)
  fields$Type[known] <- unname(typeMap[fields$Type[known]])

  # A field with order 1 opens a new layout. Fields found before the first
  # order-1 field are grouped with the first layout instead of failing.
  layoutIndex <- cumsum(fields$Order == 1L)
  if (layoutIndex[1] == 0L) {
    layoutIndex <- layoutIndex + 1L
  }

  groups <- split(fields, layoutIndex)
  for (k in seq_along(groups)) {
    layoutName <- if (k <= length(layoutsNames)) layoutsNames[k] else NA
    if (is.na(layoutName) || !nzchar(layoutName)) {
      # More layouts than header names (or an empty name): use a placeholder
      # so no layout is lost or overwritten.
      layoutName <- paste0("layout_", k)
    }
    dfLayout <- groups[[k]]
    rownames(dfLayout) <- NULL
    # assign() stores under any name, including non-syntactic ones.
    assign(layoutName, dfLayout, envir = layouts)
  }

  layouts
}
# Add the following code to your website.
# For more information on customizing the embed code, read Embedding Snippets.