WEBDATA: WEBDATA

`%>%` <- dplyr::`%>%`

# CVM --------------------------------------------------------------------------
url.cvm <- paste0("http://bvmf.bmfbovespa.com.br/cias-Listadas/Empresas-Listadas/",
                  "BuscaEmpresaListada.aspx?Letra=%s&idioma=pt-br")

cvm <- pbmcapply::pbmclapply(c(LETTERS, 0:9), function(x) {
    u <- sprintf(url.cvm, x)
    suppressMessages({
        h <- XML::htmlParse(httr::GET(u, httr::config(ssl_verifypeer = FALSE)))
    })
    emp <- XML::xpathSApply(h,"//tbody/tr/td[1]/a", XML::xmlGetAttr, "href")
    emp <- stringr::str_extract(emp, "(?<=Cvm=).+")
    return(emp)
})
cvm <- as.integer(do.call(c, cvm))

# Dados da companhia & Demonstrativos ------------------------------------------
# Código, CNPJ, Atividade Principal, Classificação Setorial.

## Dados da companhia (DDC)
url.ddc <- paste0("http://bvmf.bmfbovespa.com.br/pt-br/mercados/acoes/empresas",
                  "/ExecutaAcaoConsultaInfoEmp.asp?CodCVM=%s&ViewDoc=1&AnoDoc=",
                  "2019&VersaoDoc=1&NumSeqDoc=80849#a")

## Demonstrativos Financeiros
url.df <- paste0("http://bvmf.bmfbovespa.com.br/cias-listadas/empresas-",
              "listadas/HistoricoFormularioReferencia.aspx?codigoCVM=%s&tipo=",
              "dfp&ano=0&idioma=pt-br")

# Dados da companhia -----------------------------
## TODO
## Paralelizar
## Melhorar scrap.. tirar dependencia de selenium.
# WEBDATA::selenium()

remDr <- RSelenium::remoteDriver(
                        remoteServerAddr = "localhost",
                        port = 4445L,
                        browserName = "firefox"
                    )
remDr$open()

out <- lapply(cvm, function(x) {
    print(x)

    u1 <- sprintf(url.ddc, x)

    suppressMessages({
        h <- XML::htmlParse(httr::GET(u1, httr::config(ssl_verifypeer = FALSE)))
    })

    xpath <- c("//tr/td[text()='Códigos de Negociação:']/following-sibling::td/a",
               "//tr/td[text()='CNPJ:']/following-sibling::td",
               "//tr/td[text()='Atividade Principal:']/following-sibling::td",
               "//tr/td[text()='Classificação Setorial:']/following-sibling::td")

    desc <- lapply(xpath, function(x) {
        dplyr::last(XML::xpathSApply(h, x, XML::xmlValue))
    })
    names(desc) <- c("cod", "cnpj", "atv.pri", "cla.set")
    desc <- dplyr::as_tibble(desc)

    # Número Sequencial do Documento - Demonstrativos --------------------------
    u2 <- sprintf(url.df, x)

    suppressMessages({
        h2 <- XML::htmlParse(httr::GET(u2, httr::config(ssl_verifypeer = FALSE)))
    })

    links <- XML::xpathSApply(h2, "//div[@class='list-avatar-row']/div[@class='content']//a",
                              XML::xmlGetAttr, "href")
    links <- stringr::str_extract(links, "(?<=\\(').+(?='\\))")
    txt <- XML::xpathSApply(h2, "//div[@class='list-avatar-row']/div[@class='content']//a",
                            XML::xmlValue)

    ## Pegar última versão
    dt <- lubridate::dmy(stringr::str_extract(txt, "[0-9]{2}/[0-9]{2}/[0-9]{4}"))
    v <- as.integer(trimws(stringr::str_extract(txt, "(?<=Versão).+$")))
    index <- dplyr::distinct(dplyr::tibble(dt = dt, v = v, i = 1:length(dt)), dt,
                             .keep_all = TRUE)

    links <- links[index$i]
    txt <- txt[index$i]

    ## Empresa
    emp <- dplyr::tibble()

    for (i in 1:length(links)) {
        
        remDr$navigate(links[i])
        Sys.sleep(1)
        tt <- suppressMessages({
            try(tableElem <- remDr$findElement(using = "id", "iFrameFormulariosFilho"),
                silent = TRUE)
        })

        if (class(tt) == "try-error") {
            URL <- "http://www2.bmfbovespa.com.br/dxw/FormDetalheDXWDRE.asp?TipoInfo=C"
            remDr$navigate(URL)

            tb <- xml2::read_html(remDr$getPageSource()[[1]]) %>% 
                rvest::html_nodes("table") %>%
                .[5] %>%
                rvest::html_table(fill = TRUE, header = TRUE) %>%
                .[[1]]

            colnames(tb)[1:2] <- c("conta", "descricao")
            
            tb <- dplyr::as_tibble(tb) %>%
                tidyr::gather(dt, value, -conta, -descricao) %>%
                tidyr::separate(dt, c("dtI", "dtF"), sep = " a ") %>%
                dplyr::mutate(dtI = lubridate::dmy(trimws(dtI)),
                              dtF = lubridate::dmy(trimws(dtF)))

            tb$value <- stringr::str_replace_all(tb$value, "[[:punct:]]", "")
            tb$value <- as.integer(tb$value)

            tb$dem.dt <- lubridate::dmy(stringr::str_extract(txt[i], ".+(?= - D)"))
            tb$versao <- as.numeric(stringr::str_extract(txt[i], "(?<=Versão ).+$"))
            
        } else {
            remDr$switchToFrame(tableElem)

            Sys.sleep(1)

            h <- XML::htmlParse(remDr$getPageSource()[[1]], encoding = "utf-8")
            
            tb <- XML::readHTMLTable(h, header = TRUE, as.data.frame = FALSE)[[1]]

            names(tb) <- tolower(iconv(trimws(gsub(" ", " ", names(tb))),
                                       to = "ASCII//TRANSLIT"))
            tb[[1]] <- trimws(gsub(" ", " ", tb[[1]]))
            tb[[2]] <- trimws(gsub(" ", " ", tb[[2]]))

            for (j in 3:length(tb)) {
                tb[[j]] <- gsub(" |\\.", "", tb[[j]])
                tb[[j]] <- as.numeric(gsub(",", ".", tb[[j]]))
            }

            tb <- dplyr::as_tibble(tb) %>%
                tidyr::gather(dt, value, -conta, -descricao) %>%
                tidyr::separate(dt, c("dtI", "dtF"), sep = " a ") %>%
                dplyr::mutate(dtI = lubridate::dmy(trimws(dtI)),
                              dtF = lubridate::dmy(trimws(dtF)))

            tb$dem.dt <- lubridate::dmy(stringr::str_extract(txt[i], ".+(?= - D)"))
            tb$versao <- as.numeric(stringr::str_extract(txt[i], "(?<=Versão ).+$"))
            # tb <- tidyr::nest(tb, -dem.dt, -versao)
            # tb <- dplyr::bind_cols(desc, tb)
        }
        
        emp <- dplyr::bind_rows(emp, tb)
    }
    dplyr::bind_cols(desc, tidyr::nest(emp))
})