`%>%` <- dplyr::`%>%`
# CVM --------------------------------------------------------------------------
# Listed-companies search page; `%s` is replaced by the initial character
# (A-Z or 0-9) being browsed.
url.cvm <- paste0("http://bvmf.bmfbovespa.com.br/cias-Listadas/Empresas-Listadas/",
                  "BuscaEmpresaListada.aspx?Letra=%s&idioma=pt-br")

# For every initial, download the listing page and keep whatever follows
# "Cvm=" in the href of the first-column link of each table row.
cvm <- pbmcapply::pbmclapply(c(LETTERS, 0:9), function(initial) {
  page_url <- sprintf(url.cvm, initial)
  page <- suppressMessages(
    XML::htmlParse(httr::GET(page_url, httr::config(ssl_verifypeer = FALSE)))
  )
  hrefs <- XML::xpathSApply(page, "//tbody/tr/td[1]/a", XML::xmlGetAttr, "href")
  stringr::str_extract(hrefs, "(?<=Cvm=).+")
})

# Flatten the per-initial results into a single integer vector of CVM codes.
cvm <- as.integer(unlist(cvm))
# Company data & financial statements -----------------------------------------
# Fields collected per company: ticker code, CNPJ, main activity and sector
# classification.
## Company data (DDC)
## URL template for the company-information page. `%s` is the CodCVM query
## parameter and is filled in with sprintf(). The remaining query parameters
## (AnoDoc=2019, VersaoDoc=1, NumSeqDoc=80849) are hard-coded in the template.
url.ddc <- paste0("http://bvmf.bmfbovespa.com.br/pt-br/mercados/acoes/empresas",
"/ExecutaAcaoConsultaInfoEmp.asp?CodCVM=%s&ViewDoc=1&AnoDoc=",
"2019&VersaoDoc=1&NumSeqDoc=80849#a")
## Financial statements
## URL template for the filing-history page. `%s` is the codigoCVM query
## parameter and is filled in with sprintf(); the template requests tipo=dfp
## with ano=0.
url.df <- paste0("http://bvmf.bmfbovespa.com.br/cias-listadas/empresas-",
"listadas/HistoricoFormularioReferencia.aspx?codigoCVM=%s&tipo=",
"dfp&ano=0&idioma=pt-br")
# Company data -----------------------------------
## TODO
## Parallelise.
## Improve the scraping.. remove the Selenium dependency.
# WEBDATA::selenium()

# Client for a Selenium server expected at localhost:4445, driving Firefox.
# The session opened here is the one used by the scraping loop below.
remDr <- RSelenium::remoteDriver(browserName = "firefox",
                                 remoteServerAddr = "localhost",
                                 port = 4445L)
remDr$open()
# Helper: reshape one wide statement table (columns `conta`, `descricao`, then
# one column per period whose header looks like "<start> a <end>") into long
# format, and tag every row with the filing date and version parsed from the
# link label `label`.
.tidy_statement <- function(tb, label) {
  tb <- dplyr::as_tibble(tb) %>%
    tidyr::gather(dt, value, -conta, -descricao) %>%
    tidyr::separate(dt, c("dtI", "dtF"), sep = " a ") %>%
    dplyr::mutate(dtI = lubridate::dmy(trimws(dtI)),
                  dtF = lubridate::dmy(trimws(dtF)))
  tb$dem.dt <- lubridate::dmy(stringr::str_extract(label, ".+(?= - D)"))
  tb$versao <- as.numeric(stringr::str_extract(label, "(?<=Versão ).+$"))
  tb
}

# For every CVM code: scrape the company description, list its filings and
# download each filing's statement table through the Selenium session `remDr`.
#
# Result: a list with one single-row tibble per CVM code, holding the columns
# `cod`, `cnpj`, `atv.pri`, `cla.set` plus a list-column `data` with the long
# statement table (columns conta, descricao, dtI, dtF, value, dem.dt, versao).
# Companies without any filing get an empty tibble in `data`.
out <- lapply(cvm, function(x) {
  print(x)

  # Company description -------------------------------------------------------
  u1 <- sprintf(url.ddc, x)
  suppressMessages({
    h <- XML::htmlParse(httr::GET(u1, httr::config(ssl_verifypeer = FALSE)))
  })
  xpath <- c(
    cod = "//tr/td[text()='Códigos de Negociação:']/following-sibling::td/a",
    cnpj = "//tr/td[text()='CNPJ:']/following-sibling::td",
    atv.pri = "//tr/td[text()='Atividade Principal:']/following-sibling::td",
    cla.set = "//tr/td[text()='Classificação Setorial:']/following-sibling::td"
  )
  desc <- lapply(xpath, function(xp) {
    val <- XML::xpathSApply(h, xp, XML::xmlValue)
    # A field missing from the page becomes NA instead of NULL, so every
    # company yields the same four columns.
    if (length(val) == 0) NA_character_ else dplyr::last(val)
  })
  desc <- dplyr::as_tibble(desc)

  # Document sequence number - statements -------------------------------------
  u2 <- sprintf(url.df, x)
  suppressMessages({
    h2 <- XML::htmlParse(httr::GET(u2, httr::config(ssl_verifypeer = FALSE)))
  })
  anchors <- "//div[@class='list-avatar-row']/div[@class='content']//a"
  # as.character() turns the empty list returned for "no matches" into
  # character(0); the NA default keeps anchors without href aligned with `txt`.
  links <- as.character(
    XML::xpathSApply(h2, anchors, XML::xmlGetAttr, "href", NA_character_)
  )
  links <- stringr::str_extract(links, "(?<=\\(').+(?='\\))")
  txt <- as.character(XML::xpathSApply(h2, anchors, XML::xmlValue))

  ## Keep a single filing per date: the first one listed for that date.
  ## NOTE(review): this only picks the latest version if the page lists the
  ## newest version first - confirm against the site.
  dt <- lubridate::dmy(stringr::str_extract(txt, "[0-9]{2}/[0-9]{2}/[0-9]{4}"))
  keep <- !duplicated(dt)
  links <- links[keep]
  txt <- txt[keep]

  ## Company statements
  # One slot per filing, bound together once after the loop.
  statements <- vector("list", length(links))
  # seq_along() so that a company without filings skips the loop entirely
  # (1:length(links) would iterate over c(1, 0) and navigate to NA).
  for (i in seq_along(links)) {
    remDr$navigate(links[i])
    Sys.sleep(1)
    # NULL when the page has no "iFrameFormulariosFilho" frame.
    frame <- tryCatch(
      suppressMessages(
        remDr$findElement(using = "id", "iFrameFormulariosFilho")
      ),
      error = function(e) NULL
    )
    if (is.null(frame)) {
      # No frame: fall back to the DXW income-statement page, read with the
      # same browser session right after visiting the filing link.
      URL <- "http://www2.bmfbovespa.com.br/dxw/FormDetalheDXWDRE.asp?TipoInfo=C"
      remDr$navigate(URL)
      tb <- xml2::read_html(remDr$getPageSource()[[1]]) %>%
        rvest::html_nodes("table") %>%
        .[5] %>%
        rvest::html_table(fill = TRUE, header = TRUE) %>%
        .[[1]]
      colnames(tb)[1:2] <- c("conta", "descricao")
      tb <- .tidy_statement(tb, txt[i])
      # as.numeric rather than as.integer: values above 2^31 - 1 would become
      # NA as integers, and the frame branch below already yields doubles.
      # NOTE(review): stripping all punctuation also drops minus signs and
      # decimal separators - confirm this page never shows either.
      tb$value <- as.numeric(
        stringr::str_replace_all(tb$value, "[[:punct:]]", "")
      )
    } else {
      remDr$switchToFrame(frame)
      Sys.sleep(1)
      page <- XML::htmlParse(remDr$getPageSource()[[1]], encoding = "utf-8")
      tb <- XML::readHTMLTable(page, header = TRUE, as.data.frame = FALSE)[[1]]
      # Non-breaking spaces are written as \u00a0 so the pattern is visible;
      # trimws() alone does not strip them.
      names(tb) <- tolower(iconv(trimws(gsub("\u00a0", " ", names(tb))),
                                 to = "ASCII//TRANSLIT"))
      tb[[1]] <- trimws(gsub("\u00a0", " ", tb[[1]]))
      tb[[2]] <- trimws(gsub("\u00a0", " ", tb[[2]]))
      # Period columns: drop spaces and thousands dots, then turn the decimal
      # comma into a dot. The index is empty when there are fewer than three
      # columns (3:length(tb) would count backwards).
      for (j in seq_along(tb)[-(1:2)]) {
        tb[[j]] <- gsub("[ \u00a0.]", "", tb[[j]])
        tb[[j]] <- as.numeric(gsub(",", ".", tb[[j]], fixed = TRUE))
      }
      tb <- .tidy_statement(tb, txt[i])
    }
    statements[[i]] <- tb
  }
  emp <- dplyr::bind_rows(statements)

  # One row per company; the statements go into a single list-column `data`.
  # Built explicitly so the result is one row even when `emp` is empty.
  dplyr::bind_cols(desc, dplyr::tibble(data = list(emp)))
})
# Add the following code to your website.
# For more information on customizing the embed code, read Embedding Snippets.