# @title scrapmenu internal function
# @description Extracts link labels and their URLs from a page's drop-down menu of links.
# @param url Character string: the URL of the page to scrape.
#' @import rvest
#' @import dplyr
#' @import stringi
#' @import xml2
# @return Returns a tibble with the link labels (link) and their URLs (url).
# Global counter used to number saved HTML files ("file_1.html", "file_2.html", ...).
# It is incremented in .GlobalEnv by scrappage()/scrappage_fast() when savetodir is set.
# FIX: use `<-` for top-level assignment instead of `=` (R style convention).
filecounter <- 1
# Fetch and parse an HTML page, retrying on failure.
#
# @param url URL of the page to read (pages are served in windows-1251 encoding).
# @param retries Number of additional attempts after the first failure.
#   Defaults to 1, which reproduces the original try-once-then-retry-once behavior.
# @return The parsed xml_document on success, or the string "error" when all
#   attempts fail (callers test for this sentinel value).
scraperf <- function(url, retries = 1){
  webpage <- tryCatch(xml2::read_html(url, encoding = "windows-1251"), error = function(e) e)
  while (inherits(webpage, "error") && retries > 0) {
    retries <- retries - 1
    # Randomized 1-5 second back-off before retrying, to avoid hammering the server.
    Sys.sleep(sample(seq(1, 5, by = 0.001), 1))
    webpage <- tryCatch(xml2::read_html(url, encoding = "windows-1251"), error = function(e) e)
  }
  if (inherits(webpage, "error")) return("error")
  return(webpage)
}
# Scrape the <option> elements of a page's drop-down menu.
#
# @param url URL of the page whose menu should be read.
# @return A tibble with columns `link` (trimmed option text) and `url`
#   (the option's value attribute).
scrapmenu <- function(url){
  page <- scraperf(url)
  # Query the option nodes once and reuse them for both columns.
  menu_options <- rvest::html_nodes(page, "option")
  option_urls <- rvest::html_attr(menu_options, "value")
  option_labels <- gsub('^\\s+|\\s+$', '', rvest::html_text(menu_options))
  return(tibble(link = option_labels, url = option_urls))
}
# Extract region-level link labels (republics, oblasts, krais, federal cities,
# okrugs) from a page's anchors, transliterated to Latin script.
#
# @param url URL of the page to scrape.
# @return A tibble with a single column `level1` of region names, or the
#   string "None" when no region-like anchors are found (caller sentinel).
scrapregion <- function(url){
webpage <- scraperf(url)
# FIX: the original called html_text( "href"), which passes the string "href"
# positionally into html_text()'s `trim` argument (a logical). Plain text
# extraction of the anchors was intended; trimming is done by the gsub below.
urlR <- webpage %>% html_nodes("a") %>% html_text() %>%
stri_trans_general("russian-latin/bgn")%>%
{ gsub('^\\s+|\\s+$','', .) }
# Keywords identifying Russian federal subjects, in both letter cases.
gr_expr <- "Respublik|respublik|oblast|Oblast|Kray|kray|Sankt|sankt|Mosk|mosk|Sevas|sevas|okrug|Okrug"
# Keep only short names: fewer than 4 words after stripping the prime/double-prime
# transliteration marks for the Cyrillic soft/hard signs.
wd_count <- lengths(gregexpr("\\W+", gsub("\u02B9|\u02BA","", urlR)))+1<4
#cap_count <- sapply(regmatches(urlR, gregexpr("[A-Z]", urlR, perl=TRUE)), length)
reg<-tibble(level1=urlR[grepl(gr_expr, urlR) & wd_count])
# Empty result collapses to the "None" string sentinel expected by callers.
if(is.data.frame(reg) && nrow(reg)==0){reg <- "None"}
return(reg)}
# Collect every anchor on a page: its trimmed text and its href target.
#
# @param url URL of the listing page to scrape.
# @return A tibble with columns `link` (anchor text) and `url` (href).
scrapmenupage <- function(url){
  page <- scraperf(url)
  # One node query feeds both output columns.
  anchors <- rvest::html_nodes(page, "a")
  anchor_hrefs <- rvest::html_attr(anchors, "href")
  anchor_labels <- gsub('^\\s+|\\s+$', '', rvest::html_text(anchors))
  return(tibble(link = anchor_labels, url = anchor_hrefs))
}
# Scrape one already-parsed results listing page into a tibble of links, each
# annotated with its election date (level1) and region heading (level2).
# NOTE(review): relies on transliterate(), a helper defined elsewhere in the
# package (Cyrillic -> Latin); not visible in this file chunk.
scrapwebpage <- function(webp){
# Href of every anchor on the page: one output row per link.
url_ <- webp %>%
rvest::html_nodes("a") %>%
rvest::html_attr("href")
# Text of every table cell, trimmed and transliterated; the date strings
# needed below are among these cells.
date_ <- webp %>%
rvest::html_nodes("td") %>%
rvest::html_text()%>%
{ gsub('^\\s+|\\s+$','', .) }%>%
transliterate()
# href attribute of non-anchor cells; positions that come back NA are used
# below as markers separating one region heading's links from the next.
regNA<-webp %>% html_nodes("td>:not(#a)")%>% html_attr('href')
# Bold text on the page = region (level2) headings.
regA_ <- webp %>%
rvest::html_nodes("b") %>%
rvest::html_text()%>%
{ gsub('^\\s+|\\s+$','', .) }%>%
transliterate()
#level2_ <- webp %>%
# rvest::html_nodes("b") %>%
# rvest::html_text()%>%
# { gsub('^\\s+|\\s+$','', .) }%>%
# transliterate()
# When links outnumber headings, repeat each heading once per link under it;
# the gaps between NA marker positions give the per-heading link counts.
if(length(url_)!=length(regA_)){
level2_ <- rep(regA_, c(diff(which(is.na(regNA)))-1,
length(regNA)-which(is.na(regNA))[length(which(is.na(regNA)))]))
}else{
level2_ <- regA_
}
# Cells that start with a digit are taken to be dates; each date is repeated
# over the entries that follow it. The division by 2 presumably reflects two
# <td> cells per entry -- TODO confirm against a live page.
datebool <- grepl("^[[:digit:]]+", date_)
datesV <- date_[datebool]
dates_tms <- c((diff(which(datebool))-1),
(length(datebool)-which(datebool)[length(which(datebool))]))/2
dates_ <- rep(datesV, dates_tms)
# Flag consumed downstream marking these rows as scrapeable.
webscrape_ <- TRUE
# Visible text of each anchor, trimmed and transliterated.
link_ <- webp %>%
rvest::html_nodes("a") %>%
rvest::html_text()%>%
{ gsub('^\\s+|\\s+$','', .) }%>%
transliterate()
return(tibble(link = link_, level1 = dates_,
level2 = level2_, webscrape = webscrape_,
url = url_))
}
# Scrape a single results page (row `x` with a $url column) into a one-row
# data frame: page info columns, the input row's columns, then vote counts.
# `ttime` switches between results tables (FALSE) and turnout-over-time
# tables (TRUE); `dnames` keeps descriptive names instead of C*/P* codes;
# `savetodir` optionally saves the raw HTML; `tabextract` overrides the
# automatic table selection. Returns the string "error" on failure.
# NOTE(review): relies on transliterate() defined elsewhere in the package,
# and mutates the global `filecounter` when saving pages.
scrappage <- function(x, ttime, dnames, savetodir, tabextract){
webpage <- tryCatch(scraperf(x$url), error = function(e) e)
if(inherits(webpage, "error")){return("error")}
# Optionally persist the raw page, numbering files via the global counter.
if (savetodir!=""){
file_name=paste0(savetodir, "/file", "_", filecounter, ".html", sep="")
write_xml(webpage, file = file_name, encoding = "UTF-8" )
assign("filecounter", filecounter+1 , envir = .GlobalEnv)
}
#Data Info
# Page header info (election name/date) lives in elements of class "w2".
data_info <- webpage %>% html_nodes("[class='w2']")%>%html_text(trim = TRUE)%>%transliterate()
data_info <- data_info[!data_info==""]
if(length(data_info)>2){data_info <- data_info[1:2]}
#Voting Info
# Tables after the one containing "Data golosovaniya" (voting date) hold the
# actual results; if that anchor table is absent, parse every table.
tab_key <- "Data golosovaniya"
tbln <- webpage %>% html_nodes(xpath="//table")
tbln_tk <- which(grepl(tab_key, transliterate(as.character(tbln))))
if(length(tbln_tk)!=0){
num_tbln_tk <- tbln_tk[length(tbln_tk)]
tbls<- tryCatch(tbln[c(num_tbln_tk+1):length(tbln)]%>%html_table(fill = TRUE), error = function(e) e)
if(inherits(tbls, "error")){return("error")}
}else{
tbls<- tryCatch(tbln[1:length(tbln)]%>%html_table(fill = TRUE), error = function(e) e)
if(inherits(tbls, "error")){return("error")}
}
if(ttime==FALSE){
# Heuristics for picking the results table: t1 = share of non-NA cells,
# t2 = row count, t3 = mentions voter/ballot counts.
tab_key <- "Chislo izbiratel|Chislo byulleteney"
t1<-unlist(lapply(tbls, function(x) sum(!is.na(x))/(dim(x)[1]*dim(x)[2])))
t2<-unlist(lapply(tbls, function(x) dim(x)[1]))
t3<-unlist(lapply(tbls, function(x) any(grepl(tab_key, transliterate(as.character(x))))))
if(!is.null(tabextract)){tabnum=tabextract}else{tabnum=which(t1>.2 & t2>10 & t3)}
tbl <- tryCatch(tbls[[tabnum]], error = function(e) e)
if(inherits(tbl, "error")){
# NOTE(review): t3 is a logical vector here, so tbls[[t3]] likely errors
# unless length 1 -- presumably tbls[[which(t3)]] was meant; the tryCatch
# makes it fall through to the next heuristic either way. TODO confirm.
tbl <- tryCatch(tbls[[t3]], error = function(e) e)
if(inherits(tbl, "error")){
tbl <- tryCatch(tbls[[which(t1>.2 & t2>10)]], error = function(e) e)
if(inherits(tbl, "error")){return("error")}
}
}
# Vote counts from column 3; anything after an embedded \r is discarded.
res<-suppressWarnings(as.numeric(gsub('\r.*','\\1',tbl[,3])))
}else{
# Turnout mode: locate the numeric row index in table 2 (or 3), then take
# the percentage cells of row 3 in the last table.
ind = suppressWarnings(which(grepl("^[[:digit:]]+$", as.numeric(t(tbls[[2]])[,2]))))
if(length(ind)==0){ind = suppressWarnings(which(grepl("^[[:digit:]]+$", as.numeric(t(tbls[[3]])[,2]))))}
res <- tryCatch(c(tbls[[length(tbls)]][ind][3,]), error = function(e) e)
if(!inherits(res, "error")) {res <- res[unlist(lapply(res, function(x) grepl("\\%", x)))]}
if(inherits(res, "error")) {res <- tryCatch(c(tbls[[tabextract]][,3]), error = function(e) e)
res <- suppressWarnings(as.numeric(gsub('\r.*','\\1',res)))}
res <- suppressWarnings(as.numeric(gsub("%", "", res)))
res[is.na(res)]<-"None"
invisible(gc())
}
#working with names
if(ttime==FALSE){
if(dnames){
# Descriptive names from column 2, dropping the party-list header rows.
res_names<-transliterate(tbl[,2])
res_names<-res_names[res_names!="Chislo golosov izbirateley, podannykh za kazhdyy spisok" & res_names!="Chislo golosov izbirateley, podannykh za"]
res_names<-res_names[!res_names==""]
}else{
# Synthetic names: C1..Cn for summary rows, P1..Pm for party/candidate
# rows, split at the first empty (or NA) row.
if(any(is.na(tbl[,1]))&"Chislo golosov izbirateley, podannykh za kazhdyy spisok"%in%transliterate(tbl[,2])){
w <- which(transliterate(tbl[,2])=="Chislo golosov izbirateley, podannykh za kazhdyy spisok")
tbl[w[1],3]<-NA
if(length(w)>=2) tbl<-tbl[-w[-1],]
}
ind <- which(tbl[,2]=="")
if(length(ind)!=0){
namesC<-paste0("C", 1:(ind-1), sep="")
namesP<-paste0("P", 1:(length(tbl[,2])-(ind)), sep="")
res_names=c(namesC, namesP)
}
if(length(ind)==0){
ind <- which(tbl[,1]==""|is.na(tbl[,1]))[1]
namesC<-paste0("C", 1:(ind-1), sep="")
namesP<-paste0("P", 1:(length(tbl[,2])-(ind)), sep="")
res_names=c(namesC, namesP)
}
}}else{
# Turnout mode always yields four time checkpoints.
res_names<-paste0("T",1:4)
}
#vector names
if(any(is.na(res))) res=res[!is.na(res)]
if(length(data_info)>0){data_info_names=paste0("info", seq(1, length(data_info)))}else{data_info_names<-NULL}
names_x<-names(x)
#reformatting
# Assemble one wide row: info columns, the input row, then the results.
x<- data.frame(lapply(x, as.character), stringsAsFactors=FALSE)
names_vector<-c(data_info_names, names_x, res_names)
res_return<-data.frame(t(c(data_info, unname(unlist(x)), res)))
names_vector <- gsub("^\\s+|\\s+$","", names_vector)
colnames(res_return)<-names_vector
# Retry name assignment defensively (length mismatches would error above).
re<-tryCatch(colnames(res_return)<-names_vector, error = function(e) e)
return(res_return)}
# Faster variant of scrappage(): reads the results out of <nobr> cells of the
# scrollable results table instead of parsing every table. Accepts either a
# bare URL string or a data-frame row with a $url column. Same parameters and
# sentinel conventions as scrappage().
# NOTE(review): relies on transliterate() defined elsewhere in the package,
# and mutates the global `filecounter` when saving pages.
scrappage_fast <- function(x, ttime, dnames, savetodir, tabextract){
if(is.character(x) & length(x)==1){
webpage <- scraperf(x)
}else{
webpage <- scraperf(x$url)
}
# Optionally persist the raw page, numbering files via the global counter.
if (savetodir!=""){
file_name=paste0(savetodir, "/file", "_", filecounter, ".html", sep="")
write_xml(webpage, file = file_name, encoding = "UTF-8" )
assign("filecounter", filecounter+1 , envir = .GlobalEnv)
}
#Data Info
# Page header info (election name/date) from class "w2" cells inside tables.
data_info <- webpage %>% html_nodes(xpath="//table")%>%html_nodes("[class='w2']")%>%html_text(trim = TRUE)%>%transliterate()
data_info <- data_info[!data_info==""]
if(length(data_info)>2){data_info <- data_info[1:2]}
if(length(data_info)>0){data_info_names=paste0("info", seq(1, length(data_info)))}else{data_info_names<-NULL}
#Voting Info
tbln <- webpage %>% html_nodes(xpath="//table")
if(ttime==FALSE){
# Results mode: the data table is the last one styled "overflow:scroll"
# (unless tabextract overrides the choice).
if(!is.null(tabextract)){tabnum=tabextract
}else{
tabnum <- which(unlist(lapply(tbln, function(x) grepl("overflow:scroll",x))))
tabnum <- tabnum[length(tabnum)]
}
# One character vector of <nobr> cell texts per table row.
list_results<-sapply(sapply(tbln[tabnum],
function(x) {x %>% html_nodes("tr")}),
function(x) {x%>%html_nodes("nobr")%>% html_text()})
# Row labels come from the table immediately preceding the data table.
col_names<-unlist(lapply(sapply(sapply(tbln[tabnum-1],
function(x) {x %>% html_nodes("tr")}),
function(x) {x%>%html_nodes("nobr")%>% html_text()}),
function(x) {x[2]}))
# The empty (separator) row is padded with NA so rbind keeps alignment,
# then stripped again after transposing.
na_pos <- which(lapply(list_results, function(x) length(x))==0)
list_results[[na_pos]]<-rep(NA,length(list_results[[1]]))
tbl <- t(do.call(rbind,list_results))
if(is.vector(tbl)){tbl<-as.data.frame(tbl, row.names = NULL); colnames<-"C"}
tbl <- tbl[,-na_pos]
if(is.vector(tbl)){tbl[1]<-transliterate(as.character(tbl[1]))}else{
tbl[,1] <- transliterate(as.character(tbl[,1]))
}
if(dnames){
# Descriptive column names, transliterated.
res_names<-transliterate(col_names)
res_names<-res_names[!(res_names==""|is.na(res_names))]
}else{
# Synthetic names: C* before the separator row, P* after it.
namesC<-paste0("C", 1:(na_pos-2), sep="")
namesP<-paste0("P", 1:(length(list_results)-na_pos), sep="")
res_names=c(namesC, namesP)
}
x <- data.frame(lapply(x, as.character), stringsAsFactors=FALSE)
# Bind info columns + input row + results into one wide row.
if(is.vector(tbl)){
res_return<-data.frame(cbind(data.frame(rbind(data_info, row.names = NULL)), x, t(tbl), row.names = NULL))}else{
res_return<-data.frame(cbind(data.frame(rbind(data_info, row.names = NULL)), x, tbl, row.names = NULL))}
# Rename the incoming "link" column to the next deeper level number.
level <- grepl("level|link|url",names(x))
names_x <- names(x)[level]
max_level <- suppressWarnings(max(as.numeric(unlist(regmatches(names_x, gregexpr("[[:digit:]]+", names_x))))))
names_x[names_x=="link"]<-c(paste0("level",max_level+1,""))
res_names=c("link", res_names)
names_vector<-c(data_info_names, names_x, res_names)
colnames(res_return)<-names_vector
}else{
# Turnout mode: the last table (or tabextract) holds turnout percentages.
if(!is.null(tabextract)){tabnum=tabextract}else{tabnum=length(tbln)}
tbls<-tbln[tabnum]
tbl<-tbls%>%html_table(fill = TRUE)
tbl<-apply(tbl[[1]],2, function(x) gsub("\\%", "", x))
tbl <- tbl[,-1]
tbl[,1]=transliterate(tbl[,1])
# Column names are the first two header rows concatenated.
res_names<-paste(transliterate(unname(unlist(tbl[1,]))), transliterate(unname(unlist(tbl[2,]))), sep="")
tbl=tbl[-c(1,2),]
if(dnames){
res_names<-res_names[!(res_names==""|is.na(res_names))]
}else{
namesC<-paste0("C", 1:(length(tbl[1,])-4), sep="")
namesP<-paste0("T", 1:4, sep="")
res_names=c(namesC, namesP)
}
res_return<-data.frame(cbind(data.frame(rbind(data_info, row.names = NULL)), x, tbl, row.names = NULL))
# Rename the incoming "link" column to the next deeper level number.
level <- grepl("level|link|url",names(x))
names_x <- names(x)[level]
max_level <- suppressWarnings(max(as.numeric(unlist(regmatches(names_x, gregexpr("[[:digit:]]+", names_x))))))
names_x[names_x=="link"]<-c(paste0("level",max_level+1,""))
res_names[1]<-"link"
}
names_vector <- c(data_info_names, names_x, res_names)
names_vector <- gsub("^\\s+|\\s+$","", names_vector)
colnames(res_return)<-names_vector
invisible(gc())
return(res_return)}
# Download and assemble voting content for one URL or a data frame of links.
#
# @param x A single URL (character) or a data frame whose rows each carry a
#   `url` column (plus link/level columns).
# @param uplevel Total page count, used only in progress messages.
# @param ttime TRUE for turnout-over-time tables, FALSE for results tables.
# @param typedata "slow" (scrappage) or "fast" (scrappage_fast) extraction.
# @param dnames TRUE keeps descriptive column names, FALSE uses C*/P* codes.
# @param savetodir Directory for saving raw HTML ("" disables saving).
# @param tabextract Optional table-index override passed to the scrapers.
# @return A data frame of scraped content with numeric columns coerced.
contentextractor<-function(x, uplevel, ttime, typedata, dnames, savetodir, tabextract){
# Pad any page that returned the "error" sentinel to the maximal observed
# length so the per-page results still bind row-wise.
errorf<-function(y){
max_el<-max(unlist(lapply(y, function(x){length(x[!is.na(x)])})))
errorM<-lapply(y, function(x){
x<-x[!is.na(x)];
r<-if(any(x == "error")) rep("error", max_el) else unname(unlist(x))
return(r)})
names(errorM) <- colnames(y)
return(errorM)}
# Single URL given as a bare string.
# FIX: the original omitted `dnames` from these two calls, silently shifting
# savetodir into dnames and tabextract into savetodir positionally.
if (is.character(x) & typedata=="slow"){
list.vote_content <- scrappage(x, ttime, dnames, savetodir, tabextract)}
if (is.character(x) & typedata=="fast"){
list.vote_content <- scrappage_fast(x, ttime, dnames, savetodir, tabextract)}
#regular data frame
if (is.data.frame(x) & typedata=="slow"){
list.vote_content<-lapply(1:dim(x)[1], function(iter) {
k=as.data.frame(scrappage(x[iter,], ttime, dnames, savetodir, tabextract));
cat(paste("scraping page N", iter, "of", uplevel, sep=" "), "\n")
return(k)})%>%errorf()
el <- lapply(list.vote_content, length)
# FIX: the original tested length(unique(sapply(el, length)))>1, but every
# element of `el` is a single integer, so that could never fire. Compare
# the element lengths themselves to detect ragged per-page rows.
if (length(unique(unlist(el)))>1) {warning('Rows sizes across units vary.')
}
return.content <- data.frame(do.call(rbind,list.vote_content))
colnames(return.content)<-names(scrappage(x[1,], ttime, dnames, savetodir, tabextract))
}
#fast download
if(is.data.frame(x) & typedata=="fast"){
list.vote_content<-lapply(1:dim(x)[1], function(iter) {
k=as.data.frame(scrappage_fast(x[iter,], ttime, dnames, savetodir, tabextract));
cat(paste("scraping page N", iter, "of", uplevel, sep=" "), "\n")
return(k)})
el <- lapply(list.vote_content, length)
# FIX: same ragged-row check correction as in the "slow" branch above.
if (length(unique(unlist(el)))>1) {warning('Rows sizes across units vary.')}
return.content <- data.frame(do.call(rbind,list.vote_content))
colnames(return.content)<-names(scrappage_fast(x[1,], ttime, dnames, savetodir, tabextract))}
# Everything except identifier columns becomes numeric.
selcols<-colnames(return.content)[!grepl("info|level|link|url", colnames(return.content))]
return.content[selcols] = apply(return.content[selcols], 2, function(x) suppressWarnings(as.numeric(as.character(x))))
return(return.content)}
# Search for a scraping pipeline (a %>% chain of the supplied building
# blocks) that, applied to base.url, yields data matching search.term.
# Candidate chains are grown breadth-first, evaluated via eval(parse(...)),
# and accepted after `hits` confirmations; the search gives up after
# `breakloop` iterations. Returns a list with the pipe formula, optionally
# the extracted urls, and a retrieval timestamp.
pipefinder<-function(base.url, blocks, search.term, hits, extracturls, breakloop, messages,...){
# Inject messages=FALSE into each block call when messages is FALSE:
# msg1 matches empty call parens "()", msg2 matches calls with arguments.
# NOTE(review): "[FALSETRUE]" is a character class (any one of those
# letters), not the literal words FALSE/TRUE -- it happens to work as a
# "messages= is already present" test, but looks accidental. TODO confirm.
msg1 <- grepl("(?<=\\(\\))$", blocks, perl=TRUE) & !grepl("messages\\s*=\\s*[FALSETRUE]", blocks, perl=TRUE)
msg2 <- grepl("\\(.+\\)", blocks, perl=TRUE) & !grepl("messages\\s*=\\s*[FALSETRUE]", blocks, perl=TRUE)
if (isFALSE(messages) & any(msg1)){
blocks[msg1] <- gsub("\\)", "messages=FALSE)", blocks[msg1])
}
if (isFALSE(messages) & any(msg2)){
blocks[msg2] <- gsub("\\)", ", messages=FALSE)", blocks[msg2])
}
# Default search pattern: polling-station markers (UIK/uchastok).
if(is.null(search.term)){
search.term<-"UIK|uik|uchastok"
}
# Inner breadth-first search over pipe chains; returns the winning formula
# or the string "unidentified".
whilefunction<-function(...){
new_blocks <- NULL
desired_pipe <- NULL
pipe.formula <- NULL
extracted.urls <- NULL
result <- NULL
ls_res <- list()
breakcounter <- 0
hit <- 0
while(is.null(pipe.formula)){
# Hard stop after breakloop expansion rounds.
if(breakcounter>=breakloop){
pipe.formula <- "unidentified";
print("Reached maximum number of iterations. Optimal pipeline search failed :(...");
break}
breakcounter <- breakcounter+1
if(length(ls_res)>0){
# Does any candidate's result contain the search term?
testuik <- unlist(
lapply(1:length(ls_res),
function(x){
if (!is.null(dim(ls_res[[x]]))){any(apply(ls_res[[x]], 2, function(y) {
grepl(search.term, y)}))}else{FALSE}
}))
#testuik2<-unlist(lapply(names(ls_res),
# function(x) (length(unique(unlist(strsplit(x, "%>%"))))-1)==length(blocks)))
if(any(testuik)){
# A chain matched; require `hits` confirmations before accepting.
ls_resC <- names(ls_res)[testuik][1]
desired_pipe<-gsub("\\[1,\\]","", ls_resC)
hit=hit+1
if(hit>=hits){cat("Found desirable pipeline:", pipe.formula<-desired_pipe, "\n"); break}
}
# Keep only candidates that produced tabular output.
selectuik<-unlist(lapply(1:length(ls_res),
function(x){
if (!is.null(dim(ls_res[[x]]))){TRUE}else{FALSE}}))
if(all(selectuik==FALSE)) {pipe.formula<-"unidentified";
print("Optimal pipeline search failed :(...");
break}
old_blocks <- names(ls_res)[selectuik]
old_blocks_data <- ls_res[selectuik]
# Extend each surviving chain by one more block, skipping redundant
# repeats of the same (non-listURLextractor) block.
expand_blocks <- data.frame(expand.grid(old_blocks, blocks, stringsAsFactors = FALSE))
test_expand_blocks <- apply(expand_blocks, 1,
function(x) if(!grepl("listURLextractor\\(.*\\)",
x[2]) & grepl(x[2],x[1], fixed=TRUE)){TRUE}else{FALSE})
expand_blocks <- expand_blocks[!test_expand_blocks,]
new_blocks <- data.frame(expand_blocks,
apply(expand_blocks,1,
function(x){paste(x, collapse ="%>%")}), stringsAsFactors = FALSE)
}else{
# First round: seed every block with base.url as the input.
old_blocks_data <- NULL
expand_blocks <- expand.grid("base.url", blocks, stringsAsFactors = FALSE)
new_blocks <- data.frame(expand_blocks,
apply(expand_blocks,1,
function(x){paste(x, collapse ="%>%")}), stringsAsFactors = FALSE)
}
# Evaluate every candidate chain, feeding each extension the first url
# produced by its parent chain where available.
ls_res<-list()
for(block in 1:dim(new_blocks)[1]){
if(!is.null(old_blocks_data)){
try_url <- tryCatch(old_blocks_data[[which(names(old_blocks_data)%in%new_blocks$Var1[block])]]$url[1], error = function(e) e)
co.exp <- paste("try_url", new_blocks$Var2[block], sep="%>%")
ev.exp <- tryCatch(eval(parse(text=co.exp)), error = function(e) e)
}else{
co.exp <- as.character(new_blocks[block,3])
ev.exp <- tryCatch(eval(parse(text=co.exp)), error = function(e) e)
}
ls_res[[new_blocks[block,3]]] <- ev.exp
}
}
return(pipe.formula)}
extracted.urls <- NULL
# Retry the search with progressively smaller hit requirements.
while(hits>0){
pipe.formulaCompute <- whilefunction(base.url, hits)
if(pipe.formulaCompute!="unidentified") break
hits <- hits - 1
print("Decreasing <hit> by 1")
}
# Optionally run the winning pipeline once more to collect all urls.
if(!is.null(pipe.formulaCompute) & extracturls==TRUE){
print("Extracting all urls...")
extracted.urls <- eval(parse(text=pipe.formulaCompute))
}
result <- list(pipe.formula=pipe.formulaCompute, extracted.res=extracted.urls, retreivaldate=Sys.time())
on.exit(closeAllConnections())
invisible(gc())
return(result)}
# NOTE(review): the two lines below are website-embed residue from a scraped
# documentation page (e.g. rdrr.io); they are not R code and are commented
# out so the file parses.
# Add the following code to your website.
# For more information on customizing the embed code, read Embedding Snippets.