https://stackoverflow.com/a/36723520/5270873
library(xml2)
library(rvest)

URL <- "http://stackoverflow.com/questions/3746256/extract-links-from-webpage-using-r"
pg <- read_html(URL)
head(html_attr(html_nodes(pg, "a"), "href"))
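The hrefs come back exactly as written in the page, so many will be relative. A minimal follow-up sketch (assumption: you want absolute URLs) using xml2::url_absolute(), the same helper one of the later snippets uses:

library(xml2)
library(rvest)

URL <- "http://stackoverflow.com/questions/3746256/extract-links-from-webpage-using-r"
pg <- read_html(URL)

# Resolve each href against the page's own URL to get absolute links
links <- html_attr(html_nodes(pg, "a"), "href")
head(xml2::url_absolute(links, URL))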
https://stackoverflow.com/a/35247304/5270873
library(rvest)

page <- read_html("http://www.yelp.com/search?find_loc=New+York,+NY,+USA")
page %>% html_nodes(".biz-name") %>% html_attr("href")
https://stackoverflow.com/questions/31517121/using-r-to-scrape-the-link-address-of-a-downloadable-file-from-a-web-page
library(rvest)

realtime.page <- "http://www.acleddata.com/data/realtime-data/"
realtime.html <- read_html(realtime.page)
realtime.html %>%
  html_node(xpath = "/html/body/div/div/div/div[1]/div/article/div/ul[1]/li[2]/a") %>%
  html_attr("href")
https://stackoverflow.com/a/31525104/5270873
library(rvest)
library(stringr)

page <- read_html("http://www.acleddata.com/data/realtime-data/")
page %>%
  html_nodes("a") %>%        # find all links
  html_attr("href") %>%      # get the url
  str_subset("\\.xlsx") %>%  # find those that end in xlsx
  .[[1]]                     # look at the first one
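A natural next step, not part of the original answer, is fetching the file once its URL is known; a minimal sketch with base R's download.file() (the destfile name is an arbitrary assumption):

library(rvest)
library(stringr)

page <- read_html("http://www.acleddata.com/data/realtime-data/")
xlsx_url <- page %>%
  html_nodes("a") %>%
  html_attr("href") %>%
  str_subset("\\.xlsx") %>%
  .[[1]]

# mode = "wb" keeps the binary .xlsx intact on Windows
download.file(xlsx_url, destfile = "realtime-data.xlsx", mode = "wb")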
library(rvest)

x <- "https://en.wikipedia.org/wiki/Academy_Award_for_Best_Picture" %>%
  read_html() %>%
  html_nodes("#mw-content-text table:nth-child(55)")
html_table(x)
trace(rvest:::html_table.xml_node, quote({
  values <- lapply(lapply(cells, html_node, "a"), html_attr, name = "href")
  values[[1]] <- html_text(cells[[1]])
}), at = 14)
html_table() essentially extracts the cells of the HTML table and runs html_text() on them. All we need to do is replace that step: pull the <a> tag out of each cell and run html_attr(., "href") on it instead (the values[[1]] line keeps the first column as plain text, so the table still has readable labels).
html_table(x)
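If patching html_table() via trace() feels fragile (the at = 14 offset depends on the rvest version), a hedged alternative is to skip html_table() entirely and walk the cells yourself, reusing the selector from the snippet above:

library(rvest)

pg <- read_html("https://en.wikipedia.org/wiki/Academy_Award_for_Best_Picture")
tbl <- pg %>% html_nodes("#mw-content-text table:nth-child(55)") %>% .[[1]]

# One node per first-column cell; html_node() returns a "missing" node
# (hence NA from html_attr) for cells that contain no link
cells <- tbl %>% html_nodes("tr td:first-child")
data.frame(
  text = html_text(cells),
  href = cells %>% html_node("a") %>% html_attr("href")
)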
library(rvest)

url <- "http://www.ajnr.org/content/30/7/1402.full"
page <- read_html(url)

# First find all the urls
table_urls <- page %>%
  html_nodes(".table-inline li:nth-child(1) a") %>%
  html_attr("href") %>%
  xml2::url_absolute(url)
table_urls

# Then loop over the urls, downloading & extracting the table
# lapply(table_urls, . %>%
#   read_html() %>%
#   html_table())
library(XML)

main_url <- "http://www.ajnr.org/content/30/7/1402/"
urls <- paste(main_url,
              c("T1.expansion", "T2.expansion", "T3.expansion", "T4.expansion"),
              ".html", sep = "")
urls

# Keep the largest table on each page (the data table, not layout tables)
tables <- list()
for (i in seq_along(urls)) {
  total <- readHTMLTable(urls[i])
  n.rows <- unlist(lapply(total, function(t) dim(t)[1]))
  tables[[i]] <- as.data.frame(total[[which.max(n.rows)]])
}
tables
readHTMLTable(urls[1])
library(XML)
library(httr)

url <- "http://en.wikipedia.org/wiki/List_of_The_Simpsons_episodes"
doc <- content(GET(url))

# For each cell: return "absolute-url | link text" when it contains an <a>,
# otherwise just the cell's text
getHrefs <- function(node, encoding) {
  x <- xmlChildren(node)$a
  if (!is.null(x)) {
    paste0("http://", parseURI(url)$server, xmlGetAttr(x, "href"), " | ", xmlValue(x))
  } else {
    xmlValue(xmlChildren(node)$text)
  }
}

tab <- readHTMLTable(doc, which = 3, elFun = getHrefs)
head(tab[, 1:4])
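A hedged rvest counterpart to the elFun trick: grab each cell link's text and href directly. The table index is an assumption carried over from which = 3 and may not line up exactly with how rvest counts tables, and this yields a flat list of links rather than a rebuilt table:

library(rvest)

pg <- read_html("http://en.wikipedia.org/wiki/List_of_The_Simpsons_episodes")
tab <- pg %>% html_nodes("table") %>% .[[3]]  # assumed equivalent of which = 3

# Every link inside a data cell of that table
links <- tab %>% html_nodes("td a")
head(data.frame(text = html_text(links), href = html_attr(links, "href")))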
library(rvest)

pg <- read_html("http://www.bvl.com.pe/includes/empresas_todas.dat")
pg %>% html_nodes("a") %>% html_attr("href")
library(XML)

# Parse with handler functions: the "a" handler fires for every <a> node,
# accumulating hrefs as the tree is built
links <- function(URL) {
  getLinks <- function() {
    links <- character()
    list(
      a = function(node, ...) {
        links <<- c(links, xmlGetAttr(node, "href"))
        node
      },
      links = function() links
    )
  }
  h1 <- getLinks()
  htmlTreeParse(URL, handlers = h1)
  h1$links()
}

links("https://www.bvl.com.pe/listado-de-empresas")
# Option 1
library(XML)  # getHTMLLinks() is in the XML package

getHTMLLinks("https://www.bvl.com.pe/listado-de-empresas/")
# Option 2
library(rvest)
library(pipeR)  # pipeR's %>>% pipe; the original answer claims it is faster than %>%

read_html("http://www.bvl.com.pe/includes/empresas_todas.dat") %>>%
  html_nodes("a") %>>%
  html_attr("href")