#' Extract article titles and links from PubMed search result pages
#'
#' Downloads as many result pages as are needed to cover `no_of_results`
#' entries and, for each entry, reads the text and the `href` attribute of
#' the title anchor located via a fixed XPath.
#'
#' @param path Character prefix of the search URL. The page number is
#'   appended directly to it (`paste0(path, page)`), so it presumably ends
#'   with a page query parameter -- confirm against the caller that builds it.
#' @param no_of_results Non-negative number of results to extract. Fractional
#'   values are rounded down.
#' @return An unnamed list of two character vectors of length
#'   `no_of_results`: the titles first, then the absolute article URLs.
#'   Entries whose anchor is not found in the page are `NA`.
extract_searchpage_pubmed <- function(path, no_of_results) {
  stopifnot(
    is.numeric(no_of_results),
    length(no_of_results) == 1L,
    !is.na(no_of_results),
    no_of_results >= 0
  )
  no_of_results <- floor(no_of_results)

  xpath1 <- '//*[@id="search-results"]/section/div['
  xpath2 <- ']/div/article['
  xpath3 <- ']/div[2]/div[1]/a'
  base_url <- "https://pubmed.ncbi.nlm.nih.gov"
  per_page <- 200

  # ceiling() instead of `%/% + 1`: an exact multiple of per_page must not
  # trigger the download of a trailing page from which nothing is read.
  no_page <- ceiling(no_of_results / per_page)

  # Preallocate; growing vectors with c() inside the loop copies them each time.
  titles <- character(no_of_results)
  hrefs <- character(no_of_results)

  for (i in seq_len(no_page)) {
    pathi <- read_html(paste0(path, i))
    offset <- (i - 1) * per_page
    # The last page may hold fewer than per_page requested entries.
    n_on_page <- min(per_page, no_of_results - offset)

    for (j in seq_len(n_on_page)) {
      xp <- paste0(xpath1, i, xpath2, j, xpath3)
      # Look the anchor up once and reuse it for both text and href.
      node <- html_node(pathi, xpath = xp)

      title <- html_text(node)
      # Strip the newline + indentation padding surrounding the title text.
      title <- gsub("\n                ", "", title)
      title <- gsub("\n              ", "", title)
      titles[offset + j] <- title

      href <- html_attr(node, "href")
      # Keep a missing href as NA rather than pasting it into a bogus
      # "...govNA" URL.
      if (!is.na(href)) {
        href <- paste0(base_url, href)
      }
      hrefs[offset + j] <- href
    }
  }

  list(titles, hrefs)
}
# Example: collect the titles and links of the first 211 PubMed hits for
# "ptsd ketamine". zeallot supplies the %<-% unpacking operator.
library(zeallot)

# `link` first holds the search URL and is then overwritten with the
# vector of article links returned by the extraction.
link <- words2link_pubmed("ptsd ketamine")
c(title, link) %<-% extract_searchpage_pubmed(link, 211)

# Show the first ten titles and their links.
title[seq_len(10)]
link[seq_len(10)]


# cheungngo/webscr8per documentation built on March 16, 2021, 12:06 a.m.