R/search_functions.R

Defines functions getPDBResults getLinkFromPage getTextValuesFromPage getTextValueFromPage

Documented in getLinkFromPage getPDBResults getTextValueFromPage getTextValuesFromPage

library(XML)
library(readr)
library(httr)
library(dplyr)
library(purrr)

#' RCSB Search Result Scraper Function
#'
#' This function allows you to parse (to df) the basic results of a search on
#' RCSB, based on attribute XPaths derived by Elliot Williams in February 2018.
#'
#' You want to save the HTML associated with a given search (with all search
#' results displayed) to disk before running this command.
#'
#' @param path The path to the HTML search file
#' @return A data frame with one row per search result and the character
#'   columns `PDB_Number`, `Description`, `PDBFileLink`, `PaperDate` and
#'   `PageLink`. A page without any search result yields a zero-row data frame
#'   with the same columns.
#' @keywords PDB RCSB
#' @export
#' @examples
#' getPDBResults(path="data/searchResults.html")
getPDBResults <- function(path="data/searchResults.html") {
  # Opens pre-saved version of the search results. asText = TRUE states
  # explicitly that the string is HTML content, instead of leaving htmlParse
  # to guess whether it was handed a file name, a URL or markup.
  searchResults <- read_file(path)
  d <- htmlParse(searchResults, asText = TRUE)

  # Every search result is one <li> below this list
  results_path <- "//ul[@id='SearchResultsDetails-MainContent']/li"

  # Text of every node matching `xpath`, always as a character vector
  # (vapply keeps the type stable even when nothing matches)
  node_text <- function(xpath) {
    vapply(getNodeSet(doc = d, path = xpath), function(node) {
      value <- xmlValue(node)
      if (length(value) == 0) NA_character_ else value
    }, character(1))
  }

  # href of every node matching `xpath`; NA where the attribute is absent
  node_href <- function(xpath) {
    vapply(getNodeSet(doc = d, path = xpath), function(node) {
      xmlGetAttr(node, "href", default = NA_character_)
    }, character(1))
  }

  # Scrapes all search result HTML nodes (and their PDB accession numbers).
  # The accession number is read from the first attribute of each <li>,
  # skipping its first 10 characters.
  # NOTE(review): assumes the first attribute is the one carrying the PDB
  # code behind a 10-character prefix - confirm against a saved results page.
  result_nodes <- getNodeSet(doc = d, path = results_path)
  pdb_numbers <- vapply(result_nodes, function(node) {
    substr(xmlAttrs(node)[[1]], 11, 1000)
  }, character(1))

  columns <- list(
    PDB_Number  = pdb_numbers,
    # Brief description of structure
    Description = node_text(paste0(results_path, "/div[2]/h4/a")),
    # Search result links (to the PDB structures themselves)
    PDBFileLink = node_href(paste0(results_path, "/div[2]/div[1]/div/a[1]")),
    # Citation of paper
    PaperDate   = node_text(paste0(results_path, "/div[2]/p[2]")),
    # Link to individual structure page
    PageLink    = node_href(paste0(results_path, "/div[2]/h3/a"))
  )

  # Each XPath is evaluated independently, so a result that lacks one of the
  # elements would shift (or silently recycle) the values of that column.
  # Fail loudly instead of returning misaligned rows.
  mismatched <- names(columns)[lengths(columns) != length(pdb_numbers)]
  if (length(mismatched) > 0) {
    stop("Number of scraped values differs from the number of search results ",
         "for column(s): ", paste(mismatched, collapse = ", "),
         call. = FALSE)
  }

  data.frame(columns, stringsAsFactors = FALSE)
}


#' RCSB Link Scraper Function
#'
#' This function allows you to scrape a unique link specified by an XPath,
#' applied on an individual RCSB structure page. Concretely, this lets you
#' obtain links to assets (like the FASTA file, PDB file, mmCIF file, etc.)
#' if you have the corresponding XPath for that <a> element.
#'
#' The page is downloaded a second time if the first attempt does not yield a
#' link. If the link is still missing, `NA` is returned with a warning.
#'
#' To apply this to a vector, see `getLinksFromPages`
#'
#' @param url The url corresponding to the RCSB Structure Page in question
#' @param path The path of the <a> element you're wanting to scrape a link from
#' @param prepend Any string you want to prepend to the results
#' @return A single string: `prepend` followed by the `href` of the first
#'   element matched by `path`, or `NA_character_` when no such link is found.
#'
#' @keywords PDB RCSB structure
#' @export
#' @examples
#' # This gets the FASTA file link associated with the 6B4V structure on RCSB
#' url <- "http://www.rcsb.org/structure/6B4V"
#' fasta_path <- '//*[@id="DownloadFilesButton"]/ul/li[1]/a'
#' getLinkFromPage(url, fasta_path)
getLinkFromPage <- function(url, path, prepend="http://www.rcsb.org") {
  print(url)

  # Downloads the page and returns the href of the first element matching
  # `path`, or NULL when nothing matches or the href is missing/empty.
  fetch_first_href <- function() {
    response <- GET(url)
    # asText belongs to htmlParse (not to content): it marks the string as
    # HTML content rather than a file name or URL to be opened
    d <- htmlParse(content(response, as = "text"), asText = TRUE)
    hrefs <- xpathSApply(doc = d, path = path, fun = xmlGetAttr, name = "href")
    if (length(hrefs) == 0) {
      return(NULL)
    }
    # [[1]] also works when xpathSApply could not simplify and returned a list
    first <- hrefs[[1]]
    if (is.null(first) || is.na(first) || !nzchar(first)) {
      return(NULL)
    }
    first
  }

  link <- fetch_first_href()
  if (is.null(link)) {
    # Tries again (one more time), if necessary
    link <- fetch_first_href()
  }
  if (is.null(link)) {
    warning("No link found at '", url, "' for path '", path, "'",
            call. = FALSE)
    return(NA_character_)
  }

  paste0(prepend, link)
}

#' Web Scraper Function (multiple elements)
#'
#' This function allows you to scrape the text values within multiple elements,
#' specified by the xpath `path`, on a particular website referenced by `url`.
#' This serves as a helper function to scrape sites relevant to
#' Bioinformatics-based purposes, and as this package is developed I will add
#' more higher-level functions that scrape from commonly used sites. But in the
#' meantime (and I'm sure for a long time to come), this will be handy. If you
#' are trying to scrape one particular element, see `getTextValueFromPage`.
#'
#' @param url The url corresponding to the website in question
#' @param path The xpath associated with the elements we want to grab the text inside
#' @return A character vector with the text of every element matched by
#'   `path`, in document order; `character(0)` when nothing matches.
#'
#' @keywords web scraping xpath text value
#' @export
#' @examples
#' # This gets the paper authors associated with a structure on RCSB
#' url <- "http://www.rcsb.org/structure/6B4V"
#' author_path <- '//*[@id="header_deposition-authors"]'
#' getAuthorNames <- partial(getTextValuesFromPage, path=author_path)
#' authors <- substr(getAuthorNames(url), 27, 200)
getTextValuesFromPage <- function(url, path) {
  print(url)
  response <- GET(url)
  page_html <- content(response, as = "text") # plaintext of the page's HTML
  # asText belongs to htmlParse (not to content): it marks the string as HTML
  # content rather than a file name or URL to be opened
  d <- htmlParse(page_html, asText = TRUE)
  nodes <- getNodeSet(doc = d, path = path)
  # vapply keeps the result a character vector even when no node matches
  vapply(nodes, function(node) {
    value <- xmlValue(node)
    if (length(value) == 0) NA_character_ else value
  }, character(1))
}

#' Web Scraper Function (single element)
#'
#' This function allows you to scrape the text value within a specific element,
#' specified by the xpath `path`, on a particular website referenced by `url`.
#' This serves as a helper function to scrape sites relevant to
#' Bioinformatics-based purposes, and as this package is developed I will add
#' more higher-level functions that scrape from commonly used sites. But in the
#' meantime (and I'm sure for a long time to come), this will be handy. If you
#' are trying to scrape multiple elements per page, see `getTextValuesFromPage`.
#'
#' The page is downloaded a second time if the first attempt matches nothing.
#'
#' @param url The url corresponding to the website in question
#' @param path The xpath associated with the elements we want to grab the text inside
#' @return The text of the first element matched by `path`, or `NA_character_`
#'   when no element matches after the retry.
#'
#' @keywords web scraping xpath text value
#' @export
#' @examples
#' # This gets the paper authors associated with a structure on RCSB
#' url <- "http://www.rcsb.org/structure/6B4V"
#' author_path <- '//*[@id="header_deposition-authors"]'
#' getAuthorNames <- partial(getTextValueFromPage, path=author_path)
#' cand_structs$Authors <- substr(getAuthorNames(url), 27, 200)
getTextValueFromPage <- function(url, path) {
  values <- getTextValuesFromPage(url, path)
  # `values[1]` is never NULL for an empty list or vector, so emptiness has to
  # be detected through the length to make the retry reachable
  if (length(values) == 0) {
    # Tries again (one more time), if necessary
    values <- getTextValuesFromPage(url, path)
  }
  if (length(values) == 0) {
    return(NA_character_)
  }
  values[[1]]
}
lacoperon/MDScraperTools documentation built on May 28, 2019, 12:59 p.m.