# R/page_toc.R

#' Download the table of contents of a page.
#'
#' Fetches one revision of a Wikipedia page and collects its section
#' headings (\code{h2} to \code{h6} elements that are direct children of the
#' \code{mw-content-text} container).
#'
#' @param revid The id of the revision from which to extract the table of
#'   contents. Must be a single, non-missing numeric or character value.
#' @param domain The language sub-domain of Wikipedia where the wiki is
#'   located (it is inserted in front of \code{.wikipedia.org}).
#' @param pander Whether to return the table of contents as a pandoc
#'   (markdown) list (\code{TRUE}, the default) or as a data frame
#'   (\code{FALSE}).
#'
#' @return If \code{pander} is \code{TRUE}, a single character string holding
#'   a pandoc list in which each heading is indented according to its level.
#'   Otherwise a data frame with one row per heading and two columns:
#'   \code{level} (the heading number minus one, so \code{h2} gives 1) and
#'   \code{text} (the text content of the heading element).
#' @export
#'
#' @importFrom  magrittr %>%
#' @import rvest
#' @importFrom xml2 read_html
#' @importFrom stringr str_extract
#' @importFrom pander pandoc.list.return
#' @importFrom plyr alply

page_toc <- function(revid, domain = "fr", pander = TRUE) {

  # Only one page can be downloaded per call, so reject anything that is not
  # a single usable id before touching the network.
  if (length(revid) != 1L || is.na(revid)) {
    stop("`revid` must be a single, non-missing revision id.", call. = FALSE)
  }

  # paste0() would render large round numeric ids in scientific notation
  # (100000000 becomes "1e+08"), which yields an invalid URL.
  if (is.numeric(revid)) {
    revid <- format(revid, scientific = FALSE, trim = TRUE)
  }

  url <- paste0(
    "https://", domain, ".wikipedia.org/w/index.php?&oldid=", revid
  )

  # One XPath alternative per heading tag (h2..h6), joined as a union.
  heading_xpath <- paste0(
    "//div[@id='mw-content-text']/", paste0("h", 2:6),
    collapse = " | "
  )

  nodes <- read_html(url) %>%
    html_nodes(xpath = heading_xpath)

  # The digit in the tag name gives the heading depth; h2 maps to level 1.
  level <- as.numeric(str_extract(html_name(nodes), "[[:digit:]]")) - 1
  text <- html_text(nodes)

  toc <- data.frame(level, text, stringsAsFactors = FALSE)

  if (!pander) {
    return(toc)
  }

  # Render each heading as its own one-item list so that every entry can
  # carry its own indentation, then glue the pieces together.
  items <- vapply(
    seq_len(nrow(toc)),
    function(i) {
      pandoc.list.return(
        toc$text[[i]],
        indent.level = toc$level[[i]],
        add.end.of.list = FALSE
      )
    },
    character(1)
  )

  paste(items, collapse = "")
}
# cafeine05/WikiSocio documentation built on May 13, 2019, 10:39 a.m.