R/page_content.R

#' Download the content of a page's revisions
#'
#' @param revisions A table of revision metadata, as returned by \code{page_revisions}
#' @param clean Logical. If \code{TRUE}, the content is cleaned with the \code{clean_wikitext} function.
#' @param domain The domain of the wiki, e.g. \code{"fr"} for the French-language Wikipedia
#' @param parallel Logical. If \code{TRUE}, the queries are run with a registered parallel \code{foreach} backend.
#'
#' @return A character vector containing the content of every revision listed in \code{revisions}, named by revision ID. Revisions whose text is hidden are returned as \code{NA}.
#' @export
#' @import foreach magrittr
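#'
#' @details The revision IDs are queried in batches (built with
#'   \code{divide_list}), since the MediaWiki API limits the number of
#'   \code{revids} accepted in a single request. With
#'   \code{parallel = TRUE}, a parallel \code{foreach} backend must be
#'   registered beforehand, e.g. with \pkg{doParallel}.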
#'
#' @examples
#' \dontrun{
#' revisions <- page_revisions("Sociologie")
#' content <- page_content(revisions, clean = TRUE)
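#'
#' # Running with parallel = TRUE assumes a foreach backend has been
#' # registered, for instance via the doParallel package:
#' doParallel::registerDoParallel(2)
#' content <- page_content(revisions, clean = TRUE, parallel = TRUE)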
#' }

page_content <- function(revisions, clean = FALSE, domain = "fr", parallel = FALSE) {

  # Use the registered parallel backend if requested, otherwise run
  # the foreach loop sequentially
  `%op%` <- if (parallel) `%dopar%` else `%do%`

  # Split the revision IDs into batches, since the API limits the
  # number of revids accepted per request
  ids <- divide_list(revisions$revid)

  # Base query: request revision IDs and raw wikitext content
  query <- list(
    action = "query",
    prop = "revisions",
    rvprop = "ids|content")

  # Fetch each batch and concatenate the named results of all
  # iterations into a single character vector
  content <- foreach(i = seq_along(ids),
                     .combine = "c",
                     .export = c("exec_query")) %op% {

    # Fill in the revids parameter with the current batch
    query$revids <- ids[[i]]

    exec <- exec_query(query, domain = domain)

    # Extract the ID of each returned revision
    revid <- exec$query$pages %>%
      lapply("[[", "revisions") %>%
      lapply(lapply, "[[", "revid") %>%
      unlist(use.names = FALSE)

    # Extract the wikitext of each revision; revisions whose text is
    # hidden are returned as NA
    res <- exec$query$pages %>%
      lapply("[[", "revisions") %>%
      lapply(lapply, function(x) {
        if ("texthidden" %in% names(x)) {
          NA
        } else {
          x$`*`
        }
      }) %>%
      unlist(use.names = FALSE)

    # Name each piece of content by its revision ID
    names(res) <- revid

    res

  }

  # Optionally strip the wiki markup from the retrieved contents
  if (clean) {
    content %<>% clean_wikitext()
  }

  content

}