# R/page_revisions.R

#' Download the list of contributions for one page
#'
#' Repeatedly calls `exec_query()` with a `prop = "revisions"` query, following
#' the `rvcontinue` token of each response until none is returned, then
#' post-processes the collected revisions.
#'
#' @param x Either the id (numeric) or the title (character) of a page.
#' @param domain The domain where the wiki is located (passed on to
#'   `exec_query()`).
#'
#' @return A data frame sorted by timestamp with the columns `revid`,
#'   `parentid`, `userid`, `user` (the username, or the IP for an anonymous
#'   contribution), `timestamp` (`POSIXct`, UTC), `size` (the size of the
#'   revision), `anon` (a boolean indicating whether the contribution is
#'   anonymous), `sha1` and `weight` (the difference in size between the
#'   contribution and the previous one). When several consecutive revisions
#'   were made by the same user, only the first one is kept. The data frame
#'   has zero rows when the page is not found or when nothing could be
#'   downloaded; a warning is raised if a request failed, in which case the
#'   result may be incomplete.
#' @export
#'
#' @family page functions
#'
#' @import magrittr
#' @importFrom dplyr filter arrange
#'
#' @examples
#' # Downloading the list of contributions for the 'Action' page in the
#' # french wiki
#' page_revisions('Action')
page_revisions <- function(x, domain = "fr") {

  # Extract one field from every revision as a character vector. A revision
  # that lacks the field yields NA, so every column keeps the same length.
  field_chr <- function(revisions, name) {
    vapply(revisions, function(revision) {
      value <- revision[[name]]
      if (is.null(value)) NA_character_ else as.character(value)[1]
    }, character(1), USE.NAMES = FALSE)
  }

  # Same as field_chr(), converted to numeric.
  field_num <- function(revisions, name) {
    as.numeric(field_chr(revisions, name))
  }

  # Build the query. `rvcontinue` starts as NULL and is filled in with the
  # continuation token of the previous response.
  query <- list(action = "query",
                prop = "revisions",
                rvlimit = "max",
                rvprop = "sha1|timestamp|size|userid|user|ids",
                redirects = "",
                rvcontinue = NULL)

  if (is.numeric(x)) {
    query["pageids"] <- x
  } else {
    query["titles"] <- x
  }

  # One data frame per response; bound together once after the loop instead
  # of growing a data frame at every iteration.
  batches <- list()

  repeat {

    exec <- exec_query(query, domain = domain)

    # exec_query() returning NULL is treated as a failed request: warn and
    # keep whatever has been collected so far.
    if (is.null(exec)) {
      warning("Network problem - results may be incomplete")
      break
    }

    # The first page entry is named after its page id; an id that is absent,
    # unparsable or not strictly positive means there is nothing to read.
    pages <- exec[["query"]][["pages"]]
    page_id <- suppressWarnings(as.numeric(names(pages[1])))
    if (length(page_id) != 1 || is.na(page_id) || page_id <= 0) {
      break
    }

    content <- pages[[1]][["revisions"]]

    batches[[length(batches) + 1L]] <- data.frame(
      revid = field_num(content, "revid"),
      parentid = field_num(content, "parentid"),
      userid = suppressWarnings(field_num(content, "userid")),
      user = field_chr(content, "user"),
      timestamp = field_chr(content, "timestamp"),
      size = field_num(content, "size"),
      # A revision is flagged anonymous when it carries an `anon` entry.
      anon = vapply(content, function(revision) {
        !is.null(revision[["anon"]])
      }, logical(1), USE.NAMES = FALSE),
      sha1 = field_chr(content, "sha1"),
      stringsAsFactors = FALSE
    )

    # Stop when the response carries no continuation token.
    token <- unlist(unname(exec[["continue"]][["rvcontinue"]]))
    if (is.null(token)) {
      break
    }
    query$rvcontinue <- token

  }

  if (length(batches) > 0) {
    result <- do.call(rbind, batches)
  } else {
    result <- data.frame(revid = numeric(0),
                         parentid = numeric(0),
                         userid = numeric(0),
                         user = character(0),
                         timestamp = character(0),
                         size = numeric(0),
                         anon = logical(0),
                         sha1 = character(0),
                         stringsAsFactors = FALSE)
  }

  # Parse the timestamps. The trailing "Z" marks them as UTC, so they must be
  # parsed as UTC rather than in the local time zone.
  result$timestamp <- as.POSIXct(
    strptime(result$timestamp, "%Y-%m-%dT%H:%M:%SZ", tz = "UTC")
  )

  # Compute the weights in the order the rows were received: each row gets
  # its size minus the size of the row that follows it, and the last row
  # keeps its own size.
  # NOTE(review): this assumes the API returns the newest revision first, so
  # that the following row is the previous revision - confirm.
  n <- nrow(result)
  if (n > 1) {
    result$weight <- c(-diff(result$size), result$size[n])
  } else {
    result$weight <- result$size
  }

  # Sort chronologically and reset the row names.
  result <- result[order(result$timestamp), , drop = FALSE]
  rownames(result) <- NULL

  # Drop revisions made by the same user as the revision just before them.
  # The first row has no predecessor and is always kept; a comparison that
  # involves an unknown (NA) user never counts as a repeat.
  if (n > 1) {
    users <- result$user
    repeated <- c(FALSE, users[-1] == users[-n])
    repeated[is.na(repeated)] <- FALSE
    result <- result[!repeated, , drop = FALSE]
  }

  result

}
# cafeine05/WikiSocio documentation built on May 13, 2019, 10:39 a.m.