# R/page_version.R

#' Version an article word by word
#'
#' Diffs every revision in \code{corpus} against the previous one and records,
#' for each word, the revision step at which it was added and the step at
#' which it was removed (if any).
#'
#' @param corpus A character vector built with \code{page_content}, one
#' element per revision.
#' @param meta Revisions metadata built with \code{page_revisions}. It is
#' sorted on its \code{timestamp} column, and its \code{user} and
#' \code{timestamp} columns are then looked up by revision step.
#' @param parallel A logical indicating whether the foreach loop should run in
#' parallel or not. If \code{TRUE}, you need to declare a parallel backend,
#' see the \code{foreach} documentation.
#'
#' @return A data frame with one row per word and the columns \code{id} (a
#' unique word id), \code{mot} (the word itself), \code{stop} (the revision
#' step in which the word disappears), \code{begin} (the revision step in
#' which the word appears), \code{user.add}, \code{user.delete},
#' \code{date.add} and \code{date.delete}. A missing value in \code{stop}
#' (and in the matching \code{*.delete} columns) indicates that the word was
#' still present in the page at the moment of the extraction.
#'
#' @export
#'
#' @import foreach magrittr
#'
#' @importFrom textreuse align_local tokenize_words
#' @importFrom stringr str_split
#' @importFrom plyr ddply
#' @importFrom dplyr filter
#' @importFrom tidyr spread
#'
page_version <- function(corpus, meta, parallel = TRUE) {
  # `parallel` is a scalar flag, so a plain if/else selects the operator.
  `%op%` <- if (parallel) `%dopar%` else `%do%`

  # Diff each revision against its predecessor; every result row is tagged
  # with the index of the newer revision ("step").
  # NOTE(review): `.export` lists "diff", "diff_lcs" and "match_seq" while the
  # loop body calls `diff_text` -- confirm this is what the workers need.
  tbl_version <- foreach(
    i = seq_along(corpus)[-1],
    .combine = rbind,
    .packages = c("magrittr", "textreuse", "stringr", "dplyr", "foreach"),
    .export = c("diff", "diff_lcs", "match_seq")
  ) %op% {
    delta <- diff_text(corpus[i - 1], corpus[i])
    delta <- cbind(step = rep(i, nrow(delta)), delta)
    delta[, c("step", "a", "b", "mot", "status")]
  }

  # Every word of the first revision counts as an addition at step 1.
  # Columns are built with explicit lengths so an empty first revision yields
  # a zero-row data frame instead of an error.
  mot <- tokenize_words(corpus[1])
  n_mot <- length(mot)
  first_revision <- data.frame(
    step = rep(1, n_mot),
    a = rep(NA, n_mot),
    b = seq_along(mot),
    mot = mot,
    status = rep("+", n_mot),
    stringsAsFactors = FALSE
  )

  tbl_version <- rbind(first_revision, tbl_version)
  tbl_version <- set_id(tbl_version)

  # Keep only additions and deletions, then put the step of each status in
  # its own column (one "+" column and one "-" column).
  tbl_version <- tbl_version %>%
    filter(status != "=") %>%
    dplyr::select(id, mot, step, status) %>%
    spread(status, step)

  # A status that never occurred has no column after spread(). Fill it with
  # integer NA: a logical NA would trigger logical (recycled) indexing in the
  # `meta` lookups below instead of one NA per row.
  for (status_col in c("-", "+")) {
    if (!status_col %in% names(tbl_version)) {
      tbl_version[[status_col]] <- NA_integer_
    }
  }

  # Select by name before renaming: the position of "-" and "+" depends on
  # whether a column had to be appended above, so a positional rename could
  # swap `stop` and `begin`.
  tbl_version <- tbl_version[, c("id", "mot", "-", "+")]
  names(tbl_version) <- c("id", "mot", "stop", "begin")

  # Revision steps index into the metadata once it is ordered by timestamp.
  meta <- dplyr::arrange(meta, timestamp)
  tbl_version$user.add <- meta$user[tbl_version$begin]
  tbl_version$user.delete <- meta$user[tbl_version$stop]
  tbl_version$date.add <- meta$timestamp[tbl_version$begin]
  tbl_version$date.delete <- meta$timestamp[tbl_version$stop]

  tbl_version
}
# cafeine05/WikiSocio documentation built on May 13, 2019, 10:39 a.m.