# R/clean_wikitext.R

#' Clean raw wikitext
#'
#' Strips wiki markup from revision texts and normalises what is left: brace
#' delimited groups, \code{<ref>} / \code{<references>} tags,
#' \code{[[Fichier:...]]} and \code{[[Catégorie:...]]} lines, and lines
#' starting at an \code{=} or \code{;} are removed; wiki links are replaced by
#' their visible text; the result is lower-cased, stop words are removed,
#' punctuation is replaced by spaces and whitespace is collapsed.
#'
#' @param revs A character vector containing the revisions to be cleaned.
#'   A zero-length vector is returned as \code{character(0)}.
#' @param locale Passed to \code{stringr::str_to_lower()} as the locale and to
#'   \code{tm::stopwords()} as the stop word list to use - very important to
#'   specify correctly.
#'
#' @return The \code{revs} character vector cleaned, without names.
#' @export
#' 
#' @importFrom stringr str_replace_all str_to_lower
#' @importFrom tm removeWords stopwords
#'
#' @examples
#' \dontrun{
#' library(magrittr)
#' 
#' page_revisions("Sociologie") %>%
#'    page_content(clean = FALSE) %>%
#'    clean_wikitext()
#'    
#' # This example is equivalent to :
#' page_revisions("Sociologie") %>%
#'    page_content(clean = TRUE)
#'    
#' }
clean_wikitext <- function(revs, locale = "fr") {

  # Balanced {...} groups: (?R) recurses so nested braces are consumed
  # together with the group that encloses them.
  revs <- gsub("(?>\\{(?:[^{}]*|(?R))*\\})", " ", revs, perl = TRUE)

  # <reference(s) ...> tags, up to a closing "/>" or a CRLF.
  revs <- gsub("<references*[^<]*(?:<(?!/>)[^<]*)*(/>|\r\n)", " ", revs, perl = TRUE)

  # <ref ...> ... </ref> blocks.
  revs <- gsub("<(ref|references)\\b[^<]*>[^<]*(?:<(?!/ref>)[^<]*)*</ref>", " ", revs, perl = TRUE)

  # Everything from a "[[Fichier:" or "[[Catégorie:" marker to the end of the
  # line.
  revs <- gsub("\\[\\[Fichier:[^*][^\r\n]+", " ", revs, perl = TRUE)
  revs <- gsub("\\[\\[Catégorie:[^*][^\r\n]+", " ", revs, perl = TRUE)

  # Everything from an "=" or a ";" to the end of the line.
  # NOTE(review): presumably aimed at section headings and definition lists;
  # it also drops any other text that follows these characters - confirm.
  revs <- gsub("=[^*][^\r\n]+", " ", revs, perl = TRUE)
  revs <- gsub("\\;[^*][^\r\n]+", " ", revs, perl = TRUE)

  # [[target|label]] -> label, [[target]] -> target.
  revs <- gsub("\\[\\[(?:[^|\\]]*\\|)?([^\\]]+)\\]\\]", "\\1", revs, perl = TRUE)

  # str_to_lower() is vectorised: calling it directly (rather than through
  # sapply()) keeps a character vector even when revs is empty, so the
  # following steps do not fail on zero-length input.
  revs <- str_to_lower(revs, locale = locale)
  revs <- removeWords(revs, stopwords(locale))

  # Line breaks and punctuation become spaces, then runs of whitespace are
  # collapsed to one character and leading/trailing whitespace is trimmed.
  revs <- gsub("[\r\n]+", " ", revs, perl = TRUE)
  revs <- gsub("[[:punct:]]", " ", revs)
  revs <- gsub("(?<=[\\s])\\s*|^\\s+|\\s+$", "", revs, perl = TRUE)

  unname(revs)

}
# cafeine05/WikiSocio documentation built on May 13, 2019, 10:39 a.m.