#' Clean wikitext revisions
#'
#' Strips wiki markup from each revision and normalises the remaining text.
#' The steps are applied in this order:
#' \enumerate{
#'   \item brace-delimited templates (nested braces included), reference
#'     tags, French-namespace file and category links, and everything from
#'     an equals sign or a semicolon to the end of its line are each
#'     replaced by a single space;
#'   \item wiki links are replaced by their display label (the part after
#'     the pipe when there is one);
#'   \item the text is lower-cased and stop words are removed;
#'   \item line breaks and punctuation are turned into spaces, runs of
#'     whitespace are collapsed and leading/trailing whitespace is trimmed.
#' }
#'
#' @param revs A character vector containing the revisions to be cleaned.
#'   A zero-length vector is returned as \code{character(0)}.
#' @param locale Passed to \code{stringr::str_to_lower()} as its locale and
#'   to \code{tm::stopwords()} as the stop-word language - very important to
#'   specify correctly.
#'
#' @return The \code{revs} character vector cleaned, without names.
#' @export
#'
#' @importFrom stringr str_replace_all str_to_lower
#' @importFrom tm removeWords stopwords
#'
#' @examples
#' \dontrun{
#' library(magrittr)
#'
#' page_revisions("Sociologie") %>%
#' page_content(clean = FALSE) %>%
#' clean_wikitext()
#'
#' # This example is equivalent to :
#' page_revisions("Sociologie") %>%
#' page_content(clean = TRUE)
#'
#' }
clean_wikitext <- function(revs, locale = "fr") {
  # Markup that is dropped outright; every match becomes a single space.
  # Order matters: templates go first so that their contents cannot be
  # picked up by the later, line-oriented patterns.
  strip_patterns <- c(
    # {...} templates, matched recursively so nested braces are consumed
    "(?>\\{(?:[^{}]*|(?R))*\\})",
    # <reference(s) ... up to a closing "/>" or a CRLF
    "<references*[^<]*(?:<(?!/>)[^<]*)*(/>|\r\n)",
    # <ref ...> ... </ref> blocks
    "<(ref|references)\\b[^<]*>[^<]*(?:<(?!/ref>)[^<]*)*</ref>",
    # [[Fichier:... through the end of the line
    "\\[\\[Fichier:[^*][^\r\n]+",
    # French category links through the end of the line; the accented
    # letter is written as an escape so the source file stays ASCII-only
    "\\[\\[Cat\u00e9gorie:[^*][^\r\n]+",
    # from an equals sign to the end of the line (section headings)
    "=[^*][^\r\n]+",
    # from a semicolon to the end of the line
    "\\;[^*][^\r\n]+"
  )
  for (pattern in strip_patterns) {
    revs <- gsub(pattern, " ", revs, perl = TRUE)
  }

  # [[target|label]] -> label, [[target]] -> target
  revs <- gsub("\\[\\[(?:[^|\\]]*\\|)?([^\\]]+)\\]\\]", "\\1", revs,
               perl = TRUE)

  # str_to_lower() is vectorised: calling it directly keeps a character
  # vector even for zero-length input, where sapply() would return list()
  # and make removeWords() fail.
  revs <- str_to_lower(revs, locale = locale)
  revs <- removeWords(revs, stopwords(locale))

  revs <- gsub("[\r\n]+", " ", revs, perl = TRUE)
  revs <- gsub("[[:punct:]]", " ", revs)
  # Drop every whitespace character that follows another one, and trim
  # both ends.
  revs <- gsub("(?<=[\\s])\\s*|^\\s+|\\s+$", "", revs, perl = TRUE)

  unname(revs)
}
# Add the following code to your website.
# For more information on customizing the embed code, read Embedding Snippets.