#------------------------------------------------------------------------------#
# TextStudio #
#------------------------------------------------------------------------------#
#' TextStudio
#'
#' \code{TextStudio} Class responsible for manipulating Corpus texts.
#'
#' Class contains methods for normalizing and cleaning Corpus object texts.
#'
#' @docType class
#' @author John James, \email{jjames@@dataScienceSalon.org}
#' @export
TextStudio <- R6::R6Class(
classname = "TextStudio",
lock_objects = FALSE,
lock_class = FALSE,
inherit = Super,
private = list(
..corpus = character(),
..config = list(),
..regex = list(
punct = "(?![\'-])[[:punct:]]",
hyphen = '[-]',
apostrophe = '[\']',
trailingApostrophe = '[\']\\B',
numbers = "\\d+\\S*",
symbols = "(?![.?!'-])[[:punct:]]",
twitter = '\\B[@#]\\w*[a-zA-Z]+\\w*',
url = "(?:(?:https?:\\/\\/)|(?:www\\.))[-a-zA-Z0-9@:%._\\+~#=]{2,256}\\.[a-z]{2,4}\\b(?:[-a-zA-Z0-9@:%_\\+.~#?&/=]*)",
email = "[a-zA-Z0-9\\-_~]+(\\.[a-zA-Z0-9\\-_~]+)*@[a-zA-Z0-9\\-_~]+(\\.[a-zA-Z0-9\\-_~]+)*\\.[a-zA-Z]{2,}",
strayApostrophe = "\\s*'\\B|\\B'\\s*",
strayHyphen = "\\s*-\\B|\\B-\\s*",
strayComma = "\\s,\\B|\\B,\\s*",
singles = '\\b[b-hj-z]{1}\\b',
backtick = list(
pattern = "\\`",
replacement = "\\'"
),
commaSpace = list(
pattern = "(,)([^ ])",
replacement = "\\1 \\2"
)
),
processDocument = function(document) {
content <- document$content
content <- iconv(content, "UTF-8", "ASCII", sub = "")
if (private$..config$..lowercase) {
content <- tolower(content)
}
# Remove apostrophe, trailingApostrophe, numbers, twitter, url and email
regex <- c()
if (private$..config$..remove$apostrophe)
regex <- c(regex, private$..regex$apostrophe)
if (private$..config$..remove$trailingApostrophe)
regex <- c(regex, private$..regex$trailingApostrophe)
if (private$..config$..remove$twitter)
regex <- c(regex, private$..regex$twitter)
if (private$..config$..remove$url)
regex <- c(regex, private$..regex$url)
if (private$..config$..remove$email)
regex <- c(regex, private$..regex$email)
content <- gsub(paste0(regex, collapse = '|'), "", content, perl = TRUE, ignore.case = TRUE)
regex <- NULL
# Remove sentences/vectors with profanity
if (private$..config$..remove$profanitySentences) {
if (length(private$..config$profanity) == 0) {
profanity <- NLPLists::profanity[,1]
} else {
profanity <- private$..config$profanity
}
pattern <- paste0("\\b", paste0(profanity, collapse = "|"), "(?!\\w)", sep = "")
content <- content[!grepl(pattern, content, perl = TRUE)]
}
# Remove words/vectors with profanity
if (private$..config$..remove$profanityWords) {
if (length(private$..config$profanity) == 0) {
profanity <- NLPLists::profanity[,1]
} else {
profanity <- private$..config$profanity
}
content <- textclean::mgsub_fixed(content, pattern = profanity,
replacement = "",
leadspace = FALSE,
trailspace = FALSE,
fixed = TRUE,
trim = FALSE,
order.pattern = TRUE)
}
# Process Abbreviations
if (private$..config$..replace$abbreviations) {
if (length(private$..config$..abbreviations) == 0) {
abbreviations <- NLPLists::internetAbbreviations
} else {
abbreviations <- private$..config$..abbreviations
}
pattern <- paste0("\\b",unlist(abbreviations[,1]),"\\b")
replacement <- unlist(abbreviations[,2])
content <- stringi::stri_replace_all_regex(content,
replacement = replacement,
pattern = pattern, mode = all,
vectorize_all = FALSE)
}
# Process Hyphens
if (private$..config$..replace$hyphen) {
content <- gsub(pattern = private$..regex$hyphen, replacement = " ",
content, perl = TRUE)
}
# Process Internet Slang
if (private$..config$..replace$slang) {
if (length(private$..config$slang) == 0) {
slang <- NLPLists::internetAbbreviations
} else {
slang <- private$..config$slang
}
pattern <- paste0("\\b",unlist(slang[,1]),"\\b")
replacement <- unlist(slang[,2])
content <- stringi::stri_replace_all_regex(content,
replacement = replacement,
pattern = pattern, mode = all,
vectorize_all = FALSE)
}
# Process Backtick
if (private$..config$..replace$backtick) {
pattern <- private$..regex$backtick$pattern
replacement <- private$..regex$backtick$replacement
content <- gsub(pattern = pattern, replacement = replacement,
content, perl = TRUE, ignore.case = TRUE)
}
# Process Contractions
if (private$..config$..replace$contractions) {
if (length(private$..config$..contractions) == 0) {
contractions <- NLPLists::contractions
} else {
contractions <- private$..config$..contractions
}
pattern <- paste0("\\b",unlist(contractions[,1]),"\\b")
replacement <- unlist(contractions[,2])
content <- stringi::stri_replace_all_regex(content,
replacement = replacement,
pattern = pattern, mode = all,
vectorize_all = FALSE)
}
# Process CurlyQuotes
if (private$..config$..replace$curlyQuotes) {
content <- textclean::replace_curly_quote(x = content)
}
# Process Emoji
if (private$..config$..replace$emoji) {
content <- textclean::replace_emoji(x = content)
}
# Process Emoticon
if (private$..config$..replace$emoticon) {
content <- textclean::replace_emoticon(x = content)
}
# Process Kern
if (private$..config$..replace$kern) {
content <- textclean::replace_kern(x = content)
}
# Process Numbers
if (private$..config$..replace$numbers) {
content <- textclean::replace_number(x = content, num.paste = TRUE)
}
# Process Ordinal
if (private$..config$..replace$ordinal) {
content <- textclean::replace_ordinal(x = content)
}
# Process Symbol
if (private$..config$..replace$symbols) {
content <- textclean::replace_symbol(x = content)
}
# Process Word Elongation
if (private$..config$..replace$wordElongation) {
content <- textclean::replace_word_elongation(x = content)
}
# Process Comma Space
if (private$..config$..add$commaSpace) {
pattern <- private$..regex$commaSpace$pattern
replacement <- private$..regex$commaSpace$replacement
content <- gsub(pattern = pattern, replacement = replacement,
content, perl = TRUE, ignore.case = TRUE)
}
# Clean symbols, punctuation, numbers, stray apostrophe, commas, and extra whitespace.
regex <- NULL
if (private$..config$..remove$symbols) {
regex <- c(regex, private$..regex$symbols)
regex <- c(regex, private$..regex$trailingApostrophe)
}
if (private$..config$..remove$punct) {
regex <- c(regex, private$..regex$punct)
regex <- c(regex, private$..regex$trailingApostrophe)
}
if (private$..config$..remove$numbers)
regex <- c(regex, private$..regex$numbers)
regex <- c(regex, private$..regex$strayApostrophe,
private$..regex$strayComma,
private$..regex$strayHyphen)
content <- gsub(paste0(regex, collapse = '|'), "", content, perl = TRUE, ignore.case = TRUE)
content <- textclean::replace_white(x = content)
document$content <- content
return(document)
},
processCorpus = function() {
documents <- private$..corpus$getDocuments()
for (i in 1:length(documents)) {
document <- private$processDocument(documents[[i]])
private$..corpus$addDocument(document)
}
return(TRUE)
}
),
public = list(
#-------------------------------------------------------------------------#
# Constructor #
#-------------------------------------------------------------------------#
initialize = function(x) {
private$loadServices(name = 'TextStudio')
invisible(self)
},
#-------------------------------------------------------------------------#
# Configure #
#-------------------------------------------------------------------------#
configure = function(x) {
# Validate class of object.
private$..params <- list()
private$..params$classes$name <- list('x')
private$..params$classes$objects <- list(x)
private$..params$classes$valid <- list(c('TextConfig'))
v <- private$validator$validate(self)
if (v$code == FALSE) {
private$logR$log(method = 'configure', event = v$msg, level = "Error")
stop()
}
private$..config <- x$getConfig()
invisible(self)
},
#-------------------------------------------------------------------------#
# Execute Method #
#-------------------------------------------------------------------------#
execute = function(corpus, name = NULL) {
# Validate class of object.
private$..params <- list()
private$..params$classes$name <- list('corpus')
private$..params$classes$objects <- list(corpus)
private$..params$classes$valid <- list(c('Corpus'))
v <- private$validator$validate(self)
if (v$code == FALSE) {
private$logR$log(method = 'execute', event = v$msg, level = "Error")
stop()
}
if (length(private$..config) == 0) {
event <- paste0("TextConfig object is missing. ")
private$logR$log(method = 'execute', event = event, level = "Error")
}
private$..corpus <- Clone$new()$this(corpus, reference = TRUE, content = TRUE)
if (!is.null(name)) private$..corpus$setName(name)
private$processCorpus()
return(private$..corpus)
},
#-------------------------------------------------------------------------#
# Visitor Method #
#-------------------------------------------------------------------------#
accept = function(visitor) {
visitor$textStudio(self)
}
)
)
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.