#' @title Clean text across several dimensions
#'
#' @description A convenience wrapper around some very commonly used text
#'   cleaning tasks. The text is always lowered, literal backslash sequences
#'   are stripped, and runs of whitespace are collapsed and trimmed. The
#'   remaining steps (URLs, numbers, punctuation, leading "rt", stop words)
#'   are each controlled by a logical switch and run in that order.
#'
#' @details Progress is reported with \code{message()}, so it can be silenced
#'   with \code{suppressMessages()}. Each switch is tested with
#'   \code{isTRUE()}: anything other than a single \code{TRUE} disables the
#'   step. Stop word removal requires the tm package; all other steps use
#'   base R only.
#'
#' @param text.clean character vector
#' @param punct logical, indicating whether to remove punctuation (apostrophes
#'   are dropped, all other punctuation is replaced by a space)
#' @param numbs logical, indicating whether to remove numbers
#' @param stopz logical, indicating whether to remove stop words (from the tm package)
#' @param URL logical, indicating whether to remove URLS
#' @param RT logical, indicating whether to remove RT sign from the start of each entry
#' @return character vector with cleaned text, the same length as the input
#' @examples
#' clean.text("RT @user: Check https://example.com now!", stopz = FALSE)
#' @export
clean.text <- function(text.clean,
                       punct = TRUE,
                       numbs = TRUE,
                       stopz = TRUE,
                       URL = TRUE,
                       RT = TRUE) {
  message(paste0("Starting to clean text! The time is: ", Sys.time()))
  text.clean <- tolower(text.clean)

  # fixed = TRUE makes these literal matches: first the two-character sequence
  # backslash + "n" (an escaped newline that survived as text), then any
  # remaining backslash. Real newline characters are handled by the final
  # whitespace collapse.
  # from @RichScriven at https://stackoverflow.com/questions/11936339/in-r-replace-text-within-a-string
  text.clean <- gsub("\\n", " ", text.clean, fixed = TRUE)
  text.clean <- gsub("\\", "", text.clean, fixed = TRUE)
  message("\\n values and weird spaces removed, and text lowered")

  if (isTRUE(URL)) {
    # A URL is the scheme plus everything up to the next whitespace. The match
    # must stop at whitespace: otherwise a URL without a dot (e.g. a truncated
    # link) swallows the following words up to the next "." or "/".
    text.clean <- gsub("\\s?(f|ht)tps?://\\S*", "", text.clean)
    # remove anything beginning "htt" - nearly always a broken off URL string -
    # from https://stackoverflow.com/questions/22615188/remove-all-words-that-start-with-from-a-string
    text.clean <- gsub("htt\\w+ *", " ", text.clean)
    # The pattern above needs at least one character after "htt", so a bare
    # "htt" word is removed separately (same regex tm::removeWords would build).
    text.clean <- gsub("(*UCP)\\b(htt)\\b", "", text.clean, perl = TRUE)
    message("URLs removed. This is the default setting")
  } else {
    message("not removing URLs as you set URL == F")
  }

  if (isTRUE(numbs)) {
    # https://stackoverflow.com/questions/13590139/remove-numbers-from-alphanumeric-characters
    text.clean <- gsub("[[:digit:]]+", " ", text.clean)
    message("numbers removed. This is the default setting")
  } else {
    message("not removing numbers as you set numbs == F")
  }

  if (isTRUE(punct)) {
    # Apostrophes are deleted (not replaced by a space) so contractions stay
    # one token: "don't" -> "dont".
    text.clean <- gsub("'", "", text.clean)
    text.clean <- gsub("[[:punct:]]+", " ", text.clean)
    message("text lowered, contractions removed, punctuation removed. This is the default setting")
  } else {
    message("not removing punctuation as you set punct == F")
  }

  if (isTRUE(RT)) {
    # Only a standalone leading "rt" token is dropped. The word boundary keeps
    # entries such as "rtl ..." intact, and leading whitespace is tolerated
    # because earlier steps may have replaced a leading quote or URL by a space.
    text.clean <- gsub("^\\s*rt\\b", "", text.clean)
    message("RTs removed. This is the default setting")
  } else {
    message("not removing RTs as you set RT == F")
  }

  if (isTRUE(stopz)) {
    # YES! We remove stopwords but don't stem
    # discussion on SO: https://stackoverflow.com/questions/34721984/stopword-removing-when-using-the-word2vec
    # NOTE(review): with punct = TRUE apostrophes are already gone at this
    # point, so apostrophised stop words presumably no longer match - confirm
    # whether that is intended.
    text.clean <- tm::removeWords(text.clean, tm::stopwords(kind = "english"))
    message("stopwords removed. This is the default setting")
  } else {
    message("not removing stopwords as you set stopz == F")
  }

  # Collapse the whitespace runs left behind by the removals above.
  text.clean <- gsub(pattern = "\\s+", replacement = " ", x = text.clean)
  text.clean <- base::trimws(text.clean)
  message(paste0("Done with cleaning! The time is: ", Sys.time()))
  text.clean
}
# Add the following code to your website.
# For more information on customizing the embed code, read Embedding Snippets.