# R/clean.text.R

#' @title Clean text across several dimensions
#'
#' @description A convenience wrapper around some very commonly used text
#'   cleaning tasks. The text is always lower-cased, literal backslash-n
#'   sequences and stray backslashes are always removed, and runs of
#'   whitespace are always collapsed to a single space and trimmed. The
#'   optional steps run in this order: URLs, numbers, punctuation, leading
#'   retweet marker, stop words. Progress is reported with \code{print()}.
#'
#' @param text.clean character vector
#' @param punct logical (single \code{TRUE}/\code{FALSE}), indicating whether
#'   to remove punctuation. Apostrophes are deleted outright (so "don't"
#'   becomes "dont"); every other run of punctuation becomes a space.
#' @param numbs logical (single \code{TRUE}/\code{FALSE}), indicating whether
#'   to remove numbers
#' @param stopz logical (single \code{TRUE}/\code{FALSE}), indicating whether
#'   to remove English stop words (from the tm package)
#' @param URL logical (single \code{TRUE}/\code{FALSE}), indicating whether to
#'   remove URLs
#' @param RT logical (single \code{TRUE}/\code{FALSE}), indicating whether to
#'   remove the RT sign from the start of each entry. Only the standalone
#'   word "rt" is removed; entries that merely begin with the letters "rt"
#'   are left alone.
#' @return character vector with cleaned text
#' @examples
#' clean.text("RT hello: World! 123", stopz = FALSE, URL = FALSE)
#' @export
clean.text <- function(text.clean,
                       punct = TRUE,
                       numbs = TRUE,
                       stopz = TRUE,
                       URL = TRUE,
                       RT = TRUE) {

  print(paste0('Starting to clean text! The time is: ', Sys.time()))

  text.clean <- tolower(text.clean)
  # With fixed = TRUE the pattern '\\n' is the literal two characters
  # backslash + n (an escaped newline left in the data), not a real newline.
  # Real newlines are whitespace and get folded by the collapse at the end.
  # From @RichScriven at
  # https://stackoverflow.com/questions/11936339/in-r-replace-text-within-a-string
  text.clean <- gsub('\\n', ' ', text.clean, fixed = TRUE)
  # Drop any remaining literal backslashes.
  text.clean <- gsub('\\', '', text.clean, fixed = TRUE)
  print('\n values and weird spaces removed, and text lowered')


  if (isTRUE(URL)) {
    # Remove well-formed http/https/ftp URLs (plus one leading whitespace).
    text.clean <- gsub("\\s?(f|ht)(tp)(s?)(://)([^\\.]*)[\\.|/](\\S*)", "", text.clean)
    # Remove anything beginning "htt" - nearly always a broken off URL string.
    # From
    # https://stackoverflow.com/questions/22615188/remove-all-words-that-start-with-from-a-string
    text.clean <- gsub("htt\\w+ *", " ", text.clean)
    # The pattern above needs at least one word character after "htt", so a
    # bare "htt" survives it; remove that specific word separately.
    text.clean <- tm::removeWords(text.clean, "htt")
    print('URLs removed. This is the default setting')
  } else {
    print('not removing URLs as you set URL == F')
  }


  if (isTRUE(numbs)) {
    # https://stackoverflow.com/questions/13590139/remove-numbers-from-alphanumeric-characters
    text.clean <- gsub('[[:digit:]]+', ' ', text.clean)
    print('numbers removed. This is the default setting')
  } else {
    print('not removing numbers as you set numbs == F')
  }


  if (isTRUE(punct)) {
    # Delete apostrophes first so contractions stay one token ("don't" ->
    # "dont") instead of being split by the punctuation -> space rule below.
    text.clean <- gsub("'", "", text.clean)
    text.clean <- gsub('[[:punct:]]+', ' ', text.clean)
    print('text lowered, contractions removed, punctuation removed. This is the default setting')
  } else {
    print('not removing punctuation as you set punct == F')
  }


  if (isTRUE(RT)) {
    # Remove a leading retweet marker. The word boundary keeps entries that
    # merely start with the letters "rt" (e.g. "rtl ...") intact, and the
    # optional leading whitespace lets the marker be found when earlier steps
    # left a space in front of it (that whitespace is trimmed at the end
    # regardless).
    text.clean <- gsub('^\\s*rt\\b', '', text.clean)
    print('RTs removed. This is the default setting')
  } else {
    print('not removing RTs as you set RT == F')
  }


  if (isTRUE(stopz)) {
    # Remove stop words but deliberately do not stem; discussion on SO:
    # https://stackoverflow.com/questions/34721984/stopword-removing-when-using-the-word2vec
    text.clean <- tm::removeWords(text.clean, tm::stopwords(kind = 'english'))
    print('stopwords removed. This is the default setting')
  } else {
    print('not removing stopwords as you set stopz == F')
  }

  # The replacements above leave runs of spaces behind; collapse every run of
  # whitespace (including real newlines and tabs) and trim both ends.
  text.clean <- gsub(x = text.clean,
                     pattern = "\\s+",
                     replacement = " ")
  text.clean <- base::trimws(text.clean)

  print(paste0('Done with cleaning! The time is: ', Sys.time()))
  text.clean
}
# bvidgen/tc documentation built on May 9, 2019, 2:21 a.m.