# R/textfunc_textcleaner.R

#' Clean social media (SoMe) text
#'
#' Optionally strips retweet markers, hashtags, handles, URLs and punctuation
#' from the text (in that order), then lower-cases it and trims leading and
#' trailing whitespace. Hashtags and handles are removed before punctuation so
#' that the whole word is dropped rather than only its '#' / '@' prefix.
#' Whitespace inside the text is not collapsed, so removing a word from the
#' middle of a string can leave two consecutive spaces.
#'
#' @param textdata character vector; The text you want cleaned
#' @param removePunctuation logical; If TRUE (default) it will remove all
#'   punctuation characters (the POSIX 'punct' class) from text
#' @param removeURLs logical; If TRUE (default) it will remove all http, https,
#'   ftp and ftps URLs from text, together with one whitespace character
#'   directly in front of each URL
#' @param removeRT logical; If TRUE it will remove 'RT' and 'via' together with
#'   the '@' handles that directly follow them
#' @param removeHashtag logical; If TRUE it will remove all words beginning
#'   with '#' from text
#' @param removeHandles logical; If TRUE it will remove all words beginning
#'   with '@' from text
#' @return Character vector of the same length as the input, lower-cased and
#'   trimmed
#' @export

textfunc.textcleaner <- function(textdata,
                                 removePunctuation = TRUE,
                                 removeURLs = TRUE,
                                 removeRT = FALSE,
                                 removeHashtag = FALSE,
                                 removeHandles = FALSE) {

  # The leading \\b stops 'RT' / 'via' from matching inside longer words
  # (e.g. "START @user" or "trivia @user"), which would truncate the word.
  if (removeRT) {
    textdata <- gsub("\\b(RT|via)((?:\\b\\W*@\\w+)+)", "", textdata)
  }

  if (removeHashtag) textdata <- gsub("[#]\\S+", "", textdata)
  if (removeHandles) textdata <- gsub("[@]\\S+", "", textdata)

  # A URL is treated as a single whitespace-delimited token starting with a
  # scheme. Matching with \\S+ keeps the pattern from running across spaces
  # and swallowing the words that follow a URL without a dot in it.
  if (removeURLs) {
    textdata <- gsub("\\s?(f|ht)tps?://\\S+", "", textdata)
  }

  if (removePunctuation) textdata <- gsub("[[:punct:]]+", "", textdata)

  # Namespace-qualified call instead of library(): a function should not
  # attach packages to the caller's search path.
  textdata <- stringi::stri_trans_tolower(textdata)

  trimws(textdata)
}
# emillykkejensen/textfunc documentation built on May 16, 2019, 5:08 a.m.