R/textTools.R
In politeness: Detecting Politeness Features in Text

Documented in cleanpunct ctxpand noLeadDash usWords

#' Text Counter
#'
#' @description Counts total prevalence of a set of items in each of a set of texts.
#' @param counted character vector of items to search for in the texts.
#' @param texts character vector of to-be-searched text.
#' @param words logical. Default FALSE. Does \code{counted} contain words, or sequences of chracters?
#' @param fixed logical. Default TRUE. Use literal characters instead of regular expressions?
#' @param start logical. Default FALSE. Does \code{counted} only look at the start of a sentence?
#' @param num_mc_cores integer Number of cores for parallelization. Default is 1.
#' @return numeric vector as long as \code{texts} indicating total frequencies of \code{counted} items.
#' @keywords internal
#'
textcounter<-function (counted, texts, words=FALSE, fixed = TRUE, start=FALSE,
                       num_mc_cores = 1) {


  if(words){
    if(start){
      texts=unlist(parallel::mclapply(texts,function(x) x[1], mc.cores=num_mc_cores))
    }
    counts<-unlist(parallel::mclapply(texts,function(x) sum(unlist(x)%in%counted), mc.cores=num_mc_cores))
  }else {
    counts <- rep(0, length(texts))
    for (x in counted) {
      counts <- counts + unlist(sapply(gregexpr(x, texts, fixed = fixed),
                                       function(z) ifelse(z[1] == (-1), 0, length(z))))
    }
  }
  counts[is.na(counts)]<-0
  return(counts)
}

#' Clean Text
#' @description Basic text cleaning
#' @param text character text to be cleaned
#' @param language string. Default "english".
#' @param stop.words logical. Default TRUE
#' @return a character vector
#' @keywords internal
cleantext<-function (text, language = "english", stop.words = TRUE) {
  text <- tolower(text)
  text <- gsub("[[:punct:]]", " ", text)
  text <- gsub("[[:cntrl:]]", " ", text)
  if (length(stop.words) > 1) {
    text <- tm::removeWords(text, stop.words)
  }
  else if (stop.words) {
    text <- tm::removeWords(text, tm::stopwords(language))
  }
  text <- tm::removeNumbers(text)
  text <- tm::stripWhitespace(text)
  text <- paste0(" ",text," ")
  return(as.character(text))
}

#' Contraction Expander
#' @description Expands Contractions
#' @param text a character vector of texts.
#' @return a character vector
#' @keywords internal
ctxpand<-function(text){
  text<-sapply(text, function(x) gsub("let's", "let us", x, fixed=TRUE))
  text<-sapply(text, function(x) gsub("won't", "will not", x, fixed=TRUE))
  text<-sapply(text, function(x) gsub("can't", "cannot", x, fixed=TRUE))
  text<-sapply(text, function(x) gsub("shan't", "shall not", x, fixed=TRUE))
  text<-sapply(text, function(x) gsub("'d", " would", x, fixed=TRUE))
  text<-sapply(text, function(x) gsub("'ve", " have", x, fixed=TRUE))
  text<-sapply(text, function(x) gsub("'s", " is", x, fixed=TRUE))
  text<-sapply(text, function(x) gsub("'ll", " will", x, fixed=TRUE))
  text<-sapply(text, function(x) gsub("'re", " are", x, fixed=TRUE))
  text<-sapply(text, function(x) gsub("n't", " not", x, fixed=TRUE))
  text<-sapply(text, function(x) gsub("u.s.a.", "usa", x, fixed=TRUE))
  text<-sapply(text, function(x) gsub("u.s.", "usa", x, fixed=TRUE))
  text<-sapply(text, function(x) gsub("e.g.", "eg", x, fixed=TRUE))
  text<-sapply(text, function(x) gsub("i.e.", "ie", x, fixed=TRUE))
  text<-sapply(text, function(x) gsub("i'm", "i am", x, fixed=TRUE))
  text<-sapply(text, function(x) gsub("i i ", "i ", x, fixed=TRUE))
  text<-sapply(text, function(x) gsub("i, i ", "i ", x, fixed=TRUE))
  text<-sapply(text, function(x) gsub("I'm", "I am", x, fixed=TRUE))
  text<-sapply(text, function(x) gsub("I I ", "I ", x, fixed=TRUE))
  text<-sapply(text, function(x) gsub("I, I ", "I ", x, fixed=TRUE))
  names(text)<-NULL
  return(text)
}


#' UK to US conversion
#' @description background function to load.
#' @param text character Vector of strings to convert to US spelling.
#' @return character Vector of Americanized strings.
#' @keywords internal
usWords<-function(text){
  toks <- quanteda::tokens(text)
  tokUS<-quanteda::tokens_lookup(toks, politeness::uk2us,
                                 exclusive = FALSE,capkeys = FALSE)
  sentUS<-unlist(lapply(tokUS,paste, collapse=" "),use.names = FALSE)
  return(sentUS)
}

#' Cleaning weird encodings
#' @description Handles curly quotes, umlauts, etc.
#' @param text character Vector of strings to clean.
#' @return character Vector of clean strings.
#' @keywords internal
cleanpunct<-function(text){
  # text<-gsub("‘", "'",text)
  # text<-gsub("’", "'", text)
  # text<-gsub("“", '"', text)
  # text<-gsub("”", '"', text)
  #text<-gsub(",Äô", '\'', text,fixed=T)
  text<-gsub(",", ", ", text,fixed=TRUE, useBytes = TRUE)
  text<-gsub(".", ". ", text,fixed=TRUE)
  text<-gsub(". .", "..", text,fixed=TRUE)
  text<-gsub("[\u201C\u201D\u201E\u201F\u2033\u2036]", '"', text)
  text<-gsub("[\u2018\u2019\u201A\u201B\u2032\u2035]", "'", text)
  text<-stringi::stri_trans_general(text, "latin-ascii")
  text=unlist(lapply(text, noLeadDash))
  return(text)
}

#' Cleaning leading punctuation
#' @description Handles interruption dashes.
#' @param text character Vector of strings to clean.
#' @return character Vector of clean strings.
#' @keywords internal
noLeadDash<-function(text){
  tLoc<-gregexpr("[[:alpha:]]",text)[[1]]
  text=substr(text,tLoc,nchar(text))
  return(text)
}