R/cleanme_23.R

Defines functions cleanme

Documented in cleanme

#' Clean Raw Text
#'
#' Clean and format raw text, one document per row.
#'
#' Each document is lower-cased, stripped of quotes and newlines, has dates
#' and contractions expanded (via \pkg{textclean}), has the words listed in
#' `omissions_2023$target` removed (via \pkg{tm}), and is finally reduced to
#' alphabetic tokens separated by single spaces (all digits and punctuation
#' are dropped).
#'
#' @name cleanme
#' @param x A data frame with raw text in a column titled "doc_text".
#'   Usually first generated by readme().
#' @return The input data frame with one extra column, `doc_clean`, holding
#'   the cleaned text of each row; all original columns (e.g. the document ID
#'   and the raw text) are preserved.
#' @importFrom tibble as_tibble
#' @importFrom here here
#' @importFrom dplyr group_by
#' @importFrom textclean replace_date
#' @importFrom textclean replace_contraction
#' @importFrom tm removeWords
#' @importFrom tm stripWhitespace
#' @keywords internal
#' @export cleanme

cleanme <- function(x) {
  if (!is.data.frame(x) || !"doc_text" %in% names(x)) {
    stop("`x` must be a data frame with a `doc_text` column.", call. = FALSE)
  }

  # load(here::here("data", "omissions_2023.rda"))  only use if in markdown
  omissions <- omissions_2023
  message("Performing text cleaning.... sit tight!")

  # Work on the text column only; `x` itself is returned untouched (plus the
  # new column), so no grouping or copy of the data frame is needed.
  txt <- tolower(x[["doc_text"]])
  txt <- gsub("\"", " ", txt)
  txt <- gsub("\n", " ", txt)
  txt <- textclean::replace_date(txt)
  # Replace tick marks with apostrophes so contractions are recognised.
  txt <- gsub("`", "'", txt)
  # Drop a digit together with the period that follows it (e.g. "3." in
  # "3.5"); remaining digits are removed further down.
  txt <- gsub("(\\d)(\\.)", "", txt)
  txt <- textclean::replace_contraction(txt)
  # Add a space between any alphabetic character and following punctuation.
  txt <- gsub("([[:alpha:]])([[:punct:]])", "\\1 \\2", txt)
  # Replace all hyphens with spaces.
  txt <- gsub("-", " ", txt)
  txt <- tm::removeWords(txt, omissions$target)
  # Omit ordinal numbers such as 6th, 23rd.
  txt <- gsub("\\d+(st|nd|rd|th)", " ", txt)
  # Keep only alphabetic characters, commas, semicolons and periods.
  txt <- gsub("[^a-zA-Z;.,]", " ", txt)
  # Omit any singleton alphabetic character.
  txt <- gsub("\\b[a-z]\\b{1}", " ", txt)
  # Replace semicolons with periods.
  txt <- gsub("\\;", "\\.", txt)
  # Replace two or more whitespace characters with a single space.
  txt <- gsub("\\s{2,}", " ", txt)

  # Split and re-join each document on its own. Collapsing the whole vector
  # at once would merge every document into a single string that is then
  # recycled across all rows.
  txt <- vapply(
    strsplit(txt, " "),
    paste,
    character(1),
    collapse = " ",
    USE.NAMES = FALSE
  )

  # Turn a trailing conjunction clause into its own sentence. Because the
  # leading `(.*)` is greedy, effectively only the last match in each
  # document is rewritten.
  # NOTE(review): `and*`, `but*`, `because*` and `then*` make only the final
  # letter optional/repeatable, so e.g. `then*` also matches "the". The
  # patterns are kept unchanged here -- confirm the intent before tightening.
  txt <- gsub("(.*)(, and* )(\\w{1,}\\s\\w{1,})", "\\1. \\3", txt)
  txt <- gsub("(.*)(, but* )(\\w{1,}\\s\\w{1,})", "\\1. \\3", txt)
  txt <- gsub("(.*)(,* because* )(\\w{1,}\\s\\w{1,})", "\\1. \\3", txt)
  txt <- gsub("(.*)(,* then* )(\\w{1,}\\s\\w{1,})", "\\1. \\3", txt)

  # Remove all remaining punctuation, including the periods inserted above.
  txt <- gsub("([[:punct:]])", "", txt)
  doc_clean <- tm::stripWhitespace(txt)

  cbind(x, doc_clean = doc_clean)
}
bzuck-temple/TextDistanceBeta documentation built on Jan. 29, 2023, 6:37 p.m.