#' Clean Raw Text
#'
#' Clean and format the raw text of every document in a dataframe. Each row's
#' text is lowercased, stripped of quotes and line breaks, has contractions
#' expanded, has the words listed in `omissions_2023$target` removed, and is
#' finally reduced to space-separated alphabetic tokens. Every row is cleaned
#' independently, so `doc_clean` has one cleaned string per input row.
#'
#' @name cleanme
#' @param x A dataframe with raw text in a column titled "doc_text". Usually
#'   first generated by readme().
#' @return The input dataframe with an added column of cleaned text documents
#'   (doc_clean); all original columns (e.g. document ID and raw text) are
#'   preserved.
#' @importFrom tibble as_tibble
#' @importFrom here here
#' @importFrom dplyr group_by
#' @importFrom textclean replace_date
#' @importFrom textclean replace_contraction
#' @importFrom tm removeWords
#' @importFrom tm stripWhitespace
#' @keywords internal
#' @export cleanme
cleanme <- function(x) {
  if (!is.data.frame(x) || !("doc_text" %in% names(x))) {
    stop("`x` must be a dataframe with a \"doc_text\" column.", call. = FALSE)
  }
  # load(here::here("data", "omissions_2023.rda")) only use if in markdown
  omissions <- omissions_2023
  message("Performing text cleaning.... sit tight!")

  # Nothing to clean: return the (empty) input with an empty doc_clean column.
  if (nrow(x) == 0L) {
    return(cbind(x, doc_clean = character(0)))
  }

  txt <- x[["doc_text"]]
  txt <- tolower(txt)
  txt <- gsub("\"", " ", txt) # double quotes -> space
  txt <- gsub("\n", " ", txt) # line breaks -> space
  txt <- textclean::replace_date(txt)
  # Replace tick marks with apostrophes so contractions are recognised.
  txt <- gsub("`", "'", txt)
  # Drop any digit that is immediately followed by a period (both characters).
  txt <- gsub("(\\d)(\\.)", "", txt)
  txt <- textclean::replace_contraction(txt)
  # Add a space between any alphabetic character and following punctuation.
  txt <- gsub("([[:alpha:]])([[:punct:]])", "\\1 \\2", txt)
  txt <- gsub("-", " ", txt) # hyphens -> space
  txt <- tm::removeWords(txt, omissions$target)
  txt <- gsub("\\d+(st|nd|rd|th)", " ", txt) # ordinals such as 6th, 23rd
  # Keep only alphabetic characters, semicolons, periods and commas.
  txt <- gsub("[^a-zA-Z;.,]", " ", txt)
  txt <- gsub("\\b[a-z]\\b{1}", " ", txt) # singleton alphabetic characters
  txt <- gsub("\\;", "\\.", txt) # semicolons -> periods
  txt <- gsub("\\s{2,}", " ", txt) # runs of whitespace -> single space

  # Split on spaces and re-join *within each document*. Collapsing the whole
  # vector at once would merge all documents into one string, which would then
  # be recycled into every row of the result.
  txt <- vapply(strsplit(txt, " "), paste, character(1), collapse = " ")

  # Turn a conjunction that is followed by at least two words into a sentence
  # break. NOTE(review): the leading greedy `(.*)` means each call appears to
  # rewrite only the last occurrence per document, and the periods are removed
  # by the next step -- confirm this is the intended behaviour.
  txt <- gsub("(.*)(, and* )(\\w{1,}\\s\\w{1,})", "\\1. \\3", txt)
  txt <- gsub("(.*)(, but* )(\\w{1,}\\s\\w{1,})", "\\1. \\3", txt)
  txt <- gsub("(.*)(,* because* )(\\w{1,}\\s\\w{1,})", "\\1. \\3", txt)
  txt <- gsub("(.*)(,* then* )(\\w{1,}\\s\\w{1,})", "\\1. \\3", txt)

  txt <- gsub("([[:punct:]])", "", txt) # remove all remaining punctuation
  doc_clean <- tm::stripWhitespace(txt)

  # Bind by variable name so the new column is called "doc_clean".
  cbind(x, doc_clean)
}
# Add the following code to your website.
# For more information on customizing the embed code, read Embedding Snippets.