
#' Check quality and toxicity of an input string
#' Check quality of the string in terms of spelling errors and toxicity content.
#' The function performs necessary cleaning on the input string.
#' Created on 09 February, 2019
#' Authors: Harjyot Kaur
#' Comparison is done with pre-existing list of
#' exhaustive english words to calculate the spelling errors in the string.
#' Comparison is done with pre-existing list of
#' exhaustive toxic-english words to calculate the toxicity in the string.
#' Takes in a string and returns a data.frame with one row. The first three
#' columns hold the misspelt words, their count and their proportion in the
#' input string; the last three columns hold the same measures for toxic words.
#' @param txt string
#' @return data.frame
#' @import stringr
#' @export
#' @examples
#' txt <- "This str has words spelllll wrong. This string has a slag word shitty."
#' quality <- text_quality(txt)

text_quality <- function(txt) {
  # Reject anything that is not character data before doing any work.
  if (!is.character(txt)) {
    stop("Input text must be a string.")
  }

  # Clean the text once so that both checks operate on the same tokens.
  cleaned_text <- clean_text_quality(txt)

  # The first element of en_dictionary is handed to the spell checker and the
  # second to the toxicity checker.
  # NOTE(review): en_dictionary is not defined in this file - presumably
  # package data holding the English and profane word lists; confirm.
  spelling_mistakes <- spell_check(en_dictionary[[1]], cleaned_text)
  toxic_content <- toxicity_check(en_dictionary[[2]], cleaned_text)

  # Single-row result: spelling columns followed by toxicity columns.
  cbind(spelling_mistakes, toxic_content)
}

# Helper function that cleans the text before the quality checks.
# Strips "$" ticker markers, line breaks, links, punctuation and purely
# numeric tokens, then collapses repeated whitespace.
# Example: "Price $TSLA is 100\nsee https://t.co/abc now!"
#       -> "Price TSLA is see now"
clean_text_quality <- function(txt) {
  # Drop the "$" that prefixes ticker symbols (the symbol text itself is kept)
  no_tickers <- gsub("\\$", "", txt)
  # Turn line breaks into spaces so words on adjacent lines are not fused
  no_newlines <- gsub("[\r\n]+", " ", no_tickers)
  # Remove links while "://" is still intact, i.e. before punctuation is
  # stripped; otherwise URL fragments would survive as pseudo-words
  no_links <- gsub("https?://\\S+", " ", no_newlines)
  # Replace special characters with spaces
  no_punctuation <- gsub("[[:punct:]]", " ", no_links)
  # Remove tokens that consist of digits only
  no_numbers <- gsub("\\b\\d+\\b", "", no_punctuation)
  # Collapse the whitespace left behind by the removals above
  str_squish(no_numbers)
}

# Checks the spelling of the input words.
#
# eng_words: vector of known English words.
# txt:       cleaned text whose words are separated by single spaces.
#
# Returns a one-row data.frame with the columns
#   spell_error            - "" when nothing is misspelt, otherwise a list
#                            cell holding the unique misspelt words
#                            (lower-cased)
#   count_spell_error      - number of misspelt tokens (duplicates counted)
#   proportion_spell_error - count divided by the number of tokens in txt
spell_check <- function(eng_words, txt) {
  spell_error_df <- data.frame(
    spell_error = "",
    count_spell_error = 0,
    proportion_spell_error = 0,
    stringsAsFactors = FALSE
  )

  # Work on tokens rather than on one giant alternation regex: lookups with
  # %in% are not affected by regex metacharacters in the dictionary and do
  # not leave empty tokens behind that would inflate the error count.
  tokens <- unlist(strsplit(txt, split = " ", fixed = TRUE))
  tokens <- tokens[!is.na(tokens) & nzchar(tokens)]
  if (length(tokens) == 0) {
    return(spell_error_df)
  }

  # Stage 1: keep the tokens the dictionary does not know (case-sensitive)
  unknown <- tokens[!tokens %in% eng_words]

  # Stage 2: strip capitalised fragments, which are treated as proper nouns
  unknown <- gsub("[A-Z]([a-z]+)", "", unknown)
  unknown <- unknown[nzchar(unknown)]

  # Stage 3: retry the dictionary in lower case, e.g. for all-caps words
  misspelt <- tolower(unknown)
  misspelt <- misspelt[!misspelt %in% eng_words]
  if (length(misspelt) == 0) {
    return(spell_error_df)
  }

  spell_error_df$spell_error <- list(unique(misspelt))
  spell_error_df$count_spell_error <- length(misspelt)
  spell_error_df$proportion_spell_error <- length(misspelt) / length(tokens)
  spell_error_df
}

# Checks the input words against a list of profane words.
#
# profane_words: vector of toxic / profane words.
# txt:           cleaned text whose words are separated by single spaces.
#
# Returns a one-row data.frame with the columns
#   toxic_words            - "" when nothing toxic is found, otherwise a list
#                            cell holding the unique toxic words (lower-cased)
#   count_toxic_words      - number of toxic tokens (duplicates counted)
#   proportion_toxic_words - count divided by the number of tokens in txt
toxicity_check <- function(profane_words, txt) {
  toxic_words_df <- data.frame(
    toxic_words = "",
    count_toxic_words = 0,
    proportion_toxic_words = 0,
    stringsAsFactors = FALSE
  )

  tokens <- unlist(strsplit(txt, split = " ", fixed = TRUE))
  tokens <- tokens[!is.na(tokens) & nzchar(tokens)]
  if (length(tokens) == 0) {
    return(toxic_words_df)
  }

  # Whole-token lookup, equivalent to wrapping every profane word in \\b...\\b
  # but safe against regex metacharacters in the word list.
  # NOTE(review): matching is done case-insensitively so that capitalised
  # profanity is caught as well - confirm this is the intended behaviour.
  lowered <- tolower(tokens)
  toxic_words <- lowered[lowered %in% tolower(profane_words)]
  if (length(toxic_words) == 0) {
    return(toxic_words_df)
  }

  toxic_words_df$toxic_words <- list(unique(toxic_words))
  toxic_words_df$count_toxic_words <- length(toxic_words)
  toxic_words_df$proportion_toxic_words <- length(toxic_words) / length(tokens)
  toxic_words_df
}
# UBC-MDS/RSyntext documentation built on May 7, 2019, 7:14 p.m.