R/cleanTextFns.R

Defines functions interactiveSpellCheck.df InteractiveFindReplace.df interactiveSpellCheck.vector InteractiveFindReplace.vector findReplace checkByContext checkSpelling vec2Words

Documented in checkByContext checkSpelling findReplace InteractiveFindReplace.df InteractiveFindReplace.vector interactiveSpellCheck.df interactiveSpellCheck.vector

###########################################
###          HELPER FUNCTIONS           ###
###########################################

# turn a character vector into a named list of words with the
# original being the name/key and the value being the lowered / non-punctuation version
vec2Words <- function(charVec) {
  charVec <- unique(charVec) # remove dupe elements of vector
  charVec <- charVec[!is.na(charVec)] # remove NA elements of vector
  charVec <- tolower(charVec) # lowercase everything
  charVec <- gsub("[[:punct:]]", " ", charVec) # rm punctuation except hyphens and digits
  charVec <- gsub("\\d+", " ", charVec) # rm digits
  charVec <- gsub("\\s{2,}", " ", charVec) # rm extra spaces generated by previous line
  words <- unique(unlist(strsplit(charVec, " "))) # make list of unique words
  words <- words[nchar(words) > 2] # remove words with only 1 character, possibly due to subs
}

###########################################
###            MAIN FUNCTIONS           ###
###########################################

#' @title check for spelling errors in a character vector
#'
#' @description takes character and returns list of problematic
#'     spellings and suggestions for replacement
#'
#' @param input character vector
#' @import hunspell
#' @export
checkSpelling <- function(input) {
  res <- unique(unlist(hunspell(unique(input)))) # unique wrapper in case single long char vec
  probs <- res[lengths(res) > 0]
  if (!is.null(probs)) {
    suggestions <- hunspell_suggest(probs)
    names(suggestions) <- probs
    return(suggestions)
  } else {
    message("No misspellings found!")
    return(NULL)
  }
}

#' @title Check a character vector against words in ImmuneSpace
#'
#' @description checkByContext compares non-stopwords in a vector to those
#'     already in ImmuneSpace. Any words not already in ImmuneSpace are analyzed
#'     via stringdist to find closest matches.  These matches are returned in a
#'     named list.
#'
#' @param input character vector
#' @importFrom stopwords stopwords
#' @importFrom stringdist stringdist
#' @importFrom SnowballC wordStem
#' @export
checkByContext <- function(input) {
  # assume that user has run checkSpelling and no mis-spellings in input
  # so this looks for issues like "does" in place of "doses"
  # TODO: take closer look at stopwords!

  if (length(input) == 1) {
    words <- unique(vec2Words(input))
  } else {
    words <- input
  }

  # for ease of use
  freqDF <- data.frame(R2i::ISFreqsAll, stringsAsFactors = F)
  colnames(freqDF) <- c("fullWord", "frequency")

  # remove non-analytical words
  names(words) <- words <- words[!(words %in% stopwords::stopwords(source = "smart"))]
  words <- tolower(words)
  words <- words[!(words %in% names(R2i::ISFreqsAll))] # rm words that are in IS dbase

  # Make DF for relating stems and most likely full words
  freqDF$stems <- SnowballC::wordStem(names(R2i::ISFreqsAll))
  freqDF <- freqDF[order(freqDF$stems, -freqDF$frequency), ]

  # find closest stems and return possible fullwords in frequency order
  res <- sapply(words, USE.NAMES = T, function(x) {
    # get string distances using OSA method
    dists <- stringdist::stringdist(SnowballC::wordStem(x), freqDF$stems)
    poss <- as.vector(freqDF$fullWord[dists == min(dists)]) # get rid of levels
  })
}

#' @title find and replace characters in a vector
#'
#' @description find all instances of a word in a vector
#'     and replace it with another word
#'
#' @param input character vector
#' @param find word to find
#' @param replace replacement word
#' @export
findReplace <- function(input, find, replace) {
  tmp <- sapply(input, function(x) {
    gsub(find, replace, x)
  })
  return(unname(tmp))
}

#' @title Interactively find and replace problematic words in a vector
#'
#' @description For each word in the output list from checkSpelling()
#'     find the word in the inputVector and allow the user to enter a
#'     replacement word
#'
#' @param misspelledWords named list of problematic words and suggested replacements
#' @param inputVector character vector
#' @param outFile filepath for where to append lines of code
#' @export
InteractiveFindReplace.vector <- function(misspelledWords, inputVector, outFile = NULL) {
  ret <- inputVector
  message("NOTE: leaving the replacement field blank means do not replace.")

  for (nm in names(misspelledWords)) {
    message(paste0("word not found: ", nm))
    message("Possible suggestions: ")
    print(misspelledWords[[nm]])
    rep <- readline(prompt = paste0("enter replacement for ", nm, ": "))
    if (rep == "") {
      rep <- nm
    }
    message("")

    ret <- gsub(pattern = nm, replacement = rep, x = ret)

    if (!is.null(outFile)) {
      codeLn <- paste0("gsub('", nm, "', '", rep, "', x) }))")
      cat(codeLn, file = outFile, append = TRUE)
    }
  }

  return(ret)
}


#' @title Interactively check spelling against dictionary and ImmuneSpace-specific words
#'
#' @description Given an input vector and output directory, the user may
#'     correct words that are not found in a standard dictionary or a
#'     current list of ImmuneSpace specific terms.
#'
#' @param inputVector vector, only character type will be worked on
#' @param vectorName name of vector for use in output R doc
#' @param outputDir filepath for where to append lines of code
#' @export
interactiveSpellCheck.vector <- function(inputVector, vectorName, outputDir) {

  # skip if not a character vector
  if (typeof(inputVector) != "character") {
    message("skipping non-character vector")
    return(inputVector)
  }

  # Want file to be executable
  outFile <- paste0(outputDir, "/", vectorName, ".R")

  # write first lines of file
  header <- paste0(
    "# Changes made to ", vectorName, " using interactiveSpellCheck() \n",
    "# at ", Sys.time(), "\n"
  )
  cat(header, file = outFile, append = TRUE)

  # run regular spell-check first
  message("---- Running Spell Check ---- \n")
  misspelledWords <- checkSpelling(inputVector)

  # do findReplace
  tmpVec <- InteractiveFindReplace.vector(
    misspelledWords,
    inputVector,
    outFile
  )

  # run checkByContext
  message("---- Running Context Check ---- \n")
  contextWords <- checkByContext(tmpVec) # fix to not flag regular words like mosquito

  # do findReplace
  resVec <- InteractiveFindReplace.vector(
    contextWords,
    tmpVec,
    outFile
  )

  # Add newlines in case wrapped in sapply statement
  cat("\n\n", file = outFile, append = TRUE)

  names(resVec) <- vectorName

  return(resVec)
}

#' @title Interactively find and replace problematic words in a data frame
#'
#' @description For each word in the output list from checkSpelling()
#'     find the word in the inputVector and allow the user to enter a
#'     replacement word
#'
#' @param misspelledWords named list of problematic words and suggested replacements
#' @param inputDF character vector
#' @param outFile filepath for where to append lines of code
#' @export
InteractiveFindReplace.df <- function(misspelledWords, inputDF, outFile = NULL) {
  message("NOTE: leaving the replacement field blank means do not replace.")

  ret <- inputDF
  for (nm in names(misspelledWords)) {
    message(paste0("word not found: ", nm))
    message("Possible suggestions: ")
    print(misspelledWords[[nm]])
    rep <- readline(prompt = paste0("enter replacement for ", nm, ": "))
    if (rep == "") {
      rep <- nm
    }
    message("")

    ret <- data.frame(lapply(ret, function(x) { # need to deal with case issues
      gsub(
        pattern = nm,
        replacement = rep,
        x
      )
    }))
    colnames(ret) <- colnames(inputDF)

    if (!is.null(outFile)) {
      codeLn <- paste0(
        "\ndata.frame(lapply(inputDF, function(x){ gsub(pattern = '",
        nm, "', replacement = '", rep, "', x) }))"
      )
      cat(codeLn, file = outFile, append = TRUE)
    }
  }

  return(ret)
}

#' @title Interactively check spelling against dictionary and ImmuneSpace-specific words
#'
#' @description Given an input dataframe and output directory, the user may
#'     correct words that are not found in a standard dictionary or a
#'     current list of ImmuneSpace specific terms.
#'
#' @param inputDF dataframe, only character type will be worked on
#' @param outputDir filepath for where to append lines of code
#' @param dfName name of df for use in output R doc, default NULL uses "templateName" attribute
#' @export
interactiveSpellCheck.df <- function(inputDF, outputDir, dfName = NULL) {
  if (is.null(dfName)) {
    dfName <- attr(inputDF, "templateName")
  }

  # Want file to be executable
  outFile <- paste0(outputDir, "/", dfName, ".R")

  # write first lines of file
  header <- paste0(
    "# Changes made to ", dfName, " using interactiveSpellCheck() \n",
    "# at ", Sys.time(), "\n"
  )
  cat(header, file = outFile, append = TRUE)

  # get all unique words from entire DF into one vector
  words <- unique(unlist(apply(inputDF, 2, vec2Words)))

  # run regular spell-check first
  message("---- Running Spell Check ---- \n")
  misspelledWords <- checkSpelling(words)

  # do InteractivefindReplace.DF ... creates named list of find:replace pairs and then iterates
  tmpDF <- InteractiveFindReplace.df(misspelledWords, inputDF, outFile)

  # run checkByContext to look at words that are in dictionary, but not accurate (e.g. mistyped)
  message("---- Running Context Check ---- \n")
  chkdWords <- words[!(words %in% names(misspelledWords))]
  contextWords <- checkByContext(chkdWords)

  # do findReplace
  resDF <- InteractiveFindReplace.df(contextWords, tmpDF, outFile)

  # Add newlines in case wrapped in sapply statement
  cat("\n\n", file = outFile, append = TRUE)

  names(resDF) <- dfName

  return(resDF)
}
RGLab/R2i documentation built on May 20, 2021, 10:14 a.m.