R/string_to_word_vector.R

#' string_to_word_vector
#'
#' Convert a string into a TF-IDF vector over its dictionary of stemmed terms.
#'
#' @param string A character string to be converted into a TF-IDF vector
#'
#' @return A matrix of TF-IDF weights for the input string, with one row per retained word-document and one column per stemmed term
#'
#' @details
#' The function converts the input string to lower case, replaces punctuation and
#' special characters with spaces, splits the string into words, removes English
#' stop words, and stems the remaining words with the Snowball stemmer. It then
#' builds a corpus in which each remaining word is a document and computes TF-IDF
#' (term frequency-inverse document frequency) weights. The result is a matrix of
#' TF-IDF weights for the input string.
#'
#' @importFrom SnowballC wordStem
#'
#' @importFrom tm Corpus DocumentTermMatrix VectorSource weightTfIdf
#'
#' @examples
#' string_to_word_vector("The quick brown fox jumps over the lazy dog.")
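#'
#' # A further illustrative call (a sketch only; the exact terms retained depend
#' # on the stop-word list, stemming, and tm's default term filtering):
#' tfidf <- string_to_word_vector("Cells were clustered by their expression profiles")
#' dim(tfidf)       # one row per retained word-document, one column per stemmed term
#' colnames(tfidf)  # the stemmed vocabulary kept after stop-word removal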
#'
#' @rdname string_to_word_vector
#' @export string_to_word_vector
string_to_word_vector <- function(string) {
  # Convert to lower case and replace punctuation and special characters with spaces
  string <- tolower(string)
  string <- gsub("[^[:alnum:]]", " ", string)

  # Split the string into a vector of words, dropping empty tokens left by
  # consecutive separators
  words <- strsplit(string, "\\s+")[[1]]
  words <- words[nzchar(words)]

  # Eliminate stop words
  stop_words <- c("a", "about", "above", "after", "again", "against",
                  "all", "am", "an", "and", "any", "are", "as", "at", "be",
                  "because", "been", "before", "being", "below", "between",
                  "both", "but", "by", "could", "did", "do", "does", "doing",
                  "down", "during", "each", "few", "for", "from", "further",
                  "had", "has", "have", "having", "he", "he'd", "he'll",
                  "he's", "her", "here", "here's", "hers", "herself", "him",
                  "himself", "his", "how", "how's", "i", "i'd", "i'll", "i'm",
                  "i've", "if", "in", "into", "is", "it", "it's", "its",
                  "itself", "let's", "me", "more", "most", "my", "myself",
                  "nor", "of", "on", "once", "only", "or", "other", "ought",
                  "our", "ours", "ourselves", "out", "over", "own", "same", "she",
                  "she'd", "she'll", "she's", "should", "so", "some", "such",
                  "than", "that", "that's", "the", "their", "theirs", "them",
                  "themselves", "then", "there", "there's", "these", "they", "they'd",
                  "they'll", "they're", "they've", "this", "those", "through", "to",
                  "too", "under", "until", "up", "very", "was", "we", "we'd",
                  "we'll", "we're", "we've", "were", "what", "what's", "when",
                  "when's", "where", "where's", "which", "while", "who", "who's",
                  "whom", "why", "why's", "with", "would", "you", "you'd", "you'll",
                  "you're", "you've", "your", "yours", "yourself", "yourselves")
  words <- words[!(words %in% stop_words)]

  # Perform stemming with Snowball Stemmer
  words <- wordStem(words, language="english")

  # Create a text corpus in which each retained word is treated as its own document
  corpus <- Corpus(VectorSource(words))

  # Calculate TF-IDF weights over the word-level corpus
  dtm <- DocumentTermMatrix(corpus, control = list(weighting = weightTfIdf))

  return(as.matrix(dtm))
}
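
# Illustrative standalone sketch (not part of the package code): how the Snowball
# stemming step used above behaves on a few arbitrary sample words, assuming the
# SnowballC package is installed.
# SnowballC::wordStem(c("jumping", "jumps", "jumped"), language = "english")
# # typically returns "jump" "jump" "jump"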