#' string_to_word_vector
#'
#' Convert a string into a TF-IDF weighted document-term matrix.
#'
#' @param string A character string to be converted. Only the first element
#'   is used.
#'
#' @return A numeric matrix of TF-IDF weights as produced by
#'   `as.matrix()` on a `tm` document-term matrix. Every word that survives
#'   stop-word removal is handed to `tm` as its own document, so the matrix
#'   has one row per remaining word and one column per stemmed term. If no
#'   words remain (empty input, only punctuation, or only stop words) an
#'   empty 0 x 0 numeric matrix is returned.
#'
#' @details
#' The function lower-cases the input, replaces punctuation and special
#' characters with spaces (apostrophes are kept until stop words have been
#' removed, so contractions such as "it's" or "they're" are recognised),
#' splits the text into words, drops empty tokens and stop words, stems the
#' remaining words with the Snowball stemmer, builds a corpus from the stems
#' and computes TF-IDF (term frequency-inverse document frequency) weights.
#'
#' @importFrom
#' SnowballC wordStem
#'
#' @importFrom
#' tm DocumentTermMatrix Corpus VectorSource weightTfIdf
#'
#' @examples
#' string_to_word_vector("The quick brown fox jumps over the lazy dog.")
#'
#' @rdname string_to_word_vector
#' @export string_to_word_vector
string_to_word_vector <- function(string) {
  stop_words <- c(
    "a", "about", "above", "after", "again", "against",
    "all", "am", "an", "and", "any", "are", "as", "at", "be",
    "because", "been", "before", "being", "below", "between",
    "both", "but", "by", "could", "did", "do", "does", "doing",
    "down", "during", "each", "few", "for", "from", "further",
    "had", "has", "have", "having", "he", "he'd", "he'll",
    "he's", "her", "here", "here's", "hers", "herself", "him",
    "himself", "his", "how", "how's", "i", "i'd", "i'll", "i'm",
    "i've", "if", "in", "into", "is", "it", "it's", "its",
    "itself", "let's", "me", "more", "most", "my", "myself",
    "nor", "of", "on", "once", "only", "or", "other", "ought",
    "our", "ours", "ourselves", "out", "over", "own", "same", "she",
    "she'd", "she'll", "she's", "should", "so", "some", "such",
    "than", "that", "that's", "the", "their", "theirs", "them",
    "themselves", "then", "there", "there's", "these", "they", "they'd",
    "they'll", "they're", "they've", "this", "those", "through", "to",
    "too", "under", "until", "up", "very", "was", "we", "we'd",
    "we'll", "we're", "we've", "were", "what", "what's", "when",
    "when's", "where", "where's", "which", "while", "who", "who's",
    "whom", "why", "why's", "with", "would", "you", "you'd", "you'll",
    "you're", "you've", "your", "yours", "yourself", "yourselves"
  )
  empty_result <- matrix(numeric(0), nrow = 0L, ncol = 0L)

  # Only the first element is processed; missing or zero-length input has
  # no words to vectorise.
  text <- tolower(as.character(string)[1L])
  if (is.na(text)) {
    return(empty_result)
  }

  # Replace punctuation and special characters with spaces. Apostrophes are
  # kept for now so that contractions in the stop-word list can still match.
  text <- gsub("[^[:alnum:]']", " ", text)
  words <- strsplit(text, " ", fixed = TRUE)[[1L]]

  # Strip quoting apostrophes around a word, then drop the empty tokens left
  # by consecutive separators together with the stop words.
  words <- gsub("^'+|'+$", "", words)
  words <- words[nzchar(words) & !(words %in% stop_words)]

  # Remaining apostrophes are treated like any other punctuation: split the
  # word at them and filter the resulting fragments again.
  words <- as.character(unlist(strsplit(words, "'", fixed = TRUE)))
  words <- words[nzchar(words) & !(words %in% stop_words)]

  if (length(words) == 0L) {
    return(empty_result)
  }

  # Perform stemming with the Snowball stemmer
  stems <- SnowballC::wordStem(words, language = "english")

  # Each stem becomes one document of the corpus
  corpus <- tm::Corpus(tm::VectorSource(stems))

  # Calculate TF-IDF
  dtm <- tm::DocumentTermMatrix(
    corpus,
    control = list(weighting = tm::weightTfIdf)
  )
  as.matrix(dtm)
}
# Add the following code to your website.
# For more information on customizing the embed code, read Embedding Snippets.