#' Compute a TF-IDF vector for a string against a fixed dictionary
#'
#' Lower-cases \code{string}, splits it on whitespace, drops empty tokens and
#' English stop words, stems the remaining tokens with the Snowball English
#' stemmer, and scores every term of \code{dictionary} against them.
#'
#' The term frequency (TF) of a dictionary term is the share of the remaining
#' tokens whose stem equals the stem of that term. The inverse document
#' frequency (IDF) factor is a single scalar shared by all terms:
#' \code{log(num_docs / (1 + k))}, where \code{k} is the number of
#' \code{dictionary} entries that occur verbatim (unstemmed) among the stemmed
#' tokens.
#'
#' @param string The string to process. Only the first element is used.
#' @param dictionary A vector of terms that will be used to calculate the
#'   TF-IDF.
#' @param num_docs The total number of documents in the text corpus.
#' @return A numeric vector that contains the TF-IDF of each term in the
#'   dictionary for the input string, in dictionary order. The result is
#'   \code{numeric(0)} for an empty dictionary and all zeros when no tokens
#'   remain after stop-word removal.
#'
#' @author
#' Erick Cuevas-Fernandez
#'
#' @importFrom SnowballC wordStem
#' @importFrom stringr str_split
#'
#' @examples
#' string <- "This is a sentence"
#' dictionary <- c("sentence", "this", "is")
#' num_docs <- 10
#' string_to_fixed_dictionary_tf_idf_vector(string, dictionary, num_docs)
#'
#' @rdname string_to_fixed_dictionary_tf_idf_vector
#' @export
string_to_fixed_dictionary_tf_idf_vector <- function(string,
                                                     dictionary,
                                                     num_docs) {
  # Nothing to score. Guarding here also avoids the `1:length(x)` pitfall,
  # which iterates over c(1, 0) when the dictionary is empty.
  if (length(dictionary) == 0L) {
    return(numeric(0))
  }

  stop_words <- c(
    "a", "about", "above", "after", "again", "against",
    "all", "am", "an", "and", "any", "are", "as", "at", "be",
    "because", "been", "before", "being", "below", "between",
    "both", "but", "by", "could", "did", "do", "does", "doing",
    "down", "during", "each", "few", "for", "from", "further",
    "had", "has", "have", "having", "he", "he'd", "he'll",
    "he's", "her", "here", "here's", "hers", "herself", "him",
    "himself", "his", "how", "how's", "i", "i'd", "i'll", "i'm",
    "i've", "if", "in", "into", "is", "it", "it's", "its",
    "itself", "let's", "me", "more", "most", "my", "myself",
    "nor", "of", "on", "once", "only", "or", "other", "ought",
    "our", "ours", "ourselves", "out", "over", "own", "same", "she",
    "she'd", "she'll", "she's", "should", "so", "some", "such",
    "than", "that", "that's", "the", "their", "theirs", "them",
    "themselves", "then", "there", "there's", "these", "they", "they'd",
    "they'll", "they're", "they've", "this", "those", "through", "to",
    "too", "under", "until", "up", "very", "was", "we", "we'd",
    "we'll", "we're", "we've", "were", "what", "what's", "when",
    "when's", "where", "where's", "which", "while", "who", "who's",
    "whom", "why", "why's", "with", "would", "you", "you'd", "you'll",
    "you're", "you've", "your", "yours", "yourself", "yourselves"
  )

  # Lower-case and tokenise on whitespace; a zero-length `string` yields no
  # tokens instead of a subscript error.
  pieces <- str_split(tolower(string), "\\s+")
  tokens <- if (length(pieces) > 0L) pieces[[1]] else character(0)

  # Leading/trailing whitespace produces empty tokens that must not count as
  # words; NA input is treated as "no text".
  tokens <- tokens[!is.na(tokens) & nzchar(tokens)]
  tokens <- tokens[!tokens %in% stop_words]

  # With no tokens left every term frequency is zero; returning early avoids
  # the 0 / 0 = NaN the TF division would otherwise produce.
  if (length(tokens) == 0L) {
    return(numeric(length(dictionary)))
  }

  # Stem tokens and dictionary with the same stemmer so they are comparable.
  stemmed_tokens <- wordStem(tokens, language = "english")
  stemmed_dictionary <- wordStem(dictionary, language = "english")

  # Term frequency: occurrences of each stemmed term / number of tokens.
  term_counts <- vapply(
    stemmed_dictionary,
    function(term) sum(stemmed_tokens == term),
    numeric(1),
    USE.NAMES = FALSE
  )
  tf_vector <- term_counts / length(stemmed_tokens)

  # IDF factor: one scalar applied to every term.
  # NOTE(review): this matches the *unstemmed* dictionary against the stemmed
  # tokens, unlike the TF step above. Kept as-is so existing outputs do not
  # change; confirm whether the stemmed dictionary was intended here.
  idf <- log(num_docs / (1 + sum(dictionary %in% stemmed_tokens)))

  tf_vector * idf
}
# Add the following code to your website.
# For more information on customizing the embed code, read Embedding Snippets.