R/string_to_fixed_dictionary_tf_idf_vector.R

#' string_to_fixed_dictionary_tf_idf_vector
#'
#' Calculates the TF-IDF vector for a string using a fixed dictionary
#'
#' @param string The character string to process
#' @param dictionary A character vector of terms for which TF-IDF values are computed
#' @param num_docs The total number of documents in the text corpus, used for the IDF
#' @return A numeric vector containing the TF-IDF value of each dictionary term for the input string
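#'
#' @details
#' The string is lowercased, split on whitespace, filtered against a built-in
#' English stop-word list, and stemmed with the Snowball stemmer. The term
#' frequency of each dictionary term is the fraction of the remaining tokens
#' that match its stem; each term frequency is then scaled by a single
#' inverse-document-frequency factor, log(num_docs / (1 + n)), where n is the
#' number of dictionary terms whose stems occur in the string.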
#'
#' @author
#' Erick Cuevas-Fernandez
#'
#' @importFrom SnowballC wordStem
#' @importFrom stringr str_split
#'
#' @export
#' @examples
#' string <- "This is a sentence"
#' dictionary <- c("sentence", "this", "is")
#' num_docs <- 10
#' string_to_fixed_dictionary_tf_idf_vector(string, dictionary, num_docs)
#'
#' @rdname string_to_fixed_dictionary_tf_idf_vector

string_to_fixed_dictionary_tf_idf_vector <- function(string, dictionary, num_docs) {
  # Convert the string to lowercase
  string <- tolower(string)

  # Tokenize the string on whitespace
  string_tokens <- str_split(string, "\\s+")[[1]]

  # Remove common English stop words from the tokens
  stop_words <- c("a", "about", "above", "after", "again", "against",
                  "all", "am", "an", "and", "any", "are", "as", "at", "be",
                  "because", "been", "before", "being", "below", "between",
                  "both", "but", "by", "could", "did", "do", "does", "doing",
                  "down", "during", "each", "few", "for", "from", "further",
                  "had", "has", "have", "having", "he", "he'd", "he'll",
                  "he's", "her", "here", "here's", "hers", "herself", "him",
                  "himself", "his", "how", "how's", "i", "i'd", "i'll", "i'm",
                  "i've", "if", "in", "into", "is", "it", "it's", "its",
                  "itself", "let's", "me", "more", "most", "my", "myself",
                  "nor", "of", "on", "once", "only", "or", "other", "ought",
                  "our", "ours", "ourselves", "out", "over", "own", "same", "she",
                  "she'd", "she'll", "she's", "should", "so", "some", "such",
                  "than", "that", "that's", "the", "their", "theirs", "them",
                  "themselves", "then", "there", "there's", "these", "they", "they'd",
                  "they'll", "they're", "they've", "this", "those", "through", "to",
                  "too", "under", "until", "up", "very", "was", "we", "we'd",
                  "we'll", "we're", "we've", "were", "what", "what's", "when",
                  "when's", "where", "where's", "which", "while", "who", "who's",
                  "whom", "why", "why's", "with", "would", "you", "you'd", "you'll",
                  "you're", "you've", "your", "yours", "yourself", "yourselves")
  string_tokens <- string_tokens[!string_tokens %in% stop_words]

  # Stem the string tokens using Snowball stemming
  stemmed_string_tokens <- wordStem(string_tokens, language = "english")

  # Stem the dictionary once so tokens and terms are compared on the same footing
  stemmed_dictionary <- wordStem(dictionary, language = "english")

  # Guard against a string that is emptied entirely by stop-word removal
  n_tokens <- max(length(stemmed_string_tokens), 1)

  # Calculate the term frequency (TF) for each term in the dictionary
  tf_vector <- numeric(length(dictionary))
  for (i in seq_along(dictionary)) {
    tf_vector[i] <- sum(stemmed_string_tokens == stemmed_dictionary[i]) / n_tokens
  }

  # Calculate a single inverse document frequency (IDF) factor; per-term
  # document frequencies are not available here, so the number of dictionary
  # terms whose stems occur in the string stands in for document frequency
  idf_factor <- log(num_docs / (1 + sum(stemmed_dictionary %in% stemmed_string_tokens)))

  # Scale the term frequencies by the IDF factor to obtain the TF-IDF vector
  tf_idf_vector <- tf_vector * idf_factor

  return(tf_idf_vector)
}
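
# Minimal usage sketch (illustrative only, kept as comments so sourcing this
# file has no side effects); it reuses the toy inputs from the roxygen example
# above and assumes the SnowballC and stringr packages are installed.
#
#   library(SnowballC)
#   library(stringr)
#
#   string <- "This is a sentence"
#   dictionary <- c("sentence", "this", "is")
#   num_docs <- 10
#   string_to_fixed_dictionary_tf_idf_vector(string, dictionary, num_docs)
#
# The call returns a numeric vector of length 3, one TF-IDF value per
# dictionary term; "this" and "is" come out as 0 because they are removed as
# stop words before term frequencies are counted.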