# R/df_to_fixed_dictionary_tf_idf_matrix.R

# Defines functions: df_to_fixed_dictionary_tf_idf_matrix

# Documented in: df_to_fixed_dictionary_tf_idf_matrix

#' df_to_fixed_dictionary_tf_idf_matrix
#'
#' Calculates the TF-IDF matrix for a data.frame using a fixed dictionary.
#'
#' Each row of `df` is treated as one document. The value found in `text_col`
#' for that row is converted to character and passed, together with
#' `dictionary` and `num_docs`, to
#' `string_to_fixed_dictionary_tf_idf_vector()`; the vector it returns becomes
#' the corresponding row of the result.
#'
#' @param df A data.frame that contains the text to process
#' @param text_col The name of the column in `df` that contains the text
#' @param dictionary A vector of terms that will be used to calculate the
#'   TF-IDF; its elements become the column names of the result
#' @param num_docs The total number of documents in the text corpus; passed
#'   through unchanged to `string_to_fixed_dictionary_tf_idf_vector()`
#' @return A matrix with one row per row of `df` and one column per element of
#'   `dictionary`, holding the TF-IDF of each term for each document. A `df`
#'   with zero rows yields a matrix with zero rows.
#' @examples
#' df <- data.frame(text = c("This is a sentence", "This is another sentence"))
#' dictionary <- c("sentence", "this", "is")
#' num_docs <- 2
#' df_to_fixed_dictionary_tf_idf_matrix(df, "text", dictionary, num_docs)
#'
#' @rdname df_to_fixed_dictionary_tf_idf_matrix
#' @export
df_to_fixed_dictionary_tf_idf_matrix <- function(df, text_col, dictionary, num_docs) {
  # Preallocate the result as numeric so the return type does not depend on
  # whether any rows were filled in.
  tf_idf_matrix <- matrix(
    NA_real_,
    nrow = nrow(df),
    ncol = length(dictionary)
  )
  colnames(tf_idf_matrix) <- dictionary

  # seq_len() gives an empty sequence for a zero-row df, whereas 1:nrow(df)
  # would iterate over c(1, 0) and touch rows that do not exist.
  for (i in seq_len(nrow(df))) {
    string <- as.character(df[i, text_col])
    tf_idf_matrix[i, ] <- string_to_fixed_dictionary_tf_idf_vector(
      string,
      dictionary,
      num_docs
    )
  }

  tf_idf_matrix
}
# Erickcufe/textCells documentation built on May 20, 2023, 11:45 p.m.