R/tf_idf_dataframe.R

Defines functions tf_idf_dataframe

tf_idf_dataframe <- function(df, dictionary) {
  library(dplyr)
  # define the stop words
  stop_words <- tidytext::stop_words$word

  colnames(df) <- c("document_id", "text")

  # tokenize and stem the texts
  df_tokenized <- df |>
    dplyr::mutate(text = tolower(text)) |>
    dplyr::mutate(text_tokens = stringr::str_split(text, "\\s+")) |>
    dplyr::mutate(text_tokens = purrr::map(text_tokens, ~ .x[! .x %in% stop_words])) |>
    dplyr::mutate(text_stemmed = purrr::map(text_tokens, ~ SnowballC::wordStem(.x, language = "english"))) |>
    dplyr::select(document_id, text_stemmed)

  # get the number of documents
  num_docs <- nrow(df)

  # calculate term frequency for each document and word in the dictionary
  tf_df <- df_tokenized |>
    tidyr::unnest(text_stemmed) |>
    dplyr::filter(text_stemmed %in% dictionary$words) |>
    dplyr::count(document_id, text_stemmed) |>
    tidyr::pivot_wider(names_from = text_stemmed, values_from = n, values_fill = 0) |>
    dplyr::mutate(dplyr::across(-document_id, ~ ./sum(.)))

  sel_col <- dictionary$words %in% colnames(tf_df)[-1]
  new_colnames <- dictionary$words[!sel_col]
  df_tmp <- matrix(data = 0 ,ncol = length(new_colnames), nrow = nrow(tf_df)) |>
    as.data.frame()
  colnames(df_tmp) <- new_colnames
  tf_df <- cbind(tf_df, df_tmp) |> tibble::tibble()

  # calculate inverse document frequency for each word in the dictionary
  idf_df <- df_tokenized |>
    tidyr::unnest(text_stemmed) |>
    dplyr::filter(text_stemmed %in% dictionary$words) |>
    dplyr::distinct(document_id, text_stemmed) |>
    dplyr::count(text_stemmed) |>
    dplyr::mutate(idf = log(num_docs / n)) |>
    dplyr::select(text_stemmed, idf)

  sel_t <- dictionary$words %in% idf_df$text_stemmed
  aa <- dictionary$words[!sel_t]
  bb <- data.frame(text_stemmed = aa, idf = 1)
  idf_df <- rbind(idf_df, bb)

  # merge the tf and idf data frames and calculate the tf-idf scores
  tf_idf_df <- tf_df[,-1] * idf_df$idf[match(names(tf_df), idf_df$text_stemmed)][col(tf_df)]
  tf_idf_df <- cbind(tf_df$document_id, tf_idf_df) |> tibble::tibble()
  tf_idf_df[is.na(tf_idf_df)] <- 0
  colnames(tf_idf_df)[1] <- "document_id"

  # re-order the columns based on the dictionary
  tf_idf_df <- tf_idf_df[, c("document_id", dictionary$words)]


  # normalize the scores
  tf_idf_df <- tf_idf_df |>
    dplyr::mutate(dplyr::across(-document_id, ~ . / sqrt(sum(.^2))))

  # fill in missing words with 0s
  tf_idf_df[is.na(tf_idf_df)] <- 0

  return(tf_idf_df)
}
Erickcufe/textCells documentation built on May 20, 2023, 11:45 p.m.