tf_idf_dataframe <- function(df, dictionary) {
library(dplyr)
# define the stop words
stop_words <- tidytext::stop_words$word
colnames(df) <- c("document_id", "text")
# tokenize and stem the texts
df_tokenized <- df |>
dplyr::mutate(text = tolower(text)) |>
dplyr::mutate(text_tokens = stringr::str_split(text, "\\s+")) |>
dplyr::mutate(text_tokens = purrr::map(text_tokens, ~ .x[! .x %in% stop_words])) |>
dplyr::mutate(text_stemmed = purrr::map(text_tokens, ~ SnowballC::wordStem(.x, language = "english"))) |>
dplyr::select(document_id, text_stemmed)
# get the number of documents
num_docs <- nrow(df)
# calculate term frequency for each document and word in the dictionary
tf_df <- df_tokenized |>
tidyr::unnest(text_stemmed) |>
dplyr::filter(text_stemmed %in% dictionary$words) |>
dplyr::count(document_id, text_stemmed) |>
tidyr::pivot_wider(names_from = text_stemmed, values_from = n, values_fill = 0) |>
dplyr::mutate(dplyr::across(-document_id, ~ ./sum(.)))
sel_col <- dictionary$words %in% colnames(tf_df)[-1]
new_colnames <- dictionary$words[!sel_col]
df_tmp <- matrix(data = 0 ,ncol = length(new_colnames), nrow = nrow(tf_df)) |>
as.data.frame()
colnames(df_tmp) <- new_colnames
tf_df <- cbind(tf_df, df_tmp) |> tibble::tibble()
# calculate inverse document frequency for each word in the dictionary
idf_df <- df_tokenized |>
tidyr::unnest(text_stemmed) |>
dplyr::filter(text_stemmed %in% dictionary$words) |>
dplyr::distinct(document_id, text_stemmed) |>
dplyr::count(text_stemmed) |>
dplyr::mutate(idf = log(num_docs / n)) |>
dplyr::select(text_stemmed, idf)
sel_t <- dictionary$words %in% idf_df$text_stemmed
aa <- dictionary$words[!sel_t]
bb <- data.frame(text_stemmed = aa, idf = 1)
idf_df <- rbind(idf_df, bb)
# merge the tf and idf data frames and calculate the tf-idf scores
tf_idf_df <- tf_df[,-1] * idf_df$idf[match(names(tf_df), idf_df$text_stemmed)][col(tf_df)]
tf_idf_df <- cbind(tf_df$document_id, tf_idf_df) |> tibble::tibble()
tf_idf_df[is.na(tf_idf_df)] <- 0
colnames(tf_idf_df)[1] <- "document_id"
# re-order the columns based on the dictionary
tf_idf_df <- tf_idf_df[, c("document_id", dictionary$words)]
# normalize the scores
tf_idf_df <- tf_idf_df |>
dplyr::mutate(dplyr::across(-document_id, ~ . / sqrt(sum(.^2))))
# fill in missing words with 0s
tf_idf_df[is.na(tf_idf_df)] <- 0
return(tf_idf_df)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.