R/getCommonKeywords.R

Defines functions getCommonKeywords

# getCommonKeywords.R
#
# Purpose: Takes the results from getCleanedAndTokenizedData() and returns the
#  dataframe of similar articles together with the keywords that each proposal
#  shares with its most similar proposal
#
# Author: Roberto Lentini (rlentini@research.baycrest.org)
#
# Date: 2021-05-27
#
# ========================================================================================

# Find the keywords shared by each proposal and its most similar proposal.
#
# Arguments:
#   visualize         Data frame whose first column identifies the proposal and
#                     whose remaining columns are keywords. A keyword is
#                     treated as belonging to a proposal when its cell is not
#                     NA.
#   similar_articles  Data frame whose first column identifies the proposal and
#                     which contains the columns `most_similar_proposal` and
#                     `common_words_weighted`.
#
# Returns:
#   Data frame with the columns `proposal_title`, `most_similar_proposal`,
#   `common_words_weighted` and `common_words_list`. `common_words_list` is a
#   list column; each entry is a one-element list holding the character vector
#   of keywords present in both proposals. Only pairs where both proposals have
#   at least one keyword in `visualize` are returned; when there are none the
#   result has zero rows.
getCommonKeywords <- function(visualize, similar_articles) {
  # Temporary fix to the naming issues: whatever the first column is called,
  # treat it as the proposal identifier in both inputs.
  colnames(visualize)[1] <- "proposal_title"
  colnames(similar_articles)[1] <- "proposal_title"

  # One row per (proposal, keyword); rows with an NA count are discarded.
  keyword_rows <- tidyr::pivot_longer(
    visualize,
    !proposal_title,
    names_to = "keywords",
    values_to = "count"
  )
  keyword_rows <- tidyr::drop_na(keyword_rows)
  keyword_rows <- dplyr::select(keyword_rows, proposal_title, keywords)

  # Collapse each proposal's keywords into a single ", "-separated string so
  # they can be carried through the merges as an ordinary column.
  keywords_by_proposal <- dplyr::summarise(
    dplyr::group_by(keyword_rows, proposal_title),
    alltypes = paste(keywords, collapse = ", ")
  )

  # Attach the keyword string of the proposal itself (alltypes.x) and of its
  # most similar proposal (alltypes.y).
  results <- merge(
    similar_articles,
    keywords_by_proposal,
    by = "proposal_title"
  )
  results <- merge(
    results,
    keywords_by_proposal,
    by.x = "most_similar_proposal",
    by.y = "proposal_title"
  )

  own_keywords <- strsplit(results$alltypes.x, split = ", ")
  match_keywords <- strsplit(results$alltypes.y, split = ", ")

  # seq_along() keeps this a no-op when the merges produced no rows
  # (1:length(x) would iterate over 1 and 0). Each entry is wrapped in list()
  # to keep the nested shape existing callers receive.
  common_words_list <- lapply(seq_along(own_keywords), function(i) {
    list(intersect(own_keywords[[i]], match_keywords[[i]]))
  })

  results <- tibble::add_column(
    results,
    common_words_list = common_words_list
  )

  dplyr::select(
    results,
    proposal_title,
    most_similar_proposal,
    common_words_weighted,
    common_words_list
  )
}
roblen001/document_similarity_checker documentation built on Aug. 14, 2022, 9:39 a.m.