scripts/ecb_text_analytics/text_analytics.R

library(dplyr)
library(tidytext)
library(igraph)
library(ggraph)
library(stringr)
#' INCOMPLETE AND UNUSED FUNCTION: Add term frequency - inverse document frequency as the weight, see bind_tf_idf
#'
#' @param ngrams	A tidy text dataset with one-row-per-term-per-document
#' @param term	Column containing terms as string or symbol
#' @param document Column containing document IDs as string or symbol
#' @param n1 Column containing document-term counts as string or symbol
#'
#' @return
#' @export
#'
#' @examples
add_tf_idf_wt <- function(ngrams, ...) {
  # WARNING: FUNCTION IS INCOMPLETE AND CURRENTLY NOT IN USED
  ngrams %>% 
    bind_tf_idf(...) %>%
    dplyr::select(-tf, -idf) %>%
    rename(wt = tf_idf) %>%
    arrange(desc(n))
}
#' Calculate the sentiment score
#'
#' @param ngrams 
#'
#' @return
#' @export
#'
#' @examples
calc_sentiment_score <- function(ngrams, wt) {
  #column_names <- names(ngrams)
  #column_names <- column_names[!column_names %in% c("n", "idf", "tf", "tf_idf", wt)]
  
  temp_grams <- NULL
  
  if (sum(ngrams$method != "afinn") > 1) {
    temp_grams <- ngrams %>%
      filter(method != "afinn") %>%
      filter(sentiment %in% c("positive", "negative")) %>%
      #group_by_(.dots = column_names, add = TRUE) %>%
      group_by(method, sentiment, add = TRUE) %>%
      summarise_(.dots = list("n" = sprintf("sum(%s)", wt))) %>%
      ungroup(method, sentiment) %>%
      spread(sentiment, n, fill = 0) %>%
      mutate(score = positive - negative,
             negative = -negative)
  }
    
  if ("afinn" %in% ngrams$method) {
    afinn <- ngrams %>%
      filter(method == "afinn") %>%
      #group_by_(.dots = column_names[column_names != "sentiment"]) %>%
      mutate_(.dots = list("score" = sprintf("%s * as.numeric(as.character(sentiment))", wt))) %>%
      group_by(method, add = TRUE) %>%
      summarise(score = sum(score, na.rm = TRUE)) %>%
      ungroup(method, sentiment)
    
    temp_grams <- bind_rows(temp_grams, afinn)
  }
  
  temp_grams
}
#' Compare words sentiment in wide form
#'
#' @param ngrams long form dataframe with columns "method" and "sentiment"
#'
#' @return
#' @export
#'
#' @examples
compare_words_sentiment <- function(ngrams) {
  ngrams %>%
    group_by(word, n, method) %>%
    summarise(sentiment = paste0(sentiment, collapse="|")) %>%
    distinct(word, sentiment, method) %>% 
    spread(method, sentiment, convert = TRUE) %>%
    arrange(desc(n))
  
  #left_join(ngrams, temp_grams, by = "word") 
}

#' To add sentiment/s to dataframe on matching with "word" column
#'
#' @param ngram dataframe of ngram with column "word"
#' @param sentiment_libs library of sentiments "afinn", "bing", "nrc" or "loughran"
#' @param name_using_method use the input methods as name of columns, always truE if multiple sentiments are used
#'
#' @return dataframe with added column for sentiment
#' @export
#'
#' @examples
add_sentiments <- function(ngrams, sentiment_libs = c("loughran", "nrc", "bing"), all = FALSE) {
  if (all) {
    sentiment_libs = c("loughran", "nrc", "bing", "afinn")
  }
  bind_rows(lapply(sentiment_libs, function(x) {single_add_sentiment(ngrams, x)}))
}

#' Helper method to add sentiment to dataframe on matching with "word" column, avoid using directly, use add_sentiments() instead
#'
#' @param ngram dataframe of ngram with column "word"
#' @param sentiment_lib library of sentiments "afinn", "bing", "nrc" or "loughran"
#' @param name_using_method use the input methods as name of columns

#'
#' @return dataframe with added columns
#' @export
#'
#' @examples
single_add_sentiment <- function(ngrams, sentiment_lib) {
  temp_grams <- ngrams %>%
    inner_join(get_sentiments(sentiment_lib), by = "word") %>%
    mutate(method = sentiment_lib)
  
  if (sentiment_lib == "afinn") {
    temp_grams <- temp_grams %>% 
      mutate(sentiment = as.character(score)) %>%
      dplyr::select(-score)
  }

  if ("method" %in% names(ngrams) && ("sentiment" %in% names(ngrams) || "score" %in% names(ngrams))) {
    bind_rows(ngrams, temp_grams)
  } else {
    temp_grams
  }
}


#' Remove standard stopwords or custom stopwords from a unigram or bigram
#'
#' @param ngram a dataframe which is an ngram
#' @param custom_stop_words 
#'
#' @return unibigram a dataframe with stop words removed
#' @export
#'
#' @examples
#' #'#' library(gutenbergr)
#'#' kjv <- gutenberg_download(10)
#'#' kjv_bigrams <- kjv %>% make_bigrams(remove_stop_words = FALSE) %>% remove_stopwords
remove_stopwords <- function(ngram, custom_stop_words = NULL) {
  if (is.null(custom_stop_words)) {
    data(stop_words)
    custom_stop_words <- stop_words
  }
  
  word_cols <- paste("word", c("", as.character(1:5)), sep="")
  if (sum(word_cols %in% names(ngram)) > 0) {
    for (i in word_cols[word_cols %in% names(ngram)]) {
      ngram <- ngram %>% filter_(paste("!", i, "%in% custom_stop_words$word"))
    }
    
    ngram
  } else {
    stop("Error, please ensure you input an ngram with col names 'word', 'word1', 'word2' up to 'word5' etc")
  }
}

#' Generates unigram from text input
#'
#' @param dataset dataframe with column name "text" 
#' @param custom_stop_words 
#' @param remove_stop_words 
#'
#' @return unigram dataframe
#' @export
#'
#' @examples
#' #'#' library(gutenbergr)
#'#' kjv <- gutenberg_download(10)
#'#' kjv_bigrams <- kjv %>% make_unigrams
make_unigrams <- function(dataset, custom_stop_words = NULL, remove_stop_words = TRUE) {
  temp_gram <- dataset %>%
    unnest_tokens(word, text)
  
  if (remove_stop_words) {
    temp_gram <- remove_stopwords(temp_gram, custom_stop_words)
  }
  
  temp_gram
}

#' Generate bigrams from text input
#'
#' @param dataset dataframe with column name "text" 
#' @param custom_stop_words 
#' @param remove_stop_words
#'
#' @return bigram dataframe
#' @export
#'
#' @examples
#'#' library(gutenbergr)
#'#' kjv <- gutenberg_download(10)
#'#' kjv_bigrams <- kjv %>% make_bigrams
make_ngrams <- function(dataset, n = 1, custom_stop_words = NULL, remove_stop_words = TRUE, summarise_count = TRUE) {
  temp_gram <- dataset %>%
    unnest_tokens(word, text, token = "ngrams", n = n)
  
  # Split into individual columns if ngrams
  if (n > 1) {
    word_names = paste("word", 1:n, sep="") 
    temp_gram <- separate(temp_gram, word, word_names, sep = " ")
  }
  
  # Remove stop words if necessary
  if (remove_stop_words) 
    temp_gram <- remove_stopwords(temp_gram, custom_stop_words)
  
  if (summarise_count) 
    temp_gram <- temp_gram %>%
      group_by_(.dots = names(.)) %>%
      summarise(n = n()) %>%
      ungroup %>%
      arrange(desc(n))
  
  temp_gram
}

#' Calculate count of words from ngrams
#'
#' @param ngrams dataframe with columns "wprd", "word1" and "word2" etc
#'
#' @return tidytext dataframe of ngram counts
#' @export
#'
#' @examples
#'#' library(gutenbergr)
#'#' kjv <- gutenberg_download(10)
#'#' kjv_bigrams <- kjv %>% make_ngrams(2) %>% count_ngrams
count_ngrams <- function(ngrams, reorder = TRUE) {
  word_found <- names(ngrams) %>%
    .[grepl("^word[0-9]{0,1}", .)]
  
  if ("n" %in% names(ngrams)) {
    ngrams %>% 
      group_by(.dots = word_found, add = TRUE) %>%
      summarise(n = sum(n)) %>%
      ungroup() %>%
      arrange(desc(n))
  }
  else {
    ngrams %>%
      count_(paste(word_found, sep=", ")) %>%
      arrange(desc(n))
  }
}

#' Visualize bigrams as a directed graph
#'
#' @param bigrams dataframe with columns "word1" and "word2"
#' @param min.n minimum count of word to be displayed
#' @param ignore.digits to ignore occurrence of digits in the text
#'
#' @return NULL
#' @export
#'
#' @examples
#'#' library(gutenbergr)
#'#' kjv <- gutenberg_download(10)
#'#' kjv_bigrams <- count_bigrams(kjv)
#'#' visualize_bigrams(kjv_bigrams, n = 50, ignore.digits = TRUE)
visualize_bigrams <- function(bigrams_count, min.n = 40, ignore.digits = TRUE, seed = 1) {
  set.seed(seed)
  a <- grid::arrow(type = "closed", length = unit(.15, "inches"))
  
  if (ignore.digits) {
    bigrams_count <- bigrams_count %>%
      filter(!str_detect(word1, "\\d"),
             !str_detect(word2, "\\d"))
  }
  
  bigrams_count %>%
    filter(n > min.n) %>%
    graph_from_data_frame() %>%
    ggraph(layout = "fr") +
    geom_edge_link(aes(edge_alpha = n), show.legend = FALSE, arrow = a) +
    geom_node_point(color = "lightblue", size = 5) +
    geom_node_text(aes(label = name), vjust = 1, hjust = 1) +
    theme_void()
}
yunching/tidymas documentation built on Feb. 5, 2023, 1:42 p.m.