vocabular2: Document Vocabulary Comparison

Documented in compare_vocabs

# TODO Rename Condition to Document

# tc_dfs: List of Term Counts data frame
#' @title Compare vocabularies
#' @param tc_dfs Named list of Term Counts data frames.
#'
#'  Each data frame must contain a column with the words and a column with their counts.
#'
#'  Each document must be uniquely named.
#' @param weighting_fn Function for weighting the count column before normalizing to frequency.
#'  The metrics using the weighting function will be named with a "_weighted" suffix.
#'
#'  NOTE: This is experimental. The metrics have not been thought through and some may not be meaningful.
#' @author Ludvig Renbo Olsen, \email{r-pkgs@@ludvigolsen.dk}
#' @export
#' @importFrom dplyr %>%
#' @import data.table
compare_vocabs <- function(tc_dfs,
                           word_col = "Word",
                           counts_col = "Count",
                           weighting_fn = function(x){log(x+1)},
                           calc_weighting_metrics = FALSE,
                           rel_tf_nrtf_beta = 1,
                           zero_negatives = TRUE){

  #### Check and prepare inputs ####

  # Sanity check
  # In a package, I would create a unique name for condition instead
  if ("Condition" %in% c(word_col, counts_col))
    stop("Neither 'word_col' or 'counts_col' can be named 'Condition'.")

  # Extract conditions
  conditions <- names(tc_dfs)
  # Test list was named correctly
  if (is.null(conditions) ||
      length(conditions) != length(tc_dfs) ||
      length(unique(conditions)) != length(tc_dfs)
  ){
    stop("'tc_dfs' must be a named list with a unique name for each element.")
  }

  # Combine the term-counts dataframes
  term_counts <- tc_dfs %>%
    dplyr::bind_rows(.id = "Condition") %>%
    tidyr::spread(key = "Condition", value = counts_col)

  # Words that are not in a condition's vocabulary
  # will have an NA in Count when we use spread
  # Set NAs to zero
  term_counts[is.na(term_counts)] <- 0

  # Separate the counts and word columns
  counts <- base_select(term_counts, conditions)
  words <- base_select(term_counts, word_col)

  # Document counts
  # list with two elements
  #   'contains' is a one hot - word in doc?
  #   'counts' are rowsums of 'contains'
  doc_counts <- document_count(counts)

  # Normalize counts column-wise
  freqs <- counts %>%
    dplyr::mutate_all(.funs = list(normalize))

  # Calculate epsilons (1/sum(counts_rest))
  # These are used to add +1 smoothing in some metrics
  epsilons <- sum_rest_populations(counts) %>%
    dplyr::summarise_all(.f = list(function(x) {
      1 / sum(x)
    }))

  if (isTRUE(calc_weighting_metrics)){

    # In case we don't wan't to weight the counts
    if (is.null(weighting_fn))
      weighting_fn <- identity

    # Weight counts column-wise
    weighted_counts <- counts %>%
      dplyr::mutate_all(.funs = list(weighting_fn))

    # Normalize weighted counts column-wise
    weighted_freqs <- weighted_counts %>%
      dplyr::mutate_all(.funs = list(normalize))

    # Weight the epsilons
    weighted_epsilons <- epsilons %>%
      dplyr::mutate_all(.funs = list(weighting_fn))
  }

  #### Calculate metrics ####

  metrics <- calculate_metrics(
    counts = counts,
    freqs = freqs,
    doc_counts = doc_counts,
    epsilons = epsilons,
    rel_tf_nrtf_beta = rel_tf_nrtf_beta,
    zero_negatives = zero_negatives
  )
  idf <- metrics[["idf"]]
  metrics[["idf"]] <- NULL

  if (isTRUE(calc_weighting_metrics)){

    weighted_metrics <- calculate_metrics(
      counts = weighted_counts,
      freqs = weighted_freqs,
      doc_counts = doc_counts,
      epsilons = weighted_epsilons,
      rel_tf_nrtf_beta = rel_tf_nrtf_beta,
      zero_negatives = zero_negatives,
      metric_suffix = "_weighted"
    )
    weighted_metrics[["idf"]] <- NULL
    weighted_metrics[["irf"]] <- NULL

    metrics <- dplyr::bind_cols(
      c(
      metrics,
      weighted_metrics
      )
    )

  } else {

    metrics <- dplyr::bind_cols(metrics)

  }

  #### Prepare output ####

  # Rename columns
  counts <- add_colnames_suffix(counts, "_Count")
  freqs <- add_colnames_suffix(freqs, "_TF")

  if (isTRUE(calc_weighting_metrics)){
    weighted_freqs <- add_colnames_suffix(weighted_freqs, "_WeightedFreq")

    # Add counts and frequencies to metrics
    metrics <- metrics %>%
      dplyr::bind_cols(counts, freqs, weighted_freqs)

  } else {
    weighted_freqs <- NULL

    # Add counts and frequencies to metrics
    metrics <- metrics %>%
      dplyr::bind_cols(counts, freqs)

  }

  metric_names <- plyr::llply(conditions, function(cond){
    paste0(cond, "_", c(
    "Count", "TF", "IRF", "RTF", "NRTF", "MRTF", "TF_IDF", "TF_IRF",
    "TF_RTF", "TF_NRTF", "TF_MRTF", "REL_TF_NRTF", "REL_TF_MRTF", "RANK_ENS"
  ))}) %>% unlist()

  # Reorder
  metrics <- base_select(metrics, cols = metric_names)

  # Nest metrics by condition
  nested_metrics <- plyr::llply(conditions, function(cond){
    metrics %>%
      base_select(cols = grepl(cond, metric_names)) %>%
      nest_rowwise() %>%
      tibble::enframe(name=NULL, value=cond)
  }) %>% dplyr::bind_cols()

  # Combine the computed columns
  dplyr::bind_cols(
    words,
    doc_counts[["counts"]],
    idf,
    nested_metrics
  )

}