inst/app/R/2server_functions.R

#########################################################
##################### Server Functions ##################
#########################################################

# This file contains the miscellaneous functions called in the server. 

#' Generates the colors used in plots (reproduces ggplot2's default discrete hue palette)
#' @param n_cols number of colors to generate (number of levels in faceting)
#' 
#' @return character vector of hex colors with length n_cols 

gg_cols <- function(n_cols) {
  # evenly spaced hues around the HCL color wheel; the duplicated endpoint is dropped
  h <- seq(15, 375, length = n_cols + 1)
  hcl(h = h, l = 65, c = 100)[1:n_cols]
}
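
# Illustrative example (the values shown are ggplot2's default hue palette for
# three levels, which the hcl() call above reproduces):
# gg_cols(3)
# #> [1] "#F8766D" "#00BA38" "#619CFF"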

#########################################################
##################### For calculating tf-idf
##################### 
#########################################################

#' Calculates tf-idf for each term within each document grouping
#'
#' @param .data dataframe of terms as per output of format_data
#' 
#' @param grouping character string indicating how the text is grouped into documents
#' 
#' @return original data frame with an additional tf-idf column                

get_tf_idf <- function(.data, grouping){
  
  # count term frequency by group 
  df <- .data %>%
    dplyr::count(!! dplyr::sym(grouping), text) 
  
  # total number of words in each group (including the stopwords)
  tot_words <- .data %>%
    dplyr::count(!! dplyr::sym(grouping), word) %>%
    dplyr::group_by(!! dplyr::sym(grouping)) %>%
    dplyr::summarise(total_words = sum(n))
  
  # calculate term frequency tf 
  df <- dplyr::left_join(df, tot_words, by = grouping) %>% 
    dplyr::mutate(tf = n/total_words)
  
  # calculate inverse doc freq idf
  t <- df$text
  idf <- log(nrow(tot_words) / table(t))
  
  df$idf <- as.numeric(idf[t])
  df$`Term Frequency-Inverse Document Frequency` <- df$tf * df$idf
  df <- df %>% dplyr::select(!! dplyr::sym(grouping), text, `Term Frequency-Inverse Document Frequency`)
  
  dplyr::left_join(.data, df, by = c(grouping, "text"))
}
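
# Worked sketch (toy data, assuming the shape produced by the app's
# pre-processing: one row per term occurrence, with the term in `text`):
# toy <- data.frame(chapter = c(1, 1, 2),
#                   text    = c("whale", "sea", "whale"),
#                   word    = c("whale", "sea", "whale"))
# get_tf_idf(toy, "chapter")
# "whale" occurs in both chapters, so idf = log(2/2) = 0 and its tf-idf is 0;
# "sea" occurs only in chapter 1, so its tf-idf = (1/2) * log(2/1), roughly 0.35.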

#########################################################
##################### For readability, word tree, and lexical 
##################### dispersion plot
#########################################################

#' Adds section column to dataframe
#'
#' @param .data data frame as per output of pre-processing (still with stopwords)
#'
#' @param section_by character name of what to section over
#'
#' @return input dataframe with additional section column
#'

section_for_merge_id <- function(.data, section_by){
  # look-up table mapping the chosen sectioning unit to the helper that
  # extracts it from the text (helpers defined elsewhere in the app)
  sec_table <- list("chapter" = get_chapters,
                    "part" = get_parts,
                    "section" = get_sections,
                    "canto" = get_cantos,
                    "book" = get_books)
  .data %>%
    dplyr::group_by(id) %>%
    dplyr::mutate(!! section_by := sec_table[[section_by]](text))
}


#' Groups the text by id and collapses them together into one long string
#'
#' @param x data frame as per output of pre-processing 
#'
#' @param source character name of text source
#'
#' @param groups character name of the column to merge the text over; if not
#'   supplied, all of the text is merged into a single document
#'
#' @param filter optional value of `groups`; if supplied, only rows matching it are kept
#'
#' @return data frame with new groupings and text within each group merged together
#' 

merge_id <- function(x, source, groups, filter = NULL){
  if (isTruthy(filter)){
    x <- x %>%
      dplyr::filter(!! dplyr::sym(groups) == filter)
  }
  
  if (isTruthy(groups)){
    # NOTE: relies on the Shiny `input` object being visible in the calling scope
    if (input$section_by == groups) {
      # if user sections the text by chapter/book/canto in the beginning, 
      # add in the column for the sectioning
      x <- x %>%
        section_for_merge_id(groups)
      
      # and merge the text by these columns
      by_section <- x %>%
        dplyr::group_by(id, !! dplyr::sym(groups)) %>%
        dplyr::mutate(text = paste(text, collapse = " ")) %>%
        dplyr::distinct(text) %>% dplyr::ungroup() %>%
        dplyr::mutate(id = paste(id, groups))
      by_section
    } else {
      by_chosen <- x %>% 
        dplyr::group_by(!! dplyr::sym(groups)) %>%
        dplyr::mutate(text = paste(text, collapse = " ")) %>%
        dplyr::distinct(text) %>% dplyr::ungroup() %>%
        dplyr::mutate(id = !! dplyr::sym(groups))
      by_chosen
    }
  } else {
    all_merged <- x %>% 
      dplyr::mutate(text = paste(text, collapse = ". ")) %>%
      dplyr::distinct(text) %>% 
      dplyr::mutate(id = "Text")
    all_merged
  }
  
}
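
# Illustrative call (object and column names are hypothetical; merge_id() must
# be called where the Shiny `input` object is in scope, since it reads
# input$section_by):
# merged <- merge_id(tokens, source = "gutenberg", groups = "chapter")
# merged then has one row per group level (or per id/section combination),
# with that group's text collapsed into a single string and `id` updated accordingly.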



#' Creates kwic object to pass into textplot_xray() and for the concordance table
#'
#' @param merged data frame as per output of merge_id()
#'
#' @param patt pattern(s) to find in the text (comma-separated)
#' 
#' @param value type of pattern matching: "glob", "regex", or "fixed"
#' 
#' @param window how many words are displayed around the keyword
#' 
#' @param case_ins logical - case-insensitive pattern matching?
#'
#' @return kwic object with one row per keyword match and its surrounding context
#'

get_kwic <- function(merged, patt, value, window, case_ins){
  words <- quanteda::phrase(unlist(strsplit(patt, split = ",")))
  corp <- quanteda::corpus(merged, text_field = "text",
                 docid_field = "id")
  quanteda::kwic(x = corp, pattern = words, window = window, valuetype = value, case_insensitive = case_ins)
  
}
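
# Sketch of a typical call (pattern, window, and object names are illustrative):
# kw <- get_kwic(merged, patt = "whale, sea", value = "glob",
#                window = 5, case_ins = TRUE)
# as.data.frame(kw)  # one row per match, with pre/keyword/post context columns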

# #########################################################
# ##################### For readability
# #########################################################
# 
# # After merge_id() merges all the text for each id into one long string, 
# # books_with_samples() takes in this data frame containing the id and the merged text, 
# # then calculates the Flesch Kincaid score for each. 
# # Afterwards binding these together with the sample texts in samples.R and 
# # arranging the texts in ascending Flesch Kincaid score. 
# 
# books_with_samples <- function(books){
#   # calculate the FK score of the text for each id
#   books$scores <- lapply(books$text, quanteda::textstat_readability, measure = "Flesch.Kincaid")
#   
#   # extract the scores and place them in their own column in the data frame
#   books$FK <- unlist(lapply(books[["scores"]], function(x) x[[2]])) 
#   
#   # no sample excerpts for the texts provided
#   books$excerpt <- ""
#   books <- books %>% dplyr::select(id, FK, excerpt)
#   
#   # bind the provided texts with the sample (reference) texts
#   samps <- rbind.data.frame(samples, books)
#   samps %>% dplyr::arrange(FK)
# }

#########################################################
##################### cleaning text
#########################################################

#' Does basic cleaning before processing (converts encoding, removes HTML tags
#' and entities, expands contractions, ...)
#'
#' @param df data frame with a column `text` containing the text to be cleaned
#' 
#' @param exp_cont logical - whether or not to expand contractions
#' 
#' @param lyrics logical - is the text from Genius? (different encoding)
#'
#' @return data frame with the `text` column cleaned

clean_for_app <- function(df, exp_cont = TRUE, lyrics = FALSE){
  
  if (lyrics == FALSE){
    Encoding(df$text) <- "UTF-8"
    df$text <- iconv(df$text, "UTF-8", "UTF-8", "")
  } else {
    Encoding(df$text) <- "UNICODE"
  }
  
  # replaces the fancy apostrophes (for replacing contractions later on)
  if (lyrics == FALSE) {
    df$text <- gsub(intToUtf8(8217), "'", df$text, perl = TRUE)
  }
  
  # For Guardian articles: drop figure captions and "Related" asides
  df$text <- gsub("<figcaption.+?</figcaption>|Related.+?</aside>", "", df$text)
  
  # strip any remaining HTML tags
  df$text <- gsub("<.+?>", "", df$text)

  # Decodes common HTML entities 
  df$text <- gsub("&amp;", "&", df$text)
  df$text <- gsub("&quot;", '"', df$text)
  df$text <- gsub("&#039;|&#39;", "'", df$text)
  
  df$text <- gsub("&.*\\w+?;", " ", df$text)
  
  if (exp_cont == TRUE){df$text <- textclean::replace_contraction(df$text)}
  
  # Replace Mr. with Mister ... for sentence tokenization 
  if (lyrics == FALSE) {
    df$text <- qdap::replace_abbreviation(df$text)
  }
  df$text <- gsub("^\"|\"$", "", df$text)
  
  return(df)
}
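
# Illustrative example (output shown roughly; exact spacing can vary with the
# installed textclean/qdap versions):
# clean_for_app(data.frame(text = "He said &quot;it&#39;s fine&quot; <b>loudly</b>",
#                          stringsAsFactors = FALSE))$text
# #> He said "it is fine" loudly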

#########################################################
##################### get song lyrics. 
##################### 
#########################################################

#' Downloads song lyrics from Genius and renames the columns to match the app
#'
#' @param artist character name of the artist
#'
#' @param song character name of the song
#'
#' @return data frame of lyrics, one row per line, with the lyric text in `text`,
#' the track title in `title`, and an added `artist` column

get_lyrics <- function(artist, song){
  df <- genius::genius_lyrics(artist = artist, song = song)
  names(df)[names(df) == "lyric"] <- "text"
  names(df)[names(df) == "track_title"] <- "title"
  df$artist <- artist
  df
}
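
# Illustrative call (requires the genius package and an internet connection;
# artist and song are placeholders):
# get_lyrics("Leonard Cohen", "Hallelujah")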


#########################################################
##################### From quanteda - for lexical dispersion plot
##################### changed appearance 
#########################################################

textplot_xray <- function(..., scale = c("absolute", "relative"),
                          sort = FALSE) {
  UseMethod("textplot_xray")
}

#' @export
textplot_xray.default <- function(..., scale = c("absolute", "relative"),
                                  sort = FALSE) {
  # grab the first argument so its class can be reported in the error message
  x <- list(...)[[1]]
  stop(friendly_class_undefined_message(class(x), "textplot_xray"))
}

#' @export
textplot_xray.kwic <- function(..., scale = c("absolute", "relative"),
                               sort = FALSE) {
  
  if (!requireNamespace("ggplot2", quietly = TRUE))
    stop("You must have ggplot2 installed to make a dispersion plot.")
  if (!requireNamespace("grid", quietly = TRUE))
    stop("You must have grid installed to make a dispersion plot.")
  
  position <- from <- keyword <- docname <- ntokens <- NULL
  
  kwics <- list(...)
  if (!all(vapply(kwics, quanteda::is.kwic, logical(1))))
    stop("objects to plot must be kwic objects")
  
  # create a data.table from the kwic arguments
  x <- data.table::data.table(do.call(rbind, kwics))
  # use old variable name
  x[, position := from]
  # get the vector of ntokens
  ntokensbydoc <- unlist(lapply(kwics, attr, "ntoken"))
  # add ntokens to data.table as an indexed "merge"
  x[, ntokens := ntokensbydoc[as.character(x[, docname])]]
  
  # replace "found" keyword with patterned keyword
  x[, keyword := unlist(lapply(kwics, function(l) l[["pattern"]]))]

  # pre-emptively convert keyword to factor before ggplot does it, so that we
  # can keep the order of the factor the same as the order of the kwic objects
  # x[, keyword := factor(keyword, levels = unique(keyword))]
  
  multiple_documents <- length(unique(x$docname)) > 1
  
  # Deal with the scale argument:
  # if there is a user-supplied value, use that after passing through
  # match.arg; if not, use relative for multiple documents and absolute
  # for single documents
  if (!missing(scale)) {
    scale <- match.arg(scale)
  }
  else {
    if (multiple_documents) {
      scale <- "relative"
    } else {
      scale <- "absolute"
    }
  }
  
  # Deal with the sort argument:
  if (sort) {
    x[, docname := factor(docname)] # levels are sorted by default
  } else {
    x[, docname := factor(docname, levels = unique(docname))]
  }
  
  if (scale == "relative")
    x[, position := position / ntokens]
  
  x[,"yvar"] = rnorm(nrow(x), mean = 0.2, sd = 0)
  
  plot <- ggplot2::ggplot(x, ggplot2::aes(x = position, y = yvar)) +
    ggplot2::geom_point(ggplot2::aes(size = 2), alpha = 0.13) +
    ggplot2::theme(axis.line = ggplot2::element_blank(),
                   panel.background = ggplot2::element_blank(),
                   panel.grid.major.y = ggplot2::element_blank(),
                   panel.grid.minor.y = ggplot2::element_blank(),
                   plot.background = ggplot2::element_blank(),
                   axis.ticks.y = ggplot2::element_blank(),
                   axis.text.y = ggplot2::element_blank(),
                   panel.spacing = grid::unit(0.1, "lines"),
                   panel.border = ggplot2::element_rect(colour = "gray", fill = NA),
                   strip.text.y = ggplot2::element_text(angle = 0)
    ) + ggplot2::ylim(0, 0.4)
  
  if (scale == "absolute")
    plot <- plot +
    ggplot2::geom_rect(ggplot2::aes(xmin = ntokens, xmax = max(x$ntokens),
                                    ymin = 0, ymax = 0.4), fill = "gray90")
  
  if (multiple_documents) {
    # If there is more than one document, put documents on the panel y-axis
    # and keyword(s) on the panel x-axis
    plot <- plot + ggplot2::facet_grid(docname ~ keyword) +
      ggplot2::labs(y = "Document", title = paste("Lexical dispersion plot"))
  }
  else {
    # If not, put keywords on the panel y-axis and the doc name in the title
    plot <- plot + ggplot2::facet_grid(keyword~.) +
      ggplot2::labs(y = "", title = paste("Lexical dispersion plot, document:",
                                          x$docname[[1]]))
  }
  
  if (scale == "relative") {
    plot <- plot + ggplot2::labs(x = "Relative token index")
  }
  else {
    plot <- plot + ggplot2::labs(x = "Token index")
  }
  
  # return both the underlying position data and the plot
  list(x, plot)
}
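
# Illustrative usage, combining get_kwic() above with this plot function:
# res <- textplot_xray(get_kwic(merged, "whale", "glob", 5, TRUE))
# res[[1]]  # data.table of keyword positions used for the plot
# res[[2]]  # the ggplot2 dispersion plot itself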


#########################################################
##################### From pushshift API 
##################### importing data from reddit
#########################################################

#' Gets Reddit posts or comments from the Pushshift API
#'
#' @param postType "submission" or "comment"
#' 
#' @param title Character string to search for in post titles
#' 
#' @param size Number of results to return, maximum is 1000
#' 
#' @param q Character string with search query
#' 
#' @param after Only search for posts made after this date, specified as a UNIX epoch time
#' 
#' @param before Only search for posts made before this date, specified as a UNIX epoch time
#' 
#' @param subreddit Only return posts made in this subreddit
#' 
#' @param nest_level How deep to search? nest_level = 1 returns only top-level comments
#' 
#' @return data frame with desired comments and submissions

getPushshiftData <- function(postType,
                             title = NULL,
                             size = NULL,
                             q = NULL,
                             after = NULL,
                             before = NULL,
                             subreddit = NULL,
                             nest_level = NULL) {
  if(postType == "submission") {
    base_url <- "https://api.pushshift.io/reddit/search/submission/"
    httr::GET(url = base_url, 
              query = list(title = title,
                           size = size,
                           q = q,
                           after = after,
                           before = before,
                           subreddit = subreddit,
                           nest_level = nest_level, 
                           sort = "asc")) %>%
      .$url %>%
      jsonlite::fromJSON() %>%
      .$data %>%
      jsonlite::flatten() %>%
      dplyr::select(author, title, selftext, created_utc, permalink, num_comments, score, subreddit) %>%
      dplyr::filter(!stringr::str_detect(author, "Moderator")) %>%
      tibble::as_tibble() %>%
      dplyr::rename(id = title) %>%
      dplyr::rename(text = selftext) %>%
      dplyr::arrange(created_utc)
    
  } 
  
  else {
    base_url <- "https://api.pushshift.io/reddit/search/comment/"
    httr::GET(url = base_url, 
              query = list(title = title,
                           size = size,
                           q = q,
                           after = after,
                           before = before,
                           subreddit = subreddit,
                           nest_level = nest_level, 
                           sort = "asc")) %>%
      .$url %>%
      jsonlite::fromJSON() %>%
      .$data %>%
      jsonlite::flatten() %>%
      dplyr::select(author, body, permalink, score, created_utc, subreddit) %>%
      dplyr::filter(!stringr::str_detect(author, "Moderator")) %>%
      tibble::as_tibble() %>%
      dplyr::rename(id = permalink) %>%
      dplyr::rename(text = body) %>%
      dplyr::arrange(created_utc)
  }
}
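
# Illustrative request (query, epoch times, and subreddit are placeholders):
# getPushshiftData("comment", q = "text analysis", size = 100,
#                  after = 1577836800, before = 1580515200, subreddit = "rstats")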

#' Repeatedly calls getPushshiftData(), paging forward from the created_utc of
#' the last row returned, until the API returns no further results
#'
#' @param delay Seconds to pause (via Sys.sleep()) before the function returns;
#'   the remaining parameters are passed straight through to getPushshiftData()
#'
#' @return data frame binding together all pages of results

getPushshiftDataRecursive <- function(postType = "submission",
                                      title = NULL,
                                      size = NULL,
                                      q = NULL,
                                      after = NULL,
                                      before = NULL,
                                      subreddit = NULL,
                                      nest_level = NULL,
                                      delay = 0) {

  tmp <- getPushshiftData(postType,
                          title,
                          size,
                          q,
                          after,
                          before,
                          subreddit,
                          nest_level)
  
  # zero-row copy of tmp to use as the accumulator; if a later request fails,
  # on.exit() still returns whatever has been fetched so far
  out <- tmp %>% dplyr::filter(FALSE)
  on.exit(return(out), add = TRUE)
  after <- data.table::last(tmp$created_utc)
  
  while(nrow(tmp) > 0) {
    message(
      sprintf("%d %ss fetched, last date fetched: %s \n",
              nrow(tmp),
              postType,
              as.Date(as.POSIXct(as.numeric(after), origin = "1970-01-01"))))
    out <- rbind(out, tmp)
    after <- data.table::last(tmp$created_utc)
    tmp <- getPushshiftData(postType,
                            title,
                            size,
                            q,
                            after,
                            before,
                            subreddit,
                            nest_level)
  }
  Sys.sleep(delay)
}
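
# Paged-fetch sketch (placeholder values; each call pages forward from the
# created_utc of the last row returned):
# getPushshiftDataRecursive("submission", q = "shiny", size = 500,
#                           after = 1546300800, subreddit = "rstats", delay = 1)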


#########################################################
##################### Error handling for plots in shiny
##################### (used in lexical div plot)
#########################################################

#' To present error message in plot outputs in shiny 
#'
#' @param ... Text strings to be printed in the plot window 
#' 
#' @param sep Character string to separate the strings provided in ...
#' 
#' @return Plot with text strings printed on it


plot_exception <- function(..., sep = " "){
  txt <- paste(c(...), collapse = sep)
  print(ggplot2::ggplot() +
          ggplot2::geom_text(ggplot2::aes(x = 0, y = 0, label = txt), color = "red", size = 6) + 
          ggplot2::theme_void())
  invisible(NULL)
}
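
# Example: show a message in the plot area instead of a failed plot.
# plot_exception("No keyword matches found.", "Try a different pattern.")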

#########################################################
##################### preserve the emojis in tweets
##################### package lexicon also has sentiment corresponding to 
##################### emojis (for use in sentimentr)
#########################################################

#' Convert the emojis in a text vector to _their_description_
#'
#' @param x A character vector
#' 
#' @param emoji_dt data frame where a column called "x" contains the emoji in 
#' bytes and another column called "y" contains its description
#' 
#' @return Character vector with each emoji replaced by _its_description_
#' (spaces in the description replaced by underscores)

emoji_to_words <- function(x, emoji_dt = lexicon::hash_emojis){
  # standardise the descriptions, then convert the text to byte notation so the
  # byte-encoded emojis in emoji_dt can be matched and substituted
  emoji_dt[["y"]] <- gsub("-", "_", emoji_dt[["y"]])
  x <- iconv(x, "UTF-8", "ASCII", "byte")
  textclean::mgsub(x, emoji_dt[["x"]], paste0("_", gsub("\\s+", "_", emoji_dt[["y"]]), "_"))
}
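
# Illustrative example (the exact description text comes from
# lexicon::hash_emojis, so the wording may differ between lexicon versions):
# emoji_to_words("great show \U0001F600")
# #> "great show _grinning_face_"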

#' Same as above, but for emoticons 

emoticon_to_words <- function(x, emoticon_dt = lexicon::hash_emoticons){
  textclean::mgsub(x, emoticon_dt[["x"]], paste0("_", gsub("\\s+", "_", emoticon_dt[["y"]]), "_"))
}