R/module_nlp.R

library(modules)

nlp <- module({
  
  import("dplyr")
  import("ggraph")
  import("igraph")
  import("koRpus")
  import("ldatuning")
  import("mallet")
  import("naturalsort")
  import("pdftools")
  import("qdapDictionaries")
  import("textmineR")
  import("tidyr")
  import("tidytext")
  import("tm")
  import("tools")
  import("topicmodels")
  
  read_txt_files <- function(fnames, as_string = TRUE) {
    # Apply 'readlines()' function to each element of a character vector of .txt filenames.
    #
    # Arguments:
    #   fnames {char} -- vector of filenames to read and apply cleaning to
    # 
    # Keyword Arguments:
    #   as_string {logical} -- if TRUE, read each text file as a string rather than a vector
    #                          by collapsing lines separated by '\n'
    #
    # Returns:
    #   {named char} -- if (as_string = TRUE); character vector of textfile contents as strings
    #   {list} -- if (as_string = FALSE); list of textfile contents as character vectors
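    #
    # Example (hypothetical filenames):
    #   txts = read_txt_files(c("notes1.txt", "notes2.txt"))
    #   names(txts)  # "notes1.txt" "notes2.txt"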
    
    stopifnot(is.character(fnames))
    stopifnot(is.logical(as_string))
    
    # Read files and collapse
    text = lapply(fnames, readLines)
    if (as_string) {
      text = unlist(lapply(text, function(x) paste0(x, collapse = "\n")))  
    }
    
    # Create named chr from filenames
    names(text) = fnames
    
    return(text)
  }
  
  rm_letters <- function(str, ltrs = LETTERS, upper_and_lower = TRUE) {
    # Remove strings like "b" or "A", or "W", or any other standalone
    # letter. Standalone letters are identified using word boundaries (\b).
    #
    # Arguments:
    #   str {char} -- character string to remove letters from
    #
    # Keyword Arguments:
    #   ltrs {char} -- letter or letters to remove from `str` (default: LETTERS)
    #   upper_and_lower {logical} -- if TRUE, remove both upper and lower case versions of `ltrs`
    #
    # Returns:
    #   {char} -- string with letters removed
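    #
    # Example:
    #   rm_letters("a cat sat on A mat")
    #   # Standalone 'a' and 'A' are replaced with spaces; pipe the result through
    #   # rm_excess_spaces() to collapse the leftover whitespace.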

    stopifnot(is.character(str))
    stopifnot(is.logical(upper_and_lower))

    # Get upper and lower case versions of `ltrs` if specified. If not, just use `ltrs` as it is
    if (upper_and_lower) {
      ltrs = c(toupper(ltrs), tolower(ltrs))
    }
    letters_regex = paste0("\\b", paste0(ltrs, collapse = "\\b|\\b"), "\\b")

    return( gsub(letters_regex, " ", str) )
  }

  rm_stopwords <- function(str, stopwords = tm::stopwords("en")){
    # Remove any word in a vector, stopwords, from another character vector.
    # You can pass in any list of "stopwords" to be removed from x
    # If unspecified, stopwords list defaults to list from 'tm' package
    #
    # Arguments:
    #   str {char} -- character string to remove stopwords from
    #
    # Keyword Arguments:
    #   stopwords {char} -- character vector of words to remove (default: tm::stopwords("en"))
    #
    # Function Dependencies:
    #   - tm
    #
    # Returns:
    #   {char} -- string with stopwords removed
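    #
    # Example:
    #   rm_stopwords("this is the text of interest")  # -> "text interest"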
    
    stopifnot(is.character(str))
    stopifnot(is.character(stopwords))

    # Collapse into regex string
    stopwords_regex = paste0("\\b", paste0(stopwords, collapse = "\\b|\\b"), "\\b")
    
    # Remove all stopwords by replacing with an empty string
    out = gsub(stopwords_regex, "", str)
    
    # Remove extra spaces
    out = rm_excess_spaces(out)
    
    return(out)
  }

  rm_punct <- function(str, except = NULL){
    # Remove punctuation from a character vector.
    #
    # Arguments:
    #   str {char} -- character string to remove punctuation from
    #
    # Keyword Arguments:
    #   except {char} or NULL -- character string specifying punctuation characters NOT to
    #                            remove (default: NULL). Ex: except = ".,|" -> will not remove
    #                            periods, commas or pipes.
    #
    # Returns:
    #   {char} -- character string with punctuation removed
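    #
    # Example:
    #   rm_punct("don't stop, believing!", except = "'")  # -> "dont stop believing"
    #   # Apostrophes listed in `except` are stripped rather than replaced with spaces.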

    stopifnot(is.character(str))
    stopifnot(is.character(except) || is.null(except))
    stopifnot(is.null(except) || length(except) == 1)

    # Original pattern specifying all punctuation (all characters that are
    # NOT alphanumeric)
    punct_pattern = "[^[:alnum:] "

    # If single quote ' is in "except", this needs to be treated differently.
    # It must be replaced with nothing (instead of a space) due to cases like
    # "don't". If it were treated the same, don't --> don t, which is not what
    # we want. We will simply remove the single quote before the rest.
    if(!is.null(except)){
      if(grepl("'", except)){
        str = gsub("'", "", str)
        except = gsub("'", "", except)
      }
    }

    # Add except if specified
    if(!is.null(except)) punct_pattern = paste0(punct_pattern, except)
    punct_pattern = paste0(punct_pattern, "]")  # Close unclosed square bracket

    # Apply removal of punctuation
    str = gsub(punct_pattern, " ", str)
    str = rm_excess_spaces(str)

    return(str)
  }

  rm_nonwords <- function(str, keep_numerics = TRUE, keep_acronyms = TRUE, keep_names = TRUE){
    # Remove non-English words from character string or vector using the Grady Augmented
    # dictionary. This will also remove all punctuation and automatically encode to UTF-8.
    #
    # Arguments:
    #   str {char} -- character string to remove nonwords from
    #
    # Keyword Arguments:
    #   keep_numerics {logical} -- if TRUE, any token that can be coerced to object of type
    #                              numeric using 'as.numeric()' is kept
    #   keep_acronyms {logical} -- if TRUE, any token consisting entirely of upper-case
    #                              characters is kept
    #   keep_names {logical} -- if TRUE, all tokens that are in qdapDictionaries::NAMES$name
    #                           are kept.
    #
    # Function Dependencies:
    #   - qdapDictionaries
    #
    # Returns:
    #   {char} -- character string with nonwords removed
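    #
    # Example:
    #   rm_nonwords("the qwzx cat sat on the NASA mat")
    #   # Tokens not in the Grady Augmented dictionary (e.g. "qwzx") are dropped,
    #   # while all-caps tokens such as "NASA" are kept when keep_acronyms = TRUE.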
    
    stopifnot(is.character(str))
    stopifnot(is.logical(keep_numerics))
    stopifnot(is.logical(keep_acronyms))
    stopifnot(is.logical(keep_names))
    
    # Prepare str vector
    str = enc2utf8(str)  # Encode to UTF-8
    str = rm_punct(str, except = "'-")  # Remove punct except ' and -
    str = rm_excess_spaces(str)
    
    # Separate words by splitting on spaces
    if(length(str) > 1) {
      # 'str' is a vector, not a string, so split each element of vector
      # on a space (result is a list)
      str = lapply(str, function(string) strsplit(string, split = " ")[[1]])
    } else {
      str = strsplit(str, split = " ")[[1]]  # Create vector from string    
    }
    
    # Apply removal of nonwords taking into account user-specified parameters
    filter_nonwords <- function(x) {
      # Find and remove invalid words
      # For each element of 'str', apply the following
      keep = tolower(x) %in% tolower(qdapDictionaries::GradyAugmented)
      
      if(keep_numerics == TRUE) {
        idx_numeric = which(!is.na(suppressWarnings(as.numeric(x))))
        keep[idx_numeric] = TRUE
      }
      if(keep_acronyms == TRUE) {
        # Separate numerics from acronyms
        idx_numeric = which(!is.na(suppressWarnings(as.numeric(x))))
        idx_acronym = which(x == toupper(x))
        idx_acronym = idx_acronym[!(idx_acronym %in% idx_numeric)]
        keep[idx_acronym] = TRUE
      }
      if(keep_names == TRUE) {
        idx_name = which(tolower(x) %in% tolower(qdapDictionaries::NAMES$name))    
        keep[idx_name] = TRUE
      }
      
      return(x[keep])
    }
    if(is.list(str)) {
      str_clean = lapply(str, filter_nonwords)
      str_clean = unlist(lapply(str_clean, function(x) paste0(x, collapse = " ")))
    } else {
      str_clean = filter_nonwords(str)
    }
    
    # Apply original document names if 'str' argument was a named chr
    if(!is.null(names(str))) names(str_clean) = names(str)
    
    return(str_clean)
  }

  rm_names <- function(str) {
    # Remove names from a character string or vector using regular expressions. Names
    # are those given in `qdapDictionaries::NAMES$name`.
    #
    # Arguments:
    #   str {char} -- character string to remove names from
    #
    # Function Dependencies:
    #   - qdapDictionaries
    #
    # Returns:
    #   {char} -- character string with names removed
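    #
    # Example:
    #   rm_names("emails from Alice and Bob about the merger")
    #   # Tokens found in qdapDictionaries::NAMES$name are replaced with spaces;
    #   # note the result is lower-cased by the substitution step.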
    
    stopifnot(is.character(str))

    # Get vector of names
    names = sort(tolower(qdapDictionaries::NAMES$name))

    # Must break into batches otherwise regular expression fails (too long)
    batch_size = 100
    names_list = split(names, ceiling(seq_along(names)/batch_size))
    names_vec = unname(unlist( lapply(names_list, function(x) {
      paste0("\\b", paste0(x, collapse = "\\b|\\b"), "\\b")
    }) ))

    # Sub out names by batch
    for(i in seq_along(names_vec)) {
      str = gsub(names_vec[i], " ", tolower(str))
    }

    return(str)
  }

  rm_excess_spaces <- function(str) {
    # Trim whitespace on the ends of a string, and replace instances of two or more spaces
    # with a single space.
    #
    # Arguments:
    #   str {char} -- character string to remove excess whitespace from
    #
    # Returns:
    #   {char} -- character string with excess whitespace removed
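    #
    # Example:
    #   rm_excess_spaces("  too   many    spaces ")  # -> "too many spaces"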
    
    stopifnot(is.character(str))
    
    return( trimws(gsub("\\s+", " ", str)) )
  }
  
  replace_non_ascii <- function(text){
    # Replace any non-English characters with Latin counterparts.
    #
    # Arguments:
    #   text {char} -- character string
    #
    # Returns:
    #   {char}
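    #
    # Example:
    #   replace_non_ascii("Cliché façade")  # -> "Cliche facade"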
    
    # Define characters
    unwanted_array = list(
      "Š"="S", "š"="s", "Ž"="Z", "ž"="z", "À"="A", "Á"="A", "Â"="A", "Ã"="A",
      "Ä"="A", "Å"="A", "Æ"="A", "Ç"="C", "È"="E", "É"="E", "Ê"="E", "Ë"="E",
      "Ì"="I", "Í"="I", "Î"="I", "Ï"="I", "Ñ"="N", "Ò"="O", "Ó"="O", "Ô"="O",
      "Õ"="O", "Ö"="O", "Ø"="O", "Ù"="U", "Ú"="U", "Û"="U", "Ü"="U", "Ý"="Y",
      "Þ"="B", "ß"="Ss", "à"="a", "á"="a", "â"="a", "ã"="a", "ä"="a",
      "å"="a", "æ"="a", "ç"="c", "è"="e", "é"="e", "ê"="e", "ë"="e", "ì"="i",
      "í"="i", "î"="i", "ï"="i", "ð"="o", "ñ"="n", "ò"="o", "ó"="o", "ô"="o",
      "õ"="o", "ö"="o", "ø"="o", "ù"="u", "ú"="u", "û"="u", "ü"="u", "ý"="y",
      "þ"="b", "ÿ"="y"
    )
    
    # Execute replacement
    out = chartr(
      old = paste(names(unwanted_array), collapse = ""),
      new = paste(unwanted_array, collapse = ""),
      x   = text
    )
    
    return(out)
  }
  
  dtm <- function(doc_vec, tfidf = FALSE){
    # Create a document-term matrix from a character vector using the 'tm' package.
    #
    # Arguments:
    #   doc_vec {char} -- character string or vector containing text to run DTM on, often a corpus
    #
    # Keyword Arguments:
    #   tfidf {logical} -- if TRUE, return TF-IDF scores instead of counts (default: FALSE)
    #
    # Function Dependencies:
    #   - tm
    #
    # Returns:
    #   {matrix} -- document-term matrix cast as matrix
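    #
    # Example:
    #   m = dtm(c("the cat sat", "the dog ran"))
    #   dim(m)  # 2 rows (documents) x one column per unique term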
    
    library(tm)

    stopifnot(is.character(doc_vec))
    stopifnot(is.logical(tfidf))
    
    # Prepare corpus
    doc_vec = tolower(doc_vec)
    corpus = tm::Corpus(tm::VectorSource(doc_vec))
    
    # Create DTM to user specification
    if(tfidf==TRUE) {
      dtm = tm::DocumentTermMatrix(
        corpus,
        control = list(weighting = weightTfIdf))
    } else {
      dtm = tm::DocumentTermMatrix(corpus)
    }
    
    # Return as matrix object, set names if doc_vec is a named chr
    dtm = as.matrix(dtm)
    if(!is.null(names(doc_vec))) rownames(dtm) = names(doc_vec)
    
    return(dtm)
  }
  
  clean_corpus <- function(doc_vec) {
    # Clean a character vector or string by:
    #    - Removing excess whitespace
    #    - Removing punctuation
    #    - Converting to lower case
    #    - Removing stopwords
    #
    # Arguments:
    #   doc_vec {char} -- character string or vector representing corpus to clean
    #
    # Function Dependencies:
    #   - tm
    #
    # Returns:
    #   {Corpus} -- cleaned tm Corpus object

    library(tm)

    crp = tm::Corpus(tm::VectorSource(doc_vec))
    crp_clean = tm::tm_map(crp, tm::stripWhitespace)
    crp_clean = tm::tm_map(crp_clean, removePunctuation)
    crp_clean = tm::tm_map(crp_clean, tm::content_transformer(tolower))
    crp_clean = tm::tm_map(crp_clean, removeWords, tm::stopwords("en"))

    return(crp_clean)
  }

  freqtable <- function(doc_vec){
    # Make word-frequency table from a character string or vector
    #
    # Arguments:
    #   doc_vec {char} -- character string or vector with text to create a frequency table from
    #
    # Returns:
    #   {dataframe} -- frequency table cast as dataframe
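    #
    # Example:
    #   freqtable(c("apples and oranges", "apples and pears"))
    #   # Returns a dataframe with columns "word" and "frequency", sorted by
    #   # decreasing frequency ("apples" and "and" each appear twice).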
    
    # Make document-term matrix
    document_term_matrix = dtm(doc_vec, tfidf = FALSE)
    
    # Get word frequencies from document-term matrix  
    freq = sort(colSums(document_term_matrix), decreasing = TRUE)
    words = names(freq)
    freq = unname(freq)
    
    # Bind into data frame and return
    df = data.frame(word = words, frequency = freq, stringsAsFactors = FALSE)
    df = df[order(-df$frequency),]  # Sort by decreasing frequency
    
    return(df)
  }

  ngram <- function(text, n = 2){
    # Create table of ngrams from a character string or vector
    #
    # Arguments:
    #   text {char} -- character string or vector containing text to create ngrams from
    #
    # Keyword Arguments:
    #   n {numeric} -- ngram window, defaults to bigrams (default: 2)
    #     Ex. n = 1 will create a word-frequency table
    #     Ex. n = 2 will create a table of bigrams
    #     Ex. n = 3 will create a table of trigrams
    #
    # Function Dependencies:
    #   - dplyr
    #
    # Returns:
    #   {dataframe} -- dataframe of ngrams with columns "word1", "word2", ... , "frequency"
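    #
    # Example:
    #   ngram(c("the quick brown fox", "the quick red fox"), n = 2)
    #   # Returns columns "word1", "word2", "frequency"; the bigram "the quick"
    #   # has frequency 2.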

    library(dplyr)

    # Clean up character vector by removing NAs and punctuation excluding
    # apostrophe (') and convert to lower case
    text = text[!is.na(text)]
    text = gsub("[^[:alnum:][:space:]']", " ", text)
    text = gsub("\\s+", " ", text)
    text = as.character(tolower(text))
    
    # Create bigrams data frame
    columns = paste0("word", 1:n)
    bigrams = data.frame(text = text) %>%
        tidytext::unnest_tokens(bigram, text, token = "ngrams", n = n) %>%
        dplyr::count(bigram, sort = TRUE) %>%
        tidyr::separate(bigram, columns, sep = " ") %>%
        as.data.frame()
        
    # Rename columns and rows
    colnames(bigrams) = c(paste0("word", 1:n), "frequency")
    rownames(bigrams) = NULL
    
    return(bigrams)
  }

  ldatuning <- function(text, cores, start, end, by, verbose = FALSE) {
    # Run ldatuning on a collection of texts.
    #
    # Assign the result of this function to a variable to save the results.
    #   Example:
    #     ldatuning_result = ldatuning(text, 4, 2, 20, 2)
    #     ldatuning::FindTopicsNumber_plot(ldatuning_result)
    #
    # This program will create a plot detailing the optimal number of topics to
    # use when running a topic model on a dataset. It uses four metrics:
    #   * Griffiths 2004
    #   * Cao-Juan 2009
    #   * Arun 2010
    #   * Deveaud 2014
    # Using these metrics and the output plot, you can deduce not only the optimal
    # number of topics but also a range of acceptable numbers of topics.
    #    
    # Look for the point at which Cao-Juan and Arun are at a MINIMUM, and
    # Griffiths and Deveaud are at a MAXIMUM.
    #
    # Arguments:
    #   text {char} -- character vector of texts
    #   cores {numeric} -- number of cores to run LDATuning with
    #   start {numeric} -- starting number of sequence of topics
    #   end {numeric} -- ending number of sequence of topics
    #   by {numeric} -- increment number of topics
    #
    # Keyword Arguments
    #   verbose {logical} -- if TRUE, print updates to STDOUT (default: FALSE)
    #
    # Function Dependencies:
    #   - ldatuning
    #
    # Returns:
    #   {ldatuning::FindTopicsNumber}
    #
    # Example:
    #   If START = 2, END = 20, BY = 1, then this program will test all topics:
    #   2, 3, 4, 5, ..., 20
    # Example:
    #   If START = 5, END = 50, BY = 5, then this program will test all topics:
    #   5, 10, 15, ..., 50
    
    library(ldatuning)
    
    stopifnot(is.character(text))
    stopifnot(is.numeric(cores))
    stopifnot(is.numeric(start))
    stopifnot(is.numeric(end))
    stopifnot(is.numeric(by))
    stopifnot(is.logical(verbose))
    
    est_time_ldatuning = function(data_size_in_mb, start, end, by) {
      # Estimate time to run LDATuning based on size of data and topics.
      #
      # Declared average time based on trial 2-50 topics took 3.278738 days
      # at sum(1:49) = 1225 total topics tested
      # This averages out to 3.85419 minutes per topic/192.4 megabytes -->
      #   0.02003217 minutes per topic per megabyte,
      #   or 1.20193 seconds
      #
      # Arguments:
      #   data_size_in_mb {int} -- data size of text in megabytes
      #   start {numeric} -- starting number of sequence of topics
      #   end {numeric} -- ending number of sequence of topics
      #   by {numeric} -- increment number of topics
      #
      # Returns:
      #   {char}
      
      # Get the total number of topics using start, end and by
      number_of_topics = sum(seq(start, end, by))
      
      # Calculate estimated time
      est_time_sec = 1.20193 * data_size_in_mb * number_of_topics
      est_time = format_time_diff(est_time_sec)
      
      # Return character string indicating estimated length
      est_time_string = paste(est_time$value, est_time$unit)
      
      return(est_time_string)
    }
    
    # Load libraries
    if(verbose == TRUE) echo("Loading libraries 'ldatuning' 'topicmodels' 'tm'")
    packages = c("ldatuning", "topicmodels", "tm")
    invisible(lapply(packages, FUN = function(x) {
      if (!require(x, character.only = TRUE)) {
        install.packages(x, dependencies = TRUE)
        library(x, character.only = TRUE, quietly = TRUE)
      }
    }))
    if(verbose == TRUE) echo("DONE", indent = 1)
    
    # Make DTM
    if(verbose == TRUE) echo("Creating document-term matrix")
    corpus = tm::Corpus(tm::VectorSource(text))
    dtm = tm::DocumentTermMatrix(corpus)
    dtm = as.matrix(dtm)
    if(verbose == TRUE) echo("DONE", indent = 1)
    
    # Use four methods to find optimal number of topics
    if(verbose == TRUE) echo("Fitting models (this may take a while)")
    result = ldatuning::FindTopicsNumber(
      dtm,
      topics   = seq(from = start, to = end, by = by),
      metrics  = c('Griffiths2004', 'CaoJuan2009', 'Arun2010', 'Deveaud2014'),
      method   = 'Gibbs',
      control  = list(seed = 77),
      mc.cores = as.integer(cores),
      verbose  = FALSE)
    if(verbose == TRUE) echo("DONE", indent = 1)
    
    # Quickly visualize results
    if(verbose == TRUE) echo("Plotting results")
    ldatuning::FindTopicsNumber_plot(result)
    if(verbose == TRUE) echo("DONE", indent = 1)
    
    return(result)
  }
  
  pdf_to_text <- function(pdf_file) {
    # Converts a PDF document to TXT using library "pdftools"
    # 
    # Arguments:
    #   pdf_file {char} -- pdf file to read and convert to text
    #
    # Function Dependencies:
    #   - pdftools
    #
    # Returns:
    #   {char}
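    #
    # Example (hypothetical filename):
    #   txt = pdf_to_text("report.pdf")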
    
    library(pdftools)
    
    stopifnot(is.character(pdf_file))
    
    # Get text
    txt = pdftools::pdf_text(pdf_file)
    
    # Clean text
    txt = tolower(paste(unlist(txt), collapse = " "))
    txt = rm_excess_spaces(txt)
    
    return(txt)
  }
  
  nodegraph <- function(text) {
    # Given a character vector or string, create a nodegraph using the 'igraph'
    # package, by first extracting a table of bigrams from the text.
    #
    # Arguments:
    #   text {char} -- text to make nodegraph from
    #
    # Function Dependencies:
    #   - igraph
    #   - ggraph
    # 
    # Returns:
    #   {ggraph}
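    #
    # Example:
    #   nodegraph(c("the quick brown fox", "the quick red fox"))
    #   # Renders a bigram nodegraph with one node per word and one edge per bigram.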
    
    library(igraph)
    library(ggraph)
    
    stopifnot(is.character(text))
    
    # Make bigrams table
    bigrams = ngram(text, n = 2)
    
    # Make into graph object
    bigrams_graph = bigrams %>% graph_from_data_frame()
    
    # Make graph
    ggraph(bigrams_graph, layout = "fr") +
      geom_edge_link(
        aes(edge_alpha = frequency),
        show.legend = FALSE,
        arrow = grid::arrow(
          type = "open",
          length = unit(.10, "inches")
        ),
        end_cap = circle(.07, "inches")
      ) +
      geom_node_point(
        color = "lightblue",
        size = 3
      ) +
      geom_node_text(
        aes(label = name),
        repel = TRUE
      ) +
      theme_void()
  }
  
  lemmatize <- function(
    doc_vec,
    doc_names,
    outdir,
    lem_src_dir,
    rm_punct = FALSE,
    to_lower = TRUE,
    file_suffix = "_lemma",
    file_prefix = NULL,
    verbose = FALSE) {

    # Lemmatize a collection of text files.
    # 
    # HOW TO GET TREETAGGER SOURCE
    #  (1) Go to http://www.cis.uni-muenchen.de/~schmid/tools/TreeTagger/
    #  (2) Follow the instructions under the heading "Download".
    #  (3) Point this program to the lemmatizer source code via the 
    #      'lem_src_dir' parameter.
    #
    # Arguments:
    #   doc_vec {char} -- character vector of your text files to lemmatize
    #   doc_names {char} -- is a character vector of the names of those documents
    #   outdir {char} -- is the output directory where you would like each lemmatized
    #                    text file to be written to. The output of this program is
    #                    thus a lemmatized text file for each input file.
    #   lem_src_dir {char} -- path to the TreeTagger source. Instructions on how to get
    #                         these source files are given above.
    #
    # Keyword Arguments:
    #   rm_punct {logical} -- if TRUE, remove punctuation before lemmatizing (currently unused) (default: FALSE)
    #   to_lower {logical} -- if TRUE, lower-case text before lemmatizing (currently unused) (default: TRUE)
    #   file_prefix {char} -- prefix prepended to each output filename (default: NULL)
    #   file_suffix {char} -- suffix appended to each output filename (default: "_lemma")
    #   verbose {logical} -- if TRUE, print updates to STDOUT (default: FALSE)
    #
    # Function Dependencies:
    #   - koRpus
    #
    # Returns:
    #   nothing -- files written to output directory
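    #
    # Example (hypothetical paths; assumes TreeTagger is installed under `lem_src_dir`):
    #   lemmatize(
    #     doc_vec     = c("The cats were running"),
    #     doc_names   = c("doc1"),
    #     outdir      = "~/lemmatized",
    #     lem_src_dir = "~/treetagger"
    #   )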
    
    library(koRpus)
    
    stopifnot(is.character(doc_vec))
    stopifnot(is.character(doc_names))
    stopifnot(length(doc_vec) == length(doc_names))
    stopifnot(is.character(outdir))
    stopifnot(dir.exists(outdir))
    stopifnot(is.character(lem_src_dir))
    stopifnot(dir.exists(lem_src_dir))
    
    # Set the koRpus environment
    set.kRp.env(
      TT.cmd     = paste0(lem_src_dir, "/cmd/tree-tagger-english"),
      lang       = "en",
      preset     = "en",
      treetagger = "manual",
      format     = "file",
      TT.tknz    = TRUE,
      encoding   = "UTF-8",
      TT.options = list(
        path   = lem_src_dir,
        preset = "en"
      )
    )
    if(verbose == TRUE) echo("koRpus environment set")
    
    # If outdir doesn't end in a slash, append it
    if(!grepl(".*\\/$", outdir)) outdir = paste0(outdir, "/")
    
    # New filenames
    if(is.null(file_suffix)) file_suffix = ""
    if(is.null(file_prefix)) file_prefix = ""
    new_files = paste0(outdir, file_prefix, basename(doc_names), file_suffix, ".txt")
    
    # Execute lemmatization. This treetagger only works with file connections,
    # not R objects. So each text file must be temporarily written to the
    # file system.
    if(verbose == TRUE) echo("Initiating lemmatization...")
    if(dir.exists("~/.lemma_r_tmp")) unlink("~/.lemma_r_tmp", recursive = TRUE)
    dir.create("~/.lemma_r_tmp")
    
    for(i in 1:length(doc_vec)) {
      
      # Write temporary file
      tmp_file = file.path("~/.lemma_r_tmp", basename(doc_names[i]))
      writeLines(text = doc_vec[i], con = tmp_file)
      
      # Treetag. Reset 'lem' each iteration so a failed file is skipped rather
      # than silently reusing the previous file's lemmas.
      lem = NULL
      tryCatch({
        
        tagged_words = treetag(
          file       = tmp_file,
          treetagger ="manual",
          lang       = "en",
          TT.options = list(
            path   = lem_src_dir,
            preset = "en"
          )
        )
        results = tagged_words@TT.res
        lem = results$lemma
        
      }, error = function(e) {
        
        # Write error file and proceed
        writeLines(text = "",
          con = paste0(dirname(new_files[i]),
            "/ERROR_",
            basename(new_files[i])))
        echo("ERROR in file %s", doc_names[i])
      })
      
      # Post-process and write output only if tagging succeeded
      if (!is.null(lem)) {

        # Replace "@card@" with the actual number (cardinal number)
        lem[lem == "@card@"] = results$token[lem == "@card@"]

        # Delete "<unknown>"
        if(length(which(lem == "<unknown>"))) lem = lem[-which(lem == "<unknown>")]

        # Collapse into character string
        lem = paste0(lem, collapse = " ")

        # Write output file
        writeLines(text = lem, con = new_files[i])
      }
      
      # Remove temporary file
      file.remove(tmp_file)
      
      # Notify
      if(verbose == TRUE) {
        echo(sprintf("%s Lemmatized %s of %s (%s%%)",
          Sys.time(), i, length(doc_names),
          round(i/length(doc_names)*100, 2)) )
      }
      
    }
    unlink("~/.lemma_r_tmp", recursive = TRUE)
    
    # End
    if(verbose == TRUE) echo("Lemmatization complete")
  }

  topic_model <- function(
    doc_vec,
    doc_names     = NULL,
    num_topics    = 10,
    top_terms     = 10,
    alpha         = 50 / num_topics,
    estimate_beta = TRUE,
    delta         = 0.1,
    burnin        = 4000,
    iter          = 2000,
    thin          = 500,
    nstart        = 5,
    best          = TRUE,
    stem          = FALSE,
    seed          = list(2003, 5, 63, 100001, 765),
    verbose       = TRUE) {

    # Run topic model using 'topicmodels' package.
    #
    # Arguments:
    #   doc_vec {char} -- texts to run topic model on, may be named chr with names as doc_names
    #
    # Keyword Arguments:
    #   doc_names {char} -- vector of text doc_names
    #   num_topics {numeric} -- number of topics to run
    #   top_terms {numeric} -- number of terms to retrieve for each topic
    #   alpha {numeric} -- hyperparameter: alpha
    #   estimate_beta {logical} -- if TRUE, estimate hyperparameter beta
    #   delta {numeric} -- hyperparameter: delta
    #   burnin {numeric} -- burn in iterations
    #   iter {numeric} -- iterations to run
    #   thin {numeric} -- hyperparameter: thin
    #   nstart {numeric} -- Gibbs sampling parameter
    #   best {logical} -- if TRUE, optimize
    #   stem {logical} -- if TRUE, stem corpus before running Gibbs sampling
    #   seed {list} -- random seed list
    #   verbose {logical} -- if TRUE, print updates to STDOUT
    #
    # Function Dependencies:
    #   - tm
    #   - topicmodels
    #
    # Returns:
    #   model = list(
    #     dtm            = dtm,
    #     freq           = freq,
    #     ldaOut         = ldaOut,
    #     topic_words    = ldaOut.terms,
    #     doc_top_topics = ldaOut.topics,
    #     doc_topics     = topicProbabilities
    #   )
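    #
    # Example ('my_texts' is a placeholder for your own character vector of documents):
    #   tm_out = topic_model(doc_vec = my_texts, num_topics = 5, top_terms = 10)
    #   tm_out$topic_words      # Top terms per topic
    #   tm_out$doc_top_topics   # Most likely topic per document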

    library(tm)
    library(topicmodels)

    # Assign filenames if not specified
    if (is.null(doc_names)) {
      doc_names = paste0("textfile", 1:length(doc_vec))
    }

    stopifnot(is.character(doc_vec))
    stopifnot(is.character(doc_names))
    stopifnot(length(doc_vec) == length(doc_names))
    stopifnot(length(doc_vec) > 0)
    stopifnot(is.numeric(num_topics))
    stopifnot(is.numeric(top_terms))
    stopifnot(is.numeric(alpha))
    stopifnot(is.logical(estimate_beta))
    stopifnot(is.numeric(delta))
    stopifnot(is.numeric(burnin))
    stopifnot(is.numeric(iter))
    stopifnot(is.numeric(thin))
    stopifnot(is.numeric(nstart))
    stopifnot(is.logical(best))
    stopifnot(is.logical(stem))
    stopifnot(is.logical(verbose))
    stopifnot(is.list(seed))


    # Transform pattern to space
    to_space = tm::content_transformer(function(x, pattern) {
      return (gsub(pattern, " ", x))
    })


    # -------------------- #
    # -- PREPARE CORPUS -- #
    # -------------------- #

    # Begin program
    if (verbose) echo("BEGINNING TOPIC MODEL", indent = 0)
    if (verbose) echo("-------------------------------", indent = 0)
    start_time = Sys.time()                # Begin program timing
    options(scipen = 999)                  # Turn off scientific notation

    # Create document-term matrix
    if (verbose) echo("Creating Corpus object...")
    docs = Corpus(VectorSource(doc_vec))
    docs = tm_map(docs, content_transformer(tolower))
    if (verbose) echo("DONE", indent = 1)

    # Remove problematic symbols
    if (verbose) {
      echo("Cleaning corpus (removing symbols, numbers, punctuation, whitespace)...")
    }
    docs = tm_map(docs, to_space, "-")
    docs = tm_map(docs, to_space, "’")
    docs = tm_map(docs, to_space, "‘")
    docs = tm_map(docs, to_space, "•")
    docs = tm_map(docs, to_space, '"')
    docs = tm_map(docs, to_space, "'")

    # Remove punctuation, digits, stopwords, whitespace
    docs = tm_map(docs, removeWords, stopwords("english"))
    docs = tm_map(docs, removePunctuation)
    docs = tm_map(docs, removeNumbers)
    docs = tm_map(docs, stripWhitespace)
    docs = tm_map(docs, content_transformer(trimws))
    if (verbose) echo("DONE", indent = 1)

    # Stem
    if (stem == TRUE) {
      if (verbose) echo("Stemming corpus...")
      docs = tm_map(docs, stemDocument)
      if (verbose) echo("DONE", indent = 1)
    }


    # ---------------------- #
    # -- DTM & Freq Table -- #
    # ---------------------- #

    # Create document-term matrix
    if (verbose) echo("Creating document-term matrix...")
    dtm = DocumentTermMatrix(docs)
    rownames(dtm) = doc_names
    if (verbose) echo("DONE", indent = 1)


    # Collapse matrix by summing over columns
    if (verbose) echo("Creating frequency table...")
    freq = colSums(as.matrix(dtm))

    # List all terms in decreasing order of freq and write to disk
    freq = freq[order(freq, decreasing = TRUE)]
    if (verbose) echo("DONE", indent = 1)


    # ----------------- #
    # -- TOPIC MODEL -- #
    # ----------------- #

    # Run LDA using Gibbs sampling
    if (verbose) echo("Running topic model. This could take a while...")
    ldaOut = LDA(
      x      = dtm,
      k      = num_topics,
      method = "Gibbs",
      control = list(
        alpha         = alpha,
        estimate.beta = estimate_beta,
        delta         = delta,
        nstart        = nstart,
        best          = best,
        burnin        = burnin,
        iter          = iter,
        thin          = thin,
        seed          = seed
      )
    )
    if (verbose) echo("DONE", indent = 1)


    # ------------------------- #
    # -- TOPIC MODEL OBJECTS -- #
    # ------------------------- #

    # Docs top topics
    if (verbose) echo("Creating doc top topics...")
    ldaOut.topics = data.frame(
      File             = names(topics(ldaOut)),
      Top_Topic        = unname(topics(ldaOut)),
      stringsAsFactors = FALSE
    )
    if (verbose) echo("DONE", indent = 1)

    # Top terms in each topic
    if (verbose) echo("Creating topic words...")
    ldaOut.terms = as.data.frame(terms(ldaOut, top_terms))
    ldaOut.terms = data.frame(lapply(ldaOut.terms, as.character), stringsAsFactors = FALSE)
    ldaOut.terms = cbind(RowID = 1:nrow(ldaOut.terms), ldaOut.terms)
    colnames(ldaOut.terms) = gsub(" |\\.", "", colnames(ldaOut.terms))
    rownames(ldaOut.terms) = NULL
    if (verbose) echo("DONE", indent = 1)

    # Probabilities associated with each topic assignment (doc topics)
    if (verbose) echo("Creating doc topics...")
    topicProbabilities = as.data.frame(ldaOut@gamma)
    colnames(topicProbabilities) = paste0("Topic", 1:num_topics)
    topicProbabilities =
      cbind(RowID = 1:nrow(topicProbabilities),
          Document = as.character(doc_names), topicProbabilities)
    rownames(topicProbabilities) = NULL
    if (verbose) echo("DONE", indent = 1)

    # Collect all results in a single list
    model = list(
      dtm            = dtm,
      freq           = freq,
      ldaOut         = ldaOut,
      topic_words    = ldaOut.terms,
      doc_top_topics = ldaOut.topics,
      doc_topics     = topicProbabilities
    )

    # End program
    if (verbose) {
      end_time = Sys.time()    # End program timing
      echo("-------------------------------", indent = 0)
      echo(sprintf("TOPIC MODEL COMPLETED IN %s %s",
        round(as.numeric(difftime(end_time, start_time)), 6),
        gsub("MINS", "MINUTES",
          gsub("SECS", "SECONDS",
            toupper(units(difftime(end_time, start_time))))) ))
    }

    return(model)
  }
  
  topic_model_textmineR <- function(
    doc_vec,
    doc_names  = NULL,
    num_topics = seq(2, 40, 2),
    iterations = 2000,
    top_terms  = 10) {

    # Run a topic model using textmineR package.
    #
    # Arguments:
    #   doc_vec {char} -- texts to run topic model on, may be named chr with names as doc_names
    #
    # Keyword Arguments:
    #   doc_names {char} -- vector of text doc_names (default: NULL)
    #   num_topics {numeric} -- candidate numbers of topics to evaluate (default: `seq(2, 40, 2)`)
    #   iterations {numeric} -- number of iterations to run (default: 2000)
    #   top_terms {numeric} -- number of terms to return in each topic (default: 10)
    # 
    # Function Dependencies:
    #   - textmineR
    #
    # Returns:
    #   {list} -- model = list(
    #               summary          # Summary table
    #               top_terms        # Top terms
    #               top_terms_prime  # Top terms prime for classifying new documents
    #               assignments      # DocTopics matrix
    #               num_docs         # Number of documents in each topic
    #               coherence        # Probabilistic coherence: measures statistical support for a topic
    #               labels           # Topic labels
    #               best_k           # Optimized number of topics
    #               r2               # R-squared of this model
    #               dtm              # Document-Term Matrix
    #             )
    #
    # plot(model$hclust)  # Topic cluster dendrogram
    # model$tf[order(-model$tf$term_freq),]  # Inspection of Document-Term Matrix
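    #
    # Example ('my_texts' is a placeholder for your own character vector of documents):
    #   model = topic_model_textmineR(doc_vec = my_texts, num_topics = seq(2, 10, 2))
    #   model$best_k   # Number of topics with the highest average coherence
    #   model$summary  # Summary table, one row per topic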

    library(textmineR)

    if (is.null(doc_names)) {
      doc_names = paste0("textfile", 1:length(doc_vec))
    }

    stopifnot(is.character(doc_vec))
    stopifnot(is.character(doc_names))
    stopifnot(length(doc_vec) == length(doc_names))
    stopifnot(is.numeric(num_topics))
    stopifnot(is.numeric(iterations))
    stopifnot(is.numeric(top_terms))

    # Make Document-Term matrix
    dtm = CreateDtm(
      doc_vec      = doc_vec,
      doc_names    = doc_names,
      ngram_window = c(1,1)
    )

    # Inspect
    tf = TermDocFreq(dtm = dtm)
    rownames(tf) = NULL

    # Eliminate words appearing less than 2 times or in more than half of the documents
    vocabulary = tf$term[ tf$term_freq > 1 & tf$doc_freq < nrow(dtm) / 2 ]
    dtm = dtm[, vocabulary]

    # Fit LDA models and select best number of topics
    k_list = num_topics  

    # Fit models based on multiple possible numbers of topics
    model_list = TmParallelApply(k_list, function(k) {
      m = FitLdaModel(
        dtm        = dtm,
        k          = k,
        iterations = iterations
      )
      m$k = k
      m$coherence = CalcProbCoherence(
        phi = m$phi,
        dtm = dtm,
        M   = top_terms
      )
      m
    }) 

    # Get coherence matrix used to evaluate best model
    coherence_mat = data.frame(
      k                = sapply(model_list, function(x) nrow(x$phi)),
      coherence        = sapply(model_list, function(x) mean(x$coherence)),
      stringsAsFactors = FALSE
    )

    # Select k based on maximum average coherence
    # phi is P(words|topics)
    # theta is P(topics|documents)
    model = model_list[which.max(coherence_mat$coherence)][[1]]
    model$best_k = coherence_mat$k[which.max(coherence_mat$coherence)]

    # Get the R-squared of this model
    model$r2 = CalcTopicModelR2(
      dtm = dtm,
      phi = model$phi,
      theta = model$theta
    )

    # Top terms of the model according to phi & phi-prime
    model$top_terms = GetTopTerms(
      phi = model$phi,
      M   = top_terms
    )

    # Phi-prime, P(topic | words) for classifying new documents
    model$phi_prime = CalcPhiPrime(
      phi    = model$phi,
      theta  = model$theta,
      p_docs = rowSums(dtm)
    )
    model$top_terms_prime = GetTopTerms(
      phi = model$phi_prime,
      M   = top_terms
    )

    # Give a hard in/out assignment of topics in documents
    model$assignments = model$theta
    model$assignments[model$assignments < 0.05] = 0
    model$assignments = model$assignments / rowSums(model$assignments)
    model$assignments[is.na(model$assignments)] = 0

    # Get topic labels using n-grams from the DTM
    model$labels = LabelTopics(
      assignments = model$assignments, 
      dtm         = dtm,
      M           = 2
    )

    # Probabilistic coherence: measures statistical support for a topic
    model$coherence = CalcProbCoherence(
      phi = model$phi,
      dtm = dtm,
      M = top_terms
    )

    # Number of documents in each topic
    model$num_docs = colSums(model$assignments > 0)

    # Cluster topics together in a dendrogram
    model$topic_linguistic_dist = CalcHellingerDist(model$phi)
    model$hclust = hclust(as.dist(model$topic_linguistic_dist), "ward.D")
    model$hclust$clustering = cutree(model$hclust, k = model$best_k)
    model$hclust$labels = paste(model$hclust$labels, model$labels[,1])

    # rect.hclust(model$hclust, k = length(unique(model$hclust$clustering)))
    # rect.hclust(model$hclust, k = 10)

    # Make a summary table
    model$summary = data.frame(
      topic     = rownames(model$phi),
      cluster   = model$hclust$clustering,
      labels    = model$labels,
      coherence = model$coherence,
      num_docs  = model$num_docs,
      top_terms = apply(model$top_terms, 2, function(x) {
        paste(x, collapse = ", ")
      }),
      top_terms_prime = apply(model$top_terms_prime, 2, function(x) {
        paste(x, collapse = ", ")
      }),
      stringsAsFactors = FALSE
    )
    model$summary = model$summary[order(model$hclust$clustering),]

    # Retrieve `tf` inspection of DTM from earlier, and DTM itself
    model$tf = tf
    model$dtm = dtm

    return(model)
  }

  docClusterCsim <- function(doc_vec, doc_names = NULL, num_clusters) {
    # Cluster documents using TFIDF and Cosine Similarity.
    #
    # Arguments:
    #   doc_vec {char} -- character vector of text files
    #   num_clusters {numeric} -- the desired number of clusters
    #
    # Keyword Arguments:
    #   doc_names {char} -- filenames of text files to appear in clustering (default: NULL)
    #
    # Function Dependencies:
    #   - textmineR
    #
    # Returns:
    #   {list} -- model = list(
    #               summary,        # {dataframe}
    #               cluster_words,  # {list}
    #               hclust,         # {hclust} (ready for plotting)
    #               dtm,            # document-term matrix
    #               tfidf,          # TF-IDF weighted matrix
    #               csim            # cosine similarity matrix
    #             )
    #
    # Original Source: 
    #   https://cran.r-project.org/web/packages/textmineR/vignettes/b_document_clustering.html
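    #
    # Example ('my_texts' / 'my_names' are placeholders for your own documents):
    #   clust = docClusterCsim(doc_vec = my_texts, doc_names = my_names, num_clusters = 5)
    #   clust$summary       # Top 5 words per cluster
    #   plot(clust$hclust)  # Document clustering dendrogram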
    
    
    # Create DTM
    library(textmineR)

    # Assign filenames if not specified
    if (is.null(doc_names)) {
      doc_names = paste0("textfile", 1:length(doc_vec))
    }

    dtm = textmineR::CreateDtm(
      doc_vec   = doc_vec,
      doc_names = doc_names,
      verbose   = FALSE
    )
    rownames(dtm) = basename(doc_names)
    tf_mat = textmineR::TermDocFreq(dtm)
    
    
    # TF-IDF and cosine similarity
    tfidf = t(dtm[ , tf_mat$term ]) * tf_mat$idf
    tfidf = t(tfidf)
    csim = tfidf / sqrt(rowSums(tfidf * tfidf))
    csim = csim %*% t(csim)
    
    
    # Clustering
    cdist = as.dist(1 - csim)
    hc = hclust(cdist, "ward.D")
    clustering = cutree(hc, num_clusters)
    # plot(hc, main = "Document Clustering", ylab = "", xlab = "", yaxt = "n")
    # rect.hclust(hc, num_clusters, border = "red")
    
    
    # Cluster words
    p_words = colSums(dtm) / sum(dtm)
    cluster_words = lapply(unique(clustering), function(x){
      rows = dtm[clustering == x, ]
      rows = rows[, colSums(rows) > 0]    # Drop words not in cluster
      colSums(rows) / sum(rows) - p_words[colnames(rows)]
    })
    
    
    # Summary table of top 5 words in each cluster
    cluster_summary = data.frame(
      cluster   = unique(clustering),
      num_docs  = as.numeric(table(clustering)),
      top_words = sapply(cluster_words, function(d){
        paste(
          names(d)[order(d, decreasing = TRUE)][1:5], 
          collapse = ", "
        )
      }),
      stringsAsFactors = FALSE
    )
    
    
    # Final list
    model = list(
      summary       = cluster_summary,
      cluster_words = cluster_words,
      hclust        = hc,
      dtm           = dtm,
      tfidf         = tfidf,
      csim          = csim
    )

    return(model)
  }

  topic_model_mallet <- function(
    fnames,
    outdir,
    alpha,
    hyper,
    iterations,
    num_topics,
    corrplot = FALSE,
    dendrogram = FALSE,
    dendro_bal = 0.5){
    
    # Run topic model using the MALLET library (via the 'mallet' R package).
    #
    # Arguments:
    #   fnames -- {char} Textfiles to run topic model on.
    #   outdir -- {char} Directory path to write outputs to.
    #   alpha -- {numeric} Set the Dirichlet alpha parameter. The lower the value
    #                      (can be negative) the more words in distribution that will
    #                      have probabilities approximating 0. It primarily affects
    #                      the distribution of words across the topic (which has a 
    #                      subsequent effect on topics across the corpus).
    #   hyper -- {numeric} Determines how strongly the alpha parameter affects the
    #                      overall model. Each time you run an optimization, the
    #                      topic probability distribution departs farther from the
    #                      initial homogeneous distribution. It primarily affects the
    #                      distribution of topics across the corpus (which has a
    #                      subsequent effect on words in each topic). If your goal
    #                      is to identify small numbers of texts about specific
    #                      themes in a large collection, then a lot of optimization
    #                      may be good. However, if your goal is to identify topics
    #                      typical of certain authors, periods, genres or some other
    #                      reasonably large subset of your collection, then optimize
    #                      a bit less.
    #   iterations -- {numeric} Set the number of times the Latent Dirichlet
    #                           Allocation (LDA) algorithm will be executed.
    #                           Trade-off between speed and quality. Higher
    #                           iterations generally leads to lower computation
    #                           speed but higher accuracy. 500 iterations is the
    #                           suggested baseline for up to 10,000 documents.
    #   num_topics -- {numeric} The desired number of topics.
    #
    # Keyword Arguments:
    #   corrplot -- {logical} If TRUE, a correlation matrix plot is outputted (default: FALSE)
    #   dendrogram -- {logical} If TRUE, 3 dendrograms are outputted. Topic, Documents,
    #                           Custom split. The 'Topic' dendrogram is created with
    #                           the balance set to 1. This means that only word-level
    #                           similarity is used. The 'Document' dendrogram is made
    #                           with balance set to 0, indicating only document-level
    #                           similarity is used. Any dendrogram balance inputted
    #                           here creates a third dendrogram using this parameter as
    #                           the balance. Set to NULL to not create the third
    #                           dendrogram (default: FALSE)
    #   dendro_bal -- {numeric} Between 0 and 1 (default: 0.5)
    #
    # Function Dependencies:
    #   - tm
    #   - mallet
    #   - naturalsort
    #   - corrplot
    #   - Hmisc
    #
    # Returns:
    #   {list} -- list(
    #               TopicWords,         # topic-word weights, one row per word
    #               TopicWords_Ordered, # words ordered by weight within each topic
    #               DocTopics,          # document-topic weights, one row per document
    #               DocTopics_Ordered   # documents ordered by weight within each topic
    #             )
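    #
    # Example (hypothetical paths and parameter values):
    #   res = topic_model_mallet(
    #     fnames     = list.files("~/texts", pattern = "\\.txt$", full.names = TRUE),
    #     outdir     = "~/topic_model_output",
    #     alpha      = 20,
    #     hyper      = 50,
    #     iterations = 500,
    #     num_topics = 10
    #   )
    #   res$TopicWords  # Topic-word weights, one row per word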
    
    
    # May need this depending on Java issues
    # dyn.load("/Library/Java/JavaVirtualMachines/jdk-9.0.1.jdk/Contents/Home/lib/server/libjvm.dylib")
    
    library(tm)
    library(tools)
    library(mallet)
    library(naturalsort)

    stopifnot(is.character(fnames))
    stopifnot(length(fnames) > 0)
    stopifnot(is.character(outdir))
    stopifnot(dir.exists(outdir))
    stopifnot(is.numeric(alpha))
    stopifnot(is.numeric(hyper))
    stopifnot(is.numeric(iterations))
    stopifnot(is.numeric(num_topics))
    stopifnot(is.logical(corrplot))
    stopifnot(is.logical(dendrogram))
    stopifnot(is.numeric(dendro_bal))

    options(scipen = 999)

    read_file = function(fname){
      # Read and format a text file using proper encoding.
      #
      # Arguments:
      #   fname {char} -- textfile to read, must have .txt extension
      #
      # Returns:
      #   {char} -- contents of textfile
      
      library(tools)

      stopifnot(file.exists(fname))
      stopifnot(tolower(tools::file_ext(fname)) == "txt")

      text = scan(fname, what = "character", sep = "\n")
      text = iconv(text, "WINDOWS-1252", "UTF-8")
      text = tolower(text)
      text = paste(text, collapse = " ")
      return(text)
    }
    write_topic_df <- function(df, fname) {
      # Write topic model dataframe. Shorthand for `write.csv()` with arguments.
      #
      # Arguments:
      #   df {dataframe} -- dataframe to write
      #   fname {char} -- filename to write to
      #
      # Returns:
      #   nothing

      stopifnot(is.data.frame(df))
      stopifnot(is.character(fname))

      suppressWarnings(
        write.csv(
          df,
          file      = fname,
          row.names = TRUE,
          quote     = FALSE
        )
      )
    }
    write_corrplot <- function(df_corr, outdir, fname, title) {
      # Write corrplot as a PNG image on file system.
      #
      # Arguments:
      #   df_corr {matrix} -- correlation matrix (or dataframe)
      #   outdir {char} -- directory to save output PNG image to
      #   fname {char} -- name of the PNG file written inside `outdir`
      #   title {char} -- plot title
      #
      # Returns:
      #   nothing

      library(corrplot)

      stopifnot(is.matrix(df_corr) || is.data.frame(df_corr))
      stopifnot(is.character(outdir))
      stopifnot(dir.exists(outdir))
      stopifnot(is.character(fname))
      stopifnot(is.character(title))

      # Write correlation plot to PNG
      png(
        filename  = file.path(outdir, fname),
        width     = 5000,
        height    = 5000,
        pointsize = 12,
        res       = 500
      )
      corrplot::corrplot.mixed(
        corr       = df_corr,
        lower      = "circle",
        upper      = "number",
        tl.col     = "black",
        tl.cex     = .45,
        number.cex = .6,
        title      = title
      )
      dev.off()
    }
    write_dendrogram <- function(dt, tw, fname, num_topics, bal) {
      # Create a dendrogram and write to a PNG on the filesystem.
      #
      # Arguments:
      #   dt {matrix} -- doc topics matrix
      #   tw {matrix} -- topic words matrix
      #   fname {char} -- png filename to write to
      #   num_topics {numeric} -- the number of topics run
      #   bal {numeric} -- the dendrogram balance (0 to 1), 1 being word-weighted and 0
      #                    being doc-weighted
      #
      # Function Dependencies:
      #   - mallet
      #
      # Returns:
      #   nothing

      library(mallet)

      stopifnot(is.matrix(dt))
      stopifnot(is.matrix(tw))
      stopifnot(is.character(fname))
      stopifnot(is.numeric(num_topics))

      png(
        filename  = fname,
        width     = 3000,
        height    = 3000,
        pointsize = 12,
        res       = 450
      )

      plot(
        mallet::mallet.topic.hclust(
          doc.topics  = dt,
          topic.words = tw,
          balance     = bal
        ),
        labels = paste0("Topic", 1:num_topics)
      )
      
      dev.off()
    }
    
    # Write stopwords to a temporary file
    stopwords_fname = "~/tmp.stopwords.list.csv"
    write.table(tm::stopwords("en"), stopwords_fname,
      quote = FALSE, row.names = FALSE, col.names = FALSE)
    
    # Sort files naturally, not based on text string comparison
    # If documents have numbers, this will sort them on the numbers rather than lexicographically
    fnames = naturalsort::naturalsort(fnames)
    
    # Read all textfiles
    texts = sapply(fnames, read_file)

    # Convert array of document IDs and text files to Mallet instance list
    MalletInstances = mallet::mallet.import(
      id.array      = names(texts),
      text.array    = unname(texts),
      stoplist.file = stopwords_fname,
      preserve.case = FALSE,
      token.regexp  = "[\\p{L}]+"
    )
    
    # Wraps a Mallet topic model trainer Java object
    TopicModel = mallet::MalletLDA(num.topics = num_topics)
    
    # Load the documents into the topic model object
    TopicModel$loadDocuments(MalletInstances)
    
    # Extract words and frequencies
    Vocabulary = TopicModel$getVocabulary()
    WordFreqs = mallet.word.freqs(TopicModel)
    WordFreqs = WordFreqs[order(-WordFreqs$term.freq),]
    
    # Apply parameters and train
    TopicModel$setAlphaOptimization(alpha, hyper)
    TopicModel$train(iterations)
    
    
    # ----------------- #
    # -- DATA FRAMES -- #
    # ----------------- #
    
    # TopicWords
    TopicWords_Matrix = mallet.topic.words(
      topic.model = TopicModel,
      smoothed = TRUE,
      normalized = TRUE
    )
    TopicWords_DF = as.data.frame(t(TopicWords_Matrix))
    TopicWords_DF = as.data.frame(
      cbind(
        1:nrow(TopicWords_DF),
        Vocabulary,
        TopicWords_DF
      )
    )
    colnames(TopicWords_DF) = c("RowID", "Word", paste0("Topic", 1:num_topics))
    
    # Ordered Version
    # Must be specific format with no non-numeric values
    TopicWords_DF_tmp = as.data.frame(t(TopicWords_Matrix)) 
    rownames(TopicWords_DF_tmp) = Vocabulary
    colnames(TopicWords_DF_tmp) = paste0("Topic", 1:num_topics)
    TopicWords_DF_Ordered = as.data.frame(
      matrix(
        row.names(TopicWords_DF_tmp)[apply(-TopicWords_DF_tmp, 2, order)],
        nrow(TopicWords_DF_tmp)
      )
    )
    colnames(TopicWords_DF_Ordered) = paste0("Topic", 1:num_topics)
    
    
    # DocTopics
    DocTopics_Matrix = mallet.doc.topics(
      topic.model = TopicModel,
      smoothed    = TRUE,
      normalized  = TRUE
    )
    DocTopics_DF = as.data.frame(DocTopics_Matrix)
    #rownames(DocTopics_DF) = names(texts)
    DocTopics_DF = cbind(
      1:nrow(DocTopics_DF),
      names(texts),
      DocTopics_DF
    )
    colnames(DocTopics_DF) = c("RowID", "Document", paste0("Topic", 1:num_topics))
    
    # Ordered Version
    DocTopics_DF_tmp = as.data.frame(DocTopics_Matrix)
    rownames(DocTopics_DF_tmp) = names(texts)
    colnames(DocTopics_DF_tmp) = paste0("Topic", 1:num_topics)
    DocTopics_DF_Ordered = as.data.frame(
      matrix(
        row.names(DocTopics_DF_tmp)[apply(-DocTopics_DF_tmp, 2, order)],
        nrow(DocTopics_DF_tmp)
      )
    )
    colnames(DocTopics_DF_Ordered) = paste0("Topic", 1:num_topics)
    
    
    # ----------------- #
    # -- WRITE FILES -- #
    # ----------------- #

    write_topic_df(TopicWords_DF,         file.path(outdir, "TopicWords.csv"))
    write_topic_df(TopicWords_DF_Ordered, file.path(outdir, "TopicWords_DF_Ordered.csv"))
    write_topic_df(DocTopics_DF,          file.path(outdir, "DocTopics_DF.csv"))
    write_topic_df(DocTopics_DF_Ordered,  file.path(outdir, "DocTopics_DF_Ordered.csv"))

    if (corrplot) {
      # Write Topic Words correlation PNG
      write_corrplot(
        df_corr = as.matrix(Hmisc::rcorr(as.matrix(TopicWords_DF_tmp))[[1]]),
        outdir  = outdir,
        fname   = "Correlation_TopicWords.png",
        title   = "Topic Words Correlation"
      )

      # Write Doc Topics correlation PNG
      write_corrplot(
        df_corr = as.matrix(Hmisc::rcorr(as.matrix(DocTopics_DF_tmp))[[1]]),
        outdir  = outdir,
        fname   = "Correlation_DocTopics.png",
        title   = "Doc Topics Correlation"
      )
    }

    if (dendrogram) {
      # Word-weighted
      write_dendrogram(
        dt         = DocTopics_Matrix,
        tw         = TopicWords_Matrix,
        fname      = file.path(outdir, "Dendrogram_WordSimilarity_balance=1.png"),
        num_topics = num_topics,
        bal        = 1
      )

      # Doc-weighted
      write_dendrogram(
        dt         = DocTopics_Matrix,
        tw         = TopicWords_Matrix,
        fname      = file.path(outdir, "Dendrogram_DocSimilarity_balance=0.png"),
        num_topics = num_topics,
        bal        = 0
      )

      # Custom-balance dendrogram if balance is specified and is not 0 or 1, for which
      # the dendrograms are already written
      if (!is.null(dendro_bal)) {
        if (!(dendro_bal %in% c(0, 1))) {
          write_dendrogram(
            dt         = DocTopics_Matrix,
            tw         = TopicWords_Matrix,
            fname      = file.path(outdir,
              sprintf("Dendrogram_CustomSimilarity_balance=%s.png", dendro_bal)),
            num_topics = num_topics,
            bal        = dendro_bal
          )
        }
      }
    }

    # Remove temporary stopwords file from above
    if(file.exists(stopwords_fname)) file.remove(stopwords_fname)
    
    # Return list of all dataframes
    out = list(
      TopicWords         = TopicWords_DF,
      TopicWords_Ordered = TopicWords_DF_Ordered,
      DocTopics          = DocTopics_DF,
      DocTopics_Ordered  = DocTopics_DF_Ordered
    )

    return(out)
  }

  annotate_doc_by_sentence <- function(fname, outdir, verbose = FALSE) {
    # Annotate text by sentence. Parse a large textfile using the coreNLP
    # function, `annotateString()`. However, this function can run into an
    # 'out of memory' error if the file is too large.
    #
    # coreNLP must be initialized with `initCoreNLP()`
    #
    # Normally done as follows (your version of Java might be different):
    #
    # # Set up Java 8
    # # (give path to your libjvm.dylib)
    # dyn.load("/Library/Java/JavaVirtualMachines/jdk1.8.0_162.jdk/Contents/Home/jre/lib/server/libjvm.dylib")
    #
    # # Load libraries
    # library(rJava)
    # library(NLP)
    # library(coreNLP)
    #
    # # Initialize coreNLP
    # initCoreNLP()
    #
    # # Run split_parse
    # split_parse(...)
    #
    # Arguments:
    #   fname {char} -- file to annotate
    #   outdir {char} -- directory to write files to
    #
    # Keyword Arguments:
    #   verbose {logical} -- if TRUE, print updates to STDOUT (default: FALSE)
    # 
    # Function Dependencies:
    #   - rJava
    #   - NLP
    #   - coreNLP
    #
    # Returns:
    #   nothing
    
    stopifnot(is.character(fname))
    stopifnot(file.exists(fname))
    stopifnot(is.character(outdir))
    stopifnot(dir.exists(outdir))

    chunk_into_sentences = function(text) {
      # Break text string into sentences delimiting by . or ! or ?
      #
      # Arguments:
      #   text {char} -- text to break into sentences
      #
      # Returns:
      #   {char}
      break_points = c(1, as.numeric(gregexpr('[[:alnum:] ][.!?]', text)[[1]]) + 1)
      sentences = NULL
      
      for(i in 1:length(break_points)) {

        res = substr(text, break_points[i], break_points[i+1]) 
        if(i > 1) {
          sentences[i] = sub('. ', '', res)
        } else {
          sentences[i] = res
        }

      }
      sentences = sentences[!is.na(sentences)]

      return(sentences)
    }
    
    # Read in file
    text = readLines(fname)
    text = trimws(paste0(text, collapse = " "))
    text = rm_excess_spaces(text)
    if (verbose) echo(sprintf("File: %s", basename(fname)))
    
    # Break document into sentences
    sentences = chunk_into_sentences(text)
    if (verbose) echo("Document split into %s sentences", length(sentences), indent = 1)
    
    # Create teporary directory
    tmpdir = "tmp.coreNLP.annotateString"
    if(dir.exists(tmpdir)) unlink(tmpdir, recursive = TRUE)
    dir.create(tmpdir)
    if (verbose) echo("Created temp dir '%s/%s'", getwd(), tmpdir, indent = 1)
    
    # Execute string annotation and save each sentence as RDS object
    if (verbose) echo("Annotating by sentence...", indent = 1)
    for(i in seq_along(sentences)) {
      anno = coreNLP::annotateString(sentences[i])
      saveRDS(anno, file.path(tmpdir, paste0("anno", i, ".rds")))
    }
    if (verbose) echo("DONE", indent = 2)
    
    # Set up dataframe object to load RDS data into
    parsetree = c()
    table = setNames(
      data.frame(matrix(ncol = 9, nrow = 0)),
      c("sentence", "id", "token", "lemma", "CharacterOffsetBegin",
        "CharacterOffsetEnd", "POS", "NER", "Speaker"
      )
    )
    
    # Extract parsetree from each RDS object
    if (verbose) echo("Extracting annotable and parsetree from each RDS object...", indent = 1)
    for(i in seq_along(sentences)) {
      anno = readRDS(paste0(tmpdir, "/anno", i, ".rds"))
      anno.table = anno[[1]]
      anno.table$sentence = rep(i, nrow(anno.table))
      table = rbind(table, anno.table)
      parsetree = append(parsetree, anno[[2]])
    }
    if (verbose) echo("DONE", indent = 2)
    
    # Save final objects (annotable and parsetree) for entire doc
    parsetree_fname = file.path(outdir, paste0(tools::file_path_sans_ext(basename(fname)), "-parsetree.rds"))
    saveRDS(parsetree, parsetree_fname)
    
    annotable_fname = file.path(outdir, paste0(tools::file_path_sans_ext(basename(fname)), "-annotable.rds"))
    saveRDS(table, annotable_fname)

    if (verbose) echo("Saved final objects as RDS files:", indent = 1)
    if (verbose) echo(parsetree_fname, indent = 2)
    if (verbose) echo(annotable_fname, indent = 2)

    # Remove temporary directory  
    unlink(tmpdir, recursive = TRUE)
  }


})