topicApp: A Shiny App to run Topic Models

#packages
packages <- c("shiny","quanteda","shinydashboard","RColorBrewer","DT","visNetwork","ggwordcloud",
              "igraph","reshape","grid","tidyverse","shinyjs","shinyBS","stm")

lapply(packages,library,character.only = TRUE)
source('directoryInput.R')
source('functions.R')

#source("./inst/app/functions.R")

# put stop words to start with here
exp.stop <- c()

###################################################
##############       UI       #####################
###################################################

############### Header content ####################

header <- dashboardHeader(title = "topicApp")

############### Sidebar content ###################

sidebar <- dashboardSidebar(
  sidebarMenu(
    menuItem("Model Parameters", tabName = "model", icon = icon("tasks")),
    menuItem("Topics", tabName = "topics", icon = icon("th")),
    #menuItem("Document-Level", tabName = "companies", icon = icon("users")),
    menuItem("Validation", tabName = "validation", icon = icon("check"))
  )
)

############### Body content ######################

body <-   dashboardBody(
  tabItems(
    # Topic Modeling Tab
    
    tabItem(tabName = "model",
            fluidRow(
              box(title = "Step 1: Load Dataset",
                  column(9,
                         fileInput("dataFileToUpload", "Choose Data File To Upload")
                  ),
                  hr(),
                  # Code below was from stmGUI: https://github.com/dzangri/stmGUI
                  actionButton("submitDataForUpload", "Submit"),
                  hr(),
                  #a(id = "toggleAdvDataUpload", "Show/hide advanced options"),
                    div(id = "advUploadOptions",
                        checkboxInput("headerPresent", "Header Row Present", TRUE),
                        radioButtons("columnSeparator",
                                     "Separator",
                                     c(Comma = ",",
                                       Semicolon = ";",
                                       Tab = "\t"),
                                     inline = TRUE,
                                     ","),
                        radioButtons("quoteAroundData", "Quotes Around Text",
                                     c(None = "",
                                       "Double Quote" = "\"",
                                       "Single Quote" = "'"),
                                     inline = TRUE,
                                     "\"")
                    ),
                  hr(),
                      directoryInput('load.directory', label = 'Or load a previous model (then move to Step 4)', value = '~'),
                      bsTooltip("load.directory", "Select the directory to load a model.",
                                "left", options = list(container = "body")),
                      actionButton("load.model","Load Model")
                  
                  ),
              box(title = "Step 2: Pre-processing",
                  selectInput("tpDocs",
                              "Select Text Column",
                              c()),
                  bsTooltip("tpDocs", "Select which column contains the column of text.",
                            "left", options = list(container = "body")),
                  textInput("stopwords", label = "Stop Words", 
                            value = paste(exp.stop, collapse = ", "),
                            placeholder = "also, such, really..."),
                  bsTooltip("stopwords", "Include additional stop words to remove:",
                            "left", options = list(container = "body")),
                  sliderInput("minDoc",
                              "Minimum # of Documents (for Terms):",
                              min = 0,  max = 100,  value = 10, step = 1),
                  bsTooltip("minDoc", "Remove sparse terms:",
                            "left", options = list(container = "body")),
                  box(checkboxInput("stemming", label = "Stemming", value = FALSE),
                  radioButtons("ngrams", label = "n-grams",
                               choices = list("Unigrams" = 1, "Bigrams" = 2), selected = 1)),
                  box(actionButton("dfm.update", "Create DFM"))
                  )
              ),
              fluidRow(
              box(title = "Step 3: Topic Model",
                  column(1.5,
                    sliderInput("num.topics",
                                "Number of Topics:",
                                min = 0,  max = 100,  value = 10, step = 1),
                    bsTooltip("num.topics", "Set to zero to auto-detect topics.",
                              "left", options = list(container = "body"))),
                  column(1.5,
                    sliderInput("iter",
                                "Maximum Number of Iterations:",
                                min = 20,  max = 200,  value = 100, step = 20),
                    bsTooltip("iter", "Adjust higher if the algorithm is not converging.",
                              "left", options = list(container = "body"))),
                    hr(),
                    actionButton("topic.update", "Run Model")
              ),
              box(title = "Step 4: Topic Network Settings",
                  sliderInput("parm",
                              "Minimum Correlation",
                              min = 0,  max = 0.2,  value = 0.1, step = 0.01),
                  bsTooltip("parm", "Higher threshold means less edges, Lower means more edges.",
                            "left", options = list(container = "body")),
                  hr(),
                  actionButton("network.update", "Create Network")
                ),
              box(title = "Save results",
                  
                  directoryInput('directory', label = 'Selected Directory', value = '~'),
                  bsTooltip("directory", "Select the directory  to save the results.",
                            "left", options = list(container = "body")),
                  
                  actionButton("save.results","Save Model")
                  )
              )
            )
    ,
    # Topics Tab
    tabItem(tabName = "topics",
            fluidRow(
              #actionButton("tabBut", "View Topic"),
              box(title = "Topic Network", 
                  visNetworkOutput("topic.network", height = "400px"), width = 12, collapsible = F)
              ),
             fluidRow(
               box(title = "Topic Word Cloud: Size Proportional to Word Probability", 
                   plotOutput("topic.wordcloud"),
                   width = 12
                   )
               ),
            fluidRow(
              box(title = "Representative Documents",
                  dataTableOutput("doc.table"), width = 12
                  )
                )
            )
    ,
    # # Document Tab 
    # tabItem(tabName = "document",
    #         fluidRow(
    #           box(
    #             title = "Document Attributes",
    #             #selectInput("document", "Choose a document:", choices = cmpyData$Company),
    #             dataTableOutput("doc.attribute"),
    #             height = 400),
    #           box(
    #             title = "Document Topics", 
    #             plotOutput("doc.treemap")),
    #           height = 400,
    #           collapsible = T
    #         )
    #         ,
    #         fluidRow(    
    #           box(title = "Document's Webpages", dataTableOutput("company.webpage"), width = 12, collapsible = T)
    #         )
    # ),
    # Validation tab
    tabItem(tabName = "validation",
            fluidRow(
              box(
                title = "Topic Validation",
                checkboxGroupInput("k.validation", "K Topics to Run:",
                                   c("5" = 1,
                                     "10" = 2,
                                     "20" = 3,
                                     "30" = 4,
                                     "50" = 5,
                                     "75" = 6,
                                     "100" = 7), selected = c(1,2,3,4,5,6,7), inline = TRUE),
                numericInput("search.seed", label = "Seed", value = 1),
                bsTooltip("search.seed", "Random seed used to partition dataset for Cross-Validation",
                          "left", options = list(container = "body")),
                hr(),
                actionButton("run.validation", "Run Validation")
              )
            ),
            fluidRow(
              box(
                plotOutput('valid.plot'), width = 12
                )
              )
            )
    )
)

############### Dashboard page ####################

ui <- dashboardPage(header, sidebar, body)

##############    SERVER       #####################

server <- function(input, output, session) {

  # reactive object that stores intermediate results
  storedData <- reactiveValues()
  
  storedData$data <- NULL

  # load previous model
  
  observeEvent(
    ignoreNULL = TRUE,
    eventExpr = {
      input$load.directory
    },
    handlerExpr = {
      if (input$load.directory > 0) {
        # condition prevents handler execution on initial app launch
        
        path = choose.dir(default = readDirectoryInput(session, 'load.directory'))
        updateDirectoryInput(session, 'load.directory', value = path)
      }
    }
  )
  
  output$directory = renderText({
    readDirectoryInput(session, 'load.directory')
  })
  
  # save model
  
  observeEvent(
    ignoreNULL = TRUE,
    eventExpr = {
      input$directory
    },
    handlerExpr = {
      if (input$directory > 0) {
        # condition prevents handler execution on initial app launch
        
        path = choose.dir(default = readDirectoryInput(session, 'directory'))
        updateDirectoryInput(session, 'directory', value = path)
      }
    }
  )
  
  output$directory = renderText({
    readDirectoryInput(session, 'directory')
  })
  
  # shinyjs below was from stmGUI: https://github.com/dzangri/stmGUI
  
  shinyjs::onclick("toggleAdvDataUpload",
                   shinyjs::toggle(id = "advUploadOptions",
                                   anim = TRUE))
  observe({
    shinyjs::toggleState("submitDataForUpload",
                         !is.null(input$dataFileToUpload))
  })
  observe({
    shinyjs::toggleState("dataInputTitle-nextStep",
                         !is.null(storedData$data))
  })
  
  observeEvent(input$submitDataForUpload, ({
    shinyjs::html("dataInputTextResult", "")
    
    userData <- input$dataFileToUpload
    
    withProgress(message = "Loading data, please wait...", {
      setProgress(0.5)
      
      readDataArgs <- list(userData$datapath, header = input$headerPresent, sep = input$columnSeparator,
                           quote = input$quoteAroundData)
      
      shinyjs::toggleState("moveFromStep1To2")
      
      tryCatch({
        storedData$data <- do.call(read.csv, readDataArgs)
        storedData$data$rowNum <- 1:nrow(storedData$data)
      }, error = function(e) {
        funName <- deparse(substitute(read.csv))
        shinyjs::html("dataInputTextResult",
                      paste("ERROR: Error while running '",
                            funName, "':\n",
                            e,
                            sep = ""))
        storedData$data <- NULL
        return(NULL)
      }, warning = function(w) {
        shinyjs::html("dataInputTextResult",
                      paste("WARNING: Warning while reading data:\n",
                            w,
                            sep = "\n"))
        storedData$data <- NULL
        return(NULL)
      }, finally = {
      })
      
      setProgress(1)
    })

  }))
  
  observe({
    userData <- storedData$data
    if (!is.null(userData)) {
      shinyjs::enable("tpDocs")
      dataColumnNames <- colnames(userData)
      updateSelectInput(session, "tpDocs", choices = dataColumnNames)
    } else {
      shinyjs::disable("tpDocs")
    }
  })
  
    # Topic
  z <- reactiveValues(Corpus = NULL, dtm = NULL, dfm = NULL)

  observeEvent(input$dfm.update, {
    MyCorpus <- corpus(as.character(storedData$data[,input$tpDocs]))
    
    # sets input data row number as primary key -- ensures matchback for datasets without a primary key
    docvars(MyCorpus, "rowNum") <- storedData$data$rowNum
    stp <- unlist(strsplit(input$stopwords,","))
    stp <- trimws(stp)
    ngram <- ifelse(input$ngrams==1,1L, 1L:2L)
    
    Dfm <- dfm(MyCorpus, remove = c(stopwords("english"), stp), remove_numbers = TRUE, remove_punct = TRUE,
                  stem = input$stemming, ngrams = ngram
               )
  
    tdfm <- dfm_trim(Dfm, min_docfreq = input$minDoc)

    # we now export to a format that we can run the topic model with
    z$Corpus <- MyCorpus
    z$dtm <- convert(tdfm, to= "topicmodels")
    z$dfm <- convert(tdfm, to = "stm", docvars = docvars(MyCorpus))
    
    print("DFM created")
  })
  
  v <- reactiveValues(probtopics = NULL, probterms = NULL, topicnames = NULL, stmFit = NULL, out = NULL)
  
  # topic models
  
  observeEvent(input$topic.update, {
    k <- input$num.topics
    dfm <- z$dfm
    
    # use quanteda converter to convert our Dfm
    out <- prepDocuments(dfm$documents, dfm$vocab, dfm$meta, lower.thresh = 1, subsample = NULL)
    
    stmFit <- stm(out$documents, out$vocab, K = k, #prevalence =~ Party + s(Time),
                  max.em.its = input$iter, data = out$meta, init.type = "Spectral", seed = 300)
    
    probterms <- data.frame(t(data.frame(probs = stmFit$beta[[1]])))  # words (rows) x topics (columns)
    row.names(probterms) <- stmFit$vocab
    probdocs <- data.frame(stmFit$theta)
    
    topic.names <- character(length = ncol(stmFit$theta))
    
    for (i in 1:ncol(stmFit$theta)){
      temp <- order(-probterms[,i])
      temp2 <- rownames(probterms[temp,])
      topic.names[i] <- paste(temp2[1:5], collapse = " \n ")
    }
    v$out <- out
    v$stmFit <- stmFit
    v$probdocs <- probdocs
    v$probterms <- probterms
    v$topicnames <- topic.names
  })

  # Network
  x <- reactiveValues(nodes = NULL, edges = NULL)

  observeEvent(input$network.update, {
   results <- new.topic.network(v$stmFit, input$parm, v$topicnames)
   x$nodes <- results[[1]]
   x$edges <- results[[2]]
   print("Network created")
  })
  
  # load and save
  
  observeEvent(input$load.model, {
    dir <- readDirectoryInput(session, 'load.directory')
    v$probterms <- read.csv(file = paste0(dir,"/prob-terms.csv"), stringsAsFactors = F, row.names = 1)
    v$probdocs <- read.csv(file = paste0(dir,"/prob-docs.csv"), stringsAsFactors = F, row.names = 1)
    load(paste0(dir,"/stmFit.RData"))
    v$stmFit <- stmFit
    load(paste0(dir,"/out.RData"))
    v$out <- out

    topic.names <- character(length = ncol(v$probterms))
    for (i in 1:ncol(v$probterms)){
      temp <- order(-v$probterms[,i])
      temp2 <- rownames(v$probterms[temp,])
      topic.names[i] <- paste(temp2[1:5], collapse = " \n ")
    }
    v$topicnames <- topic.names
    
    print("Model Uploaded!")
  })

  observeEvent(input$save.results, {

    dir <- readDirectoryInput(session, 'directory')
    
    dir.terms <- paste0(dir,"/prob-terms.csv")
    dir.docs <- paste0(dir,"/prob-docs.csv")
    dir.topics <- paste0(dir,"/topic-names.csv")
    dir.parms <- paste0(dir,"/sparameters.csv")
    
    write.csv(v$probterms, dir.terms, row.names = T)
    write.csv(v$probdocs, dir.docs, row.names = T)
    write.csv(v$topicnames, dir.topics, row.names = F)
    parameters <- data.frame(Stopwords = input$stopwords,
                             minDoc = input$minDoc,
                             stem = input$stemming,
                             unigrams = input$ngrams,
                             NumTopics = input$num.topics,
                             Iterations = input$iter)
    
    write.csv(parameters, dir.parms, row.names = F)
     
    stmFit <- v$stmFit
    out <- v$out
    save(stmFit, file = paste0(dir,"/stmFit.RData"))
    save(out, file = paste0(dir,"/out.RData"))
    
    print("Topic model saved")
  })
  
  ### Network
  
  output$topic.network <- renderVisNetwork({

    visNetwork(x$nodes, x$edges, submain = "A topic is a word list of word co-occurrence clusters. Each node is a topic and each edge represents shared words between clusters.", height = "600px") %>%
      #visExport() %>%
      visNodes(labelHighlightBold = T) %>%
      visOptions(highlightNearest = T, selectedBy = "community", nodesIdSelection = T) %>%
      visInteraction(navigationButtons = T)
  })
  
  # terms <- reactive({
  #   freq <- data.frame(v$probterms)
  #   temp <- as.integer(input$topic.network_selected)
  #   data.frame(word = rownames(v$probterms), freq = freq[,temp])
  # })
  
  terms <- reactive({
    validate(
      need(input$topic.network_selected != "", "Please select a topic")
    )
    freq <- data.frame(v$probterms)
    temp <- as.integer(input$topic.network_selected)
    data.frame(word = rownames(v$probterms), freq = freq[,temp])
  })
  
  docs <- reactive({
    validate(
      need(input$topic.network_selected != "", "Please select a topic")
    )
    freq <- data.frame(v$probdocs)
    temp <- as.integer(input$topic.network_selected)
    data.frame(docname = rownames(v$probdocs), freq = freq[,temp], rowNum = v$out$meta$rowNum)
  })

  output$topic.wordcloud <- renderPlot({
    w <- terms()
    
    w %>%
      mutate(word = as.character(word)) %>%
      mutate(freq = round(exp(freq)*100)) %>%
      ggplot(aes(label = word, size = freq)) +
      geom_text_wordcloud() +
      scale_size_area(max_size = 24) +
      theme_minimal()
    })
  
  
  # expert table
  Docs <- reactive({
    d <- docs()
    ldaProbs <- data.frame(rowNum = d$rowNum, Prob = exp(d$freq), stringsAsFactors = F)
    ldaProbs <- merge(ldaProbs, storedData$data, by = "rowNum")
    ldaProbs[order(ldaProbs$Prob, decreasing = T), c("rowNum","Prob",input$tpDocs)]
  })
  
  #Representative Document
  
  output$doc.table <- renderDataTable({
    temp <- Docs()
    colnames(temp) <- c("Row Num","Topic Prob","Text")
    temp[,2] <- round(log(temp[,2]),3)
    temp$Text <- as.character(temp$Text)
    temp
  }, options = list(pageLength = 5, dom = 'tip') , rownames= FALSE)
  
  valid <- reactiveValues(results = NULL, K = NULL)
  
  # Validation
  
  observeEvent(input$run.validation, {
  
    K <- c(5,10,20,30,50,75,100)
    K <- K[as.numeric(input$k.validation)]
    
    dfm <- z$dfm
    
    # use quanteda converter to convert our Dfm
    out <- prepDocuments(dfm$documents, dfm$vocab, dfm$meta, lower.thresh = 1, subsample = NULL)
    
    valid$results <- searchK(out$documents, 
                             out$vocab, 
                             K, 
                             init.type = "Spectral", 
                             proportion = 0.5, 
                             heldout.seed = input$search.seed, 
                             max.em.its = 200)
    valid$K <- K
    })
  
  output$valid.plot <- renderPlot({
    try <- try(plot(valid$results))
    if("try-error" %in% class(try)){print("Select the number of topics to test and run topic validation.")
    }else{plot(valid$results)}
  })
}

# Run the application 
shinyApp(ui = ui, server = server)