knitr::opts_chunk$set(echo = TRUE)
library(dplyr)
library(topicmodels)
library(USNVC)
library(tm)

The first step of the analysis is to perform topic modeling on the National Vegetation Classification (NVC) using the inherited text descriptions. In brief, by incorporating the inherited information from higher levels in the hierarchy, I expect to be able to recover the hierarchical structure exactly.

conv_txt <- function(x) {
  return(iconv(x = x, from = "iso-8859-1", to = "utf-8"))
}

cats <- unique(unit_data$CLASSIFICATION_LEVEL_NAME)
for(i in 2:length(cats)) {
  print(c(i, cats[i]))
  cur_top <- filter(unit_data, CLASSIFICATION_LEVEL_NAME == cats[i-1])
  cur_sub <- filter(unit_data, CLASSIFICATION_LEVEL_NAME == cats[i])
  cur_k <- length(cur_top[[1]])
  cur_txt <- unlist(lapply(as.character(cur_sub$ELEMENT_GLOBAL_ID),
                           FUN = USNVC::get_ancestor_typeConcept,
                           g = NVC_graph))
  # print(head(cur_txt))
  cur_txt <- conv_txt(cur_txt)
  txt_nchar <- nchar(cur_txt)
  txt_df <- data.frame(elem = cur_sub$ELEMENT_GLOBAL_ID,
                       text = cur_txt,
                       txt_nchar = txt_nchar,
                       stringsAsFactors = FALSE)
  txt_df <- filter(txt_df, txt_nchar > 150)
  if(dim(txt_df)[1] == 0) next
  cur_corp <- tm::VCorpus(tm::VectorSource(txt_df$text))
  cur_cln <- USNVC::prep_text(cur_corp)
  cur_dtm <- tm::DocumentTermMatrix(cur_cln)
  cur_dtm$dimnames$Docs <- as.character(txt_df$elem)
  word_freq <- table(cur_dtm$j)
  slim <- cur_dtm[, which(word_freq <= 20 & word_freq >= 2)]
  slim <- slim[which(rowSums(as.matrix(slim)) > 0), ]
  cur_lda <- LDA(x = slim,
                 k = cur_k,
                 method = "Gibbs",
                 control = list(seed = 742,
                                burnin = 1000,
                                thin = 100,
                                iter = 4000))
  cur_ctm <- CTM(x = slim, k = cur_k)
  cur_name <- paste0(cats[i], "_to_", cats[i-1], "_lda_control")
  cur_ctm_name <- paste0(cats[i], "_to_", cats[i-1], "_ctm_control")
  assign(x = cur_name, value = cur_lda)
  assign(x = cur_ctm_name, value = cur_ctm)
  save(cur_lda, file = paste0(cur_name, ".rda"))
  save(cur_ctm, file = paste0(cur_ctm_name, ".rda"))
}

x <- lapply(ls(pattern = "_lda_control"), 
            FUN = function(x) {
              save(list = x, file = paste0(x, ".rda")) 
            } )

x <- lapply(ls(pattern = "_ctm_control"), 
            FUN = function(x) {
              save(list = x, file = paste0(x, ".rda"))
            } )


jacob-ogre/USNVC documentation built on May 18, 2019, 8 a.m.