knitr::opts_chunk$set(echo = TRUE)
library(dplyr)
library(topicmodels)
library(USNVC)
# library(textualnvc)
library(tm)

The Challenge

We would like to be able to project the 'new' U.S. National Vegetation Classification (NVC) information onto maps. USGS has already done high-resolution vegetation mapping (GAP) using a categorization system that is related to the NVC, but to version 1. If we could crosswalk the NVC v2 and GAP vegetation categories, then we could make the desired projections. Both classifications provide descriptions of each category--the NVC for >8,000 entries, GAP for 772--and, I think, the NVC v2 typeConcepts are based in part on the GAP descriptions (which, in turn, are based on NVC v1 [again, I think]).

Correlations of DTMs

The first attempt looks for correlations between the terms used to describe each category in the NVC and the terms used to describe each category in GAP. If an NVC category is synonymous with (or closely related to) a GAP category, then the words used in the two descriptions should be similar. I hope.
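To make that concrete, here is a toy sketch with made-up descriptions (not real NVC/GAP text): two descriptions that share vocabulary correlate strongly, while unrelated ones do not.

# toy sketch: two similar descriptions and one unrelated one (made-up text)
toy <- c("dense oak hickory forest on dry upland slopes",
         "oak and hickory dominated forest of dry uplands",
         "tidal salt marsh with cordgrass and rushes")
toy_corp <- tm::VCorpus(tm::VectorSource(toy))
toy_dtm <- as.matrix(tm::DocumentTermMatrix(toy_corp))
cor(toy_dtm[1, ], toy_dtm[2, ])  # high: shared terms
cor(toy_dtm[1, ], toy_dtm[3, ])  # low/negative: disjoint vocabularies

First, we need to create the Document-Term Matrix for the combined NVC and GAP text: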

# group <- dplyr::filter(unit_data, CLASSIFICATION_LEVEL_NAME == "Group") 
# NVC_text <- paste(group$translatedName,
#                   group$typeConcept, 
#                   group$Physiognomy,
#                   group$Environment,
#                   group$Dynamics,
#                   group$Floristics,
#                   group$Range)
# GAP_text <- paste(GAP_df$Class_Name, GAP_df$Description)
# texts <- c(NVC_text, GAP_text)
# corp <- tm::VCorpus(tm::VectorSource(texts))
# cln <- USNVC::prep_text(corp)
# dtm <- tm::DocumentTermMatrix(cln)
# dtm$dimnames$Docs <- c(group$ELEMENT_GLOBAL_ID, GAP_df$Class_Name)
# # number of documents containing each term
# word_freq <- table(dtm$j)
# # NVC_dtm <- NVC_dtm[!is.na(row.names(NVC_dtm)), ]
# # keep moderately rare terms (in 2-10 documents), then drop now-empty docs
# slim <- dtm[, which(word_freq <= 10 & word_freq >= 2)]
# slim <- slim[which(rowSums(as.matrix(slim)) > 0), ]
# summary(rowSums(as.matrix(slim)))
# dim(slim)

Then do the document (row) by document correlations:

# # rows 1:401 of slim are the NVC Groups; the remaining 750 are GAP classes
# res <- matrix(NA, nrow = 401, ncol = 750)
# rownames(res) <- slim$dimnames$Docs[1:401]
# colnames(res) <- slim$dimnames$Docs[402:length(slim$dimnames$Docs)]
# dtm_mat <- as.matrix(slim)
# for(i in 1:401) {
#   for(j in 402:1151) {
#     res[i, j - 401] <- cor(dtm_mat[i, ], dtm_mat[j, ])
#   }
# }
# DTM_doc_cor <- res
# save(DTM_doc_cor, file = "DTM_doc_cor.rda")
# 
# GAP_nm <- colnames(DTM_doc_cor)
# NVC_nm <- row.names(DTM_doc_cor)
# GAP <- rep(NA, length(GAP_nm))
# NVC <- rep(NA, length(GAP_nm))
# cor <- rep(NA, length(GAP_nm))
# for(i in 1:length(GAP_nm)) {
#   cur_max <- max(DTM_doc_cor[, GAP_nm[i]])
#   cur_mat <- NVC_nm[which(DTM_doc_cor[, GAP_nm[i]] == cur_max)]
#   GAP[i] <- GAP_nm[i]
#   NVC[i] <- paste(cur_mat, collapse = ", ")
#   cor[i] <- cur_max
# }
# best_GAP_NVC_cors <- tibble::tibble(GAP = GAP, NVC = NVC, cor = cor)
# save(best_GAP_NVC_cors, file = "best_GAP_NVC_cors.rda")
# load("best_GAP_NVC_cors.rda")
# knitr::kable(head(best_GAP_NVC_cors, 10))
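As an aside, the double loop above could be collapsed: stats::cor() correlates the columns of two matrices in a single call, so transposing the document rows gives the full NVC-by-GAP correlation matrix at once. A sketch (commented out like the original, and assuming the same 401/750 row split):

# # vectorized alternative to the double loop (same result, much faster)
# dtm_mat <- as.matrix(slim)
# res <- cor(t(dtm_mat[1:401, ]), t(dtm_mat[402:1151, ]))
# rownames(res) <- slim$dimnames$Docs[1:401]
# colnames(res) <- slim$dimnames$Docs[402:1151]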

Now consider the example of the first match:

# dplyr::filter(GAP_df, Class_Name == best_GAP_NVC_cors$GAP[1])$Description
# dplyr::filter(unit_data, ELEMENT_GLOBAL_ID == best_GAP_NVC_cors$NVC[1])$typeConcept

Unfortunately, that doesn't work for all examples, e.g.:

# comp <- best_GAP_NVC_cors[722, ]
# data.frame(dplyr::filter(GAP_df, Class_Name == comp$GAP))
# hits <- unlist(strsplit(comp$NVC, split = ", "))
# dplyr::filter(unit_data, ELEMENT_GLOBAL_ID %in% hits)

Latent Dirichlet Allocation

Now, for the LDA analysis. The plan is to fit a topic model to the NVC descriptions and then compute the posterior topic mixture of each GAP description under that fitted model, so the two classifications can be compared in a common topic space.
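Here is that fit-then-project pattern in miniature, on made-up text (the strings and k are arbitrary; building the new document's DTM over the training dictionary keeps the vocabularies aligned):

# toy sketch of the pattern used below: fit LDA, then posterior() on new docs
train <- c("oak hickory forest dry slopes",
           "maple beech forest mesic coves",
           "cordgrass salt marsh tidal flats",
           "bulrush freshwater marsh wet meadows")
train_dtm <- tm::DocumentTermMatrix(tm::VCorpus(tm::VectorSource(train)))
toy_lda <- LDA(train_dtm, k = 2, method = "Gibbs", control = list(seed = 742))
# align the new document's columns with the model's vocabulary
new_dtm <- tm::DocumentTermMatrix(
  tm::VCorpus(tm::VectorSource("tidal marsh with cordgrass")),
  control = list(dictionary = tm::Terms(train_dtm)))
posterior(toy_lda, new_dtm)$topics

With that pattern in mind, here's the full run: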

# NVC_grp <- dplyr::filter(unit_data, 
#                          CLASSIFICATION_LEVEL_NAME == "Alliance" |
#                          CLASSIFICATION_LEVEL_NAME == "Association")
# NVC_text <- paste(NVC_grp$translatedName,
#                   NVC_grp$typeConcept, 
#                   NVC_grp$Physiognomy,
#                   NVC_grp$Environment,
#                   NVC_grp$Dynamics,
#                   NVC_grp$Floristics,
#                   NVC_grp$Range)
# NVC_corp <- tm::VCorpus(tm::VectorSource(NVC_text))
# NVC_cln <- USNVC::prep_text(NVC_corp)
# NVC_dtm <- tm::DocumentTermMatrix(NVC_cln)
# NVC_dtm$dimnames$Docs <- as.character(NVC_grp$ELEMENT_GLOBAL_ID)
# word_freq <- table(NVC_dtm$j)
# # NVC_dtm <- NVC_dtm[!is.na(row.names(NVC_dtm)), ]
# slim <- NVC_dtm[, which(word_freq <= 10 & word_freq >= 2)]
# slim <- slim[which(rowSums(as.matrix(slim)) > 0), ]
# summary(rowSums(as.matrix(slim)))
# dim(slim)
# NVC_lda <- LDA(x = slim,
#                k = 453,
#                method = "Gibbs",
#                control = list(seed = 742,
#                               burnin = 1000,
#                               thin = 100,
#                               iter = 1000))
# save(NVC_lda, file = "NVC_lda.rda")
# 
# # Now to fit the GAP descriptions to the NVC topics
# GAP_corp <- tm::VCorpus(tm::VectorSource(GAP_text))
# GAP_cln <- USNVC::prep_text(GAP_corp)
# GAP_dtm <- tm::DocumentTermMatrix(GAP_cln)
# GAP_dtm$dimnames$Docs <- as.character(GAP_df$Class_Name)
# GAP_2 <- GAP_dtm[which(rowSums(as.matrix(GAP_dtm)) > 0), ]
# save(GAP_2, file = "GAP_2.rda")
# 
# GAP_post <- posterior(NVC_lda, GAP_2)
# save(GAP_post, file = "GAP_post.rda")

It looks like the process worked; now I want to see what the results look like.

# load("vignettes/NVC_lda.rda")
# load("vignettes/GAP_post.rda")
# top <- GAP_post$topics
# top[1:5, 1:5]
# 
# par(mfrow=c(5,5), mar = c(1,1,1,1))
# for(i in 1:25) {
#   hist(top[i, ], main = NULL)
# }
# 
# maxes <- apply(top, MARGIN = 1, FUN = max, na.rm = TRUE)
# medians <- apply(top, MARGIN = 1, FUN = median, na.rm = TRUE)
# sds <- apply(top, MARGIN = 1, FUN = sd, na.rm = TRUE)
# ranges <- apply(top, 
#                 MARGIN = 1, 
#                 FUN = function(x) max(x) - min(x))
# head(ranges)
# 
# par(mfrow = c(1,1), mar = c(5,5,3,3))
# hist(ranges)
# plot(maxes ~ ranges)
# plot(maxes ~ medians)
# plot(sds ~ ranges)

OK, I don't know what thresholds I should use for accepting an assignment, but the fact that the above example worked is promising. Probably the best thing to do is build LDAs for each level, with k equal to the number of categories at the level above each focal level.
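Those k values are just the per-level category counts (using unit_data as loaded above):

# how many categories at each level; each count is the k for the level below
table(unit_data$CLASSIFICATION_LEVEL_NAME)

And the per-level loop: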

# nat <- filter(unit_data,
#               !grepl("Cultural", CLASSIFICATION_LEVEL_NAME))
# cats <- unique(nat$CLASSIFICATION_LEVEL_NAME)
# for(i in 2:length(cats)) {
#   cur_top <- filter(nat, CLASSIFICATION_LEVEL_NAME == cats[i-1])
#   cur_sub <- filter(nat, CLASSIFICATION_LEVEL_NAME == cats[i])
#   cur_k <- nrow(cur_top)  # k = number of categories one level up
#   cur_txt <- paste(cur_sub$translatedName,
#                    cur_sub$typeConcept, 
#                    cur_sub$Physiognomy,
#                    cur_sub$Environment,
#                    cur_sub$Dynamics,
#                    cur_sub$Floristics,
#                    cur_sub$Range)
#   cur_corp <- tm::VCorpus(tm::VectorSource(cur_txt))
#   cur_cln <- USNVC::prep_text(cur_corp)
#   cur_dtm <- tm::DocumentTermMatrix(cur_cln)
#   cur_dtm$dimnames$Docs <- as.character(cur_sub$ELEMENT_GLOBAL_ID)
#   word_freq <- table(cur_dtm$j)
#   slim <- cur_dtm[, which(word_freq <= 10 & word_freq >= 2)]
#   slim <- slim[which(rowSums(as.matrix(slim)) > 0), ]
#   cur_lda <- LDA(x = slim,
#                  k = cur_k,
#                  method = "Gibbs",
#                  control = list(seed = 742,
#                                 burnin = 1000,
#                                 thin = 100,
#                                 iter = 1000))
#   cur_name <- paste0(cats[i], "_to", cats[i-1], "_lda_model")
#   assign(x = cur_name, value = cur_lda)
# }
# 
# lapply(ls(pattern = "_lda_model"), FUN = function(x) {
#   save(list = x, file = paste0(x, ".rda")) }
# )

Now that we have an LDA for each NVC level transition, we can fit the GAP data to each model and look for the best matches.

# load("Subclass_toClass_lda_model.rda")
# load("Formation_toSubclass_lda_model.rda")
# load("Division_toFormation_lda_model.rda")
# load("Group_toMacrogroup_lda_model.rda")
# load("Macrogroup_toDivision_lda_model.rda")
# load("Alliance_toGroup_lda_model.rda")
# load("Association_toAlliance_lda_model.rda")
# 
# GAP_Class <- posterior(Subclass_toClass_lda_model, GAP_2)
# save(GAP_Class, file = "GAP_Class.rda")
# GAP_Subclass <- posterior(Formation_toSubclass_lda_model, GAP_2)
# save(GAP_Subclass, file = "GAP_Subclass.rda")
# GAP_Formation <- posterior(Division_toFormation_lda_model, GAP_2)
# save(GAP_Formation, file = "GAP_Formation.rda")
# GAP_Division <- posterior(Macrogroup_toDivision_lda_model, GAP_2)
# save(GAP_Division, file = "GAP_Division.rda")
# GAP_Macrogroup <- posterior(Group_toMacrogroup_lda_model, GAP_2)
# save(GAP_Macrogroup, file = "GAP_Macrogroup.rda")
# GAP_Group <- posterior(Alliance_toGroup_lda_model, GAP_2)
# save(GAP_Group, file = "GAP_Group.rda")
# GAP_Alliance <- posterior(Association_toAlliance_lda_model, GAP_2)
# save(GAP_Alliance, file = "GAP_Alliance.rda")

After glancing through a few of the examples, I think I have a way forward.

load("GAP_Class.rda")
load("GAP_Subclass.rda")
load("GAP_Formation.rda")
load("GAP_Division.rda")
load("GAP_Macrogroup.rda")
load("GAP_Group.rda")
load("GAP_Alliance.rda")

hist_counts <- function(x) {
  # bin a vector of topic probabilities and keep only the counts
  return(hist(x, plot = FALSE)$counts)
}

max_sec <- function(x) {
  # ratio of the largest to the second-largest value; a big ratio means
  # one clearly dominant topic
  cur_max <- max(x)
  sorted <- unique(sort(x, decreasing = TRUE))
  second <- nth(sorted, 2)
  return(cur_max / second)
}
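To see what these summaries capture, compare a sharply peaked topic distribution with a flat one (toy probability vectors):

# a peaked distribution gives a large max/second ratio; a flat one is near 1
peaked <- c(0.85, 0.05, 0.05, 0.05)
flat <- c(0.26, 0.25, 0.25, 0.24)
max_sec(peaked)  # 17
max_sec(flat)    # ~1.04
vegan::diversity(hist_counts(peaked))  # peaked -> lower bin-count diversity
vegan::diversity(hist_counts(flat))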

Class_hist <- apply(GAP_Class$topics, MARGIN = 1, FUN = hist_counts)
Class_divs <- unlist(lapply(Class_hist, vegan::diversity))
Class_best <- apply(GAP_Class$topics, MARGIN = 1, FUN = max_sec)
# hist(Class_divs)

Subclass_hist <- apply(GAP_Subclass$topics, MARGIN = 1, FUN = hist_counts)
Subclass_divs <- unlist(lapply(Subclass_hist, vegan::diversity))
Subclass_best <- apply(GAP_Subclass$topics, MARGIN = 1, FUN = max_sec)
# hist(Subclass_divs)

Formation_hist <- apply(GAP_Formation$topics, MARGIN = 1, FUN = hist_counts)
Formation_divs <- unlist(lapply(Formation_hist, vegan::diversity))
# hist(Formation_divs)

Division_hist <- apply(GAP_Division$topics, MARGIN = 1, FUN = hist_counts)
Division_divs <- unlist(lapply(Division_hist, vegan::diversity))
# hist(Division_divs)

Macrogroup_hist <- apply(GAP_Macrogroup$topics, MARGIN = 1, FUN = hist_counts)
Macrogroup_divs <- unlist(lapply(Macrogroup_hist, vegan::diversity))
# hist(Macrogroup_divs)

Group_hist <- apply(GAP_Group$topics, MARGIN = 1, FUN = hist_counts)
Group_divs <- unlist(lapply(Group_hist, vegan::diversity))
# hist(Group_divs)

Alliance_hist <- apply(GAP_Alliance$topics, MARGIN = 1, FUN = hist_counts)
Alliance_divs <- unlist(lapply(Alliance_hist, vegan::diversity))
Alliance_best <- apply(GAP_Alliance$topics, MARGIN = 1, FUN = max_sec)
# hist(Alliance_divs)

Wow, that's a steady leftward march in the distributions of Shannon diversity values...as we might expect given the changing number of categories at each level:

# par(mfrow = c(7,1), mar = c(1,1,1,1))
hist(Class_divs, xlim = c(0,2))
hist(Subclass_divs, xlim = c(0,2))
hist(Formation_divs, xlim = c(0,2))
hist(Division_divs, xlim = c(0,2))
hist(Macrogroup_divs, xlim = c(0,2))
hist(Group_divs, xlim = c(0,2))
hist(Alliance_divs, xlim = c(0,2))

# par(mfrow = c(1,1), mar = c(5,5,3,3))

I think this means there is no single diversity cutoff that applies to all levels of the NVC hierarchy.
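If level-specific cutoffs are needed, one arbitrary starting point (a sketch, not a recommendation) is a low quantile of each level's own distribution:

# candidate per-level cutoffs: the 10th percentile of each level's diversities
divs <- list(Class = Class_divs, Subclass = Subclass_divs,
             Formation = Formation_divs, Division = Division_divs,
             Macrogroup = Macrogroup_divs, Group = Group_divs,
             Alliance = Alliance_divs)
sapply(divs, quantile, probs = 0.1)

But...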

Two steps forward, one back

As I explored the data to find some of the best matches, I found problems with the LDAs. For a significant number of matches, the only text behind the NVC model is the translatedName of the category. How often is this a problem? About 1/8 of all NVC categories have extremely limited text, i.e., < 100 characters.

conv_txt <- function(x) {
  # the NVC text arrives as ISO-8859-1; convert to UTF-8 so nchar() behaves
  return(iconv(x = x, from = "iso-8859-1", to = "utf-8"))
}

NVC_text <- paste(unit_data$translatedName,
                  unit_data$typeConcept,
                  unit_data$Physiognomy,
                  unit_data$Environment,
                  unit_data$Dynamics,
                  unit_data$Floristics,
                  unit_data$Range)

txt_nchar <- nchar(conv_txt(NVC_text))  # both functions are vectorized
hist(txt_nchar, breaks = 100)
tmp <- txt_nchar[txt_nchar < 1000]
hist(tmp, breaks = 40)
length(txt_nchar[txt_nchar < 100])
length(txt_nchar)
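Or, the fraction directly:

mean(txt_nchar < 100)  # proportion of categories with < 100 characters of text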

Restart

While I could go back to the preceding code and do a bunch of edits, I think I will keep it all in place as a reminder... So without further ado, here's a re-do:

conv_txt <- function(x) {
  return(iconv(x = x, from = "iso-8859-1", to = "utf-8"))
}

# cats <- unique(unit_data$CLASSIFICATION_LEVEL_NAME)
# for(i in 2:length(cats)) {
#   print(c(i, cats[i]))
#   cur_top <- filter(unit_data, CLASSIFICATION_LEVEL_NAME == cats[i-1])
#   cur_sub <- filter(unit_data, CLASSIFICATION_LEVEL_NAME == cats[i])
#   cur_k <- nrow(cur_top)  # k = number of categories one level up
#   cur_txt <- paste(cur_sub$translatedName,
#                    cur_sub$typeConcept,
#                    cur_sub$Physiognomy,
#                    cur_sub$Environment,
#                    cur_sub$Dynamics,
#                    cur_sub$Floristics,
#                    cur_sub$Range)
#   cur_txt <- conv_txt(cur_txt)
#   txt_nchar <- nchar(cur_txt)
#   txt_df <- data.frame(elem = cur_sub$ELEMENT_GLOBAL_ID,
#                        text = cur_txt,
#                        txt_nchar = txt_nchar,
#                        stringsAsFactors = FALSE)
#   txt_df <- filter(txt_df, txt_nchar > 150)
#   if(nrow(txt_df) == 0) next
#   cur_corp <- tm::VCorpus(tm::VectorSource(txt_df$text))
#   cur_cln <- USNVC::prep_text(cur_corp)
#   cur_dtm <- tm::DocumentTermMatrix(cur_cln)
#   cur_dtm$dimnames$Docs <- as.character(txt_df$elem)
#   word_freq <- table(cur_dtm$j)
#   slim <- cur_dtm[, which(word_freq <= 20 & word_freq >= 2)]
#   slim <- slim[which(rowSums(as.matrix(slim)) > 0), ]
#   cur_lda <- LDA(x = slim,
#                  k = cur_k,
#                  method = "Gibbs",
#                  control = list(seed = 742,
#                                 burnin = 1000,
#                                 thin = 100,
#                                 iter = 2000))
#   cur_name <- paste0(cats[i], "_to_", cats[i-1], "_lda_model")
#   assign(x = cur_name, value = cur_lda)
# }
# 
# lapply(ls(pattern = "_lda_model"), FUN = function(x) {
#   save(list = x, file = paste0(x, ".rda")) }
# )

# cur_top <- filter(unit_data, CLASSIFICATION_LEVEL_NAME == "Formation")
# cur_sub <- filter(unit_data, CLASSIFICATION_LEVEL_NAME == "Division")
# cur_k <- nrow(cur_top)
# cur_txt <- paste(cur_sub$translatedName,
#                  cur_sub$typeConcept,
#                  cur_sub$Physiognomy,
#                  cur_sub$Environment,
#                  cur_sub$Dynamics,
#                  cur_sub$Floristics,
#                  cur_sub$Range)
# cur_txt <- conv_txt(cur_txt)
# txt_nchar <- nchar(cur_txt)
# txt_df <- data.frame(elem = cur_sub$ELEMENT_GLOBAL_ID,
#                      text = cur_txt,
#                      txt_nchar = txt_nchar,
#                      stringsAsFactors = FALSE)
# txt_df <- filter(txt_df, txt_nchar > 150)
# if(nrow(txt_df) == 0) stop("txt_df has no rows")
# cur_corp <- tm::VCorpus(tm::VectorSource(txt_df$text))
# cur_cln <- USNVC::prep_text(cur_corp)
# cur_dtm <- tm::DocumentTermMatrix(cur_cln)
# cur_dtm$dimnames$Docs <- as.character(txt_df$elem)
# word_freq <- table(cur_dtm$j)
# slim <- cur_dtm[, which(word_freq <= 20 & word_freq >= 2)]
# slim <- slim[which(rowSums(as.matrix(slim)) > 0), ]
# Division_to_Formation_lda_model <- LDA(x = slim,
#                                        k = cur_k,
#                                        method = "Gibbs",
#                                        control = list(seed = 742,
#                                                       burnin = 1000,
#                                                       thin = 100,
#                                                       iter = 2000))
# save(Division_to_Formation_lda_model, file = "Division_to_Formation_lda_model.rda")
# 
# 
# GAP_text <- paste(GAP_df$Class_Name, GAP_df$Description)
# GAP_corp <- tm::VCorpus(tm::VectorSource(GAP_text))
# GAP_cln <- USNVC::prep_text(GAP_corp)
# GAP_dtm <- tm::DocumentTermMatrix(GAP_cln)
# GAP_dtm$dimnames$Docs <- as.character(GAP_df$Class_Name)
# GAP_2 <- GAP_dtm[which(rowSums(as.matrix(GAP_dtm)) > 0), ]

# GAP_Class <- posterior(Subclass_to_Class_lda_model, GAP_2)
# save(GAP_Class, file = "GAP_Class.rda")
# GAP_Subclass <- posterior(Formation_to_Subclass_lda_model, GAP_2)
# save(GAP_Subclass, file = "GAP_Subclass.rda")
# GAP_Formation <- posterior(Division_to_Formation_lda_model, GAP_2)
# save(GAP_Formation, file = "GAP_Formation.rda")
# GAP_Division <- posterior(Macrogroup_to_Division_lda_model, GAP_2)
# save(GAP_Division, file = "GAP_Division.rda")
# GAP_Macrogroup <- posterior(Group_to_Macrogroup_lda_model, GAP_2)
# save(GAP_Macrogroup, file = "GAP_Macrogroup.rda")
# GAP_Group <- posterior(Alliance_to_Group_lda_model, GAP_2)
# save(GAP_Group, file = "GAP_Group.rda")
# GAP_Alliance <- posterior(Association_to_Alliance_lda_model, GAP_2)
# save(GAP_Alliance, file = "GAP_Alliance.rda")

Well, crud. All of the levels appear to work well except Division. Upon further inspection, I see that's because the Division level has no descriptive text...no typeConcept, no Physiognomy, nothing beyond a name. This may be a case where concatenating text from ancestors / children is necessary.
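The idea in miniature: walk each unit's parent links upward and paste the ancestors' text onto its own, so a level with empty descriptions (like Division) inherits words from its lineage. A toy sketch with a hypothetical parent lookup (parent_of and the IDs are made up; the real code below uses USNVC::get_ancestor_typeConcept against NVC_graph):

# hypothetical toy hierarchy: each unit mapped to its parent
parent_of <- c(division1 = "formation1", formation1 = "subclass1",
               subclass1 = "class1")
type_concept <- c(division1 = "",  # Division has no text of its own
                  formation1 = "cool temperate forest",
                  subclass1 = "temperate forest woodland",
                  class1 = "forest woodland")

ancestor_text <- function(id) {
  # climb the hierarchy, accumulating each ancestor's typeConcept
  txt <- type_concept[id]
  while (id %in% names(parent_of)) {
    id <- parent_of[[id]]
    txt <- paste(txt, type_concept[id])
  }
  trimws(txt)
}

ancestor_text("division1")
# "cool temperate forest temperate forest woodland forest woodland"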

NVC correlated topic model

Unlike LDA, a correlated topic model (CTM) lets the topic proportions covary across documents, which seems worth trying given how intertwined the NVC categories are.

cur_sub <- filter(unit_data, CLASSIFICATION_LEVEL_NAME == "Alliance")
cur_txt <- paste(cur_sub$translatedName,
                 cur_sub$typeConcept,
                 cur_sub$Physiognomy,
                 cur_sub$Environment,
                 cur_sub$Dynamics,
                 cur_sub$Floristics,
                 cur_sub$Range)
# cur_txt <- paste(cur_sub$Physiognomy,
#                  cur_sub$Environment,
#                  cur_sub$Dynamics)
# note: this overwrites the paste() text above with ancestor typeConcepts
cur_txt <- unlist(lapply(as.character(cur_sub$ELEMENT_GLOBAL_ID),
                         FUN = USNVC::get_ancestor_typeConcept,
                         g = NVC_graph))
# cur_txt <- unlist(lapply(as.character(cur_sub$ELEMENT_GLOBAL_ID),
#                          FUN = USNVC::get_ancestor_text,
#                          g = NVC_graph,
#                          "typeConcept",
#                          "Physiognomy",
#                          "Dynamics"))
cur_txt <- conv_txt(cur_txt)
# hold out document 31 so we can test posterior prediction on it below
out31 <- cur_txt[31]
cur_txt <- cur_txt[-31]
txt_nchar <- nchar(cur_txt)
txt_df <- data.frame(elem = cur_sub$ELEMENT_GLOBAL_ID[-31],
                     text = cur_txt,
                     txt_nchar = txt_nchar,
                     stringsAsFactors = FALSE)
txt_df <- filter(txt_df, txt_nchar > 150)
if(nrow(txt_df) == 0) stop("txt_df has no rows")
cur_corp <- tm::VCorpus(tm::VectorSource(txt_df$text))
cur_cln <- USNVC::prep_text(cur_corp)
cur_dtm <- tm::DocumentTermMatrix(cur_cln)
cur_dtm$dimnames$Docs <- as.character(txt_df$elem)
word_freq <- table(cur_dtm$j)
slim <- cur_dtm[, which(word_freq <= 10 & word_freq >= 2)]
slim <- slim[which(rowSums(as.matrix(slim)) > 0), ]

form_ctm <- CTM(cur_dtm, k = 13)
topics(form_ctm)

form_2_lda <- LDA(cur_dtm, 
                  k = 74,
                  method = "gibbs",
                  control = list(seed = 742,
                                 burnin = 1000,
                                 thin = 100,
                                 iter = 2000))
topics(form_2_lda)
table(topics(form_2_lda))
table(cur_sub$PARENT_ID)

form_lda <- LDA(slim, 
                k = 13,
                method = "gibbs",
                control = list(seed = 742,
                               burnin = 1000,
                               thin = 100,
                               iter = 2000))

out31_corp <- tm::VCorpus(tm::VectorSource(out31))
out31_cln <- USNVC::prep_text(out31_corp)
# build the held-out DTM over the model's vocabulary so the columns align
out31_dtm <- tm::DocumentTermMatrix(out31_cln,
                                    control = list(dictionary = tm::Terms(cur_dtm)))
pred31 <- posterior(form_ctm, out31_dtm)
topics(form_ctm)
cov2cor(form_ctm@Sigma)
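One way to read that correlation matrix is to pull out the most strongly correlated pair of topics:

# largest off-diagonal topic correlation (both orderings appear because
# the matrix is symmetric)
topic_cor <- cov2cor(form_ctm@Sigma)
diag(topic_cor) <- NA
which(topic_cor == max(topic_cor, na.rm = TRUE), arr.ind = TRUE)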

par(mfrow = c(6,6), mar = c(1,1,1,1))
for(i in 1:nrow(form_lda@gamma)) {  # was form_ctm; the plots use form_lda
  hist(form_lda@gamma[i, ], xlim = c(0,1), main = form_lda@documents[i])
}

par(mfrow = c(1,1), mar = c(5,5,4,3))

