knitr::opts_chunk$set( collapse = TRUE, warning = FALSE, message = FALSE, echo = FALSE, comment = "#>", fig.path = "figures" ) library(saaabstracts) library(tm) library(topicmodels) library(readxl) library(Rmpfr) library(tidyverse)
# read in the abstracts general_2018 <- read_excel("../data/raw_data/General Abstracts_2018.xlsx") general_2018_posters <- general_2018 %>% filter(grepl("Poster", `Presentation Format`)) general_2018_papers <- general_2018 %>% filter(grepl("Paper", `Presentation Format`)) # clean geo clean_geo <- function(x){ tmp <- stringr::str_trim(tolower(gsub("[[:punct:]]|\r\n| and ", " ", x))) tmp <- gsub("other", "", tmp) tmp <- gsub(" ", "_", tmp) tmp <- gsub("_{2}", "_", tmp) tmp } general_2018_posters$geo <- unlist(purrr::map(general_2018_posters$`Geographic Focus`, ~clean_geo(.x))) general_2018_posters$kw1 <- unlist(purrr::map(general_2018_posters$Keyword1, ~clean_geo(.x))) general_2018_posters$kw2 <- unlist(purrr::map(general_2018_posters$Keyword2, ~clean_geo(.x))) general_2018_posters$kw3 <- unlist(purrr::map(general_2018_posters$Keyword3, ~clean_geo(.x))) general_2018_posters$text_to_model <- with(general_2018_posters, paste(geo, kw1, kw2, kw3))
# clean text to model general_2018_posters$text_to_model <- stringr::str_trim(tolower(gsub("\\/|:|&", " ", general_2018_posters$text_to_model))) general_2018_posters$text_to_model <- gsub(" and | archaeology | analysis |\r\n", " ", general_2018_posters$text_to_model)
# Create a corpus object from the text to mode general_2018_posters_corpus <- tm::Corpus(tm::VectorSource(general_2018_posters$text_to_model )) ndocs <- length(general_2018_posters_corpus) # ignore extremely rare words i.e. terms that appear in less then 1% of the documents minTermFreq <- ndocs * 0.01 # ignore overly common words i.e. terms that appear in more than 50% of the documents maxTermFreq <- ndocs * 0.5 # create a document term matrix general_2018_posters_tm <- tm::DocumentTermMatrix(general_2018_posters_corpus, control = list(stemming = TRUE, stopwords = TRUE, wordLengths=c(4,30), removeNumbers = TRUE, bounds = list(global = c(minTermFreq, maxTermFreq)), removePunctuation = TRUE))
Distance matrix
m <- as.matrix(general_2018_posters_tm) distMatrix <- dist(m, method="euclidean")
How many clusters?
library(apcluster) d.apclus <- apcluster(negDistMat(r=2), m) k <- length(d.apclus@clusters) windows() heatmap(d.apclus)
Cluster analysis and dendrogram
groups <- hclust(distMatrix,method="ward.D") plot(groups, cex=0.9, hang=-1) rect.hclust(groups, k) # see what cluster each abstract is in: cuts <- cutree(groups, k)
library(dynamicTreeCut) maxCoreScatter <- 0.99 minGap <- (1 - maxCoreScatter) * 0.01 dynamicCut <- cutreeDynamic(groups, minClusterSize=4, method="hybrid", distM=as.matrix(dist(m, method="euclidean")), deepSplit=4, maxCoreScatter=maxCoreScatter, maxAbsCoreScatter=NULL, minAbsGap=NULL) hist(table(dynamicCut)) xfreqs_per_cluster <- dynamicCut %>% data_frame(dynamicCut = ., freq = names(.)) %>% group_by(dynamicCut) %>% tally(sort=TRUE) hist(xfreqs_per_cluster$n) library(dendextend) hc <- groups dend <- as.dendrogram(hc) heights_per_k.dendrogram(dend) unnest <- function(x) { # from Vlo's answer if(is.null(names(x))) x else c(list(all=unname(unlist(x))), do.call(c, lapply(x, unnest))) } hc <- groups cuts2 <- hc$height + 1e-9 # my experiment for(i in seq_along(cuts2)){ n <- attr(cut(as.dendrogram(groups), h=3)$lower[[i]], "members") print(n) } my_list <- cut(as.dendrogram(groups), h=3) map(my_list, unnest(.x)$lower) map(cut(as.dendrogram(groups), h=3), ~attr(.x$lower, "members")) xcuts <- cutree(groups, k) xfreqs_per_cluster <- xcuts %>% data_frame(cuts = ., freq = names(.)) %>% group_by(cuts) %>% tally(sort=TRUE) xmax <- max(xfreqs_per_cluster$n) xmin <- min(xfreqs_per_cluster$n) max_size <- 16 min_size <- 4 n <- 0 while(xmax <= max_size & xmin >= min_size & n <= length(xcuts)){ n <- n + 1 ycuts <- cutree(groups, n) xfreqs_per_cluster <- xcuts %>% data_frame(cuts = ., freq = names(.)) %>% group_by(cuts) %>% tally(sort=TRUE) xmax <- max(xfreqs_per_cluster$n) xmin <- min(xfreqs_per_cluster$n) xcuts <- cutree(groups, n) }
hc <- groups hc$size unnest <- function(x) { # from Vlo's answer if(is.null(names(x))) x else c(list(all=unname(unlist(x))), do.call(c, lapply(x, unnest))) } cuts2 <- hc$height + 1e-9 biggest <- 50 smallest <- 0 max_size <- 16 min_size <- 4 i <- 0 while(smallest < min_size & biggest > max_size & i <= length(cuts2)){ h_i <- cuts2[i <- i+1] if(i > length(cuts2)){ warning("Couldn't find a cluster big enough.") } else smallest <- Reduce(min, lapply(X = unnest(cut(as.dendrogram(hc), h=h_i)$lower), FUN = attr, which = "members") ) # from lukeA's comment biggest <- Reduce(max, lapply(X = unnest(cut(as.dendrogram(hc), h=h_i)$upper), FUN = attr, which = "members") ) # from lukeA's comment print(paste0("biggest: ", biggest, ", smallest: ", smallest, " i: ", i, " h_i: ", h_i)) } h_i cut(as.dendrogram(hc), h=h_i)
How many items in each cluster
# histogram freqs_per_cluster <- cuts %>% data_frame(cuts = ., freq = names(.)) %>% group_by(cuts) %>% tally(sort=TRUE) hist(freqs_per_cluster$n) median(freqs_per_cluster$n) # 10
# put topics back onto abstract spreadsheet # abstract_ids <- dimnames(general_2018_reduced_dtm)$Docs # general_2018$`Abstract Id` general_2018_posters_with_topics <- bind_cols(general_2018_posters, data_frame(cuts)) write.csv(general_2018_posters_with_topics, "../data/derived_data/general_2018_posters_with_clusters.csv")
This report was generated on r Sys.time()
using the following computational environment and dependencies:
# which R packages and versions? devtools::session_info()
The current Git commit details are:
# what commit is this file at? You may need to change the path value # if your Rmd is not in analysis/paper/ git2r::repository("../..")
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.