knitr::opts_chunk$set(
  collapse = TRUE,
  warning = FALSE,
  message = FALSE,
  echo = FALSE,
  comment = "#>",
  fig.path = "figures/"  # trailing slash so figures land in the directory
)

library(saaabstracts)
library(tm)
library(topicmodels)
library(readxl)
library(Rmpfr)
library(tidyverse)
# read in the abstracts
general_2018 <- read_excel("../data/raw_data/General Abstracts_2018.xlsx")

general_2018_posters <- general_2018 %>%
  filter(grepl("Poster", `Presentation Format`))

general_2018_papers <- general_2018 %>%
  filter(grepl("Paper", `Presentation Format`))

# clean the geographic focus and keyword columns: lower-case, strip
# punctuation, line breaks and " and ", then join multi-word terms with
# underscores so they survive tokenisation as single terms
clean_geo <- function(x){
  tmp <- stringr::str_trim(tolower(gsub("[[:punct:]]|\r\n| and ", " ", x)))
  tmp <- gsub("other", "", tmp)
  tmp <- gsub(" ", "_", tmp)
  tmp <- gsub("_{2,}", "_", tmp)  # collapse runs of underscores
  tmp
}

general_2018_papers$geo <- unlist(purrr::map(general_2018_papers$`Geographic Focus`, ~clean_geo(.x)))
general_2018_papers$kw1 <- unlist(purrr::map(general_2018_papers$Keyword1, ~clean_geo(.x)))
general_2018_papers$kw2 <- unlist(purrr::map(general_2018_papers$Keyword2, ~clean_geo(.x)))
general_2018_papers$kw3 <- unlist(purrr::map(general_2018_papers$Keyword3, ~clean_geo(.x)))

library(stringi)

# weight the terms by repetition: the geographic focus appears four
# times, the first keyword three times, the other keywords once each
general_2018_papers$text_to_model <-
  paste(stri_dup(paste(general_2018_papers$geo, ""), 4),
        stri_dup(paste(general_2018_papers$kw1, ""), 3),
        general_2018_papers$kw2,
        general_2018_papers$kw3)
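To see what the cleaning does, here is a small worked example (the input string is hypothetical, not taken from the data):

clean_geo("North America and Mesoamerica\r\n")
#> [1] "north_america_mesoamerica"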
# clean text to model
general_2018_papers$text_to_model <-
  stringr::str_trim(tolower(gsub("\\/|:|&", " ", general_2018_papers$text_to_model)))

general_2018_papers$text_to_model <-
  gsub(" and | archaeology | analysis |\r\n", " ", general_2018_papers$text_to_model)
# create a corpus object from the text to model
general_2018_papers_corpus <- tm::Corpus(tm::VectorSource(general_2018_papers$text_to_model))

ndocs <- length(general_2018_papers_corpus)

# ignore extremely rare words, i.e. terms that appear in less than 1% of the documents
minTermFreq <- ndocs * 0.01

# ignore overly common words, i.e. terms that appear in more than 50% of the documents
maxTermFreq <- ndocs * 0.5

# create a document-term matrix
general_2018_papers_tm <- tm::DocumentTermMatrix(
  general_2018_papers_corpus,
  control = list(stemming = TRUE,
                 stopwords = TRUE,
                 wordLengths = c(4, 30),
                 removeNumbers = TRUE,
                 bounds = list(global = c(minTermFreq, maxTermFreq)),
                 removePunctuation = TRUE))
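As a quick sanity check we can look at the dimensions of the matrix and its most frequent terms; findFreqTerms() is from tm, and the frequency threshold of 20 is an arbitrary illustration:

dim(general_2018_papers_tm)  # documents x terms
tm::findFreqTerms(general_2018_papers_tm, lowfreq = 20)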
Distance matrix

Convert the document-term matrix to a dense matrix and compute pairwise Euclidean distances between abstracts.
m <- as.matrix(general_2018_papers_tm)
distMatrix <- dist(m, method = "euclidean")
How many clusters?

Use affinity propagation (the apcluster package) to estimate a plausible number of clusters from the data rather than choosing one by hand.
library(apcluster)

# affinity propagation on negative squared distances
d.apclus <- apcluster(negDistMat(r = 2), m)
k <- length(d.apclus@clusters)

# windows()  # Windows-only graphics device; not needed on other platforms
heatmap(d.apclus)
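A quick look at the result, a sketch assuming the standard APResult slots from apcluster:

# how many clusters did affinity propagation find?
k

# the indices of the exemplar documents for each cluster
head(d.apclus@exemplars)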
Cluster analysis and dendrogram
# note: hclust's "ward.D" assumes squared dissimilarities;
# with raw Euclidean distances, "ward.D2" may be more appropriate
groups <- hclust(distMatrix, method = "ward.D")
plot(groups, cex = 0.9, hang = -1)
rect.hclust(groups, k = k)

# see what cluster each abstract is in:
cuts <- cutree(groups, k = k)
How many items are in each cluster?
# histogram of cluster sizes
freqs_per_cluster <-
  tibble(cuts = cuts, doc = names(cuts)) %>%  # data_frame() is deprecated
  group_by(cuts) %>%
  tally(sort = TRUE)

hist(freqs_per_cluster$n)
median(freqs_per_cluster$n) # 10
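A sorted bar plot can be easier to read than a histogram here; a minimal sketch using the same cluster assignments:

barplot(sort(table(cuts), decreasing = TRUE),
        xlab = "cluster", ylab = "number of abstracts")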
# put topics back onto abstract spreadsheet
# abstract_ids <- dimnames(general_2018_reduced_dtm)$Docs
# general_2018$`Abstract Id`
general_2018_papers_with_topics <- bind_cols(general_2018_papers, tibble(cuts))

write.csv(general_2018_papers_with_topics,
          "../data/derived_data/general_2018_papers_with_clusters.csv",
          row.names = FALSE)
Compare

Compare the automatically assigned clusters with the manually revised version of the spreadsheet.
ab <- read_excel("../data/derived_data/Copy of general_2018_papers_with_clusters_AB.xlsx")

compare <- left_join(general_2018_papers_with_topics, ab, by = "Abstract Id")

compare_groups <- compare %>%
  select(cuts.x, cuts.y)
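To quantify how well the two classifications agree, a simple sketch (assuming cuts.x holds the automatic clusters and cuts.y the manually revised ones):

# cross-tabulate automatic vs manual cluster assignments
table(compare_groups$cuts.x, compare_groups$cuts.y)

# proportion of abstracts where the two assignments agree
mean(compare_groups$cuts.x == compare_groups$cuts.y, na.rm = TRUE)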
Reclassify using weighted keywords
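No code follows this heading in the current draft. One way to proceed, sketched here under the assumption that we simply change the repetition weights used above and rerun the corpus, distance, and clustering steps (the weights 2 and 4 are hypothetical):

# hypothetical re-weighting: give the first keyword more weight than
# the geographic focus, then rebuild text_to_model and rerun the
# pipeline above
general_2018_papers$text_to_model <-
  paste(stri_dup(paste(general_2018_papers$geo, ""), 2),
        stri_dup(paste(general_2018_papers$kw1, ""), 4),
        general_2018_papers$kw2,
        general_2018_papers$kw3)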
This report was generated on `r Sys.time()`
using the following computational environment and dependencies:
# which R packages and versions?
devtools::session_info()
The current Git commit details are:
# what commit is this file at? You may need to change the path value
# if your Rmd is not in analysis/paper/
git2r::repository("../..")