#' Create objects for input to STM functions
#'
#' Create and save objects to be used as input to STM functions.
#' @param input_dir Directory containing text files from which to create STM input.
#' @param output_file Full file path, including the file name, at which to save the .RData file containing the STM input objects.
#' @param bounds An integer vector of length 2 indicating the lower and upper bounds (inclusive) for the number of documents in which a token must appear to be included in the analysis.
#' @param word_lengths An integer vector of length 2 indicating lower and upper bounds (inclusive) for the number of characters a token must have to be included in the analysis.
#' @param split_docs Logical indicating whether documents should be split into sections of approximately 3,000 words prior to STM analysis.
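#' @return No return value; called for its side effect of saving the objects
#'   stm_input and raw_reports to output_file.
#' @examples
#' \dontrun{
#' # Illustrative paths only: assumes a directory of plain-text report files
#' # and the package's gensci_meta metadata table
#' create_stm_input(input_dir="reports_txt", output_file="stm_input.RData",
#'                  split_docs=TRUE)
#' }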
#' @export
create_stm_input <- function(input_dir, output_file, bounds=c(2, Inf),
                             word_lengths=c(3, 24), split_docs=FALSE) {
  all_reports <- tm::VCorpus(tm::DirSource(input_dir))
  # Normalize document ids: drop the "_no_ref" suffix from file names
  all_reports <- tm::as.VCorpus(lapply(all_reports, function(x) {
    x$meta$id <- stringr::str_replace(x$meta$id, "_no_ref\\.txt$", ".txt")
    return(x)
  }))
  titles <- sapply(all_reports, tm:::meta.VCorpus, "id", USE.NAMES=FALSE)
  # Rename gensci_meta as d
  d <- gensci_meta
  # setdiff(stringr::str_replace(titles, "\\.txt$", ""), d$New.Names.for.Pdfs)
  # Remove EC_1999b, EC_2016 (study period ends at 2015), and all but the first
  # report in each series: the She Figures reports and the NSF "Women,
  # Minorities, and Persons with Disabilities..." reports
  she_figures <- paste0(c("EC_2003", "EC_2006", "EC_2009d", "EC_2013c", "EC_2015f"), ".txt")
  nsf <- paste0(c("NSF_1994", "NSF_2000", "NSF_2002", "NSF_2013", "NSF_2015"), ".txt")
  to_drop <- c("EC_1999b.txt", "EC_2016.txt", tail(she_figures, -1), tail(nsf, -1))
  all_reports <- all_reports[!(titles %in% to_drop)]
  titles <- titles[!(titles %in% to_drop)]
  # Sort the corpus, titles, and metadata consistently by title
  title_order <- order(titles)
  titles <- titles[title_order]
  all_reports <- all_reports[title_order]
  data.table::setorder(d, New.Names.for.Pdfs)
  d <- d[New.Names.for.Pdfs %in% stringr::str_replace(titles, "\\.txt$", ""), ]
  all <- all_reports
  rm(all_reports)
  # Remove the space between a dash and the next word in hyphenated words
  all <- tm::tm_map(all, tm::content_transformer(function(x) stringr::str_replace_all(x, "(?<=\\w)- (?=\\w)", "-")))
  ## Remove web addresses
  url_pattern <- "https?://\\S+|www\\S+"
  all <- tm::tm_map(all, tm::content_transformer(function(x) stringr::str_replace_all(x, url_pattern, "")))
  ## Change "per cent" to "percent"
  all <- tm::tm_map(all, tm::content_transformer(function(x) stringr::str_replace_all(x, "per cent", "percent")))
  # raw_reports (built below) preserves the unprocessed text for the
  # "most representative document" output
  if(split_docs) {
    all_pages <- list()
    all_pages_years <- integer()
    all_pages_geog <- character()
    all_pages_titles <- character()
    all_pages_report <- character()
    for(i in seq_along(all)) {
      doc <- all[[i]]
      pages <- stringr::str_split(doc$content, "\\f")[[1]]
      doc_title <- stringr::str_replace(doc$meta$id, "\\.txt$", "")
      metadata <- d[New.Names.for.Pdfs==doc_title, ]
      doc_year <- metadata$Year
      doc_geog <- metadata$US.EU.UN
      token_pattern <- "[\\S]+"
      word_count <- 0L
      page_grp <- integer()
      # Aggregate consecutive pages until the group reaches 3,000 words
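      # e.g. pages with 1200, 1100, 900, and 400 words yield two sections:
      # pages 1-3 (3200 words) and page 4 alone (the final partial group
      # is kept as its own section)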
      for(j in seq_along(pages)) {
        word_count <- word_count + stringr::str_count(pages[j], token_pattern)
        page_grp <- c(page_grp, j)
        if(word_count >= 3000 || (word_count > 0 && j==length(pages))) {
          page_grp_title <- sprintf("%s_%d_to_%d", doc_title, head(page_grp, 1), tail(page_grp, 1))
          to_add <- tm::PlainTextDocument(x=stringr::str_c(pages[page_grp], collapse=" "),
                                          id=page_grp_title, language="en")
          all_pages <- c(all_pages, list(to_add))
          all_pages_years <- c(all_pages_years, doc_year)
          all_pages_geog <- c(all_pages_geog, doc_geog)
          all_pages_titles <- c(all_pages_titles, page_grp_title)
          all_pages_report <- c(all_pages_report, doc_title)
          word_count <- 0L
          page_grp <- integer()
        }
      }
    }
    all <- tm::as.VCorpus(all_pages)
    raw_reports <- sapply(all, tm:::content.VCorpus)
    d <- data.table::data.table(title=all_pages_titles, year=all_pages_years,
                                geog=factor(all_pages_geog),
                                report=factor(all_pages_report))
    rm(all_pages, all_pages_titles, all_pages_years, all_pages_geog, all_pages_report)
  } else {
    raw_reports <- sapply(all, tm:::content.VCorpus)
    d <- d[ , .(title=Title, year=Year, geog=US.EU.UN)]
  }
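  # Corpus-specific stopwords; tokens like "dqg", "jhqghu", and "wklv" look
  # like character-encoding artifacts of "and", "gender", and "this" from the
  # source PDFs (an inference from the shifted character codes)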
  custom_stopwords <- c("appendix", "annex", "andor", "dqg", "embo", "dfg", "jhqghu", "wklv", "copyright", "pdf",
                        "typesetting", "typographical", "europe", "european", "america", "american", "january",
                        "february", "march", "april", "may", "june", "july", "august", "september", "septembre",
                        "october", "octobre", "november", "novembre", "december", "decembre")
  # Convert to lowercase
  all <- tm::tm_map(all, tm::content_transformer(tolower))
  ## Remove numbers
  all <- tm::tm_map(all, tm::removeNumbers)
  # Remove punctuation
  all <- tm::tm_map(all, tm::content_transformer(function(x) texanaaid::remove_punctuation(x)))
  # Change "title ix" to "titleix"
  all <- tm::tm_map(all, tm::content_transformer(function(x) stringr::str_replace_all(x, "\\btitle ix\\b", "titleix")))
  # Transliterate non-ASCII characters
  all <- tm::tm_map(all, tm::content_transformer(function(x) iconv(x, to="ASCII//TRANSLIT")))
  # Convert British to American spellings
  british_to_american <- british_to_american_spellings()
  plurals <- paste0(british_to_american, "s")
  names(plurals) <- paste0(names(british_to_american), "s")
  british_to_american <- c(british_to_american, plurals)
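  # e.g. if the conversion table maps "colour" to "color", the derived plural
  # map adds "colours" -> "colors"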
  for(i in seq_along(all)) {
    for(w in names(british_to_american)) {
      all[[i]]$content <-
        stringr::str_replace_all(all[[i]]$content,
                                 sprintf("\\b%s\\b", w),
                                 british_to_american[w])
    }
  }
  ## Remove stopwords
  all <- tm::tm_map(all, tm::removeWords, tm::stopwords("english"))
  all <- tm::tm_map(all, tm::removeWords, custom_stopwords)
  # Pre-stem "hand-stemming": modifications to ensure words are combined or
  # distinguished the way we want
  ## Make a post-stem conversion dictionary
  post_stem_conversion <- character(0)
  ## Distinguish "equality" and "equalities" from "equal"
  ### By default these are all stemmed to "equal"
  ### Replace "equalities" and "equality" with "equalit"
  all <- tm::tm_map(all, tm::content_transformer(
    function(x) stringr::str_replace_all(x, "\\bequalities\\b|\\bequality\\b", "equalit")
  ))
  ### After stemming, replace "equalit" with "equaliti"
  post_stem_conversion["\\bequalit\\b"] <- "equaliti"
  ##
  ## Combine all versions of root "scien"
  ### This includes "science", "sciences", "scientific", "scientist", and
  ### "scientists"
  ### Replace all these with "scien"
  all <- tm::tm_map(all, tm::content_transformer(
    function(x) stringr::str_replace_all(x, paste0("\\b",
                                                   c("sciences*", "scientific",
                                                     "scientists*"),
                                                   "\\b", collapse="|"),
                                         "scien")))
  ##
  ## Stem all forms of technolog- to "techn"
  all <- tm::tm_map(all, tm::content_transformer(
    function(x) stringr::str_replace_all(x, "\\btechnolog[a-z]+\\b",
                                         "techn")))
  ##
  ## Stem all forms of entrepreneur- to "entrepr"
  all <- tm::tm_map(all, tm::content_transformer(
    function(x) stringr::str_replace_all(x, "\\bentrepreneur[a-z]*\\b",
                                         "entrepr")))
  ##
  ## Distinguish all forms of "engineer" from "engine"
  all <- tm::tm_map(all, tm::content_transformer(
    function(x) stringr::str_replace_all(x, "engineer",
                                         "engineert")))
  post_stem_conversion["engineert"] <- "engineer"
  ## Stem
  all <- tm::tm_map(all, tm::stemDocument)
  ## Perform post-stem conversion
  ### "grantee" and "grantees" stem to "grante"; fold that into "grant"
  post_stem_conversion["\\bgrante\\b"] <- "grant"
  for(w in names(post_stem_conversion)) {
    all <- tm::tm_map(all,
                      tm::content_transformer(function(x) stringr::str_replace_all(
                        x, w, post_stem_conversion[w])))
  }
  ## Convert stemmed British spellings to catch variations on words
  ## not on the original list
  for(i in seq_along(all)) {
    for(w in names(british_to_american)) {
      all[[i]]$content <-
        stringr::str_replace_all(all[[i]]$content,
                                 sprintf("\\b%s\\b", SnowballC::wordStem(w)),
                                 SnowballC::wordStem(british_to_american[w]))
    }
  }
  ## Remove post-stem stopwords
  all <- tm::tm_map(all, tm::removeWords, post_stem_custom_stopwords)
  custom_tokenizer <- NLP::Token_Tokenizer(texanaaid::tokenize)
  dtm <- tm::DocumentTermMatrix(all,
                                control=list(tokenize=custom_tokenizer,
                                             bounds=list(global=bounds),
                                             wordLengths=word_lengths))
  # dim(dtm) under alternative settings:
  ### With stemming and wordLengths=c(2, 24): 21,583 terms
  ### With stemming, wordLengths=c(2, 24), and bounds=c(3, Inf): 14,651 terms
  ### Without stemming and wordLengths=c(2, 24): 30,162 terms
  ### Without stemming and with doc bounds=c(2, 112): 30,131 terms
  ### Without stemming, bounds, or wordLengths restrictions: 87,797 terms
  stm_input <- stm::readCorpus(corpus=dtm, type="slam")
  stm_input$meta <- d
  # prepDocuments() drops empty documents and renumbers word indices as needed
  stm_input <- stm::prepDocuments(stm_input$documents, stm_input$vocab, stm_input$meta)
  save(stm_input, raw_reports, file=output_file) # used to save dtm as well, but not sure I need it anymore
}