library(modules)
nlp <- module({
import("dplyr")
import("ggraph")
import("igraph")
import("koRpus")
import("ldatuning")
import("mallet")
import("naturalsort")
import("pdftools")
import("qdapDictionaries")
import("textmineR")
import("tidyr")
import("tidytext")
import("tm")
import("tools")
import("topicmodels")
read_txt_files <- function(fnames, as_string = TRUE) {
# Apply 'readLines()' function to each element of a character vector of .txt filenames.
#
# Arguments:
# fnames {char} -- vector of filenames to read and apply cleaning to
#
# Keyword Arguments:
# as_string {logical} -- if TRUE, read each text file as a string rather than a vector
# by collapsing lines separated by '\n'
#
# Returns:
# {named char} -- if (as_string = TRUE); character vector of textfile contents as strings
# {list} -- if (as_string = FALSE); list of textfile contents as character vectors
stopifnot(is.character(fnames))
stopifnot(is.logical(as_string))
# Read files and collapse
text = lapply(fnames, readLines)
if (as_string) {
text = unlist(lapply(text, function(x) paste0(x, collapse = "\n")))
}
# Create named chr from filenames
names(text) = fnames
return(text)
}
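# Illustrative usage sketch (hypothetical directory and filenames; assumes the module
# is assigned to `nlp` as above):
#   txt_files = list.files("corpus_dir", pattern = "\\.txt$", full.names = TRUE)
#   texts = nlp$read_txt_files(txt_files)                           # named chr, one string per file
#   text_lines = nlp$read_txt_files(txt_files, as_string = FALSE)   # list of line vectors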
rm_letters <- function(str, ltrs = LETTERS, upper_and_lower = TRUE) {
# Remove strings like "b" or "A", or "W", or any other standalone
# letter. Standalone letters are identified using word boundaries (\b).
#
# Arguments:
# str {char} -- character string to remove letters from
#
# Keyword Arguments:
# ltrs {char} -- letter or letters to remove from `str` (default: LETTERS)
# upper_and_lower {logical} -- if TRUE, remove both upper and lower case versions of `ltrs`
#
# Returns:
# {char} -- string with letters removed
stopifnot(is.character(str))
stopifnot(is.logical(upper_and_lower))
# Get upper and lower case versions of `ltrs` if specified. If not, just use `ltrs` as it is
if (upper_and_lower) {
ltrs = c(toupper(ltrs), tolower(ltrs))
}
letters_regex = paste0("\\b", paste0(ltrs, collapse = "\\b|\\b"), "\\b")
return( gsub(letters_regex, " ", str) )
}
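# Illustrative usage sketch (example inputs are hypothetical):
#   nlp$rm_letters("Appendix B section c")                          # standalone "B" and "c" replaced with spaces
#   nlp$rm_letters("Plan B", ltrs = "B", upper_and_lower = FALSE)   # only upper case "B" removed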
rm_stopwords <- function(str, stopwords = tm::stopwords("en")){
# Remove any word in a vector, stopwords, from another character vector.
# You can pass in any list of "stopwords" to be removed from x
# If unspecified, stopwords list defaults to list from 'tm' package
#
# Arguments:
# str {char} -- character string to remove stopwords from
#
# Keyword Arguments:
# stopwords {char} -- character vector of words to remove (default: tm::stopwords("en"))
#
# Function Dependencies:
# - tm
#
# Returns:
# {char} -- string with stopwords removed
stopifnot(is.character(str))
stopifnot(is.character(stopwords))
# Collapse into regex string
stopwords_regex = paste0("\\b", paste0(stopwords, collapse = "\\b|\\b"), "\\b")
# Remove all stopwords by replacing with an empty string
out = gsub(stopwords_regex, "", str)
# Remove extra spaces
out = rm_excess_spaces(out)
return(out)
}
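# Illustrative usage sketch (example inputs are hypothetical):
#   nlp$rm_stopwords("the cat sat on the mat")                    # -> "cat sat mat"
#   nlp$rm_stopwords("alpha beta gamma", stopwords = c("beta"))   # -> "alpha gamma"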
rm_punct <- function(str, except = NULL){
# Remove punctuation from a character vector.
#
# Arguments:
# str {char} -- character string to remove punctuation from
#
# Keyword Arguments:
# except {char} or NULL -- character string specifying punctuation characters NOT to
# remove (default: NULL). Ex: except = ".,|" -> will not remove
# periods, commas or pipes.
#
# Returns:
# {char} -- character string with punctuation removed
stopifnot(is.character(str))
stopifnot(is.character(except) || is.null(except))
stopifnot(is.null(except) || length(except) == 1)
# Original pattern specifying all punctuation (all characters that are
# NOT alphanumeric)
punct_pattern = "[^[:alnum:] "
# If single quote ' is in "except", this needs to be treated differently.
# It must be replaced with nothing (instead of a space) due to cases like
# "don't". If it were treated the same, don't --> don t, which is not what
# we want. We will simply remove the single quote before the rest.
if(!is.null(except)){
if(grepl("'", except)){
str = gsub("'", "", str)
except = gsub("'", "", except)
}
}
# Add except if specified
if(!is.null(except)) punct_pattern = paste0(punct_pattern, except)
punct_pattern = paste0(punct_pattern, "]") # Close unclosed square bracket
# Apply removal of punctuation
str = gsub(punct_pattern, " ", str)
str = rm_excess_spaces(str)
return(str)
}
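# Illustrative usage sketch (example inputs are hypothetical):
#   nlp$rm_punct("Hello, world! (test)")           # -> "Hello world test"
#   nlp$rm_punct("don't stop-me", except = "'-")   # -> "dont stop-me" (apostrophe dropped, hyphen kept)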
rm_nonwords <- function(str, keep_numerics = TRUE, keep_acronyms = TRUE, keep_names = TRUE){
# Remove non-English words from character string or vector using the Grady Augmented
# dictionary. This will also remove all punctuation and automatically encode to UTF-8.
#
# Arguments:
# str {char} -- character string to remove nonwords from
#
# Keyword Arguments:
# keep_numerics {logical} -- if TRUE, any token that can be coerced to object of type
# numeric using 'as.numeric()' is kept
# keep_acronyms {logical} -- if TRUE, any token consisting entirely of upper case
# letters (and not coercible to numeric) is kept
# keep_names {logical} -- if TRUE, all tokens that are in qdapDictionaries::NAMES$name
# are kept.
#
# Function Dependencies:
# - qdapDictionaries
#
# Returns:
# {char} -- character string with nonwords removed
stopifnot(is.character(str))
stopifnot(is.logical(keep_numerics))
stopifnot(is.logical(keep_acronyms))
stopifnot(is.logical(keep_names))
# Prepare str vector
str = enc2utf8(str) # Encode to UTF-8
str = rm_punct(str, except = "'-") # Remove punct except ' and -
str = rm_excess_spaces(str)
# Separate words by splitting on spaces
if(length(str) > 1) {
# 'str' is a vector, not a string, so split each element of vector
# on a space (result is a list)
str = lapply(str, function(string) strsplit(string, split = " ")[[1]])
} else {
str = strsplit(str, split = " ")[[1]] # Create vector from string
}
# Apply removal of nonwords taking into account user-specified parameters
filter_nonwords <- function(x) {
# Find and remove invalid words
# For each element of 'str', apply the following
keep = tolower(x) %in% tolower(qdapDictionaries::GradyAugmented)
if(keep_numerics == TRUE) {
idx_numeric = which(!is.na(suppressWarnings(as.numeric(x))))
keep[idx_numeric] = TRUE
}
if(keep_acronyms == TRUE) {
# Separate numerics from acronyms
idx_numeric = which(!is.na(suppressWarnings(as.numeric(x))))
idx_acronym = which(x == toupper(x))
idx_acronym = idx_acronym[!(idx_acronym %in% idx_numeric)]
keep[idx_acronym] = TRUE
}
if(keep_names == TRUE) {
idx_name = which(tolower(x) %in% tolower(qdapDictionaries::NAMES$name))
keep[idx_name] = TRUE
}
return(x[keep])
}
if(is.list(str)) {
str_clean = lapply(str, filter_nonwords)
str_clean = unlist(lapply(str_clean, function(x) paste0(x, collapse = " ")))
} else {
str_clean = filter_nonwords(str)
}
# Apply original document names if 'str' argument was a named chr
if(!is.null(names(str))) names(str_clean) = names(str)
return(str_clean)
}
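# Illustrative usage sketch (example input is hypothetical):
#   nlp$rm_nonwords("the qwzx cat 42 NASA Alice")
#   # dictionary words, numerics, acronyms and names are kept; tokens like "qwzx" are dropped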
rm_names <- function(str) {
# Remove names from a character string or vector using regular expressions. Names are
# given by those in `qdapDictionaries::NAMES$name`.
#
# Arguments:
# str {char} -- character string to remove names from
#
# Function Dependencies:
# - qdapDictionaries
#
# Returns:
# {char} -- character string with names removed
stopifnot(is.character(str))
# Get vector of names
names = sort(tolower(qdapDictionaries::NAMES$name))
# Must break into batches otherwise regular expression fails (too long)
batch_size = 100
names_list = split(names, ceiling(seq_along(names)/batch_size))
names_vec = unname(unlist( lapply(names_list, function(x) {
paste0("\\b", paste0(x, collapse = "\\b|\\b"), "\\b")
}) ))
# Sub out names by batch
for(i in 1:length(names_vec)) {
str = gsub(names_vec[i], " ", tolower(str))
}
return(str)
}
rm_excess_spaces <- function(str) {
# Trim whitespace on the ends of a string, and replace instances of two or more spaces
# with a single space.
#
# Arguments:
# str {char} -- character string to remove excess whitespace from
#
# Returns:
# {char} -- character string with excess whitespace removed
stopifnot(is.character(str))
return( trimws(gsub("\\s+", " ", str)) )
}
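# Illustrative usage sketch (example input is hypothetical):
#   nlp$rm_excess_spaces("  too   many    spaces ")   # -> "too many spaces"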
replace_non_ascii <- function(text){
# Replace any non-English characters with Latin counterparts.
#
# Arguments:
# text {char} -- character string
#
# Returns:
# {char}
# Define characters
unwanted_array = list(
"Š"="S", "š"="s", "Ž"="Z", "ž"="z", "À"="A", "Á"="A", "Â"="A","Ã"="A",
"Ä"="A", "Å"="A", "Æ"="A", "Ç"="C", "È"="E", "É"="E", "Ê"="E", "Ë"="E",
"Ì"="I", "Í"="I", "Î"="I", "Ï"="I", "Ñ"="N", "Ò"="O", "Ó"="O", "Ô"="O",
"Õ"="O", "Ö"="O", "Ø"="O", "Ù"="U", "Ú"="U", "Û"="U", "Ü"="U", "Ý"="Y",
"Þ"="B", "ß"="Ss", "à"="a", "á"="a", "â"="a", "ã"="a", "ä"="a",
"å"="a", "æ"="a", "ç"="c", "è"="e", "é"="e", "ê"="e", "ë"="e", "ì"="i",
"í"="i", "î"="i", "ï"="i", "ð"="o", "ñ"="n", "ò"="o", "ó"="o", "ô"="o",
"õ"="o", "ö"="o", "ø"="o", "ù"="u", "ú"="u", "û"="u", "ý"="y", "ý"="y",
"þ"="b", "ÿ"="y"
)
# Execute replacement
out = chartr(
old = paste(names(unwanted_array), collapse = ""),
new = paste(unwanted_array, collapse = ""),
x = text
)
return(out)
}
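# Illustrative usage sketch (example input is hypothetical):
#   nlp$replace_non_ascii("café naïve Ångström")   # -> "cafe naive Angstrom"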
dtm <- function(doc_vec, tfidf = FALSE){
# Create a document-term matrix from a character vector using the 'tm' package.
#
# Arguments:
# doc_vec {char} -- character string or vector containing text to run DTM on, often a corpus
#
# Keyword Arguments:
# tfidf {logical} -- if TRUE, return TF-IDF scores instead of counts (default: FALSE)
#
# Function Dependencies:
# - tm
#
# Returns:
# {matrix} -- document-term matrix cast as matrix
library(tm)
stopifnot(is.character(doc_vec))
stopifnot(is.logical(tfidf))
# Prepare corpus
doc_vec = tolower(doc_vec)
corpus = tm::Corpus(tm::VectorSource(doc_vec))
# Create DTM to user specification
if(tfidf==TRUE) {
dtm = tm::DocumentTermMatrix(
corpus,
control = list(weighting = tm::weightTfIdf))
} else {
dtm = tm::DocumentTermMatrix(corpus)
}
# Return as matrix object, set names if doc_vec is a named chr
dtm = as.matrix(dtm)
if(!is.null(names(doc_vec))) rownames(dtm) = names(doc_vec)
return(dtm)
}
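# Illustrative usage sketch (hypothetical mini-corpus):
#   docs = c(doc1 = "apples and oranges", doc2 = "apples and bananas")
#   counts = nlp$dtm(docs)                  # term counts, one row per document
#   weights = nlp$dtm(docs, tfidf = TRUE)   # TF-IDF weights instead of counts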
clean_corpus <- function(doc_vec) {
# Clean a character vector or string by:
# - Removing excess whitespace
# - Removing punctuation
# - Converting to lower case
# - Removing stopwords
#
# Arguments:
# doc_vec {char} -- character string or vector representing the corpus to clean
#
# Function Dependencies:
# - tm
#
# Returns:
# {char} -- cleaned corpus as a character vector
library(tm)
stopifnot(is.character(doc_vec))
# Cast the character vector to a tm Corpus so tm_map() transformations can be applied
corpus = tm::Corpus(tm::VectorSource(doc_vec))
crp_clean = tm::tm_map(corpus, tm::stripWhitespace)
crp_clean = tm::tm_map(crp_clean, tm::removePunctuation)
crp_clean = tm::tm_map(crp_clean, tm::content_transformer(tolower))
crp_clean = tm::tm_map(crp_clean, tm::removeWords, tm::stopwords("en"))
# Return cleaned text as a character vector, preserving names if supplied
out = sapply(crp_clean, as.character)
if(!is.null(names(doc_vec))) names(out) = names(doc_vec)
return(out)
}
freqtable <- function(doc_vec){
# Make word-frequency table from a character string or vector
#
# Arguments:
# doc_vec {char} -- character string or vector with text to create a frequency table from
#
# Returns:
# {dataframe} -- frequency table cast as dataframe
# Make document-term matrix
document_term_matrix = dtm(doc_vec, tfidf = FALSE)
# Get word frequencies from document-term matrix
freq = sort(colSums(document_term_matrix), decreasing = TRUE)
words = names(freq)
freq = unname(freq)
# Bind into data frame and return
df = data.frame(word = words, frequency = freq, stringsAsFactors = FALSE)
df = df[order(-df$frequency),] # Sort by decreasing frequency
return(df)
}
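# Illustrative usage sketch (hypothetical mini-corpus):
#   nlp$freqtable(c("apples and oranges", "apples and bananas"))
#   # -> data.frame with columns "word" and "frequency", sorted by decreasing frequency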
ngram <- function(text, n = 2){
# Create table of ngrams from a character string or vector
#
# Arguments:
# text {char} -- character string or vector containing text to create ngrams from
#
# Keyword Arguments:
# n {numeric} -- ngram window, defaults to bigrams (default: 2)
# Ex. n = 1 will create a word-frequency table
# Ex. n = 2 will create a table of bigrams
# Ex. n = 3 will create a table of trigrams
#
# Function Dependencies:
# - dplyr
#
# Returns:
# {dataframe} -- dataframe of ngrams with columns "word1", ..., "wordn", "frequency"
library(dplyr)
# Clean up character vector by removing NAs and punctuation excluding
# apostrophe (') and convert to lower case
text = text[!is.na(text)]
text = gsub("[^[:alnum:][:space:]']", " ", text)
text = gsub("\\s+", " ", text)
text = as.character(tolower(text))
# Create bigrams data frame
columns = paste0("word", 1:n)
bigrams = data.frame(text = text) %>%
tidytext::unnest_tokens(bigram, text, token = "ngrams", n = n) %>%
dplyr::count(bigram, sort = TRUE) %>%
tidyr::separate(bigram, columns, sep = " ") %>%
as.data.frame()
# Rename columns and rows
colnames(bigrams) = c(paste0("word", 1:n), "frequency")
rownames(bigrams) = NULL
return(bigrams)
}
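# Illustrative usage sketch (example input is hypothetical):
#   nlp$ngram("the quick brown fox jumps over the lazy dog", n = 2)
#   # -> data.frame with columns "word1", "word2", "frequency", one row per distinct bigram
#   nlp$ngram(some_text_vector, n = 3)   # trigrams; `some_text_vector` is a hypothetical character vector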
ldatuning <- function(text, cores, start, end, by, verbose = FALSE) {
# Run ldatuning on a collection of texts.
#
# Set this function equal to a variable to save results.
# Example:
# ldatuning_result = ldatuning(text, 4, 2, 20, 2)
# ldatuning::FindTopicsNumber_plot(ldatuning_result)
#
# This program will create a plot detailing the optimal number of topics to
# use when running a topic model on a dataset. It uses four metrics:
# * Griffiths 2004
# * Cao-Juan 2009
# * Arun 2010
# * Deveaud 2014
# Using these metrics and the output plot, not only will the optimal number
# of topics be able to be deduced, but also a range of numbers of topics
# that are acceptable.
#
# Look for the point at which Cao-Juan and Arun are at a MINIMUM, and
# Griffiths and Deveaud are at a MAXIMUM.
#
# Arguments:
# text {char} -- character vector of texts
# cores {numeric} -- number of cores to run LDATuning with
# start {numeric} -- starting number of sequence of topics
# end {numeric} -- ending number of sequence of topics
# by {numeric} -- increment number of topics
#
# Keyword Arguments
# verbose {logical} -- if TRUE, print updates to STDOUT (default: FALSE)
#
# Function Dependencies:
# - ldatuning
#
# Returns:
# {ldatuning::FindTopicsNumber}
#
# Example:
# If START = 2, END = 20, BY = 1, then this program will test all topics:
# 2, 3, 4, 5, ..., 20
# Example:
# If START = 5, END = 50, BY = 5, then this program will test all topics:
# 5, 10, 15, ..., 50
library(ldatuning)
stopifnot(is.character(text))
stopifnot(is.numeric(cores))
stopifnot(is.numeric(start))
stopifnot(is.numeric(end))
stopifnot(is.numeric(by))
stopifnot(is.logical(verbose))
est_time_ldatuning = function(data_size_in_mb, start, end, by) {
# Estimate time to run LDATuning based on size of data and topics.
#
# Declared average time based on a trial run: 2-50 topics took 3.278738 days
# at sum(1:49) = 1225 total topics tested
# This averages out to 3.85419 minutes per topic at 192.4 megabytes -->
# 0.02003217 minutes (1.20193 seconds) per topic per megabyte
#
# Arguments:
# data_size_in_mb {int} -- data size of text in megabytes
# start {numeric} -- starting number of sequence of topics
# end {numeric} -- ending number of sequence of topics
# by {numeric} -- increment number of topics
#
# Returns:
# {char}
# Get the total number of topics using start, end and by
number_of_topics = sum(seq(start, end, by))
# Calculate estimated time
est_time_sec = 1.20193 * data_size_in_mb * number_of_topics
est_time = format_time_diff(est_time_sec)
# Return character string indicating estimated length
est_time_string = paste(est_time$value, est_time$unit)
return(est_time_string)
}
# Load libraries
if(verbose == TRUE) echo("Loading libraries 'ldatuning' 'topicmodels' 'tm'")
packages = c("ldatuning", "topicmodels", "tm")
invisible(lapply(packages, FUN = function(x) {
if (!require(x, character.only = TRUE)) {
install.packages(x, dependencies = TRUE)
library(x, character.only = TRUE, quietly = TRUE)
}
}))
if(verbose == TRUE) echo("DONE", indent = 1)
# Make DTM
if(verbose == TRUE) echo("Creating document-term matrix")
corpus = tm::Corpus(tm::VectorSource(text))
dtm = tm::DocumentTermMatrix(corpus)
dtm = as.matrix(dtm)
if(verbose == TRUE) echo("DONE", indent = 1)
# Use four methods to find optimal number of topics
if(verbose == TRUE) echo("Fitting models (this may take a while)")
result = ldatuning::FindTopicsNumber(
dtm,
topics = seq(from = start, to = end, by = by),
metrics = c('Griffiths2004', 'CaoJuan2009', 'Arun2010', 'Deveaud2014'),
method = 'Gibbs',
control = list(seed = 77),
mc.cores = cores,
verbose = FALSE)
if(verbose == TRUE) echo("DONE", indent = 1)
# Quickly visualize results
if(verbose == TRUE) echo("Plotting results")
ldatuning::FindTopicsNumber_plot(result)
if(verbose == TRUE) echo("DONE", indent = 1)
return(result)
}
pdf_to_text <- function(pdf_file) {
# Converts a PDF document to TXT using library "pdftools"
#
# Arguments:
# pdf_file {char} -- pdf file to read and convert to text
#
# Function Dependencies:
# - pdftools
#
# Returns:
# {char}
library(pdftools)
stopifnot(is.character(pdf_file))
# Get text
txt = pdftools::pdf_text(pdf_file)
# Clean text
txt = tolower(paste(unlist(txt), collapse = " "))
txt = nlp$rm_excess_spaces(txt)
return(txt)
}
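# Illustrative usage sketch (hypothetical file path):
#   report_text = nlp$pdf_to_text("reports/annual_report.pdf")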
nodegraph <- function(text) {
# Given a character vector or string, create a nodegraph using the 'igraph'
# package, by first extracting a table of bigrams from the text.
#
# Arguments:
# text {char} -- text to make nodegraph from
#
# Function Dependencies:
# - igraph
# - ggraph
#
# Returns:
# {ggraph}
library(igraph)
library(ggraph)
stopifnot(is.character(text))
# Make bigrams table
bigrams = ngram(text, n = 2)
# Make into graph object
bigrams_graph = bigrams %>% graph_from_data_frame()
# Make graph
ggraph(bigrams_graph, layout = "fr") +
geom_edge_link(
aes(edge_alpha = frequency),
show.legend = FALSE,
arrow = grid::arrow(
type = "open",
length = unit(.10, "inches")
),
end_cap = circle(.07, "inches")
) +
geom_node_point(
color = "lightblue",
size = 3
) +
geom_node_text(
aes(label = name),
repel = TRUE
) +
theme_void()
}
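# Illustrative usage sketch (hypothetical input):
#   nlp$nodegraph(some_text)   # bigram nodegraph; `some_text` is a hypothetical character vector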
lemmatize <- function(
doc_vec,
doc_names,
outdir,
lem_src_dir,
rm_punct = FALSE,
to_lower = TRUE,
file_suffix = "_lemma",
file_prefix = NULL,
verbose = FALSE) {
# Lemmatize a collection of text files.
#
# HOW TO GET TREETAGGER SOURCE
# (1) Go to http://www.cis.uni-muenchen.de/~schmid/tools/TreeTagger/
# (2) Follow the instructions under the heading "Download".
# (3) Point this program to the lemmatizer source code via the
# 'lem_src_dir' parameter.
#
# Arguments:
# doc_vec {char} -- character vector of your text files to lemmatize
# doc_names {char} -- is a character vector of the names of those documents
# outdir {char} -- is the output directory where you would like each lemmatized
# text file to be written to. The output of this program is
# thus a lemmatized text file for each input file.
# lem_src_dir {char} -- is the TreeTagger source directory. Instructions on how to get
# these source files are given above.
#
# Keyword Arguments:
# rm_punct {logical} -- (default: FALSE)
# to_lower {logical} -- (default: TRUE)
# file_prefix {char} -- (default: NULL)
# file_suffix {char} -- (default: "_lemma")
# verbose {logical} -- (default: FALSE)
#
# Function Dependencies:
# - koRpus
#
# Returns:
# nothing -- files written to output directory
library(koRpus)
stopifnot(is.character(doc_vec))
stopifnot(is.character(doc_names))
stopifnot(length(doc_vec) == length(doc_names))
stopifnot(is.character(outdir))
stopifnot(dir.exists(outdir))
stopifnot(is.character(lem_src_dir))
stopifnot(dir.exists(lem_src_dir))
# Set the koRpus environment
set.kRp.env(
TT.cmd = paste0(lem_src_dir, "/cmd/tree-tagger-english"),
lang = "en",
preset = "en",
treetagger = "manual",
format = "file",
TT.tknz = TRUE,
encoding = "UTF-8",
TT.options = list(
path = lem_src_dir,
preset = "en"
)
)
if(verbose == TRUE) echo("koRpus environment set")
# If outdir doesn't end in a slash, append it
if(!grepl(".*\\/$", outdir)) outdir = paste0(outdir, "/")
# New filenames
if(is.null(file_suffix)) file_suffix = ""
if(is.null(file_prefix)) file_prefix = ""
new_files = paste0(outdir, file_prefix, basename(doc_names), file_suffix, ".txt")
# Execute lemmatization. This treetagger only works with file connections,
# not R objects. So each text file must be temporarily written to the
# file system.
if(verbose == TRUE) echo("Initiating lemmatization...")
if(dir.exists("~/.lemma_r_tmp")) unlink("~/.lemma_r_tmp", recursive = TRUE)
dir.create("~/.lemma_r_tmp")
for(i in 1:length(doc_vec)) {
# Write temporary file
tmp_file = file.path("~/.lemma_r_tmp", basename(doc_names[i]))
writeLines(text = doc_vec[i], con = tmp_file)
# Treetag
tryCatch({
tagged_words = treetag(
file = tmp_file,
treetagger ="manual",
lang = "en",
TT.options = list(
path = lem_src_dir,
preset = "en"
)
)
results = tagged_words@TT.res
lem = results$lemma
}, error = function(e) {
# Write error file and proceed
writeLines(text = "",
con = paste0(dirname(new_files[i]),
"/ERROR_",
basename(new_files[i])))
echo("ERROR in file %s", doc_names[i])
})
# Replace "@card@" with the actual number (cardinal number)
lem[lem == "@card@"] = results$token[lem == "@card@"]
# Delete "<unknown>"
if(length(which(lem == "<unknown>"))) lem = lem[-which(lem == "<unknown>")]
# Collapse into character string
lem = paste0(lem, collapse = " ")
# Write output file
writeLines(text = lem, con = new_files[i])
# Remove temporary file
file.remove(tmp_file)
# Notify
if(verbose == TRUE) {
echo(sprintf("%s Lemmatized %s of %s (%s%%)",
Sys.time(), i, length(doc_names),
round(i/length(doc_names)*100, 2)) )
}
}
unlink("~/.lemma_r_tmp", recursive = TRUE)
# End
if(verbose == TRUE) echo("Lemmatization complete")
}
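# Illustrative usage sketch (hypothetical paths; TreeTagger must be downloaded first,
# see the "HOW TO GET TREETAGGER SOURCE" notes above):
#   nlp$lemmatize(
#     doc_vec = texts,              # hypothetical character vector of documents
#     doc_names = names(texts),
#     outdir = "lemmatized/",
#     lem_src_dir = "~/treetagger",
#     verbose = TRUE
#   )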
topic_model <- function(
doc_vec,
doc_names = NULL,
num_topics = 10,
top_terms = 10,
alpha = 50 / num_topics,
estimate_beta = TRUE,
delta = 0.1,
burnin = 4000,
iter = 2000,
thin = 500,
nstart = 5,
best = TRUE,
stem = FALSE,
seed = list(2003, 5, 63, 100001, 765),
verbose = TRUE) {
# Run topic model using 'topicmodels' package.
#
# Arguments:
# doc_vec {char} -- texts to run topic model on, may be named chr with names as doc_names
#
# Keyword Arguments:
# doc_names {char} -- vector of text doc_names
# num_topics {numeric} -- number of topics to run
# top_terms {numeric} -- number of terms to retrieve for each topic
# alpha {numeric} -- hyperparameter: alpha
# estimate_beta {logical} -- if TRUE, estimate the beta (term distribution) parameter
# delta {numeric} -- hyperparameter: delta
# burnin {numeric} -- burn in iterations
# iter {numeric} -- iterations to run
# thin {numeric} -- hyperparameter: thin
# nstart {numeric} -- Gibbs sampling parameter
# best {logical} -- if TRUE, optimize
# stem {logical} -- if TRUE, stem corpus before running Gibbs sampling
# seed {list} -- random seed list
# verbose {logical} -- if TRUE, print updates to STDOUT
#
# Function Dependencies:
# - tm
# - topicmodels
#
# Returns:
# model = list(
# dtm = dtm,
# freq = freq,
# ldaOut = ldaOut,
# topic_words = ldaOut.terms,
# doc_top_topics = ldaOut.topics,
# doc_topics = topicProbabilities
# )
library(tm)
library(topicmodels)
# Assign filenames if not specified
if (is.null(doc_names)) {
doc_names = paste0("textfile", 1:length(doc_vec))
}
stopifnot(is.character(doc_vec))
stopifnot(is.character(doc_names))
stopifnot(length(doc_vec) == length(doc_names))
stopifnot(length(doc_vec) > 0)
stopifnot(is.numeric(num_topics))
stopifnot(is.numeric(top_terms))
stopifnot(is.numeric(alpha))
stopifnot(is.logical(estimate_beta))
stopifnot(is.numeric(delta))
stopifnot(is.numeric(burnin))
stopifnot(is.numeric(iter))
stopifnot(is.numeric(thin))
stopifnot(is.numeric(nstart))
stopifnot(is.logical(best))
stopifnot(is.logical(stem))
stopifnot(is.logical(verbose))
stopifnot(is.list(seed))
# Transform pattern to space
to_space = tm::content_transformer(function(x, pattern) {
return (gsub(pattern, " ", x))
})
# -------------------- #
# -- PREPARE CORPUS -- #
# -------------------- #
# Begin program
if (verbose) echo("BEGINNING TOPIC MODEL", indent = 0)
if (verbose) echo("-------------------------------", indent = 0)
start_time = Sys.time() # Begin program timing
options(scipen = 999) # Turn off scientific notation
# Create document-term matrix
if (verbose) echo("Creating Corpus object...")
docs = Corpus(VectorSource(doc_vec))
docs = tm_map(docs, content_transformer(tolower))
if (verbose) echo("DONE", indent = 1)
# Remove problematic symbols
if (verbose) {
echo("Cleaning corpus (removing symbols, numbers, punctuation, whitespace)...")
}
docs = tm_map(docs, to_space, "-")
docs = tm_map(docs, to_space, "’")
docs = tm_map(docs, to_space, "‘")
docs = tm_map(docs, to_space, "•")
docs = tm_map(docs, to_space, '"')
docs = tm_map(docs, to_space, "'")
# Remove punctuation, digits, stopwords, whitespace
docs = tm_map(docs, removeWords, stopwords("english"))
docs = tm_map(docs, removePunctuation)
docs = tm_map(docs, removeNumbers)
docs = tm_map(docs, stripWhitespace)
docs = tm_map(docs, content_transformer(trimws))
if (verbose) echo("DONE", indent = 1)
# Stem
if (stem == TRUE) {
if (verbose) echo("Stemming corpus...")
docs = tm_map(docs, stemDocument)
if (verbose) echo("DONE", indent = 1)
}
# ---------------------- #
# -- DTM & Freq Table -- #
# ---------------------- #
# Create document-term matrix
if (verbose) echo("Creating document-term matrix...")
dtm = DocumentTermMatrix(docs)
rownames(dtm) = doc_names
if (verbose) echo("DONE", indent = 1)
# Collapse matrix by summing over columns
if (verbose) echo("Creating frequency table...")
freq = colSums(as.matrix(dtm))
# List all terms in decreasing order of freq and write to disk
freq = freq[order(freq, decreasing = TRUE)]
if (verbose) echo("DONE", indent = 1)
# ----------------- #
# -- TOPIC MODEL -- #
# ----------------- #
# Run LDA using Gibbs sampling
if (verbose) echo("Running topic model. This could take a while...")
ldaOut = LDA(
x = dtm,
k = num_topics,
method = "Gibbs",
control = list(
alpha = alpha,
estimate.beta = estimate_beta,
delta = delta,
nstart = nstart,
best = best,
burnin = burnin,
iter = iter,
thin = thin,
seed = seed
)
)
if (verbose) echo("DONE", indent = 1)
# ------------------------- #
# -- TOPIC MODEL OBJECTS -- #
# ------------------------- #
# Docs top topics
if (verbose) echo("Creating doc top topics...")
ldaOut.topics = data.frame(
File = names(topics(ldaOut)),
Top_Topic = unname(topics(ldaOut)),
stringsAsFactors = FALSE
)
if (verbose) echo("DONE", indent = 1)
# Top terms in each topic
if (verbose) echo("Creating topic words...")
ldaOut.terms = as.data.frame(terms(ldaOut, top_terms))
ldaOut.terms = data.frame(lapply(ldaOut.terms, as.character), stringsAsFactors = FALSE)
ldaOut.terms = cbind(RowID = 1:nrow(ldaOut.terms), ldaOut.terms)
colnames(ldaOut.terms) = gsub(" |\\.", "", colnames(ldaOut.terms))
rownames(ldaOut.terms) = NULL
if (verbose) echo("DONE", indent = 1)
# Probabilities associated with each topic assignment (doc topics)
if (verbose) echo("Creating doc topics...")
topicProbabilities = as.data.frame(ldaOut@gamma)
colnames(topicProbabilities) = paste0("Topic", 1:num_topics)
topicProbabilities =
cbind(RowID = 1:nrow(topicProbabilities),
Document = as.character(doc_names), topicProbabilities)
rownames(topicProbabilities) = NULL
if (verbose) echo("DONE", indent = 1)
# Collect all results in a single list
model = list(
dtm = dtm,
freq = freq,
ldaOut = ldaOut,
topic_words = ldaOut.terms,
doc_top_topics = ldaOut.topics,
doc_topics = topicProbabilities
)
# End program
if (verbose) {
end_time = Sys.time() # End program timing
echo("-------------------------------", indent = 0)
echo(sprintf("TOPIC MODEL COMPLETED IN %s %s",
round(as.numeric(difftime(end_time, start_time)), 6),
gsub("MINS", "MINUTES",
gsub("SECS", "SECONDS",
toupper(units(difftime(end_time, start_time))))) ))
}
return(model)
}
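# Illustrative usage sketch (hypothetical corpus `texts`; remaining Gibbs sampler
# settings left at their defaults):
#   tm_fit = nlp$topic_model(doc_vec = texts, doc_names = names(texts), num_topics = 8)
#   tm_fit$topic_words       # top terms per topic
#   tm_fit$doc_topics        # per-document topic probabilities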
topic_model_textmineR <- function(
doc_vec,
doc_names = NULL,
num_topics = seq(2, 40, 2),
iterations = 2000,
top_terms = 10) {
# Run a topic model using textmineR package.
#
# Arguments:
# doc_vec {char} -- texts to run topic model on, may be named chr with names as doc_names
#
# Keyword Arguments:
# doc_names {char} -- vector of text doc_names (default: NULL)
# num_topics {numeric} -- number of topics to run (default: `seq(2, 40, 2)`)
# iterations {numeric} -- number of iterations to run (default: 2000)
# top_terms {numeric} -- number of terms to return in each topic (default: 10)
#
# Function Dependencies:
# - textmineR
#
# Returns:
# {list} -- model = list(
# summary # Summary table
# top_terms # Top terms
# top_terms_prime # Top terms prime for classifying new documents
# assignments # DocTopics matrix
# num_docs # Number of documents in each topic
# coherence # Probabilistic coherence: measures statistical support for a topic
# labels # Topic labels
# best_k # Optimized number of topics
# r2 # R-squared of this model
# dtm # Document-Term Matrix
# )
#
# plot(model$hclust) # Topic cluster dendrogram
# model$tf[order(-model$tf$term_freq),] # Inspection of Document-Term Matrix
library(textmineR)
if (is.null(doc_names)) {
doc_names = paste0("textfile", 1:length(doc_vec))
}
stopifnot(is.character(doc_vec))
stopifnot(is.character(doc_names))
stopifnot(length(doc_vec) == length(doc_names))
stopifnot(is.numeric(num_topics))
stopifnot(is.numeric(iterations))
stopifnot(is.numeric(top_terms))
# Make Document-Term matrix
dtm = CreateDtm(
doc_vec = doc_vec,
doc_names = doc_names,
ngram_window = c(1,1)
)
# Inspect
tf = TermDocFreq(dtm = dtm)
rownames(tf) = NULL
# Eliminate words appearing less than 2 times or in more than half of the documents
vocabulary = tf$term[ tf$term_freq > 1 & tf$doc_freq < nrow(dtm) / 2 ]
dtm = dtm[, vocabulary]
# Fit LDA models and select best number of topics
k_list = num_topics
# Fit models based on multiple possible numbers of topics
model_list = TmParallelApply(k_list, function(k) {
m = FitLdaModel(
dtm = dtm,
k = k,
iterations = iterations
)
m$k = k
m$coherence = CalcProbCoherence(
phi = m$phi,
dtm = dtm,
M = top_terms
)
m
})
# Get coherence matrix used to evaluate best model
coherence_mat = data.frame(
k = sapply(model_list, function(x) nrow(x$phi)),
coherence = sapply(model_list, function(x) mean(x$coherence)),
stringsAsFactors = FALSE
)
# Select k based on maximum average coherence
# phi is P(words|topics)
# theta is P(topics|documents)
model = model_list[which.max(coherence_mat$coherence)][[1]]
model$best_k = coherence_mat$k[which.max(coherence_mat$coherence)]
# Get the R-squared of this model
model$r2 = CalcTopicModelR2(
dtm = dtm,
phi = model$phi,
theta = model$theta
)
# Top terms of the model according to phi & phi-prime
model$top_terms = GetTopTerms(
phi = model$phi,
M = top_terms
)
# Phi-prime, P(topic | words) for classifying new documents
model$phi_prime = CalcPhiPrime(
phi = model$phi,
theta = model$theta,
p_docs = rowSums(dtm)
)
model$top_terms_prime = GetTopTerms(
phi = model$phi_prime,
M = top_terms
)
# Give a hard in/out assignment of topics in documents
model$assignments = model$theta
model$assignments[model$assignments < 0.05] = 0
model$assignments = model$assignments / rowSums(model$assignments)
model$assignments[is.na(model$assignments)] = 0
# Get topic labels using n-grams from the DTM
model$labels = LabelTopics(
assignments = model$assignments,
dtm = dtm,
M = 2
)
# Probabilistic coherence: measures statistical support for a topic
model$coherence = CalcProbCoherence(
phi = model$phi,
dtm = dtm,
M = top_terms
)
# Number of documents in each topic
model$num_docs = colSums(model$assignments > 0)
# Cluster topics together in a dendrogram
model$topic_linguistic_dist = CalcHellingerDist(model$phi)
model$hclust = hclust(as.dist(model$topic_linguistic_dist), "ward.D")
model$hclust$clustering = cutree(model$hclust, k = model$best_k)
model$hclust$labels = paste(model$hclust$labels, model$labels[,1])
# rect.hclust(model$hclust, k = length(unique(model$hclust$clustering)))
# rect.hclust(model$hclust, k = 10)
# Make a summary table
model$summary = data.frame(
topic = rownames(model$phi),
cluster = model$hclust$clustering,
labels = model$labels,
coherence = model$coherence,
num_docs = model$num_docs,
top_terms = apply(model$top_terms, 2, function(x) {
paste(x, collapse = ", ")
}),
top_terms_prime = apply(model$top_terms_prime, 2, function(x) {
paste(x, collapse = ", ")
}),
stringsAsFactors = FALSE
)
model$summary = model$summary[order(model$hclust$clustering),]
# Retrieve `tf` inspection of DTM from earlier, and DTM itself
model$tf = tf
model$dtm = dtm
return(model)
}
docClusterCsim <- function(doc_vec, doc_names = NULL, num_clusters) {
# Cluster documents using TFIDF and Cosine Similarity.
#
# Arguments:
# doc_vec {char} -- character vector of text files
# num_clusters {numeric} -- the desired number of clusters
#
# Keyword Arguments:
# doc_names {char} -- filenames of text files to appear in clustering (default: NULL)
#
# Function Dependencies:
# - textmineR
#
# Returns:
# {list} -- model = list(
# summary, # {dataframe}
# cluster_words, # {list}
# hclust, # {hclust} (ready for plotting)
# dtm, # {}
# tfidf, # {}
# csim # {}
# )
#
# Original Source:
# https://cran.r-project.org/web/packages/textmineR/vignettes/b_document_clustering.html
library(textmineR)
stopifnot(is.character(doc_vec))
stopifnot(is.numeric(num_clusters))
# Assign filenames if not specified
if (is.null(doc_names)) {
doc_names = paste0("textfile", 1:length(doc_vec))
}
# Create DTM
dtm = textmineR::CreateDtm(
doc_vec = doc_vec,
doc_names = doc_names,
verbose = FALSE
)
rownames(dtm) = basename(doc_names)
tf_mat = textmineR::TermDocFreq(dtm)
# TF-IDF and cosine similarity
tfidf = t(dtm[ , tf_mat$term ]) * tf_mat$idf
tfidf = t(tfidf)
csim = tfidf / sqrt(rowSums(tfidf * tfidf))
csim = csim %*% t(csim)
# Clustering
cdist = as.dist(1 - csim)
hc = hclust(cdist, "ward.D")
clustering = cutree(hc, num_clusters)
# plot(hc, main = "Document Clustering", ylab = "", xlab = "", yaxt = "n")
# rect.hclust(hc, num_clusters, border = "red")
# Cluster words
p_words = colSums(dtm) / sum(dtm)
cluster_words = lapply(unique(clustering), function(x){
rows = dtm[clustering == x, ]
rows = rows[, colSums(rows) > 0] # Drop words not in cluster
colSums(rows) / sum(rows) - p_words[colnames(rows)]
})
# Summary table of top 5 words in each cluster
cluster_summary = data.frame(
cluster = unique(clustering),
num_docs = as.numeric(table(clustering)),
top_words = sapply(cluster_words, function(d){
paste(
names(d)[order(d, decreasing = TRUE)][1:5],
collapse = ", "
)
}),
stringsAsFactors = FALSE
)
# Final list
model = list(
summary = cluster_summary,
cluster_words = cluster_words,
hclust = hc,
dtm = dtm,
tfidf = tfidf,
csim = csim
)
return(model)
}
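# Illustrative usage sketch (hypothetical inputs; `texts` is a named character vector):
#   clusters = nlp$docClusterCsim(doc_vec = texts, doc_names = names(texts), num_clusters = 5)
#   clusters$summary        # top words per cluster
#   plot(clusters$hclust)   # document dendrogram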
topic_model_mallet <- function(
fnames,
outdir,
alpha,
hyper,
iterations,
num_topics,
corrplot = FALSE,
dendrogram = FALSE,
dendro_bal = 0.5){
# Run topic model using the MALLET library (MAchine Learning for LanguagE Toolkit).
#
# Arguments:
# fnames -- {char} Textfiles to run topic model on.
# outdir -- {char} Directory path to write outputs to.
# alpha -- {numeric} Set the Dirichlet alpha parameter. The lower the value
# (can be negative) the more words in distribution that will
# have probabilities approximating 0. It primarily affects
# the distribution of words across the topic (which has a
# subsequent effect on topics across the corpus).
# hyper -- {numeric} Determines how strongly the alpha parameter affects the
# overall model. Each time you run an optimization, the
# topic probability distribution departs farther from the
# initial homogeneous distribution. It primarily affects the
# distribution of topics across the corpus (which has a
# subsequent effect on words in each topic). If your goal
# is to identify small numbers of texts about specific
# themes in a large collection, then a lot of optimization
# may be good. However, if your goal is to identify topics
# typical of certain authors, periods, genres or some other
# reasonably large subset of your collection, then optimize
# a bit less.
# iterations -- {numeric} Set the number of times the Latent Dirichlet
# Allocation (LDA) algorithm will be executed.
# Trade-off between speed and quality. Higher
# iterations generally leads to lower computation
# speed but higher accuracy. 500 iterations is the
# suggested baseline for up to 10,000 documents.
# num_topics -- {numeric} The desired number of topics.
#
# Keyword Arguments:
# corrplot -- {logical} If TRUE, a correlation matrix plot is outputted (default: FALSE)
# dendrogram -- {logical} If TRUE, 3 dendrograms are outputted. Topic, Documents,
# Custom split. The 'Topic' dendrogram is created with
# the balance set to 1. This means that only word-level
# similarity is used. The 'Document' dendrogram is made
# with balance set to 0, indicating only document-level
# similarity is used. Any dendrogram balance inputted
# here creates a third dendrogram using this parameter as
# the balance. Set to NULL to not create the third
# dendrogram (default: FALSE)
# dendro_bal -- {numeric} Between 0 and 1 (default: 0.5)
#
# Function Dependencies:
# - tm
# - mallet
# - naturalsort
#
# Returns:
# {list} -- list(
# TopicWords,
# TopicWords_Ordered,
# DocTopics,
# DocTopics_Ordered
# )
# May need this depending on Java issues
# dyn.load("/Library/Java/JavaVirtualMachines/jdk-9.0.1.jdk/Contents/Home/lib/server/libjvm.dylib")
library(tm)
library(tools)
library(mallet)
library(naturalsort)
stopifnot(is.character(fnames))
stopifnot(length(fnames) > 0)
stopifnot(is.character(outdir))
stopifnot(dir.exists(outdir))
stopifnot(is.numeric(alpha))
stopifnot(is.numeric(hyper))
stopifnot(is.numeric(iterations))
stopifnot(is.numeric(num_topics))
stopifnot(is.logical(corrplot))
stopifnot(is.logical(dendrogram))
stopifnot(is.numeric(dendro_bal))
options(scipen = 999)
read_file = function(fname){
# Read and format a text file using proper encoding.
#
# Arguments:
# fname {char} -- textfile to read, must have .txt extension
#
# Returns:
# {char} -- contents of textfile
library(tools)
stopifnot(file.exists(fname))
stopifnot(tolower(tools::file_ext(fname)) == "txt")
text = scan(fname, what = "character", sep = "\n")
text = iconv(text, "WINDOWS-1252", "UTF-8")
text = tolower(text)
text = paste(text, collapse = " ")
return(text)
}
write_topic_df <- function(df, fname) {
# Write topic model dataframe. Shorthand for `write.csv()` with arguments.
#
# Arguments:
# df {dataframe} -- dataframe to write
# fname {char} -- filename to write to
#
# Returns:
# nothing
stopifnot(is.data.frame(df))
stopifnot(is.character(fname))
suppressWarnings(
write.csv(
df,
file = fname,
row.names = TRUE,
quote = FALSE
)
)
}
write_corrplot <- function(df_corr, outdir, fname, title) {
# Write corrplot as a PNG image on file system.
#
# Arguments:
# df_corr {matrix} -- correlation matrix
# outdir {char} -- directory to save output PNG image to
# fname {char} -- filename of the PNG image to write within `outdir`
# title {char} -- plot title
#
# Returns:
# nothing
library(corrplot)
stopifnot(is.matrix(df_corr) || is.data.frame(df_corr))
stopifnot(is.character(outdir))
stopifnot(dir.exists(outdir))
# Correlation plot
png(
filename = file.path(outdir, fname),
width = 5000,
height = 5000,
pointsize = 12,
res = 500
)
corrplot.mixed(
corr = df_corr,
lower = "circle",
upper = "number",
tl.col = "black",
tl.cex = .45,
number.cex = .6,
title = "Topic Words Correlation"
)
dev.off()
}
write_dendrogram <- function(dt, tw, fname, num_topics, bal) {
# Create a dendrogram and write to a PNG on the filesystem.
#
# Arguments:
# dt {matrix} -- doc topics matrix
# tw {matrix} -- topic words matrix
# fname {char} -- png filename to write to
# num_topics {numeric} -- the number of topics run
# bal {numeric} -- the dendrogram balance (0 to 1), 1 being word-weighted and 0
# being doc-weighted
#
# Function Dependencies:
# - mallet
#
# Returns:
# nothing
library(mallet)
stopifnot(is.matrix(dt))
stopifnot(is.matrix(tw))
stopifnot(is.character(fname))
stopifnot(is.numeric(num_topics))
png(
filename = fname,
width = 3000,
height = 3000,
pointsize = 12,
res = 450
)
plot(
mallet::mallet.topic.hclust(
doc.topics = dt,
topic.words = tw,
balance = bal
),
labels = paste0("Topic", 1:num_topics)
)
dev.off()
}
# Write stopwords to a temporary file
stopwords_fname = "~/tmp.stopwords.list.csv"
write.table(tm::stopwords("en"), stopwords_fname,
quote = FALSE, row.names = FALSE, col.names = FALSE)
# Sort files naturally, not based on text string comparison
# If documents have numbers, this will sort them on the numbers rather than lexicographically
fnames = naturalsort::naturalsort(fnames)
# Read all textfiles
texts = sapply(fnames, read_file)
# Convert array of document IDs and text files to Mallet instance list
MalletInstances = mallet::mallet.import(
id.array = names(texts),
text.array = unname(texts),
stoplist.file = stopwords_fname,
preserve.case = FALSE,
token.regexp = "[\\p{L}]+"
)
# Wraps a Mallet topic model trainer Java object
TopicModel = mallet::MalletLDA(num.topics = num_topics)
# Load the documents into the topic model object
TopicModel$loadDocuments(MalletInstances)
# Extract words and frequencies
Vocabulary = TopicModel$getVocabulary()
WordFreqs = mallet.word.freqs(TopicModel)
WordFreqs = WordFreqs[order(-WordFreqs$term.freq),]
# Apply parameters and train
TopicModel$setAlphaOptimization(alpha, hyper)
TopicModel$train(iterations)
# ----------------- #
# -- DATA FRAMES -- #
# ----------------- #
# TopicWords
TopicWords_Matrix = mallet.topic.words(
topic.model = TopicModel,
smoothed = TRUE,
normalized = TRUE
)
TopicWords_DF = as.data.frame(t(TopicWords_Matrix))
TopicWords_DF = as.data.frame(
cbind(
1:nrow(TopicWords_DF),
Vocabulary,
TopicWords_DF
)
)
colnames(TopicWords_DF) = c("RowID", "Word", paste0("Topic", 1:num_topics))
# Ordered Version
# Must be specific format with no non-numeric values
TopicWords_DF_tmp = as.data.frame(t(TopicWords_Matrix))
rownames(TopicWords_DF_tmp) = Vocabulary
colnames(TopicWords_DF_tmp) = paste0("Topic", 1:num_topics)
TopicWords_DF_Ordered = as.data.frame(
matrix(
row.names(TopicWords_DF_tmp)[apply(-TopicWords_DF_tmp, 2, order)],
nrow(TopicWords_DF_tmp)
)
)
colnames(TopicWords_DF_Ordered) = paste0("Topic", 1:num_topics)
# DocTopics
DocTopics_Matrix = mallet.doc.topics(
topic.model = TopicModel,
smoothed = TRUE,
normalized = TRUE
)
DocTopics_DF = as.data.frame(DocTopics_Matrix)
#rownames(DocTopics_DF) = names(texts)
DocTopics_DF = cbind(
1:nrow(DocTopics_DF),
names(texts),
DocTopics_DF
)
colnames(DocTopics_DF) = c("RowID", "Document", paste0("Topic", 1:num_topics))
# Ordered Version
DocTopics_DF_tmp = as.data.frame(DocTopics_Matrix)
rownames(DocTopics_DF_tmp) = names(texts)
colnames(DocTopics_DF_tmp) = paste0("Topic", 1:num_topics)
DocTopics_DF_Ordered = as.data.frame(
matrix(
row.names(DocTopics_DF_tmp)[apply(-DocTopics_DF_tmp, 2, order)],
nrow(DocTopics_DF_tmp)
)
)
colnames(DocTopics_DF_Ordered) = paste0("Topic", 1:num_topics)
# ----------------- #
# -- WRITE FILES -- #
# ----------------- #
write_topic_df(TopicWords_DF, file.path(outdir, "TopicWords.csv"))
write_topic_df(TopicWords_DF_Ordered, file.path(outdir, "TopicWords_DF_Ordered.csv"))
write_topic_df(DocTopics_DF, file.path(outdir, "DocTopics_DF.csv"))
write_topic_df(DocTopics_DF_Ordered, file.path(outdir, "DocTopics_DF_Ordered.csv"))
if (corrplot) {
# Write Topic Words correlation PNG
write_corrplot(
df_corr = as.matrix(Hmisc::rcorr(as.matrix(TopicWords_DF_tmp))[[1]]),
outdir = outdir,
fname = "Correlation_TopicWords.png",
title = "Topic Words Correlation"
)
# Write Doc Topics correlation PNG
write_corrplot(
df_corr = as.matrix(Hmisc::rcorr(as.matrix(DocTopics_DF_tmp))[[1]]),
outdir = outdir,
fname = "Correlation_DocTopics.png",
title = "Doc Topics Correlation"
)
}
if (dendrogram) {
# Word-weighted
write_dendrogram(
dt = DocTopics_Matrix,
tw = TopicWords_Matrix,
fname = file.path(outdir, "Dendrogram_WordSimilarity_balance=1.png"),
num_topics = num_topics,
bal = 1
)
# Doc-weighted
write_dendrogram(
dt = DocTopics_Matrix,
tw = TopicWords_Matrix,
fname = file.path(outdir, "Dendrogram_DocSimilarity_balance=0.png"),
num_topics = num_topics,
bal = 0
)
# Custom-balance dendrogram if a balance is specified and is not 0 or 1, for which
# the dendrograms are already written above
if (!is.null(dendro_bal)) {
if (!(dendro_bal %in% c(0, 1))) {
write_dendrogram(
dt = DocTopics_Matrix,
tw = TopicWords_Matrix,
fname = file.path(outdir, sprintf("Dendrogram_CustomBalance_balance=%s.png", dendro_bal)),
num_topics = num_topics,
bal = dendro_bal
)
}
}
}
# Remove temporary stopwords file from above
if(file.exists(stopwords_fname)) file.remove(stopwords_fname)
# Return list of all dataframes
out = list(
TopicWords = TopicWords_DF,
TopicWords_Ordered = TopicWords_DF_Ordered,
DocTopics = DocTopics_DF,
DocTopics_Ordered = DocTopics_DF_Ordered
)
return(out)
}
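# Illustrative usage sketch (hypothetical file list, output directory and parameter
# values):
#   mallet_out = nlp$topic_model_mallet(
#     fnames = list.files("corpus_dir", pattern = "\\.txt$", full.names = TRUE),
#     outdir = "mallet_output",
#     alpha = 20, hyper = 50, iterations = 500, num_topics = 10
#   )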
annotate_doc_by_sentence <- function(fname, outdir, verbose = FALSE) {
# Annotate text by sentence. Parsing a large textfile with a single call to the
# coreNLP function `annotateString()` can run into an 'out of memory' error, so
# this function splits the document into sentences and annotates each one separately.
#
# coreNLP must be initialized with `initCoreNLP()`
#
# Normally done as follows (your version of Java might be different):
#
# # Set up Java 8
# # (give path to your libjvm.dylib)
# dyn.load("/Library/Java/JavaVirtualMachines/jdk1.8.0_162.jdk/Contents/Home/jre/lib/server/libjvm.dylib")
#
# # Load libraries
# library(rJava)
# library(NLP)
# library(coreNLP)
#
# # Initialize coreNLP
# initCoreNLP()
#
# # Run split_parse
# split_parse(...)
#
# Arguments:
# fname {char} -- file to annotate
# outdir {char} -- directory to write files to
#
# Keyword Arguments:
# verbose {logical} -- if TRUE, print updates to STDOUT (default: FALSE)
#
# Function Dependencies:
# - rJava
# - NLP
# - coreNLP
#
# Returns:
# nothing
stopifnot(is.character(fname))
stopifnot(file.exists(fname))
stopifnot(is.character(outdir))
stopifnot(dir.exists(outdir))
chunk_into_sentences = function(text) {
# Break text string into sentences delimiting by . or ! or ?
#
# Arguments:
# text {char} -- text to break into sentences
#
# Returns:
# {char}
break_points = c(1, as.numeric(gregexpr('[[:alnum:] ][.!?]', text)[[1]]) + 1)
sentences = NULL
for(i in 1:length(break_points)) {
res = substr(text, break_points[i], break_points[i+1])
if(i > 1) {
sentences[i] = sub('. ', '', res)
} else {
sentences[i] = res
}
}
sentences = sentences[!is.na(sentences)]
return(sentences)
}
# Read in file
text = readLines(fname)
text = trimws(paste0(text, collapse = " "))
text = nlp$rm_excess_spaces(text)
if (verbose) echo(sprintf("File: %s", basename(fname)))
# Break document into sentences
sentences = chunk_into_sentences(text)
if (verbose) echo("Document split into %s sentences", length(sentences), indent = 1)
# Create temporary directory
tmpdir = "tmp.coreNLP.annotateString"
if(dir.exists(tmpdir)) unlink(tmpdir, recursive = TRUE)
dir.create(tmpdir)
if (verbose) echo("Created temp dir '%s/%s'", getwd(), tmpdir, indent = 1)
# Execute string annotation and save each sentence as RDS object
if (verbose) echo("Annotating by sentence...", indent = 1)
for(i in seq_along(sentences)) {
anno = coreNLP::annotateString(sentences[i])
saveRDS(anno, file.path(tmpdir, paste0("anno", i, ".rds")))
}
if (verbose) echo("DONE", indent = 2)
# Set up dataframe object to load RDS data into
parsetree = c()
table = setNames(
data.frame(matrix(ncol = 9, nrow = 0)),
c("sentence", "id", "token", "lemma", "CharacterOffsetBegin",
"CharacterOffsetEnd", "POS", "NER", "Speaker"
)
)
# Extract parsetree from each RDS object
echo("Extracting annotable and parsetree from each RDS object...", indent = 1)
for(i in seq_along(sentences)) {
anno = readRDS(paste0(tmpdir, "/anno", i, ".rds"))
anno.table = anno[[1]]
anno.table$sentence = rep(i, nrow(anno.table))
table = rbind(table, anno.table)
parsetree = append(parsetree, anno[[2]])
}
echo("DONE", indent = 2)
# Save final objects (annotable and parsetree) for entire doc
parsetree_fname = file.path(outdir, paste0(tools::file_path_sans_ext(basename(fname)), "-parsetree.rds"))
saveRDS(parsetree, parsetree_fname)
annotable_fname = file.path(outdir, paste0(tools::file_path_sans_ext(basename(fname)), "-annotable.rds"))
saveRDS(table, annotable_fname)
if (verbose) echo("Saved final objects as RDS files:", indent = 1)
if (verbose) echo(parsetree_fname, indent = 2)
if (verbose) echo(annotable_fname, indent = 2)
# Remove temporary directory
unlink(tmpdir, recursive = TRUE)
}
})