# NOTE(review): stray non-R text ("Nothing") from page extraction, commented
# out so the file parses.
# Clark: Original file consists of 80K entries to lemmatize. They can all
# happen in parallel.
# The challenge to automatically parallelize this on Windows is to know
# what code must be executed before the main lapply function can run.
#
# Without analyzing the code we could just run all the code before the
# `lapply`. Further this has the structure:
#
# code
# f = function() ... lapply()
# f()
#
# So we would like to:
# - create an appropriate cluster
# - execute the code in the first part on the manager and all the workers
# - change the lapply to parallel::parLapply
#
# This general strategy won't work with nested `lapply` calls.
# Also don't want to blindly replace every lapply with a parallel version.
# Maybe could have user hints about where to parallelize?
# Clark: this takes about 0.5 seconds. It's 9327 words, which is much
# longer than any letter of recommendation. Do it 80K times and it becomes 11
# hours. But Jared was talking about 100 hours for the whole dataset. What
# am I missing?
#system.time(t1 <- treetag("/home/clark/data/vets/appeals_sample/1719181.txt"))
library(koRpus)
# options(warn = -1)  # (left disabled) would suppress all warnings globally
# Input: a CSV with (at least) a "paragraph" column of raw text to lemmatize.
fname = "goodnightmoon.csv"
doc = read.csv(fname, header = TRUE, stringsAsFactors = FALSE)
# The only things you need to modify
LemmatizerSourceDir = '/home/clark/dev/tree-tagger' # Where the lemmatizer source files live
# Clark: CodeDepends doesn't identify any outputs, updates, or side effects
# based on the code below. To do this it needs to recurse inside the body
# of the function. But how far do we proceed examining function bodies?
# Perhaps just into the functions from the package environments.
# Clark: set.kRp.env manipulates the private environment koRpus:::.koRpus.env
# For a SNOW cluster the only reasonable way to achieve the same result is
# to actually evaluate this code, since we aren't going to try to fool
# around with private variables in a sealed namespace.
#
# CodeDepends doesn't help me to understand what it's doing.
#s = CodeDepends::getInputs(body(set.kRp.env))
#s = CodeDepends::getInputs(quote(
# Set the koRpus environment
# Configure the koRpus session environment for a manual TreeTagger setup.
# All arguments are named, so their order is irrelevant to the call.
set.kRp.env(
  lang       = "en",
  preset     = "en",
  TT.cmd     = "manual",
  treetagger = "manual",
  format     = "obj",
  TT.tknz    = TRUE,
  encoding   = "UTF-8",
  TT.options = list(path = LemmatizerSourceDir, preset = "en")
)
#))
# This function will take in a character vector and output a data frame
# Lemmatize a character vector with TreeTagger (via koRpus::treetag).
#
# Args:
#   txt: character input for one document (passed straight to treetag()).
# Returns:
#   The data frame in the tagged object's @TT.res slot; downstream code
#   reads its `token` and `lemma` columns.
# NOTE(review): assumes set.kRp.env() has already been called and that
# LemmatizerSourceDir is defined in the enclosing (global) environment.
lemmatize = function(txt) {
  tagged.words = treetag(txt,
                         format = "obj",
                         treetagger = "manual",
                         lang = "en",
                         # paste0() on a single string was a no-op;
                         # pass the directory directly.
                         TT.options = list(path = LemmatizerSourceDir,
                                           preset = "en"))
  # Return the token/lemma table (last expression; no explicit return needed).
  tagged.words@TT.res
}
# Clark: The lines between this point and the GSRLem definition are where
# he's developing the function to use.
# They're not necessary, and should be removed in a preprocessing step.
# Run the lemmatizer!
# Smoke test: lemmatize the first paragraph only.
LemmatizedDF = lemmatize(doc[1, "paragraph"])
# Clean data: TreeTagger emits "<unknown>" when it has no lemma for a
# token; fall back to the original token in that case.
LemmatizedDF$lemma = as.character(LemmatizedDF$lemma)
# Compute the row index once — it cannot change between the two uses.
# (This applies the refactor the original review note suggested.)
unknown.rows = which(LemmatizedDF$lemma == "<unknown>")
LemmatizedDF[unknown.rows, "lemma"] = LemmatizedDF[unknown.rows, "token"]
# Collapse the lemmas back into a single space-separated string.
paste(LemmatizedDF$lemma, collapse = " ")
#### Function ####
#### Function ####
# Lemmatize each element of a character vector, replacing "<unknown>"
# lemmas with the original token, and collapse each document back into a
# single space-separated string of lemmas.
#
# Args:
#   text.col: character vector of documents (e.g. doc[, "paragraph"]).
# Returns:
#   A character vector the same length as text.col.
GSRLem = function(text.col) {
  # library() errors immediately if pbapply is missing; require() would
  # only warn FALSE and then fail later at the pblapply() call.
  library(pbapply)
  # Clark: Dropping in mclapply here should suffice to parallelize this. But
  # original code ran on Windows, so need to do more.
  lemdflist = pblapply(X = text.col, function(x) lemmatize(x))
  # Preallocate the result instead of growing it with c() each iteration
  # (the original was O(n^2) in copies).
  rcv = character(length(lemdflist))
  # seq_along() is safe for zero-length input, unlike 1:length(x).
  for (i in seq_along(lemdflist)) {
    activedf = lemdflist[[i]]
    activedf$lemma = as.character(activedf$lemma)
    # Compute the "<unknown>" row index once; it cannot change between uses.
    unknown.rows = which(activedf$lemma == "<unknown>")
    activedf[unknown.rows, "lemma"] = activedf[unknown.rows, "token"]
    rcv[i] = paste(activedf$lemma, collapse = " ")
    print(paste("#", i, " of ", length(lemdflist), " done!"))
  }
  rcv
}
# Lemmatize the full "paragraph" column.
lemed = GSRLem(doc[ , "paragraph"])
# If this gives you the following warning:
# Warning message:
# Can't find the lexicon file, hence omitted! Please ensure this path is valid:
# ~/Desktop/test/lib/english-lexicon.txt
# do not worry about it. From the documentation for the treetag() function:
# "you can omit all of the following elements, because they will be filled
# with defaults. Of course this only makes sense if you have a working
# default installation."
# NOTE(review): the following lines are website embed-footer boilerplate
# picked up during extraction, commented out so the file parses:
# Any scripts or data that you put into this service are public.
# Add the following code to your website.
# For more information on customizing the embed code, read Embedding Snippets.