# demo/backup/stems-test.R

library(SnowballC)
data(RS.data)

# Tokenize every utterance in RS.data$text on single whitespace characters and
# lowercase the tokens. strsplit() is already vectorized over its first
# argument, so the original sapply() wrapper was redundant; unlist() flattens
# the per-utterance lists into one character vector. (Splitting on "\\s", not
# "\\s+", means runs of whitespace yield empty-string tokens; those are
# filtered out further down the script.)
words = tolower(unlist(strsplit(x = as.vector(RS.data$text), split = "\\s", perl = TRUE)))

# Partition the tokens: `just.words` flags tokens made up entirely of
# lowercase letters/digits (including the empty string); those need no
# cleaning, while the remainder ("dirty" tokens) are scrubbed below.
just.words = grepl("^[a-z0-9]*$", words, perl = TRUE)
words.easy = words[just.words]
words.dirty = words[!just.words]

# Clean the "dirty" tokens step by step, then stem what survives.
# 1. Strip runs of trailing punctuation (, . ? ! ) : ; / > = *).
words.no.punct = gsub(pattern = "([,\\.\\?\\!\\):;/>=*]+$)", replacement = "", x = words.dirty)
# 2. Strip runs of leading punctuation (( @ * ^).
words.no.start.punct = gsub(x=words.no.punct, pattern="^[\\(@\\*^]+", replacement="")
# 3. Remove single quotes, double quotes, and literal pipes (the | inside the
#    class is a literal character, not alternation).
words.no.quote = gsub(x=words.no.start.punct, pattern='[\\\'|"]', replacement="")
# 4. Keep only tokens that still contain at least one word character.
words.dirty.only = words.no.quote[grepl(x=words.no.quote, pattern="\\w", perl=TRUE)]
# 5. Split compound tokens on internal separators and dedupe.
#    NOTE(review): inside this class the |s are literal, so tokens are also
#    split on "|" itself -- probably harmless here, but not what the pattern
#    appears to say; "[-.:/?,]+" was likely the intent.
words.cleaned = unique(unlist(strsplit(x=words.dirty.only, split = "[-|\\.|:|/|\\?|,]+", perl=TRUE)))
# 6. Drop stop words.
#    NOTE(review): `stopwords` is only read in at the BOTTOM of this script
#    (read.csv of stopwords.csv), so a top-to-bottom run fails here unless it
#    is already in the workspace -- move that load above this pipeline.
words.go = unique(words.cleaned[!tolower(words.cleaned) %in% tolower(stopwords$word)])
# 7. Drop tokens still containing any of ( # @ } % (| again literal).
words.go.just.words = words.go[!grepl(x=words.go, pattern="[\\(|#|@}|%]")]
# 8. Stem and discard empty results.
#    BUG FIX: the original `words.go.stems[-which(words.go.stems == "")]`
#    returns an EMPTY vector whenever no stem is "" -- negative indexing with
#    integer(0) selects nothing. Logical subsetting handles both cases.
words.go.stems = wordStem(words.go.just.words)
words.hard = words.go.stems[words.go.stems != ""]

# Remove e-mail-like tokens: anything with at least one character before an
# "@". grep(value = TRUE, invert = TRUE) keeps the NON-matching tokens.
words = grep(pattern = '.+[\\@]', x = words, perl = TRUE, ignore.case = TRUE,
             value = TRUE, invert = TRUE)

# Remove link-like tokens: anything beginning with "http".
words = grep(pattern = '^http', x = words, perl = TRUE, ignore.case = TRUE,
             value = TRUE, invert = TRUE)

# Delete the easy punctuation characters ( ) ? ! , " wherever they occur
# inside a token.
words = gsub(pattern = "[\\(\\)\\?\\!,\"]*", replacement = "", x = words,
             perl = TRUE, ignore.case = TRUE)

# Drop a single trailing period; interior periods (e.g. decimals) survive.
# The pattern is anchored at end-of-string, so sub() is sufficient.
words = sub(pattern = "\\.$", replacement = "", x = words)

# Final normalization: strip any character outside letters, digits, hyphen,
# and space, then lowercase and deduplicate. gsub() is vectorized, so the
# original per-element sapply() loop was redundant (and attached each full
# token as a name on the result).
words = unique(tolower(gsub(pattern = "[^a-zA-Z0-9- ]*", replacement = "", x = words, perl = TRUE)))

# Load the stop-word list (expects a CSV with a `word` column).
# NOTE(review): `stopwords` is also consumed near the TOP of this script
# (the words.go step), which runs before this line -- a clean top-to-bottom
# run only works if this load is moved above that pipeline.
stopwords = read.csv("~/Downloads/stopwords.csv")

# Flag tokens that are stop words (case-insensitive membership test).
stops = tolower(words) %in% tolower(stopwords$word)

# Stem the remaining tokens with SnowballC's Porter stemmer.
stems = wordStem(words[!stops])
# epistemic-analytics/ncodeR documentation built on June 15, 2019, 12:03 a.m.