# R/corpus.create.R
#
# Defines functions: create.corpus
#
# Documented in: create.corpus

#' @title Create Corpus
#' @description Builds a cleaned \pkg{tm} corpus from tweet text. The text is
#'   passed through \code{plain_tweets()}, lowercased, stripped of stopwords,
#'   punctuation, numbers, URLs and redundant whitespace, stemmed, and finally
#'   stripped of any custom words.
#' @param d Text Data. Either an object with a \code{text} element (for
#'   example a data frame of tweets) or a character vector of texts.
#' @param sw Stopword dictionary language passed to \code{tm::stopwords()}.
#'   Defaults to \code{"english"}.
#' @param rsw Optional character vector of words to take out of the stopword
#'   list, i.e. words that should be kept in the corpus.
#' @param cw Optional character vector of custom words to remove from the
#'   stemmed corpus.
#' @keywords package
#' @return The final corpus, invisibly. For backward compatibility the corpus
#'   is also assigned to the variable \code{corpus.ultimus}.
#' @export
#' @examples
#' \dontrun{
#' create.corpus(nk.tweets)
#' nk.corpus.ultimus <- corpus.ultimus
#'
#' # Stopwords to remove from the corpus
#' stopwords.language <- "english"
#'
#' # Words to remove from the stopwords list (kept in the corpus)
#' remove.stopwords <- c("word1")
#' nk.corpus <- create.corpus(nk.tweets, stopwords.language, remove.stopwords)
#'
#' # [<!>] Remove specific characters from the final corpus [Ex: ’]
#' nk.corpus <- tm_map(
#'   nk.corpus,
#'   content_transformer(function(x, pattern) gsub(pattern, "", x)),
#'   "’"
#' )
#' }

create.corpus <- function(d, sw = "english", rsw = NULL, cw = NULL) {

  # Attach a package, installing it first when it is not available.
  # library() is used instead of require() so that a failed installation
  # stops here with a clear error rather than failing later on.
  attach_or_install <- function(pkg) {
    if (!requireNamespace(pkg, quietly = TRUE)) {
      install.packages(pkg, repos = "http://cran.us.r-project.org")
    }
    library(pkg, character.only = TRUE)
  }
  attach_or_install("tm")
  attach_or_install("SnowballC")

  # Accept a bare character vector as well as an object with a `text` element.
  tweet_text <- if (is.character(d)) d else d$text
  if (is.null(tweet_text)) {
    stop("`d` must be a character vector or contain a `text` element.",
         call. = FALSE)
  }

  # Build Corpus from Plain Tweets
  # NOTE(review): plain_tweets() is not defined in this file; it must be
  # available on the search path of the caller/package.
  corpus <- Corpus(VectorSource(plain_tweets(tweet_text)))

  # Transform all to Lowercase
  corpus <- tm_map(corpus, content_transformer(tolower))

  # Remove Stop Words. This runs before punctuation removal and stemming:
  # the dictionary holds unstemmed words, so words altered by those steps
  # would no longer match. Words listed in `rsw` are dropped from the
  # dictionary and therefore stay in the corpus.
  stop_list <- setdiff(stopwords(sw), rsw)
  if (length(stop_list) > 0) {
    corpus <- tm_map(corpus, removeWords, stop_list)
  }

  # Remove Punctuation
  corpus <- tm_map(corpus, removePunctuation)

  # Remove Numbers
  corpus <- tm_map(corpus, removeNumbers)

  # Remove URLs
  strip_urls <- content_transformer(
    function(x) gsub("http[[:alnum:][:punct:]]*", "", x)
  )
  corpus <- tm_map(corpus, strip_urls)

  # Remove White Spaces
  corpus <- tm_map(corpus, stripWhitespace)

  # Stem Corpus Document
  corpus <- tm_map(corpus, stemDocument)

  # Remove Custom Words & Build Final Corpus (skipped when none are given)
  if (length(cw) > 0) {
    corpus <- tm_map(corpus, removeWords, cw)
  }

  # Set Corpus as Global Variable. Kept so that existing code reading
  # `corpus.ultimus` keeps working; new code should use the return value.
  corpus.ultimus <<- corpus

  invisible(corpus)
}
# sabalicodev/sabali documentation built on Jan. 13, 2020, 2:22 p.m.