#' @title Create Corpus
#' @description Builds a cleaned \pkg{tm} corpus from the \code{text} element
#'   of \code{d}. The text is passed through \code{plain_tweets()}, lowercased,
#'   stripped of URLs, stop words, punctuation and numbers, stemmed, stripped
#'   of custom words, and finally has its whitespace collapsed.
#' @param d Text Data. Must expose a \code{text} element (e.g. a data frame
#'   of tweets with a \code{text} column).
#' @param sw Stop-word list identifier passed to \code{stopwords()}. Uses
#'   \code{"english"} when omitted.
#' @param rsw Optional character vector of words to take out of the stop-word
#'   list, i.e. words that must be kept in the corpus. Nothing is taken out
#'   when omitted.
#' @param cw Optional character vector of custom words to remove from the
#'   corpus. These are matched after stemming, so supply stemmed forms.
#' @keywords package
#' @return The cleaned corpus, invisibly. As a side effect the corpus is also
#'   assigned to the global variable \code{corpus.ultimus}.
#' @export
#' @examples
#' \dontrun{
#' create.corpus(nk.tweets)
#' nk.corpus.ultimus <- corpus.ultimus
#' # Stopwords to Remove from Corpus
#' create.corpus(nk.tweets, sw = "english")
#' # List of Stopwords Dictionary Languages
#' stopwords.languages()
#' # Remove Words from Stopwords List
#' create.corpus(nk.tweets, rsw = c("word1"))
#' # [<!>] Remove Specific Characters from Final Corpus [Ex: ’]
#' nk.corpus <- tm_map(nk.corpus, content_transformer(function(x,pattern)gsub(pattern,"",x)), "’")
#' }
create.corpus <- function(d, sw, rsw, cw) {
  # Validate the input first so that a bad call fails fast with a clear
  # message instead of after package loading/installation.
  if (missing(d)) {
    stop("`d` is required and must contain a `text` element.", call. = FALSE)
  }
  tweet_text <- d$text
  if (is.null(tweet_text)) {
    stop("`d` must contain a `text` element.", call. = FALSE)
  }

  # Resolve optional arguments. missing() is used (rather than formal
  # defaults) so that wrappers forwarding a missing argument keep working.
  stopword_source <- if (missing(sw)) "english" else sw
  keep_words <- if (missing(rsw)) NULL else rsw
  custom_words <- if (missing(cw)) NULL else cw

  # Make sure the text-mining packages are available and attached.
  # requireNamespace() is a cheap availability check (installed.packages()
  # scans the whole library), and library() errors loudly where require()
  # would only return FALSE.
  for (pkg in c("tm", "SnowballC")) {
    if (!requireNamespace(pkg, quietly = TRUE)) {
      utils::install.packages(pkg, repos = "https://cran.us.r-project.org")
    }
    library(pkg, character.only = TRUE)
  }

  # Stop-word list, minus any words the caller wants to keep. The corpus is
  # lowercased below, so the keep-list is lowercased to match.
  stop_words <- setdiff(stopwords(stopword_source), tolower(keep_words))

  strip_urls <- function(x) gsub("http[[:alnum:][:punct:]]*", "", x)

  # Build Corpus from Plain Tweets
  corpus <- Corpus(VectorSource(plain_tweets(tweet_text)))
  # Transform all to Lowercase
  corpus <- tm_map(corpus, content_transformer(tolower))
  # Remove URLs while they are still intact (before punctuation is stripped)
  corpus <- tm_map(corpus, content_transformer(strip_urls))
  # Remove Stop Words. This must happen before punctuation removal and
  # stemming: contractions such as "don't" and words such as "this"
  # (stemmed to "thi") would otherwise no longer match the stop-word list.
  corpus <- tm_map(corpus, removeWords, stop_words)
  # Remove Punctuation
  corpus <- tm_map(corpus, removePunctuation)
  # Remove Numbers
  corpus <- tm_map(corpus, removeNumbers)
  # Stem Corpus Document
  corpus <- tm_map(corpus, stemDocument)
  # Remove Custom Words (skipped when none were supplied)
  if (length(custom_words) > 0) {
    corpus <- tm_map(corpus, removeWords, custom_words)
  }
  # Remove White Spaces last, so gaps left by the removals above collapse too
  corpus <- tm_map(corpus, stripWhitespace)

  # Set Corpus as Global Variable (kept for callers that read corpus.ultimus)
  corpus.ultimus <<- corpus
  invisible(corpus)
}
# Add the following code to your website.
# For more information on customizing the embed code, read Embedding Snippets.