#########################################################################################################
# iSAX is an R package which provides access to iSA technology developed by
# VOICES from the Blogs. It is released for academic use only and licensed
# under the Creative Commons Attribution-NonCommercial-NoDerivatives 4.0 International License
# see http://creativecommons.org/licenses/by-nc-nd/4.0/
# Warning: Commercial use of iSA is protected under the U.S. provisional patent application No. 62/215264
#########################################################################################################
prep.data <- function(corpus, th=0.99, lang="english", train=NULL,
use.all=TRUE, shannon=FALSE, verbose=FALSE,
stripWhite=TRUE, removeNum=TRUE, removePunct=TRUE,
removeStop=TRUE, toPlain=TRUE, doGC=FALSE){
mc <- min(2, as.integer(detectCores()/2))
options(mc.cores = mc)
data(ltab)
if(!(lang %in% ltab$lang))
lang <- ltab$lang[ltab$code==lang]
if(length(lang)==0)
lang <- "english"
JP <- lang == "japanese"
CN <- lang == "chinese"
if(JP){
#require(RMeCab)
if(verbose)
cat("Phase1: Japanese tokenization...")
tkz <- function(x) iconv(paste(RMeCab::RMeCabC(x$content),collapse=" "),"UTF-8", sub="byte")
corpus <- tm_map(corpus, tkz)
if(stripWhite){
if(verbose) cat("strip white...")
corpus <- tm_map(corpus, stripWhitespace)
}
if(removeNum){
if(verbose) cat("remove numbers...")
corpus <- tm_map(corpus, removeNumbers)
}
if(removePunct){
if(verbose) cat("remove punctuation...")
corpus <- tm_map(corpus, removePunctuation)
}
if(removeStop){
if(verbose) cat("remove English stopwords...")
corpus <- tm_map(corpus, removeWords, stopwords("english"))
}
if(toPlain){
if(verbose) cat("sanitizing corpus...")
corpus <- tm_map(corpus, PlainTextDocument)
}
}
if(CN){
# library(rmmseg4j)
if(verbose) cat("Phase1: Chinese tokenization...")
tkz <- function(x) {y <- rmmseg4j::mmseg4j(x$content); Encoding(y) <- "UTF-8"; y
}
tmp <- lapply(corpus, tkz)
corpus <- VCorpus(VectorSource(tmp),readerControl=list(language="zh"))
rm(tmp)
if(removeNum){
if(verbose) cat("remove numbers...")
corpus <- tm_map(corpus, removeNumbers)
}
if(removePunct){
if(verbose) cat("remove punctuation...")
corpus <- tm_map(corpus, removePunctuation)
}
if(stripWhite){
if(verbose) cat("strip white...")
corpus <- tm_map(corpus, stripWhitespace)
}
if(removeStop){
if(verbose) cat("remove English stopwords...")
corpus <- tm_map(corpus, removeWords, stopwords("english"))
}
if(toPlain){
if(verbose) cat("sanitizing corpus...")
corpus <- tm_map(corpus, PlainTextDocument)
}
}
if(!CN & !JP){
if(verbose) cat("\nPhase1: Cleaning up...")
tl <- function(x) iconv(x$content,"UTF-8", sub="byte")
tmp <- lapply(corpus, tl)
corpus <- VCorpus(VectorSource(tmp))
rm(tmp)
#if(removeStop){
# if(verbose) cat("remove English stopwords...")
# corpus <- tm_map(corpus, removeWords, stopwords("english"))
# if(verbose) cat("remove Italian stopwords...")
# corpus <- tm_map(corpus, removeWords, stopwords("italian"))
#}
# if(toPlain){
# if(verbose) cat("sanitizing corpus...")
# corpus <- tm_map(corpus, PlainTextDocument)
#}
}
if(!JP)
gc(doGC,doGC)
if(CN | JP){
if(verbose) cat("\nPhase2: stemming...")
dtm <- DocumentTermMatrix(corpus, control=list(tolower=FALSE))
} else {
if(stripWhite){
if(verbose) cat("\nstripping white spaces...")
corpus <- tm_map(corpus, stripWhitespace)
}
if(verbose) cat("\nPhase2: stemming...")
dtm <- DocumentTermMatrix(corpus, control=list(mc.cores=mc,
removePunctuation=removePunct, stopwords=removeStop, removeNumbers=removeNum, tolower=TRUE, stemming=TRUE,
weighting=weightBin, language=lang))
}
rm(corpus)
if(!JP)
gc(doGC,doGC)
# dtm[dtm>1] <- 1
if(!use.all){
if(shannon){
sh <- apply(dtm[train,], 2, entropy)
idx <- which(sh>quantile(sh,pr=th, na.rm=TRUE))
dtm2.train <- dtm[train,idx]
dtm3.train <- as.matrix(dtm2.train)
} else {
dtm2.train <- removeSparseTerms(dtm[train,], th)
dtm3.train <- as.matrix(dtm2.train)
}
}
if(shannon){
dtm1 <- removeSparseTerms(dtm, th)
sh <- apply(dtm1, 2, entropy)
idx <- which(sh>quantile(sh,pr=th, na.rm=TRUE))
dtm2.full <- dtm1[,idx]
dtm3.full <- as.matrix(dtm2.full)
} else {
dtm2.full <- removeSparseTerms(dtm, th)
dtm3.full <- as.matrix(dtm2.full)
}
if(!use.all){
idx <- match(colnames(dtm3.train), colnames(dtm3.full))
idx <- idx[!is.na(idx)]
dtm3.full <- dtm3.full[, idx]
}
dtm3.full[dtm3.full>1] <- 1
if(verbose) cat("\nPhase3: bin2hexing...")
S <- apply(dtm3.full, 1, bin2hex)
return(list(S=S, dtm=dtm3.full, train=train, th=th))
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.