# Predict word
#
# This model built by using N-Gram to predicts words
#
#
# Some useful keyboard shortcuts for package authoring:
#
# Build and Reload Package: 'Ctrl + Shift + B'
# Check Package: 'Ctrl + Shift + E'
# Test Package: 'Ctrl + Shift + T'
options( java.parameters = "-Xmx6g")
library(stringi)
library(tm)
library(RWeka)
library(ggplot2)
library(tidyr)
print("Building the Predict model... Please wait")
url<-"https://d396qusza40orc.cloudfront.net/dsscapstone/dataset/Coursera-SwiftKey.zip"
if (!file.exists("Coursera-SwiftKey.zip")) {
download.file(url,basename(url))
unzip("Coursera-SwiftKey.zip")
}
# Read the blogs,news and Twitter data.
print("Reading the data... Please wait")
bData <- readLines("final/en_US/en_US.blogs.txt", encoding = "UTF-8", skipNul = TRUE)
#nData <- readLines("final/en_US/en_US.news.txt", encoding = "UTF-8", skipNul = TRUE)
tData <- readLines("final/en_US/en_US.twitter.txt", encoding = "UTF-8", skipNul = TRUE)
con <- file("final/en_US/en_US.news.txt", open="rb")
nData <- readLines(con, encoding="UTF-8")
close(con)
rm(con)
# Cleaning The Data
print("Creating sample.. Please wait")
set.seed(679)
data.sample <- c(sample(bData, length(bData) * 0.1),
sample(nData, length(nData) * 0.1),
sample(tData, length(tData) * 0.1))
# Remove all non english characters as they cause issues
print("Cleaning the data.. Please wait")
data.sample <- iconv(data.sample, "latin1", "ASCII", sub="")
# Create corpus(sample data set) and clean the data
cSample<-VCorpus(VectorSource(list(data.sample)))
cSample <- tm_map(cSample, content_transformer(tolower))
toSpace <- content_transformer(function(x, pattern) gsub(pattern, " ", x))
cSample <- tm_map(cSample, toSpace, "(f|ht)tp(s?)://(.*)[.][a-z]+")
cSample <- tm_map(cSample, toSpace, "@[^\\s]+")
cSample <- tm_map(cSample, removeWords, stopwords("english"))
cSample <- tm_map(cSample, removePunctuation)
cSample <- tm_map(cSample, removeNumbers)
cSample <- tm_map(cSample, stripWhitespace)
cSample <- tm_map(cSample, PlainTextDocument)
getFreq <- function(tm) {
freq <- sort(rowSums(as.matrix(tm)), decreasing = TRUE)
return(data.frame(word = names(freq), freq = freq))
}
print("Building N-Gram model.. Please wait")
uniTokenSize <- function(x) NGramTokenizer(x, Weka_control(min = 1, max = 1))
biTokenSize <- function(x) NGramTokenizer(x, Weka_control(min = 2, max = 2))
triTokenSize <- function(x) NGramTokenizer(x, Weka_control(min = 3, max = 3))
quadTokenSize <- function(x) NGramTokenizer(x, Weka_control(min = 4, max = 4))
#quintTokenSize <- function(x) NGramTokenizer(x,Weka_control(min = 5, max = 5))
print("Saving Model.. Please wait")
unigramFreq <- getFreq(removeSparseTerms(TermDocumentMatrix(cSample, control = list(tokenize = uniTokenSize)), 0.9999))
bigramFreq <- getFreq(removeSparseTerms(TermDocumentMatrix(cSample, control = list(tokenize = biTokenSize)), 0.9999))
trigramFreq <- getFreq(removeSparseTerms(TermDocumentMatrix(cSample, control = list(tokenize = triTokenSize)), 0.9999))
quadgramFreq <- getFreq(removeSparseTerms(TermDocumentMatrix(cSample, control = list(tokenize = quadTokenSize)), 0.9999))
#quintgramFreq <- getFreq(removeSparseTerms(TermDocumentMatrix(cSample, control = list(tokenize = quintTokenSize)), 0.9999))
print("Framing the data..Please Wait")
biGram <- data.frame(bigramFreq$word, separate(bigramFreq,word,c("uword","bword"),sep=" "), stringsAsFactors = FALSE)
triGram <- data.frame(trigramFreq$word, separate(trigramFreq,word,c("uword","bword","tword"),sep=" "), stringsAsFactors = FALSE)
quadGram <- data.frame(quadgramFreq$word, separate(quadgramFreq,word,c("uword","bword","tword","qword"),sep=" "), stringsAsFactors = FALSE)
colnames(biGram)[1] <- "word"
colnames(triGram)[1] <- "word"
colnames(quadGram)[1] <- "word"
print("Generating Graph..")
print("Success fully installed - Predict Word package")
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.