Nothing
## ----echo=FALSE, results='hide'-----------------------------------------------
knitr::opts_chunk$set(
collapse = TRUE,
comment = "#>",
fig.path = "reference/figures/"
)
## ----setup, echo=FALSE, results='hide', message=FALSE-------------------------
library(wordpredictor)
# The level of verbosity in the information messages
ve <- 0
#' @description
#' Used to setup the test environment
#' @param rf The required files.
#' @param ve The verbosity level.
#' @return The list of directories in the test environment
setup_env <- function(rf, ve) {
# An object of class EnvManager is created
em <- EnvManager$new(rp = "../", ve = ve)
# The required files are downloaded
ed <- em$setup_env(rf)
return(ed)
}
#' @description
#' Used to clean up the test environment
clean_up <- function(ve) {
# An object of class EnvManager is created
em <- EnvManager$new(ve = ve)
# The test environment is removed
em$td_env(T)
}
## ----data-exploration, cache=FALSE--------------------------------------------
# The required files
rf <- c(
"test.txt",
"validate.txt",
"validate-clean.txt",
"test-clean.txt"
)
# The test environment is setup
ed <- setup_env(rf, ve)
# The DataAnalyzer object is created
da <- DataAnalyzer$new(ve = ve)
# Information on all text files in the ed folder is returned
fi <- da$get_file_info(ed)
# The file information is printed
print(fi)
# The test environment is cleaned up
clean_up(ve)
## ----data-sampling-1, cache=FALSE---------------------------------------------
# The required files
rf <- c("input.txt")
# The test environment is setup
ed <- setup_env(rf, ve)
# The sample size as a proportion of the input.txt file
ssize <- 0.1
# The data file path
dfp <- paste0(ed, "/input.txt")
# The object size is formatted
obj_size <- file.size(dfp)/10^6
# The proportion of data to sample
prop <- (ssize/obj_size)
# An object of class DataSampler is created
ds <- DataSampler$new(dir = ed, ve = ve)
# The sample file is generated.
# The randomized sample is saved to the file train.txt in the ed folder
ds$generate_sample(
fn = "input.txt",
ss = prop,
ic = F,
ir = T,
ofn = "train.txt",
is = T
)
# The test environment is cleaned up
clean_up(ve)
## ----data-sampling-2, cache=FALSE---------------------------------------------
# The required files
rf <- c("input.txt")
# The test environment is setup
ed <- setup_env(rf, ve)
# An object of class DataSampler is created
ds <- DataSampler$new(dir = ed, ve = ve)
# The train, test and validation files are generated
ds$generate_data(
fn = "input.txt",
percs = list(
"train" = 0.8,
"test" = 0.1,
"validate" = 0.1
)
)
# The test environment is cleaned up
clean_up(ve)
## ----data-cleaning, cache=FALSE-----------------------------------------------
# The required files
rf <- c("input.txt")
# The test environment is setup
ed <- setup_env(rf, ve)
# The data file path
fn <- paste0(ed, "/input.txt")
# The clean file path
cfn <- paste0(ed, "/input-clean.txt")
# The data cleaning options
dc_opts = list(
"min_words" = 2,
"to_lower" = T,
"remove_stop" = F,
"remove_punct" = T,
"remove_non_dict" = T,
"remove_non_alpha" = T,
"remove_extra_space" = T,
"remove_bad" = F,
"output_file" = cfn
)
# The data cleaner object is created
dc <- DataCleaner$new(fn, dc_opts, ve = ve)
# The sample file is cleaned and saved as input-clean.txt in the ed dir
dc$clean_file()
# The test environment is cleaned up
clean_up(ve)
## ----tokenization-1, cache=FALSE----------------------------------------------
# The required files
rf <- c("test-clean.txt")
# The test environment is setup
ed <- setup_env(rf, ve)
# The test file path
fn <- paste0(ed, "/test-clean.txt")
# The n-grams are generated
for (n in 1:4) {
# The ngram number is set
tg_opts = list("n" = n, "save_ngrams" = T, dir = ed)
# The TokenGenerator object is created
tg <- TokenGenerator$new(fn, tg_opts, ve = ve)
# The ngram tokens are generated
tg$generate_tokens()
}
# The test environment is cleaned up
clean_up(ve)
## ----tokenization-2, cache=FALSE, out.width="70%", out.height="70%"-----------
# The required files
rf <- c("n2.RDS")
# The test environment is setup
ed <- setup_env(rf, ve)
# The ngram file name
fn <- paste0(ed, "/n2.RDS")
# The DataAnalyzer object is created
da <- DataAnalyzer$new(fn, ve = ve)
# The top features plot is checked
df <- da$plot_n_gram_stats(opts = list(
"type" = "top_features",
"n" = 10,
"save_to" = "png",
"dir" = "./reference/figures"
))
# The output file path
fn <- paste0("./reference/figures/top_features.png")
knitr::include_graphics(fn)
# The test environment is cleaned up
clean_up(ve)
## ----tokenization-3, cache=FALSE, out.width="70%", out.height="70%"-----------
# The required files
rf <- c("n2.RDS")
# The test environment is setup
ed <- setup_env(rf, ve)
# The ngram file name
fn <- paste0(ed, "/n2.RDS")
# The DataAnalyzer object is created
da <- DataAnalyzer$new(fn, ve = ve)
# The top features plot is checked
df <- da$plot_n_gram_stats(opts = list(
"type" = "coverage",
"n" = 10,
"save_to" = "png",
"dir" = "./reference/figures"
))
# The output file path
fn <- paste0("./reference/figures/coverage.png")
knitr::include_graphics(fn)
# The test environment is cleaned up
clean_up(ve)
## ----tokenization-4, cache=FALSE----------------------------------------------
# The required files
rf <- c("n2.RDS")
# The test environment is setup
ed <- setup_env(rf, ve)
# The ngram file name
fn <- paste0(ed, "/n2.RDS")
# The DataAnalyzer object is created
da <- DataAnalyzer$new(ve = ve)
# Bi-grams starting with "and_" are returned
df <- da$get_ngrams(fn = fn, c = 10, pre = "^and_*")
# The data frame is sorted by frequency
df <- df[order(df$freq, decreasing = T),]
# The first 10 rows of the data frame are printed
knitr::kable(df[1:10,], col.names = c("Prefix", "Frequency"))
# The test environment is cleaned up
clean_up(ve)
## ----transition-probabilities, cache=FALSE------------------------------------
# The required files
rf <- c("n1.RDS", "n2.RDS", "n3.RDS", "n4.RDS")
# The test environment is setup
ed <- setup_env(rf, ve)
# The TPGenerator object is created
tp <- TPGenerator$new(opts = list(n = 4, dir = ed), ve = ve)
# The combined transition probabilities are generated
tp$generate_tp()
# The test environment is cleaned up
clean_up(ve)
## ----generate-model, results='hide', cache=FALSE------------------------------
# The required files
rf <- c("input.txt")
# The test environment is setup
ed <- setup_env(rf, ve)
# The following code generates n-gram model using default options for data
# cleaning and tokenization. See the following section on how to customize these
# options. Note that input.txt is the name of the input data file. It should be
# present in the data directory. dir is the directory containing the input and output files. It is set to the path of the environment directory, ed.
# ModelGenerator class object is created
mg <- ModelGenerator$new(
name = "def-model",
desc = "N-gram model generating using default options",
fn = "def-model.RDS",
df = "input.txt",
n = 4,
ssize = 0.1,
dir = ed,
dc_opts = list(),
tg_opts = list(),
ve = ve
)
# Generates n-gram model. The output is the file def-model.RDS
mg$generate_model()
# The test environment is cleaned up
clean_up(ve)
## ----model-evaluation-1, cache=FALSE------------------------------------------
# The required files
rf <- c("def-model.RDS", "validate-clean.txt")
# The test environment is setup
ed <- setup_env(rf, ve)
# The model file name
mfn <- paste0(ed, "/def-model.RDS")
# The path to the cleaned validation file
vfn <- paste0(ed, "/validate-clean.txt")
# ModelEvaluator class object is created
me <- ModelEvaluator$new(mf = mfn, ve = ve)
# The intrinsic evaluation is performed on first 20 lines
stats <- me$intrinsic_evaluation(lc = 20, fn = vfn)
# The test environment is cleaned up
clean_up(ve)
## ----model-evaluation-2, cache=FALSE------------------------------------------
# The required files
rf <- c("def-model.RDS", "validate-clean.txt")
# The test environment is setup
ed <- setup_env(rf, ve)
# The model file name
mfn <- paste0(ed, "/def-model.RDS")
# The path to the cleaned validation file
vfn <- paste0(ed, "/validate-clean.txt")
# ModelEvaluator class object is created
me <- ModelEvaluator$new(mf = mfn, ve = ve)
# The intrinsic evaluation is performed on first 100 lines
stats <- me$extrinsic_evaluation(lc = 100, fn = vfn)
# The test environment is cleaned up
clean_up(ve)
## ----predict-word, cache=FALSE------------------------------------------------
# The required files
rf <- c("def-model.RDS")
# The test environment is setup
ed <- setup_env(rf, ve)
# The model file name
mfn <- paste0(ed, "/def-model.RDS")
# An object of class ModelPredictor is created. The mf parameter is the name of
# the model file that was generated in the previous example.
mp <- ModelPredictor$new(mf = mfn, ve = ve)
# Given the words: "how are", the next word is predicted. The top 3 most likely
# next words are returned along with their respective probabilities.
res <- mp$predict_word(words = "how are", 3)
# The test environment is cleaned up
clean_up(ve)
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.