Nothing
#' Generates transition probabilities for n-grams
#'
#' @description
#' It provides a method for generating transition probabilities for
#' the given n-gram size. It also provides a method for generating the combined
#' transition probabilities data for n-gram sizes from 1 to the given size. The
#' combined transition probabilities data can be used to implement back-off.
#'
#' @details
#' It provides a method for generating n-gram transition probabilities.
#' It reads n-gram frequencies from an input text file that is generated by the
#' TokenGenerator class.
#'
#' It parses each n-gram into a prefix, a next word, the next word frequency and
#' the next word probability. Maximum likelihood estimation (the frequency of
#' the next word divided by the total frequency of its prefix) is used to
#' generate the next word probabilities.
#'
#' Each n-gram prefix is converted to a numeric hash using the digest2int
#' function. The next word is replaced with the position of the next word in the
#' list of all words. The transition probabilities data is stored as a dataframe
#' in a file.
#'
#' Another method is provided that combines the transition probabilities for
#' n-grams of size 1 to the given size. The combined transition probabilities
#' can be saved to a file as a data frame. This file may be regarded as a
#' complete, self-contained n-gram model. By combining the transition
#' probabilities of n-grams, back-off may be used to evaluate word probabilities
#' or predict the next word.
#' @importFrom stringr str_match
#' @importFrom digest digest2int
#' @importFrom dplyr group_by mutate
TPGenerator <- R6::R6Class(
    "TPGenerator",
    inherit = Base,
    public = list(
        #' @description
        #' It initializes the current obj. It is used to set the
        #' transition probabilities options and verbose option.
        #' @param opts The options for generating the transition probabilities.
        #'   They are merged into the default options, so only the options
        #'   that differ from the defaults need to be given.
        #' * **save_tp**. If the data should be saved.
        #' * **n**. The n-gram size.
        #' * **dir**. The directory containing the input and output files.
        #' * **format**. The format for the output. There are two options.
        #'     * **plain**. The data is stored in plain text.
        #'     * **obj**. The data is stored as a R obj.
        #' @param ve The level of detail in the information messages.
        #' @export
        initialize = function(opts = list(), ve = 0) {
            # The given options override the default options
            private$tp_opts <- modifyList(private$tp_opts, opts)
            # The base class is initialized
            super$initialize(NULL, NULL, ve)
            # The processed output starts out empty
            private$p_output <- data.frame()
        },

        #' @description
        #' It first generates the transition probabilities for each
        #' n-gram of size from 1 to the given size. The transition probabilities
        #' for the n-gram sizes larger than 1 are then combined into a single
        #' data frame with the columns pre, nw and prob. If the save_tp option
        #' is set, the combined data is saved to the file "model-n" in the
        #' folder that is given as parameter to the current object.
        #'
        #' If the combined model file already exists, then an information
        #' message is shown and nothing is generated.
        #'
        #' By combining the transition probabilities for all n-gram sizes from 1
        #' to n, back-off can be used to calculate next word probabilities or
        #' predict the next word.
        #' @examples
        #' # Start of environment setup code
        #' # The level of detail in the information messages
        #' ve <- 0
        #' # The name of the folder that will contain all the files. It will be
        #' # created in the current directory. NULL implies tempdir will be used
        #' fn <- NULL
        #' # The required files. They are default files that are part of the
        #' # package
        #' rf <- c("n1.RDS", "n2.RDS", "n3.RDS", "n4.RDS")
        #' # An object of class EnvManager is created
        #' em <- EnvManager$new(ve = ve, rp = "./")
        #' # The required files are downloaded
        #' ed <- em$setup_env(rf, fn)
        #' # End of environment setup code
        #'
        #' # The list of output files
        #' fns <- c("words", "model-4", "tp2", "tp3", "tp4")
        #'
        #' # The TPGenerator object is created
        #' tp <- TPGenerator$new(opts = list(n = 4, dir = ed), ve = ve)
        #' # The combined transition probabilities are generated
        #' tp$generate_tp()
        #'
        #' # The test environment is removed. Comment the below line, so the
        #' # files generated by the function can be viewed
        #' em$td_env()
        generate_tp = function() {
            # The largest n-gram size. It is read before the loop below since
            # generate_tp_for_n overwrites the n option on every call.
            nmax <- private$tp_opts[["n"]]
            # The information message
            msg <- paste0(
                "Generating Transition Probabilities for n = 1:", nmax
            )
            # Information message is shown
            private$dh(msg, "-", md = 1)
            # The processed output is cleared
            private$p_output <- data.frame()
            # The short output file name
            fn <- paste0("model-", nmax, private$get_file_ext())
            # The model file name path
            fp <- paste0(private$tp_opts[["dir"]], "/", fn)
            # If the combined tp file already exists
            if (file.exists(fp)) {
                # Information message is shown
                private$dm(
                    "The output file: ", fp, " already exists\n",
                    md = 1, ty = "w"
                )
            } else {
                # One slot for each n-gram size larger than 1. The slots are
                # preallocated so that no vector is grown inside the loop.
                parts <- vector("list", max(nmax - 1, 0))
                # For each n-gram size, the transition probabilities data is
                # generated. seq_len is used so that nothing is iterated when
                # nmax is 0.
                for (n in seq_len(nmax)) {
                    # The transition probabilities or word list is generated
                    self$generate_tp_for_n(n)
                    # n = 1 only produces the word list, which is not part of
                    # the combined data
                    if (n > 1) {
                        # The columns for the current n-gram size are kept
                        tp <- private$p_output
                        parts[[n - 1]] <- list(
                            pre = tp$pre,
                            nw = tp$nw,
                            prob = tp$prob
                        )
                        # The processed output is cleared
                        private$p_output <- data.frame()
                    }
                }
                # Concatenates one column over all n-gram sizes. It returns
                # NULL if there are no parts.
                combine <- function(name) {
                    unlist(lapply(parts, `[[`, name), use.names = FALSE)
                }
                # The processed output is set to the combined tp data
                private$p_output <- data.frame(
                    "pre" = combine("pre"),
                    "nw" = combine("nw"),
                    "prob" = combine("prob")
                )
                # If the data should be saved
                if (private$tp_opts[["save_tp"]]) {
                    private$save_data(fn)
                }
                # Information message is shown
                private$dh("DONE", "=", md = 1)
            }
        },

        #' @description
        #' It generates the transition probabilities table for the
        #' given n-gram size. It first reads n-gram token frequencies from an
        #' input text file.
        #'
        #' It then generates a data frame whose columns are the
        #' n-gram prefix, next word and next word probability. The data frame
        #' may be saved to a file as plain text or as a R obj. If n = 1, then
        #' the list of words with their probabilities is saved.
        #'
        #' If the output file for the given n-gram size already exists, then
        #' it is read instead of being generated again. The n option of the
        #' current object is set to the given n-gram size.
        #' @param n The n-gram size for which the tp data is generated.
        generate_tp_for_n = function(n) {
            # The n value is set. The file name helpers and save_data read it.
            private$tp_opts[["n"]] <- n
            # The output format
            fo <- private$tp_opts[["format"]]
            # The output file name
            fn <- private$get_file_name(TRUE)
            # If the output file already exists
            if (file.exists(fn)) {
                # The information message is shown
                private$dm(
                    "The file: ", fn, " already exists",
                    md = 1, ty = "w"
                )
                # The file is read
                data <- private$read_data(fn, fo, TRUE)
                # For n = 1 the data is the word list, otherwise it is the
                # transition probabilities table
                if (n == 1) {
                    private$wl <- data
                } else {
                    private$p_output <- data
                }
            } else {
                # The information message
                msg <- paste0(
                    "Generating transition probabilities for n = ", n
                )
                # Information message is shown
                private$dh(msg, "-", md = 1)
                # The input file name
                private$fn <- private$get_file_name(FALSE)
                # The n-gram frequencies are read
                df <- private$read_data(private$fn, fo, TRUE)
                # If n = 1
                if (n == 1) {
                    # The word probability is the relative word frequency,
                    # rounded to 8 decimal places
                    df$prob <- round(df$freq / sum(df$freq), 8)
                    # The frequency column is removed
                    df$freq <- NULL
                    # The word list is set to the data frame
                    private$wl <- df
                } else {
                    # The 1-gram words are read if they are not yet loaded
                    private$read_words()
                    # Each n-gram is split on its last "_" into the prefix
                    # and the next word, since the first group is greedy
                    m <- str_match(df$pre, "(.+)_(.+)")
                    # The processed output is set to the parsed data. The
                    # prefix is replaced by its hash and the next word by
                    # its position in the word list. match gives NA for a
                    # next word that is not in the word list.
                    private$p_output <- data.frame(
                        "pre" = digest2int(m[, 2]),
                        "nw" = match(m[, 3], private$wl$pre),
                        "freq" = df$freq
                    )
                    # The next word probabilities are generated
                    private$generate_probs()
                    # The frequency column is removed
                    private$p_output$freq <- NULL
                }
                # If the data should be saved
                if (private$tp_opts[["save_tp"]]) {
                    private$save_data()
                }
                # Information message is shown
                private$dh("DONE", "=", md = 1)
            }
        }
    ),
    private = list(
        # @field tp_opts The options for generating the transition
        #   probabilities.
        # * **save_tp**. If the data should be saved.
        # * **n**. The n-gram number
        # * **dir**. The directory containing the input and output files.
        # * **format**. The format for the output. There are two options.
        #     * **plain**. The data is stored in plain text.
        #     * **obj**. The data is stored as a R obj.
        tp_opts = list(
            "save_tp" = TRUE,
            "n" = 1,
            "dir" = "./data/model",
            "format" = "obj"
        ),

        # @field wl The list of unique words. After the data for n = 1 is
        #   generated, it holds the word probabilities instead of the word
        #   frequencies.
        wl = data.frame(),

        # @description
        # It calculates the next word probabilities for the processed
        # output. The probability of a next word is its frequency divided by
        # the total frequency of all rows with the same prefix, rounded to 8
        # decimal places. Nothing is done if the n-gram size is 1.
        generate_probs = function() {
            # The n-gram number
            n <- private$tp_opts[["n"]]
            # If n > 1
            if (n > 1) {
                # The prob column is first set to the sum of the frequency
                # column for each prefix group
                df <- mutate(
                    group_by(private$p_output, pre),
                    prob = sum(freq)
                )
                # The grouping is dropped so that it does not leak into the
                # processed output and the saved files
                df <- as.data.frame(dplyr::ungroup(df))
                # Each frequency is divided by the sum to give the
                # probability
                df$prob <- round(df$freq / df$prob, 8)
                # The output is set to the updated data frame
                private$p_output <- df
            }
        },

        # @description
        # It returns the file extension for the current output format. It is
        # ".txt" for the "plain" format and ".RDS" for any other format.
        get_file_ext = function() {
            if (private$tp_opts[["format"]] == "plain") {
                ".txt"
            } else {
                ".RDS"
            }
        },

        # @description
        # It returns the name of the output or input file for the current
        # n-gram size. The input file is named "n" followed by the n-gram
        # size. The output file is named "words" for n = 1 and "tp" followed
        # by the n-gram size for n > 1. An error is raised if an output file
        # name is requested for an n-gram size less than 1.
        # @param is_output If the output file name is required.
        get_file_name = function(is_output) {
            # The n-gram number
            n <- private$tp_opts[["n"]]
            # The directory
            od <- private$tp_opts[["dir"]]
            # The file extension
            ext <- private$get_file_ext()
            # If the output file name is required
            if (is_output) {
                if (n == 1) {
                    # The word list file name
                    fn <- paste0(od, "/words", ext)
                } else if (n > 1) {
                    # The transition probabilities file name
                    fn <- paste0(od, "/tp", n, ext)
                } else {
                    # Without this branch fn would be undefined
                    stop(
                        "The n-gram size must be at least 1, not ", n,
                        call. = FALSE
                    )
                }
            } else {
                # The n-gram frequencies file name
                fn <- paste0(od, "/n", n, ext)
            }
            return(fn)
        },

        # @description
        # It saves the transition probabilities to a file in plain format or
        # as a R obj. The word list is saved if the current n-gram size is 1,
        # otherwise the processed output is saved. If the file name is not
        # given, then it is generated using the current object attributes.
        # An error is raised if the current n-gram size is less than 1.
        # @param fn The file name to use. It is taken relative to the dir
        #   option.
        save_data = function(fn = NULL) {
            # The n-gram number
            n <- private$tp_opts[["n"]]
            # The format
            fo <- private$tp_opts[["format"]]
            # The data to save
            if (n == 1) {
                data <- private$wl
            } else if (n > 1) {
                data <- private$p_output
            } else {
                # Without this branch data would be undefined
                stop(
                    "The n-gram size must be at least 1, not ", n,
                    call. = FALSE
                )
            }
            # If the file name is given as parameter then it is used
            if (is.null(fn)) {
                fp <- private$get_file_name(TRUE)
            } else {
                fp <- paste0(private$tp_opts[["dir"]], "/", fn)
            }
            # The data is written
            private$write_data(data, fp, fo, FALSE)
        },

        # @description
        # It reads the list of 1-gram words from the "words" file in the
        # directory given by the dir option. The file is only read if the
        # word list is empty.
        read_words = function() {
            # If the word list has not been read
            if (nrow(private$wl) == 0) {
                # The 1-gram words file name
                fn <- paste0(
                    private$tp_opts[["dir"]], "/words",
                    private$get_file_ext()
                )
                # The words are read
                private$wl <- private$read_data(
                    fn, private$tp_opts[["format"]], FALSE
                )
            }
        }
    )
)
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.