#' k-gram Language Models
#'
#' @description
#'
#' Build a k-gram language model.
#'
#' ### Principal methods supported by objects of class \code{language_model}
#'
#' - \code{probability()}: compute word continuation and sentence probabilities.
#' See \link[kgrams]{probability}.
#'
#' - \code{sample_sentences()}: generate random text by sampling from the
#' language model probability distribution at arbitrary temperature. See
#' \link[kgrams]{sample_sentences}.
#'
#' - \code{perplexity()}: Compute the language model perplexity on a test
#' corpus. See \link[kgrams]{perplexity}.
#'
#'
#' @author Valerio Gherardi
#' @md
#'
#'
#' @param object an object which stores the information required to build the
#' k-gram model. At present, necessarily a \code{kgram_freqs} object, or a
#' \code{language_model} object of which a copy is desired (see Details).
#' @param N a length one integer. Maximum order of k-grams to use in the language
#' model. This must be less than or equal to the order of the underlying
#' \code{kgram_freqs} object.
#' @param smoother a length one character vector. Indicates the smoothing
#' technique to be applied to compute k-gram continuation probabilities. A list
#' of available smoothers can be obtained with \code{smoothers()}, and
#' further information on a particular smoother through
#' \code{info()}.
#' @param ... possible additional parameters required by the smoother.
#'
#' @return A \code{language_model} object.
#' @details
#' These generics are used to construct objects of class \code{language_model}.
#' The \code{language_model} method is only needed to create copies of
#' \code{language_model} objects (that is to say, new copies which are not
#' altered by methods which modify the original object in place,
#' see e.g. \link[kgrams]{parameters}). The discussion below focuses on
#' language models and the \code{kgram_freqs} method.
#'
#' \link[kgrams]{kgrams} supports several k-gram language models, including
#' Interpolated Kneser-Ney, Stupid Backoff and others
#' (see \link[kgrams]{smoothers}). The objects created by
#' \code{language_model()} have methods for computing word continuation and
#' sentence probabilities (see \link[kgrams]{probability}),
#' random text generation (see \link[kgrams]{sample_sentences})
#' and other types of language modeling tasks such as computing perplexities and
#' word prediction accuracies.
#'
#' Smoothers often have tuning parameters, which need to be specified by
#' (exact) name through the \code{...} arguments; otherwise,
#' \code{language_model()} will use default values and, once per session, throw
#' a warning. \code{info(smoother)} lists all parameters needed by a
#' specific smoother, together with their allowed parameter space.
#'
#' The run-time of \code{language_model()} may vary substantially for different
#' smoothing methods, depending on whether or not a method requires the
#' computation of additional quantities (that is to say, beyond k-gram counts)
#' for its operativity (this is, for instance, the case for the Kneser-Ney
#' smoother).
#' @examples
#' # Create an interpolated Kneser-Ney 2-gram language model
#'
#' freqs <- kgram_freqs("a a b a a b a b a b a b", 2)
#' model <- language_model(freqs, "kn", D = 0.5)
#' model
#' summary(model)
#' probability("a" %|% "b", model)
#'
#' # For more examples, see ?probability, ?sample_sentences and ?perplexity.
#'
#' @name language_model
#' @rdname language_model
#' @export
language_model <- function(object, ...) {
  # S3 generic: the method is selected from the class of 'object', and any
  # further arguments are handed over to that method untouched.
  UseMethod("language_model", object)
}
#' @rdname language_model
#' @export
language_model.language_model <- function(object, ...) {
  # Recover the ingredients of the existing model from its attributes; the
  # parameter list also carries the model order under the name "N".
  freqs <- attr(object, "cpp_freqs")
  smoother_name <- attr(object, "smoother")
  pars <- parameters(object)

  # Construct a new smoother object from the same k-gram frequencies and the
  # same parameter values as the original model.
  fresh_cpp_obj <- cpp_smoother_constructor(
    smoother_name, freqs, pars[["N"]], pars
  )

  # Text preprocessing and sentence tokenization functions are carried over
  # unchanged from the original model.
  new_language_model(
    cpp_obj = fresh_cpp_obj,
    cpp_freqs = freqs,
    .preprocess = attr(object, ".preprocess"),
    .tknz_sent = attr(object, ".tknz_sent"),
    smoother = smoother_name
  )
}
#' @rdname language_model
#' @export
language_model.kgram_freqs <- function(
    object, smoother = "ml", N = param(object, "N"), ...
) {
  # The requested order must be a positive integer not exceeding the order
  # of the k-gram frequency table the model is built on.
  assert_positive_integer(N)
  max_order <- param(object, "N")
  if (N > max_order) {
    rlang::abort(
      c(
        "Invalid input",
        x = "'N' cannot be greater than 'param(object, \"N\")'."
      ),
      class = "kgrams_lm_max_order_error"
    )
  }

  validate_smoother(smoother, ...)

  # Start from the user supplied smoother arguments and fill in the default
  # value of every smoother parameter that was not provided.
  smoother_args <- list(...)
  for (spec in list_parameters(smoother)) {
    key <- spec$name
    if (is.null(smoother_args[[key]])) {
      smoother_args[[key]] <- spec$default
    }
  }

  # Note: for 'kgram_freqs' objects the C++ frequency table is stored in the
  # "cpp_obj" attribute.
  freqs <- attr(object, "cpp_obj")
  smoother_obj <- cpp_smoother_constructor(smoother, freqs, N, smoother_args)

  new_language_model(
    cpp_obj = smoother_obj,
    cpp_freqs = freqs,
    .preprocess = attr(object, ".preprocess"),
    .tknz_sent = attr(object, ".tknz_sent"),
    smoother = smoother
  )
}
#----------------------------- printing methods -------------------------------#
#' @export
print.language_model <- function(x, ...) {
  # One-line banner; the detailed report is produced by summary().
  banner <- "A k-gram language model.\n"
  cat(banner)
  # Hand the object back invisibly, as print methods conventionally do.
  invisible(x)
}
#' @export
summary.language_model <- function(object, ...) {
  # Prints a human readable report on a language model: smoother name,
  # parameter values, training corpus size and number of distinct k-grams
  # for each order. Returns 'object' invisibly.

  # The k-gram frequency object is queried several times below; fetch it once.
  cpp_freqs <- attr(object, "cpp_freqs")

  cat("A k-gram language model.\n\n")
  cat("Smoother:\n")
  cat("* '", attr(object, "smoother"), "'.\n", sep = "")
  cat("\n")
  cat("Parameters:\n")
  for (name in names(parameters(object))) {
    cat("* ", name, ": ", param(object, name), "\n", sep = "")
  }
  cat("\n")
  cat("Number of words in training corpus:\n")
  cat("* W: ", cpp_freqs$tot_words(), "\n", sep = "")
  cat("\n")
  cat("Number of distinct k-grams with positive counts:\n")
  # seq_len() instead of 1:N, so that a zero order yields no iterations
  # rather than looping over c(1, 0).
  for (k in seq_len(param(object, "N"))) {
    cat("* ", k, "-grams:", cpp_freqs$unique(k), "\n", sep = "")
  }
  invisible(object)
}
#' @export
str.language_model <- function(object, ...) {
  # The structure display of a language model is simply its summary.
  summary(object)
}
#---------------------------------- internal ----------------------------------#
new_language_model <- function(
    cpp_obj, cpp_freqs, .preprocess, .tknz_sent, smoother
) {
  # Low-level constructor: a 'language_model' is an empty list whose whole
  # content lives in its attributes.
  model <- list()
  attr(model, "cpp_obj") <- cpp_obj
  attr(model, "cpp_freqs") <- cpp_freqs
  attr(model, ".preprocess") <- .preprocess
  attr(model, ".tknz_sent") <- .tknz_sent
  attr(model, "smoother") <- smoother
  class(model) <- "language_model"
  model
}
cpp_smoother_constructor <- function(smoother, cpp_freqs, N, args) {
  # Instantiates the smoother class matching the 'smoother' code, passing the
  # k-gram frequencies, the model order and the tuning parameters that the
  # selected smoother takes. An unrecognized code falls through the switch,
  # which then yields NULL.

  # Look up a tuning parameter in 'args' by name.
  tuning <- function(key) args[[key]]

  switch(
    smoother,
    # Smoothers without tuning parameters.
    ml = new(MLSmoother, cpp_freqs, N),
    wb = new(WBSmoother, cpp_freqs, N),
    # Smoothers with a single tuning parameter.
    add_k = new(AddkSmoother, cpp_freqs, N, tuning("k")),
    sbo = new(SBOSmoother, cpp_freqs, N, tuning("lambda")),
    abs = new(AbsSmoother, cpp_freqs, N, tuning("D")),
    kn = new(KNSmoother, cpp_freqs, N, tuning("D")),
    # Smoother with three tuning parameters.
    mkn = new(
      mKNSmoother, cpp_freqs, N,
      tuning("D1"), tuning("D2"), tuning("D3")
    )
  )
}
# Add the following code to your website.
# For more information on customizing the embed code, read Embedding Snippets.