Nothing
## ---- include = FALSE---------------------------------------------------------
# Set the knitr chunk options `collapse = TRUE` and `comment = "#>"`
# (the latter is the prefix string used for printed output).
knitr::opts_chunk$set(collapse = TRUE, comment = "#>")
## ----setup--------------------------------------------------------------------
library(kgrams)
## -----------------------------------------------------------------------------
# Clean raw HTML lines and collapse them into a single string.
#
# x: character vector of raw text lines (may contain HTML markup).
# Returns a length-one character vector: the cleaned lines joined by single
# spaces.
.preprocess <- function(x) {
  # Remove speaker name and locations (boldfaced in original html).
  # The class is spelled out as [A-Za-z]: the range [A-z] also spans the
  # punctuation that sits between "Z" and "a" ("[", "\", "]", "^", "_", "`").
  # NOTE(review): only space-free alphabetic contents are matched, so a
  # multi-word boldfaced entry would survive this step -- confirm intended.
  x <- gsub("<b>[A-Za-z]+</b>", "", x)
  # Remove other html tags: complete tags, tags left open at the end of a
  # line, and whole lines ending in a lone ">" (presumably the tail of a tag
  # opened on a previous line). Alternatives are separated by a single "|";
  # a doubled "||" would add an empty alternative to the pattern.
  x <- gsub("<[^>]+>|<[^>]+$|^[^>]+>$", "", x)
  # Apply standard preprocessing including lower-case
  x <- kgrams::preprocess(x)
  # Collapse to a single string to avoid splitting into more sentences at the
  # end of lines
  paste(x, collapse = " ")
}
# Sentence tokenizer that discards empty sentences.
#
# x: input handed unchanged to kgrams::tknz_sent() (with keep_first = TRUE).
# Returns the tokenized sentences with "" entries removed.
.tknz_sent <- function(x) {
  sentences <- kgrams::tknz_sent(x, keep_first = TRUE)
  is_nonempty <- sentences != ""
  sentences[is_nonempty]
}
## -----------------------------------------------------------------------------
# Build the k-gram frequency table from Shakespeare's text (`much_ado`)
freqs <- kgram_freqs(
  much_ado,
  N = 5,                      # store k-gram counts for k <= 5
  .preprocess = .preprocess,  # text preprocessing function
  .tknz_sent = .tknz_sent,    # sentence tokenizer
  verbose = FALSE
)
# Print the resulting object
freqs
## -----------------------------------------------------------------------------
# Summary of the frequency table
summary(freqs)
## -----------------------------------------------------------------------------
# Look up a few plain unigrams and bigrams
query(
  freqs,
  c("leonato", "enter leonato", "thy", "smartphones")
)
# Look up k-grams anchored at the beginning or at the end of a sentence
query(
  freqs,
  c(
    BOS() %+% BOS() %+% "i",
    "love" %+% EOS()
  )
)
# Total number of words processed
query(freqs, "")
# Total number of sentences processed
query(freqs, EOS())
## -----------------------------------------------------------------------------
# Call smoothers() (presumably lists the smoothing methods kgrams offers)
smoothers()
## -----------------------------------------------------------------------------
# Show the information available for the "kn" smoother
info("kn")
## -----------------------------------------------------------------------------
# Build a language model on top of `freqs` with the "kn" smoother and D = 0.75
kn <- language_model(freqs, "kn", D = 0.75)
kn
## -----------------------------------------------------------------------------
summary(kn)
## -----------------------------------------------------------------------------
# Inspect the model parameters, then read, change and restore "D"
parameters(kn)
param(kn, "D")
param(kn, "D") <- 0.6
param(kn, "D")
# Put "D" back to the value the model was created with
param(kn, "D") <- 0.75
## -----------------------------------------------------------------------------
# Temporarily lower the order "N", then restore it to 5
param(kn, "N") <- 4 # 'kn' uses only 1:4-grams
param(kn, "N")
param(kn, "N") <- 5 # 'kn' uses also 5-grams
## -----------------------------------------------------------------------------
# Probabilities of three whole sentences under the `kn` model
probability(
  c(
    "Did he break out into tears?",
    "I see, lady, the gentleman is not in your books.",
    "We are predicting sentence probabilities."
  ),
  model = kn
)
## -----------------------------------------------------------------------------
# Probabilities of a single word combined with a context through `%|%`
# (presumably "word given preceding context" -- see the kgrams docs)
probability(
  "tears" %|% "Did he break out into",
  model = kn
)
probability(
  "pieces" %|% "Did he break out into",
  model = kn
)
## -----------------------------------------------------------------------------
# Fix the RNG seed before sampling
set.seed(840)
# Sample 10 sentences of at most 10 words from `kn`
sample_sentences(model = kn, n = 10, max_length = 10)
## -----------------------------------------------------------------------------
# Same sampling call at a low temperature ...
sample_sentences(model = kn, n = 10, max_length = 10, t = 0.1)
# ... and at a high temperature
sample_sentences(model = kn, n = 10, max_length = 10, t = 10)
## -----------------------------------------------------------------------------
# Inspect a single element (index 840) of the `midsummer` data
midsummer[840]
## -----------------------------------------------------------------------------
# Perplexity of the `kn` model evaluated on `midsummer`
perplexity(midsummer, model = kn)
## ---- out.width="50%", fig.cap="Perplexity as a function of the discount parameter of Interpolated Kneser-Ney 2-gram (red), 3-gram (green), 4-gram (blue) and 5-gram (black) models."----
# Grid of values of the discount parameter D at which perplexity is evaluated
D_grid <- seq(from = 0.5, to = 0.99, by = 0.01)
# Perplexity of `midsummer` under `kn` after setting its order to N and its
# discount to D.
# NOTE(review): the replacement calls bind `kn` inside FUN; whether the global
# model is also modified depends on kgrams' object semantics -- confirm.
FUN <- function(D, N) {
  param(kn, "N") <- N
  param(kn, "D") <- D
  perplexity(midsummer, model = kn)
}
# One perplexity curve over D_grid for each order N = 2, ..., 5.
# vapply() pins every evaluation to a single double, so an unexpected return
# value fails loudly instead of silently changing the shape of the result.
P_grid <- lapply(2:5, function(N) vapply(D_grid, FUN, numeric(1), N = N))
# Shrink the plot margins, remembering the previous settings
oldpar <- par(mar = c(2, 2, 1, 1))
# Empty canvas with fixed y-range, then one line per model order
plot(D_grid, P_grid[[1]], type = "n", xlab = "D", ylab = "Perplexity", ylim = c(300, 500))
lines(D_grid, P_grid[[1]], col = "red")         # N = 2
lines(D_grid, P_grid[[2]], col = "chartreuse")  # N = 3
lines(D_grid, P_grid[[3]], col = "blue")        # N = 4
lines(D_grid, P_grid[[4]], col = "black")       # N = 5
# Restore the graphical parameters saved above
par(oldpar)
## -----------------------------------------------------------------------------
# Minimum perplexity over D_grid for each model order. The values 1:4 are
# positions in P_grid (whose entries were computed for N = 2:5), not orders.
# vapply() guarantees a named numeric vector with one value per entry.
vapply(
  c("2-gram" = 1, "3-gram" = 2, "4-gram" = 3, "5-gram" = 4),
  function(i) min(P_grid[[i]]),
  numeric(1)
)
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.