Nothing
## ---- include = FALSE---------------------------------------------------------
knitr::opts_chunk$set(
collapse = TRUE,
comment = "#>"
)
## -----------------------------------------------------------------------------
library(sbo)
## -----------------------------------------------------------------------------
head(sbo::twitter_train, 3)
## -----------------------------------------------------------------------------
p <- sbo_predictor(object = sbo::twitter_train, # preloaded example dataset
N = 3, # Train a 3-gram model
dict = target ~ 0.75, # cover 75% of training corpus
.preprocess = sbo::preprocess, # Preprocessing transformation
EOS = ".?!:;", # End-Of-Sentence tokens
lambda = 0.4, # Back-off penalization in SBO algorithm
L = 3L, # Number of predictions for input
filtered = "<UNK>" # Exclude the <UNK> token from predictions
)
## -----------------------------------------------------------------------------
predict(p, "i love")
## -----------------------------------------------------------------------------
set.seed(840)
babble(p)
babble(p)
babble(p)
## -----------------------------------------------------------------------------
t <- sbo_predtable(object = sbo::twitter_train, # preloaded example dataset
N = 3, # Train a 3-gram model
dict = target ~ 0.75, # cover 75% of training corpus
.preprocess = sbo::preprocess, # Preprocessing transformation
EOS = ".?!:;", # End-Of-Sentence tokens
lambda = 0.4, # Back-off penalization in SBO algorithm
L = 3L, # Number of predictions for input
filtered = "<UNK>" # Exclude the <UNK> token from predictions
)
## -----------------------------------------------------------------------------
p <- sbo_predictor(t) # This is the same as 'p' created above
## ---- eval=FALSE--------------------------------------------------------------
# save(t)
# # ... and, in another session:
# load("t.rda")
## -----------------------------------------------------------------------------
summary(p)
## -----------------------------------------------------------------------------
head(t[[3]])
## -----------------------------------------------------------------------------
t[[1]]
## ----message=FALSE, warning=FALSE---------------------------------------------
library(dplyr) # installed with `sbo`
## -----------------------------------------------------------------------------
set.seed(840)
(evaluation <- eval_sbo_predictor(p, test = sbo::twitter_test))
## -----------------------------------------------------------------------------
evaluation %>% summarise(accuracy = sum(correct)/n(),
uncertainty = sqrt(accuracy * (1 - accuracy) / n())
)
## -----------------------------------------------------------------------------
evaluation %>% # Accuracy for in-sentence predictions
filter(true != "<EOS>") %>%
summarise(accuracy = sum(correct) / n(),
uncertainty = sqrt(accuracy * (1 - accuracy) / n())
)
## ---- fig.align = "center"----------------------------------------------------
if (require(ggplot2)) {
evaluation %>%
filter(correct, true != "<EOS>") %>%
select(true) %>%
transmute(rank = match(true, table = attr(p, "dict"))) %>%
ggplot(aes(x = rank)) + geom_histogram(binwidth = 25)
}
## -----------------------------------------------------------------------------
dict <- sbo_dictionary(corpus = sbo::twitter_train,
max_size = 100,
target = 0.5,
.preprocess = sbo::preprocess,
EOS = ".?!:;")
## -----------------------------------------------------------------------------
(c <- word_coverage(p, sbo::twitter_train))
## -----------------------------------------------------------------------------
summary(c)
## ---- fig.align = "center", fig.width=5---------------------------------------
plot(c)
## -----------------------------------------------------------------------------
f <- kgram_freqs(corpus = sbo::twitter_train,
N = 3,
dict = target ~ 0.75,
.preprocess = sbo::preprocess,
EOS = ".?!:;"
)
## -----------------------------------------------------------------------------
predict(f, "i love")
## -----------------------------------------------------------------------------
predict(p, "i love")
## -----------------------------------------------------------------------------
size_in_MB <- function(x) format(utils::object.size(x), units = "MB")
sapply(list(sbo_predtable = t, kgram_freqs = f), size_in_MB)
## -----------------------------------------------------------------------------
chrono_predict <- function(x) system.time(predict(x, "i love"), gcFirst = TRUE)
lapply(list(sbo_predictor = p, kgram_freqs = f), chrono_predict)
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.