Nothing
## ---- include = FALSE---------------------------------------------------------
knitr::opts_chunk$set(
collapse = TRUE,
eval = FALSE,
comment = "#>"
)
## ----setup--------------------------------------------------------------------
# library(morphemepiece)
# library(dplyr)
## -----------------------------------------------------------------------------
# # These are local paths for illustration purposes
# vocab_path <- "/shared/morphemepiece_vocabs/mp_vocab_large.txt"
# lookup_path <- "/shared/morphemepiece_vocabs/mp_lookup_large.txt"
# # We will be interested in words that are in the large lookup, but not the small
# # one (as a proxy for the most common words that will hit the fallthrough
# # algorithm).
# lookup_path_small <- "/shared/morphemepiece_vocabs/mp_lookup_small.txt"
#
# mp_vocab <- load_or_retrieve_vocab(vocab_path)
# mp_lookup <- load_or_retrieve_lookup(lookup_path)
# mp_lookup_small <- load_or_retrieve_lookup(lookup_path_small)
#
## -----------------------------------------------------------------------------
# breakdown1 <- list()
# breakdown2 <- list()
# words_to_do <- setdiff(names(mp_lookup), names(mp_lookup_small))
# # It takes about an hour to do all words in this set.
# for (word in words_to_do) {
# bd1 <- morphemepiece:::.mp_tokenize_word_bidir(word,
# mp_vocab,
# allow_compounds = FALSE)
# bd2 <- morphemepiece:::.mp_tokenize_word_bidir(word,
# mp_vocab,
# allow_compounds = TRUE)
# breakdown1 <- append(breakdown1, paste0(bd1, collapse = " "))
# breakdown2 <- append(breakdown2, paste0(bd2, collapse = " "))
# }
#
# actual_bd <- mp_lookup[words_to_do]
# wdtbl <- dplyr::tibble(words_to_do, actual_bd, bd1 = unlist(breakdown1), bd2 = unlist(breakdown2))
#
# calc_score <- function(bd0, bd) {
# bd0 <- stringr::str_split(bd0, " ", simplify = FALSE)
# bd <- stringr::str_split(bd, " ", simplify = FALSE)
# bd0 <- purrr::map(bd0, function(b) {b[b != "##"]} )
# bd <- purrr::map(bd, function(b) {b[b != "##"]} )
#
# purrr::map2_dbl(bd0, bd, function(a, b) {
# re <- mean(a %in% b)
# pr <- mean(b %in% a)
# if (re == 0 & pr == 0) {
# return(0)
# }
# f1 <- 2*re*pr / (re + pr)
# return(f1)
# })
# }
#
#
# scored <- wdtbl %>%
# # The filter helps focus on the difference between the two algorithms.
# # To measure absolute performance, we'd take out this filter.
# filter(bd1 != bd2) %>%
# mutate(score1 = calc_score(actual_bd, bd1)) %>%
# mutate(score2 = calc_score(actual_bd, bd2))
#
# # what was the mean score of each algorithm? (1=old, 2=new)
# mean(scored$score1) # 0.3717737
# mean(scored$score2) # 0.4134288
#
# # what fraction of words did each algorithm score 100% on?
# mean(scored$score1 == 1) # 0.03477313
# mean(scored$score2 == 1) # 0.1674262
#
# # what fraction of words did each algorithm score 0% on?
# mean(scored$score1 == 0) # 0.1803051
# mean(scored$score2 == 0) # 0.2317713
#
# # in what fraction of cases was the old or new algorithm strictly better?
# scored %>%
# mutate(old_better = score1 > score2) %>%
# mutate(new_better = score1 < score2) %>%
# summarize(mean(old_better), mean(new_better))
#
# # # A tibble: 1 x 2
# # `mean(old_better)` `mean(new_better)`
# # <dbl> <dbl>
# # 1 0.343 0.536
#
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.