inst/doc/algorithm_test.R

## ---- include = FALSE---------------------------------------------------------
knitr::opts_chunk$set(
  collapse = TRUE,
  eval = FALSE,
  comment = "#>"
)

## ----setup--------------------------------------------------------------------
#  library(morphemepiece)
#  library(dplyr)

## -----------------------------------------------------------------------------
#  # These are local paths for illustration purposes
#  vocab_path <- "/shared/morphemepiece_vocabs/mp_vocab_large.txt"
#  lookup_path <- "/shared/morphemepiece_vocabs/mp_lookup_large.txt"
#  # We're interested in words that are in the large lookup but not the small
#  # one, as a proxy for the most common words that would hit the fallthrough
#  # algorithm.
#  lookup_path_small <- "/shared/morphemepiece_vocabs/mp_lookup_small.txt"
#  
#  mp_vocab <- load_or_retrieve_vocab(vocab_path)
#  mp_lookup <- load_or_retrieve_lookup(lookup_path)
#  mp_lookup_small <- load_or_retrieve_lookup(lookup_path_small)
#  
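
## -----------------------------------------------------------------------------
#  # Sanity-check sketch (not in the original vignette). It assumes the lookup
#  # loads as a named character vector mapping each word to its breakdown
#  # string, consistent with how it is indexed below.
#  mp_lookup[head(names(mp_lookup), 3)]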

## -----------------------------------------------------------------------------
#  breakdown1 <- list()
#  breakdown2 <- list()
#  words_to_do <- setdiff(names(mp_lookup), names(mp_lookup_small))
#  # It takes about an hour to process all words in this set.
#  for (word in words_to_do) {
#    # Old algorithm: allow_compounds = FALSE.
#    bd1 <- morphemepiece:::.mp_tokenize_word_bidir(word,
#                                                   mp_vocab,
#                                                   allow_compounds = FALSE)
#    # New algorithm: allow_compounds = TRUE (words may split into compounds).
#    bd2 <- morphemepiece:::.mp_tokenize_word_bidir(word,
#                                                   mp_vocab,
#                                                   allow_compounds = TRUE)
#    breakdown1 <- append(breakdown1, paste0(bd1, collapse = " "))
#    breakdown2 <- append(breakdown2, paste0(bd2, collapse = " "))
#  }
#  
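#  # Alternative sketch (not in the original vignette): the same computation
#  # written with vapply() instead of growing lists via append(). "tokenize_all"
#  # is a hypothetical helper; runtime is dominated by tokenization either way.
#  tokenize_all <- function(words, allow) {
#    vapply(words, function(w) {
#      paste0(
#        morphemepiece:::.mp_tokenize_word_bidir(w, mp_vocab,
#                                                allow_compounds = allow),
#        collapse = " "
#      )
#    }, character(1))
#  }
#  # e.g. bd1_vec <- tokenize_all(words_to_do, allow = FALSE)
#  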
#  actual_bd <- mp_lookup[words_to_do]
#  wdtbl <- dplyr::tibble(
#    words_to_do,
#    actual_bd,
#    bd1 = unlist(breakdown1),
#    bd2 = unlist(breakdown2)
#  )
#  
#  calc_score <- function(bd0, bd) {
#    # Split breakdown strings into token vectors.
#    bd0 <- stringr::str_split(bd0, " ", simplify = FALSE)
#    bd <- stringr::str_split(bd, " ", simplify = FALSE)
#    # Drop standalone "##" tokens so only the morphemes themselves are compared.
#    bd0 <- purrr::map(bd0, function(b) b[b != "##"])
#    bd <- purrr::map(bd, function(b) b[b != "##"])
#  
#    # Score each candidate breakdown against the reference with token-level F1.
#    purrr::map2_dbl(bd0, bd, function(a, b) {
#      re <- mean(a %in% b)  # recall
#      pr <- mean(b %in% a)  # precision
#      if (re == 0 && pr == 0) {
#        return(0)  # avoid 0/0 when there is no overlap at all
#      }
#      2 * re * pr / (re + pr)
#    })
#  }
#  
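#  # Worked example on hypothetical breakdowns (added for illustration):
#  # recall = 1/3, precision = 1/2, so F1 = 2 * (1/3) * (1/2) / (1/3 + 1/2) = 0.4.
#  calc_score("re ##run ##s", "re ##runs")  # 0.4
#  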
#  
#  scored <- wdtbl %>%
#    # The filter helps focus on the difference between the two algorithms.
#    # To measure absolute performance, we'd take out this filter.
#    filter(bd1 != bd2) %>%
#    mutate(score1 = calc_score(actual_bd, bd1)) %>%
#    mutate(score2 = calc_score(actual_bd, bd2))
#  
#  # what was the mean score of each algorithm? (1=old, 2=new)
#  mean(scored$score1) # 0.3717737
#  mean(scored$score2) # 0.4134288
#  
#  # what fraction of words did each algorithm score 100% on?
#  mean(scored$score1 == 1) # 0.03477313
#  mean(scored$score2 == 1) # 0.1674262
#  
#  # what fraction of words did each algorithm score 0% on?
#  mean(scored$score1 == 0) # 0.1803051
#  mean(scored$score2 == 0) # 0.2317713
#  
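#  # Sketch (added for illustration): eyeball the distribution of per-word
#  # score differences between the two algorithms.
#  hist(scored$score2 - scored$score1,
#       main = "New minus old F1 per word", xlab = "score2 - score1")
#  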
#  # in what fraction of cases was the old or new algorithm strictly better?
#  scored %>%
#    mutate(old_better = score1 > score2) %>%
#    mutate(new_better = score1 < score2) %>%
#    summarize(mean(old_better), mean(new_better))
#  
#  # # A tibble: 1 x 2
#  #   `mean(old_better)` `mean(new_better)`
#  #                <dbl>              <dbl>
#  # 1              0.343              0.536
#  
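#  # Exploratory sketch (not in the original vignette): look at a few words
#  # where the old algorithm scored strictly better, to see how the breakdowns
#  # differ.
#  scored %>%
#    filter(score1 > score2) %>%
#    select(words_to_do, actual_bd, bd1, bd2) %>%
#    head()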
