# inst/doc/generating_vocab.R

## ---- include = FALSE---------------------------------------------------------
knitr::opts_chunk$set(
  collapse = TRUE,
  eval = FALSE,
  comment = "#>"
)

## ----setup--------------------------------------------------------------------
#  # Install non-CRAN packages
#  # remotes::install_github("macmillancontentscience/wikimorphemes")
#  # remotes::install_github("macmillancontentscience/wordpiece.data")
#  library(morphemepiece)
#  library(wikimorphemes)
#  library(wordpiece.data)
#  library(dplyr)
#  library(ggplot2)
#  library(purrr)

## ----load-data----------------------------------------------------------------
#  # Load the functions used to build the vocabulary and lookup.
#  source(here::here("vignettes", "make_vocab_and_lookup.R"))
#  
#  # Load data from various packages related to this task.
#  original_lookup <- readRDS(wikimorphemes::download_wikimorphemes_lookup())
#  
#  # TODO: Add something to reproduce this or host it somewhere.
#  word_frequency_table <- readRDS(
#    fs::path(
#      morphemepiece_cache_dir(),
#      "word_frequency_table.rds"
#    )
#  )
#  
#  # Not all Wiktionary words are in the Wiktionary lookup. Use the full word
#  # list to add short words back in.
#  full_lookup <- .add_words_to_lookup(
#    original_lookup,
#    wikimorphemes::wiktionary_word_list()
#  )
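#  
#  # word_frequency_table is expected to have two columns, `word` and
#  # `word_count`. It can't be rebuilt from this vignette yet (see the TODO
#  # above), but as an illustration only, a table of that shape could be built
#  # from a hypothetical character vector of corpus words (`corpus_words`):
#  # word_frequency_sketch <- dplyr::tibble(word = corpus_words) %>%
#  #   dplyr::count(word, name = "word_count", sort = TRUE)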

## ----count-tokens-------------------------------------------------------------
#  # For much of this process, it's more convenient to have the processed words
#  # unnested, with one morpheme per row. This takes a few minutes.
#  
#  # Currently, we consider only words made up entirely of lowercase Latin
#  # characters. We will likely also want to include the simplified versions of
#  # words with accented characters in this list.
#  unnested_lookup <- .unnest_lookup(full_lookup, clean = TRUE)
#  
#  # Count how many Wiktionary words each token appears in.
#  token_counts <- count_tokens(unnested_lookup)
#  
#  utils::head(token_counts)
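#  
#  # count_tokens() is defined in make_vocab_and_lookup.R (sourced above), so
#  # its body isn't shown here. As a rough, illustrative sketch only (the real
#  # helper may differ in details), the unweighted count of how many words
#  # contain each token could look like:
#  token_counts_sketch <- unnested_lookup %>%
#    dplyr::distinct(word, token) %>%
#    dplyr::count(token, name = "n_words", sort = TRUE) %>%
#    dplyr::mutate(rank = dplyr::row_number())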

## ----temp-values, eval = TRUE, include = FALSE--------------------------------
# This is here so the package can still build properly when this vignette is
# "turned off."
words_vs_tokens <- list(
  max_rank = 194347L,
  n_words = 643745L
)

## ----token-ranks--------------------------------------------------------------
#  # Some words process into "non-clean" tokens (diacritics, etc.).
#  # Those tokens are excluded from token_counts, so they get an NA rank here.
#  # We should handle this better later; for now, just remove those words
#  # (only about 0.01% of words).
#  
#  # Find the highest-rank (rarest) token within each word.
#  words_with_max_token_ranks <- dplyr::left_join(
#    unnested_lookup,
#    token_counts,
#    by = "token"
#  ) %>%
#    dplyr::group_by(word) %>%
#    dplyr::summarize(max_rank = max(rank)) %>%
#    dplyr::filter(!is.na(max_rank))
#  
#  # Count how many total words are covered by tokens up to some rank:
#  words_vs_tokens <- words_with_max_token_ranks %>%
#    dplyr::group_by(max_rank) %>%
#    dplyr::summarize(n_words = dplyr::n_distinct(word)) %>%
#    dplyr::arrange(max_rank) %>%
#    dplyr::mutate(n_words = cumsum(n_words))
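#  
#  # The logic here: a word is fully coverable once every one of its tokens is
#  # in the vocabulary, i.e. once we keep all tokens up to that word's rarest
#  # (highest-rank) token. A toy illustration of the cumulative count, with
#  # made-up ranks purely for intuition:
#  toy_ranks <- dplyr::tibble(
#    word = c("rerun", "runs", "rains"),
#    max_rank = c(2L, 3L, 5L)
#  )
#  toy_ranks %>%
#    dplyr::count(max_rank, name = "n_words") %>%
#    dplyr::arrange(max_rank) %>%
#    dplyr::mutate(n_words = cumsum(n_words))
#  # Keeping the 3 most common tokens covers two of the three toy words;
#  # covering "rains" as well requires keeping tokens up to rank 5.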

## ----plot-counts--------------------------------------------------------------
#  # Plot the fraction of words covered vs. the fraction of tokens kept.
#  words_vs_tokens %>%
#    dplyr::mutate(frac_words = n_words/max(n_words)) %>%
#    dplyr::mutate(frac_tokens = max_rank/max(max_rank)) %>%
#    ggplot2::ggplot(ggplot2::aes(x = frac_tokens, y = frac_words)) +
#    ggplot2::geom_point()
#  

## ----count-weighted-----------------------------------------------------------
#  # Passing in a word frequency table (columns: word, word_count) weights the
#  # token counts by how often each word occurs in the corpus.
#  token_counts_weighted <- count_tokens(unnested_lookup, word_frequency_table)
#  utils::head(token_counts_weighted)
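#  
#  # Conceptually, the weighted version replaces "number of distinct words a
#  # token appears in" with "total corpus frequency of the words a token
#  # appears in". An illustrative sketch only (the real count_tokens() helper
#  # may handle missing words differently):
#  token_counts_weighted_sketch <- unnested_lookup %>%
#    dplyr::distinct(word, token) %>%
#    dplyr::left_join(word_frequency_table, by = "word") %>%
#    dplyr::mutate(word_count = dplyr::coalesce(word_count, 1L)) %>%
#    dplyr::group_by(token) %>%
#    dplyr::summarize(weight = sum(word_count)) %>%
#    dplyr::arrange(dplyr::desc(weight)) %>%
#    dplyr::mutate(rank = dplyr::row_number())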

## ----plot-weights-------------------------------------------------------------
#  words_with_max_token_ranks_weighted <- dplyr::left_join(
#    unnested_lookup,
#    token_counts_weighted,
#    by = "token"
#  ) %>%
#    dplyr::group_by(word) %>%
#    dplyr::summarize(max_rank = max(rank)) %>%
#    dplyr::filter(!is.na(max_rank))
#  
#  weighted_tokens_and_words <- dplyr::left_join(
#    words_with_max_token_ranks_weighted,
#    word_frequency_table,
#    by = "word"
#  ) %>%
#    dplyr::mutate(
#      word_count = ifelse(
#        test = is.na(word_count),
#        yes = 1L,
#        no = word_count
#      )
#    )
#  
#  words_vs_tokens_weighted <- weighted_tokens_and_words %>%
#    dplyr::group_by(max_rank) %>%
#    dplyr::summarize(n_words = sum(word_count)) %>%
#    dplyr::arrange(max_rank) %>%
#    dplyr::mutate(n_words = cumsum(n_words))
#  
#  # Plot the weighted coverage curve for comparison with the unweighted one.
#  words_vs_tokens_weighted %>%
#    dplyr::mutate(frac_words = n_words/max(n_words)) %>%
#    dplyr::mutate(frac_tokens = max_rank/max(max_rank)) %>%
#    ggplot2::ggplot(ggplot2::aes(x = frac_tokens, y = frac_words)) +
#    ggplot2::geom_point()

## ----make-vocabs--------------------------------------------------------------
#  vandl_small <- make_vocab_and_lookup(
#    full_lookup = original_lookup,
#    full_vocabulary = wikimorphemes::wiktionary_word_list(),
#    wordpiece_vocab = wordpiece.data::wordpiece_vocab(),
#    target_vocab_size = 0, # no extra tokens
#    word_frequency_table = word_frequency_table
#  )
#  
#  vandl_large <- make_vocab_and_lookup(
#    full_lookup = original_lookup,
#    full_vocabulary = wikimorphemes::wiktionary_word_list(),
#    wordpiece_vocab = wordpiece.data::wordpiece_vocab(),
#    target_vocab_size = 30000L,
#    word_frequency_table = word_frequency_table
#  )
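#  
#  # Both calls return a list with the two pieces used below: `vocab`, a plain
#  # character vector of tokens, and `lookup`, a table keyed by `word` with a
#  # `tokenization` column giving each word's morpheme breakdown. The "small"
#  # call adds no extra tokens (target_vocab_size = 0); the "large" call
#  # targets a vocabulary of about 30,000 tokens.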

## ----make-lookups-------------------------------------------------------------
#  # TODO: Make this save-and-reload round trip unnecessary!
#  text_lookup_small <- .make_text_lookup(
#    voc = vandl_small$vocab,
#    lu = vandl_small$lookup,
#    word_freq_tab = word_frequency_table
#  )
#  
#  writeLines(
#    text_lookup_small,
#    file.path(morphemepiece_cache_dir(), "mp_lookup_small.txt")
#  )
#  # vocab is already just a character vector
#  writeLines(
#    vandl_small$vocab,
#    file.path(morphemepiece_cache_dir(), "mp_vocab_small.txt")
#  )
#  
#  # Now do the large vocabulary.
#  text_lookup_large <- .make_text_lookup(
#    voc = vandl_large$vocab,
#    lu = vandl_large$lookup,
#    word_freq_tab = word_frequency_table
#  )
#  
#  writeLines(
#    text_lookup_large,
#    file.path(morphemepiece_cache_dir(), "mp_lookup_large.txt")
#  )
#  # vocab is already just a character vector
#  writeLines(
#    vandl_large$vocab,
#    file.path(morphemepiece_cache_dir(), "mp_vocab_large.txt")
#  )
#  
#  # Read back from text files to process as standard {morphemepiece} files:
#  vocab <- load_or_retrieve_vocab(
#    file.path(morphemepiece_cache_dir(), "mp_vocab_large.txt")
#  )
#  lookup <- load_or_retrieve_lookup(
#    file.path(morphemepiece_cache_dir(), "mp_lookup_large.txt")
#  )
#  
#  morphemepiece_tokenize("Surprisingly easy", vocab, lookup)
#  morphemepiece_tokenize("'Twas brillig, and the slithy toves", vocab, lookup)

## ----check-coverage-small-----------------------------------------------------
#  corpus_coverage_small <- dplyr::left_join(
#    word_frequency_table,
#    vandl_small$lookup,
#    by = "word"
#  ) %>%
#    dplyr::mutate(covered_lookup = !is.na(tokenization)) %>%
#    # not every word in the vocab is in the lookup; check vocab too
#    dplyr::mutate(covered_vocab = word %in% vandl_small$vocab) %>%
#    dplyr::mutate(covered = covered_lookup | covered_vocab) %>%
#    dplyr::mutate(covered_weighted = covered*word_count)
#  
#  corpus_coverage_small %>%
#    dplyr::summarize(sum(covered_weighted)/sum(word_count))

## ----check-coverage-large-----------------------------------------------------
#  corpus_coverage_large <- dplyr::left_join(
#    word_frequency_table,
#    vandl_large$lookup,
#    by = "word"
#  ) %>%
#    dplyr::mutate(covered_lookup = !is.na(tokenization)) %>%
#    # not every word in the vocab is in the lookup; check vocab too
#    dplyr::mutate(covered_vocab = word %in% vandl_large$vocab) %>%
#    dplyr::mutate(covered = covered_lookup | covered_vocab) %>%
#    dplyr::mutate(covered_weighted = covered*word_count)
#  
#  corpus_coverage_large %>%
#    dplyr::summarize(sum(covered_weighted)/sum(word_count))

## ----show-coverage-large------------------------------------------------------
#  # Large vocab: look at the most frequent words that are not covered.
#  uncovered <- corpus_coverage_large %>%
#    dplyr::filter(!.data$covered) %>%
#    dplyr::arrange(dplyr::desc(.data$word_count)) %>%
#    dplyr::select(.data$word, .data$word_count) %>%
#    head(100) %>%
#    dplyr::mutate(
#      tokenization = morphemepiece_tokenize(
#        .data$word,
#        vocab = vocab,
#        lookup = lookup
#      )
#    ) %>%
#    dplyr::rowwise() %>%
#    dplyr::mutate(
#      tokenization = paste(names(.data$tokenization), collapse = " ")
#    ) %>%
#    dplyr::ungroup()
#  
#  head(uncovered, 10)

## ----xkcd, eval = FALSE-------------------------------------------------------
#  # just for fun :-D
#  xkcd_words_url <- "https://xkcd.com/simplewriter/words.js"
#  raw_words <- readr::read_lines(xkcd_words_url)
#  
#  raw_words <- raw_words[grepl("WORDS", raw_words)]
#  raw_words <- stringr::str_split(raw_words, '"')[[1]]
#  raw_words <- raw_words[grepl("\\|", raw_words)]
#  words <- dplyr::tibble(top_words = stringr::str_split(raw_words, "\\|")[[1]])
#  # I feel lied to. There are more than 3k words in this list.
#  words <- words %>%
#    dplyr::mutate(
#      tokenized = morphemepiece_tokenize(.data$top_words, vocab, lookup)
#    ) %>%
#    dplyr::rowwise() %>%
#    dplyr::mutate(
#      tokenized = paste(names(.data$tokenized), collapse = " ")
#    ) %>%
#    dplyr::ungroup()
#  
#  # ss_url <- "url of mp_scratch google sheet"
#  
#  # already authorized
#  # googlesheets4::write_sheet(words, ss_url)
#  # manual check, add column "is_ok"
#  
#  # checked_words <- googlesheets4::read_sheet(ss_url, sheet = "check common words")
#  
#  # if breakdown is ok, value is "y"
#  
#  # mean(checked_words$is_ok == "y")
#  
#  # [1] 0.9711062
#  # many of the exceptions can/should be fixed in wiktionary
#  # These have not been checked in a while.

## ----more-checks, eval = FALSE------------------------------------------------
#  all_words <- unique(unnested_lookup$word)
#  unbroken_vocab_words <- intersect(names(vocab), all_words)
#  unbroken_vocab_words <- unbroken_vocab_words[nchar(unbroken_vocab_words) > 3]
#  
#  # sample a few hundred
#  unbroken_sample <- dplyr::tibble(
#    unbroken_word = sample(unbroken_vocab_words, size = 300)
#  )
#  # send to google sheet for manual check
#  
#  # googlesheets4::write_sheet(unbroken_sample, ss_url, sheet = "unbroken_check")
#  # manual check, add column "is_ok"
#  # checked_unbroken_words <- googlesheets4::read_sheet(
#  #   ss_url,
#  #   sheet = "check unbroken words"
#  # )
#  
#  # if breakdown is ok, value is "y"
#  # table(checked_unbroken_words$is_ok)
#   #  ?   n   y
#   # 24  32 244
#  # many of the exceptions can/should be fixed in wiktionary
