This R notebook makes use of N-Grams and the ngram
package to model the R code dataset from this repo.
We will first load the R code dataset,
# Attach magrittr so the `%>%` pipe is available to the chunks below.
library(magrittr)

# Retrieve the "rmds" pin, using the raw GitHub datasets URL as the board.
code <- pins::pin_get(
  name = "rmds",
  board = "https://raw.githubusercontent.com/javierluraschi/rmds/datasets/"
)
To reduce the computing resources required, you can start with a subset of the data; otherwise, skip this line,
# code <- code[1:10,]
First, let's split the data into training and test sets,
# Randomly label every row of `code` with 1 (probability 0.90) or 2
# (probability 0.10); rows labelled 1 form the training set and rows
# labelled 2 form the test set.
index <- sample(
  1:2,
  size = nrow(code),
  replace = TRUE,
  prob = c(0.90, 0.10)
)

# `index` only ever contains 1 or 2, so the negated mask selects the 2s.
is_train <- index == 1
train <- code[is_train, ]
test <- code[!is_train, ]
We will then manually tokenize the code in a format usable by the ngram
package,
# Turn a character vector of source files into one space-delimited token
# string.
#
# The elements of `code` are joined with " <eof> " between them, the result
# is wrapped in a leading " <sof> " and a trailing " <eof> ", newlines become
# " <eol> ", tabs become a single space, and each of the symbols
# , ( ) [ ] $ :: = is surrounded by spaces so that it splits off as a token
# of its own.
#
# Returns a length-one character vector.
code_tokenize <- function(code) {
  joined <- paste(code, collapse = " <eof> ")
  text <- paste0(" <sof> ", joined, " <eof> ")

  text <- gsub("\n", " <eol> ", text, fixed = TRUE)
  text <- gsub("\t", " ", text, fixed = TRUE)

  # Applied in the same order as the individual substitutions they replace.
  for (symbol in c(",", "(", ")", "[", "]", "$", "::", "=")) {
    text <- gsub(symbol, paste0(" ", symbol, " "), text, fixed = TRUE)
  }

  text
}

# Tokenize the training and test splits separately.
train_code <- code_tokenize(train$code)
test_code <- code_tokenize(test$code)
To validate that this predicts properly, we used the following check,
# train_code <- test_code <- "a b c d e f g h i j k l m n o p q r s t u v w x y z"
We build the 3-Gram,
# Number of tokens per n-gram.
ngram_size <- 3

# Build the n-gram model from the tokenized training text, splitting on spaces.
ngram_model <- ngram::ngram(train_code, n = ngram_size, sep = " ")

# Show the model as a separate step after it has been assigned.
print(ngram_model)
And retrieve the phrase table,
# Extract the model's phrase table and convert it to a tibble.
ngram_table <- tibble::as_tibble(ngram::get.phrasetable(ngram_model))

# Show the table as a separate step after it has been assigned.
print(ngram_table)
Then we validate against the test dataset,
# Evaluate next-token prediction on the test text. For every window of
# `ngram_size` consecutive test tokens, take the first phrase-table entry that
# starts with the leading (ngram_size - 1) tokens and count the check as
# correct when that entry equals the whole window.
# NOTE(review): taking the first match as the "best" prediction assumes the
# phrase table is ordered most-frequent first -- confirm against ngram docs.
test_tokens <- strsplit(test_code, " ")[[1]]
test_tokens <- test_tokens[test_tokens != ""]  # drop empties left by repeated spaces

context_size <- ngram_size - 1

# A window starting at i spans tokens i .. i + ngram_size - 1, so the last
# valid start is length - ngram_size + 1. Clamp at 0 so that a token stream
# shorter than one window produces no checks instead of a backwards sequence.
checks_count <- max(length(test_tokens) - ngram_size + 1, 0)
checks_done <- 0
match_count <- 0

# Character vectors (not NULL) so that empty results still have a type.
token_incorrect <- character()
token_correct <- character()

time_limit_secs <- 60 * 5
start_time <- Sys.time()

for (i in seq_len(checks_count)) {
  # Cap test time to 5 minutes
  if (Sys.time() > start_time + time_limit_secs) break

  context <- test_tokens[i:(i + context_size - 1)]

  # The trailing space ends the prefix on a token boundary, so a context
  # ending in "ba" cannot match an n-gram whose token there is "bar".
  current_text <- paste0(paste(context, collapse = " "), " ")

  matches <- stringr::str_starts(ngram_table$ngrams, stringr::fixed(current_text))
  candidates <- ngram_table$ngrams[matches]
  best <- if (length(candidates) == 0) "<na>" else candidates[1]

  # Expected entry: context, the actual next token, and a trailing space.
  correct_match <- paste0(current_text, test_tokens[i + context_size], " ")

  checks_done <- checks_done + 1

  if (correct_match == best) {
    match_count <- match_count + 1
    # Keep at most 1000 examples of each kind.
    if (length(token_correct) < 1000) {
      token_correct <- c(token_correct, correct_match)
    }
  } else {
    # Records the model's (wrong) prediction, or "<na>" when nothing matched.
    if (length(token_incorrect) < 1000) {
      token_incorrect <- c(token_incorrect, best)
    }
  }
}

# Report against the checks actually performed: the time cap may end the loop
# before all `checks_count` windows have been evaluated.
incorrect_count <- checks_done - match_count
accuracy <- if (checks_done > 0) match_count / checks_done else NA_real_

message("Correct: ", match_count, "\n",
        "Incorrect: ", incorrect_count, "\n",
        "Accuracy: ", accuracy)
A few examples of correct completions,
# Display the collected correct completions (the validation loop keeps at most 1000) as a one-column tibble.
tibble::tibble(correct = token_correct)
A few examples of incorrect completions,
# Display the collected incorrect completions (at most 1000) as a one-column tibble.
# Each entry is the n-gram the model ranked first, or "<na>" when no n-gram matched the context.
tibble::tibble(incorrect = token_incorrect)
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.