This R notebook makes use of N-Grams and the ngram package to model the R code dataset from this repo.

We will first load the R code dataset,

library(magrittr)

# Fetch the "rmds" pin (the R code dataset) from the board at the given URL.
code <- pins::pin_get(
  name = "rmds",
  board = "https://raw.githubusercontent.com/javierluraschi/rmds/datasets/"
)

To reduce computational cost, you can start with a small subset of the data by uncommenting the following line; otherwise, skip it,

# code <- code[1:10,]

First, let's split the data into training and test sets,

# Draw a partition label for every row: 1 with probability 0.9 and 2 with
# probability 0.1. No seed is set here, so the split differs between runs.
index <- sample(
  1:2,
  size = nrow(code),
  replace = TRUE,
  prob = c(0.9, 0.1)
)

# Rows labelled 1 form the training set; rows labelled 2 form the test set.
train <- code[index == 1L, ]
test <- code[index == 2L, ]

We will then manually tokenize the code in a format usable by the ngram package,

# Tokenize R source files into one space-delimited string for the ngram package.
#
# Every element of `code` is treated as one file and wrapped as
# " <sof> ... <eof> ". Newlines become the token "<eol>", tabs become spaces,
# and the symbols , ( ) [ ] $ :: = are padded with spaces so that each one
# splits into its own token.
#
# @param code Character vector of file contents, one file per element.
# @return A single string of space-separated tokens.
code_tokenize <- function(code) {
  # Mark the start of every file, not only the first one, so that n-grams at
  # the beginning of any file carry the <sof> context.
  code <- paste0(" <sof> ", code, " <eof> ", collapse = "")

  code <- gsub("\n", " <eol> ", code, fixed = TRUE)
  code <- gsub("\t", " ", code, fixed = TRUE)

  # Surround each symbol with spaces; all patterns are literal (fixed = TRUE).
  for (symbol in c(",", "(", ")", "[", "]", "$", "::", "=")) {
    code <- gsub(symbol, paste0(" ", symbol, " "), code, fixed = TRUE)
  }

  code
}

# Tokenize the `code` column of each partition into a single token string.
train_code <- train$code %>% code_tokenize()
test_code <- test$code %>% code_tokenize()

To validate that the model predicts properly, we used the following sanity check on a fully predictable sequence,

# train_code <- test_code <- "a b c d e f g h i j k l m n o p q r s t u v w x y z"

We build the 3-gram model,

# Order of the model: every n-gram is `ngram_size` tokens long.
ngram_size <- 3

# Build the model first and print it afterwards. Piping the model into print()
# before assigning would store whatever print() returns rather than the model.
ngram_model <- ngram::ngram(train_code, n = ngram_size, sep = " ")
print(ngram_model)

And retrieve the phrase table,

# Extract the phrase table from the model as a tibble, then display it.
ngram_table <- tibble::as_tibble(ngram::get.phrasetable(ngram_model))
print(ngram_table)

Then we validate against the test dataset,

# Evaluate next-token prediction on the test tokens. For every window of
# `ngram_size` consecutive tokens, the first `ngram_size - 1` tokens are the
# context and the last token is the one the model should predict.
test_tokens <- strsplit(test_code, " ", fixed = TRUE)[[1]]
test_tokens <- test_tokens[test_tokens != ""]

# Offsets of the context tokens relative to the start of a window.
context_offsets <- seq_len(ngram_size - 1) - 1

# Number of complete windows in the test stream (0 when it is too short).
checks_count <- max(length(test_tokens) - ngram_size + 1, 0)
# Windows actually evaluated; smaller than checks_count if the time cap hits.
checks_done <- 0
match_count <- 0

# Keep at most this many example completions of each kind.
max_examples <- 1000
token_incorrect <- character(0)
token_correct <- character(0)

# Hoisted out of the loop; as.character() guards against a factor column.
known_ngrams <- as.character(ngram_table$ngrams)

start_time <- Sys.time()

for (i in seq_len(checks_count)) {
  # Cap test time to 5 minutes
  if (Sys.time() > start_time + 60 * 5) break
  checks_done <- checks_done + 1

  # Appending "" before collapsing yields a trailing space, so the prefix only
  # matches whole tokens (context "a b " cannot match the n-gram "a bc d ").
  current_text <- paste(
    c(test_tokens[i + context_offsets], ""),
    collapse = " "
  )

  # Predict with the first phrase-table entry that starts with the context.
  # NOTE(review): assumes the phrase table is ordered by decreasing
  # frequency, so the first match is the most likely completion; confirm.
  candidates <- known_ngrams[startsWith(known_ngrams, current_text)]
  best <- if (length(candidates) == 0) "<na>" else candidates[1]

  # Phrase-table n-grams are expected to end with a trailing space, so the
  # reference string is built the same way.
  correct_match <- paste0(current_text, test_tokens[i + ngram_size - 1], " ")

  if (identical(best, correct_match)) {
    match_count <- match_count + 1

    if (length(token_correct) < max_examples) {
      token_correct <- c(token_correct, correct_match)
    }
  } else if (length(token_incorrect) < max_examples) {
    token_incorrect <- c(token_incorrect, best)
  }
}

# Report against the windows actually evaluated, not the planned total.
accuracy <- if (checks_done > 0) match_count / checks_done else NA_real_

message("Correct: ", match_count, "\n",
        "Incorrect: ", checks_done - match_count, "\n",
        "Accuracy: ", accuracy)

A few examples of correct completions,

# Show the collected correct completions as a one-column tibble.
token_correct %>% tibble::tibble(correct = .)

A few examples of incorrect completions,

# Show the collected incorrect completions as a one-column tibble.
token_incorrect %>% tibble::tibble(incorrect = .)


javierluraschi/rmds documentation built on Dec. 8, 2019, 12:44 p.m.