# textwhiz: Support tools for Text Analytics

# tune_fasttext.R function Tests ------------------------

# dummy data set up -------------------------
# Hyper-parameter grid used by the tune_fasttext() runs in this script.
# Each seq() step is larger than its range for lr, epoch and dim, so those
# collapse to a single value; only `ws` contributes two levels (4 and 6).
# `loss` and `n_sample_negatives` are intentionally left out of the grid.
fast.text.parameters <- do.call(
  expand.grid,
  list(
    lr         = seq(from = 4, to = 4.3, by = 0.5),
    epoch      = seq(from = 30, to = 33, by = 10),
    dim        = seq(from = 100, to = 120, by = 25),
    ws         = seq(from = 4, to = 6, by = 2),
    wordNgrams = 2,
    minn       = 2,
    maxn       = 6
  )
)

# Toy labelled corpus: 23 rows built from six distinct sentences, three per
# topic ("nfl" and "soccer"). Columns are text_id (1..23), text and labels.
# The sentences are assembled inside local() so no helper objects are left
# behind in the calling environment.
df <- local({
  nfl_text <- c(
    "the seahawks are my favorite team.",
    "russel wilson should be an mvp",
    "seattle are superbowl champions"
  )
  soccer_text <- c(
    "arsenal is the champion of the fa cup.",
    "thierry henry was the best arsenal player",
    "arsen wegner was the best manager."
  )

  data.frame(
    text_id = seq_len(23L),
    text = c(
      nfl_text, nfl_text,               # rows 1-6
      soccer_text, soccer_text,         # rows 7-12
      nfl_text[c(2L, 3L, 1L, 2L, 3L)],  # rows 13-17 (only five nfl rows here)
      soccer_text, soccer_text          # rows 18-23
    ),
    labels = rep(
      c("nfl", "soccer", "nfl", "soccer"),
      times = c(6L, 6L, 5L, 6L)
    )
  )
})

# tune_fasttext testing -----------------
# `missing_dim` was referenced below without being defined anywhere in this
# script, which makes the call fail with "object 'missing_dim' not found".
# NOTE(review): judging by its name, it is presumably meant to be a parameter
# grid without the `dim` column -- confirm. It is derived here from the
# tuning grid so the call can run.
missing_dim <- fast.text.parameters[
  , setdiff(names(fast.text.parameters), "dim"), drop = FALSE
]

# Run tune_fasttext() with k = 5 on the dummy corpus using the grid that
# lacks `dim`, with parallel = FALSE.
foo <- textwhiz::tune_fasttext(
  k = 5,
  text = df$text,
  label = df$labels,
  parameters = missing_dim,
  seed = 123,
  text_ids = df$text_id,
  parallel = FALSE # spelled out: `F` is an ordinary, reassignable variable
)

# Time a tune_fasttext() run (k = 3) over the full dummy grid with
# parallel = TRUE. `TRUE` is spelled out because `T` is an ordinary,
# reassignable variable.
system.time({
  textwhiz::tune_fasttext(
    k = 3,
    text = df$text,
    label = df$labels,
    parameters = fast.text.parameters,
    seed = 123,
    text_ids = df$text_id,
    parallel = TRUE
  )
})

# Time a tune_fasttext() run (k = 3) over the full dummy grid with
# parallel = FALSE. `FALSE` is spelled out because `F` is an ordinary,
# reassignable variable.
system.time({
  textwhiz::tune_fasttext(
    k = 3,
    text = df$text,
    label = df$labels,
    parameters = fast.text.parameters,
    seed = 123,
    text_ids = df$text_id,
    parallel = FALSE
  )
})



# test forensic_fasttext ----------

# Single-row parameter grid (one value per parameter) passed to
# forensic_fasttext() further down. `loss` and `n_sample_negatives` are
# intentionally left out.
fst.txt <- do.call(
  expand.grid,
  list(
    lr         = 4.3,
    epoch      = 30,
    dim        = 120,
    ws         = 6,
    wordNgrams = 2,
    minn       = 2,
    maxn       = 6
  )
)

# `missing_dim` was referenced below without being defined anywhere in this
# script, which makes the call fail with "object 'missing_dim' not found".
# NOTE(review): judging by its name, it is presumably meant to be a parameter
# grid without the `dim` column -- confirm. It is derived here from the
# single-row grid so the call can run.
missing_dim <- fst.txt[, setdiff(names(fst.txt), "dim"), drop = FALSE]

# Run forensic_fasttext() with k = 3 on the dummy corpus using the grid that
# lacks `dim`, then pull two components out of the returned object.
foo <- textwhiz::forensic_fasttext(
  k = 3,
  text = df$text,
  label = df$labels,
  parameters = missing_dim,
  seed = 123,
  text_ids = df$text_id
)
incorrect <- foo$incorrect
topic_m <- foo$topic.metrics


# Time a forensic_fasttext() run (k = 3) on the dummy corpus using the
# single-row `fst.txt` grid.
system.time(
  textwhiz::forensic_fasttext(
    k = 3,
    text = df$text,
    label = df$labels,
    parameters = fst.txt,
    seed = 123,
    text_ids = df$text_id
  )
)