inst/doc/Text_classification.R

## ----eval=FALSE---------------------------------------------------------------
#  options(width=100L)
#  fn <- "dbpedia_csv.tar.gz"
#  
#  if ( !file.exists(fn) ) {
#      download.file("https://github.com/le-scientifique/torchDatasets/raw/master/dbpedia_csv.tar.gz",
#                    fn)
#      untar(fn)
#  }

## ----eval=FALSE---------------------------------------------------------------
#  library("fastTextR")
#  
#  train <- sample(sprintf("__label__%s", readLines("dbpedia_csv/train.csv")))
#  head(train, 2)

## ----eval=FALSE---------------------------------------------------------------
#  train <- ft_normalize(train)
#  writeLines(train, con = "dbpedia.train")
#  
#  test <- readLines("dbpedia_csv/test.csv")
#  labels <- trimws(gsub(",.*", "", test))
#  table(labels)

## ----eval=FALSE---------------------------------------------------------------
#  test <- ft_normalize(test)
#  test <- trimws(sub(".*?,", "", test))
#  head(test, 2)

## ----eval=FALSE---------------------------------------------------------------
#  cntrl <- ft_control(word_vec_size = 10L, learning_rate = 0.1, max_len_ngram = 2L,
#                      min_count = 1L, nbuckets = 10000000L, epoch = 5L, nthreads = 4L)
#  
#  model <- ft_train(file = "dbpedia.train", method = "supervised", control = cntrl)
#  ft_save(model, "dbpedia.bin")

## ----eval=FALSE---------------------------------------------------------------
#  model <- ft_load("dbpedia.bin")

## ----eval=FALSE---------------------------------------------------------------
#  test_pred <- ft_predict(model, newdata=test, k = 1L)
#  str(test_pred)

## ----eval=FALSE---------------------------------------------------------------
#  confusion_matrix <- table(truth=as.integer(labels),
#                            predicted=as.integer(gsub("\\D", "", test_pred$label)))
#  print(confusion_matrix)

## ----eval=FALSE---------------------------------------------------------------
#  accuracy <- sum(diag(confusion_matrix)) / sum(confusion_matrix)
#  print(sprintf("Accuracy: %0.4f", accuracy))

Try the fastTextR package in your browser

Any scripts or data that you put into this service are public.

fastTextR documentation built on June 22, 2024, 7:16 p.m.