inst/doc/udpipe-train.R

## ----setup, include=FALSE, cache=FALSE------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
options(width = 1000)
knitr::opts_chunk$set(echo = TRUE, message = FALSE, comment = NA, eval = TRUE)

## -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
file_conllu <- system.file(package = "udpipe", "dummydata", "traindata.conllu")
file_conllu
cat(head(readLines(file_conllu), 3), sep="\n")

## -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
library(udpipe)
m <- udpipe_train(file = "toymodel.udpipe", files_conllu_training = file_conllu, 
                  annotation_tokenizer = list(dimension = 16, 
                                              epochs = 1, 
                                              batch_size = 100, 
                                              dropout = 0.7),
                  annotation_tagger = list(iterations = 1, 
                                           models = 1, 
                                           provide_xpostag = 1, 
                                           provide_lemma = 0, 
                                           provide_feats = 0), 
                  annotation_parser = "none")
m$file_model

## The model is now trained and saved in file toymodel.udpipe in the current working directory
## Now we can use the model to annotate some text
mymodel <- udpipe_load_model("toymodel.udpipe")
x <- udpipe_annotate(
  object = mymodel, 
  x = "Dit is een tokenizer met POS tagging, 
       zonder lemmatisation noch laat deze dependency parsing toe.", 
  parser = "none")
str(as.data.frame(x))

## ---- eval=FALSE----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
#  m <- udpipe_train(file = "toymodel.udpipe", files_conllu_training = file_conllu,
#                    annotation_tokenizer = "default",
#                    annotation_tagger = "default",
#                    annotation_parser = "default")

## -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
params <- list()

## Tokenizer training parameters
params$tokenizer <- list(dimension = 24, 
                         epochs = 1, #epochs = 100, 
                         initialization_range = 0.1, 
                         batch_size = 100, learning_rate = 0.005, 
                         dropout = 0.1, early_stopping = 1)

## Tagger training parameters
params$tagger <- list(models = 2, 
  templates_1 = "tagger", 
      guesser_suffix_rules_1 = 8, guesser_enrich_dictionary_1 = 6, 
      guesser_prefixes_max_1 = 0, 
      use_lemma_1 = 0, use_xpostag_1 = 1, use_feats_1 = 1, 
      provide_lemma_1 = 0, provide_xpostag_1 = 1, 
      provide_feats_1 = 1, prune_features_1 = 0, 
  templates_2 = "lemmatizer", 
      guesser_suffix_rules_2 = 6, guesser_enrich_dictionary_2 = 4, 
      guesser_prefixes_max_2 = 4, 
      use_lemma_2 = 1, use_xpostag_2 = 0, use_feats_2 = 0, 
      provide_lemma_2 = 1, provide_xpostag_2 = 0, 
      provide_feats_2 = 0, prune_features_2 = 0)

## Dependency parser training parameters
params$parser <- list(iterations = 1, 
  #iterations = 30, 
  embedding_upostag = 20, embedding_feats = 20, embedding_xpostag = 0, 
  embedding_form = 50, 
  #embedding_form_file = "../ud-2.0-embeddings/nl.skip.forms.50.vectors", 
  embedding_lemma = 0, embedding_deprel = 20, 
  learning_rate = 0.01, learning_rate_final = 0.001, l2 = 0.5, hidden_layer = 200, 
  batch_size = 10, transition_system = "projective", transition_oracle = "dynamic", 
  structured_interval = 10)

## Train the model
m <- udpipe_train(file = "toymodel.udpipe", 
                  files_conllu_training = file_conllu, 
                  annotation_tokenizer = params$tokenizer,
                  annotation_tagger = params$tagger,
                  annotation_parser = params$parser)

## -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
data(udpipe_annotation_params)
str(udpipe_annotation_params$tokenizer)
## Example for training the tokenizer on the Dutch treebank
hyperparams_nl <- subset(udpipe_annotation_params$tokenizer, language_treebank == "nl")
as.list(hyperparams_nl)

## -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
## Example for training the tagger on the Dutch treebank
hyperparams_nl <- subset(udpipe_annotation_params$tagger, language_treebank == "nl")
as.list(hyperparams_nl)

## -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
## Example for training the dependency parser on the Dutch treebank
hyperparams_nl <- subset(udpipe_annotation_params$parser, language_treebank == "nl")
as.list(hyperparams_nl)

## ---- eval=FALSE----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
#  library(utils)
#  library(udpipe)
#  library(word2vec)
#  
#  ## Work on data from Universal Dependencies - German GSD treebank
#  settings <- list()
#  settings$ud.train    <- "https://raw.githubusercontent.com/UniversalDependencies/UD_German-GSD/r2.6/de_gsd-ud-train.conllu"
#  settings$ud.dev      <- "https://raw.githubusercontent.com/UniversalDependencies/UD_German-GSD/r2.6/de_gsd-ud-dev.conllu"
#  settings$ud.test     <- "https://raw.githubusercontent.com/UniversalDependencies/UD_German-GSD/r2.6/de_gsd-ud-test.conllu"
#  
#  ## Download the conllu files
#  download.file(url = settings$ud.train, destfile = "train.conllu")
#  download.file(url = settings$ud.dev,   destfile = "dev.conllu")
#  download.file(url = settings$ud.test,  destfile = "test.conllu")
#  
#  ## Create wordvectors as these are used for training the dependency parser + save the word vectors to disk
#  x <- udpipe_read_conllu("train.conllu")
#  x <- paste.data.frame(x, term = "token", group = c("doc_id", "paragraph_id", "sentence_id"), collapse = " ")
#  x <- x$token
#  writeLines(x, con = file("text.txt", encoding = "UTF-8", open = "wt"))
#  w2v <- word2vec("text.txt", type = "skip-gram", dim = 50, window = 10, min_count = 2, negative = 5, iter = 15, threads = 1)
#  write.word2vec(w2v, file = "wordvectors.vec", type = "txt", encoding = "UTF-8")
#  predict(w2v, c("gut", "freundlich"), type = "nearest", top = 20)
#  
#  ## Train the model
#  print(Sys.time())
#  m <- udpipe_train(file = "de_gsd-ud-2.6-20200924.udpipe",
#                    files_conllu_training = "train.conllu",
#                    files_conllu_holdout  = "dev.conllu",
#                    annotation_tokenizer = list(dimension = 64, epochs = 100, segment_size=200, initialization_range = 0.1,
#                                                batch_size = 50, learning_rate = 0.002, learning_rate_final=0, dropout = 0.1, early_stopping = 1),
#                    annotation_tagger = list(models = 2,
#                                             templates_1 = "lemmatizer", guesser_suffix_rules_1 = 8, guesser_enrich_dictionary_1 = 4, guesser_prefixes_max_1 = 4,
#                                             use_lemma_1 = 1,provide_lemma_1 = 1, use_xpostag_1 = 0, provide_xpostag_1 = 0,
#                                             use_feats_1 = 0, provide_feats_1 = 0, prune_features_1 = 1,
#                                             templates_2 = "tagger", guesser_suffix_rules_2 = 8, guesser_enrich_dictionary_2 = 4, guesser_prefixes_max_2 = 0,
#                                             use_lemma_2 = 1, provide_lemma_2 = 0, use_xpostag_2 = 1, provide_xpostag_2 = 1,
#                                             use_feats_2 = 1, provide_feats_2 = 1, prune_features_2 = 1),
#                    annotation_parser = list(iterations = 30, embedding_upostag = 20, embedding_feats = 20, embedding_xpostag = 0,
#                                             embedding_form = 50, embedding_form_file = "wordvectors.vec",
#                                             embedding_lemma = 0, embedding_deprel = 20, learning_rate = 0.01,
#                                             learning_rate_final = 0.001, l2 = 0.5, hidden_layer = 200,
#                                             batch_size = 10, transition_system = "projective", transition_oracle = "dynamic",
#                                             structured_interval = 8))
#  print(Sys.time())
#  
#  ## Evaluate the accuracy
#  m <- udpipe_load_model("de_gsd-ud-2.6-20200924.udpipe")
#  goodness_of_fit <- udpipe_accuracy(m, "test.conllu", tokenizer = "default", tagger = "default", parser = "default")
#  cat(goodness_of_fit$accuracy, sep = "\n")
#  goodness_of_fit <- udpipe_accuracy(m, "test.conllu", tokenizer = "none", tagger = "default", parser = "default")
#  cat(goodness_of_fit$accuracy, sep = "\n")
#  goodness_of_fit <- udpipe_accuracy(m, "test.conllu", tokenizer = "none", tagger = "none", parser = "default")
#  cat(goodness_of_fit$accuracy, sep = "\n")

## ---- results='hide', echo=FALSE------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
invisible(file.remove(c("toymodel.udpipe")))

Try the udpipe package in your browser

Any scripts or data that you put into this service are public.

udpipe documentation built on Jan. 6, 2023, 5:06 p.m.