## (stray page-extraction artifact removed; R code starts below)
## ----setup, include=FALSE, cache=FALSE------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
## Vignette setup: widen the console so wide data.frames print without wrapping
options(width = 1000)
## Default knitr chunk options: show code, suppress messages, no output prefix
knitr::opts_chunk$set(echo = TRUE, message = FALSE, comment = NA, eval = TRUE)
## -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
## Path to the toy CoNLL-U training data shipped with the udpipe package
file_conllu <- system.file(package = "udpipe", "dummydata", "traindata.conllu")
file_conllu
## Show the first 3 lines of the CoNLL-U file
first_lines <- head(readLines(file_conllu), 3)
cat(first_lines, sep = "\n")
## -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
library(udpipe)
## Train a toy model: tokenizer + tagger only, no dependency parser.
## The hyperparameters are deliberately tiny so the example runs fast;
## they are NOT realistic settings for a production model.
tokenizer_opts <- list(dimension = 16,
                       epochs = 1,
                       batch_size = 100,
                       dropout = 0.7)
tagger_opts <- list(iterations = 1,
                    models = 1,
                    provide_xpostag = 1,
                    provide_lemma = 0,
                    provide_feats = 0)
m <- udpipe_train(file = "toymodel.udpipe",
                  files_conllu_training = file_conllu,
                  annotation_tokenizer = tokenizer_opts,
                  annotation_tagger = tagger_opts,
                  annotation_parser = "none")
## Path of the trained model file on disk
m$file_model
## The model is now trained and saved in file toymodel.udpipe in the current working directory
## Now we can use the model to annotate some text
mymodel <- udpipe_load_model("toymodel.udpipe")
## Tokenise and POS-tag a Dutch sentence; dependency parsing is skipped
## (parser = "none"), matching how the toy model was trained above.
## The text literal intentionally spans two lines (contains a newline).
x <- udpipe_annotate(
object = mymodel,
x = "Dit is een tokenizer met POS tagging,
zonder lemmatisation noch laat deze dependency parsing toe.",
parser = "none")
## Inspect the annotation converted to a data.frame
str(as.data.frame(x))
## ---- eval=FALSE----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
# m <- udpipe_train(file = "toymodel.udpipe", files_conllu_training = file_conllu,
# annotation_tokenizer = "default",
# annotation_tagger = "default",
# annotation_parser = "default")
## -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
## Assemble the full set of training hyperparameters in one list
params <- list()
## Tokenizer training parameters
## NOTE(review): epochs = 1 only keeps this example fast; the commented
## alternative (epochs = 100) is the more realistic setting
params$tokenizer <- list(dimension = 24,
epochs = 1, #epochs = 100,
initialization_range = 0.1,
batch_size = 100, learning_rate = 0.005,
dropout = 0.1, early_stopping = 1)
## Tagger training parameters
## Two models are configured: model 1 uses "tagger" templates and provides
## xpostag/feats, model 2 uses "lemmatizer" templates and provides the lemma
params$tagger <- list(models = 2,
templates_1 = "tagger",
guesser_suffix_rules_1 = 8, guesser_enrich_dictionary_1 = 6,
guesser_prefixes_max_1 = 0,
use_lemma_1 = 0, use_xpostag_1 = 1, use_feats_1 = 1,
provide_lemma_1 = 0, provide_xpostag_1 = 1,
provide_feats_1 = 1, prune_features_1 = 0,
templates_2 = "lemmatizer",
guesser_suffix_rules_2 = 6, guesser_enrich_dictionary_2 = 4,
guesser_prefixes_max_2 = 4,
use_lemma_2 = 1, use_xpostag_2 = 0, use_feats_2 = 0,
provide_lemma_2 = 1, provide_xpostag_2 = 0,
provide_feats_2 = 0, prune_features_2 = 0)
## Dependency parser training parameters
## iterations = 1 again for speed; embedding_form_file (commented out) would
## supply pre-trained word vectors for the form embeddings
params$parser <- list(iterations = 1,
#iterations = 30,
embedding_upostag = 20, embedding_feats = 20, embedding_xpostag = 0,
embedding_form = 50,
#embedding_form_file = "../ud-2.0-embeddings/nl.skip.forms.50.vectors",
embedding_lemma = 0, embedding_deprel = 20,
learning_rate = 0.01, learning_rate_final = 0.001, l2 = 0.5, hidden_layer = 200,
batch_size = 10, transition_system = "projective", transition_oracle = "dynamic",
structured_interval = 10)
## Train the model
## Tokenizer + tagger + parser are all trained, using the lists built above
m <- udpipe_train(file = "toymodel.udpipe",
files_conllu_training = file_conllu,
annotation_tokenizer = params$tokenizer,
annotation_tagger = params$tagger,
annotation_parser = params$parser)
## -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
## Hyperparameter settings shipped as a dataset with the udpipe package,
## with one entry per language_treebank for tokenizer, tagger and parser
data(udpipe_annotation_params)
str(udpipe_annotation_params$tokenizer)
## Example for training the tokenizer on the Dutch treebank
hyperparams_nl <- subset(udpipe_annotation_params$tokenizer, language_treebank == "nl")
as.list(hyperparams_nl)
## -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
## Example for training the tagger on the Dutch treebank
hyperparams_nl <- subset(udpipe_annotation_params$tagger, language_treebank == "nl")
as.list(hyperparams_nl)
## ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
## Example for training the dependency parser on the Dutch treebank
hyperparams_nl <- subset(udpipe_annotation_params$parser, language_treebank == "nl")
as.list(hyperparams_nl)
## ---- eval=FALSE----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
# library(utils)
# library(udpipe)
# library(word2vec)
#
# ## Work on data from Universal Dependencies - German GSD treebank
# settings <- list()
# settings$ud.train <- "https://raw.githubusercontent.com/UniversalDependencies/UD_German-GSD/r2.6/de_gsd-ud-train.conllu"
# settings$ud.dev <- "https://raw.githubusercontent.com/UniversalDependencies/UD_German-GSD/r2.6/de_gsd-ud-dev.conllu"
# settings$ud.test <- "https://raw.githubusercontent.com/UniversalDependencies/UD_German-GSD/r2.6/de_gsd-ud-test.conllu"
#
# ## Download the conllu files
# download.file(url = settings$ud.train, destfile = "train.conllu")
# download.file(url = settings$ud.dev, destfile = "dev.conllu")
# download.file(url = settings$ud.test, destfile = "test.conllu")
#
# ## Create wordvectors as these are used for training the dependency parser + save the word vectors to disk
# x <- udpipe_read_conllu("train.conllu")
# x <- paste.data.frame(x, term = "token", group = c("doc_id", "paragraph_id", "sentence_id"), collapse = " ")
# x <- x$token
# writeLines(x, con = file("text.txt", encoding = "UTF-8", open = "wt"))
# w2v <- word2vec("text.txt", type = "skip-gram", dim = 50, window = 10, min_count = 2, negative = 5, iter = 15, threads = 1)
# write.word2vec(w2v, file = "wordvectors.vec", type = "txt", encoding = "UTF-8")
# predict(w2v, c("gut", "freundlich"), type = "nearest", top = 20)
#
# ## Train the model
# print(Sys.time())
# m <- udpipe_train(file = "de_gsd-ud-2.6-20200924.udpipe",
# files_conllu_training = "train.conllu",
# files_conllu_holdout = "dev.conllu",
# annotation_tokenizer = list(dimension = 64, epochs = 100, segment_size=200, initialization_range = 0.1,
# batch_size = 50, learning_rate = 0.002, learning_rate_final=0, dropout = 0.1, early_stopping = 1),
# annotation_tagger = list(models = 2,
# templates_1 = "lemmatizer", guesser_suffix_rules_1 = 8, guesser_enrich_dictionary_1 = 4, guesser_prefixes_max_1 = 4,
# use_lemma_1 = 1,provide_lemma_1 = 1, use_xpostag_1 = 0, provide_xpostag_1 = 0,
# use_feats_1 = 0, provide_feats_1 = 0, prune_features_1 = 1,
# templates_2 = "tagger", guesser_suffix_rules_2 = 8, guesser_enrich_dictionary_2 = 4, guesser_prefixes_max_2 = 0,
# use_lemma_2 = 1, provide_lemma_2 = 0, use_xpostag_2 = 1, provide_xpostag_2 = 1,
# use_feats_2 = 1, provide_feats_2 = 1, prune_features_2 = 1),
# annotation_parser = list(iterations = 30, embedding_upostag = 20, embedding_feats = 20, embedding_xpostag = 0,
# embedding_form = 50, embedding_form_file = "wordvectors.vec",
# embedding_lemma = 0, embedding_deprel = 20, learning_rate = 0.01,
# learning_rate_final = 0.001, l2 = 0.5, hidden_layer = 200,
# batch_size = 10, transition_system = "projective", transition_oracle = "dynamic",
# structured_interval = 8))
# print(Sys.time())
#
# ## Evaluate the accuracy
# m <- udpipe_load_model("de_gsd-ud-2.6-20200924.udpipe")
# goodness_of_fit <- udpipe_accuracy(m, "test.conllu", tokenizer = "default", tagger = "default", parser = "default")
# cat(goodness_of_fit$accuracy, sep = "\n")
# goodness_of_fit <- udpipe_accuracy(m, "test.conllu", tokenizer = "none", tagger = "default", parser = "default")
# cat(goodness_of_fit$accuracy, sep = "\n")
# goodness_of_fit <- udpipe_accuracy(m, "test.conllu", tokenizer = "none", tagger = "none", parser = "default")
# cat(goodness_of_fit$accuracy, sep = "\n")
## ---- results='hide', echo=FALSE------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
## Clean up: delete the toy model file this script wrote to the working directory
invisible(file.remove("toymodel.udpipe"))
## Any scripts or data that you put into this service are public.
## Add the following code to your website.
## For more information on customizing the embed code, read Embedding Snippets.