# Console prompt settings for the session.
# An earlier call setting prompt = "R> " and continue = "+   " was overridden
# immediately by this one, so only the effective values are set.
options(prompt = " ", continue = "   ")

# Load the `dekamer` dataset from the ruimtehol package.
data("dekamer", package = "ruimtehol")

# x: model input text. Split every question on non-word characters, drop the
# empty tokens the split leaves behind, re-join the remaining tokens with a
# single space and lower-case the result.
dekamer$x <- strsplit(dekamer$question, "\\W")
dekamer$x <- lapply(dekamer$x, FUN = function(x) x[x != ""])
# vapply() pins the result to one string per row; sapply() would silently
# return an empty list instead of character(0) when there are no rows.
dekamer$x <- vapply(dekamer$x,
                    FUN = function(x) paste(x, collapse = " "),
                    FUN.VALUE = character(1))
dekamer$x <- tolower(dekamer$x)

# y: list column of themes per question. Themes are comma-separated in
# `question_theme`; spaces inside a theme are replaced by "-".
dekamer$y <- strsplit(dekamer$question_theme, split = ",")
dekamer$y <- lapply(dekamer$y, FUN = function(x) gsub(" ", "-", x))

# Fit a tagspace model with the cleaned question text as `x` and the
# per-question theme lists as `y`.
model <- embed_tagspace(
  x = dekamer$x,
  y = dekamer$y,
  # representation settings
  dim = 50,
  ngrams = 2,
  minCount = 2,
  bucket = 100000,
  # objective
  loss = "softmax",
  similarity = "cosine",
  negSearchLimit = 50,
  # optimisation
  lr = 0.01,
  epoch = 40,
  adagrad = TRUE,
  # stopping criteria
  early_stopping = 0.8,
  validationPatience = 10,
  maxTrainTime = 2 * 60  # presumably seconds -- TODO confirm
)

# Dictionary of the fitted model, as returned by starspace_dictionary().
dict <- starspace_dictionary(model)

# The model converted to matrices: the default conversion, the "words" type
# and the "labels" type.
# NOTE(review): prefix = FALSE presumably strips the "__label__" prefix from
# the row names -- confirm against the package documentation.
emb        <- as.matrix(model)
emb_words  <- as.matrix(model, type = "words")
emb_labels <- as.matrix(model, type = "labels", prefix = FALSE)
# Look up embeddings with type = "ngram": first for a label and a word, then
# for a two-word phrase. The second assignment replaces the first value of `e`.
e <- starspace_embedding(
  model,
  x = c("__label__VERVOERBELEID", "geld"),
  type = "ngram"
)

e <- starspace_embedding(model, "nationale loterij", type = "ngram")

# Two example sentences and their embeddings (default embedding type).
text <- c(
  "de nmbs heeft het treinaanbod uitgebreid via onteigening ...",
  "de migranten komen naar europa de asielcentra ..."
)
emb_text <- starspace_embedding(model, text)

# text[2] is the migration sentence; it is the query for the calls below.
predict(model, text[2])

# Five closest label embeddings per sentence, by cosine similarity.
embedding_similarity(emb_text, emb_labels, type = "cosine", top_n = 5)

# k = 5 nearest-neighbour query for the same sentence.
starspace_knn(model, text[2], k = 5)

# Candidate documents to compare the query sentence against: a label and a
# free-text sentence.
# NOTE(review): this vector was left unterminated (trailing comma, no closing
# parenthesis) and may have lost entries; only the two visible elements are
# kept -- confirm the intended list.
targetdocs <- c("__label__FISCALITEIT", 
                "de migranten komen naar europa ZZZ")

# Prediction for the query sentence with `targetdocs` passed as `basedoc`.
predict(model, "de migranten komen naar europa de asielcentra ...", 
        basedoc = targetdocs)

# Same comparison done by hand on the embeddings.
# NOTE(review): the head of this call was missing, leaving two orphaned
# arguments; embedding_similarity() is restored because its (x, y, top_n)
# usage matches the earlier call in this script -- confirm.
embedding_similarity(
  starspace_embedding(model, "de migranten komen naar europa de asielcentra ..."),
  starspace_embedding(model, targetdocs), top_n = 3)

# Write the fitted model to "textspace.ruimtehol" (a relative path, so it is
# resolved against the current working directory) and read it back,
# replacing `model` with the reloaded object.
starspace_save_model(model, file = "textspace.ruimtehol")
model <- starspace_load_model("textspace.ruimtehol")

# Shuffle the rows by ordering them on freshly drawn random normal keys.
# No seed is set here, so the order differs between runs.
dekamer <- dekamer[order(rnorm(nrow(dekamer))), ]

# Copies of the text and labels with part of each blanked out: the text is set
# to NA for the first 250 rows, the labels are set to NA for rows 251 to 500.
X <- dekamer[["x"]]
Y <- dekamer[["y"]]
X[seq_len(250)]  <- NA
Y[seq(251, 500)] <- NA

# Refit the tagspace model on the partially missing data.
model <- embed_tagspace(
  x = X,
  y = Y,
  # representation settings
  dim = 50,
  ngrams = 2,
  minCount = 2,
  # objective
  loss = "softmax",
  similarity = "cosine",
  negSearchLimit = 50,
  # optimisation
  lr = 0.01,
  epoch = 40,
  adagrad = TRUE,
  # stopping criteria
  early_stopping = 0.8,
  validationPatience = 10,
  maxTrainTime = 2 * 60  # presumably seconds -- TODO confirm
)

# Random stand-in for a pretrained embedding matrix: 1000 rows named
# "word1" ... "word1000" and 100 columns of standard-normal draws.
pretrained <- matrix(
  rnorm(1000 * 100),
  nrow = 1000,
  ncol = 100,
  # a one-element dimnames list supplies the row names only
  dimnames = list(term = paste0("word", 1:1000))
)
# Build a model from the supplied embedding matrix.
model <- starspace(
  embeddings = pretrained,
  trainMode = 5,
  ngrams = 1,
  p = 0.5,
  similarity = "cosine"
)

# type = "knn" prediction for a short text built from the matrix row names.
predict(model, newdata = "word5 word1 word5 word3", type = "knn")

# Fit a wordspace model on the question text and pull out its embeddings
# as a matrix.
model <- embed_wordspace(
  dekamer$x,
  dim = 50,
  ws = 7,
  ngrams = 2,
  epoch = 5,
  adagrad = FALSE,
  margin = 0.8,
  negSearchLimit = 10,
  maxTrainTime = 2 * 60  # presumably seconds -- TODO confirm
)
pretrained_words <- as.matrix(model)

# Sorted set of distinct themes across all questions.
labels <- sort(unique(unlist(dekamer$y)))

# One random 50-column row per theme, drawn from a normal distribution with
# the overall mean and standard deviation of the word embedding values.
# Rows are named "__label__<theme>".
pretrained_labels <- matrix(
  rnorm(
    n    = length(labels) * 50,
    mean = mean(pretrained_words),
    sd   = sd(pretrained_words)
  ),
  nrow = length(labels),
  ncol = 50,
  dimnames = list(term = paste0("__label__", labels))
)

# Stack word rows on top of label rows to form the initial embedding matrix.
pretrained <- rbind(pretrained_words, pretrained_labels)

# Fit the tagspace model again, this time passing the stacked word + label
# matrix as `embeddings`.
model <- embed_tagspace(
  x = dekamer$x,
  y = dekamer$y,
  embeddings = pretrained,
  # representation settings
  dim = 50,
  ngrams = 2,
  minCount = 2,
  # objective
  loss = "softmax",
  similarity = "cosine",
  negSearchLimit = 50,
  # optimisation
  lr = 0.01,
  epoch = 40,
  adagrad = TRUE,
  # stopping criteria
  early_stopping = 0.8,
  validationPatience = 10,
  maxTrainTime = 2 * 60  # presumably seconds -- TODO confirm
)

# Embeddings of the refitted model as a matrix.
embedding <- as.matrix(model)

# k = 10 nearest-neighbour query for one of the labels.
starspace_knn(model, "__label__FISCALITEIT", k = 10)

