library(polmineR)
library(Matrix)
library(text2vec)
library(proxy)
library(data.table)
library(magrittr)
# Activate the GermaParl corpus for this session.
use("GermaParl")

# Terms of the "word" attribute that polmineR's noise() flags (using the German
# stopword list), flattened into a plain, unnamed character vector.
stopwords <- terms("GERMAPARL", p_attribute = "word") %>%
  noise(stopwordsLanguage = "de") %>%
  unlist() %>%
  unname()

# Count co-occurrences within a window of 5 tokens; the flagged terms are
# dropped from the counts.
C <- Cooccurrences$new(
  x = "GERMAPARL",
  p_attribute = "word",
  window = 5L,
  drop = stopwords,
  verbose = TRUE
)
C$count()

# Term-co-occurrence matrix: exported as a simple_triplet_matrix (noted in the
# original script as taking less than 10 seconds), converted to a sparse
# Matrix, and finally to the triplet ("dgTMatrix") class handed to GloVe.
tcm <- as.sparseMatrix(C$as.simple_triplet_matrix())
tcm_dgt <- as(tcm, "dgTMatrix")

# Fit a 50-dimensional GloVe model. The original script notes that "the
# example" uses learning_rate = .25 instead of .1.
GV <- GloVe$new(
  word_vectors_size = 50,
  vocabulary = rownames(tcm_dgt),
  x_max = 10,
  learning_rate = .1
)
GV$fit_transform(x = tcm_dgt, n_iter = 25)

# NOTE(review): only `GV$components` is kept; the value returned by
# fit_transform() above is not stored. text2vec documents `components` as the
# context embeddings -- confirm that using them alone is intended.
glove_word_vectors <- GV$components

# Corpus-wide token frequencies, joined to the similarity table later on.
wordcount <- as.data.table(count("GERMAPARL", p_attribute = "word"))
#' Retrieve the terms most similar to a query term
#'
#' Computes the cosine similarity between the vector of `query` and every
#' column of `word_vectors`, adds the columns of `cnt` by a join on the term,
#' and returns the terms ranked by decreasing similarity. The query term
#' itself is excluded from the result.
#'
#' @param query A single character string that is a column name of
#'   `word_vectors`.
#' @param n Number of rows to keep, passed to `head()`. `NULL` keeps all rows.
#' @param cnt A `data.table` with term counts. If it is keyed, its first key
#'   column is used for the join, otherwise the column `"word"`. The table is
#'   not modified.
#' @param word_vectors A numeric matrix with one column per term; the column
#'   names are the terms.
#' @return A `data.table` with the columns of `cnt` plus `cosine`, sorted by
#'   decreasing `cosine`. Terms without a match in `cnt` are kept, with `NA`
#'   in the columns taken from `cnt`.
get_semantic_field <- function(query, n = 50, cnt = wordcount,
                               word_vectors = glove_word_vectors) {
  # Fail early with a readable message instead of "subscript out of bounds".
  if (!is.character(query) || length(query) != 1L || is.na(query)) {
    stop("`query` must be a single, non-missing character string.",
         call. = FALSE)
  }
  if (!query %in% colnames(word_vectors)) {
    stop("Query term '", query, "' is not a column name of `word_vectors`.",
         call. = FALSE)
  }

  query_vector <- matrix(word_vectors[, query], nrow = 1L)
  similarities <- proxy::simil(
    x = t(word_vectors), y = query_vector, method = "cosine"
  )
  similarities_dt <- data.table(
    word = rownames(similarities),
    cosine = similarities[, 1]
  )
  # Sorting by term makes the order of tied similarities reproducible.
  setkeyv(similarities_dt, cols = "word")

  # Join with an explicit `on` instead of calling setkeyv() on `cnt`: keying
  # would reorder the caller's table (by default the global `wordcount`) by
  # reference. The column choice mirrors what a keyed join would have used.
  term_col <- if (is.null(key(cnt))) "word" else key(cnt)[1L]
  similarities_dt <- cnt[similarities_dt, on = stats::setNames("word", term_col)]

  # Undefined similarities (NA/NaN) go to the end rather than the top.
  setorderv(similarities_dt, cols = "cosine", order = -1L, na.last = TRUE)

  # Drop the query by name, not by position: ties or NAs may sort ahead of
  # it, and `2L:nrow(x)` counts backwards when only one row is present.
  is_query <- similarities_dt[[term_col]] == query
  similarities_dt <- similarities_dt[which(!is_query)]

  if (!is.null(n)) similarities_dt <- head(similarities_dt, n = n)
  similarities_dt
}
# Example: the 25 terms closest to "Asylsuchende" in the fitted vector space.
get_semantic_field("Asylsuchende", n = 25)


# PolMine/polmineR.misc documentation built on Nov. 23, 2022, 9:01 p.m.