.dl_fasttext <- function(lang = "en") {
download.file(paste0("https://dl.fbaipublicfiles.com/fasttext/vectors-aligned/wiki.", lang, ".align.vec"), destfile = paste0("wiki.", lang, ".align.vec"), method = 'curl')
}
.prepos <- function(filename, outputfilename, topwords = 200000) {
raw <- file(filename, open = "rt")
output_file <- file(outputfilename, open = "wt")
## Skip first line
rubbish <- readLines(raw, n = 1)
linecounter <- 0
while(linecounter < topwords) {
line_content <- readLines(raw, n = 1)
if (identical(line_content, character(0))) {
## EOF
break
}
### One must convert line_content to utf-8 to maintain compatibility with grep on most systems.
if (!grepl("^[[:punct:][:digit:]]", enc2utf8(line_content))) {
writeLines(line_content, output_file)
linecounter <- linecounter + 1
}
}
close(raw)
close(output_file)
}
#' Download fastText word embeddings from the internet
#'
#' This function downloads fastText word embeddings from the internet, trims the embeddings and saves as an R serialized object.
#' @param lang character, which language of fastText word embeddings to download, e.g. "en"
#' @param topwords integer, trim the word embeddings based on the most frequent words, default to top 200000 words
#' @return Nothing, a file is saved.
#' @export
get_ft <- function(lang = "en", topwords = 200000) {
.dl_fasttext(lang)
.prepos(paste0("wiki.", lang, ".align.vec"), paste0("wiki.", lang, ".clean.vec"), topwords = topwords)
readr::read_delim(paste0("wiki.", lang, ".clean.vec"), col_names = FALSE, delim = " ") %>% saveRDS(paste0("wiki.", lang, ".trimmed.RDS"))
unlink(paste0("wiki.", lang, ".align.vec"))
unlink(paste0("wiki.", lang, ".clean.vec"))
}
## Previous implementation that uses grep.
## get_ft <- function(lang = "en", topwords = 200000) {
## .dl_fasttext(lang)
## system(paste0("grep -v '^[[:punct:][:digit:]]' wiki.", lang ,".align.vec > wiki.", lang, ".clean.vec"))
## readr::read_delim(paste0("wiki.", lang, ".clean.vec"), col_names = FALSE, skip = 1, delim = " ") %>% head(topwords) %>% saveRDS(paste0("wiki.", lang, ".trimmed.RDS"))
## unlink(paste0("wiki.", lang, ".align.vec"))
## unlink(paste0("wiki.", lang, ".clean.vec"))
## }
#' Reading word embeddings downloaded with get_ft()
#'
#' This function reads fastText word embeddings downloaded with get_ft(). The file(s) must be in the working directory.
#' @param lang a vector of languages, e.g. c("en", "de")
#' @return a list of word embeddings
#' @export
read_ft <- function(lang = c("en", "de", "fr")) {
emb <- purrr::map(lang, ~readRDS(paste0("wiki.", ., ".trimmed.RDS")))
names(emb) <- lang
return(emb)
}
#' Create a multilingual corpus
#'
#' This function generates a multilingual corpus.
#' @param text_content a vector of text content
#' @param lang a vector of lang, using lower case, e.g. "de"
#' @return a quanteda corpus
#' @export
create_corpus <- function(text_content, lang) {
quanteda::corpus(text_content, docvars = data.frame(lang = lang, stringsAsFactors = FALSE))
}
.tokenize <- function(text) {
return(stringr::str_split(tolower(text), "[[:punct:][:space:]]")[[1]])
}
.gen_doc_embedding <- function(text_content, lang, emb, boe = FALSE, remove_stopwords = TRUE) {
splited_word <- .tokenize(text_content)
candidate_words <- tibble::tibble(X1 = splited_word)
bag_of_embeddings <- candidate_words %>% dplyr::left_join(emb[[lang]], by = "X1") %>% dplyr::filter(X1 != "")
if (remove_stopwords) {
bag_of_embeddings %>% dplyr::filter(!X1 %in% quanteda::stopwords(lang)) -> bag_of_embeddings
}
if (boe) {
return(bag_of_embeddings)
}
bag_of_embeddings %>% dplyr::summarise_at(dplyr::vars(X2:X301), mean, na.rm = TRUE)
}
#' Generate a document-feature matrix using word embeddings
#'
#' This function generates document-feature matrix (dfm) from a multilingual corpus.
#' @param corpus a multilingual corpus generated by create_corpus()
#' @param emb a list of word embeddings loaded from read_ft()
#' @param .progress boolean, displaying a progress bar or not
#' @param mode character, either 'bert' or 'fasttext'
#' @param noise boolean, printing noise so that you know the transmation is progressing
#' @param remove_stopwords, boolean, whether or not to remove stopwords
#' @return a rectr_dfm object
#' @importFrom magrittr %>%
#' @export
transform_dfm_boe <- function(corpus, emb = NULL, .progress = TRUE, mode = "bert", noise = FALSE, remove_stopwords = TRUE, bert_sentence_tokenization = TRUE, envname = "rectr_condaenv", path = "./") {
if (mode == "fasttext" | !is.null(emb)) {
mode <- "fasttext"
real_dfm <- furrr::future_map2_dfr(as.vector(corpus), quanteda::docvars(corpus, "lang"), .gen_doc_embedding, emb = emb, .progress = .progress, remove_stopwords = remove_stopwords) %>% as.matrix
} else if (mode == "bert") {
mode <- "bert"
real_dfm <- .bert(content = as.vector(corpus), lang = quanteda::docvars(corpus, "lang"), noise = noise, remove_stopwords = remove_stopwords, bert_sentence_tokenization = bert_sentence_tokenization, envname = envname, path = path)
} else {
stop("Argument 'mode' must be 'bert' or 'fasttext'.")
}
res <- list(dfm = real_dfm, corpus = corpus, k = NULL, filtered = FALSE, mode = mode)
class(res) <- append(class(res), "rectr_dfm")
return(res)
}
#' Print a rectr_dfm object
#'
#' This function prints useful information from an rectr_dfm object.
#' @param rectr_dfm a rectr_dfm object
#' @return nothing
#' @export
print.rectr_dfm <- function(rectr_dfm) {
cat(paste0("dfm with a dimension of ", nrow(rectr_dfm$dfm), " x ", ncol(rectr_dfm$dfm), " and ", paste0(unique(quanteda::docvars(rectr_dfm$corpus, "lang")), collapse = "/"), " language(s).\n", ifelse(rectr_dfm$filtered, paste("Filtered with k = ", rectr_dfm$k), ""), "\nAligned word embeddings: ", rectr_dfm$mode, "\n"))
}
.check_lang_indep <- function(lsa_res, lang_vector, alpha = 0.05, noise = FALSE) {
max_i <- ncol(lsa_res)
for (i in 1:max_i) {
p <- summary(aov(lsa_res[,i]~lang_vector))[[1]]$`Pr(>F)`[1]
if (p > alpha) {
if (noise) {
print(paste0("Select from: ", i, " th dimension."))
}
return(i)
}
}
stop("All selected dimensions in the U Matrix have a significant language influence. Adjust parameters 'alpha' and/or 'dimension'.")
}
.calculate_rectr <- function(lsa_res, k, corpus, seed) {
if (!is.null(seed)) {
set.seed(seed)
}
##lang_vector <- quanteda::docvars(corpus, "lang")
##i <- .check_lang_indep(lsa_res, lang_vector)
##max_d <- (k*2) + i
x <- flexmix::flexmix(lsa_res~1, k = k, model = flexmix::FLXMCmvnorm(diagonal = FALSE))
return(x)
}
#' Filter a document-feature matrix for systematic language differences
#'
#' This function filters a document-feature matrix using singular value decomposition.
#' @param input_dfm dfm generated by dfm_boe()
#' @param k integer, number of topics
#' @param corpus a multilingual corpus generated by create_corpus()
#' @param multiplication_factor integer, select k * mulitiplication_factor columns from the U-Matrix.
#' @param dimension integer, the first singular value to be extracted in the U-Matrix.
#' @param alpha double, alpha level to filter the U-Matrix using one-way ANOVA.
#' @return an rectr_dfm object
#' @export
filter_dfm <- function(input_dfm, k, corpus = NULL, multiplication_factor = 2, dimension = 100, alpha = 0.05, noise = FALSE) {
if (is.null(corpus)) {
## check if the input_dfm has a corpus
if (!is.null(input_dfm$corpus)) {
corpus <- input_dfm$corpus
} else {
stop("Corpus not found.")
}
}
svd_dfm <- RSpectra::svds(input_dfm$dfm, k = dimension, nu = dimension, nv = dimension)$u
lang_vector <- quanteda::docvars(corpus, "lang")
i <- .check_lang_indep(svd_dfm, lang_vector, alpha, noise = noise)
max_d <- (k * multiplication_factor) + i
input_dfm$dfm <- svd_dfm[,i:max_d]
input_dfm$k <- k
input_dfm$filtered <- TRUE
return(input_dfm)
}
#' Fit a Guassian Mixture Model from a document-feature matrix
#'
#' This function fits a guassian mixture model (GMM) from a document-feature matrix.
#' @param input_dfm a rectr_dfm object
#' @param seed integer, to set the random seed for reproducibility. When leave as NULL, no random seed is set.
#' @return a rectr_model object. rectr_model$theta is the topic-assignment matrix
#' @export
calculate_gmm <- function(input_dfm, seed = NULL) {
if (!input_dfm$filtered) {
stop("Please filter the dfm with filter_dfm() first.")
}
gmm_model <- .calculate_rectr(input_dfm$dfm, k = input_dfm$k, corpus = input_dfm$corpus, seed = seed)
res <- list()
res$gmm_model <- gmm_model
res$dfm <- input_dfm
res$theta <- flexmix::posterior(gmm_model)
res$k <- input_dfm$k
if (ncol(res$theta) < res$k) {
warning(paste0("Cannot converge with a model with k = ", res$k, ". Actual k = ", ncol(res$theta)))
}
res$seed <- seed
class(res) <- append(class(res), "rectr_model")
return(res)
}
#' Print a rectr_model object
#'
#' Print useful information about a rectr_model.
#' @param rectr_model a rectr_model object
#' @return nothing
#' @export
print.rectr_model <- function(rectr_model) {
cat(paste0(rectr_model$k, "-topic rectr model trained with a "))
print(rectr_model$dfm)
if (ncol(rectr_model$theta) < rectr_model$k) {
cat(paste0("Defacto k = ", ncol(rectr_model$theta), "\n"))
}
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.