R/rectr.R

Defines functions print.rectr_model calculate_gmm filter_dfm .calculate_rectr .check_lang_indep print.rectr_dfm transform_dfm_boe .gen_doc_embedding .tokenize create_corpus read_ft get_ft .prepos .dl_fasttext

Documented in calculate_gmm create_corpus filter_dfm get_ft print.rectr_dfm print.rectr_model read_ft transform_dfm_boe

.dl_fasttext <- function(lang = "en") {
    download.file(paste0("https://dl.fbaipublicfiles.com/fasttext/vectors-aligned/wiki.", lang, ".align.vec"), destfile = paste0("wiki.", lang, ".align.vec"), method = 'curl')
}

.prepos <- function(filename, outputfilename, topwords = 200000) {
    raw <- file(filename, open = "rt")
    output_file <- file(outputfilename, open = "wt")
    ## Skip first line
    rubbish <- readLines(raw, n = 1)
    linecounter <- 0
    while(linecounter < topwords) {
        line_content <- readLines(raw, n = 1)
        if (identical(line_content, character(0))) {
            ## EOF
            break
        }
        ### One must convert line_content to utf-8 to maintain compatibility with grep on most systems.
        if (!grepl("^[[:punct:][:digit:]]", enc2utf8(line_content))) {
            writeLines(line_content, output_file)
            linecounter <- linecounter + 1
        }
    }
    close(raw)
    close(output_file)
}


#' Download fastText word embeddings from the internet
#'
#' This function downloads fastText word embeddings from the internet, trims the embeddings and saves as an R serialized object.
#' @param lang character, which language of fastText word embeddings to download, e.g. "en"
#' @param topwords integer, trim the word embeddings based on the most frequent words, default to top 200000 words
#' @return Nothing, a file is saved.
#' @export
get_ft <- function(lang = "en", topwords = 200000) {
    .dl_fasttext(lang)
    .prepos(paste0("wiki.", lang, ".align.vec"), paste0("wiki.", lang, ".clean.vec"), topwords = topwords)
    readr::read_delim(paste0("wiki.", lang, ".clean.vec"), col_names = FALSE, delim = " ") %>% saveRDS(paste0("wiki.", lang, ".trimmed.RDS"))
    unlink(paste0("wiki.", lang, ".align.vec"))
    unlink(paste0("wiki.", lang, ".clean.vec"))
}

## Previous implementation that uses grep.
## get_ft <- function(lang = "en", topwords = 200000) {
##     .dl_fasttext(lang)
##     system(paste0("grep -v '^[[:punct:][:digit:]]' wiki.", lang ,".align.vec > wiki.", lang, ".clean.vec"))
##     readr::read_delim(paste0("wiki.", lang, ".clean.vec"), col_names = FALSE, skip = 1, delim = " ") %>% head(topwords) %>% saveRDS(paste0("wiki.", lang, ".trimmed.RDS"))
##     unlink(paste0("wiki.", lang, ".align.vec"))
##     unlink(paste0("wiki.", lang, ".clean.vec"))
## }



#' Reading word embeddings downloaded with get_ft()
#'
#' This function reads fastText word embeddings downloaded with get_ft(). The file(s) must be in the working directory.
#' @param lang a vector of languages, e.g. c("en", "de")
#' @return a list of word embeddings
#' @export
read_ft <- function(lang = c("en", "de", "fr")) {
    emb <- purrr::map(lang, ~readRDS(paste0("wiki.", ., ".trimmed.RDS")))
    names(emb) <- lang
    return(emb)
}

#' Create a multilingual corpus
#'
#' This function generates a multilingual corpus.
#' @param text_content a vector of text content
#' @param lang a vector of lang, using lower case, e.g. "de"
#' @return a quanteda corpus
#' @export
create_corpus <- function(text_content, lang) {
    quanteda::corpus(text_content, docvars = data.frame(lang = lang, stringsAsFactors = FALSE))
}

.tokenize <- function(text) {
    return(stringr::str_split(tolower(text), "[[:punct:][:space:]]")[[1]])
}

.gen_doc_embedding <- function(text_content, lang,  emb, boe = FALSE, remove_stopwords = TRUE) {
    splited_word <- .tokenize(text_content)
    candidate_words <- tibble::tibble(X1 = splited_word)
    bag_of_embeddings <- candidate_words %>% dplyr::left_join(emb[[lang]], by = "X1") %>% dplyr::filter(X1 != "")
    if (remove_stopwords) {
        bag_of_embeddings %>% dplyr::filter(!X1 %in% quanteda::stopwords(lang)) -> bag_of_embeddings
    }
    if (boe) {
        return(bag_of_embeddings)
    }
    bag_of_embeddings %>% dplyr::summarise_at(dplyr::vars(X2:X301), mean, na.rm = TRUE)
}

#' Generate a document-feature matrix using word embeddings
#'
#' This function generates document-feature matrix (dfm) from a multilingual corpus.
#' @param corpus a multilingual corpus generated by create_corpus()
#' @param emb a list of word embeddings loaded from read_ft()
#' @param .progress boolean, displaying a progress bar or not
#' @param mode character, either 'bert' or 'fasttext'
#' @param noise boolean, printing noise so that you know the transmation is progressing
#' @param remove_stopwords, boolean, whether or not to remove stopwords
#' @return a rectr_dfm object
#' @importFrom magrittr %>%
#' @export
transform_dfm_boe <- function(corpus, emb = NULL, .progress = TRUE, mode = "bert", noise = FALSE, remove_stopwords = TRUE, bert_sentence_tokenization = TRUE, envname = "rectr_condaenv", path = "./") {
    if (mode == "fasttext" | !is.null(emb)) {
        mode <- "fasttext"
        real_dfm <- furrr::future_map2_dfr(as.vector(corpus), quanteda::docvars(corpus, "lang"), .gen_doc_embedding, emb = emb, .progress = .progress, remove_stopwords = remove_stopwords) %>% as.matrix
    } else if (mode == "bert") {
        mode <- "bert"
        real_dfm <- .bert(content = as.vector(corpus), lang =  quanteda::docvars(corpus, "lang"), noise = noise, remove_stopwords = remove_stopwords, bert_sentence_tokenization = bert_sentence_tokenization, envname = envname, path = path)
    } else {
        stop("Argument 'mode' must be 'bert' or 'fasttext'.")
    }
    res <- list(dfm = real_dfm, corpus = corpus, k = NULL, filtered = FALSE, mode = mode)
    class(res) <- append(class(res), "rectr_dfm")
    return(res)
}

#' Print a rectr_dfm object
#'
#' This function prints useful information from an rectr_dfm object.
#' @param rectr_dfm a rectr_dfm object
#' @return nothing
#' @export
print.rectr_dfm <- function(rectr_dfm) {
    cat(paste0("dfm with a dimension of ", nrow(rectr_dfm$dfm), " x ", ncol(rectr_dfm$dfm), " and ", paste0(unique(quanteda::docvars(rectr_dfm$corpus, "lang")), collapse = "/"), " language(s).\n", ifelse(rectr_dfm$filtered, paste("Filtered with k = ", rectr_dfm$k), ""), "\nAligned word embeddings: ", rectr_dfm$mode, "\n"))
}

.check_lang_indep <- function(lsa_res, lang_vector, alpha = 0.05, noise = FALSE) {
    max_i <- ncol(lsa_res)
    for (i in 1:max_i) {
        p <- summary(aov(lsa_res[,i]~lang_vector))[[1]]$`Pr(>F)`[1]
        if (p > alpha) {
            if (noise) {
                print(paste0("Select from: ", i, " th dimension."))
            }
            return(i)
        }
    }
    stop("All selected dimensions in the U Matrix have a significant language influence. Adjust parameters 'alpha' and/or 'dimension'.")
}


.calculate_rectr <- function(lsa_res, k, corpus, seed) {
    if (!is.null(seed)) {
        set.seed(seed)
    }
    ##lang_vector <- quanteda::docvars(corpus, "lang")
    ##i <- .check_lang_indep(lsa_res, lang_vector)
    ##max_d <- (k*2) + i
    x <- flexmix::flexmix(lsa_res~1, k = k, model = flexmix::FLXMCmvnorm(diagonal = FALSE))
    return(x)
}

#' Filter a document-feature matrix for systematic language differences
#'
#' This function filters a document-feature matrix using singular value decomposition.
#' @param input_dfm dfm generated by dfm_boe()
#' @param k integer, number of topics
#' @param corpus a multilingual corpus generated by create_corpus()
#' @param multiplication_factor integer, select k * mulitiplication_factor columns from the U-Matrix.
#' @param dimension integer, the first singular value to be extracted in the U-Matrix.
#' @param alpha double, alpha level to filter the U-Matrix using one-way ANOVA.
#' @return an rectr_dfm object
#' @export
filter_dfm <- function(input_dfm, k, corpus = NULL, multiplication_factor = 2, dimension = 100, alpha = 0.05, noise = FALSE) {
    if (is.null(corpus)) {
        ## check if the input_dfm has a corpus
        if (!is.null(input_dfm$corpus)) {
            corpus <- input_dfm$corpus
        } else {
            stop("Corpus not found.")
        }
    }
    svd_dfm <- RSpectra::svds(input_dfm$dfm, k = dimension, nu = dimension, nv = dimension)$u
    lang_vector <- quanteda::docvars(corpus, "lang")
    i <- .check_lang_indep(svd_dfm, lang_vector, alpha, noise = noise)
    max_d <- (k * multiplication_factor) + i
    input_dfm$dfm <- svd_dfm[,i:max_d]
    input_dfm$k <- k
    input_dfm$filtered <- TRUE
    return(input_dfm)
}

#' Fit a Guassian Mixture Model from a document-feature matrix
#'
#' This function fits a guassian mixture model (GMM) from a document-feature matrix.
#' @param input_dfm a rectr_dfm object
#' @param seed integer, to set the random seed for reproducibility. When leave as NULL, no random seed is set.
#' @return a rectr_model object. rectr_model$theta is the topic-assignment matrix
#' @export
calculate_gmm <- function(input_dfm, seed = NULL) {
    if (!input_dfm$filtered) {
        stop("Please filter the dfm with filter_dfm() first.")
    }
    gmm_model <- .calculate_rectr(input_dfm$dfm, k = input_dfm$k, corpus = input_dfm$corpus, seed = seed)
    res <- list()
    res$gmm_model <- gmm_model
    res$dfm <- input_dfm
    res$theta <- flexmix::posterior(gmm_model)
    res$k <- input_dfm$k
    if (ncol(res$theta) < res$k) {
        warning(paste0("Cannot converge with a model with k = ", res$k, ".  Actual k = ", ncol(res$theta)))
    }
    res$seed <- seed
    class(res) <- append(class(res), "rectr_model")
    return(res)
}

#' Print a rectr_model object
#'
#' Print useful information about a rectr_model.
#' @param rectr_model a rectr_model object
#' @return nothing
#' @export
print.rectr_model <- function(rectr_model) {
    cat(paste0(rectr_model$k, "-topic rectr model trained with a "))
    print(rectr_model$dfm)
    if (ncol(rectr_model$theta) < rectr_model$k) {
        cat(paste0("Defacto k = ", ncol(rectr_model$theta), "\n"))
    }
}
chainsawriot/rectr documentation built on July 30, 2023, 2:30 p.m.