#' Interface to mallet topicmodelling.
#'
#' Functionality to support the following workflow (see examples): (a) Turn
#' \code{partition_bundle}-object into mallet instance list, (b) store the
#' resulting \code{jobjRef}-object, (c) run mallet topic modelling and (d)
#' turn ParallelTopicModel Java object into \code{LDA_Gibbs} object from
#' package \code{topicmodels}.
#'
#' @param x A `partition_bundle` object.
#' @param p_attribute Length-one `character` vector, a positional attribute.
#' @param ... further parameters
#' @param verbose A `logical` value, whether to be verbose.
#' @param filename Where to store the Java-object.
#' @importFrom utils read.csv read.table
#' @importFrom stats setNames
#' @importFrom slam simple_triplet_matrix
#' @importFrom parallel mclapply
#' @importFrom polmineR get_token_stream
#' @examples
#' # Preparations: Create instance list
#'
#' if (!mallet_is_installed()) mallet_install()
#' library(polmineR)
#' use("polmineR")
#'
#' speeches <- polmineR::as.speeches(
#' "GERMAPARLMINI",
#' s_attribute_name = "speaker",
#' s_attribute_date = "date"
#' )
#'
#' instance_list <- as.instance_list(speeches)
#' lda <- BigTopicModel(
#' instances = instance_list,
#' n_topics = 25,
#' alpha_sum = 5.1,
#' beta = 0.1,
#' threads = 1L,
#' iterations = 150L
#' )
#'
#' destfile <- tempfile()
#' lda$setSaveSerializedModel(50L, rJava::.jnew("java/lang/String", destfile))
#'
#' lda$estimate()
#' lda$write(rJava::.jnew("java/io/File", destfile))
#'
#' # Load topicmodel and turn it into LDA_Gibbs
#'
#' lda2 <- mallet_load_topicmodel(destfile)
#' topicmodels_lda <- as_LDA(lda)
#' @rdname as.instance_list
#' @importFrom polmineR get_token_stream
#' @exportMethod as.instance_list
#' @importFrom rJava J
setGeneric("as.instance_list", function(x, ...) standardGeneric("as.instance_list"))
#' @examples
#' library(polmineR)
#' use("polmineR")
#' speeches <- as.speeches("GERMAPARLMINI", s_attribute_name = "speaker", s_attribute_date = "date")
#' speeches_instance_list <- as.instance_list(speeches, p_attribute = "word")
#'
#' # Pass argument 'subset' to remove stopwords
#' terms_to_drop <- tm::stopwords("de")
#' speeches_instance_list <- as.instance_list(
#' speeches,
#' p_attribute = "word",
#' subset = {!get(p_attribute) %in% bquote(.(terms_to_drop))}
#' )
#' @param ... Arguments passed into `get_token_stream()` call (e.g. argument
#' `subset` to apply stopwords).
#' @param min_length Minimum length of documents after removing stopwords.
#' @importFrom rJava .jarray
#' @rdname as.instance_list
#' @exportMethod as.instance_list
#' @importFrom polmineR get_corpus
#' @importMethodsFrom polmineR corpus p_attributes
setMethod("as.instance_list", "partition_bundle", function(x, p_attribute = "word", verbose = TRUE, min_length = 1L, ...){
if (verbose) message("... create alphabet")
alphabet <- rJava::.jnew("cc/mallet/types/Alphabet", rJava::.jnew("java/lang/String")$getClass())
corpus_obj <- corpus(get_corpus(x))
vocabulary <- p_attributes(corpus_obj, p_attribute = p_attribute)
dummy <- alphabet$lookupIndices(.jarray(vocabulary), TRUE)
if (verbose) message("... instantiate pipe")
pipe <- .jnew("cc/mallet/pipe/TokenSequence2FeatureSequence", alphabet)
pipe$setTargetAlphabet(alphabet)
if (verbose) message("... decode token stream")
token_stream_list <- get_token_stream(x, p_attribute = p_attribute, ...)
if (!is.null(min_length)) {
if (verbose) message("... removing short documents.")
doc_lengths <- pblapply(token_stream_list, length)
documents_to_keep <- which(doc_lengths >= min_length)
if (length(documents_to_keep) == 0)
stop("... all documents are shorter than the minimum length.")
if (verbose){
message(
sprintf(
"... removing %s out of %s documents shorter than %s tokens.",
length(token_stream_list) - length(documents_to_keep),
length(token_stream_list),
min_length
)
)
}
token_stream_list <- token_stream_list[documents_to_keep]
}
if (verbose) message("... creating instances")
instance_list <- rJava::.jnew("cc/mallet/types/InstanceList")
pblapply(
names(token_stream_list),
function(doc_name){
token_sequence <- .jnew("cc/mallet/types/TokenSequence")
token_sequence$addAll(.jarray(token_stream_list[[doc_name]]))
instance <- .jnew(
"cc.mallet.types.Instance",
.jnew("java.lang.Object"),
.jnew("java.lang.Object"),
.jnew("java.lang.Object"),
.jnew("java.lang.Object")
)
instance$setData(token_sequence)
instance$setName(rJava::.jnew("java/lang/String", doc_name))
instance$setTarget(rJava::.jnew("java/lang/String", "foo"))
instance$setSource(rJava::.jnew("java/lang/String", "foo"))
instance_list$add(pipe$instanceFrom(instance))
NULL
}
)
instance_list
})
setOldClass("DocumentTermMatrix")
#' @exportMethod as.instance_list
#' @rdname as.instance_list
#' @examples
#' data("AssociatedPress", package = "topicmodels")
#' il <- as.instance_list(AssociatedPress)
setMethod("as.instance_list", "DocumentTermMatrix", function(x, verbose = TRUE){
stopifnot(is.logical(verbose))
if (is.null(dimnames(x)[["Terms"]])) stop("dimnames need to define Terms")
if (is.null(dimnames(x)[["Docs"]])){
docnames <- as.character(1L:nrow(x))
} else {
docnames <- dimnames(x)[["Docs"]]
}
if (verbose) cli_progress_step("reconstruct individual tokens from matrix")
j_split <- split(x = x$j - 1L, f = x$i)
v_split <- split(x = x$v, f = x$i)
unbagged <- lapply(
seq_along(j_split),
function(i) unlist(Map(rep, j_split[[i]], v_split[[i]]), recursive = FALSE)
)
if (verbose) cli_progress_done()
as.instance_list(
x = unbagged,
vocabulary = dimnames(x)[["Terms"]],
docnames = docnames
)
})
#' @param vocabulary A `character` vector with the vocabulary underlying input
#' object `x`, in the correct order.
#' @param docnames A `character` vector with document names. Needs to have same
#' length as input `list` object. If missing, names of the input `list` are
#' used as docnames, if present.
#' @param progress A `logical` value, whether to show progress bar.
#' @examples
#' use("polmineR", corpus = "GERMAPARLMINI")
#'
#' vocab <- p_attributes("GERMAPARLMINI", p_attribute = "word")
#'
#' il <- corpus("GERMAPARLMINI") |>
#' as.speeches(s_attribute_name = "speaker", s_attribute_date = "date") |>
#' p_attributes(p_attribute = "word", decode = FALSE) |>
#' as.instance_list(vocabulary = vocab)
#' @rdname as.instance_list
#' @exportMethod as.instance_list
setMethod("as.instance_list", "list", function(x, vocabulary, docnames, verbose = TRUE, progress = TRUE){
stopifnot(
length(verbose) == 1, is.logical(verbose),
length(progress) == 1, is.logical(progress)
)
if (missing(docnames)){
if (is.null(names(x))){
stop("Argument `docnames` missing and input list is not named.")
} else {
if (verbose) cli_alert_info("argument `docnames`, using names of input `list`.")
docnames <- names(x)
}
}
stopifnot(length(x) == length(docnames))
lexicon <- rJava::.jnew(
"cc/mallet/types/Alphabet",
rJava::.jnew("java/lang/String")$getClass()
)
lexicon$lookupIndices(vocabulary, TRUE)
# Create a dummy instance with a target.
target <- .jnew(
"cc.mallet.types.Instance",
.jnew("java.lang.Object"),
.jnew("java.lang.Object"),
.jnew("java.lang.Object"),
.jnew("java.lang.Object")
)
target$setData(.jnew("cc/mallet/types/FeatureSequence", lexicon))
instance_list <- rJava::.jnew("cc/mallet/types/InstanceList", lexicon, lexicon)
.fn <- function(i){
feature_sequence <- .jnew("cc/mallet/types/FeatureSequence", lexicon, x[[i]])
instance <- .jnew(
"cc.mallet.types.Instance",
.jnew("java.lang.Object"),
.jnew("java.lang.Object"),
.jnew("java.lang.Object"),
.jnew("java.lang.Object")
)
instance$setTarget(target)
instance$setData(feature_sequence)
instance$setName(.jnew("java/lang/String", docnames[[i]]))
instance_list$add(instance)
invisible(NULL)
}
if (progress) pblapply(seq_along(x), .fn) else lapply(seq_along(x), .fn)
instance_list
})
#' @param regex A regular expression (length-one `character` vector) used by
#' Mallet Java code for splitting `character` vector into tokens.
#' @param tolower A `logical` value, whether to lowercase tokens (performed)
#' by Mallet Java code.
#' @param stopwords Either a path with a plain text file with stopwords (one per
#' line), or a `character` vector.
#' @examples
#' instances <- as.speeches("GERMAPARLMINI", s_attribute_name = "speaker", s_attribute_date = "date") %>%
#' get_token_stream(p_attribute = "word", collapse = " ") %>%
#' unlist() %>%
#' as.instance_list()
#' @rdname as.instance_list
#' @exportMethod as.instance_list
#' @author Andreas Blaette, David Mimno
setMethod("as.instance_list", "character", function(x, regex = "[\\p{L}]+", tolower = FALSE, stopwords = NULL){
# This code is adapted from the 'mallet' package maintained by David Mimno,
# see: https://github.com/mimno/RMallet/blob/master/mallet/R/mallet.R
ids <- if (is.null(names(x))) as.character(1:length(x)) else names(x)
pipe_list <- rJava::.jnew("java/util/ArrayList")
pipe_list$add(
rJava::.jnew(
"cc/mallet/pipe/CharSequence2TokenSequence",
rJava::J("java/util/regex/Pattern")$compile(regex)
)
)
if (tolower)
pipe_list$add(rJava::.jnew("cc/mallet/pipe/TokenSequenceLowercase"))
if (!is.null(stopwords)){
stopifnot(is.character(stopwords))
if (length(stopwords) == 1L){
if (file.exists(stopwords)) {
stopword_file <- fs::path_norm(stopwords)
}
} else {
stopword_file <- tempfile(fileext = ".txt")
writeLines(text = stopwords, stopword_file)
}
pipe_list$add(
rJava::.jnew(
"cc/mallet/pipe/TokenSequenceRemoveStopwords",
rJava::.jnew("java/io/File", stopword_file),
"UTF-8", FALSE, FALSE, FALSE
)
)
}
pipe_list$add(rJava::.jnew("cc/mallet/pipe/TokenSequence2FeatureSequence"))
pipe <- rJava::.jnew(
"cc/mallet/pipe/SerialPipes",
rJava::.jcast(pipe_list, "java/util/Collection")
)
instance_list <- rJava::.jnew(
"cc/mallet/types/InstanceList",
rJava::.jcast(pipe, "cc/mallet/pipe/Pipe")
)
rJava::J("cc/mallet/topics/RTopicModel")$addInstances(instance_list, ids, x)
instance_list
})
#' @rdname as.instance_list
#' @export instance_list_save
instance_list_save <- function(x, filename = tempfile()){
# This snippet is inspired an unexported function save.mallet.instances in
# v1.2.0 of the R mallet package which has not yet been released at CRAN.
# See: https://github.com/mimno/RMallet/blob/master/mallet/R/mallet.R
x$save(rJava::.jnew("java/io/File", filename))
filename
}
#' @details `instance_list_load()` will load a Java `InstanceList` object that has
#' been saved to disk (e.g. by using the `instance_list_save()` function).
#' The return value is a `jobjRef` object.
#' @rdname as.instance_list
#' @export instance_list_load
#' @importFrom rJava J
instance_list_load <- function(filename){
filename <- path.expand(filename)
if (!file.exists(filename)) stop("file does not exist")
J("cc.mallet.types.InstanceList")$load(rJava::.jnew("java/io/File", filename))
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.