# R/mallet.R

#' Load MALLET sampling state from disk
#'
#' A helper function to read in a MALLET sampling state.
#'
#' Does not require the \code{mallet} package. As long as the
#' supplied file is of the expected format (e.g. from command-line MALLET),
#' this will do the job. Well, if you have the RAM.
#'
#' Each data line is expected to hold six whitespace-separated fields, of
#' which the first (document number), the fifth (word) and the sixth (topic
#' number) are kept. Lines starting with \code{#} at the top of the file are
#' treated as a header and skipped; a \code{#} occurring inside a token is
#' read as part of that token.
#'
#' To get a list of stored document IDs from a model object
#' \code{model}, use \code{model$getDocumentNames()}.
#'
#' @param filename name of gzip file holding the sampling state
#'
#' @param doc_ids character vector of document IDs. If supplied, the
#' \code{doc} column of the resulting dataframe will be populated with
#' these values, looked up by document number. A warning is issued if the
#' state refers to a document number beyond the length of \code{doc_ids}
#' (the corresponding rows get \code{NA}). If it is not supplied, then the
#' \code{doc} column will be document numbers (from 1, not 0 as in the state
#' file).
#'
#' @return a data frame with three columns, \code{doc}, \code{word},
#' and \code{topic}. \code{doc} is either a document index or an ID if
#' \code{doc_ids} is supplied; \code{word} is the token as a string; and
#' \code{topic} is the topic number (counting from 1, not 0)
#'
#' @seealso \code{\link{write_mallet_state}}
#'
#' @export
#'
read_mallet_state <- function (filename, doc_ids=NULL) {
    # Count the "#"-prefixed header lines at the top of the file so that they
    # can be skipped by position. Relying on comment.char="#" instead would
    # also cut off every token that contains "#", leaving that row with too
    # few fields and aborting the read.
    con <- gzfile(filename, "rt")
    on.exit(close(con), add=TRUE)
    n_header <- 0L
    repeat {
        ln <- readLines(con, n=1L, warn=FALSE)
        # useBytes: only the first byte matters, and the first data line may
        # hold text that is not valid in the current locale
        if (length(ln) == 0L || !grepl("^#", ln, useBytes=TRUE)) {
            break
        }
        n_header <- n_header + 1L
    }

    # Columns 2-4 are dropped on read via the "NULL" column class.
    st <- read.table(gzfile(filename),
        header=FALSE, skip=n_header, quote="", as.is=TRUE, comment.char="",
        col.names=c("doc", "skip2", "skip3", "skip4", "word", "topic"),
        colClasses=c("integer", "NULL", "NULL", "NULL", "character", "integer")
    )

    # The state file counts documents and topics from 0; shift both to 1-based.
    st$doc <- st$doc + 1
    if (!is.null(doc_ids)) {
        if (any(st$doc > length(doc_ids), na.rm=TRUE)) {
            warning("The sampling state refers to more documents than ",
                    "doc_ids supplies; unmatched rows get NA in doc.")
        }
        st$doc <- doc_ids[st$doc]
    }
    st$topic <- st$topic + 1

    st
}

#' Save MALLET sampling state to disk
#'
#' A helper function for saving the final sampling state. The work is done
#' by the \code{printState} method of \code{model$model}, which is handed a
#' \code{java.io.File} for the (tilde-expanded) file name.
#'
#' Warning: the sampling state size grows linearly with the number of tokens
#' in the corpus. Expect big files.
#'
#' @param model model object from \code{\link[mallet]{MalletLDA}}.
#'
#' @param filename name of file to save the sampling state to. The result is
#' a gzipped file. A warning is issued if the name does not end in
#' \code{.gz} or \code{.gzip}.
#'
#' @seealso \code{\link{read_mallet_state}}
#'
#' @export
#'
write_mallet_state <- function (model, filename) {
    # Only the file name is checked here; the warning does not stop the write.
    looks_gzipped <- grepl('\\.gz(ip)?$', filename)
    if (!looks_gzipped) {
        warning("Supplied file name (", filename, ") does not end in .gz.",
                "\n", "It will be a gzip archive nonetheless.")
    }

    # "~" is expanded on the R side before the path is passed to Java.
    state_path <- path.expand(filename)
    model$model$printState(new(J("java.io.File"), state_path))
}

#' Save MALLET instances to disk
#'
#' A helper function for saving a MALLET instance list. It calls the
#' \code{save} method of \code{instances} with a \code{java.io.File} for the
#' (tilde-expanded) file name.
#'
#' @param instances instances generated by \code{\link[mallet]{mallet.import}}
#'
#' @param filename name of file to save the instance list to
#'
#' @seealso \code{\link{read_mallet_instances}}
#'
#' @export
#'
write_mallet_instances <- function (instances, filename) {
    # "~" is expanded on the R side before the path is passed to Java.
    out_path <- path.expand(filename)
    instances$save(new(J("java.io.File"), out_path))
}

#' Read MALLET instances from disk
#'
#' A helper function for reading a MALLET instance list back into memory.
#' It calls the static \code{load} method of
#' \code{cc.mallet.types.InstanceList} with a \code{java.io.File} for the
#' (tilde-expanded) file name.
#'
#' If you are only loading instances for modeling, you can skip a step and
#' pass a filename to \code{model$loadDocuments(...)}.
#'
#' @param filename name of file to read MALLET instances from
#'
#' @return reference to an instances list object
#'
#' @seealso \code{\link{write_mallet_instances}}
#'
#' @export
#'
read_mallet_instances <- function (filename) {
    # "~" is expanded on the R side before the path is passed to Java.
    instance_file <- new(J("java.io.File"), path.expand(filename))
    J("cc.mallet.types.InstanceList", "load", instance_file)
}
# agoldst/litdata documentation built on May 10, 2019, 7:34 a.m.