R/convert.R

Defines functions dfm2dataframe dfm2tripletlist dfm2lsa ijv.to.doc dfm2stm dfm2dtm split_matrix dtm2lda dfm2lda dfm2tm dfm2austin convert.corpus convert.dfm convert.default convert

Documented in convert convert.corpus convert.dfm dfm2austin dfm2dtm dfm2lda dfm2lsa dfm2stm dfm2tm dtm2lda

# convert generics ---------

#' Convert quanteda objects to non-quanteda formats
#'
#' Convert a quanteda [dfm] or [corpus] object to a format useable by other
#' packages. The general function `convert` provides easy conversion from a dfm
#' to the document-term representations used in all other text analysis packages
#' for which conversions are defined.  For [corpus] objects, `convert` provides
#' an easy way to make a corpus and its document variables into a data.frame.
#' @param x a [dfm] or [corpus] to be converted
#' @param to target conversion format, one of:
#'   \describe{ \item{`"lda"`}{a list with components "documents" and "vocab" as
#'   needed by the function
#'   [lda.collapsed.gibbs.sampler][lda::lda.collapsed.gibbs.sampler] from the
#'   \pkg{lda} package}
#'   \item{`"tm"`}{a \link[tm:matrix]{DocumentTermMatrix} from the \pkg{tm}
#'   package.  Note: The \pkg{tm} package version of `as.TermDocumentMatrix()`
#'   allows a `weighting` argument, which supplies a weighting function for
#'   \code{\link[tm:matrix]{TermDocumentMatrix()}}.  Here the default is for
#'   term frequency weighting. If you want a different weighting, apply the
#'   weights after converting using one of the \pkg{tm} functions. For other
#'   available weighting functions from the \pkg{tm} package, see
#'   \link[tm:matrix]{TermDocumentMatrix}.}
#'   \item{`"stm"`}{the  format for the \pkg{stm} package} \item{`"austin"`}{the
#'   `wfm` format from the **austin** package}
#'   \item{`"topicmodels"`}{the "dtm" format as used by the \pkg{topicmodels}
#'   package}
#'   \item{`"lsa"`}{the "textmatrix" format as
#'   used by the \pkg{lsa} package}
#'   \item{`"data.frame"`}{a data.frame of without row.names, in which documents
#'   are rows, and each feature is a variable (for a dfm),
#'    or each text and its document variables form a row (for a corpus)}
#'   \item{`"json"`}{(corpus only) convert a corpus and its document variables
#'   into JSON format, using the format described in
#'   \link[jsonlite:fromJSON]{jsonlite::toJSON()}}
#'   \item{`"tripletlist"`}{a named "triplet" format list consisting of
#'   `document`, `feature`, and `frequency`}
#'   }
#' @param docvars optional data.frame of document variables used as the
#'   `meta` information in conversion to the \pkg{stm} package format.
#'   This aids in selecting the document variables only corresponding to the
#'   documents with non-zero counts.  Only affects the "stm" format.
#' @param omit_empty logical; if `TRUE`, omit empty documents and features
#'   from the converted dfm. This is required for some formats (such as STM)
#'   that do not accept empty documents.  Only used when `to = "lda"` or
#'   `to = "topicmodels"`.  For `to = "stm"` format, `omit_empty` is
#'   always `TRUE`.
#' @param docid_field character; the name of the column containing document
#'   names used when `to = "data.frame"`.  Unused for other conversions.
#' @param ... unused directly
#' @return A converted object determined by the value of `to` (see above).
#'   See conversion target package documentation for more detailed descriptions
#'   of the return formats.
#' @export
#' @examples
#' ## convert a dfm
#'
#' toks <- corpus_subset(data_corpus_inaugural, Year > 1970) |>
#'     tokens()
#' dfmat1 <- dfm(toks)
#'
#' # austin's wfm format
#' identical(dim(dfmat1), dim(convert(dfmat1, to = "austin")))
#'
#' # stm package format
#' stmmat <- convert(dfmat1, to = "stm")
#' str(stmmat)
#'
#' # triplet
#' tripletmat <- convert(dfmat1, to = "tripletlist")
#' str(tripletmat)
#'
#' \dontrun{
#' # tm's DocumentTermMatrix format
#' tmdfm <- convert(dfmat1, to = "tm")
#' str(tmdfm)
#'
#' # topicmodels package format
#' str(convert(dfmat1, to = "topicmodels"))
#'
#' # lda package format
#' str(convert(dfmat1, to = "lda"))
#' }
convert <- function(x, to, ...) {
    UseMethod("convert")
}

#' @noRd
#' @export
convert.default <- function(x, to, ...) {
    check_class(class(x), "convert")
}

#' @rdname convert
#' @export
convert.dfm <- function(x, to = c("lda", "tm", "stm", "austin", "topicmodels",
                                  "lsa", "matrix", "data.frame", "tripletlist"),
                        docvars = NULL, omit_empty = TRUE, docid_field = "doc_id", 
                        ...) {

    x <- as.dfm(x)
    to <- match.arg(to)
    if (!missing(omit_empty)) {
        omit_empty <- check_logical(omit_empty)
    }
    docid_field <- check_character(docid_field)
    check_dots(...)
    
    attrs <- attributes(x)
    if (!is.null(docvars)) {
        if (!is.data.frame(docvars))
            stop("docvars must be a data.frame")
        if (nrow(docvars) != ndoc(x))
            stop("docvars must have the same number of rows as ndoc(x)")
    }

    if (to %in% c("stm", "lda", "topicmodels")) {
        if (field_object(attrs, "weight_tf")$scheme != "count" || field_object(attrs, "weight_df")$scheme != "unary")
            stop("cannot convert a non-count dfm to a topic model format")
    }

    if (!to %in% c("lda", "topicmodels") && !missing(omit_empty)) {
        warning("omit_empty not used for 'to = \"", to, "\"'")
    }

    if (to == "tm") {
        return(dfm2tm(x))
    } else if (to == "lda") {
        return(dfm2lda(x, omit_empty = omit_empty))
    } else if (to == "stm") {
        return(dfm2stm(x, docvars, omit_empty = TRUE))
    } else if (to == "austin") {
        return(dfm2austin(x))
    } else if (to == "topicmodels") {
        return(dfm2dtm(x, omit_empty = omit_empty))
    } else if (to == "lsa") {
        return(dfm2lsa(x))
    } else if (to == "data.frame") {
        return(dfm2dataframe(x, docid_field = docid_field))
    } else if (to == "matrix") {
        return(as.matrix(x))
    } else if (to == "tripletlist") {
        return(dfm2tripletlist(x))
    }
}

#' @rdname convert
#' @inheritParams jsonlite::toJSON
#' @export
#' @examples
#'
#' ## convert a corpus into a data.frame
#'
#' corp <- corpus(c(d1 = "Text one.", d2 = "Text two."),
#'                docvars = data.frame(dvar1 = 1:2, dvar2 = c("one", "two"),
#'                                     stringsAsFactors = FALSE))
#' convert(corp, to = "data.frame")
#' convert(corp, to = "json")
convert.corpus <- function(x, to = c("data.frame", "json"), pretty = FALSE, ...) {
    
    to <- match.arg(to)
    pretty <- check_logical(pretty)
    check_dots(...)
    
    if (to == "data.frame") {
        result <- data.frame(doc_id = docnames(x),
                         text = as.character(x, use.names = FALSE),
                         stringsAsFactors = FALSE,
                         row.names = NULL)
        result <- cbind(result, docvars(x))
        return(result)
    } else if (to == "json") {
        return(jsonlite::toJSON(convert(x, to = "data.frame"), pretty = pretty))
    }
}

# convert.dfm internal --------

#' Convenience wrappers for dfm convert
#'
#' To make the usage as consistent as possible with other packages, quanteda
#' also provides shortcut wrappers to [convert()], designed to be
#' similar in syntax to analogous commands in the packages to whose format they
#' are converting.
#' @param x the dfm to be converted
#' @inheritParams convert
#' @return A converted object determined by the value of `to` (see above).
#'   See conversion target package documentation for more detailed descriptions
#'   of the return formats.
#' @note  Additional coercion methods to base R objects are also available:
#'   \describe{ \item{`[as.data.frame](x)`}{converts a [dfm] into
#'   a [data.frame]}
#'
#'   \item{`[as.matrix](x)`}{converts a [dfm] into a
#'   [matrix]} }
#' @name convert-wrappers
#' @keywords internal
#' @examples
#' dfmat <- corpus_subset(data_corpus_inaugural, Year > 1970) |>
#'     tokens() |>
#'     dfm()
#'
NULL

#' @rdname convert-wrappers
dfm2austin <- function(x) {
    result <- as.matrix(as(x, "dMatrix"))
    names(dimnames(result))[2] <- "words"
    class(result) <- c("wfm", "matrix")
    return(result)
}

#' @rdname convert-wrappers
#' @param weighting a \pkg{tm} weight, see [tm::weightTf()]
dfm2tm <- function(x, weighting = tm::weightTf) {
    attrs <- attributes(x)
    if (!requireNamespace("tm", quietly = TRUE))
        stop("You must install the tm package for this conversion.")
    if (!requireNamespace("slam", quietly = TRUE))
        stop("You must install the slam package for this conversion.")

    if (!(field_object(attrs, "weight_tf")$scheme == "count" && field_object(attrs, "weight_df")$scheme == "unary")) {
        warning("converted DocumentTermMatrix will not have weight attributes set correctly")
    }
    tm::as.DocumentTermMatrix(slam::as.simple_triplet_matrix(x),
                              weighting = weighting)
}

## TODO:
## Implement weight recordings for
## weightTfIdf
## - attr(*, "weighting")= chr [1:2] "term frequency - inverse document frequency" "tf-idf"
## - attr(*, "weighting")= chr [1:2] "term frequency - inverse document frequency (normalized)" "tf-idf"
## weightTf
## - attr(*, "weighting")= chr [1:2] "term frequency" "tf"
## weightSMART
## - attr(*, "weighting")= chr [1:2] "SMART ntc" "SMART"  (e.g.)

#' @rdname convert-wrappers
#' @details
#' `dfm2lda` provides converts a [dfm] into the list representation
#' of terms in documents used by the \pkg{lda} package (a list with components
#' "documents" and "vocab" as needed by
#'   [lda::lda.collapsed.gibbs.sampler()]).
#' @examples
#' \dontrun{
#' # shortcut conversion to lda package list format
#' identical(quanteda:::dfm2lda(dfmat), convert(dfmat, to = "lda"))
#' }
#'
#' @keywords internal
dfm2lda <- function(x, omit_empty = TRUE) {
    x <- as.dfm(x)
    if (!requireNamespace("tm", quietly = TRUE))
        stop("You must install the slam package for this conversion.")
    dtm2lda(dfm2dtm(x, omit_empty = omit_empty), omit_empty = omit_empty)
}

#' @rdname convert-wrappers
#' @details
#' `dfm2ldaformat` provides converts a [dfm] into the list
#' representation of terms in documents used by the \pkg{lda} package (a list
#' with components "documents" and "vocab" as needed by
#' [lda::lda.collapsed.gibbs.sampler()]).
#' @examples
#' \dontrun{
#' # shortcut conversion to lda package list format
#' identical(dfm2ldaformat(dfmat), convert(dfmat, to = "lda"))
#' }
#' @keywords internal
dtm2lda <- function(x, omit_empty = TRUE) {
    if (!requireNamespace("slam", quietly = TRUE))
        stop("You must install the slam package for this conversion.")

    docs <- vector(mode = "list", length = nrow(x))
    names(docs) <- rownames(x)

    docs[slam::row_sums(x) > 0] <- split_matrix(rbind(as.integer(x$j) - 1L,
                                                      as.integer(x$v)),
                                                as.integer(x$i))
    if (omit_empty) {
        docs[slam::row_sums(x) == 0] <- NULL
    } else {
        docs[slam::row_sums(x) == 0] <- rep(list(matrix(integer(), ncol = 0, nrow = 2)),
                                             sum(slam::row_sums(x) == 0))
    }
    list(documents = docs, vocab = colnames(x))
}

# internal function for dtm2lda
split_matrix <- function(x, f, drop = FALSE, ...) {
    lapply(split(seq_len(ncol(x)),
                 f, drop = drop, ...), function(ind) x[, ind, drop = FALSE])
}

#' @rdname convert-wrappers
dfm2dtm <- function(x, omit_empty = TRUE) {
    if (!requireNamespace("tm", quietly = TRUE))
        stop("You must install the tm package for this conversion.")
    if (!requireNamespace("slam", quietly = TRUE))
        stop("You must install the slam package for this conversion.")

    x <- as.dfm(x)
    x <- as(x, "TsparseMatrix")
    if (omit_empty)
        x <- x[rowSums(x) > 0, ]
    tm::as.DocumentTermMatrix(slam::as.simple_triplet_matrix(x), tm::weightTf)
}

#' @rdname convert-wrappers
dfm2stm <- function(x, docvars = NULL, omit_empty = TRUE) {
    # get docvars (if any)
    if (is.null(docvars))
        docvars <- docvars(x)

    # sort features into alphabetical order
    x <- x[, order(featnames(x))]

    # deal with empty documents
    empty_docs <- rowSums(x) == 0
    if (omit_empty) {
        if (sum(empty_docs) > 0)
            warning("Dropped ", format(length(empty_docs), big.mark = ","), 
                                       " empty document(s)")

        empty_feats <- colSums(x) == 0
        if (sum(empty_feats) > 0)
            warning("Dropped " , format(length(empty_feats), big.mark = ","), 
                                       " zero-count feature(s)")

        x <- x[!empty_docs, !empty_feats]
        docvars <- docvars[!empty_docs, , drop = FALSE]
    } else {
        stop("omit_empty = FALSE not implemented for STM format")
    }

    # convert counts to STM documents format
    x <- as(x, "TsparseMatrix")
    docs <- ijv.to.doc(x@i + 1, x@j + 1, x@x)
    names(docs) <- rownames(x)
    list(documents = docs, vocab = colnames(x), meta = docvars)
}

# internal function for dfm2stm
ijv.to.doc <- function(i, j, v) {
    index <- split(j, i)
    index <- lapply(index, as.integer)
    count <- split(v, i)
    count <- lapply(count, as.integer)
    mapply(rbind, index, count, SIMPLIFY = FALSE)
}

#' Convert a dfm to an lsa "textmatrix"
#'
#' Converts a dfm to a textmatrix for use with the lsa package.
#' @param x dfm to be converted
#' @examples
#' \dontrun{
#' (dfmat <- dfm(tokens(c(d1 = "this is a first matrix",
#'                        d2 = "this is second matrix as example"))))
#' lsa::lsa(convert(dfmat, to = "lsa"))
#' }
#' @keywords internal
dfm2lsa <- function(x) {
    if (!requireNamespace("lsa", quietly = TRUE))
        stop("You must install the lsa package for this conversion.")
    x <- t(as.matrix(x))
    names(dimnames(x))[1] <- "terms"
    lsa::as.textmatrix(x)
}

dfm2tripletlist <- function(x) {
    feat <- featnames(x)
    doc <- docnames(x)
    x <- as(x, "TsparseMatrix")
    list(
        document = doc[x@i + 1],
        feature = feat[x@j + 1],
        frequency = x@x
    )
}

dfm2dataframe <- function(x, row.names = NULL, ..., document = docnames(x),
                          docid_field = "doc_id", check.names = FALSE) {
    if (!(is.character(document) || is.null(document)))
        stop("document must be character or NULL")
    result <- data.frame(as.matrix(x), row.names = row.names,
                         check.names = check.names)
    if (!is.null(document)) {
        if (docid_field %in% names(result)) {
            stop("'", docid_field, "' matches a feature in the dfm; use a different docid_field value")
        }
        result <- cbind(document, result, stringsAsFactors = FALSE)
        names(result)[1] <- docid_field
    }
    return(result)
}
quanteda/quanteda documentation built on April 30, 2024, 3:13 p.m.