# featnames -----------
#' Get the feature labels from a dfm
#'
#' Get the features from a document-feature matrix, which are stored as the
#' column names of the [dfm] object.
#' @param x the dfm whose features will be extracted
#' @return character vector of the feature labels
#' @examples
#' dfmat <- dfm(tokens(data_corpus_inaugural))
#'
#' # first 50 features (in original text order)
#' head(featnames(dfmat), 50)
#'
#' # first 50 features alphabetically
#' head(sort(featnames(dfmat)), 50)
#'
#' # contrast with descending total frequency order from topfeatures()
#' names(topfeatures(dfmat, 50))
#' @export
featnames <- function(x) {
UseMethod("featnames")
}
#' @export
#' @noRd
featnames.dfm <- function(x) {
x <- as.dfm(x)
if (is.null(colnames(x))) {
character()
} else {
colnames(x)
}
}
# docnames -----------
#' @noRd
#' @export
docnames.dfm <- function(x) {
rownames(x)
}
# as.dfm -----------
#' Coercion and checking functions for dfm objects
#'
#' Convert an eligible input object into a dfm, or check whether an object is a
#' dfm. Current eligible inputs for coercion to a dfm are: [matrix],
#' (sparse) [Matrix][Matrix::Matrix],
#' \link[tm:matrix]{TermDocumentMatrix} and \link[tm:matrix]{DocumentTermMatrix}
#' (from the \pkg{tm} package), [data.frame], and other [dfm] objects.
#' @param x a candidate object for checking or coercion to [dfm]
#' @return `as.dfm` converts an input object into a [dfm]. Row names
#' are used for docnames, and column names for featnames, of the resulting
#' dfm.
#' @seealso [as.data.frame.dfm()], [as.matrix.dfm()],
#' [convert()]
#' @export
as.dfm <- function(x) {
UseMethod("as.dfm")
}
#' @export
as.dfm.default <- function(x) {
check_class(class(x), "as.dfm")
}
#' @noRd
#' @method as.dfm dfm
#' @export
as.dfm.dfm <- function(x) {
upgrade_dfm(x)
}
#' @noRd
#' @method as.dfm matrix
#' @export
as.dfm.matrix <- function(x) {
matrix2dfm(x)
}
#' @noRd
#' @method as.dfm Matrix
#' @export
as.dfm.Matrix <- function(x) {
matrix2dfm(x)
}
#' @noRd
#' @method as.dfm data.frame
#' @export
as.dfm.data.frame <- function(x) {
matrix2dfm(as.matrix(x, rownames.force = TRUE))
}
#' @noRd
#' @method as.dfm dfmSparse
#' @export
as.dfm.dfmSparse <- function(x) {
as.dfm(as(x, "dgCMatrix"))
}
#' @noRd
#' @method as.dfm DocumentTermMatrix
#' @export
as.dfm.DocumentTermMatrix <- function(x) {
as.dfm(
sparseMatrix(i = x$i, j = x$j, x = x$v,
dims = dim(x),
dimnames = list(docs = rownames(x),
features = colnames(x))))
}
#' @noRd
#' @method as.dfm TermDocumentMatrix
#' @export
as.dfm.TermDocumentMatrix <- function(x) {
as.dfm(
sparseMatrix(i = x$j, j = x$i, x = x$v,
dims = rev(dim(x)),
dimnames = list(docs = colnames(x),
features = rownames(x))))
}
#' Converts a Matrix to a dfm
#' @param x a Matrix
#' @param meta a list of values to be assigned to slots
#' @keywords internal
matrix2dfm <- function(x, docvars = NULL, meta = NULL) {
docname <- rownames(x)
if (nrow(x) > length(docname))
docname <- paste0(quanteda_options("base_docname"), seq_len(nrow(x)))
featname <- colnames(x)
if (ncol(x) > length(featname))
featname <- paste0(quanteda_options("base_featname"), seq_len(ncol(x)))
if (is.null(docvars))
docvars <- make_docvars(nrow(x), docname, FALSE)
if (is.null(meta))
meta <- make_meta("dfm")
if (nrow(x) == 0 && ncol(x) == 0) {
x <- make_null_dfm()
} else {
x <- Matrix(x, sparse = TRUE)
}
build_dfm(
as(as(as(x, "CsparseMatrix"), "generalMatrix"), "dMatrix"),
featname,
docvars = docvars,
meta = meta
)
}
#' @rdname as.dfm
#' @return
#' `is.dfm` returns `TRUE` if and only if its argument is a [dfm].
#' @export
is.dfm <- function(x) {
is(x, "dfm") && isS4(x)
}
# topfeatures -----------
#' Identify the most frequent features in a dfm
#'
#' List the most (or least) frequently occurring features in a [dfm], either
#' as a whole or separated by document.
#' @name topfeatures
#' @param x the object whose features will be returned
#' @param n how many top features should be returned
#' @param decreasing If `TRUE`, return the `n` most frequent features;
#' otherwise return the `n` least frequent features
#' @param scheme one of `count` for total feature frequency (within
#' `group` if applicable), or `docfreq` for the document frequencies
#' of features
#' @inheritParams groups
#' @return A named numeric vector of feature counts, where the names are the
#' feature labels, or a list of these if `groups` is given.
#' @examples
#' dfmat1 <- corpus_subset(data_corpus_inaugural, Year > 1980) |>
#' tokens(remove_punct = TRUE) |>
#' dfm()
#' dfmat2 <- dfm_remove(dfmat1, stopwords("en"))
#'
#' # most frequent features
#' topfeatures(dfmat1)
#' topfeatures(dfmat2)
#'
#' # least frequent features
#' topfeatures(dfmat2, decreasing = FALSE)
#'
#' # top features of individual documents
#' topfeatures(dfmat2, n = 5, groups = docnames(dfmat2))
#'
#' # grouping by president last name
#' topfeatures(dfmat2, n = 5, groups = President)
#'
#' # features by document frequencies
#' tail(topfeatures(dfmat1, scheme = "docfreq", n = 200))
#' @export
topfeatures <- function(x, n = 10, decreasing = TRUE,
scheme = c("count", "docfreq"), groups = NULL) {
UseMethod("topfeatures")
}
#' @export
topfeatures.default <- function(x, n = 10, decreasing = TRUE,
scheme = c("count", "docfreq"), groups = NULL) {
check_class(class(x), "topfeatures")
}
#' @export
#' @noRd
#' @importFrom stats quantile
topfeatures.dfm <- function(x, n = 10, decreasing = TRUE,
scheme = c("count", "docfreq"), groups = NULL) {
if (is.fcm(x)) {
lifecycle::deprecate_stop(
when = "4.0",
what = "quanteda::topfeatures.fcm()"
)
}
x <- as.dfm(x)
if (!nfeat(x) || !ndoc(x)) return(numeric())
if (!is.numeric(n)) stop("n must be a number")
scheme <- match.arg(scheme)
if (!missing(groups) && !is.null(substitute(groups))) {
groupsub <- substitute(groups)
result <- list()
x <- dfm_group(x, eval(groupsub), force = TRUE)
for (i in seq_len(ndoc(x))) {
result[[i]] <- topfeatures(x[i, ], n = n, scheme = scheme,
decreasing = decreasing, groups = NULL)
}
names(result) <- docnames(x)
return(result)
}
if (n > nfeat(x)) n <- nfeat(x)
if (scheme == "count") {
wght <- colSums(x)
} else if (scheme == "docfreq") {
wght <- docfreq(x)
}
result <- sort(wght, decreasing)
return(head(result, n))
}
# sparsity -----------
#' Compute the sparsity of a document-feature matrix
#'
#' Return the proportion of sparseness of a document-feature matrix, equal
#' to the proportion of cells that have zero counts.
#' @param x the document-feature matrix
#' @examples
#' dfmat <- dfm(tokens(data_corpus_inaugural))
#' sparsity(dfmat)
#' sparsity(dfm_trim(dfmat, min_termfreq = 5))
#' @export
sparsity <- function(x) {
UseMethod("sparsity")
}
#' @export
sparsity.default <- function(x) {
check_class(class(x), "sparsity")
}
#' @export
sparsity.dfm <- function(x) {
(1 - length(x@x) / prod(dim(x)))
}
# Internal --------
#' Internal functions for dfm objects
#'
#' Internal function documentation for [dfm] objects.
#' @name dfm-internal
#' @keywords dfm internal
NULL
#' The `Compare` methods enable relational operators to be use with dfm.
#' Relational operations on a dfm with a numeric will return a
#' [lgCMatrix-class][Matrix::lgCMatrix-class] object.
#' @rdname dfm-internal
#' @param e1 a [dfm]
#' @param e2 a numeric value to compare with values in a dfm
#' @export
#' @seealso [Comparison] operators
setMethod("Compare", c("dfm", "numeric"), function(e1, e2) {
as(callGeneric(as(e1, "CsparseMatrix"), e2), "CsparseMatrix")
})
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.