# class definition and class functions --------
#' @title textstat_simil/dist classes
#' @description Sparse classes for similarity and distance matrices created by
#' [textstat_simil()] and [textstat_dist()].
#' @name textstat_proxy-class
#' @keywords internal textstat
#' @seealso [textstat_simil()]
NULL
# class definitions and class functions for textstat_proxy --------
# transitional only, to get quanteda.textstats passing on CRAN
#' @title textstat_simil/dist classes
#' @description Sparse classes for similarity and distance matrices created by
#' [quanteda.textstats::textstat_simil()] and
#' [quanteda.textstats::textstat_dist()].
#' @rdname textstat_proxy-class
#' @export
#' @keywords internal textstat
#' @slot .Data a sparse \pkg{Matrix} object, symmetric if selection is
#' `NULL`
#' @slot method the method used for computing similarity or distance
#' @slot min_simil numeric; a threshold for the similarity values below which similarity
#' values are not computed
#' @slot margin identifies the margin of the dfm on which similarity or
#' difference was computed: `"documents"` for documents or
#' `"features"` for word/term features.
#' @slot type either `"textstat_simil"` or `"textstat_dist"`
#' @seealso [quanteda.textstats::textstat_simil()]
setClass("textstat_proxy", contains = "Matrix",
slots = c(method = "character",
margin = "character",
type = "character"))
#' @rdname textstat_proxy-class
#' @slot selection target units, if any
setClass("textstat_dist", contains = c("textstat_proxy", "dgeMatrix"))
#' @rdname textstat_proxy-class
setClass("textstat_dist_symm", contains = c("textstat_proxy", "dspMatrix"))
#' @rdname textstat_proxy-class
setClass("textstat_simil", contains = c("textstat_proxy", "dgeMatrix"))
#' @rdname textstat_proxy-class
#' @export
setClass("textstat_simil_symm", contains = c("textstat_proxy", "dspMatrix"))
#' @rdname textstat_proxy-class
#' @export
setClass("textstat_simil_sparse", contains = c("textstat_proxy", "dgTMatrix"),
slots = c(min_simil = "numeric"))
#' @rdname textstat_proxy-class
setClass("textstat_simil_symm_sparse", contains = c("textstat_proxy", "dsTMatrix"),
slots = c(min_simil = "numeric"))
#' @rdname textstat_proxy-class
validate_min_simil <- function(object) {
if (object@min_simil < -1.0 || object@min_simil > 1.0) {
paste("min_simil must range from -1.0 to 1.0")
} else {
return(TRUE)
}
}
setValidity("textstat_simil_sparse", function(object) {
validate_min_simil(object)
})
setValidity("textstat_simil_symm_sparse", function(object) {
validate_min_simil(object)
})
# #' @slot .Data a sparse \pkg{Matrix} object, symmetric if selection is
# #' `NULL`
# #' @slot method the method used for computing similarity or distance
# #' @slot min_simil numeric; a threshold for the similarity values below which similarity
# #' values are not computed
# #' @slot margin identifies the margin of the dfm on which similarity or
# #' difference was computed: `"documents"` for documents or
# #' `"features"` for word/term features.
# #' @slot type either `"textstat_simil"` or `"textstat_dist"`
# #' @export
# setClass("textstat_proxy", contains = "Matrix",
# slots = c(method = "character",
# margin = "character",
# type = "character"))
#
# #' @rdname textstat_proxy-class
# #' @slot selection target units, if any
# setClass("textstat_dist", contains = c("textstat_proxy", "dgeMatrix"))
#
# #' @rdname textstat_proxy-class
# setClass("textstat_dist_symm", contains = c("textstat_proxy", "dspMatrix"))
#
# #' @rdname textstat_proxy-class
# setClass("textstat_simil", contains = c("textstat_proxy", "dgeMatrix"))
#
# #' @rdname textstat_proxy-class
# setClass("textstat_simil_symm", contains = c("textstat_proxy", "dspMatrix"))
#
# #' @rdname textstat_proxy-class
# setClass("textstat_simil_sparse", contains = c("textstat_proxy", "dgTMatrix"),
# slots = c(min_simil = "numeric"))
#
# #' @rdname textstat_proxy-class
# setClass("textstat_simil_symm_sparse", contains = c("textstat_proxy", "dsTMatrix"),
# slots = c(min_simil = "numeric"))
#
# #' @rdname textstat_proxy-class
# #' @importClassesFrom quanteda textstat_simil_sparse textstat_simil_symm_sparse
# validate_min_simil <- function(object) {
# if (object@min_simil < -1.0 || object@min_simil > 1.0) {
# paste("min_simil must range from -1.0 to 1.0")
# } else {
# return(TRUE)
# }
# }
#
# setValidity("textstat_simil_sparse", function(object) {
# validate_min_simil(object)
# })
#
# setValidity("textstat_simil_symm_sparse", function(object) {
# validate_min_simil(object)
# })
#' Print a textstat_proxy object
#'
#' Print/show method for objects created by `textstat_simil` and
#' `textstat_dist`.
#' @param object the textstat_proxy object to be printed
#' @rdname textstat_proxy-class
#' @export
setMethod("show", "textstat_proxy",
function(object) {
cat(object@type, " object; method = \"", object@method, "\"\n", sep = "")
Matrix::printSpMatrix(as(object, "sparseMatrix"),
zero.print = if ("min_simil" %in% slotNames(object)) "." else 0,
digits = min(getOption("digits"), 3),
col.names = TRUE, align = "right")
})
#' Return the first or last part of a textstat_proxy object
#'
#' For a similarity or distance object computed via [textstat_simil] or
#' [textstat_dist], returns the first or last `n` rows.
#' @param x a textstat_simil/textstat_dist object
#' @param n a single, positive integer. If positive, size for the resulting
#' object: number of first/last documents for the dfm. If negative, all but
#' the n last/first number of documents of x.
#' @param ... unused
#' @return A [matrix] corresponding to the subset defined
#' by `n`.
#' @export
#' @name head.textstat_proxy
#' @method head textstat_proxy
#' @keywords textstat internal
head.textstat_proxy <- function(x, n = 6L, ...) {
head(as.matrix(x), n)
}
#' @rdname head.textstat_proxy
#' @method tail textstat_proxy
#' @export
tail.textstat_proxy <- function(x, n = 6L, ...) {
tail(as.matrix(x), n)
}
setMethod("head", signature(x = "textstat_proxy"), function(x, n = 6L, ...) {
UseMethod("head")
})
setMethod("tail", signature(x = "textstat_proxy"), function(x, n = 6L, ...) {
UseMethod("tail")
})
# core functions ------
#' Similarity and distance computation between documents or features
#'
#' These functions compute matrixes of distances and similarities between
#' documents or features from a [dfm][quanteda::dfm] and return a matrix of
#' similarities or distances in a sparse format. These methods are fast and
#' robust because they operate directly on the sparse [dfm][quanteda::dfm]
#' objects. The output can easily be coerced to an ordinary matrix, a data.frame
#' of pairwise comparisons, or a [dist][stats::dist] format.
#' @param x,y a [dfm][quanteda::dfm] objects; `y` is an optional target matrix
#' matching `x` in the margin on which the similarity or distance will be
#' computed.
#' @param selection (deprecated - use `y` instead).
#' @param margin identifies the margin of the dfm on which similarity or
#' difference will be computed: `"documents"` for documents or
#' `"features"` for word/term features.
#' @param method character; the method identifying the similarity or distance
#' measure to be used; see Details.
#' @param min_simil numeric; a threshold for the similarity values below which similarity
#' values will not be returned
#' @param ... unused
#' @details `textstat_simil` options are: `"correlation"` (default),
#' `"cosine"`, `"jaccard"`, `"ejaccard"`, `"dice"`,
#' `"edice"`, `"simple matching"`, and `"hamann"`.
#' @note If you want to compute similarity on a "normalized" dfm object
#' (controlling for variable document lengths, for methods such as correlation
#' for which different document lengths matter), then wrap the input dfm in
#' `[dfm_weight](x, "prop")`.
#' @return A sparse matrix from the \pkg{Matrix} package that will be symmetric
#' unless `y` is specified.
#'
#' @section Conversion to other data types:
#' The output objects from `textstat_simil()` and `textstat_dist()` can be
#' transformed easily into a list format using
#' [`as.list()`][as.list.textstat_proxy], which returns a list for each unique
#' element of the second of the pairs, a data.frame using
#' [`as.data.frame()`][as.data.frame.textstat_proxy], which returns pairwise
#' scores, `as.dist()`for a \link[stats:dist]{dist} object,
#' or `as.matrix()` to convert it into an ordinary matrix.
#' @export
#' @seealso [as.list.textstat_proxy()], [as.data.frame.textstat_proxy()],
#' \code{\link[stats:dist]{stats::as.dist()}}
#' @examples
#' # similarities for documents
#' library("quanteda")
#' dfmat <- corpus_subset(data_corpus_inaugural, Year > 2000) %>%
#' tokens(remove_punct = TRUE) %>%
#' tokens_remove(stopwords("english")) %>%
#' dfm()
#' (tstat1 <- textstat_simil(dfmat, method = "cosine", margin = "documents"))
#' as.matrix(tstat1)
#' as.list(tstat1)
#' as.list(tstat1, diag = TRUE)
#'
#' # min_simil
#' (tstat2 <- textstat_simil(dfmat, method = "cosine", margin = "documents", min_simil = 0.6))
#' as.matrix(tstat2)
#'
#' # similarities for for specific documents
#' textstat_simil(dfmat, dfmat["2017-Trump", ], margin = "documents")
#' textstat_simil(dfmat, dfmat["2017-Trump", ], method = "cosine", margin = "documents")
#' textstat_simil(dfmat, dfmat[c("2009-Obama", "2013-Obama"), ], margin = "documents")
#'
#' # compute some term similarities
#' tstat3 <- textstat_simil(dfmat, dfmat[, c("fair", "health", "terror")], method = "cosine",
#' margin = "features")
#' head(as.matrix(tstat3), 10)
#' as.list(tstat3, n = 6)
#'
textstat_simil <- function(x, y = NULL, selection = NULL,
margin = c("documents", "features"),
method = c("correlation", "cosine", "jaccard", "ejaccard",
"dice", "edice", "hamann", "simple matching"),
min_simil = NULL, ...) {
UseMethod("textstat_simil")
}
#' @export
textstat_simil.default <- function(x, y = NULL, selection = NULL,
margin = c("documents", "features"),
method = c("correlation", "cosine", "jaccard", "ejaccard",
"dice", "edice", "hamann", "simple matching"),
min_simil = NULL, ...) {
stop(friendly_class_undefined_message(class(x), "textstat_simil"))
}
#' @export
textstat_simil.dfm <- function(x, y = NULL, selection = NULL,
margin = c("documents", "features"),
method = c("correlation", "cosine", "jaccard", "ejaccard",
"dice", "edice", "hamann", "simple matching"),
min_simil = NULL, ...) {
if (!is.null(selection))
.Deprecated(msg = "'selection' is deprecated. Use 'y' instead.")
check_dots(...)
x <- as.dfm(x)
margin <- match.arg(margin)
method[method == "hamman"] <- "hamann" # trap older "hamman" spelling
method <- match.arg(method)
if (margin == "features") {
name <- colnames(x)
} else {
name <- rownames(x)
}
if (is.null(y)) {
if (is.null(selection)) {
i <- seq_along(name)
} else {
if (is.character(selection)) {
i <- match(selection, name)
} else {
if (is.logical(selection))
selection <- which(selection)
i <- selection
i[i < 1 | length(name) < i] <- NA
selection <- featnames(x)[selection]
}
if (any(is.na(i)))
stop(paste(selection[is.na(i)], collapse = ", "), " does not exist")
if (margin == "features") {
y <- x[, i]
} else {
y <- x[i, ]
}
}
} else {
if (!is.dfm(y)) stop("y must be a dfm matching x in the margin specified")
y <- as.dfm(y)
}
temp <- textstat_proxy(x, y, margin, method,
min_proxy = min_simil, use_na = TRUE)
if (is.null(min_simil)) {
if (is(temp, "dsTMatrix")) {
retval <- if (packageVersion("Matrix") < "1.4.2") {
as(as(temp, "dsyMatrix"), "dspMatrix") } else {
as(temp, "packedMatrix")
}
retval <- pack(as(temp, "denseMatrix"))
return(new("textstat_simil_symm",
retval,
method = method, margin = margin,
type = "textstat_simil"))
} else {
retval <- if (packageVersion("Matrix") < "1.4.2") {
as(temp, "dgeMatrix") } else {
as(as(temp, "generalMatrix"), "unpackedMatrix")
}
return(new("textstat_simil",
retval,
method = method, margin = margin,
type = "textstat_simil"))
}
} else {
if (is(temp, "dsTMatrix")) {
return(new("textstat_simil_symm_sparse", temp,
method = method, margin = margin,
type = "textstat_simil",
min_simil = min_simil))
} else {
return(new("textstat_simil_sparse", temp,
method = method, margin = margin,
type = "textstat_simil",
min_simil = min_simil))
}
}
}
#' @rdname textstat_simil
#' @export
#' @param p The power of the Minkowski distance.
#' @details `textstat_dist` options are: `"euclidean"` (default),
#' `"manhattan"`, `"maximum"`, `"canberra"`,
#' and `"minkowski"`.
#' @examples
#'
#' # distances for documents
#' (tstat4 <- textstat_dist(dfmat, margin = "documents"))
#' as.matrix(tstat4)
#' as.list(tstat4)
#' as.dist(tstat4)
#'
#' # distances for specific documents
#' textstat_dist(dfmat, dfmat["2017-Trump", ], margin = "documents")
#' (tstat5 <- textstat_dist(dfmat, dfmat[c("2009-Obama" , "2013-Obama"), ], margin = "documents"))
#' as.matrix(tstat5)
#' as.list(tstat5)
#'
#' \dontrun{
#' # plot a dendrogram after converting the object into distances
#' plot(hclust(as.dist(tstat4)))
#' }
textstat_dist <- function(x, y = NULL, selection = NULL,
margin = c("documents", "features"),
method = c("euclidean",
"manhattan", "maximum", "canberra", "minkowski"),
p = 2, ...) {
UseMethod("textstat_dist")
}
#' @export
textstat_dist.default <- function(x, y = NULL, selection = NULL,
margin = c("documents", "features"),
method = c("euclidean",
"manhattan", "maximum", "canberra", "minkowski"),
p = 2, ...) {
stop(friendly_class_undefined_message(class(x), "textstat_dist"))
}
#' @export
#' @importFrom quanteda featnames is.dfm
#' @importFrom utils packageVersion
textstat_dist.dfm <- function(x, y = NULL, selection = NULL,
margin = c("documents", "features"),
method = c("euclidean",
"manhattan", "maximum", "canberra", "minkowski"),
p = 2, ...) {
if (!is.null(selection))
.Deprecated(msg = "'selection' is deprecated. Use 'y' instead.")
check_dots(...)
x <- as.dfm(x)
margin <- match.arg(margin)
method <- match.arg(method)
if (margin == "features") {
name <- colnames(x)
} else {
name <- rownames(x)
}
if (is.null(y)) {
if (is.null(selection)) {
i <- seq_along(name)
} else {
if (is.character(selection)) {
i <- match(selection, name)
} else {
if (is.logical(selection))
selection <- which(selection)
i <- selection
i[i < 1 | length(name) < i] <- NA
selection <- featnames(x)[selection]
}
if (any(is.na(i)))
stop(paste(selection[is.na(i)], collapse = ", "), " does not exist")
if (margin == "features") {
y <- x[, i]
} else {
y <- x[i, ]
}
}
} else {
if (!is.dfm(y)) stop("y must be a dfm matching x in the margin specified")
y <- as.dfm(y)
}
temp <- textstat_proxy(x, y, margin, method,
p = p, use_na = TRUE)
if (is(temp, "dsTMatrix")) {
retval <- if (packageVersion("Matrix") < "1.4.2") {
as(as(temp, "dsyMatrix"), "dspMatrix") } else {
as(temp, "packedMatrix")
}
return(new("textstat_dist_symm",
retval,
method = method, margin = margin,
type = "textstat_dist"))
} else {
retval <- if (packageVersion("Matrix") < "1.4.2") {
as(temp, "dgeMatrix") } else {
as(as(temp, "generalMatrix"), "unpackedMatrix")
}
return(new("textstat_dist",
retval,
method = method, margin = margin,
type = "textstat_dist"))
}
}
# coercion methods ----------
#' textstat_simil/dist coercion methods
#'
#' Coercion methods for objects created by [textstat_simil()] and
#' [textstat_dist()].
#' @name as.list.textstat_proxy
#' @keywords internal
NULL
#' @rdname as.list.textstat_proxy
#' @method as.list textstat_proxy
#' @param sorted sort results in descending order if `TRUE`
#' @param n the top `n` highest-ranking items will be returned. If n is
#' `NULL`, return all items.
#' @param diag logical; if `FALSE`, exclude the item's comparison with itself
#' @return `as.data.list` for a `textstat_simil` or
#' `textstat_dist` object returns a list equal in length to the columns of the
#' simil or dist object, with the rows and their values as named elements. By default,
#' this list excludes same-time pairs (when `diag = FALSE`) and sorts the values
#' in descending order (when `sorted = TRUE`).
#' @keywords textstat
#' @export
as.list.textstat_proxy <- function(x, sorted = TRUE, n = NULL, diag = FALSE, ...) {
if (!is.null(n) && n < 1)
stop("n must be 1 or greater")
if (!is.null(n) && !sorted) {
warning("ignoring n when sorted = FALSE")
n <- NULL
}
x <- proxy2triplet(x, upper = TRUE)
if (!diag)
x <- diag2na(x)
result <- split(structure(x@x, names = rownames(x)[x@i + 1L]),
f = factor(colnames(x)[x@j + 1], levels = colnames(x)))
if (sorted)
result <- lapply(result, sort, decreasing = TRUE, na.last = TRUE)
if (!is.null(n))
result <- lapply(result, head, n)
# remove any missing
result <- lapply(result, function(y) y[!is.na(y)])
# remove any empty
result <- result[lengths(result) > 0]
return(result)
}
#' @rdname as.list.textstat_proxy
#' @aliases as.data.frame.textstat_proxy
#' @method as.data.frame textstat_proxy
#' @inheritParams base::as.data.frame
#' @param upper logical; if `TRUE`, return pairs as both (A, B) and (B, A)
#' @return `as.data.frame` for a `textstat_simil` or
#' `textstat_dist` object returns a data.frame of pairwise combinations
#' and the and their similarity or distance value.
#' @importFrom stringi stri_sub
#' @export
as.data.frame.textstat_proxy <- function(x, row.names = NULL, optional = FALSE,
diag = FALSE, upper = FALSE, ...) {
method <- x@method
margin <- x@margin
stat <- NULL
if (!isSymmetric(x) && upper)
warning("upper = TRUE has no effect when columns have been selected")
x <- proxy2triplet(x, upper)
if (!diag)
x <- diag2na(x)
all <- unique(c(colnames(x), rownames(x)))
result <- data.frame(x = factor(rownames(x)[x@i + 1L], levels = all),
y = factor(colnames(x)[x@j + 1L], levels = all),
stat = x@x,
stringsAsFactors = FALSE)
result <- subset(result, !is.na(stat))
# replace x and y with margin names
names(result)[1:2] <- paste0(stri_sub(margin, 1, -2), 1:2)
# replace stat with measure name
names(result)[3] <- method
# drop row names
row.names(result) <- NULL
return(result)
}
#' convert same-value pairs to NA in a textstat_proxy object
#'
#' Converts the diagonal, or the same-pair equivalent in an object
#' where the columns have been selected, to NA.
#' @param x the return from [textstat_simil()] or [textstat_dist()]
#' @return sparse Matrix format with same-pair values replaced with `NA`
#' @keywords textstat internal
diag2na <- function(x) {
if (is(x, "dsTMatrix")) {
Matrix::diag(x) <- NA
} else if (is(x, "dgTMatrix")) {
name <- intersect(colnames(x), rownames(x))
i <- match(name, rownames(x))
j <- match(name, colnames(x))
x <- x + Matrix::sparseMatrix(
i = i, j = j, x = NA,
dims = dim(x), dimnames = dimnames(x)
)
x <- as(x, "TsparseMatrix") # as(x, "dgTMatrix")
} else {
stop("x must be a triplet matrix")
}
return(x)
}
proxy2triplet <- function(x, upper) {
if (class(x) %in% c("textstat_dist", "textstat_simil")) {
x <- as(x, "TsparseMatrix")
# x <- as(x, "dgTMatrix")
} else {
if (class(x) %in% c("textstat_dist_symm", "textstat_simil_symm")) {
# x <- as(x, "dsTMatrix")
x <- if (packageVersion("Matrix") < "1.4.2") {
# as(as(x, "dsyMatrix"), "dsTMatrix")
as(unpack(x), "TsparseMatrix")
} else {
as(unpack(x), "TsparseMatrix")
# as(as(x, "unpackedMatrix"), "TsparseMatrix")
}
}
if (upper) {
x <- if (packageVersion("Matrix") < "1.4.2") {
as(x, "dgTMatrix") } else {
as(as(x, "TsparseMatrix"), "generalMatrix")
}
}
}
return(x)
}
#' as.matrix method for textstat_simil_sparse
#'
#' @param x an object returned by [textstat_simil] when `min_simil >
#' 0`
#' @param omitted value that will replace the omitted cells
#' @param ... unused
#' @return a [matrix] object
#' @export
#' @keywords textstat internal
#' @rdname as.matrix.textstat_simil_sparse
setMethod("as.matrix", "textstat_simil_sparse",
function(x, omitted = NA, ...) {
x[x == 0] <- omitted
return(
if (packageVersion("Matrix") < "1.4.2") {
as.matrix(as(x, "dgeMatrix"))
} else {
as.matrix(as(x, "unpackedMatrix"))
}
# as.matrix(unpack(as(x, "denseMatrix")))
)
})
#' @export
#' @keywords textstat internal
#' @rdname as.matrix.textstat_simil_sparse
setMethod("as.matrix", "textstat_simil_symm_sparse",
function(x, omitted = NA, ...) {
x[x == 0] <- omitted
return(
if (packageVersion("Matrix") < "1.4.2") {
as.matrix(as(x, "dgeMatrix"))
} else {
as.matrix(as(x, "unpackedMatrix"))
}
# as.matrix(unpack(as(x, "denseMatrix")))
)
})
# textstat_proxy ---------
#' \[Experimental\] Compute document/feature proximity
#'
#' This is an underlying function for `textstat_dist` and
#' `textstat_simil` but returns `TsparseMatrix`.
#' @keywords internal
#' @param y if a [dfm][quanteda::dfm] object is provided, proximity between
#' documents or features in `x` and `y` is computed.
#' @param use_na if `TRUE`, return `NA` for proximity to empty
#' vectors. Note that use of `NA` makes the proximity matrices denser.
#' @inheritParams textstat_dist
#' @param min_proxy the minimum proximity value to be recoded.
#' @param rank an integer value specifying top-n most proximity values to be
#' recorded.
#' @export
#' @seealso [textstat_dist()], [textstat_simil()]
textstat_proxy <- function(x, y = NULL,
margin = c("documents", "features"),
method = c("cosine", "correlation", "jaccard", "ejaccard",
"dice", "edice", "hamann", "simple matching",
"euclidean", "chisquared", "hamming", "kullback",
"manhattan", "maximum", "canberra", "minkowski"),
p = 2, min_proxy = NULL, rank = NULL, use_na = FALSE) {
x <- as.dfm(x)
if (is.null(y)) {
y <- x
} else {
if (!is.dfm(y))
stop("y must be a dfm")
y <- as.dfm(y)
}
margin <- match.arg(margin)
method[method == "hamman"] <- "hamann" # trap older "hamman" spelling
method <- match.arg(method)
if (margin == "documents") {
f <- union(featnames(x), featnames(y))
x <- t(pad_dfm(x, f))
y <- t(pad_dfm(y, f))
} else {
if (!identical(docnames(x), docnames(y)))
stop("x and y must contain the same documents")
}
if (method %in% c("cosine", "correlation", "jaccard", "ejaccard", "dice", "edice",
"hamann", "simple matching", "faith")) {
if (identical(x, y)) {
suppressWarnings({
result <- proxyC::simil(x, NULL, 2, method, min_simil = min_proxy, rank = rank, use_nan = use_na)
})
} else {
suppressWarnings({
result <- proxyC::simil(x, y, 2, method, min_simil = min_proxy, rank = rank, use_nan = use_na)
})
}
} else {
if (identical(x, y)) {
result <- proxyC::dist(x, NULL, 2, method, p = p)
} else {
result <- proxyC::dist(x, y, 2, method, p = p)
}
}
dimnames(result) <- list(colnames(x), colnames(y))
return(result)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.