R/textstat_simil.R
In quanteda.textstats: Textual Statistics for the Quantitative Analysis of Textual Data

Documented in as.data.frame.textstat_proxy as.list.textstat_proxy diag2na head.textstat_proxy tail.textstat_proxy textstat_dist textstat_proxy textstat_simil validate_min_simil

# class definition and class functions --------

#' @title textstat_simil/dist classes
#' @description Sparse classes for similarity and distance matrices created by
#'   [textstat_simil()] and [textstat_dist()].
#' @name textstat_proxy-class
#' @keywords internal textstat
#' @seealso [textstat_simil()]
NULL

# class definitions and class functions for textstat_proxy --------

# transitional only, to get quanteda.textstats passing on CRAN

#' @title textstat_simil/dist classes
#' @description Sparse classes for similarity and distance matrices created by
#'   [quanteda.textstats::textstat_simil()] and
#'   [quanteda.textstats::textstat_dist()].
#' @rdname textstat_proxy-class
#' @export
#' @keywords internal textstat
#' @slot .Data a sparse \pkg{Matrix} object, symmetric if selection is
#'   `NULL`
#' @slot method the method used for computing similarity or distance
#' @slot min_simil numeric; a threshold for the similarity values below which similarity
#'   values are not computed
#' @slot margin identifies the margin of the dfm on which similarity or
#'   difference was computed:  `"documents"` for documents or
#'   `"features"` for word/term features.
#' @slot type either `"textstat_simil"` or `"textstat_dist"`
#' @seealso [quanteda.textstats::textstat_simil()]
setClass("textstat_proxy", contains = "Matrix",
         slots = c(method = "character",
                   margin = "character",
                   type = "character"))

#' @rdname textstat_proxy-class
#' @slot selection target units, if any
setClass("textstat_dist", contains = c("textstat_proxy", "dgeMatrix"))

#' @rdname textstat_proxy-class
setClass("textstat_dist_symm", contains = c("textstat_proxy", "dspMatrix"))

#' @rdname textstat_proxy-class
setClass("textstat_simil", contains = c("textstat_proxy", "dgeMatrix"))

#' @rdname textstat_proxy-class
#' @export
setClass("textstat_simil_symm", contains = c("textstat_proxy", "dspMatrix"))

#' @rdname textstat_proxy-class
#' @export
setClass("textstat_simil_sparse", contains = c("textstat_proxy", "dgTMatrix"),
         slots = c(min_simil = "numeric"))

#' @rdname textstat_proxy-class
setClass("textstat_simil_symm_sparse", contains = c("textstat_proxy", "dsTMatrix"),
         slots = c(min_simil = "numeric"))

#' @rdname textstat_proxy-class
validate_min_simil <- function(object) {
    if (object@min_simil < -1.0 || object@min_simil > 1.0) {
        paste("min_simil must range from -1.0 to 1.0")
    } else {
        return(TRUE)
    }
}

setValidity("textstat_simil_sparse", function(object) {
    validate_min_simil(object)
})

setValidity("textstat_simil_symm_sparse", function(object) {
    validate_min_simil(object)
})

# #' @slot .Data a sparse \pkg{Matrix} object, symmetric if selection is
# #'   `NULL`
# #' @slot method the method used for computing similarity or distance
# #' @slot min_simil numeric; a threshold for the similarity values below which similarity
# #'   values are not computed
# #' @slot margin identifies the margin of the dfm on which similarity or
# #'   difference was computed:  `"documents"` for documents or
# #'   `"features"` for word/term features.
# #' @slot type either `"textstat_simil"` or `"textstat_dist"`
# #' @export
# setClass("textstat_proxy", contains = "Matrix",
#          slots = c(method = "character",
#                    margin = "character",
#                    type = "character"))
#
# #' @rdname textstat_proxy-class
# #' @slot selection target units, if any
# setClass("textstat_dist", contains = c("textstat_proxy", "dgeMatrix"))
#
# #' @rdname textstat_proxy-class
# setClass("textstat_dist_symm", contains = c("textstat_proxy", "dspMatrix"))
#
# #' @rdname textstat_proxy-class
# setClass("textstat_simil", contains = c("textstat_proxy", "dgeMatrix"))
#
# #' @rdname textstat_proxy-class
# setClass("textstat_simil_symm", contains = c("textstat_proxy", "dspMatrix"))
#
# #' @rdname textstat_proxy-class
# setClass("textstat_simil_sparse", contains = c("textstat_proxy", "dgTMatrix"),
#          slots = c(min_simil = "numeric"))
#
# #' @rdname textstat_proxy-class
# setClass("textstat_simil_symm_sparse", contains = c("textstat_proxy", "dsTMatrix"),
#          slots = c(min_simil = "numeric"))
#
# #' @rdname textstat_proxy-class
# #' @importClassesFrom quanteda textstat_simil_sparse textstat_simil_symm_sparse
# validate_min_simil <- function(object) {
#     if (object@min_simil < -1.0 || object@min_simil > 1.0) {
#         paste("min_simil must range from -1.0 to 1.0")
#     } else {
#         return(TRUE)
#     }
# }
#
# setValidity("textstat_simil_sparse", function(object) {
#     validate_min_simil(object)
# })
#
# setValidity("textstat_simil_symm_sparse", function(object) {
#     validate_min_simil(object)
# })

#' Print a textstat_proxy object
#'
#' Print/show method for objects created by `textstat_simil` and
#' `textstat_dist`.
#' @param object the textstat_proxy object to be printed
#' @rdname textstat_proxy-class
#' @export
setMethod("show", "textstat_proxy",
          function(object) {
              cat(object@type, " object; method = \"", object@method, "\"\n", sep = "")
              Matrix::printSpMatrix(as(object, "sparseMatrix"),
                                    zero.print = if ("min_simil" %in% slotNames(object)) "." else 0,
                                    digits = min(getOption("digits"), 3),
                                    col.names = TRUE, align = "right")
          })

#' Return the first or last part of a textstat_proxy object
#'
#' For a similarity or distance object computed via [textstat_simil] or
#' [textstat_dist], returns the first or last `n` rows.
#' @param x a textstat_simil/textstat_dist object
#' @param n a single, positive integer.  If positive, size for the resulting
#'   object: number of first/last documents for the dfm. If negative, all but
#'   the n last/first number of documents of x.
#' @param ... unused
#' @return A [matrix] corresponding to the subset defined
#'   by `n`.
#' @export
#' @name head.textstat_proxy
#' @method head textstat_proxy
#' @keywords textstat internal
head.textstat_proxy <- function(x, n = 6L, ...) {
    head(as.matrix(x), n)
}

#' @rdname head.textstat_proxy
#' @method tail textstat_proxy
#' @export
tail.textstat_proxy <- function(x, n = 6L, ...) {
    tail(as.matrix(x), n)
}

setMethod("head", signature(x = "textstat_proxy"), function(x, n = 6L, ...) {
    UseMethod("head")
})
setMethod("tail", signature(x = "textstat_proxy"), function(x, n = 6L, ...) {
    UseMethod("tail")
})

# core functions ------

#' Similarity and distance computation between documents or features
#'
#' These functions compute matrixes of distances and similarities between
#' documents or features from a [dfm][quanteda::dfm] and return a matrix of
#' similarities or distances in a sparse format.  These methods are fast and
#' robust because they operate directly on the sparse [dfm][quanteda::dfm]
#' objects. The output can easily be coerced to an ordinary matrix, a data.frame
#' of pairwise comparisons, or a [dist][stats::dist] format.
#' @param x,y a [dfm][quanteda::dfm] objects; `y` is an optional target matrix
#'   matching `x` in the margin on which the similarity or distance will be
#'   computed.
#' @param selection (deprecated - use `y` instead).
#' @param margin identifies the margin of the dfm on which similarity or
#'   difference will be computed:  `"documents"` for documents or
#'   `"features"` for word/term features.
#' @param method character; the method identifying the similarity or distance
#'   measure to be used; see Details.
#' @param min_simil numeric; a threshold for the similarity values below which similarity
#'   values will not be returned
#' @param ... unused
#' @details `textstat_simil` options are: `"correlation"` (default),
#'   `"cosine"`, `"jaccard"`, `"ejaccard"`, `"dice"`,
#'   `"edice"`, `"simple matching"`, and `"hamann"`.
#' @note If you want to compute similarity on a "normalized" dfm object
#'   (controlling for variable document lengths, for methods such as correlation
#'   for which different document lengths matter), then wrap the input dfm in
#'   `[dfm_weight](x, "prop")`.
#' @return A sparse matrix from the \pkg{Matrix} package that will be symmetric
#'   unless `y` is specified.
#'
#' @section Conversion to other data types:
#'   The output objects from `textstat_simil()` and `textstat_dist()` can be
#'   transformed easily into a list format using
#'   [`as.list()`][as.list.textstat_proxy], which returns a list for each unique
#'   element of the second of the pairs, a data.frame using
#'   [`as.data.frame()`][as.data.frame.textstat_proxy], which returns pairwise
#'   scores, `as.dist()`for a \link[stats:dist]{dist} object,
#'   or `as.matrix()` to convert it into an ordinary matrix.
#' @export
#' @seealso [as.list.textstat_proxy()], [as.data.frame.textstat_proxy()],
#'   \code{\link[stats:dist]{stats::as.dist()}}
#' @examples
#' # similarities for documents
#' library("quanteda")
#' dfmat <- corpus_subset(data_corpus_inaugural, Year > 2000) %>%
#'     tokens(remove_punct = TRUE) %>%
#'     tokens_remove(stopwords("english")) %>%
#'     dfm()
#' (tstat1 <- textstat_simil(dfmat, method = "cosine", margin = "documents"))
#' as.matrix(tstat1)
#' as.list(tstat1)
#' as.list(tstat1, diag = TRUE)
#'
#' # min_simil
#' (tstat2 <- textstat_simil(dfmat, method = "cosine", margin = "documents", min_simil = 0.6))
#' as.matrix(tstat2)
#'
#' # similarities for for specific documents
#' textstat_simil(dfmat, dfmat["2017-Trump", ], margin = "documents")
#' textstat_simil(dfmat, dfmat["2017-Trump", ], method = "cosine", margin = "documents")
#' textstat_simil(dfmat, dfmat[c("2009-Obama", "2013-Obama"), ], margin = "documents")
#'
#' # compute some term similarities
#' tstat3 <- textstat_simil(dfmat, dfmat[, c("fair", "health", "terror")], method = "cosine",
#'                          margin = "features")
#' head(as.matrix(tstat3), 10)
#' as.list(tstat3, n = 6)
#'
textstat_simil <- function(x, y = NULL, selection = NULL,
                           margin = c("documents", "features"),
                           method = c("correlation", "cosine", "jaccard", "ejaccard",
                                      "dice", "edice", "hamann", "simple matching"),
                           min_simil = NULL, ...) {
    UseMethod("textstat_simil")
}

#' @export
textstat_simil.default <- function(x, y = NULL, selection = NULL,
                               margin = c("documents", "features"),
                               method = c("correlation", "cosine", "jaccard", "ejaccard",
                                          "dice", "edice", "hamann", "simple matching"),
                               min_simil = NULL, ...) {
    stop(friendly_class_undefined_message(class(x), "textstat_simil"))
}

#' @export
textstat_simil.dfm <- function(x, y = NULL, selection = NULL,
                               margin = c("documents", "features"),
                               method = c("correlation", "cosine", "jaccard", "ejaccard",
                                          "dice", "edice", "hamann", "simple matching"),
                               min_simil = NULL, ...) {

    if (!is.null(selection))
        .Deprecated(msg = "'selection' is deprecated. Use 'y' instead.")
    check_dots(...)

    x <- as.dfm(x)
    margin <- match.arg(margin)

    method[method == "hamman"] <- "hamann" # trap older "hamman" spelling
    method <- match.arg(method)

    if (margin == "features") {
        name <- colnames(x)
    } else {
        name <- rownames(x)
    }
    if (is.null(y)) {
        if (is.null(selection)) {
            i <- seq_along(name)
        } else {
            if (is.character(selection)) {
                i <- match(selection, name)
            } else {
                if (is.logical(selection))
                    selection <- which(selection)
                i <- selection
                i[i < 1 | length(name) < i] <- NA
                selection <- featnames(x)[selection]
            }
            if (any(is.na(i)))
                stop(paste(selection[is.na(i)], collapse = ", "), " does not exist")
            if (margin == "features") {
                y <- x[, i]
            } else {
                y <- x[i, ]
            }
        }
    } else {
        if (!is.dfm(y)) stop("y must be a dfm matching x in the margin specified")
        y <- as.dfm(y)
    }
    temp <- textstat_proxy(x, y, margin, method,
                           min_proxy = min_simil, use_na = TRUE)

    if (is.null(min_simil)) {
        if (is(temp, "dsTMatrix")) {
            retval <- if (packageVersion("Matrix") < "1.4.2") {
                as(as(temp, "dsyMatrix"), "dspMatrix") } else {
                    as(temp, "packedMatrix")
                }
            retval <- pack(as(temp, "denseMatrix"))
            return(new("textstat_simil_symm",
                       retval,
                       method = method, margin = margin,
                       type = "textstat_simil"))
        } else {
            retval <- if (packageVersion("Matrix") < "1.4.2") {
                as(temp, "dgeMatrix") } else {
                    as(as(temp, "generalMatrix"), "unpackedMatrix")
                }
            return(new("textstat_simil",
                       retval,
                       method = method, margin = margin,
                       type = "textstat_simil"))
        }
    } else {
        if (is(temp, "dsTMatrix")) {
            return(new("textstat_simil_symm_sparse", temp,
                       method = method, margin = margin,
                       type = "textstat_simil",
                       min_simil = min_simil))
        } else {
            return(new("textstat_simil_sparse", temp,
                       method = method, margin = margin,
                       type = "textstat_simil",
                       min_simil = min_simil))
        }
    }
}

#' @rdname textstat_simil
#' @export
#' @param p The power of the Minkowski distance.
#' @details `textstat_dist` options are: `"euclidean"` (default),
#'   `"manhattan"`, `"maximum"`, `"canberra"`,
#'   and `"minkowski"`.
#' @examples
#'
#' # distances for documents
#' (tstat4 <- textstat_dist(dfmat, margin = "documents"))
#' as.matrix(tstat4)
#' as.list(tstat4)
#' as.dist(tstat4)
#'
#' # distances for specific documents
#' textstat_dist(dfmat, dfmat["2017-Trump", ], margin = "documents")
#' (tstat5 <- textstat_dist(dfmat, dfmat[c("2009-Obama" , "2013-Obama"), ], margin = "documents"))
#' as.matrix(tstat5)
#' as.list(tstat5)
#'
#' \dontrun{
#' # plot a dendrogram after converting the object into distances
#' plot(hclust(as.dist(tstat4)))
#' }
textstat_dist <- function(x, y = NULL, selection = NULL,
                          margin = c("documents", "features"),
                          method = c("euclidean",
                                     "manhattan", "maximum", "canberra", "minkowski"),
                          p = 2, ...) {
    UseMethod("textstat_dist")
}

#' @export
textstat_dist.default <- function(x, y = NULL, selection = NULL,
                                  margin = c("documents", "features"),
                                  method = c("euclidean",
                                             "manhattan", "maximum", "canberra", "minkowski"),
                                  p = 2, ...) {
    stop(friendly_class_undefined_message(class(x), "textstat_dist"))
}

#' @export
#' @importFrom quanteda featnames is.dfm
#' @importFrom utils packageVersion
textstat_dist.dfm <- function(x, y = NULL, selection = NULL,
                              margin = c("documents", "features"),
                              method = c("euclidean",
                                         "manhattan", "maximum", "canberra", "minkowski"),
                              p = 2, ...) {
    if (!is.null(selection))
        .Deprecated(msg = "'selection' is deprecated. Use 'y' instead.")

    check_dots(...)
    x <- as.dfm(x)

    margin <- match.arg(margin)
    method <- match.arg(method)

    if (margin == "features") {
        name <- colnames(x)
    } else {
        name <- rownames(x)
    }
    if (is.null(y)) {
        if (is.null(selection)) {
            i <- seq_along(name)
        } else {
            if (is.character(selection)) {
                i <- match(selection, name)
            } else {
                if (is.logical(selection))
                    selection <- which(selection)
                i <- selection
                i[i < 1 | length(name) < i] <- NA
                selection <- featnames(x)[selection]
            }
            if (any(is.na(i)))
                stop(paste(selection[is.na(i)], collapse = ", "), " does not exist")
            if (margin == "features") {
                y <- x[, i]
            } else {
                y <- x[i, ]
            }
        }
    } else {
        if (!is.dfm(y)) stop("y must be a dfm matching x in the margin specified")
        y <- as.dfm(y)
    }
    temp <- textstat_proxy(x, y, margin, method,
                           p = p, use_na = TRUE)

    if (is(temp, "dsTMatrix")) {
        retval <- if (packageVersion("Matrix") < "1.4.2") {
            as(as(temp, "dsyMatrix"), "dspMatrix") } else {
                as(temp, "packedMatrix")
            }
        return(new("textstat_dist_symm",
                   retval,
                   method = method, margin = margin,
                   type = "textstat_dist"))
    } else {
        retval <- if (packageVersion("Matrix") < "1.4.2") {
                as(temp, "dgeMatrix") } else {
                    as(as(temp, "generalMatrix"), "unpackedMatrix")
                }
        return(new("textstat_dist",
                   retval,
                   method = method, margin = margin,
                   type = "textstat_dist"))
    }
}

# coercion methods ----------

#' textstat_simil/dist coercion methods
#'
#' Coercion methods for objects created by [textstat_simil()] and
#' [textstat_dist()].
#' @name as.list.textstat_proxy
#' @keywords internal
NULL

#' @rdname as.list.textstat_proxy
#' @method as.list textstat_proxy
#' @param sorted sort results in descending order if `TRUE`
#' @param n the top `n` highest-ranking items will be returned.  If n is
#'   `NULL`, return all items.
#' @param diag logical; if `FALSE`, exclude the item's comparison with itself
#' @return `as.data.list` for a `textstat_simil` or
#'   `textstat_dist` object returns a list equal in length to the columns of the
#'   simil or dist object, with the rows and their values as named  elements.  By default,
#'   this list excludes same-time pairs (when `diag = FALSE`) and sorts the values
#'   in descending order (when `sorted = TRUE`).
#' @keywords textstat
#' @export
as.list.textstat_proxy <- function(x, sorted = TRUE, n = NULL, diag = FALSE, ...) {
    if (!is.null(n) && n < 1)
        stop("n must be 1 or greater")
    if (!is.null(n) && !sorted) {
        warning("ignoring n when sorted = FALSE")
        n <- NULL
    }

    x <- proxy2triplet(x, upper = TRUE)
    if (!diag)
        x <- diag2na(x)
    result <- split(structure(x@x, names = rownames(x)[x@i + 1L]),
                    f = factor(colnames(x)[x@j + 1], levels = colnames(x)))

    if (sorted)
        result <- lapply(result, sort, decreasing = TRUE, na.last = TRUE)
    if (!is.null(n))
        result <- lapply(result, head, n)
    # remove any missing
    result <- lapply(result, function(y) y[!is.na(y)])
    # remove any empty
    result <- result[lengths(result) > 0]
    return(result)
}

#' @rdname as.list.textstat_proxy
#' @aliases as.data.frame.textstat_proxy
#' @method as.data.frame textstat_proxy
#' @inheritParams base::as.data.frame
#' @param upper logical; if `TRUE`, return pairs as both (A, B) and (B, A)
#' @return `as.data.frame` for a `textstat_simil` or
#'   `textstat_dist` object returns a data.frame of pairwise combinations
#'   and the and their similarity or distance value.
#' @importFrom stringi stri_sub
#' @export
as.data.frame.textstat_proxy <- function(x, row.names = NULL, optional = FALSE,
                                         diag = FALSE, upper = FALSE,  ...) {
    method <- x@method
    margin <- x@margin
    stat <- NULL

    if (!isSymmetric(x) && upper)
        warning("upper = TRUE has no effect when columns have been selected")
    x <- proxy2triplet(x, upper)
    if (!diag)
        x <- diag2na(x)

    all <- unique(c(colnames(x), rownames(x)))
    result <- data.frame(x = factor(rownames(x)[x@i + 1L], levels = all),
                         y = factor(colnames(x)[x@j + 1L], levels = all),
                         stat = x@x,
                         stringsAsFactors = FALSE)
    result <- subset(result, !is.na(stat))

    # replace x and y with margin names
    names(result)[1:2] <- paste0(stri_sub(margin, 1, -2), 1:2)
    # replace stat with measure name
    names(result)[3] <- method
    # drop row names
    row.names(result) <- NULL
    return(result)
}

#' convert same-value pairs to NA in a textstat_proxy object
#'
#' Converts the diagonal, or the same-pair equivalent in an object
#' where the columns have been selected, to NA.
#' @param x the return from [textstat_simil()] or [textstat_dist()]
#' @return sparse Matrix format with same-pair values replaced with `NA`
#' @keywords textstat internal
diag2na <- function(x) {
    if (is(x, "dsTMatrix")) {
        Matrix::diag(x) <- NA
    } else if (is(x, "dgTMatrix")) {
        name <- intersect(colnames(x), rownames(x))
        i <- match(name, rownames(x))
        j <- match(name, colnames(x))
        x <- x + Matrix::sparseMatrix(
            i = i, j = j, x = NA,
            dims = dim(x), dimnames = dimnames(x)
        )
        x <- as(x, "TsparseMatrix") # as(x, "dgTMatrix")
    } else {
        stop("x must be a triplet matrix")
    }
    return(x)
}

proxy2triplet <- function(x, upper) {
    if (class(x) %in% c("textstat_dist", "textstat_simil")) {
        x <- as(x, "TsparseMatrix")
        # x <- as(x, "dgTMatrix")
    } else {
        if (class(x) %in% c("textstat_dist_symm", "textstat_simil_symm")) {
            # x <- as(x, "dsTMatrix")
            x <- if (packageVersion("Matrix") < "1.4.2") {
                    # as(as(x, "dsyMatrix"), "dsTMatrix")
                    as(unpack(x), "TsparseMatrix")
                } else {
                    as(unpack(x), "TsparseMatrix")
                    # as(as(x, "unpackedMatrix"), "TsparseMatrix")
                }
        }
        if (upper) {
            x <- if (packageVersion("Matrix") < "1.4.2") {
                as(x, "dgTMatrix") } else {
                    as(as(x, "TsparseMatrix"), "generalMatrix")
                }
        }
    }
    return(x)
}

#' as.matrix method for textstat_simil_sparse
#'
#' @param x an object returned by [textstat_simil] when `min_simil >
#'   0`
#' @param omitted value that will replace the omitted cells
#' @param ... unused
#' @return a [matrix] object
#' @export
#' @keywords textstat internal
#' @rdname as.matrix.textstat_simil_sparse
setMethod("as.matrix", "textstat_simil_sparse",
          function(x, omitted = NA, ...) {
              x[x == 0] <- omitted
              return(
                  if (packageVersion("Matrix") < "1.4.2") {
                      as.matrix(as(x, "dgeMatrix"))
                  } else {
                      as.matrix(as(x, "unpackedMatrix"))
                  }
                  # as.matrix(unpack(as(x, "denseMatrix")))
              )
          })

#' @export
#' @keywords textstat internal
#' @rdname as.matrix.textstat_simil_sparse
setMethod("as.matrix", "textstat_simil_symm_sparse",
          function(x, omitted = NA, ...) {
              x[x == 0] <- omitted
              return(
                  if (packageVersion("Matrix") < "1.4.2") {
                      as.matrix(as(x, "dgeMatrix"))
                  } else {
                      as.matrix(as(x, "unpackedMatrix"))
                  }
                  # as.matrix(unpack(as(x, "denseMatrix")))
              )
          })

# textstat_proxy ---------

#' \[Experimental\] Compute document/feature proximity
#'
#' This is an underlying function for `textstat_dist` and
#' `textstat_simil` but returns `TsparseMatrix`.
#' @keywords internal
#' @param y if a [dfm][quanteda::dfm] object is provided, proximity between
#'   documents or features in `x` and `y` is computed.
#' @param use_na if `TRUE`, return `NA` for proximity to empty
#'   vectors. Note that use of `NA` makes the proximity matrices denser.
#' @inheritParams textstat_dist
#' @param min_proxy the minimum proximity value to be recoded.
#' @param rank an integer value specifying top-n most proximity values to be
#'   recorded.
#' @export
#' @seealso [textstat_dist()], [textstat_simil()]
textstat_proxy <- function(x, y = NULL,
                           margin = c("documents", "features"),
                           method = c("cosine", "correlation", "jaccard", "ejaccard",
                                      "dice", "edice", "hamann", "simple matching",
                                      "euclidean", "chisquared", "hamming", "kullback",
                                      "manhattan", "maximum", "canberra", "minkowski"),
                           p = 2, min_proxy = NULL, rank = NULL, use_na = FALSE) {
    x <- as.dfm(x)
    if (is.null(y)) {
        y <- x
    } else {
        if (!is.dfm(y))
            stop("y must be a dfm")
        y <- as.dfm(y)
    }

    margin <- match.arg(margin)

    method[method == "hamman"] <- "hamann" # trap older "hamman" spelling
    method <- match.arg(method)

    if (margin == "documents") {
        f <- union(featnames(x), featnames(y))
        x <- t(pad_dfm(x, f))
        y <- t(pad_dfm(y, f))
    } else {
        if (!identical(docnames(x), docnames(y)))
            stop("x and y must contain the same documents")
    }
    if (method %in% c("cosine", "correlation", "jaccard", "ejaccard", "dice", "edice",
                      "hamann", "simple matching", "faith")) {
        if (identical(x, y)) {
            suppressWarnings({
                result <- proxyC::simil(x, NULL, 2, method, min_simil = min_proxy, rank = rank, use_nan = use_na)
            })
        } else {
            suppressWarnings({
                result <- proxyC::simil(x, y, 2, method, min_simil = min_proxy, rank = rank, use_nan = use_na)
            })
        }
    } else {
        if (identical(x, y)) {
            result <- proxyC::dist(x, NULL, 2, method, p = p)
        } else {
            result <- proxyC::dist(x, y, 2, method, p = p)
        }
    }
    dimnames(result) <- list(colnames(x), colnames(y))
    return(result)
}
Any scripts or data that you put into this service are public.
quanteda.textstats documentation built on Sept. 11, 2024, 6:39 p.m.
rdrr.io home R language documentation Run R code online
CRAN packages Bioconductor packages R-Forge packages GitHub packages
Note that we can't provide technical support on individual packages. You should contact the package authors for that.
quanteda.textstats
Textual Statistics for the Quantitative Analysis of Textual Data

R/textstat_simil.R
In quanteda.textstats: Textual Statistics for the Quantitative Analysis of Textual Data

Defines functions textstat_proxy proxy2triplet diag2na as.data.frame.textstat_proxy as.list.textstat_proxy textstat_dist.dfm textstat_dist.default textstat_dist textstat_simil.dfm textstat_simil.default textstat_simil tail.textstat_proxy head.textstat_proxy validate_min_simil

Documented in as.data.frame.textstat_proxy as.list.textstat_proxy diag2na head.textstat_proxy tail.textstat_proxy textstat_dist textstat_proxy textstat_simil validate_min_simil

Try the quanteda.textstats package in your browser

R Package Documentation

Browse R Packages

We want your feedback!

quanteda.textstats Textual Statistics for the Quantitative Analysis of Textual Data

R/textstat_simil.R In quanteda.textstats: Textual Statistics for the Quantitative Analysis of Textual Data

Defines functions textstat_proxy proxy2triplet diag2na as.data.frame.textstat_proxy as.list.textstat_proxy textstat_dist.dfm textstat_dist.default textstat_dist textstat_simil.dfm textstat_simil.default textstat_simil tail.textstat_proxy head.textstat_proxy validate_min_simil

Documented in as.data.frame.textstat_proxy as.list.textstat_proxy diag2na head.textstat_proxy tail.textstat_proxy textstat_dist textstat_proxy textstat_simil validate_min_simil

Try the quanteda.textstats package in your browser

R Package Documentation

Browse R Packages

We want your feedback!

quanteda.textstats
Textual Statistics for the Quantitative Analysis of Textual Data

R/textstat_simil.R
In quanteda.textstats: Textual Statistics for the Quantitative Analysis of Textual Data