R/textmodel_lsa.R

#' Latent Semantic Analysis
#'
#' Fit the Latent Semantic Analysis scaling model to a [dfm][quanteda::dfm],
#' which may be weighted (for instance using [quanteda::dfm_tfidf()]).
#' @param x the [dfm][quanteda::dfm] on which the model will be fit
#' @param nd the number of dimensions to be included in the output
#' @param margin the margin to be smoothed by the SVD: `"documents"`,
#'   `"features"`, or `"both"` (the default)
#' @returns a `textmodel_lsa` class object, a list containing:
#' * `sk` a numeric vector containing the d values from the SVD
#' * `docs` document coordinates from the SVD (u)
#' * `features` feature coordinates from the SVD (v)
#' * `matrix_low_rank` the low-rank matrix approximation reconstructed as udv'
#' * `data` the input data as a sparse (`CsparseMatrix`) matrix from the \pkg{Matrix} package
#' @author Haiyan Wang and Kohei Watanabe
#' @details [svds][RSpectra::svds] in the \pkg{RSpectra} package is applied to
#'   enable the fast computation of the SVD.
#' @note  The number of dimensions `nd` retained in LSA is an empirical
#'   issue. While a reduction in \eqn{k} can remove much of the noise, keeping
#'   too few dimensions or factors may lose important information.
#' @references
#'   Rosario, B. (2000). [Latent Semantic Indexing: An
#'   Overview](http://www.cse.msu.edu/~cse960/Papers/LSI/LSI.pdf). *Technical
#'   report INFOSYS 240 Spring Paper, University of California, Berkeley.*
#'
#'   Deerwester, S., Dumais, S.T., Furnas, G.W., Landauer, T.K., & Harshman, R.
#'   (1990). [Indexing by Latent Semantic
#'   Analysis](https://www.proquest.com/docview/1301252034). *Journal of the
#'   American Society for Information Science*, 41(6): 391.
#' @examples
#' library("quanteda")
#' dfmat <- dfm(tokens(data_corpus_irishbudget2010))
#' # create an LSA space and return its truncated representation in the low-rank space
#' tmod <- textmodel_lsa(dfmat[1:10, ])
#' head(tmod$docs)
#'
#' # matrix in low_rank LSA space
#' tmod$matrix_low_rank[,1:5]
#'
#' # fold queries into the space generated by dfmat[1:10,]
#' # and return truncated representations of the queries in the new low-rank space
#' pred <- predict(tmod, newdata = dfmat[11:14, ])
#' pred$docs_newspace
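#'
#' # the input dfm may also be weighted before fitting, for instance with tf-idf;
#' # an illustrative sketch: the weighting scheme and nd = 5 are arbitrary choices
#' dfmat_tfidf <- dfm_tfidf(dfmat[1:10, ])
#' tmod_tfidf <- textmodel_lsa(dfmat_tfidf, nd = 5)
#' head(tmod_tfidf$docs)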
#'
#' @keywords textmodel experimental
#' @seealso [predict.textmodel_lsa()], [coef.textmodel_lsa()]
#' @importFrom quanteda as.dfm
#' @export
textmodel_lsa <- function(x, nd = 10, margin = c("both", "documents", "features")) {
    UseMethod("textmodel_lsa")
}

#' @export
textmodel_lsa.dfm <- function(x, nd = 10, margin = c("both", "documents", "features")) {

    x <- as.dfm(x)
    if (!sum(x)) stop(message_error("dfm_empty"))
    margin <- match.arg(margin)

    if (nd > min(nrow(x), ncol(x))) nd <- min(nrow(x), ncol(x))
    if (nd < 2) nd <- 2

    x <- as(x, "CsparseMatrix")
    if (margin == "documents") {
        dec <- RSpectra::svds(x, k = nd, nu = 0, nv = nd)
    } else if (margin == "features") {
        dec <- RSpectra::svds(x, k = nd, nu = nd, nv = 0)
    } else {
        dec <- RSpectra::svds(x, nd)
    }

    if (any(dec$d <= sqrt(.Machine$double.eps)))
        warning("[lsa] - there are singular values which are zero or near zero")

    result <- list(sk = dec$d, docs = NULL, features = NULL)

    if (margin == "documents") {
        result$features <- dec$v
        rownames(result$features) <- colnames(x)
        # component-by-feature reconstruction: diag(d) %*% t(v)
        result$matrix_low_rank <- diag(dec$d) %*% t(dec$v)
        rownames(result$matrix_low_rank) <-
            paste0(quanteda_options("base_compname"), seq_len(nrow(result$matrix_low_rank)))
        colnames(result$matrix_low_rank) <- colnames(x)
    } else if (margin == "features") {
        result$docs <- dec$u
        rownames(result$docs) <- rownames(x)
        # document-by-component reconstruction: u %*% diag(d)
        result$matrix_low_rank <- dec$u %*% diag(dec$d)
        rownames(result$matrix_low_rank) <- rownames(x)
        colnames(result$matrix_low_rank) <-
            paste0(quanteda_options("base_compname"), seq_len(ncol(result$matrix_low_rank)))
    } else {
        result$docs <- dec$u
        result$features <- dec$v
        rownames(result$docs) <- rownames(x)
        rownames(result$features) <- colnames(x)
        result$matrix_low_rank <- dec$u %*% diag(dec$d) %*% t(dec$v)
        rownames(result$matrix_low_rank) <- rownames(x)
        colnames(result$matrix_low_rank) <- colnames(x)
    }

    # keep the input matrix
    result$data <- x
    class(result) <- c("textmodel_lsa")

    # return the LSA space
    return(result)
}

# Post-estimation methods ---------------

#' Post-estimation methods for textmodel_lsa
#'
#' Post-estimation methods for fitted [textmodel_lsa] objects.
#' @name textmodel_lsa-postestimation
#' @param object,x previously fitted [textmodel_lsa] object
#' @param newdata new data (a [dfm][quanteda::dfm] or matrix with the same
#'   features as the fitted model) to be projected into the LSA space
#' @param ... unused
#' @return `predict()` returns a `textmodel_lsa_predicted` object containing
#'   the new documents folded into the fitted LSA space (`docs_newspace`) and
#'   their low-rank reconstruction (`matrix_low_rank`).
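#' @examples
#' # a minimal illustration of the post-estimation methods, assuming a model
#' # fitted with the default margin = "both" so that both document and feature
#' # scores are available
#' library("quanteda")
#' dfmat <- dfm(tokens(data_corpus_irishbudget2010))[1:10, ]
#' tmod <- textmodel_lsa(dfmat, nd = 5)
#' # project the fitted documents back into the LSA space
#' pred <- predict(tmod)
#' pred$docs_newspace[1:2, ]
#' # the low-rank reconstruction returned as a dfm
#' as.dfm(tmod)
#' # first-dimension scores for documents and features
#' head(coef(tmod)$coef_document)
#' head(coef(tmod)$coef_feature)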
#' @importFrom stats predict
#' @method predict textmodel_lsa
#' @keywords textmodel internal
#' @export
predict.textmodel_lsa <- function(object, newdata = NULL, ...) {

    call <- match.call()
    if (is.null(newdata)) newdata <- object$data

    # fold the new documents into the fitted space: q_hat = q %*% V %*% solve(Sigma)
    tsa <- newdata %*% object$features %*% solve(diag(object$sk))
    # reconstruct the low-rank document-by-feature representation: q_hat %*% Sigma %*% t(V)
    transformed <- t(object$features %*% diag(object$sk) %*% t(tsa))

    colnames(transformed) <- rownames(object$features)
    rownames(transformed) <- rownames(newdata)

    result <- list(docs_newspace = tsa,
                   matrix_low_rank = transformed)
    rownames(result$docs_newspace) <- rownames(newdata)
    class(result) <- "textmodel_lsa_predicted"
    return (result)
}

#' @rdname textmodel_lsa-postestimation
#' @method as.dfm textmodel_lsa
#' @export
as.dfm.textmodel_lsa <- function(x) {
    as.dfm(x$matrix_low_rank)
}

#' @rdname textmodel_lsa-postestimation
#' @return `coef()` extracts model coefficients from a fitted
#'   [textmodel_lsa] object.
#' @param doc_dim,feat_dim the document and feature dimension scores to be
#'   extracted
#' @export
coef.textmodel_lsa <- function(object, doc_dim = 1, feat_dim = 1, ...) {
    list(coef_feature = object$features[, feat_dim],
         coef_feature_se = rep(NA, dim(object$features)[1]),
         coef_document = object$docs[, doc_dim],
         coef_document_se = rep(NA, dim(object$docs)[1]))
}

#' @rdname textmodel_lsa-postestimation
#' @export
coefficients.textmodel_lsa <- function(object, doc_dim = 1, feat_dim = 1, ...) {
    coef(object, doc_dim = doc_dim, feat_dim = feat_dim, ...)
}
