Nothing
#' Latent Semantic Analysis
#'
#' Fit the Latent Semantic Analysis scaling model to a [dfm][quanteda::dfm],
#' which may be weighted (for instance using [quanteda::dfm_tfidf()]).
#' @param x the [dfm][quanteda::dfm] on which the model will be fit
#' @param nd the number of dimensions to be included in output
#' @param margin margin to be smoothed by the SVD
#' @returns a `textmodel_lsa` class object, a list containing:
#' * `sk` a numeric vector containing the d values from the SVD
#' * `docs` document coordinates from the SVD (u)
#' * `features` feature coordinates from the SVD (v)
#' * `matrix_low_rank` the multiplication of udv'
#' * `data` the input data as a CSparseMatrix from the \pkg{Matrix} package
#' @author Haiyan Wang and Kohei Watanabe
#' @details [svds][RSpectra::svds] in the \pkg{RSpectra} package is applied to
#' enable the fast computation of the SVD.
#' @note The number of dimensions `nd` retained in LSA is an empirical
#' issue. While a reduction in \eqn{k} can remove much of the noise, keeping
#' too few dimensions or factors may lose important information.
#' @references
#' Rosario, B. (2000).
#' [Latent
#' Semantic Indexing: An Overview](http://www.cse.msu.edu/~cse960/Papers/LSI/LSI.pdf). *Technical report INFOSYS 240 Spring
#' Paper, University of California, Berkeley.*
#'
#' Deerwester, S., Dumais, S.T., Furnas, G.W., Landauer, T.K., &
#' Harshman, R. (1990). [Indexing
#' by Latent Semantic Analysis](https://www.proquest.com/docview/1301252034). *Journal of the American Society for
#' Information Science*, 41(6): 391.
#' @examples
#' library("quanteda")
#' dfmat <- dfm(tokens(data_corpus_irishbudget2010))
#' # create an LSA space and return its truncated representation in the low-rank space
#' tmod <- textmodel_lsa(dfmat[1:10, ])
#' head(tmod$docs)
#'
#' # matrix in low_rank LSA space
#' tmod$matrix_low_rank[,1:5]
#'
#' # fold queries into the space generated by dfmat[1:10,]
#' # and return its truncated versions of its representation in the new low-rank space
#' pred <- predict(tmod, newdata = dfmat[11:14, ])
#' pred$docs_newspace
#'
#' @keywords textmodel experimental
#' @seealso [predict.textmodel_lsa()], [coef.textmodel_lsa()]
#' @importFrom quanteda as.dfm
#' @export
textmodel_lsa <- function(x, nd = 10, margin = c("both", "documents", "features")) {
UseMethod("textmodel_lsa")
}
#' @export
textmodel_lsa.dfm <- function(x, nd = 10, margin = c("both", "documents", "features")) {
x <- as.dfm(x)
if (!sum(x)) stop(message_error("dfm_empty"))
margin <- match.arg(margin)
if (nd > min(nrow(x), ncol(x))) nd <- min(nrow(x), ncol(x))
if (nd < 2) nd <- 2
x <- as(x, "CsparseMatrix")
if (margin == "documents") {
dec <- RSpectra::svds(x, k = nd, nu = 0, nv = nd)
} else if (margin == "features") {
dec <- RSpectra::svds(x, k = nd, nu = nd, nv = 0)
} else {
dec <- RSpectra::svds(x, nd)
}
if (any(dec$d <= sqrt(.Machine$double.eps)))
warning("[lsa] - there are singular values which are zero")
result <- list(sk = dec$d, docs = NULL, features = NULL)
if (margin == "documents") {
result$features <- dec$v
rownames(result$features) <- colnames(x)
result$matrix_low_rank <- t(dec$v * dec$d)
rownames(result$matrix_low_rank) <-
paste0(quanteda_options("base_compname"), seq_len(nrow(result$matrix_low_rank)))
colnames(result$matrix_low_rank) <- colnames(x)
} else if (margin == "features") {
result$docs <- dec$u
rownames(result$docs) <- rownames(x)
result$matrix_low_rank <- dec$u * dec$d
rownames(result$matrix_low_rank) <- rownames(x)
colnames(result$matrix_low_rank) <-
paste0(quanteda_options("base_compname"), seq_len(ncol(result$matrix_low_rank)))
} else {
result$docs <- dec$u
result$features <- dec$v
rownames(result$docs) <- rownames(x)
rownames(result$features) <- colnames(x)
result$matrix_low_rank <- dec$u %*% diag(dec$d) %*% t(dec$v)
rownames(result$matrix_low_rank) <- rownames(x)
colnames(result$matrix_low_rank) <- colnames(x)
}
# keep the input matrix
result$data <- x
class(result) <- c("textmodel_lsa")
# return the LSA space
return(result)
}
# Post-estimation methods ---------------
#' Post-estimations methods for textmodel_lsa
#'
#' Post-estimation methods for fitted [textmodel_lsa] objects.
#' @name textmodel_lsa-postestimation
#' @param object,x previously fitted [textmodel_lsa] object
#' @param newdata new matrix to be transformed into the lsa space
#' @param ... unused
#' @return `predict()` returns a predicted [textmodel_lsa] object, projecting the patterns onto
#' new data.
#' @importFrom stats predict
#' @method predict textmodel_lsa
#' @keywords textmodel internal
#' @export
predict.textmodel_lsa <- function(object, newdata = NULL, ...) {
call <- match.call()
if (is.null(newdata)) newdata <- object$data
tsa <- newdata %*% object$features %*% solve(diag(object$sk))
transfed <- t(object$features %*% diag(object$sk) %*% t(tsa))
colnames(transfed) <- rownames(object$features)
rownames(transfed) <- rownames(newdata)
result <- list(docs_newspace = tsa,
matrix_low_rank = transfed)
rownames(result$docs_newspace) <- rownames(newdata)
class(result) <- "textmodel_lsa_predicted"
return (result)
}
#' @rdname textmodel_lsa-postestimation
#' @method as.dfm textmodel_lsa
#' @export
as.dfm.textmodel_lsa <- function(x) {
as.dfm(x$matrix_low_rank)
}
#' @rdname textmodel_lsa-postestimation
#' @return `coef.textmodel_lsa` extracts model coefficients from a fitted
#' [textmodel_ca] object.
#' @param doc_dim,feat_dim the document and feature dimension scores to be
#' extracted
#' @export
coef.textmodel_lsa <- function(object, doc_dim = 1, feat_dim = 1, ...) {
list(coef_feature = object$features[, feat_dim],
coef_feature_se = rep(NA, dim(object$features)[1]),
coef_document = object$docs[, doc_dim],
coef_document_se = rep(NA, dim(object$docs)[1]))
}
#' @rdname textmodel_lsa-postestimation
#' @export
coefficients.textmodel_lsa <- function(object, doc_dim = 1, feat_dim = 1, ...) {
UseMethod('coef', ...)
}
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.