R/topicmodels.R
In udpipe: Tokenization, Parts of Speech Tagging, Lemmatization and Dependency Parsing with the 'UDPipe' 'NLP' Toolkit

Documented in predict.LDA_VEM

#' @title Predict method for an object of class LDA_VEM or class LDA_Gibbs
#' @description Gives either the predictions to which topic a document belongs or 
#' the term posteriors by topic indicating which terms are emitted by each topic.\cr
#' If you provide in \code{newdata} a document term matrix 
#' for which a document does not contain any text and hence does not have any terms with nonzero entries,
#' the prediction will give as topic prediction NA values (see the examples).
#' @param object an object of class LDA_VEM or LDA_Gibbs as returned by \code{LDA} from the topicmodels package
#' @param newdata a document/term matrix containing data for which to make a prediction
#' @param type either 'topic' or 'terms' for the topic predictions or the term posteriors
#' @param labels a character vector of the same length as the number of topics in the topic model. 
#' Indicating how to label the topics. Only valid for type = 'topic'. 
#' Defaults to topic_prob_001 up to topic_prob_999.
#' @param min_posterior numeric in 0-1 range to output only terms emitted by each 
#' topic which have a posterior probability equal or higher than \code{min_posterior}. 
#' Only used if \code{type} is 'terms'. Provide -1 if you want to keep all values.
#' @param min_terms integer indicating the minimum number of terms to keep in the output when \code{type} is 'terms'. Defaults to 0.
#' @param ... further arguments passed on to topicmodels::posterior
#' @return 
#' \itemize{
#'  \item{in case of type = 'topic': }{a data.table with columns 
#'  doc_id, 
#'  topic (the topic number to which the document is assigned to), 
#'  topic_label (the topic label)
#'  topic_prob (the posterior probability score for that topic), 
#'  topic_probdiff_2nd (the probability score for that topic - the probability score for the 2nd highest topic) 
#'  and the probability scores for each topic as indicated by topic_labelyourownlabel}
#'  \item{n case of type = 'terms': }{a list of data.frames with columns term and prob, 
#'  giving the posterior probability that each term is emitted by the topic}
#' }
#' @export
#' @rdname predict.LDA
#' @aliases predict.LDA
#' @seealso \code{\link[topicmodels]{posterior-methods}}
#' @examples 
#' ## Build document/term matrix on dutch nouns
#' data(brussels_reviews_anno)
#' data(brussels_reviews)
#' x <- subset(brussels_reviews_anno, language == "nl")
#' x <- subset(x, xpos %in% c("JJ"))
#' x <- x[, c("doc_id", "lemma")]
#' x <- document_term_frequencies(x)
#' dtm <- document_term_matrix(x)
#' dtm <- dtm_remove_lowfreq(dtm, minfreq = 10)
#' dtm <- dtm_remove_tfidf(dtm, top = 100)
#' 
#' \dontshow{if(require(topicmodels))\{}
#' ## Fit a topicmodel using VEM
#' library(topicmodels)
#' mymodel <- LDA(x = dtm, k = 4, method = "VEM")
#' 
#' ## Get topic terminology
#' terminology <- predict(mymodel, type = "terms", min_posterior = 0.05, min_terms = 3)
#' terminology
#' 
#' ## Get scores alongside the topic model
#' dtm <- document_term_matrix(x, vocabulary = mymodel@terms)
#' scores <- predict(mymodel, newdata = dtm, type = "topics")
#' scores <- predict(mymodel, newdata = dtm, type = "topics", 
#'                   labels = c("mylabel1", "xyz", "app-location", "newlabel"))
#' head(scores)
#' table(scores$topic)
#' table(scores$topic_label)
#' table(scores$topic, exclude = c())
#' table(scores$topic_label, exclude = c())
#' 
#' ## Fit a topicmodel using Gibbs
#' library(topicmodels)
#' mymodel <- LDA(x = dtm, k = 4, method = "Gibbs")
#' terminology <- predict(mymodel, type = "terms", min_posterior = 0.05, min_terms = 3)
#' scores <- predict(mymodel, type = "topics", newdata = dtm)
#' \dontshow{\} # End of main if statement running only if the required packages are installed}
predict.LDA_VEM <- function(object, newdata, type = c("topics", "terms"), min_posterior=-1, min_terms=0, labels, ...){
  requireNamespace("topicmodels")
  type <- match.arg(type)
  if(type == "topics"){
    newdata <- newdata[, object@terms]
    idx <- Matrix::rowSums(newdata)
    idx_ok <- which(idx > 0)
    idx_nok <- which(idx == 0)
    scores <- topicmodels::posterior(object, newdata = newdata[idx_ok, ], ...)$topics
    if(missing(labels)){
      labels <-  sprintf("topic_%03d", seq_len(ncol(scores)))
      colnames(scores) <- sprintf("topic_%03d", seq_len(ncol(scores)))
    }else{
      colnames(scores) <- sprintf("topic_%s", labels)
    }
    topic <- apply(scores, MARGIN = 1, FUN=which.max)
    topic <- apply(scores, MARGIN = 1, FUN=function(x){
      out <- list()
      out$topic <- which.max(x)
      out$topic_prob <- x[out$topic]
      out$topic_probdiff_2nd <- out$topic_prob - max(x[-out$topic])
      out
    })
    recoder <- function(x, from, to){
      idx <- match(x, from)
      to[idx]
    }
    doc_ids <- rownames(newdata)
    if(is.null(doc_ids)){
      doc_ids <- seq_len(nrow(newdata))
    }
    result <- list(ok = data.table(doc_id = doc_ids[idx_ok],
                                   topic = sapply(topic, FUN=function(x) x$topic),
                                   topic_label = recoder(sapply(topic, FUN=function(x) x$topic), from = seq_along(labels), to = labels),
                                   topic_prob = sapply(topic, FUN=function(x) x$topic_prob),
                                   topic_probdiff_2nd = sapply(topic, FUN=function(x) x$topic_probdiff_2nd),
                                   scores),
                   nok = data.table(doc_id = doc_ids[idx_nok]))
    result <- rbindlist(result, fill = TRUE) 
    result <- setDF(result)
    result <- result[match(doc_ids, result$doc_id), ]
    rownames(result) <- NULL
    return(result)
  }else{
    topicmodel_terms <- function(object, min_posterior, min_terms, labels){
      x <- t(topicmodels::posterior(object, ...)$terms)
      if(missing(labels)){
        labels <-  sprintf("topic_%03d", seq_len(ncol(x)))
      }
      terms <- object@terms
      topicterms <- list()
      for(topic in 1:ncol(x)){
        topicterms[[sprintf("topic_%03d", topic)]] <- x[, topic]
        names(topicterms[[sprintf("topic_%03d", topic)]]) <- terms
      }
      topicterms <- lapply(topicterms, FUN=function(x){
        x <- sort(x, decreasing = TRUE)
        x <- data.frame(term = names(x), prob = as.numeric(x), stringsAsFactors = FALSE)
        if(sum(x$prob >= min_posterior) <= min_terms){
          x <- head(x, n = min_terms)
        }else{
          x <- x[x$prob >= min_posterior, ]  
        }
      })
      topicterms
    }
    result <- topicmodel_terms(object = object, min_posterior = min_posterior, min_terms = min_terms, labels = labels)
  }
  result
}

#' @export
#' @rdname predict.LDA
predict.LDA_Gibbs <- predict.LDA_VEM

Any scripts or data that you put into this service are public.

udpipe documentation built on Jan. 6, 2023, 5:06 p.m.

rdrr.io home R language documentation Run R code online

CRAN packages Bioconductor packages R-Forge packages GitHub packages

Note that we can't provide technical support on individual packages. You should contact the package authors for that.

udpipe
Tokenization, Parts of Speech Tagging, Lemmatization and Dependency Parsing with the 'UDPipe' 'NLP' Toolkit

R/topicmodels.R
In udpipe: Tokenization, Parts of Speech Tagging, Lemmatization and Dependency Parsing with the 'UDPipe' 'NLP' Toolkit

Defines functions predict.LDA_VEM

Documented in predict.LDA_VEM

Try the udpipe package in your browser

R Package Documentation

Browse R Packages

We want your feedback!

udpipe Tokenization, Parts of Speech Tagging, Lemmatization and Dependency Parsing with the 'UDPipe' 'NLP' Toolkit

R/topicmodels.R In udpipe: Tokenization, Parts of Speech Tagging, Lemmatization and Dependency Parsing with the 'UDPipe' 'NLP' Toolkit

Defines functions predict.LDA_VEM

Documented in predict.LDA_VEM

Try the udpipe package in your browser

R Package Documentation

Browse R Packages

We want your feedback!

udpipe
Tokenization, Parts of Speech Tagging, Lemmatization and Dependency Parsing with the 'UDPipe' 'NLP' Toolkit

R/topicmodels.R
In udpipe: Tokenization, Parts of Speech Tagging, Lemmatization and Dependency Parsing with the 'UDPipe' 'NLP' Toolkit