R/summary.R
In fastTopics: Fast Algorithms for Fitting Topic Models and Non-Negative Matrix Factorizations to Count Data

#' @title Summarize Poisson NMF or Multinomial Topic Model Fit
#'
#' @description \code{summary} method for the \dQuote{poisson_nmf_fit}
#'   and \dQuote{multinom_topic_model_fit} classes.
#'
#' @param object An object of class \dQuote{poisson_nmf_fit} or
#'   \dQuote{multinom_topic_model_fit}. The former is usually the result
#'   of calling \code{\link{fit_poisson_nmf}}; the latter is usually the
#'   result of calling \code{\link{fit_topic_model}} or
#'   \code{\link{poisson2multinom}}.
#'
#' @return The functions \code{summary.poisson_nmf_fit} and
#' \code{summary.multinom_topic_model_fit} compute and return a list
#' of statistics summarizing the model fit. The returned list
#' includes some or all of the following elements:
#'
#' \item{n}{The number of rows in the counts matrix, typically the
#'   number of samples.}
#'
#' \item{m}{The number of columns in the counts matrix, typically the
#'   number of observed counts per sample.}
#'
#' \item{k}{The rank of the Poisson NMF or the number of topics.}
#'
#' \item{s}{A vector of length n giving the "size factor" estimates;
#'   these estimates should be equal, or close to, the total counts in
#'   each row of the counts matrix.}
#'
#' \item{numiter}{The number of loadings and/or factor updates
#'   performed.}
#'
#' \item{loglik}{The Poisson NMF log-likelihood.}
#'
#' \item{loglik.multinom}{The multinomial topic model log-likelihood.}
#'
#' \item{dev}{The Poisson NMF deviance.}
#'
#' \item{res}{The maximum residual of the Karush-Kuhn-Tucker (KKT)
#'   first-order optimality conditions. This can be used to assess
#'   convergence of the updates to a (local) solution.}
#'
#' \item{mixprops}{Matrix giving a high-level summary of the
#'   mixture proportions, in which rows correspond to topics, and
#'   columns are ranges of mixture proportionss.}
#'
#' \item{topic.reps}{A matrix in which the ith row gives the mixture
#'   proportions for the sample "most representative" of topic i; by
#'   "most representative", we mean the row (or sample) with the highest
#'   proportion of counts drawn from the topic i.}
#' 
#' @method summary poisson_nmf_fit
#'
#' @export
#' 
summary.poisson_nmf_fit <- function (object, ...) {
  out <- summary.multinom_topic_model_fit(poisson2multinom(object),...)
  class(out) <- c("summary.poisson_nmf_fit","list")
  return(out)
}

#' @rdname summary.poisson_nmf_fit
#'
#' @importFrom stats quantile
#' 
#' @method summary multinom_topic_model_fit
#'
#' @export
#' 
summary.multinom_topic_model_fit <- function (object, ...) {
  verify.fit(object)
  numiter <- nrow(object$progress)
  out <- list(n          = nrow(object$L),
              m          = nrow(object$F),
              k          = ncol(object$F),
              s          = object$s,
              numiter    = numiter,
              loglik     = object$progress[numiter,"loglik"],
              loglik.multinom = object$progress[numiter,"loglik.multinom"],
              dev        = object$progress[numiter,"dev"],
              res        = object$progress[numiter,"res"],
              mixprops   = summarize_mixprops(object$L),
              topic.reps = get_topic_reps(object$L))
  class(out) <- c("summary.multinom_topic_model_fit","list")
  return(out)
}

#' @rdname summary.poisson_nmf_fit
#'
#' @param x An object of class \dQuote{summary.poisson_nmf_fit},
#'   usually a result of a call to \code{summary.poisson_nmf_fit}.
#'
#' @param show.mixprops If \code{TRUE}, print a summary of the mixture
#'   proportions.
#'
#' @param show.topic.reps If \code{TRUE}, print a summary of the topic
#'   representatives.
#' 
#' @param \dots Additional arguments passed to the generic \code{summary}
#'   or \code{print.summary} method.
#'
#' @method print summary.poisson_nmf_fit
#' 
#' @export
#' 
print.summary.poisson_nmf_fit <- function (x, show.mixprops = FALSE,
                                           show.topic.reps = FALSE, ...) {
  print.summary.multinom_topic_model_fit(x,FALSE,show.mixprops,
                                         show.topic.reps,...)
  return(invisible(x))
}

#' @rdname summary.poisson_nmf_fit
#'
#' @param show.size.factors If \code{TRUE}, print a summary of the
#'   size factors.
#'
#' @method print summary.multinom_topic_model_fit
#'
#' @importFrom stats quantile
#' 
#' @export
#'
print.summary.multinom_topic_model_fit <-
  function (x, show.size.factors = FALSE, show.mixprops = FALSE,
            show.topic.reps = FALSE, ...) {
  k <- x$k
  cat("Model overview:\n")
  cat(sprintf("  Number of data rows, n: %d\n",x$n))
  cat(sprintf("  Number of data cols, m: %d\n",x$m))
  cat(sprintf("  Rank/number of topics, k: %d\n",x$k))
  cat(sprintf("Evaluation of model fit (%d updates performed):\n",
              x$numiter))
  cat(sprintf("  Poisson NMF log-likelihood: %+0.12e\n",x$loglik))
  cat(sprintf("  Multinomial topic model log-likelihood: %0.12e\n",
              x$loglik.multinom))
  cat(sprintf("  Poisson NMF deviance: %+0.12e\n",x$dev))
  cat(sprintf("  Max KKT residual: %+0.6e\n",x$res))
  if (!(show.size.factors | show.mixprops | show.topic.reps))
    message("Set show.size.factors = TRUE, show.mixprops = TRUE and/or ",
            "show.topic.reps = TRUE in print(...) for more information")
  if (show.size.factors & !is.null(x$s)) {
    cat(sprintf("Size factors:\n"))
    q        <- quantile(x$s)
    names(q) <- c("Min","1Q","Median","3Q","Max")
    print(q)
  }
  if (show.mixprops) {
    cat(sprintf("Mixture proportions:\n"))
    print(x$mixprops)
  }
  if (show.topic.reps) {
    cat(sprintf("Topic representatives:\n"))
    print(round(x$topic.reps,digits = 3))
  }
  return(invisible(x))
}

#' @title Summarize and Compare Model Fits
#'
#' @description Create a table summarizing the results of fitting one
#'   or more Poisson non-negative matrix factorizations or multinomial
#'   topic models.
#'
#' @param fits An object of class \code{"poisson_nmf_fit"} or
#'   \code{"multinom_topic_model_fit"}, or a non-empty, named list in
#'   which all list elements are Poisson NMF model fits or all
#'   multinomial topic model fits.
#'
#' @return A data frame with one row per element of \code{fits}, and
#' with the following columns:
#'
#' \item{k}{The rank of the matrix factorization.}
#'
#' \item{loglik}{The log-likelihood (either Poisson NMF or multinomial topic
#'   model likelihood) achieved at the last model fitting update.}
#'
#' \item{dev}{For Poisson NMF model fits only, the deviance achieved
#'   at the last model fitting update.}
#'
#' \item{res}{The maximum residual of the Karush-Kuhn-Tucker (KKT)
#'   system achieved at the last model fitting update; small values
#'   indicate that the solution is close to a local maximum, or
#'   stationary point, of the likelihood.}
#'
#' \item{loglik.diff}{The improvement in the log-likelihood relative
#'   to the model fit with the smallest log-likelihood.}
#'
#' \item{dev.diff}{The improvement in the deviance relative to the
#'   model fit with the largest deviance.}
#' 
#' \item{nonzeros.f}{The rate of nonzeros in the factors matrix, as
#'   determined by \code{control$zero.threshold}.}
#'
#' \item{nonzeros.l}{The rate of nonzeros in the loadings matrix, as
#'   determined by \code{control$zero.threshold}.}
#'
#' \item{numiter}{The number of loadings and/or factor updates
#'   performed.}
#' 
#' \item{runtime}{The total runtime (in s) of the model fitting
#'   updates.}
#' 
#' @seealso \code{\link{fit_poisson_nmf}},
#'   \code{\link{fit_topic_model}}
#' 
#' @export
#' 
compare_fits <- function (fits) {

  # Check and process input "fits". It should either be an object of
  # class poisson_nmf_fit or multinom_topic_model_fit, or a list of
  # fit objects.
  if (inherits(fits,"poisson_nmf_fit")) {
    verify.fit(fits)
    fit.name    <- deparse(substitute(fits))          
    fits        <- list(fits)
    names(fits) <- fit.name
  } else {
    msg <- paste("Input argument \"fits\" should either be an object of",
                 "class \"poisson_nmf_fit\", or a non-empty, named list in",
                 "which all list elements are objects of class",
                 "\"poisson_nmf_fit\" or all objects of class",
                 "\"multinom_topic_model_fit\"")
    if (!(is.list(fits) & !is.null(names(fits)) & length(fits) > 0))
      stop(msg)
    if (!(all(sapply(fits,function(x)inherits(x,"poisson_nmf_fit"))) |
          all(sapply(fits,function(x)inherits(x,"multinom_topic_model_fit")))))
      stop(msg)
    if (!all(nchar(names(fits)) > 0))
      stop(msg)
    sapply(fits,verify.fit)
  }

  # Initialize the data structure.
  n   <- length(fits)
  out <- data.frame(k           = rep(0,n),
                    loglik      = rep(0,n),
                    dev         = rep(0,n),
                    loglik.diff = rep(0,n),
                    dev.diff    = rep(0,n),
                    res         = rep(0,n),
                    nonzeros.f  = rep(0,n),
                    nonzeros.l  = rep(0,n),
                    numiter     = rep(0,n),
                    runtime     = rep(0,n),
                    stringsAsFactors = FALSE)
  rownames(out) <- names(fits)
  
  # Collect the information from each fit.
  for (i in 1:n) {
    fit <- fits[[i]]
    x   <- tail(fit$progress,n = 1)
    out[i,"k"]          <- ncol(fit$F)
    out[i,"loglik"]     <- ifelse(inherits(fit,"poisson_nmf_fit"),x$loglik,
                                  x$loglik.multinom)
    out[i,"dev"]        <- ifelse(inherits(fit,"poisson_nmf_fit"),
                                  x$dev,as.numeric(NA))
    out[i,"res"]        <- x$res
    out[i,"nonzeros.f"] <- x$nonzeros.f
    out[i,"nonzeros.l"] <- x$nonzeros.l
    out[i,"numiter"]    <- nrow(fit$progress)
    out[i,"runtime"]    <- sum(fit$progress$timing)
  }

  # Calculate the "loglik.diff" and "dev.diff" entries.
  out$loglik.diff <- out$loglik - min(out$loglik)
  out$dev.diff    <- max(out$dev) - out$dev
  
  # Output the data frame.
  return(out)
}

# Given L, the matrix of mixture proportions, output mixture
# proportion histograms as a k x n matrix, where k is the number of
# topics, and n is the number of bins in the histogram.
summarize_mixprops <- function (L) {
  k   <- ncol(L)
  out <- t(apply(L,2,
                 function (x) as.vector(table(cut(x,c(-1,0.1,0.5,0.9,1))))))
  if (is.null(rownames(out)))
    rownames(out) <- 1:k
  colnames(out) <- c("<0.1","0.1-0.5","0.5-0.9",">0.9")
  return(out)
}
  
# Given a matrix of mixture proportions, L, return a k x k matrix,
# where k is the number of topics, in which each row is a sample (data
# matrix row). The ith row of the matrix contains the mixture
# proportions for the sample "most representative" of the ith topic;
# that is, the row or sample with the largest proportion of counts
# drawn from the ith topic.
get_topic_reps <- function (L) {
  n    <- nrow(L)
  k    <- ncol(L)
  rows <- apply(L,2,which.max)
  out  <- L[rows,]
  if (is.null(rownames(out)))
    rownames(out) <- rows
  if (is.null(colnames(out)))
    colnames(out) <- 1:k
  return(out)
}

Any scripts or data that you put into this service are public.

fastTopics documentation built on Sept. 11, 2024, 5:14 p.m.

rdrr.io home R language documentation Run R code online

CRAN packages Bioconductor packages R-Forge packages GitHub packages

Note that we can't provide technical support on individual packages. You should contact the package authors for that.

fastTopics
Fast Algorithms for Fitting Topic Models and Non-Negative Matrix Factorizations to Count Data

R/summary.R
In fastTopics: Fast Algorithms for Fitting Topic Models and Non-Negative Matrix Factorizations to Count Data

Try the fastTopics package in your browser

R Package Documentation

Browse R Packages

We want your feedback!

fastTopics Fast Algorithms for Fitting Topic Models and Non-Negative Matrix Factorizations to Count Data

R/summary.R In fastTopics: Fast Algorithms for Fitting Topic Models and Non-Negative Matrix Factorizations to Count Data

Try the fastTopics package in your browser

R Package Documentation

Browse R Packages

We want your feedback!

fastTopics
Fast Algorithms for Fitting Topic Models and Non-Negative Matrix Factorizations to Count Data

R/summary.R
In fastTopics: Fast Algorithms for Fitting Topic Models and Non-Negative Matrix Factorizations to Count Data