fastTopics: Fast Algorithms for Fitting Topic Models and Non-Negative Matrix Factorizations to Count Data

on_cran <- TRUE

# Used to check that x is a vector in which x[i+1] >= x[i] for all i.
expect_nondecreasing <- function (x)
  expect_equal(diff(x) >= 0,rep(TRUE,length(x) - 1))

# Used to check that x is a vector in which x[i+1] <= x[i] for all i.
expect_nonincreasing <- function (x)
  expect_equal(diff(x) <= 0,rep(TRUE,length(x) - 1))

# The Rmosek package on CRAN will not work with REBayes. This function
# is used for some of the tests to check whether the correct Rmosek
# package (the one downloaded from mosek.com) is installed.
skip_if_mixkwdual_doesnt_work <- function() {
  skip_if_not_installed("REBayes")
  skip_if_not_installed("Rmosek")
  skip_if(!is.element("mosek_lptoprob",getNamespaceExports("Rmosek")))
}

# A short convenience function to quickly generate a data set used for
# testing the Poisson NMF optimization methods.
generate_test_data <- function (n, m, k, lmax = 0.5, fmax = 0.5) {
    
  # Generate count data to factorize.
  out <- simulate_count_data(n,m,k,lmax,fmax)
  X   <- out$X
  F   <- out$F
  L   <- out$L

  # Remove any rows that are entirely zeros.
  rows <- which(rowSums(X > 0) > 0)
  X    <- X[rows,]
  L    <- L[rows,]

  # Choose a row with the least number of nonzero counts, and set all
  # the counts to zero except the largest one.
  i <- which.min(rowSums(X > 0))
  j <- which.max(X[i,])
  X[i,-j] <- 0

  # Remove any columns that are entirely zeros.
  cols <- which(colSums(X > 0) > 0)
  X    <- X[,cols]
  F    <- F[cols,]
  
  # Choose a column with the least number of nonzero counts, and set
  # all the counts to zero except the largest one.
  j <- which.min(colSums(X > 0))
  i <- which.max(X[,j])
  X[-i,j] <- 0
  
  # Remove any rows that are entirely zeros.
  rows <- which(rowSums(X > 0) > 0)
  X    <- X[rows,]
  L    <- L[rows,]

  return(list(X = X,F = F,L = L))
}

# Generate data for testing the fitting of the Poisson mixture model.
generate_poismix_data <- function (n, x) {
  m <- length(x)
  L <- rand(n,m)
  w <- as.double(rpois(n,L %*% x))
  return(list(L = L,w = w))
}

# Given an n x k matrix of topic proportions, where n is the number of
# samples and k is the number of topics, force "hard" topic
# assignments; that is, return a topic proportions matrix with all
# ones and zeros, such that the largest topic proportion in each row
# is replaced with a 1.
force_hard_topic_assignments <- function (L)
  as.matrix(Matrix::sparseMatrix(i = 1:nrow(L),j = apply(L,1,which.max),
                                 x = 1,dims = dim(L)))

# Compute the log-likelihoods for a Poisson non-negative matrix
# factorization. This should give the same result as
# loglik_poisson_nmf, but the computation is done less efficiently
# using dpois.
loglik_poisson_nmf_with_dpois <- function (X, fit) {
  Y <- with(fit,tcrossprod(L,F))
  return(rowSums(stats::dpois(X,Y,log = TRUE)))
}

# Compute the deviance for a Poisson non-negative matrix
# factorization. This should give the same result as
# deviance_poisson_nmf, but the computation is done less efficiently
# using "poisson" from the stats package.
deviance_poisson_nmf_with_poisson <- function (X, fit) {
  Y <- with(fit,tcrossprod(L,F))
  return(rowSums(stats::poisson()$dev.resids(X,Y,1)))
}

# Compute the log-likelihoods for a multinomial topic model. This
# should give the same result as loglik_multinom_topic_model, but the
# computation is done less efficiently using dmultinom.
loglik_multinom_topic_model_with_dmultinom <- function (X, fit) {
  n             <- nrow(fit$L)
  loglik        <- rep(0,n)
  names(loglik) <- rownames(X)
  Y             <- with(fit,tcrossprod(L,F))
  for (i in 1:n)
    loglik[i] <- stats::dmultinom(X[i,],prob = Y[i,],log = TRUE)
  return(loglik)
}

# Iterate the given updates for the factors (F) and loadings (L) matrices.
iterate_updates <- function (X, F, L, numiter, update_factors = NULL,
                             update_loadings = NULL, factors_first = TRUE) {
  loglik <- rep(0,numiter)
  dev    <- rep(0,numiter)
  res    <- rep(0,numiter)
  for (i in 1:numiter) {
    if (factors_first) {
      if (!is.null(update_factors))
        F <- update_factors(X,F,L)
      if (!is.null(update_loadings))
        L <- update_loadings(X,F,L)
    } else {
      if (!is.null(update_loadings))
        L <- update_loadings(X,F,L)
      if (!is.null(update_factors))
        F <- update_factors(X,F,L)
    }
    k           <- ncol(F)
    rownames(L) <- rownames(X)
    rownames(F) <- colnames(X)
    colnames(L) <- paste0("k",1:k)
    colnames(F) <- paste0("k",1:k)
    fit         <- list(F = F,L = L)
    class(fit)  <- c("poisson_nmf_fit","list")
    loglik[i]   <- sum(loglik_poisson_nmf(X,fit))
    dev[i]      <- sum(deviance_poisson_nmf(X,fit))
    res[i]      <- with(poisson_nmf_kkt(X,F,L),max(abs(rbind(F,L))))
  }
  return(list(F = F,L = L,loglik = loglik,dev = dev,res = res))
}

stephenslab/fastTopics documentation built on July 4, 2025, 9:20 p.m.

rdrr.io home R language documentation Run R code online

CRAN packages Bioconductor packages R-Forge packages GitHub packages

Note that we can't provide technical support on individual packages. You should contact the package authors for that.

stephenslab/fastTopics
Fast Algorithms for Fitting Topic Models and Non-Negative Matrix Factorizations to Count Data

tests/testthat/helper_functions.R
In stephenslab/fastTopics: Fast Algorithms for Fitting Topic Models and Non-Negative Matrix Factorizations to Count Data

R Package Documentation

Browse R Packages

We want your feedback!

stephenslab/fastTopics Fast Algorithms for Fitting Topic Models and Non-Negative Matrix Factorizations to Count Data

tests/testthat/helper_functions.R In stephenslab/fastTopics: Fast Algorithms for Fitting Topic Models and Non-Negative Matrix Factorizations to Count Data

R Package Documentation

Browse R Packages

We want your feedback!

stephenslab/fastTopics
Fast Algorithms for Fitting Topic Models and Non-Negative Matrix Factorizations to Count Data

tests/testthat/helper_functions.R
In stephenslab/fastTopics: Fast Algorithms for Fitting Topic Models and Non-Negative Matrix Factorizations to Count Data