R/init_poisson_nmf.R
In fastTopics: Fast Algorithms for Fitting Topic Models and Non-Negative Matrix Factorizations to Count Data

#' @rdname fit_poisson_nmf
#'
#' @param F An optional argument giving is the initial estimate of the
#'   factors (also known as \dQuote{basis vectors}). It should be an m x
#'   k matrix, where m is the number of columns in the counts matrix
#'   \code{X}, and k > 1 is the rank of the matrix factorization
#'   (equivalently, the number of \dQuote{topics}). All entries of
#'   \code{F} should be non-negative. When \code{F} and \code{L} are not
#'   provided, input argument \code{k} should be specified instead.
#'
#' @param L An optional argument giving the initial estimate of the
#'   loadings (also known as \dQuote{activations}). It should be an n x k
#'   matrix, where n is the number of rows in the counts matrix
#'   \code{X}, and k > 1 is the rank of the matrix factorization
#'   (equivalently, the number of \dQuote{topics}). All entries of
#'   \code{L} should be non-negative. When \code{F} and \code{L} are not
#'   provided, input argument \code{k} should be specified instead.
#'
#' @param beta Initial setting of the extrapolation parameter. This is
#'   \eqn{beta} in Algorithm 3 of Ang & Gillis (2019).
#'
#' @param betamax Initial setting for the upper bound on the
#'   extrapolation parameter. This is \eqn{\bar{\gamma}} in Algorithm 3
#'   of Ang & Gillis (2019).
#'
#' @importFrom Matrix rowSums
#' @importFrom RhpcBLASctl blas_set_num_threads
#' @importFrom RhpcBLASctl blas_get_num_procs
#' 
#' @export
#' 
init_poisson_nmf <-
  function (X, F, L, k, init.method = c("topicscore","random"),
            beta = 0.5, betamax = 0.99, control = list(),
            verbose = c("detailed","none")) {

  # Check input X.
  verify.count.matrix(X)

  # Get the number of rows (n) and columns (m) of the data matrix,
  n <- nrow(X)
  m <- ncol(X)

  # Only one of k and (F or L) should be provided.
  if (!(missing(k) & !(missing(F) & missing(L)) |
       (!missing(k) & (missing(F) & missing(L)))))
    stop("Provide a rank (k) or an initialization of F and/or L, but not both")
  if (missing(k)) {
    if (missing(F))
      k <- ncol(L)
    else
      k <- ncol(F)
  }
  if (any(k < 2))
    stop("Matrix factorization rank \"k\" should be 2 or greater")

  # Check and process input argument "init.method".
  init.method <- match.arg(init.method)

  # Check and progress input argument "verbose".
  verbose <- match.arg(verbose)
  
  # Check and process the optimization settings.
  control <- modifyList(fit_poisson_nmf_control_default(),
                        control,keep.null = TRUE)
  ncb <- blas_get_num_procs()
  blas_set_num_threads(control$nc.blas)
  
  # If the factor and loadings matrices are not provided, initialize
  # them using the chosen method. If the factor and loadings matrices
  # are provided, run a few simple checks to verify that they are
  # valid initial estimates.
  if (missing(F) | missing(L)) {

    if (init.method == "random") {

      # Initialize the factors and loadings uniformly at random.
      if (missing(F))
        F <- rand(m,k)
      if (missing(L))
        L <- rand(n,k)
    } else if (init.method == "topicscore") {
      if (!(missing(F) & missing(L)))
        stop("init.method = \"topicscore\" can only be used when both L ",
             "and F are not provided")
      
      # Initialize the factors using the "Topic SCORE" algorithm.
      if (verbose == "detailed")
        cat("Initializing factors using Topic SCORE algorithm.\n")
      F <- tryCatch(topic_score(X,k),
                    error = function (e) {
                      warning("Topic SCORE failure occurred; falling back ",
                              "to init.method == \"random\"")
                      return(NULL)
                    })
      if (is.null(F)) {
        
        # The Topic SCORE algorithm failed; generate a random
        # initialization instead.
        if (verbose == "detailed")
          cat("Topic SCORE failure occurred; using random initialization",
              "instead.\n")
        F <- rand(m,k)
        L <- rand(n,k)
      } else {
        
        # Fit the loadings by running a small number of sequential
        # co-ordinate descent (SCD) updates.
        if (verbose == "detailed")
          cat(sprintf("Initializing loadings by running %d SCD updates.\n",
                      control$init.numiter))
        control$nc <- initialize.multithreading(control$nc,verbose != "none")
        s <- rowSums(X)
        L <- matrix(s,n,k)
        L <- scd_update_loadings(X,L,t(F),1:n,control$init.numiter,
                                 control$nc,control$eps)
      }
    }
    rownames(F) <- colnames(X)
    rownames(L) <- rownames(X)
    colnames(F) <- paste0("k",1:k)
    colnames(L) <- paste0("k",1:k)
  } else {

    # Check the provided factor matrix.
    if (!(is.matrix(F) & is.numeric(F)))
      stop("Input argument \"F\" should be a numeric matrix (is.matrix(F) ",
           "should return TRUE)")
    if (is.integer(F))
      storage.mode(F) <- "double"

    # Check the provided loadings matrix.
    if (!(is.matrix(L) & is.numeric(L)))
      stop("Input argument \"L\" should be a numeric matrix (is.matrix(L) ",
           "should return TRUE)")
    if (is.integer(L))
      storage.mode(L) <- "double"
  }

  # Force the initial estimates to be positive.
  F <- pmax(F,control$minval)
  L <- pmax(L,control$minval)

  # Compute the value of the objective ("loss") function at the
  # initial estimates of the factors and loading.
  loss <- sum(cost(X,L,t(F),control$eps))

  # Restore the BLAS settings.
  blas_set_num_threads(ncb)
  
  # Initialize the data frame for keeping track of the algorithm's
  # progress over time.
  progress        <- as.data.frame(matrix(0,0,13))
  names(progress) <- c("iter","loglik","loglik.multinom","dev","res",
                       "delta.f","delta.l","nonzeros.f","nonzeros.l",
                       "extrapolate","beta","betamax","timing")

  # Return a list containing: F, an initial estimate of the factors;
  # L, an initial estimate of the loadings; Fn and Ln, the
  # non-extrapolated estimates of the factors and loadings, which
  # initially are always the same as F and L; Fy and Ly, the
  # extrapolated estimates of the factors and loadings, which
  # initially are always the same as F and L; "iter", the current
  # iteration number (initially set to zero); "loss" and "loss.fnly",
  # the value of the objective or loss function at the current best
  # and partially extrapolated estimates of the factors and loadings;
  # beta, beta0 and betamax, the initial settings of the extrapolation
  # parameters; and "progress", the initial data frame for keeping
  # track of the algorithm's progress over time.
  fit <- list(F         = F,
              L         = L,
              Fn        = F,
              Ln        = L,
              Fy        = F,
              Ly        = L,
              loss      = loss,
              loss.fnly = loss,
              iter      = 0,
              beta      = beta,
              beta0     = beta,
              betamax   = betamax,
              progress  = progress)
  class(fit) <- c("poisson_nmf_fit","list")
  verify.fit.and.count.matrix(X,fit)
  return(fit)
}

#' @rdname fit_poisson_nmf
#'
#' @param clusters A factor specifying a grouping, or clustering, of
#'   the rows of \code{X}.
#'
#' @param \dots Additional arguments passed to \code{init_poisson_nmf}.
#'
#' @importFrom Matrix rowSums
#' @importFrom Matrix colSums
#'
#' @export
#'
init_poisson_nmf_from_clustering <- function (X, clusters, ...) {

  # Check input X.
  if (!((is.numeric(X) & is.matrix(X)) | is.sparse.matrix(X)))
    stop("Input argument \"X\" should be a numeric matrix (a \"matrix\" or ",
         "a \"dgCMatrix\")")

  # Get the number of rows (n) and columns (m) of the data matrix,
  n <- nrow(X)
  m <- ncol(X)

  # Check "clusters" input.
  if (!(is.factor(clusters) & length(clusters) == n))
    stop("Input argument \"clusters\" should be a factor with one entry ",
         "for each row of \"X\"")
  if (any(table(clusters) == 0))
    stop("Each level must appear at least once in factor \"clusters\"")

  # Initialize the loadings matrix from the clustering.
  k <- nlevels(clusters)
  L <- matrix(0,n,k)
  rownames(L) <- rownames(X)
  colnames(L) <- levels(clusters)
  for (j in levels(clusters)) {
    i      <- which(clusters == j)
    L[i,j] <- 1
  }
  L <- rowSums(X) * L

  # Initialize the factors matrix; the MLEs are available in
  # closed-form in this case.
  F <- matrix(0,m,k)
  rownames(F) <- colnames(X)
  colnames(F) <- levels(clusters)
  for (j in levels(clusters)) {
    i     <- which(clusters == j)
    F[,j] <- colSums(X[i,])/sum(L[i,j])
  }

  # Output the Poisson NMF fit object.
  return(init_poisson_nmf(X,F,L,...))
}

Any scripts or data that you put into this service are public.

fastTopics documentation built on Sept. 11, 2024, 5:14 p.m.

rdrr.io home R language documentation Run R code online

CRAN packages Bioconductor packages R-Forge packages GitHub packages

Note that we can't provide technical support on individual packages. You should contact the package authors for that.

fastTopics
Fast Algorithms for Fitting Topic Models and Non-Negative Matrix Factorizations to Count Data

R/init_poisson_nmf.R
In fastTopics: Fast Algorithms for Fitting Topic Models and Non-Negative Matrix Factorizations to Count Data

Try the fastTopics package in your browser

R Package Documentation

Browse R Packages

We want your feedback!

fastTopics Fast Algorithms for Fitting Topic Models and Non-Negative Matrix Factorizations to Count Data

R/init_poisson_nmf.R In fastTopics: Fast Algorithms for Fitting Topic Models and Non-Negative Matrix Factorizations to Count Data

Try the fastTopics package in your browser

R Package Documentation

Browse R Packages

We want your feedback!

fastTopics
Fast Algorithms for Fitting Topic Models and Non-Negative Matrix Factorizations to Count Data

R/init_poisson_nmf.R
In fastTopics: Fast Algorithms for Fitting Topic Models and Non-Negative Matrix Factorizations to Count Data