R/lfc.R
In fastTopics: Fast Algorithms for Fitting Topic Models and Non-Negative Matrix Factorizations to Count Data

# This is the workhorse function used by de_analysis for computing the
# log-fold change (LFC) statistics using a topic model. It runs a
# simple Markov chain Monte Carlo algorithm to simulate the posterior
# distribution of the LFC statistics, then computes key posterior
# quantities from the simulated Monte Carlo samples.
#
# Most of the input arguments are explained in the documentation
# accompanying de_analysis. Please also bear in mind the following the
# additional points: (1) f0 should be a estimate of the paramter f0 in
# the the "null" model x ~ Poisson(u), with u = s*f0; (2) F and L
# should specify the parameters of the Poisson glm models; that is, F
# should be returned from fit_poisson_models(X,L,...) in which L =
# s*fit$L, and "fit" is a multinomial topic model fit; (3) inputs D
# and U should be ns x k matrices, where k = ncol(F) is the number of
# topics and ns is the number of Monte Carlo samples to simulate, and
# D should contain normally distributed random numbers simulated using
# rnorm(mean = 0,sd = 1, and U should contain uniformly distributed
# random numbers simulated using runif(min = 0,max = 1).
#
# The return value is a list containing five matrices of the same
# dimension as F. Several of these matrices contain posterior
# quantities estimated via MCMC. The matrices are: (1) "est", the
# point estimates of the LFC statistics; (2) "postmean", the posterior
# mean estimates (3) "lower", the estimated lower limits of the HPD
# intervals; (4) "upper", the estimated upper limits of the HPD
# intervals; (5) and "z", the z-scores determined from the LFC
# estimates and the HPD intervals.
#
# Note that all outputted LFC statistics are defined with the base-2
# logarithm.
#
#' @importFrom stats rnorm
#' @importFrom stats runif
#' @importFrom Matrix colSums
#' @importFrom progress progress_bar
compute_lfc_stats <-
  function (X, F, L, f0,
            D = matrix(rnorm(1000*k),1000,ncol(F)),
            U = matrix(runif(1000*k),1000,ncol(F)),
            M = matrix(sample(k,1000*k,replace=TRUE),1000,k)-1,
            lfc.stat = "le", conf.level = 0.68, rw = 0.3, e = 1e-15,
            verbose = TRUE) {

  # Get the number of counts matrix columns (m) and the number of
  # topics (k).
  m <- nrow(F)
  k <- ncol(F)

  # Allocate storage for the outputs.
  est      <- matrix(0,m,k)
  postmean <- matrix(0,m,k)
  lower    <- matrix(0,m,k)
  upper    <- matrix(0,m,k)
  ar       <- matrix(0,m,k)
  dimnames(est)      <- dimnames(F)
  dimnames(postmean) <- dimnames(F)
  dimnames(lower)    <- dimnames(F)
  dimnames(upper)    <- dimnames(F)
  dimnames(ar)       <- dimnames(F)

  # Fill in the outputs, row by row. The core computation is performed
  # by compute_lfc_helper.
  ls <- colSums(L)
  if (verbose)
    pb <- progress_bar$new(total = m)
  for (j in 1:m) {
    if (verbose)
      pb$tick()
    out <- compute_lfc_stats_helper(j,X,F,L,D,U,M,ls,f0,lfc.stat,
                                    conf.level,rw,e)
    est[j,]      <- out$dat["est",]
    postmean[j,] <- out$dat["postmean",]
    lower[j,]    <- out$dat["lower",]
    upper[j,]    <- out$dat["upper",]
    ar[j,]       <- out$ar
  }

  # Compute the z-scores, then output the LFC point estimates (est),
  # the posterior means (postmean), lower and upper limits of the HPD
  # intervals, the z-scores (z), and MCMC acceptance rates (ar).
  return(list(ar       = ar,
              est      = est/log(2),
              postmean = postmean/log(2),
              lower    = lower/log(2),
              upper    = upper/log(2),
              z        = compute_zscores(postmean,lower,upper)))
}

# This is the multithreaded variant of compute_lfc_stats. See the
# comments accompanying compute_lfc_stats for details.
#
#' @importFrom parallel splitIndices
#' @importFrom pbapply pblapply
#' @importFrom pbapply pboptions
compute_lfc_stats_multicore <- function (X, F, L, f0, D, U, M, lfc.stat,
                                         conf.level, rw, e, nc, nsplit = 100,
                                         verbose = TRUE) {
    
  # Get the number of counts matrix columns (m) and the number of
  # topics (k).
  m <- nrow(F)
  k <- ncol(F)
  
  # Split the data.
  nsplit <- min(m,nsplit)
  cols   <- splitIndices(m,nsplit)
  dat    <- vector("list",nsplit)
  for (i in 1:nsplit) {
    j        <- cols[[i]]
    dat[[i]] <- list(X = X[,j,drop = FALSE],F = F[j,,drop = FALSE],f0 = f0[j])
  }

  # Distribute the calculations using pblapply.
  parlapplyf <- function (dat, L, D, U, M, lfc.stat, conf.level, rw, e)
    compute_lfc_stats(dat$X,dat$F,L,dat$f0,D,U,M,lfc.stat,conf.level,rw,e,
                      verbose = FALSE)
  if (verbose)
    op <- pboptions(type = "txt",txt.width = 70)
  else
    op <- pboptions(type = NULL)
  ans <- pblapply(cl = nc,dat,parlapplyf,L,D,U,M,lfc.stat,conf.level,rw,e)
  pboptions(op)

  # Combine the individual compute_lfc_stats outputs, and output the
  # combined result.
  out <- list(ar       = matrix(0,m,k),
              est      = matrix(0,m,k),
              postmean = matrix(0,m,k),
              lower    = matrix(0,m,k),
              upper    = matrix(0,m,k),
              z        = matrix(0,m,k))              
  dimnames(out$ar)       <- dimnames(F)
  dimnames(out$est)      <- dimnames(F)
  dimnames(out$postmean) <- dimnames(F)
  dimnames(out$lower)    <- dimnames(F)
  dimnames(out$upper)    <- dimnames(F)
  dimnames(out$z)        <- dimnames(F)
  for (i in 1:nsplit) {
    j <- cols[[i]]
    out$ar[j,]       <- ans[[i]]$ar
    out$est[j,]      <- ans[[i]]$est
    out$postmean[j,] <- ans[[i]]$postmean
    out$lower[j,]    <- ans[[i]]$lower
    out$upper[j,]    <- ans[[i]]$upper
    out$z[j,]        <- ans[[i]]$z
  }
  return(out)
}

# This implements the core computation for compute_lfc_stats.
compute_lfc_stats_helper <- function (j, X, F, L, D, U, M, ls, f0,
                                      lfc.stat, conf.level, rw, e) {
  k <- ncol(F)
  if (is.sparse.matrix(X)) {
    dat <- get.nonzeros(X,j)
    out <- simulate_posterior_poisson_sparse_rcpp(dat$x,L[dat$i,],ls,
                                                  F[j,],D,U,M,rw,e)
  } else
    out <- simulate_posterior_poisson_rcpp(X[,j],L,F[j,],D,U,M,rw,e)
  if (lfc.stat == "vsnull")
    dat <- compute_lfc_vsnull(F[j,],f0[j],out$samples,conf.level)
  else if (lfc.stat == "le")
    dat <- compute_lfc_le(F[j,],out$samples,conf.level)
  else
    dat <- compute_lfc_pairwise(F[j,],lfc.stat,out$samples,conf.level)
  return(list(ar = out$ar,dat = dat))
}
  
# Compute posterior statistics for LFC comparing against the null
# parameter, LFC(j) = log(fj/f0). The inputs are: f, the estimates of
# the Poisson model parameters; f0, the estimate of the null model
# parameter; samples, an ns x k matrix of Monte Carlo samples used to
# approximate the posterior distribution of g = log(f), where k is the
# number of topics and ns is the number of samples; and conf.level, a
# number between 0 and 1 giving the desired size of the HPD interval.
#
# The return value is a 4 x k matrix containing LFC statistics: (1)
# point estimate (est), posterior mean (postmean), lower limit of the
# HPD interval (lower) and upper limit (upper).
# 
#' @importFrom Matrix colMeans
compute_lfc_vsnull <- function (f, f0, samples, conf.level) {
  est      <- log(f) - log(f0)
  samples  <- samples - log(f0)
  postmean <- colMeans(samples)
  ans      <- compute_hpd_intervals(samples,conf.level)
  return(rbind(est      = est,
               postmean = postmean,
               lower    = ans$lower,
               upper    = ans$upper))
}

# Compute "pairwise" LFC posterior statistics LFC(k) = log(fk,fj). The
# inputs are: f, the estimates of the Poisson model parameters; j, the
# topic to compare with; samples, an ns x k matrix of Monte Carlo
# samples used to approximate the posterior distribution of g =
# log(f), where k is the number of topics and ns is the number of
# samples; and conf.level, a number between 0 and 1 giving the desired
# size of the HPD interval.
#
# The return value is a 4 x k matrix containing LFC statistics: (1)
# point estimate (est), posterior mean (postmean), lower limit of the
# HPD interval (lower) and upper limit (upper). By definition column j
# of this matrix is all zeros.
#
#' @importFrom Matrix colMeans
compute_lfc_pairwise <- function (f, j, samples, conf.level) {
  k        <- length(f)
  est      <- log(f/f[j])
  samples  <- samples - samples[,j]
  postmean <- colMeans(samples)
  ans      <- compute_hpd_intervals(samples,conf.level,setdiff(1:k,j))
  out      <- rbind(est      = est,
                    postmean = postmean,
                    lower    = ans$lower,
                    upper    = ans$upper)
  out[,j]  <- 0
  return(out)
}

# Compute posterior estimates of the "least extreme" LFC statistics
# LFC(j) = log(fj/fk), in which k is the topic other than j that
# yields the LFC statistic closest to zero. The inputs are: f, the
# estimates of the Poisson model parameters; samples, an ns x k matrix
# of Monte Carlo samples used to approximate the posterior
# distribution of g = log(f), where k is the number of topics and ns
# is the number of samples; and conf.level, a number between 0 and 1
# giving the desired size of the HPD interval.
#
# The return value is a 4 x k matrix containing LFC statistics: (1)
# point estimate (est), posterior mean (postmean), lower limit of the
# HPD interval (lower) and upper limit (upper).
# 
#' @importFrom Matrix colMeans
compute_lfc_le <- function (f, samples, conf.level) {
  est      <- drop(le_diff_rcpp(matrix(log(f),1,length(f))))
  samples  <- le_diff_rcpp(samples)
  postmean <- colMeans(samples)
  ans      <- compute_hpd_intervals(samples,conf.level)
  return(rbind(est      = est,
               postmean = postmean,
               lower    = ans$lower,
               upper    = ans$upper))
}

# Output an HPD interval for each column of the samples matrix.
compute_hpd_intervals <- function (samples, conf.level,
                                   cols = seq(1,ncol(samples))) {
  k     <- ncol(samples)
  lower <- rep(0,k)
  upper <- rep(0,k)
  for (i in cols) {
    out      <- hpd(samples[,i],conf.level)
    lower[i] <- out[1]
    upper[i] <- out[2]
  }
  return(list(lower = lower,upper = upper))
}

# Compute z-scores given posterior mean estimates (postmean), and
# lower and upper limits of the HPD intervals (lower, upper).
compute_zscores <- function (postmean, lower, upper) {
  z <- postmean

  # First fix all the negative z-scores.
  i         <- which(postmean < 0)
  postmean1 <- postmean[i]
  upper1    <- upper[i]
  z1        <- postmean1/(upper1 - postmean1)
  z1[upper1 <= postmean1] <- as.numeric(NA)
  z[i]      <- z1

  # Next, fix all the positive z-scores.
  i         <- which(postmean > 0)
  postmean1 <- postmean[i]
  lower1    <- lower[i]
  z1        <- postmean1/(postmean1 - lower1)
  z1[lower1 >= postmean1] <- as.numeric(NA)
  z[i]      <- z1

  return(z)
}

Any scripts or data that you put into this service are public.

fastTopics documentation built on Sept. 11, 2024, 5:14 p.m.

rdrr.io home R language documentation Run R code online

CRAN packages Bioconductor packages R-Forge packages GitHub packages

Note that we can't provide technical support on individual packages. You should contact the package authors for that.

fastTopics
Fast Algorithms for Fitting Topic Models and Non-Negative Matrix Factorizations to Count Data

R/lfc.R
In fastTopics: Fast Algorithms for Fitting Topic Models and Non-Negative Matrix Factorizations to Count Data

Try the fastTopics package in your browser

R Package Documentation

Browse R Packages

We want your feedback!

fastTopics Fast Algorithms for Fitting Topic Models and Non-Negative Matrix Factorizations to Count Data

R/lfc.R In fastTopics: Fast Algorithms for Fitting Topic Models and Non-Negative Matrix Factorizations to Count Data

Try the fastTopics package in your browser

R Package Documentation

Browse R Packages

We want your feedback!

fastTopics
Fast Algorithms for Fitting Topic Models and Non-Negative Matrix Factorizations to Count Data

R/lfc.R
In fastTopics: Fast Algorithms for Fitting Topic Models and Non-Negative Matrix Factorizations to Count Data