#' Exponential Family Matrix Factorization
#'
#' @description Collins et al. (2001)'s Exponential Family PCA
#'
#' @param x matrix of either binary, proportions, count, or continuous data
#' @param k dimension
#' @param family exponential family distribution of data
#' @param weights an optional matrix of the same size as the \code{x} with data weights
#' @param quiet logical; whether the calculation should give feedback
#' @param max_iters maximum number of iterations
#' @param conv_criteria convergence criteria
#' @param partial_decomp logical; if \code{TRUE}, the function uses the RSpectra package
#' to more quickly calculate the SVD. When the number of columns is small,
#' the approximation may be less accurate and slower
#' @param random_start whether to randomly initialize \code{A} and \code{B}
#' @param start_A initial value for \code{A}
#' @param start_B initial value for \code{B}
#' @param mu specific value for \code{mu}, the mean vector of \code{x}
#' @param main_effects logical; whether to include main effects in the model
#' @param method which algorithm to use. \code{"als"} uses alternating least squares.
#' It has the benefit of majozing row-wise and column-wise for each of the updates.
#' \code{"svd"} uses singular value decomposition (similar to de Leeuw, 2006). It has to
#' a more gereral majorization, which may not work well for heterogeneous matrices.
#'
#' @return An S3 object of class \code{gmf} which is a list with the
#' following components:
#' \item{mu}{the main effects for dimensionality reduction}
#' \item{A}{the \code{n}x\code{k}-dimentional matrix with the scores}
#' \item{B}{the \code{d}x\code{k}-dimentional matrix with the loadings}
#' \item{family}{the exponential family of the data}
#' \item{iters}{number of iterations required for convergence}
#' \item{loss_trace}{the trace of the average deviance of the algorithm.
#' Should be non-increasing}
#' \item{prop_deviance_expl}{the proportion of deviance explained by this model.
#' If \code{main_effects = TRUE}, the null model is just the main effects, otherwise
#' the null model estimates 0 for all natural parameters.}
#' @export
#' @importFrom RSpectra svds
#'
#' @references
#' de Leeuw, Jan, 2006. Principal component analysis of binary data
#' by iterated singular value decomposition. Computational Statistics & Data Analysis
#' 50 (1), 21--39.
#'
#' Collins, M., Dasgupta, S., & Schapire, R. E., 2001. A generalization of principal
#' components analysis to the exponential family. In NIPS, 617--624.
#'
#' @examples
#' rows = 100
#' cols = 10
#' set.seed(1)
#' mat_np = outer(rnorm(rows), rnorm(cols))
#'
#' # generate a count matrix and binary response
#' mat = matrix(rpois(rows * cols, c(exp(mat_np))), rows, cols)
#'
#' mod = generalizedMF(mat, k = 1, family = "poisson", quiet = FALSE)
#'
generalizedMF <- function(x, k = 2, family = c("gaussian", "binomial", "poisson"),
weights, quiet = TRUE, max_iters = 1000, conv_criteria = 1e-5,
partial_decomp = FALSE,
random_start = FALSE, start_A, start_B, mu, main_effects = TRUE,
method = c("als", "svd")) {
family = match.arg(family)
check_family(x, family)
method = match.arg(method)
if (method == "als") {
# used in ALS to make sure the estimates do not diverge
# a penalization like this was recommended in Rish et al. (2008), ICML
L2_eps = 1e-5
}
x = as.matrix(x)
missing_mat = is.na(x)
n = nrow(x)
d = ncol(x)
ones = rep(1, n)
if (missing(weights)) {
weights = 1.0
sum_weights = sum(!is.na(x))
} else {
weights[is.na(x)] <- 0
if (any(is.na(weights))) {
stop("Can't have NA in weights")
}
if (any(weights < 0)) {
stop("weights must be non-negative")
}
if (!all(dim(weights) == dim(x))) {
stop("x and weights are not the same dimension")
}
sum_weights = sum(weights)
}
# calculate the null log likelihood for % deviance explained and normalization
if (main_effects) {
if (length(weights) == 1) {
weighted_col_means = colMeans(x, na.rm = TRUE)
} else {
weighted_col_means = colSums(x * weights, na.rm = TRUE) / colSums(weights)
}
null_theta = as.numeric(saturated_natural_parameters(matrix(weighted_col_means, 1), family, M = Inf))
} else {
null_theta = rep(0, d)
}
null_deviance = exp_fam_deviance(x, outer(ones, null_theta), family, weights) / sum_weights
# Initialize #
##################
if (main_effects) {
if (missing(mu)) {
mu = saturated_natural_parameters(weighted_col_means, family, Inf)
} else {
mu = as.numeric(mu)
stopifnot(length(mu) == d)
}
} else {
mu = rep(0, d)
}
if (!missing(start_B)) {
stopifnot(dim(start_B) == c(d, k))
B = as.matrix(start_B)
} else if (random_start) {
B = matrix(rnorm(d * k), d, k)
} else {
if (partial_decomp) {
udv = RSpectra::svds(scale(saturated_natural_parameters(x, family, 4), TRUE, FALSE), min(k + 1, d))
} else {
udv = svd(scale(saturated_natural_parameters(x, family, 4), TRUE, FALSE))
}
B = udv$v[, 1:k, drop = FALSE]
}
if (!missing(start_A)) {
stopifnot(dim(start_A) == c(n, k))
A = as.matrix(start_A)
} else if (random_start) {
A = matrix(rnorm(n * k), n, k)
} else {
if (!missing(start_B)) {
if (partial_decomp) {
udv = RSpectra::svds(scale(saturated_natural_parameters(x, family, 4), TRUE, FALSE), min(k + 1, d))
} else {
udv = svd(scale(saturated_natural_parameters(x, family, 4), TRUE, FALSE))
}
}
A = udv$u[, 1:k, drop = FALSE] %*% diag(udv$d, k, k)
}
loss_trace = numeric(max_iters)
theta = outer(ones, mu) + tcrossprod(A, B)
# not really the first iteration
loss_trace[1] = exp_fam_deviance(x, theta, family, weights) / sum_weights
ptm <- proc.time()
if (!quiet) {
cat(0, " ", loss_trace[1], "")
cat("0 hours elapsed\n")
}
for (ii in seq_len(max_iters)) {
if (method == "als") {
# update A
theta = outer(ones, mu) + tcrossprod(A, B)
first_dir = exp_fam_mean(theta, family)
second_dir = exp_fam_variance(theta, family, weights)
W = apply(second_dir, 2, max)
Z = as.matrix(theta + weights * (x - first_dir) / outer(ones, W))
Z[is.na(x)] <- theta[is.na(x)]
A = t(solve(t(B) %*% diag(W) %*% B + diag(L2_eps, k, k),
t(B) %*% diag(W) %*% t(scale(Z, center = mu, scale = FALSE))))
# update B
theta = outer(ones, mu) + tcrossprod(A, B)
first_dir = exp_fam_mean(theta, family)
second_dir = exp_fam_variance(theta, family, weights)
W = apply(second_dir, 1, max)
Z = as.matrix(theta + weights * (x - first_dir) / outer(W, rep(1, d)))
Z[is.na(x)] <- theta[is.na(x)]
B = t(solve(t(A) %*% diag(W, n, n) %*% A + diag(L2_eps, k, k),
t(A) %*% diag(W, n, n) %*% scale(Z, center = mu, scale = FALSE)))
} else if (method == "svd") {
theta = outer(ones, mu) + tcrossprod(A, B)
first_dir = exp_fam_mean(theta, family)
second_dir = exp_fam_variance(theta, family, weights)
W = max(second_dir)
Z = as.matrix(theta + weights * (x - first_dir) / W)
Z[is.na(x)] <- theta[is.na(x)]
if (partial_decomp) {
udv = RSpectra::svds(scale(Z, center = mu, scale = FALSE), min(k + 1, d))
} else {
udv = svd(scale(Z, center = mu, scale = FALSE))
}
A = udv$u[, 1:k, drop = FALSE] %*% diag(udv$d, k, k)
B = udv$v[, 1:k, drop = FALSE]
}
# Calc Deviance
theta = outer(ones, mu) + tcrossprod(A, B)
loss_trace[ii] <- exp_fam_deviance(x, theta, family, weights) / sum_weights
if (!quiet) {
time_elapsed = as.numeric(proc.time() - ptm)[3]
tot_time = max_iters / ii * time_elapsed
time_remain = tot_time - time_elapsed
cat(ii, " ", loss_trace[ii], "")
cat(round(time_elapsed / 3600, 1), "hours elapsed. Max", round(time_remain / 3600, 1), "hours remain.\n")
}
if (ii > 1 && abs(loss_trace[ii] - loss_trace[ii - 1]) < 2 * conv_criteria) {
break
} else {
B_lag = B
}
}
object <- list(
mu = mu,
A = A,
B = B,
family = family,
iters = ii,
loss_trace = loss_trace[1:ii],
prop_deviance_expl = 1 - loss_trace[ii] / null_deviance
)
class(object) <- "gmf"
object
}
#' @title Predict generalized PCA scores or reconstruction on new data
#'
#' @description Predict generalized PCA scores or reconstruction on new data
#'
#' @param object generalized MF object
#' @param newdata matrix of the same exponential family as covariates in \code{object}.
#' If missing, will use the data that \code{object} was fit on
#' @param type the type of fitting required.
#' \code{type = "PCs"} gives matrix of principal components of \code{x},
#' \code{type = "link"} gives a matrix on the natural parameter scale, and
#' \code{type = "response"} gives a matrix on the response scale
#' @param quiet logical; whether the calculation should show progress
#' @param max_iters maximum number of iterations
#' @param conv_criteria convergence criteria
#' @param start_A initial value for \code{A}
#' @param ... Additional arguments
#' @examples
#' # construct a low rank matrices in the natural parameter space
#' rows = 100
#' cols = 10
#' set.seed(1)
#' loadings = rnorm(cols)
#' mat_np = outer(rnorm(rows), rnorm(cols))
#' mat_np_new = outer(rnorm(rows), loadings)
#'
#' # generate a count matrices
#' mat = matrix(rpois(rows * cols, c(exp(mat_np))), rows, cols)
#' mat_new = matrix(rpois(rows * cols, c(exp(mat_np_new))), rows, cols)
#'
#' # run Poisson PCA on it
#' gmf = generalizedMF(mat, k = 1, family = "poisson")
#'
#' A = predict(gmf, mat_new)
#'
#' @export
predict.gmf <- function(object, newdata, type = c("PCs", "link", "response"), quiet = TRUE,
max_iters = 1000, conv_criteria = 1e-5, start_A,...) {
type = match.arg(type)
if (missing(newdata)) {
A = object$A
} else {
n = nrow(newdata)
d = ncol(newdata)
k = ncol(object$B)
stopifnot(d == nrow(object$B))
check_family(newdata, object$family)
L2_eps = 1e-5
ones = rep(1, n)
# solve for A
if (missing(start_A)) {
theta = outer(ones, object$mu)
} else {
stopifnot(dim(start_A) == c(n, k))
theta = outer(ones, object$mu) + tcrossprod(start_A, object$B)
}
last_deviance = exp_fam_deviance(newdata, theta, object$family) / sum(!is.na(newdata))
if (!quiet) {
cat(0, " ", last_deviance, "\n")
}
for (ii in seq_len(max_iters)) {
first_dir = exp_fam_mean(theta, object$family)
second_dir = exp_fam_variance(theta, object$family)
multiplier = 1
# while (TRUE) {
# W = apply(second_dir, 2, max)
W = rep(max(second_dir), d)
Z = as.matrix(theta + (newdata - first_dir) / outer(ones, W))
Z[is.na(newdata)] <- theta[is.na(newdata)]
A = t(solve(t(object$B) %*% diag(W, d, d) %*% object$B + diag(L2_eps, k, k),
t(object$B) %*% diag(W, d, d) %*% t(scale(Z, object$mu, FALSE))))
theta = outer(ones, object$mu) + tcrossprod(A, object$B)
this_deviance = exp_fam_deviance(newdata, theta, object$family) / sum(!is.na(newdata))
# if (this_deviance < last_deviance) {
# break
# } else {
# multiplier = multiplier * 2
# }
if (!quiet) {
cat(ii ," ", this_deviance, "\n")
}
if (abs(last_deviance - this_deviance) < 2 * conv_criteria)
break
last_deviance = this_deviance
}
}
if (type == "PCs") {
return(A)
} else {
theta = outer(rep(1, nrow(A)), object$mu) + tcrossprod(A, object$B)
if (type == "link") {
return(theta)
} else if (type == "response") {
return(exp_fam_mean(theta, object$family))
}
}
}
#' @title Plot generalized MF
#'
#' @description
#' Plots the results of a generalized MF
#'
#' @param x generalized MF object
#' @param type the type of plot \code{type = "trace"} plots the algorithms progress by
#' iteration, \code{type = "loadings"} plots the first 2 principal component
#' loadings, \code{type = "scores"} plots the loadings first 2 principal component scores
#' @param ... Additional arguments
#' @examples
#' # construct a low rank matrix in the logit scale
#' rows = 100
#' cols = 10
#' set.seed(1)
#' mat_logit = outer(rnorm(rows), rnorm(cols))
#'
#' # generate a binary matrix
#' mat = (matrix(runif(rows * cols), rows, cols) <= inv.logit.mat(mat_logit)) * 1.0
#'
#' # run logistic SVD on it
#' gmf = generalizedMF(mat, k = 2, family = "binomial")
#'
#' \dontrun{
#' plot(gmf)
#' }
#' @export
plot.gmf <- function(x, type = c("trace", "loadings", "scores"), ...) {
type = match.arg(type)
if (type == "trace") {
df = data.frame(Iteration = 1:x$iters,
Deviance = x$loss_trace)
p <- ggplot2::ggplot(df, ggplot2::aes_string("Iteration", "Deviance")) +
ggplot2::geom_line()
} else if (type == "loadings") {
df = data.frame(x$B)
colnames(df) <- paste0("PC", 1:ncol(df))
if (ncol(df) == 1) {
df$PC2 = 0
p <- ggplot2::ggplot(df, ggplot2::aes_string("PC1", "PC2")) + ggplot2::geom_point() +
ggplot2::labs(y = NULL)
} else {
p <- ggplot2::ggplot(df, ggplot2::aes_string("PC1", "PC2")) + ggplot2::geom_point()
}
} else if (type == "scores") {
df = data.frame(x$A)
colnames(df) <- paste0("PC", 1:ncol(df))
if (ncol(df) == 1) {
df$PC2 = 0
p <- ggplot2::ggplot(df, ggplot2::aes_string("PC1", "PC2")) + ggplot2::geom_point() +
ggplot2::labs(y = NULL)
} else {
p <- ggplot2::ggplot(df, ggplot2::aes_string("PC1", "PC2")) + ggplot2::geom_point()
}
}
return(p)
}
#' @export
print.gmf <- function(x, ...) {
cat(nrow(x$A), "rows and ")
cat(nrow(x$B), "columns\n")
cat("Rank", ncol(x$B), "solution\n")
cat("\n")
cat(round(x$prop_deviance_expl * 100, 1), "% of deviance explained\n", sep = "")
cat(x$iters, "iterations to converge\n")
invisible(x)
}
#' @title CV for generalized MF
#'
#' @description
#' Run cross validation on dimension for generalized MF
#'
#' @param x matrix of either binary, count, or continuous data
#' @param ks the different dimensions \code{k} to try
#' @param family exponential family distribution of data
#' @param folds if \code{folds} is a scalar, then it is the number of folds. If
#' it is a vector, it should be the same length as the number of rows in \code{x}
#' @param quiet logical; whether the function should display progress
#' @param ... Additional arguments passed to generalizedMF
#'
#' @return A matrix of the CV deviance with \code{k} in rows
#'
#' @examples
#' # construct a low rank matrix in the logit scale
#' rows = 100
#' cols = 10
#' set.seed(1)
#' mat_logit = outer(rnorm(rows), rnorm(cols))
#'
#' # generate a binary matrix
#' mat = (matrix(runif(rows * cols), rows, cols) <= inv.logit.mat(mat_logit)) * 1.0
#'
#' \dontrun{
#' deviances = cv.gmf(mat, ks = 1:9, family = "binomial")
#' plot(deviances)
#' }
#' @export
cv.gmf <- function(x, ks, family = c("gaussian", "binomial", "poisson", "multinomial"),
folds = 5, quiet = TRUE, ...) {
family = match.arg(family)
check_family(x, family)
if (length(folds) > 1) {
# does this work if factor?
if (length(unique(folds)) <= 1) {
stop("If inputing CV split, must be more than one level")
}
if (length(folds) != nrow(x)) {
stop("if folds is a vector, it should be of same length as nrow(x)")
}
cv = folds
} else {
cv = sample(1:folds, nrow(q), replace = TRUE)
}
log_likes = matrix(0, length(ks), 1,
dimnames = list(k = ks, M = "GMF"))
for (k in ks) {
if (!quiet) {
cat("k =", k, "\n")
}
for (c in unique(cv)) {
gmf = generalizedMF(x[c != cv, ], k = k, family = family, ...)
pred_theta = predict(gmf, newdat = x[c == cv, ], type = "link")
log_likes[k == ks] = log_likes[k == ks] +
exp_fam_deviance(x[c == cv, ], theta = pred_theta, family = family)
}
}
class(log_likes) <- c("matrix", "cv.gpca")
which_max = which.max(log_likes)
if (!quiet) {
cat("Best: k =", ks[which_max], "\n")
}
return(-log_likes)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.