options(rmarkdown.html_vignette.check_title = FALSE) knitr::opts_chunk$set( collapse = TRUE, comment = "#>" )
Load the \code{sgdGMF} package in the workspace.
library(sgdGMF)
Load other useful packages in the workspace.
library(ggplot2) library(ggpubr) library(reshape2)
Load the ant traits data in the workspace and define the response matrix Y
and covariate matrices X
and Z
.
# install.packages("mvabund") # data(antTraits, package = "mvabund") load(url("https://raw.githubusercontent.com/cran/mvabund/master/data/antTraits.RData")) Y = as.matrix(antTraits$abund) X = as.matrix(antTraits$env[,-3]) Z = matrix(1, nrow = ncol(Y), ncol = 1) n = nrow(Y) m = ncol(Y)
Set the model family to Poisson since the response matrix contain count data.
family = poisson()
Select the optimal number of latent factors using the function \code{sgdgmf.rank}, which employs an adjusted eigenvalue thresholding method to identify the optimal elbow point of a screeplot.
ncomp = sgdgmf.rank(Y = Y, X = X, Z = Z, family = family)$ncomp cat("Selected rank: ", ncomp)
\code{sgdGMF} implements four estimation algorithms which are outlined below. Notice that we here describe the algorithms by assuming, without loss of generality, that $B = 0$ and $A = 0$ (see the \code{help} documentation of the \code{sgdgmf.fit} function for the notation). In the real implementation, where non-zero $B$ and $A$ are considered, we jointly optimize $[A, U]$ and $[B, V]$ following the same algorithmic strategy, but accounting for regression effects.
# knitr::include_graphics("../man/alg-AIRWLS.png")
# knitr::include_graphics("../man/alg-Newton.png")
# knitr::include_graphics("../man/alg-BSGD.png")
# knitr::include_graphics("../man/alg-CSGD.png")
Estimate a Poisson GMF model using iterated least squares, diagonal quasi-Newton and stochastic gradient descent algorithms.
gmf.airwls = sgdgmf.fit(Y, X, Z, ncomp = ncomp, family = family, method = "airwls") gmf.newton = sgdgmf.fit(Y, X, Z, ncomp = ncomp, family = family, method = "newton") gmf.sgd = sgdgmf.fit(Y, X, Z, ncomp = ncomp, family = family, method = "sgd", sampling = "block")
Compute the correlation between the observed data and the predicted values.
cat(" AIRWLS: ", 100 * round(cor(c(Y), c(gmf.airwls$mu)), 4), "\n", " Newton: ", 100 * round(cor(c(Y), c(gmf.newton$mu)), 4), "\n", " SGD: ", 100 * round(cor(c(Y), c(gmf.sgd$mu)), 4), sep = "")
Compute the partial deviance residuals holding out from the model the estimated matrix factorization. Additionally, compute the spectrum of such a residual matrix.
res.airwls = residuals(gmf.airwls, partial = FALSE, spectrum = TRUE, ncomp = 20) res.newton = residuals(gmf.newton, partial = FALSE, spectrum = TRUE, ncomp = 20) res.sgd = residuals(gmf.sgd, partial = FALSE, spectrum = TRUE, ncomp = 20)
Analyze the residual distribution.
get.factor = function (levels, nrep) factor(rep(levels, each = nrep), levels = levels) df = data.frame( residuals = c(res.airwls$residuals, res.newton$residuals, res.sgd$residuals), fitted = c(gmf.airwls$mu, gmf.newton$mu, gmf.sgd$mu), model = get.factor(c("AIRWLS", "Newton", "SGD"), n) ) plt.res.vs.fit = ggplot(data = df, map = aes(x = fitted, y = residuals)) + geom_point(alpha = 0.5) + facet_grid(cols = vars(model)) + geom_hline(yintercept = 0, col = 2, lty = 2) plt.res.hist = ggplot(data = df, map = aes(x = residuals, y = after_stat(density))) + geom_histogram(bins = 30) + facet_grid(cols = vars(model)) + geom_vline(xintercept = 0, col = 2, lty = 2) ggpubr::ggarrange(plt.res.vs.fit, plt.res.hist, nrow = 2, align = "v")
Plot the variance explained by each principal component of the residual matrix.
data.frame( components = rep(1:20, 3), variance = c(res.airwls$lambdas, res.newton$lambdas, res.sgd$lambdas), model = get.factor(c("AIRWLS", "Newton", "SGD"), 20)) |> ggplot(map = aes(x = components, y = variance)) + geom_col(color = "white") + facet_grid(cols = vars(model))
Plot the abundance predicted by each method comparing it with the observed matrix.
colnames(gmf.airwls$mu) = colnames(Y) colnames(gmf.newton$mu) = colnames(Y) colnames(gmf.sgd$mu) = colnames(Y) df = rbind( reshape2::melt(Y, varnames = c("environments", "species"), value.name = "abundance"), reshape2::melt(gmf.airwls$mu, varnames = c("environments", "species"), value.name = "abundance"), reshape2::melt(gmf.newton$mu, varnames = c("environments", "species"), value.name = "abundance"), reshape2::melt(gmf.sgd$mu, varnames = c("environments", "species"), value.name = "abundance")) df$model = get.factor(c("Data", "AIRWLS", "Newton", "SGD"), n*m) ggplot(data = df, map = aes(x = species, y = environments, fill = abundance)) + geom_raster() + facet_wrap(vars(model), nrow = 2, ncol = 2) + scale_fill_viridis_c() + theme(axis.text = element_blank(), axis.ticks = element_blank(), panel.grid = element_blank()) + labs(x = "Species", y = "Environmens", fill = "Abundance", title = "Ant abundance data")
Plot the latent scores and loadings in a biplot-like figure.
pca.airwls = RSpectra::svds(tcrossprod(gmf.airwls$U, gmf.airwls$V), 2) pca.newton = RSpectra::svds(tcrossprod(gmf.newton$U, gmf.newton$V), 2) pca.sgd = RSpectra::svds(tcrossprod(gmf.sgd$U, gmf.sgd$V), 2) plt.envs = data.frame( idx = rep(1:n, times = 3), pc1 = c(scale(pca.airwls$u[,1]), scale(pca.newton$u[,1]), scale(pca.sgd$u[,1])), pc2 = c(scale(pca.airwls$u[,2]), scale(pca.newton$u[,2]), scale(pca.sgd$u[,2])), model = get.factor(c("AIRWLS", "Newton", "SGD"), n)) |> ggplot(map = aes(x = pc1, y = pc2, color = idx, label = idx)) + geom_hline(yintercept = 0, lty = 2, color = "grey40") + geom_vline(xintercept = 0, lty = 2, color = "grey40") + geom_point() + facet_grid(cols = vars(model)) + scale_color_viridis_c(option = "viridis") + geom_text(color = 1, size = 2.5, nudge_x = -0.2, nudge_y = +0.2) + labs(x = "PC 1", y = "PC 2", color = "Environments") plt.species = data.frame( idx = rep(1:m, times = 3), pc1 = c(scale(pca.airwls$v[,1]), scale(pca.newton$v[,1]), scale(pca.sgd$v[,1])), pc2 = c(scale(pca.airwls$v[,2]), scale(pca.newton$v[,2]), scale(pca.sgd$v[,2])), model = get.factor(c("AIRWLS", "Newton", "SGD"), m)) |> ggplot(map = aes(x = pc1, y = pc2, color = idx, label = idx)) + geom_hline(yintercept = 0, lty = 2, color = "grey40") + geom_vline(xintercept = 0, lty = 2, color = "grey40") + geom_point() + facet_grid(cols = vars(model)) + scale_color_viridis_c(option = "inferno") + geom_text(color = 1, size = 2.5, nudge_x = -0.2, nudge_y = +0.2) + labs(x = "PC 1", y = "PC 2", color = "Species") ggpubr::ggarrange(plt.envs, plt.species, nrow = 2, ncol = 1, align = "hv")
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.