#' An R6 Class to represent a PLNfit in a PCA framework
#'
#' @description The function [PLNPCA()] produces a collection of models which are instances of object with class [`PLNPCAfit`].
#' This class comes with a set of methods, some of them being useful for the user:
#' See the documentation for the methods inherited by [`PLNfit`] and the [plot()] methods for PCA visualization
#'
## Parameters common to all PLN-xx-fit methods (shared with PLNfit but inheritance does not work)
#' @param responses the matrix of responses (called Y in the model). Will usually be extracted from the corresponding field in [`PLNfamily`]
#' @param covariates design matrix (called X in the model). Will usually be extracted from the corresponding field in [`PLNfamily`]
#' @param offsets offset matrix (called O in the model). Will usually be extracted from the corresponding field in [`PLNfamily`]
#' @param weights an optional vector of observation weights to be used in the fitting process.
#' @param formula model formula used for fitting, extracted from the formula in the upper-level call
#' @param control a list for controlling the optimization. See details.
#' @param config part of the \code{control} argument which configures the optimizer
#' @param nullModel null model used for approximate R2 computations. Defaults to a GLM model with same design matrix but not latent variable.
#' @param B matrix of regression matrix
#' @param Sigma variance-covariance matrix of the latent variables
#' @param Omega precision matrix of the latent variables. Inverse of Sigma.
#'
## Parameters specific to PLNPCA-fit methods
#' @param rank rank of the PCA (or equivalently, dimension of the latent space)
#'
## Parameters common to many graphical methods
#' @param map the type of output for the PCA visualization: either "individual", "variable" or "both". Default is "both".
#' @param nb_axes scalar: the number of axes to be considered when map = "both". The default is min(3,rank).
#' @param axes numeric, the axes to use for the plot when map = "individual" or "variable". Default it c(1,min(rank))
#' @param ind_cols a character, factor or numeric to define the color associated with the individuals. By default, all variables receive the default color of the current palette.
#' @param var_cols a character, factor or numeric to define the color associated with the variables. By default, all variables receive the default color of the current palette.
#' @param plot logical. Should the plot be displayed or sent back as ggplot object
#' @param main character. A title for the single plot (individual or variable factor map). If NULL (the default), an hopefully appropriate title will be used.
#'
#' @include PLNfit-class.R
#' @importFrom R6 R6Class
#' @examples
#' data(trichoptera)
#' trichoptera <- prepare_data(trichoptera$Abundance, trichoptera$Covariate)
#' myPCAs <- PLNPCA(Abundance ~ 1 + offset(log(Offset)), data = trichoptera, ranks = 1:5)
#' myPCA <- getBestModel(myPCAs)
#' class(myPCA)
#' print(myPCA)
#' @seealso The function \code{\link{PLNPCA}}, the class \code{\link[=PLNPCAfamily]{PLNPCAfamily}}
PLNPCAfit <- R6Class(
classname = "PLNPCAfit",
inherit = PLNfit,
## %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
## PRIVATE MEMBERS ----
## %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
private = list(
C = NULL,
svdCM = NULL
),
## %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
## PUBLIC MEMBERS ----
## %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
public = list(
## %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
## Creation functions ----------------
#' @description Initialize a [`PLNPCAfit`] object
initialize = function(rank, responses, covariates, offsets, weights, formula, control) {
super$initialize(responses, covariates, offsets, weights, formula, control)
private$optimizer$main <- nlopt_optimize_rank
private$optimizer$vestep <- nlopt_optimize_vestep_rank
if (!is.null(control$svdM)) {
svdM <- control$svdM
} else {
svdM <- svd(private$M, nu = rank, nv = self$p)
}
### TODO: check that it is really better than initializing with zeros...
private$M <- svdM$u[, 1:rank, drop = FALSE] %*% diag(svdM$d[1:rank], nrow = rank, ncol = rank) %*% t(svdM$v[1:rank, 1:rank, drop = FALSE])
private$S <- matrix(0.1, self$n, rank)
private$C <- svdM$v[, 1:rank, drop = FALSE] %*% diag(svdM$d[1:rank], nrow = rank, ncol = rank)/sqrt(self$n)
},
#' @description Update a [`PLNPCAfit`] object
#' @param M matrix of mean vectors for the variational approximation
#' @param C matrix of PCA loadings (in the latent space)
#' @param S matrix of variance vectors for the variational approximation
#' @param Ji vector of variational lower bounds of the log-likelihoods (one value per sample)
#' @param R2 approximate R^2 goodness-of-fit criterion
#' @param Z matrix of latent vectors (includes covariates and offset effects)
#' @param A matrix of fitted values
#' @param monitoring a list with optimization monitoring quantities
#' @return Update the current [`PLNPCAfit`] object
update = function(B=NA, Sigma=NA, Omega=NA, C=NA, M=NA, S=NA, Z=NA, A=NA, Ji=NA, R2=NA, monitoring=NA) {
super$update(B = B, Sigma = Sigma, Omega = Omega, M = M, S = S, Z = Z, A = A, Ji = Ji, R2 = R2, monitoring = monitoring)
if (!anyNA(C)) private$C <- C
},
## %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
## Optimization ----------------------
#' @description Call to the C++ optimizer and update of the relevant fields
optimize = function(responses, covariates, offsets, weights, config) {
args <- list(data = list(Y = responses, X = covariates, O = offsets, w = weights),
params = list(B = private$B, C = private$C, M = private$M, S = private$S),
config = config)
optim_out <- do.call(private$optimizer$main, args)
do.call(self$update, optim_out)
},
#' @description Result of one call to the VE step of the optimization procedure: optimal variational parameters (M, S) and corresponding log likelihood values for fixed model parameters (C, B). Intended to position new data in the latent space for further use with PCA.
#' @return A list with three components:
#' * the matrix `M` of variational means,
#' * the matrix `S2` of variational variances
#' * the vector `log.lik` of (variational) log-likelihood of each new observation
optimize_vestep = function(covariates, offsets, responses, weights = rep(1, self$n),
control = PLNPCA_param(backend = "nlopt")) {
# problem dimension
n <- nrow(responses); p <- ncol(responses); q <- self$rank
# turn offset vector to offset matrix
offsets <- as.matrix(offsets)
if (ncol(offsets) == 1) offsets <- matrix(offsets, nrow = n, ncol = p)
## Not completely naive starting values for M: SVD on the residuals of
## a linear regression on the log-counts (+1 to deal with 0s)
log_responses <- log(responses+1)
residuals <- lm.wfit(x = covariates, y = log_responses, w = weights, offset = offsets)$residuals
svd_residuals <- svd(residuals, nu = q, nv = p)
M_init <- svd_residuals$u[, 1:q, drop = FALSE] %*% diag(svd_residuals$d[1:q], nrow = q, ncol = q) %*% t(svd_residuals$v[1:q, 1:q, drop = FALSE])
## Initialize the variational parameters with the appropriate new dimension of the data
args <- list(data = list(Y = responses, X = covariates, O = offsets, w = weights),
## Initialize the variational parameters with the new dimension of the data
params = list(M = M_init, S = matrix(.1, n, q)),
B = private$B,
C = private$C,
config = control$config_optim)
optim_out <- do.call(private$optimizer$vestep, args)
optim_out
},
#' @description Project new samples into the PCA space using one VE step
#' @param newdata A data frame in which to look for variables, offsets and counts with which to predict.
#' @param control a list for controlling the optimization. See [PLN()] for details.
#' @param envir Environment in which the projection is evaluated
#' @return
#' * the named matrix of scores for the newdata, expressed in the same coordinate system as `self$scores`
project = function(newdata, control = PLNPCA_param(), envir = parent.frame()) {
## Extract the model matrices from the new data set with initial formula
args <- extract_model(call("PLNPCA", formula = private$formula, data = newdata), envir)
## Compute latent positions of the new samples
M <- self$optimize_vestep(covariates = args$X, offsets = args$O, responses = args$Y,
weights = args$w, control = control)$M
latent_pos <- t(tcrossprod(self$model_par$C, M)) %>% scale(center = TRUE, scale = FALSE)
## Compute scores in the PCA coordinate systems
scores <- latent_pos %*% self$rotation
dimnames(scores) <- list(rownames(newdata), paste0("PC", 1:ncol(scores)))
## Output
scores
},
## %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
## Post treatment --------------------
#' @description Compute PCA scores in the latent space and update corresponding fields.
#' @param scale.unit Logical. Should PCA scores be rescaled to have unit variance
setVisualization = function(scale.unit = FALSE) {
private$svdCM <- svd(scale(self$latent_pos, TRUE, scale.unit), nv = self$rank)
},
#' @description Update R2, fisher, std_err fields and set up visualization
#' @param config_optim a list for controlling the optimizer (either "nlopt" or "torch" backend). See details
#' @param config_post a list for controlling the post-treatments (optional bootstrap, jackknife, R2, etc.). See details
#' @details The list of parameters `config_post` controls the post-treatment processing, with the following entries:
#' * jackknife boolean indicating whether jackknife should be performed to evaluate bias and variance of the model parameters. Default is FALSE.
#' * bootstrap integer indicating the number of bootstrap resamples generated to evaluate the variance of the model parameters. Default is 0 (inactivated).
#' * variational_var boolean indicating whether variational Fisher information matrix should be computed to estimate the variance of the model parameters (highly underestimated). Default is FALSE.
#' * rsquared boolean indicating whether approximation of R2 based on deviance should be computed. Default is TRUE
#' * trace integer for verbosity. should be > 1 to see output in post-treatments
postTreatment = function(responses, covariates, offsets, weights, config_post, config_optim, nullModel) {
super$postTreatment(responses, covariates, offsets, weights, config_post, config_optim, nullModel)
colnames(private$C) <- colnames(private$M) <- 1:self$q
rownames(private$C) <- colnames(responses)
self$setVisualization()
},
## %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
## Graphical methods -----------------
#' @description Plot the factorial map of the PCA
# @inheritParams plot.PLNPCAfit
#' @param cols a character, factor or numeric to define the color associated with the individuals. By default, all individuals receive the default color of the current palette.
#' @return a [`ggplot`] graphic
plot_individual_map = function(axes=1:min(2,self$rank), main = "Individual Factor Map", plot = TRUE, cols = "default") {
.scores <- data.frame(self$scores[,axes, drop = FALSE])
colnames(.scores) <- paste("a",1:ncol(.scores),sep = "")
.scores$labels <- cols
.scores$names <- rownames(private$M)
axes_label <- paste(paste("axis",axes), paste0("(", round(100*self$percent_var,3)[axes], "%)"))
p <- get_ggplot_ind_map(.scores, axes_label, main)
if (plot) print(p)
invisible(p)
},
#' @description Plot the correlation circle of a specified axis for a [`PLNLDAfit`] object
# @inheritParams plot.PLNPCAfit
#' @param cols a character, factor or numeric to define the color associated with the variables. By default, all variables receive the default color of the current palette.
#' @return a [`ggplot`] graphic
plot_correlation_circle = function(axes=1:min(2,self$rank), main="Variable Factor Map", cols = "default", plot=TRUE) {
## data frame with correlations between variables and PCs
correlations <- as.data.frame(self$corr_circle[, axes, drop = FALSE])
colnames(correlations) <- paste0("axe", 1:length(axes))
correlations$labels <- cols
correlations$names <- rownames(correlations)
axes_label <- paste(paste("axis",axes), paste0("(", round(100*self$percent_var,3)[axes], "%)"))
p <- get_ggplot_corr_circle(correlations, axes_label, main)
if (plot) print(p)
invisible(p)
},
#' @description Plot a summary of the [`PLNPCAfit`] object
# @inheritParams plot.PLNPCAfit
#' @importFrom gridExtra grid.arrange arrangeGrob
#' @importFrom grid nullGrob textGrob
#' @return a [`grob`] object
plot_PCA = function(nb_axes = min(3, self$rank), ind_cols = "ind_cols", var_cols = "var_cols", plot = TRUE) {
axes <- 1:nb_axes
if (nb_axes > 1) {
pairs.axes <- combn(axes, 2, simplify = FALSE)
## get back all individual maps
ind.plot <- lapply(pairs.axes, function(pair) {
ggobj <- self$plot_individual_map(axes = pair, plot = FALSE, main = "", cols = ind_cols) + theme(legend.position = "none")
return(ggplotGrob(ggobj))
})
## get back all correlation circle
cor.plot <- lapply(pairs.axes, function(pair) {
ggobj <- self$plot_correlation_circle(axes = pair, plot = FALSE, main = "", cols = var_cols)
return(ggplotGrob(ggobj))
})
## plot that appear on the diagonal
crit <- setNames(c(self$loglik, self$BIC, self$ICL), c("loglik", "BIC", "ICL"))
criteria.text <- paste("Model Selection\n\n", paste(names(crit), round(crit, 2), sep=" = ", collapse="\n"))
percentV.text <- paste("Axes contribution\n\n", paste(paste("axis",axes), paste0(": ", round(100*self$percent_var[axes],3), "%"), collapse="\n"))
diag.grobs <- list(textGrob(percentV.text),
g_legend(self$plot_individual_map(plot=FALSE, cols=ind_cols) + guides(colour = guide_legend(nrow = 4, title="classification"))),
textGrob(criteria.text))
if (nb_axes > 3)
diag.grobs <- c(diag.grobs, rep(list(nullGrob()), nb_axes - 3))
grobs <- vector("list", nb_axes^2)
i.cor <- 1; i.ind <- 1; i.dia <- 1
ind <- 0
for (i in 1:nb_axes) {
for (j in 1:nb_axes) {
ind <- ind + 1
if (j > i) { ## upper triangular -> cor plot
grobs[[ind]] <- cor.plot[[i.ind]]
i.ind <- i.ind + 1
} else if (i == j) { ## diagonal
grobs[[ind]] <- diag.grobs[[i.dia]]
i.dia <- i.dia + 1
} else {
grobs[[ind]] <- ind.plot[[i.cor]]
i.cor <- i.cor + 1
}
}
}
p <- arrangeGrob(grobs = grobs, ncol = nb_axes)
} else {
p <- arrangeGrob(grobs = list(
self$plot_individual_map(plot = FALSE),
self$plot_correlation_circle(plot = FALSE)
), ncol = 1)
}
if (plot)
grid.arrange(p)
invisible(p)
},
## %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
## Print methods ---------------------
#' @description User friendly print method
show = function() {
super$show(paste0("Poisson Lognormal with rank constrained for PCA (rank = ",self$rank,")\n"))
cat("* Additional fields for PCA\n")
cat(" $percent_var, $corr_circle, $scores, $rotation, $eig, $var, $ind\n")
cat("* Additional S3 methods for PCA\n")
cat(" plot.PLNPCAfit()\n")
}
## %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
## Other methods ---------------------
),
## %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
## ACTIVE BINDINGS ----
## %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
active = list(
#' @field rank the dimension of the current model
rank = function() {self$q},
#' @field vcov_model character: the model used for the residual covariance
vcov_model = function() {"rank"},
#' @field nb_param number of parameters in the current PLN model
nb_param = function() {self$p * (self$d + self$q) - self$q * (self$q - 1)/2},
#' @field entropy entropy of the variational distribution
entropy = function() {.5 * (self$n * self$q * log(2*pi*exp(1)) + sum(log(self$var_par$S2)))},
#' @field latent_pos a matrix: values of the latent position vector (Z) without covariates effects or offset
latent_pos = function() {tcrossprod(private$M, private$C)},
#' @field model_par a list with the matrices associated with the estimated parameters of the pPCA model: B (covariates), Sigma (covariance), Omega (precision) and C (loadings)
model_par = function() {
par <- super$model_par
par$C <- private$C
par
},
#' @field percent_var the percent of variance explained by each axis
percent_var = function() {
eigen.val <- private$svdCM$d[1:self$rank]^2
setNames(round(eigen.val/sum(eigen.val)*self$R_squared,4), paste0("PC", 1:self$rank))
},
#' @field corr_circle a matrix of correlations to plot the correlation circles
corr_circle = function() {
corr <- private$svdCM$v[, 1:self$rank] * matrix(private$svdCM$d[1:self$rank], byrow = TRUE, nrow = self$p, ncol = self$rank)
corr <- corr/sqrt(rowSums(corr^2))
rownames(corr) <- rownames(private$Sigma)
colnames(corr) <- paste0("PC", 1:self$rank)
corr
},
#' @field scores a matrix of scores to plot the individual factor maps (a.k.a. principal components)
scores = function() {
scores <- private$svdCM$u[, 1:self$rank] * matrix(private$svdCM$d[1:self$rank], byrow = TRUE, nrow = self$n, ncol = self$rank)
rownames(scores) <- rownames(private$M)
colnames(scores) <- paste0("PC", 1:self$rank)
scores
},
#' @field rotation a matrix of rotation of the latent space
rotation = function() {
rotation <- private$svdCM$v[, 1:self$rank, drop = FALSE]
rownames(rotation) <- rownames(private$Sigma)
colnames(rotation) <- paste0("PC", 1:self$rank)
rotation
},
#' @field eig description of the eigenvalues, similar to percent_var but for use with external methods
eig = function() {
eigen.val <- private$svdCM$d[1:self$rank]^2
matrix(
c(eigen.val, # eigenvalues
100 * self$R_squared * eigen.val / sum(eigen.val), # percentage of variance
100 * self$R_squared * cumsum(eigen.val) / sum(eigen.val) # cumulative percentage of variance
),
ncol = 3,
dimnames = list(paste("comp", 1:self$rank), c("eigenvalue", "percentage of variance", "cumulative percentage of variance"))
)
},
#' @field var a list of data frames with PCA results for the variables: `coord` (coordinates of the variables), `cor` (correlation between variables and dimensions), `cos2` (Cosine of the variables) and `contrib` (contributions of the variable to the axes)
var = function() {
coord <- private$svdCM$v[, 1:self$rank] * matrix(private$svdCM$d[1:self$rank], ncol = self$rank, nrow = self$p, byrow = TRUE)
## coord[j, k] = d[k] * v[j, k]
var_sd <- sqrt(rowSums(coord^2))
coord <- coord / var_sd
cor <- coord
cos2 <- cor^2
contrib <- 100 * private$svdCM$v[, 1:self$rank, drop = FALSE]^2
dimnames(coord) <- dimnames(cor) <- dimnames(cos2) <- dimnames(contrib) <- list(rownames(private$Sigma), paste0("Dim.", 1:self$rank))
list(coord = coord,
cor = cor,
cos2 = cos2,
contrib = contrib)
},
#' @field ind a list of data frames with PCA results for the individuals: `coord` (coordinates of the individuals), `cos2` (Cosine of the individuals), `contrib` (contributions of individuals to an axis inertia) and `dist` (distance of individuals to the origin).
ind = function() {
coord <- private$svdCM$u[, 1:self$rank] * matrix(private$svdCM$d[1:self$rank], ncol = self$rank, nrow = self$n, byrow = TRUE)
## coord[i, k] = d[k] * v[i, k]
dist_origin <- sqrt(rowSums(coord^2))
cos2 <- coord^2 / dist_origin^2
contrib <- 100 * private$svdCM$u[, 1:self$rank, drop = FALSE]^2
dimnames(coord) <- dimnames(cos2) <- dimnames(contrib) <- list(rownames(private$M), paste0("Dim.", 1:self$rank))
names(dist_origin) <- rownames(private$M)
list(coord = coord,
cos2 = cos2,
contrib = contrib,
dist = dist_origin)
},
#' @field call Hacky binding for compatibility with factoextra functions
call = function() {
list(scale.unit = FALSE)
}
)
## %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
## END OF CLASS ----
## %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
)
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.