knitr::opts_chunk$set( collapse = TRUE, comment = "#>" )
This vignette introduces the usage of COAP for the analysis of high-dimensional count data with additional high-dimensional covariates, by comparison with other methods.
The package can be loaded with the command:
library(COAP) library(GFM)
First, we generate the data simulated data.
n <- 200; p <- 200; d= 50 rank0 <- 6; q = 5; datList <- gendata_simu(seed = 1, n=n, p=p, d= d, rank0 = rank0, q= q, rho=c(2, 2), sigma2_eps = 1) X_count <- datList$X; Z <- datList$Z H0 <- datList$H0; B0 <- datList$B0 bbeta0 <- cbind( datList$mu0, datList$bbeta0)
Fit the COAP model using the function RR_COAP()
in the R package COAP
. Users can use ?RR_COAP
to see the details about this function
hq <- 5; hr <- 6 system.time({ tic <- proc.time() reslist <- RR_COAP(X_count, Z= Z, q=hq, rank_use= hr, epsELBO = 1e-6) toc <- proc.time() time_coap <- toc[3] - tic[3] })
Check the increased property of the envidence lower bound function.
library(ggplot2) dat_iter <- data.frame(iter=1:length(reslist$ELBO_seq), ELBO=reslist$ELBO_seq) ggplot(data=dat_iter, aes(x=iter, y=ELBO)) + geom_line() + geom_point() + theme_bw(base_size = 20)
We calculate the metrics to measure the estimatioin accuracy, where the trace statistic is used to measure the estimation accuracy of loading matrix and prediction accuracy of factor matrix, which is evaluated by the function measurefun()
in the R package GFM
, and the root of mean square error is adopted to measure the estimation error of bbeta.
library(GFM) metricList <- list() metricList$COAP <- list() metricList$COAP$Tr_H <- measurefun(reslist$H, H0) metricList$COAP$Tr_B <- measurefun(reslist$B, B0) norm_vec <- function(x) sqrt(sum(x^2/ length(x))) metricList$COAP$err_bb <- norm_vec(reslist$bbeta-bbeta0) metricList$COAP$err_bb1 <- norm_vec(reslist$bbeta[,1]-bbeta0[,1]) metricList$COAP$Time <- time_coap
We compare COAP with various prominent methods in the literature. They are (1) High-dimensional LFM (Bai and Ng 2002) implemented in the R package GFM; (2) PoissonPCA (Kenney et al. 2021) implemented in the R package PoissonPCA; (3) Zero-inflated Poisson factor model (ZIPFA, Xu et al. 2021) implemented in the R package ZIPFA; (4) Generalized factor model (Liu et al. 2023) implemented in the R package GFM; (5) PLNPCA (Chiquet et al. 2018) implemented in the R package PLNmodels; (6) Generalized Linear Latent Variable Models (GLLVM, Hui et al. 2017) implemented in the R package gllvm. (7) Poisson regression model for each $x_{ij}, (j = 1,ยทยทยท ,p)$, implemented in stats R package; (8) Multi-response reduced-rank Poisson regression model (MMMR, Luo et al. 2018) implemented in rrpack R package.
(1). First, we implemented the linear factor model (LFM) and record the metrics that measure the estimation accuracy and computational cost.
metricList$LFM <- list() tic <- proc.time() fit_lfm <- Factorm(X_count, q=q) toc <- proc.time() time_lfm <- toc[3] - tic[3] hbb1 <- colMeans(X_count) metricList$LFM$Tr_H <- measurefun(fit_lfm$hH, H0) metricList$LFM$Tr_B <- measurefun(fit_lfm$hB, B0) metricList$LFM$err_bb1 <- norm_vec(hbb1- bbeta0[,1]) metricList$LFM$err_bb <- NA metricList$LFM$Time <- time_lfm
(2). Then, we implemented PoissonPCA and recorded the metrics.
metricList$PoissonPCA <- list() library(PoissonPCA) tic <- proc.time() fit_poispca <- Poisson_Corrected_PCA(X_count, k= hq) toc <- proc.time() time_ppca <- toc[3] - tic[3] hbb1 <- colMeans(X_count) metricList$PoissonPCA$Tr_H <- measurefun(fit_poispca$scores, H0) metricList$PoissonPCA$Tr_B <- measurefun(fit_poispca$loadings, B0) metricList$PoissonPCA$err_bb1 <- norm_vec(log(1+fit_poispca$center)- bbeta0[,1]) metricList$PoissonPCA$err_bb <- NA metricList$PoissonPCA$Time <- time_ppca
(3) Thirdly, we implemented the zero-inflated Poisson factor model:
## ZIPFA runs very slowly, so we do not run it here. library(ZIPFA) metricList$ZIPFA <- list() system.time( tic <- proc.time() fit_zipfa <- ZIPFA(X_count, k=hq, display = FALSE) toc <- proc.time() time_zipfa <- toc[3] - tic[3] ) idx_max_like <- which.max(fit_zipfa$Likelihood) hbb1 <- colMeans(X_count) metricList$ZIPFA$Tr_H <- measurefun(fit_zipfa$Ufit[[idx_max_like]], H0) metricList$ZIPFA$Tr_B <- measurefun(fit_zipfa$Vfit[[idx_max_like]], B0) metricList$PoissonPCA$Time <- time_zipfa
(4) Fourthly, we also applied the generalized factor model to estimate the loading matrix and factor matrix.
metricList$GFM <- list() tic <- proc.time() fit_gfm <- gfm(list(X_count), type='poisson', q= q, verbose = F) toc <- proc.time() time_gfm <- toc[3] - tic[3] metricList$GFM$Tr_H <- measurefun(fit_gfm$hH, H0) metricList$GFM$Tr_B <- measurefun(fit_gfm$hB, B0) metricList$GFM$err_bb1 <- norm_vec(fit_gfm$hmu- bbeta0[,1]) metricList$GFM$err_bb <- NA metricList$GFM$Time <- time_gfm
(5) Fifthly, we implemented PLNPCA:
PLNPCA_run <- function(X_count, covariates, q, Offset=rep(1, nrow(X_count))){ require(PLNmodels) if(!is.character(Offset)){ dat_plnpca <- prepare_data(X_count, covariates) dat_plnpca$Offset <- Offset }else{ dat_plnpca <- prepare_data(X_count, covariates, offset = Offset) } d <- ncol(covariates) # offset(log(Offset))+ formu <- paste0("Abundance ~ 1 + offset(log(Offset))+",paste(paste0("V",1:d), collapse = '+')) myPCA <- PLNPCA(as.formula(formu), data = dat_plnpca, ranks = q) myPCA1 <- getBestModel(myPCA) myPCA1$scores res_plnpca <- list(PCs= myPCA1$scores, bbeta= myPCA1$model_par$B, loadings=myPCA1$model_par$C) return(res_plnpca) } tic <- proc.time() fit_plnpca <- PLNPCA_run(X_count, covariates = Z[,-1], q= q) toc <- proc.time() time_plnpca <- toc[3] - tic[3] message(time_plnpca, " seconds") metricList$PLNPCA$Tr_H <- measurefun(fit_plnpca$PCs, H0) metricList$PLNPCA$Tr_B <- measurefun(fit_plnpca$loadings, B0) metricList$PLNPCA$err_bb1 <- norm_vec(fit_plnpca$bbeta[,1]- bbeta0[,1]) metricList$PLNPCA$err_bb <- norm_vec(as.vector(fit_plnpca$bbeta) - as.vector(bbeta0)) metricList$PLNPCA$Time <- time_plnpca
(6) Sixthly, we implement the generalized linear latent variable models (GLLVM, Hui et al. 2017).
## GLLVM runs very slowly, so we do not run it here. library(gllvm) colnames(Z) <- c(paste0("V",1: ncol(Z))) tic <- proc.time() fit <- gllvm(y=X_count, X=Z, family=poisson(), num.lv= q, control = list(trace=T)) toc <- proc.time() time_gllvm <- toc[3] - tic[3] metricList$GLLVM <- list() metricList$GLLVM$Tr_H <- measurefun(fit$lvs, H0) metricList$GLLVM$Tr_B <- measurefun(fit$params$theta, B0) metricList$GLLVM$err_bb1 <- norm_vec(fit$params$beta0- bbeta0[,1]) metricList$GLLVM$err_bb <- norm_vec(as.vector(cbind(fit$params$beta0,fit$params$Xcoef)) - as.vector(bbeta0)) metricList$GLLVM$Time <- time_gllvm }
(7) Seventhly, Poisson regression model for each variable was implemented.
PoisReg <- function(X_count, covariates){ library(stats) hbbeta <- apply(X_count, 2, function(x){ glm1 <- glm(x~covariates+0, family = "poisson") coef(glm1) } ) return(t(hbbeta)) } tic <- proc.time() hbbeta_poisreg <- PoisReg(X_count, Z) toc <- proc.time() time_poisreg <- toc[3] - tic[3] metricList$GLM <- list() metricList$GLM$Tr_H <- NA metricList$GLM$Tr_B <- NA metricList$GLM$err_bb1 <- norm_vec(hbbeta_poisreg[,1]- bbeta0[,1]) metricList$GLM$err_bb <- norm_vec(as.vector(hbbeta_poisreg) - as.vector(bbeta0)) metricList$GLM$Time <- time_poisreg
(8) Eightly, we implemented the first version of multi-response reduced-rank Poisson regression model (MMMR, Luo et al. 2018) implemented in rrpack R package (MRRR-Z), that did not consider the latent factor structure but only the covariates.
mrrr_run <- function(Y, X, rank0, q=NULL, family=list(poisson()), familygroup=rep(1,ncol(Y))){ require(rrpack) n <- nrow(Y); p <- ncol(Y) if(!is.null(q)){ rank0 <- rank0+q X <- cbind(X, diag(n)) } svdX0d1 <- svd(X)$d[1] init1 = list(kappaC0 = svdX0d1 * 5) ## this setting follows the example that authors provided. fit.mrrr <- mrrr(Y=Y, X=X[,-1], family = family, familygroup = familygroup, penstr = list(penaltySVD = "rankCon", lambdaSVD = 0.1), init = init1, maxrank = rank0) hbbeta_mrrr <-t(fit.mrrr$coef[1:ncol(Z), ]) if(!is.null(q)){ Theta_hb <- (fit.mrrr$coef[(ncol(Z)+1): (nrow(Z)+ncol(Z)), ]) svdTheta <- svd(Theta_hb, nu=q, nv=q) return(list(hbbeta=hbbeta_mrrr, factor=svdTheta$u, loading=svdTheta$v)) }else{ return(list(hbbeta=hbbeta_mrrr)) } } tic <- proc.time() res_mrrrz <- mrrr_run(X_count, Z, rank0) toc <- proc.time() time_mrrrz <- toc[3] - tic[3] metricList$MRRR_Z <- list() metricList$MRRR_Z$Tr_H <- NA metricList$MRRR_Z$Tr_B <-NA metricList$MRRR_Z$err_bb1 <- norm_vec(res_mrrrz$hbbeta[,1]- bbeta0[,1]) metricList$MRRR_Z$err_bb <- norm_vec(as.vector(res_mrrrz$hbbeta) - as.vector(bbeta0)) metricList$MRRR_Z$Time <- time_mrrrz
(9) Lastly, we implemented the second version of MRRR (MRRR-F) that considered both covariates and the latent factor structure.
tic <- proc.time() res_mrrrf <- mrrr_run(X_count, Z, rank0, q=q) toc <- proc.time() time_mrrrf <- toc[3] - tic[3] metricList$MRRR_F <- list() metricList$MRRR_F$Tr_H <- measurefun(res_mrrrf$factor, H0) metricList$MRRR_F$Tr_B <- measurefun(res_mrrrf$loading, B0) metricList$MRRR_F$err_bb1 <- norm_vec(res_mrrrf$hbbeta[,1]- bbeta0[,1]) metricList$MRRR_F$err_bb <- norm_vec(as.vector(res_mrrrf$hbbeta) - as.vector(bbeta0)) metricList$MRRR_F$Time <- time_mrrrf
Next, we summarized the metrics for COAP and other compared methods in a dataframe object.
list2vec <- function(xlist){ nn <- length(xlist) me <- rep(NA, nn) idx_noNA <- which(sapply(xlist, function(x) !is.null(x))) for(r in idx_noNA) me[r] <- xlist[[r]] return(me) } dat_metric <- data.frame(Tr_H = sapply(metricList, function(x) x$Tr_H), Tr_B = sapply(metricList, function(x) x$Tr_B), err_bb1 =sapply(metricList, function(x) x$err_bb1), err_bb = list2vec(lapply(metricList, function(x) x[['err_bb']])), Method = names(metricList)) dat_metric
Plot the results for COAP and other methods, which suggests that COAP achieves better estimation accuracy for the quantiites of interest.
library(cowplot) p1 <- ggplot(data=subset(dat_metric, !is.na(Tr_B)), aes(x= Method, y=Tr_B, fill=Method)) + geom_bar(stat="identity") + xlab(NULL) + scale_x_discrete(breaks=NULL) + theme_bw(base_size = 16) p2 <- ggplot(data=subset(dat_metric, !is.na(Tr_H)), aes(x= Method, y=Tr_H, fill=Method)) + geom_bar(stat="identity") + xlab(NULL) + scale_x_discrete(breaks=NULL)+ theme_bw(base_size = 16) p3 <- ggplot(data=subset(dat_metric, !is.na(err_bb1)), aes(x= Method, y=err_bb1, fill=Method)) + geom_bar(stat="identity") + xlab(NULL) + scale_x_discrete(breaks=NULL)+ theme_bw(base_size = 16) p4 <- ggplot(data=subset(dat_metric, !is.na(err_bb)), aes(x= Method, y=err_bb, fill=Method)) + geom_bar(stat="identity") + xlab(NULL) + scale_x_discrete(breaks=NULL)+ theme_bw(base_size = 16) plot_grid(p1,p2,p3, p4, nrow=2, ncol=2)
We applied the singular value ratio based method to select the number of factors and the rank of coefficient matrix. The results showed that the SVR method has the potential to identify the true values.
datList <- gendata_simu(seed = 1, n=n, p=p, d= d, rank0 = rank0, q= q, rho=c(3, 6), sigma2_eps = 1) X_count <- datList$X; Z <- datList$Z res1 <- selectParams(X_count=datList$X, Z=datList$Z, verbose=F) print(c(q_true=q, q_est=res1['hq'])) print(c(r_true=rank0, r_est=res1['hr']))
Session Info
sessionInfo()
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.