In this tutorial, we show the usage of the overdispersed generalized factor model (OverGFM) and compare it with the competitors.
The package can be loaded with the following command:
library("GFM")
The rrpack package can be loaded with the following command:
library("rrpack")
The PCAmixdata package can be loaded with the following command:
library("PCAmixdata")
We define the function with details for the data generation mechanism.
gendata_s2 <- function (seed = 1, n = 500, p = 500, type = c('homonorm', 'heternorm', 'pois', 'bino', 'norm_pois', 'pois_bino', 'npb'), q = 6, rho = c(0.05, 0.2, 0.1), n_bin=1, sigma_eps=0.1){ library(MASS) Diag <- GFM:::Diag cor.mat <- GFM:::cor.mat type <- match.arg(type) rho_gauss <- rho[1] rho_pois <- rho[2] rho_binary <- rho[3] set.seed(seed) Z <- matrix(rnorm(p * q), p, q) if (type == "homonorm") { g1 <- 1:p Z <- rho_gauss * Z }else if (type == "heternorm"){ g1 <- 1:p Z <- rho_gauss * Z }else if(type == "pois"){ g1 <- 1:p Z <- rho_pois * Z }else if(type == 'bino'){ g1 <- 1:p Z <- rho_binary * Z }else if (type == "norm_pois") { g1 <- 1:floor(p/2) g2 <- (floor(p/2) + 1):p Z[g1, ] <- rho_gauss * Z[g1, ] Z[g2, ] <- rho_pois * Z[g2, ] }else if (type == "pois_bino") { g1 <- 1:floor(p/2) g2 <- (floor(p/2) + 1):p Z[g1, ] <- rho_pois * Z[g1, ] Z[g2, ] <- rho_binary * Z[g2, ] }else if(type == 'npb'){ g1 <- 1:floor(p/3) g2 <- (floor(p/3) + 1):floor(p*2/3) g3 <- (floor(2*p/3) + 1):p Z[g1, ] <- rho_gauss * Z[g1, ] Z[g2, ] <- rho_pois * Z[g2, ] Z[g3, ] <- rho_binary * Z[g3, ] } svdZ <- svd(Z) B1 <- svdZ$u %*% Diag(svdZ$d[1:q]) B0 <- B1 %*% Diag(sign(B1[1, ])) mu0 <- 0.4 * rnorm(p) Bm0 <- cbind(mu0, B0) set.seed(seed) H <- mvrnorm(n, mu = rep(0, q), cor.mat(q, 0.5)) svdH <- svd(cov(H)) H0 <- scale(H, scale = F) %*% svdH$u %*% Diag(1/sqrt(svdH$d)) %*% svdH$v if (type == "homonorm") { X <- H0 %*% t(B0) + matrix(mu0, n, p, byrow = T) + mvrnorm(n, rep(0, p), sigma_eps*diag(p)) group <- rep(1, p) XList <- list(X) types <- c("gaussian") }else if (type == "heternorm") { sigmas = sigma_eps*(0.1 + 4 * runif(p)) X <- H0 %*% t(B0) + matrix(mu0, n, p, byrow = T) + mvrnorm(n, rep(0, p), diag(sigmas)) group <- rep(1, p) XList <- list(X) types <- c("gaussian") }else if (type == "pois") { Eta <- H0 %*% t(B0) + matrix(mu0, n, p, byrow = T) + mvrnorm(n,rep(0, p), sigma_eps*diag(p)) mu <- exp(Eta) X <- matrix(rpois(n * p, lambda = mu), n, p) group <- rep(1, p) XList <- list(X[,g1]) types <- c("poisson") }else if(type == 'bino'){ Eta <- cbind(1, H0) %*% t(Bm0[g1, ]) + mvrnorm(n,rep(0, p), sigma_eps*diag(p)) mu <- 1/(1 + exp(-Eta)) X <- matrix(rbinom(prod(dim(mu)), n_bin, mu), n, p) group <- rep(1, p) XList <- list(X[,g1]) types <- c("binomial") }else if (type == "norm_pois") { Eps <- mvrnorm(n,rep(0, p), sigma_eps*diag(p)) mu1 <- cbind(1, H0) %*% t(Bm0[g1, ]) + Eps[, g1] mu2 <- exp(cbind(1, H0) %*% t(Bm0[g2, ])+ Eps[, g2]) X <- cbind(matrix(rnorm(prod(dim(mu1)), mu1, 1), n, floor(p/2)), matrix(rpois(prod(dim(mu2)), mu2), n, ncol(mu2))) group <- c(rep(1, length(g1)), rep(2, length(g2))) XList <- list(X[,g1], X[,g2]) types <- c("gaussian", "poisson") }else if (type == "pois_bino") { Eps <- mvrnorm(n,rep(0, p), sigma_eps*diag(p)) mu1 <- exp(cbind(1, H0) %*% t(Bm0[g1, ])+ Eps[,g1]) mu2 <- 1/(1 + exp(-cbind(1, H0) %*% t(Bm0[g2, ])- Eps[,g2])) X <- cbind(matrix(rpois(prod(dim(mu1)), mu1), n, ncol(mu1)), matrix(rbinom(prod(dim(mu2)), n_bin, mu2), n, ncol(mu2))) group <- c(rep(1, length(g1)), rep(2, length(g2))) XList <- list(X[,g1], X[,g2]) types <- c("poisson", 'binomial') }else if(type == 'npb'){ Eps <- mvrnorm(n,rep(0, p), sigma_eps*diag(p)) mu11 <- cbind(1, H0) %*% t(Bm0[g1, ]) + Eps[,g1] mu1 <- exp(cbind(1, H0) %*% t(Bm0[g2, ]) + Eps[,g2]) mu2 <- 1/(1 + exp(-cbind(1, H0) %*% t(Bm0[g3, ])- Eps[,g3])) X <- cbind(matrix(rnorm(prod(dim(mu11)),mu11, 1), n, ncol(mu11)), matrix(rpois(prod(dim(mu1)), mu1), n, ncol(mu1)), matrix(rbinom(prod(dim(mu2)), n_bin, mu2), n, ncol(mu2))) group <- c(rep(1, length(g1)), rep(2, length(g2)), rep(3, length(g3))) XList <- list(X[,g1], X[,g2], X[,g3]) types <- c("gaussian", "poisson", 'binomial') } return(list(X=X, XList = XList, types= types, B0 = B0, H0 = H0, mu0 = mu0)) }
GFM method capable of handling mixed-type data is implemented in the R package $\bf GFM$.
MRRR method which is implemented in the R package $\bf rrpack$ also handles mixed-type data through reduced-rank regression model, and we redefine the function of this method and modify its output results for comparison conveniently.
Diag <- GFM:::Diag ## Compare with MRRR mrrr_run <- function(Y, rank0,family=list(poisson()), familygroup, epsilon = 1e-4, sv.tol = 1e-2,lambdaSVD=0.1, maxIter = 2000, trace=TRUE){ # epsilon = 1e-4; sv.tol = 1e-2; maxIter = 30; trace=TRUE require(rrpack) n <- nrow(Y); p <- ncol(Y) X <- cbind(1, diag(n)) svdX0d1 <- svd(X)$d[1] init1 = list(kappaC0 = svdX0d1 * 5) offset = NULL control = list(epsilon = epsilon, sv.tol = sv.tol, maxit = maxIter, trace = trace, gammaC0 = 1.1, plot.cv = TRUE, conv.obj = TRUE) res_mrrr <- mrrr(Y=Y, X=X[,-1], family = family, familygroup = familygroup, penstr = list(penaltySVD = "rankCon", lambdaSVD = lambdaSVD), control = control, init = init1, maxrank = rank0) hmu <- res_mrrr$coef[1,] hTheta <- res_mrrr$coef[-1,] #print(dim(hTheta)) # Matrix::rankMatrix(hTheta) svd_Theta <- svd(hTheta, nu=rank0,nv =rank0) hH <- svd_Theta$u hB <- svd_Theta$v %*% Diag(svd_Theta$d[1:rank0]) #print(dim(svd_Theta$v)) #print(dim(Diag(svd_Theta$d))) return(list(hH=hH, hB=hB, hmu= hmu)) }
PCAmix method which uses Principal component analysis for data with mix of qualitative and quantitative variables is implemented in the R package $\bf PCAmixdata$.
LFM method which handles linear factor model is implemented in the R package $\bf GFM$, and we define the function of this method based on R package $\bf GFM$.
factorm <- function(X, q=NULL){ signrevise <- GFM:::signrevise if ((!is.null(q)) && (q < 1)) stop("q must be NULL or other positive integer!") if (!is.matrix(X)) stop("X must be a matrix.") mu <- colMeans(X) X <- scale(X, scale = FALSE) n <- nrow(X) p <- ncol(X) if (p > n) { svdX <- eigen(X %*% t(X)) evalues <- svdX$values eigrt <- evalues[1:(21 - 1)]/evalues[2:21] if (is.null(q)) { q <- which.max(eigrt) } hatF <- as.matrix(svdX$vector[, 1:q] * sqrt(n)) B2 <- n^(-1) * t(X) %*% hatF sB <- sign(B2[1, ]) hB <- B2 * matrix(sB, nrow = p, ncol = q, byrow = TRUE) hH <- sapply(1:q, function(k) hatF[, k] * sign(B2[1, ])[k]) } else { svdX <- eigen(t(X) %*% X) evalues <- svdX$values eigrt <- evalues[1:(21 - 1)]/evalues[2:21] if (is.null(q)) { q <- which.max(eigrt) } hB1 <- as.matrix(svdX$vector[, 1:q]) hH1 <- n^(-1) * X %*% hB1 svdH <- svd(hH1) hH2 <- signrevise(svdH$u * sqrt(n), hH1) if (q == 1) { hB1 <- hB1 %*% svdH$d[1:q] * sqrt(n) } else { hB1 <- hB1 %*% diag(svdH$d[1:q]) * sqrt(n) } sB <- sign(hB1[1, ]) hB <- hB1 * matrix(sB, nrow = p, ncol = q, byrow = TRUE) hH <- sapply(1:q, function(j) hH2[, j] * sB[j]) } sigma2vec <- colMeans((X - hH %*% t(hB))^2) res <- list() res$hH <- hH res$hB <- hB res$mu <- mu res$q <- q res$sigma2vec <- sigma2vec res$propvar <- sum(evalues[1:q])/sum(evalues) res$egvalues <- evalues attr(res, "class") <- "fac" return(res) }
First, we generate a simulated data for three mixed-type variables based on the aforementioned data generate function. We fix $(n,p) = (500,500)$ ,$\sigma^2 = 0.7$ and the signal strength $(\rho_1, \rho_2, \rho_3)=(0.05,0.2,0.1)$. Additionally, we set the number of factor $q$ is 6. The details for the data setting is following :
q <- 6 datList <- gendata_s2(seed = 1, type= 'npb', n=500, p=500, q=q, rho= c(0.05, 0.2, 0.1) ,sigma_eps = 0.7)
Second, we define the trace statistic to assess the performance.
trace_statistic_fun <- function(H, H0){ tr_fun <- function(x) sum(diag(x)) mat1 <- t(H0) %*% H %*% ginv(t(H) %*% H) %*% t(H) %*% H0 tr_fun(mat1) / tr_fun(t(H0) %*% H0) }
Then we use OverGFM to fit model.
gfm_over <- overdispersedGFM(datList$XList, types=datList$types, q=q) OverGFM_H <- trace_statistic_fun(gfm_over$hH, datList$H0) OverGFM_G <- trace_statistic_fun(cbind(gfm_over$hmu,gfm_over$hB), cbind(datList$mu0,datList$B0))
We use other methods to fit model.
lfm <- factorm(datList$X, q=q) gfm_am <- gfm(datList$XList, types=datList$types, q=q, algorithm = "AM", maxIter = 15) familygroup <- lapply(1:length(datList$types), function(j) rep(j, ncol(datList$XList[[j]]))) res_mrrr <- mrrr_run(datList$X, rank0=q, family=list(gaussian(), poisson(), binomial()),familygroup = unlist(familygroup), maxIter=2000) dat_bino <- as.data.frame(datList$XList[[3]]) for(jj in 1:ncol(dat_bino)) dat_bino[,jj] <- factor(dat_bino[,jj]) dat_norm <- as.data.frame(cbind(datList$XList[[1]],datList$XList[[2]])) res_pcamix <- PCAmix(X.quanti = dat_norm, X.quali = dat_bino,rename.level=TRUE, ndim=q, graph=F) reslits <- lapply(res_pcamix$coef, function(x) x[c(seq(2, ncol(dat_norm)+1, by=1), seq(ncol(dat_norm)+3, nrow(res_pcamix$coef[[1]]), by=2)),]) loadings <- Reduce(cbind, reslits) GFM_H <- trace_statistic_fun(gfm_am$hH, datList$H0) GFM_G <- trace_statistic_fun(cbind(gfm_am$hmu,gfm_am$hB), cbind(datList$mu0,datList$B0)) MRRR_H <- trace_statistic_fun(res_mrrr$hH, datList$H0) MRRR_G <- trace_statistic_fun(cbind(res_mrrr$hmu,res_mrrr$hB), cbind(datList$mu0,datList$B0)) PCAmix_H <- trace_statistic_fun(res_pcamix$ind$coord, datList$H0) PCAmix_G <- trace_statistic_fun(loadings, cbind(datList$mu0,datList$B0)) LFM_H <- trace_statistic_fun(lfm$hH, datList$H0) LFM_G <- trace_statistic_fun(cbind(lfm$mu,lfm$hB), cbind(datList$mu0,datList$B0))
We visualize the comparison of the trace statistic of $\bf H$ and $\bf \Upsilon$ for each methods
library(ggplot2) value <- c(OverGFM_H,OverGFM_G,GFM_H,GFM_G,MRRR_H,MRRR_G,PCAmix_H,PCAmix_G,LFM_H,LFM_G) df <- data.frame(Value = value, Methods = factor(rep(c("OverGFM","GFM","MRRR","PCAmix","LFM"), each = 2), levels = c("OverGFM","GFM","MRRR","PCAmix","LFM")), Trace = factor(rep(c("Tr_H","Tr_Gamma"), times = 5),levels = c("Tr_H","Tr_Gamma"))) ggplot(data = df,aes(x = Methods, y = Value, colour = Methods, fill=Methods)) + geom_bar(stat="identity") + facet_grid(Trace ~ .,drop = TRUE, scales = "free_x") + theme_bw() + theme(axis.text.x = element_text(size = 10,angle = 25, hjust = 1, vjust = 1), axis.text.y = element_text(size = 10, hjust = 1, vjust = 1), axis.title.x = element_text(size = 15), axis.title.y = element_text(size = 15), legend.title=element_blank())+ labs( x="Method", y = "Trace statistic ")
sessionInfo()
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.