# Set the global knitr chunk options `collapse` and `comment` for this document.
knitr::opts_chunk$set(
  collapse = TRUE,
  comment = "#>"
)
In this tutorial, we show how to use the alternate maximization (AM) algorithm in the first step of the two-step estimation method and how to adopt the information criterion (IC) method to choose the number of factors.
The package can be loaded with the command:
# Load the GFM package.
library("GFM")
# Set a random seed for reproducibility.
set.seed(1)
First, we generate the data with homogeneous normal variables.
## Homogeneous normal variables
# Generate the data with gendata() using q = 2, n = 100, p = 100 and rho = 3.
dat <- gendata(q = 2, n = 100, p = 100, rho = 3)
Second, we obtain the observed data and set the algorithm parameters before fitting the model.
# Obtain the observed data.
XList <- dat$XList # the data in the form of a matrix list
str(XList)
X <- dat$X # the data in the form of a single matrix
# Set the variables' type; 'gaussian' means the variables are continuous.
types <- "gaussian"
Third, we fit the GFM model with user-specified number of factors.
# Fit the GFM with the AM algorithm and a user-specified q = 2.
gfm1 <- gfm(XList, types, algorithm = "AM", q = 2, verbose = FALSE)
# Measure the performance of the GFM estimators in terms of canonical
# correlations: estimated factors (hH) against dat$H0 and estimated loadings
# (hB) against dat$B0.
measurefun(gfm1$hH, dat$H0, type = "ccor")
measurefun(gfm1$hB, dat$B0, type = "ccor")
The number of factors can also be determined in a data-driven manner.
# Select q automatically with the IC method over the candidate set 1:6.
hq <- chooseFacNumber(
  XList, types,
  select_method = "IC",
  q_set = 1:6,
  verbose = FALSE,
  parallelList = list(parallel = TRUE)
)
# Print the result of the selection.
hq
First, we generate the data with heterogeneous normal variables and set the parameters of the algorithm.
# Generate the data with gendata() using type = 'heternorm' and q = 2.
dat <- gendata(seed = 1, n = 100, p = 100, type = "heternorm", q = 2, rho = 1)
# Obtain the observed data.
XList <- dat$XList # the data in the form of a matrix list
str(XList)
X <- dat$X # the data in the form of a single matrix
# Set the variables' type; 'gaussian' means the variables are continuous.
types <- "gaussian"
Second, we fit the GFM model with a user-specified number of factors and compare the results with those of the linear factor model.
# Fit the GFM with the AM algorithm and a user-specified q = 2.
gfm1 <- gfm(XList, types, algorithm = "AM", q = 2, verbose = FALSE)
# Measure the performance of the GFM estimators in terms of canonical
# correlations.
corH_gfm <- measurefun(gfm1$hH, dat$H0, type = "ccor")
corB_gfm <- measurefun(gfm1$hB, dat$B0, type = "ccor")

# Fit the linear factor model (LFM) with the same q and measure it the same way.
lfm1 <- Factorm(X, q = 2)
corH_lfm <- measurefun(lfm1$hH, dat$H0, type = "ccor")
corB_lfm <- measurefun(lfm1$hB, dat$B0, type = "ccor")

library(ggplot2)
# One row per (method, quantity) pair; the order of CCor matches the order of
# the Method and Quantity labels built below.
df1 <- data.frame(
  CCor = c(corH_gfm, corH_lfm, corB_gfm, corB_lfm),
  Method = factor(rep(c("GFM", "LFM"), times = 2)),
  Quantity = factor(c(rep("factors", 2), rep("loadings", 2)))
)
# Dodged bar chart comparing the two methods for factors and loadings.
ggplot(data = df1, aes(x = Quantity, y = CCor, fill = Method)) +
  geom_bar(position = "dodge", stat = "identity", width = 0.5)
The number of factors can also be determined in a data-driven manner.
# Select q automatically with the IC method over the candidate set 1:6.
hq <- chooseFacNumber(
  XList, types,
  select_method = "IC",
  q_set = 1:6,
  verbose = FALSE,
  parallelList = list(parallel = TRUE)
)
# Print the result so the selection is visible, as in the homogeneous example.
hq
First, we generate the data with count (Poisson) variables and set the parameters of the algorithm.
# q and p are passed to gendata() below.
q <- 3
p <- 200
# Generate the data with gendata() using type = 'pois'.
dat <- gendata(seed = 1, n = 200, p = p, type = "pois", q = q, rho = 4)
# Obtain the observed data.
XList <- dat$XList # the data in the form of a matrix list
str(XList)
X <- dat$X # the data in the form of a single matrix
# Set the variables' type; 'poisson' is used because the data are counts.
types <- "poisson"
Second, we fit the GFM model given the true number of factors, and then select the number of factors automatically.
# Time the GFM fit (AM algorithm, q = 3); gfm1 is assigned in the calling
# environment and is used by the later comparison.
system.time({
  gfm1 <- gfm(XList, types, algorithm = "AM", q = 3, verbose = FALSE)
})
# Time the automatic selection of q with the IC method over 1:6.
system.time({
  hq <- chooseFacNumber(
    XList, types,
    q_set = 1:6,
    select_method = "IC",
    parallelList = list(parallel = TRUE)
  )
})
Third, we compare the results with those of the linear factor model.
# Measure the performance of the GFM estimators in terms of canonical
# correlations.
corH_gfm <- measurefun(gfm1$hH, dat$H0, type = "ccor")
corB_gfm <- measurefun(gfm1$hB, dat$B0, type = "ccor")

# Fit the linear factor model (LFM) with q = 3 and measure it the same way.
lfm1 <- Factorm(X, q = 3)
corH_lfm <- measurefun(lfm1$hH, dat$H0, type = "ccor")
corB_lfm <- measurefun(lfm1$hB, dat$B0, type = "ccor")

library(ggplot2)
# One row per (method, quantity) pair; the order of CCor matches the order of
# the Method and Quantity labels built below.
df1 <- data.frame(
  CCor = c(corH_gfm, corH_lfm, corB_gfm, corB_lfm),
  Method = factor(rep(c("GFM", "LFM"), times = 2)),
  Quantity = factor(c(rep("factors", 2), rep("loadings", 2)))
)
# Dodged bar chart comparing the two methods for factors and loadings.
ggplot(data = df1, aes(x = Quantity, y = CCor, fill = Method)) +
  geom_bar(position = "dodge", stat = "identity", width = 0.5)
First, we generate the data with mixed count (Poisson) and binomial variables and set the parameters of the algorithm. Then we fit the GFM model with a user-specified number of factors.
# Generate the data with gendata() using type = 'pois_bino' and q = 2.
dat <- gendata(seed = 1, n = 200, p = 200, type = "pois_bino", q = 2, rho = 2)
# Obtain the observed data.
XList <- dat$XList # the data in the form of a matrix list
str(XList)
X <- dat$X # the data in the form of a single matrix
# Use the variable types returned with the generated data.
types <- dat$types
# Tabulate the values of the first and the 200th column of X.
table(dat$X[, 1])
table(dat$X[, 200])

# Fit the GFM with the AM algorithm and a user-specified q = 2.
gfm2 <- gfm(XList, types, algorithm = "AM", q = 2, verbose = FALSE)
# Canonical correlations of the estimated factors and loadings with dat$H0 and
# dat$B0.
measurefun(gfm2$hH, dat$H0, type = "ccor")
measurefun(gfm2$hB, dat$B0, type = "ccor")
Next, we select the number of factors automatically and compute the GFM performance measures that will be compared with those of the linear factor model.
# Select q automatically with the IC method over the candidate set 1:4.
hq <- chooseFacNumber(
  XList, types,
  select_method = "IC",
  q_set = 1:4,
  verbose = FALSE,
  parallelList = list(parallel = TRUE)
)
# Print the result so the selection is visible.
hq

# Measure the performance of the GFM estimators in terms of canonical
# correlations.
corH_gfm <- measurefun(gfm2$hH, dat$H0, type = "ccor")
corB_gfm <- measurefun(gfm2$hB, dat$B0, type = "ccor")
Compare with linear factor models
# Fit the linear factor model (LFM). q = 2 matches the number of factors used
# to generate the data and to fit gfm2, so both methods are compared on the
# same footing (the previous q = 3 was inconsistent with them).
lfm1 <- Factorm(dat$X, q = 2)
corH_lfm <- measurefun(lfm1$hH, dat$H0, type = "ccor")
corB_lfm <- measurefun(lfm1$hB, dat$B0, type = "ccor")

library(ggplot2)
# One row per (method, quantity) pair; the order of CCor matches the order of
# the Method and Quantity labels built below.
df1 <- data.frame(
  CCor = c(corH_gfm, corH_lfm, corB_gfm, corB_lfm),
  Method = factor(rep(c("GFM", "LFM"), times = 2)),
  Quantity = factor(c(rep("factors", 2), rep("loadings", 2)))
)
# Dodged bar chart comparing the two methods for factors and loadings.
ggplot(data = df1, aes(x = Quantity, y = CCor, fill = Method)) +
  geom_bar(position = "dodge", stat = "identity", width = 0.5)
# Report the R version and the loaded packages used to produce this document.
utils::sessionInfo()
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.