# transform a covariance matrix into a block diagonal matrix, each block is the
# covariance matrix (or its inverse if inv=TRUE) of a group of variables
cov.bdiag <- function(M, group, inv=TRUE){
uniq <- unique(group)
L <- list()
i <- 1
for(g in uniq){
ix <- group == g
L[[i]] <- if(inv) solve(M[ix,ix]) else M[ix,ix]
i <- i+1
}
bdiag(L)
}
# get pvalue and coverage probability for grouped variables:
group.pval <- function(formula, data, family, intercept, gamma_hat, group){
if(intercept) group <- c(min(group)-1,group)
uniq.grp <- unique(group); n <- nrow(data)
grp.size <- sapply(uniq.grp, function(x) sum(group==x))
fit0 <- glm(formula, data = data, family = family,
start=gamma_hat, control=glm.control(epsilon=1e20, maxit=1, trace=F))
VCOV.gamma <- summary(fit0)$"cov.scaled"
VCOV.inv <- cov.bdiag(VCOV.gamma,group)
lst_hat <- split(gamma_hat,group); lst_hat <- bdiag(lst_hat)
stat_pval <- diag(t(lst_hat)%*%VCOV.inv%*%(lst_hat))
pvalues <- pchisq(stat_pval, df=grp.size, lower.tail=F)
return(pvalues)
}
#' The gMIC Function for (Group) Variable Selection in Generalized Linear Model
#'
#' @param formula An object of class \code{\link[stats]{formula}}, with the response on the left of a \code{~} operator, and the terms on the right.
#' @param family A description of the error distribution and link function to be used in the model. Preferably for computational speed,
#' this is a character string naming a family function among the following three choices: \code{"gaussian"},
#' \code{"binomial"}, or \code{"poisson"}. Otherwise, it has to be a family function or the result of a call to a family function that
#' can be called for by \code{\link[stats]{glm.fit}}. See \code{\link[stats]{family}} for details of family functions.
#' @param data A data.frame in which to interpret the variables named in the \code{formula} argument.
#' @param group A vector indicating the group structure of the model. For example, assume that X has 4 columns and group=c(1,1,2,2).
#' It means the first 2 features form a group of variables and the last 2 features form another group of variables.
#' @param beta0 A vector eqaul to the initial value for the model parameter, default is NULL.
#' @param criterion A string indicating the type of information criterion ("AIC" or "BIC") to approximate. Default is "BIC".
#' @param lambda0 A number, the user-specified penalty parameter for model complexity. If \code{criterion="AIC"} or \code{"BIC"}, the value
#' of \code{lambda0} will be ignored.
#' @param a0 The approximation parameter of the gMIC method.
#' @param scale.x A boolean indicating whether or not to studentize the features. Default is \code{TRUE}.
#' @param orthogonal.x A boolean indicating whether or not to orthogonalize the features within each group. Default is true. See \code{link{orthogonalize}} for details.
#' @param rounding.digits Number of digits after the decimal point for rounding-up estiamtes. Default value is 4.
#' @param optim.method Optimization method for gMIC, one of c("GenSA", "BFGS", "ADAM"), indicating we use GenSA, BFGS, or ADAM for gMIC optimmization.
#' Default is BFGS. For unknown methods specified by user, the default with be used.
#' @param lower The lower bounds for the search space in \code{GenSA}. The default is -10 (\eqn{p} by \eqn{1} vector).
#' @param upper The upper bounds for the search space in \code{GenSA}. The default is +10 (\eqn{p} by \eqn{1} vector).
#' @param maxit.global Maximum number of iterations allowed for the global optimization algorithm \code{SANN}. Default value is 100.
#' @param maxit.local Maximum number of iterations allowed for the local optimizaiton algorithm \code{BFGS}. Default value is 100.
#' @param epsilon The convergence tolerance.
#' @param stepsize The stepsize (or learning rate) for optim.method = "GD" and "ADAM".
#' @param details Logical value: if \code{TRUE}, detailed results will be printed out when running \code{glm.gMIC}.
#' @return A list of objects as follows,
#' \describe{
#' \item{coefficients}{The estimates for gamma, the standard error of gamma, pvalues of gamma, and estimates of beta.}
#' \item{group.pvalues}{The group-level p-values for each group of variables.}
#' }
#' @import GenSA
#' @import Matrix
#' @examples
#' library(MASS)
#' library(Matrix)
#' n=500;a=100
#' sig <- function(k, rho){
#' m = matrix(rho,nrow=k,ncol=k)
#' diag(m) <- 1
#' return(m)
#' }
#' bt = c(1,1,1, .5,.5,.5, rep(0,24)); p = length(bt)
#' group = c(1,1,1, 2,2,2,rep(3:10,each=3))
#' rho1 = 0.1; rho2 = 0.6
#' COV = sig(p,rho1) + bdiag(rep(list(sig(3,rho2)),p/3))
#' set.seed(1234)
#' X = mvrnorm(n,rep(0,p),COV)
#' z = X%*%bt; pr = 1/(1+exp(-z))
#' y = rbinom(n,1,pr)
#' Xy = as.data.frame(cbind(X,y))
#' dim(Xy)
#' names(Xy) <- c(paste("X",1:30,sep=""),"y")
#' fit <- glm.gMIC(y~.-1,group=group,family="binomial",a0=a,data=Xy,orthogonal.x=T)
#'
#' @export
#'
glm.gMIC <- function(formula, family, data, group=NULL,
beta0=NULL, criterion ="BIC", lambda0=0, a0=NULL,
scale.x=FALSE, orthogonal.x=FALSE, # Do we orthogonalize X?
rounding.digits = 4, # Rounding digits for final result
optim.method = "BFGS", lower=NULL, upper=NULL,
maxit.global=100, maxit.local=100, epsilon=1e-6,
stepsize = 0.01, details=FALSE)
{
call <- match.call()
# CHECK THE family= ARGUMENT
family0 <- family
if (is.character(family))
family <- get(family, mode = "function", envir = parent.frame())
if (is.function(family))
family <- family()
if (!is.element(family$family, c("gaussian", "binomial", "poisson")))
stop(paste0("Family '", family$family, "' not supported!"))
# CHECK THE data= ARGUMENT
if (missing(data)) data <- environment(formula)
# OBTAIN THE DESIGN MATRIX X AND RESPONSE y
mf <- match.call(expand.dots = FALSE)
m <- match(c("formula", "data"), names(mf), 0L)
mf <- mf[c(1L, m)]
mf$drop.unused.levels <- TRUE
mf[[1L]] <- quote(stats::model.frame)
mf <- eval(mf, parent.frame())
mt <- attr(mf, "terms")
Y <- model.response(mf, "any")
if (length(dim(Y)) == 1L) {
nm <- rownames(Y); dim(Y) <- NULL
if (!is.null(nm)) names(Y) <- nm
}
yname <- as.character(formula[[2]]); # EXTRACT RESPONSE NAME FROM THE FORMULA
X <- if (!is.empty.model(mt)) model.matrix(mt, mf)
else matrix(, NROW(Y), 0L)
Xnames <- colnames(X)
intercept <- Xnames[1]=="(Intercept)"
if(is.null(group)) {
if(intercept) group <- (1:(ncol(X)-1))
else group <- (1:ncol(X))
}
# Standardize X
if (scale.x || orthogonal.x) {
if (intercept){
X1 <- X[, -1]
if(scale.x) { X1 <- scale(X1, center = TRUE, scale = TRUE); sc = attr(X1,"scaled:scale"); adj <- attr(X1,"scaled:center") }
if(orthogonal.x) X1 <- orthogonalize(X1, group)
X[, -1] <- X1
} else {
if(scale.x) { X <- scale(X, center = TRUE, scale = TRUE); sc = attr(X,"scaled:scale")}
if(orthogonal.x) X <- orthogonalize(X, group)
}
}
# DETERMINE lambda
n <- NROW(Y);
if (is.null(a0)) a0 <- min(n, 100)
lambda <- if (criterion =="BIC") log(n) else if (criterion =="AIC") 2 else lambda0
# DETERMINE OPTIMIZATION METHODS
if(!optim.method %in% c("BFGS", "GenSA", "ADAM")) {
warning(paste0("Unknow optimization method: \"", optim.method, "\", \"BFGS\" will be used!"))
optim.method = "BFGS"
}
# OBTAIN MLE AS STARTING VALUE
if (is.null(beta0)) {
fit <- eval(call("glm.fit", x =X, y = Y, family = family))
beta0 <- fit$coef
if (details) {
print("Initial beta:")
print(beta0)
}
}
p <- length(beta0)
# THE OBJECTIVE FUNCTION
if(is.character(family0)) {
fun <- LoglikPen
} else {
fun <- LoglikPenGLM
family0 <- family
}
# GRADIENT
grad <- NULL
# USING GENERALIZED SIMULATED ANNEALING {GenSA} PLUS BFGS
if (optim.method == "BFGS") {
# OPTIMIZATION USING SIMULATED ANNEALING, FOLLOWED BY BFGS
opt.fit1 <- optim(par=beta0, fn=fun, gr = grad,
method = "SANN", control = list(maxit=maxit.global, trace=F, reltol=epsilon),
group=group,X=X, y=Y, lambda=lambda, a=a0, family = family0)
opt.fit2 <- optim(par=opt.fit1$par, fn=fun, gr = grad,
method = "BFGS", control = list(maxit=maxit.local, trace=F, reltol=epsilon),
group=group, X=X, y=Y, lambda=lambda, a=a0, family = family0)
gamma <- opt.fit2$par
# min.Q <- opt.fit2$value
} else if(optim.method == "GenSA") {
# THE LOWER AND UPPER BOUNDS FOR SEARCH SPACE IN OPTIMIZATION
if (is.null(lower)) {lower=rep(-10, p); upper <- rep(10, p)}
else if (length(lower)==1 && p > 1) {lower <- rep(lower, p); upper <- rep(upper, p)}
else if (length(lower)!= p) stop("Wrong specification for lower= and upper=. Be aware of its appropriate dimension!")
opt.fit <- GenSA::GenSA(par=beta0, fn=fun, lower = lower, upper = upper,
control=list(maxit=maxit.global, nb.stop.improvement=5),
group=group, X=X, y=Y, lambda=lambda, a=a0, family = family0)
# min.Q <- opt.fit1$value
gamma <- opt.fit$par
} else{
gamma <- as.vector(adam_gmic(X, Y, a0, lambda, beta0, group, family$family, stepsize, epsilon, maxit.global))
}
if (details) {
print("Fitted Gamma:")
print(gamma)
}
# Reverse the scaling and/or orthogonalization
if(scale.x || orthogonal.x){
if (intercept) {
if(orthogonal.x) {
gamma[-1] <- unorthogonalize(gamma[-1],X1)
}
if(scale.x) {
gamma[-1] <- gamma[-1]/sc
gamma[1] <- gamma[1] - sum(gamma[-1]*adj)
}
}
else{
if(orthogonal.x) {
gamma <- unorthogonalize(gamma, X)
}
if(scale.x) {
gamma <- gamma/sc
}
}
}
# Prepare the variable selection result
beta.hat <- rep(0, length(gamma))
grps <- unique(group)
for(grp in grps) {
ix <- if(intercept) which(group == grp) + 1 else which(group == grp)
m = length(ix)
beta.hat[ix] <- gamma[ix] * tanh(a0 / m *sum(gamma[ix]^2))
}
# Obtain SE for gamma
pvals <- group.pval(formula, data, family, intercept, gamma, group)
fit0 <- glm(formula, data = data, family = family,
start=gamma, control=glm.control(epsilon=1e20, maxit=1, trace=F))
VCOV.gamma <- summary(fit0)$"cov.scaled"
se.gamma <- sqrt(diag(VCOV.gamma))
z.gamma <- gamma/se.gamma
pvalue.gamma <- 2*pnorm(abs(z.gamma), lower.tail=F)
beta.hat <- round(beta.hat, rounding.digits) # Rounding
##
result <- cbind(gamma=gamma,se.gamma=se.gamma,pvalue.gamma=pvalue.gamma,beta=beta.hat)
list(coefficients=data.frame(result), group.pvalues=pvals)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.