mMRFfit <- function(
data, #data matrix, col=variables
type, #data type for col 1:ncol; c=categorical, g=gaussian, p=poisson, e=exponential
lev, #number of categories of categorical variables, continuous variables have level=1
lambda.sel="CV", #method for penalization parameter (lambda) -selection
folds=10, #folds in case CV is used for lambda selection
gam=.25, #tuning parameter for EBIC, in case EBIC is used for lambda selection
d=1, #maximal degree of the true graph
rule.reg="AND", #parameter-aggregation of categorical variables
rule.cat="OR", #either "OR"- conditional independence; or matrix that specifies costumized rules
pbar = TRUE # shows a progress bar if TRUE
)
{
# step 1: sanity checks & info from data
stopifnot(ncol(data)==length(type)) # type vector has to match data
stopifnot(ncol(data)==length(lev)) # level vector has to match data
if(sum(apply(data, 2, function(x) class(x)!="numeric"))>0) stop("Only numeric values permitted!")
if(sum(apply(cbind(data[,type=="p"],rep(1,nrow(data))), 2, function(x) sum(x-round(x))))>0) stop("Only integers permitted for Poisson random variables!")
n <- nrow(data)
nNode <- ncol(data)
# step 2: prepare data
#data[,type!="c" & type!="p"] <- scale(data[,type!="c" & type!="p"]) #standardize continuous variables
data <- as.data.frame(data) #necessary for formula input
colnames(data) <- paste("V",1:nNode, sep="") #necessary for formula input
#compare entered and empirical levels
emp_lev <- numeric(nNode) + 1
for(z in 1:nNode)
{
if(type[z]=="c") {
emp_lev[z] <- length(unique(data[,z]))
data[,z] <- as.factor(data[,z]) #turn categoricals into factors (for formula function)
}
}
# warning: entered lev = emp lev ?
if(sum(emp_lev[type=="c"]!=lev[type=="c"])) warning("Entered levels are not equal to empirical levels. Empirical levels are used.")
#indexing-dummy that helps us to put parameters involving categorical variables in the right place
dummy_par.sort <- logical() # this will tell us, in which places of each row we fill parameters; this is because whenn gauss <- cat; we get m-1 parameter
dummy_levels <- numeric()
for(i in 1:nNode)
{
lev.e <- 1
tar.e <- TRUE
if(emp_lev[i]>1)
{
tar.e <- c(FALSE,rep(TRUE,emp_lev[i]-1))
lev.e <- emp_lev[i]-1
}
dummy_par.sort <- c(dummy_par.sort, tar.e)
dummy_levels <- c(dummy_levels, lev.e)
}
dummy_matrix.int <- cbind(dummy_levels, 1:length(dummy_levels))
dummy.ind <- unlist(apply(dummy_matrix.int,1,function(x) { rep(x[2],x[1])}))
dummy_matrix <- cbind(emp_lev, 1:length(emp_lev))
ind <- as.numeric(unlist(apply(dummy_matrix,1,function(x) { rep(x[2],x[1])})))
# step 3: create storage for parameters
model.par.matrix <- matrix(0, sum(emp_lev), sum(emp_lev))
m_lambdas <- matrix(0,nNode,2) #storing lambda threshold and actual lambda
#progress bar
if(pbar==TRUE) {
pb <- txtProgressBar(min = 0, max=nNode, initial=0, char="-", style = 3)
}
# step 4: estimation
for(v in seq_len(nNode))
{
# step 4.1: compute design matrix (adding interactions as a function of d)
if(d>(nNode-1)) {
stop("Order of interactions can be maximal the number of predictors!")
} else if (d==1){ form <- as.formula(paste(colnames(data)[v],"~ (.)"))
} else { form <- as.formula(paste(colnames(data)[v],"~ (.)^",d)) }
X <- model.matrix(form, data=data)[,-1]
#define link function
if(type[v]=="c") {
fam <- "multinomial"
} else if(type[v]=="g" | type[v]=="e") { #should be inverse link for "e", but currently not avail. for glmnet
fam <- "gaussian"
} else if(type[v]=="p") {
fam <- "poisson"
}
# step 4.2: select alpha & call glmnet
#lambda selection with EBIC
if(lambda.sel=="EBIC") {
fit <- glmnet(X, data[,v], family=fam, alpha=1)
#glmnet doesnt give us the pseudo LL, therefore we have to calculate it
#calculate LL_Null depending on cat/cont
if(type[v]=="g") {
mean.i <- coef(fit, s=1)[1] #mean given by intercept model
LL_null <- sum(dnorm(data[,v],mean.i,1, log=TRUE))
} else if(type[v]=="e") {
mean.i <- coef(fit, s=1)[1] #mean given by intercept model
LL_null <- sum(dnorm(data[,v],mean.i,1, log=TRUE))
} else if(type[v]=="p") {
mean.i <- coef(fit, s=1)[1] #mean given by intercept model
LL_null <- sum(dpois(data[,v],exp(mean.i), log=TRUE))
} else if(type[v]=="c") {
n_cats <- emp_lev[v]
#dummy matrices to compute LL
ind_dum <- cbind(matrix(0,n,n_cats), as.numeric(data[,v]))
ind_mat <- t(apply(ind_dum, 1, function(x) {x[x[4]] <- 1; return(x[1:n_cats])} ))
int_mat <- matrix(0,n,n_cats)
for(ca in 1:n_cats) { int_mat[,ca] <- coef(fit, s=1)[[ca]][1] }
LL_null <- 1/n * (sum(ind_mat * int_mat) - n*log(sum(exp(int_mat[1,]))) ) #LL multinomial from glmnet paper
}
# calc LL_sat
LL_sat <- 1/2 * fit$nulldev + LL_null
# calc LL for all lambdas
dev <- glmnet:::deviance.glmnet(fit)
LL <- - 1/2 * dev + LL_sat
n_lambdas <- length(fit$lambda)
# calculation of nonzero neighborhoods
if(type[v]!="c") { #continuous case
coefs_bin <- as.matrix(coef(fit)[-1,]) != 0 #nonzero?
n_neighbors <- colSums(coefs_bin)
}
if(type[v]=="c"){ #categorical case
m_neighbors <- matrix(0,ncol=n_lambdas, nrow=n_cats)
coefs_bin <- vector("list", length=n_cats)
for(ca in 1:n_cats){
coefs_bin[[ca]] <- as.matrix(coef(fit)[[ca]][-1,]) != 0 #nonzero?
}
n_neighbors <- colSums(Reduce('+', coefs_bin)!=0) #rule: a predictor has a nonzero parameter with 1 category of the y, then we have a neighborhood relation
}
# calc all EBICs
EBIC_lambda <- -2*LL + n_neighbors * log(n) + 2*gam*n_neighbors*log(ncol(X))
lambda_select <- fit$lambda[which(EBIC_lambda==min(EBIC_lambda))]
coefs <- coef(fit, s=lambda_select) #lambda value with highest EBIC
# lambda selection with CV
} else {
fit <- cv.glmnet(X, data[,v], family=fam, alpha=1, nfolds=folds, type.measure = "deviance")
lambda_select <- fit$lambda.min
coefs <- coef(fit, s=lambda_select)
} # end of estimation;
#list to matrix; cut out intercepts
coefsm <- matrix(do.call(rbind,lapply(coefs, as.numeric)),nrow=emp_lev[v])[,-1]
# step 4.3: save lambda + save & apply tau threshold
m_lambdas[v,2] <- bound <- sqrt(d) * sqrt(sum(coefsm^2)) * sqrt(log(nNode)/n)
m_lambdas[v,1] <- lambda_select
coefsm[abs(coefsm)<bound]<-0 # apply tau threshold
# step 4.4: write into model.par.matrix
#select corresponding row in model par matrix & fill in
#get correct dummy
dummy_par.sort.v <- dummy_par.sort[ind!=v]
#select corresponding row(s) in model par matrix & fill in
# continuous
if(emp_lev[v]==1) {
exp.n.c <- length(model.par.matrix[ind==v,ind!=v][dummy_par.sort.v]) #number of coefficients
model.par.matrix[ind==v,ind!=v][dummy_par.sort.v] <- coefsm[1:(exp.n.c)]
} else { # categorical
for(L in 1:emp_lev[v])
{
exp.n.c <- length(model.par.matrix[ind==v,ind!=v][,dummy_par.sort.v][L,])
model.par.matrix[ind==v,ind!=v][,dummy_par.sort.v][L,] <- coefsm[L,1:(exp.n.c)]
}
}
#progress bar
if(pbar==TRUE) {
setTxtProgressBar(pb, v)
}
} # end variable-loop
# step 5: derivates on model parameter matrix
# 5.1: aggregate on within categories
f_agg_cats <- function(model.par.matrix, rule) {
#select only colums where paramater are actually estimated (glmnet estimates k-1 not k parameters)
m.p.m <- model.par.matrix[,dummy_par.sort]
# averaging over columns
m.p.m.1 <- t(apply(m.p.m, 1, function(x) {
out <- numeric(0)
for(i in 1:nNode)
{
out.n <- mean(abs(x[dummy.ind==i])) #without abs, this keeps the sign; but because of the glmnet parameterization in categoricals it burries nonzero coefficients in the binary case
if(rule=="AND") {
out.n <- out.n * (sum(x[dummy.ind==i]==0)<1) #the second term = 0 when not all coefficients are nonzero
}
out <- rbind(out, out.n)
}
out <- matrix(out, nrow=1)
}))
# averaging over rows
m.p.m.2 <- apply(m.p.m.1, 2, function(x) {
out <- numeric()
for(i in 1:nNode)
{
out.n <- mean(abs(x[ind==i]))
if(rule=="AND") {
out.n <- out.n * (sum(x[ind==i]==0)<1) #the second term = 0 when not all coefficients are nonzero
}
out <- rbind(out, out.n)
}
out <- matrix(out, ncol=1)
})
} #end of function
#costumized rule
if(rule.cat!="OR")
{
#check on matrix
if(sum(dim(rule.cat) == dim(model.par.matrix))!=2) stop("rule.cat must have the same dimension as the parameter matrix!")
#apply rule
ind_costcat <- ((model.par.matrix + rule.cat)!=1)*1
diag(ind_costcat)<-0
ind_costcat_agg <- f_agg_cats(ind_costcat, "AND")
m.p.m.2 <- f_agg_cats(model.par.matrix, "OR") * ind_costcat_agg
#standard rule: OR which leads to edges that indicate conditional independence
} else {
m.p.m.2 <- f_agg_cats(model.par.matrix, "OR")
}
### 5.3: aggregate across two regressions
if(rule.reg=="AND") {
m.p.m.2_nonzero <- m.p.m.2!=0
m.p.m.2_nonzero <- m.p.m.2_nonzero * t(m.p.m.2_nonzero)
m.p.m.2 <- m.p.m.2 * m.p.m.2_nonzero
}
#make matrices symmetric (taking the average)
wadj <- (m.p.m.2 + t(m.p.m.2))/2 #adjacency matrix
mpar.matrix.sym <- (model.par.matrix+t(model.par.matrix)) / 2
#create list mapping: parameters <-> variables as input for qgraph "group"-argument
parvar.map <- parvar.map.label <- vector("list", length=nNode)
ind_map <- 1
for(m in 1:nNode){
#create indices list for qgraph
parvar.map[[m]] <- ind_map:(ind_map+lev[m]-1)
#create labels for qgraph
if(lev[m]==1)
{
parvar.map.label[[m]] <- m
} else {
parvar.map.label[[m]] <- paste(m, 1:lev[m], sep = ".")
}
ind_map <- ind_map + lev[m]
}
parvar.map.label_all <- do.call(c, parvar.map.label)
#dichotomize
adj <- (wadj!=0)*1
# step 6: output
output_list <- list("adj"=adj, "wadj"=wadj, "wpar.matrix" = model.par.matrix,
"wpar.matrix.sym"=mpar.matrix.sym, "parvar.map"=parvar.map,
"parvar.map.labels"=parvar.map.label_all, "lambda"=m_lambdas[,1])
class(output_list) <- "mMRF"
return(output_list)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.