# R/iterateBMAglm_2class.R In iterativeBMA: The Iterative Bayesian Model Averaging (BMA) algorithm

```## iterative bic.glm in R:
zero.threshold <- 0.0001

iterateBMAglm <- function (x, y, curr.mat, stopVar=0, nextVar, nbest=10, thresProbne0 = 1, maxNvar = 30, maxIter=20000) {

## iterative bic.glm
currIter <- 0
while (stopVar == 0 && currIter < maxIter) {
## run bic.glm from the BMA package
ret.bic.glm <- bic.glm (x=curr.mat, y=y, maxCol=(maxNvar+1), glm.family="binomial", nbest=nbest)

## get a logical vector for which probne0 < thresProbne0
rmVector <- which (ret.bic.glm\$probne0 < thresProbne0)

## now, check if we could add more variables
## We want to run bic.glm with as many variables as possible
## so that we are considering more models simultaneously AND
## for efficiency reasons
if (length(ret.bic.glm\$probne0) < maxNvar) {
## we have extra space, so we want to add variables
## stuff up curr.mat with junk, to be removed later
num.var.to.add <- ncol(x) - nextVar + 1
## we only add if we have more variables left to add than
## space from removing low threshold ones
junk.mat <- matrix(0, nrow=nrow(curr.mat), ncol=junk.ncol)
curr.mat <- cbind (curr.mat, junk.mat)
rmVector.old <- rmVector
rmVector <- c(rmVector.old, (length(ret.bic.glm\$probne0)+1):(length(ret.bic.glm\$probne0)+junk.ncol))
}
}

## only increase thresProbne0 if there is no space to add in variables
if (length(rmVector) == 0) {
## we only end up here if length(ret.bic.glm\$probne0) >= maxNvar
## no gene to swap in!!, increase threshold
currMin <- min (ret.bic.glm\$probne0)
print (paste("no gene to swap! Min probne0 = ", currMin, sep=""))
newThresProbne0 <- currMin + 1
print (paste("new probne0 threshold = ", newThresProbne0, sep=""))
rmVector <- which (ret.bic.glm\$probne0 < newThresProbne0)
## print (rmVector)
}

## now, guaranteed there is at least 1 variable to swap
## or there is space to add in new variables

if (nextVar <= ncol(x)) {
## set up new X
## print ("set up new X")
lastVar <- length(rmVector) + nextVar - 1
## to make sure lastVar <= ncol(x)
if (lastVar > ncol(x)) {
rmVector <- rmVector [1: (ncol(x) -nextVar + 1)]
lastVar <- ncol(x)
}
## print (paste(nextVar, lastVar, sep=" "))
##print (dimnames(x)[[2]][nextVar:lastVar])

## check for singularity
curr.mat[, rmVector] <- x[, nextVar:lastVar]
## change the colume name as well!!
dimnames(curr.mat)[[2]][rmVector] <- dimnames(x)[[2]][nextVar:lastVar]
svd.out <- svd (curr.mat)

if (any(svd.out\$d < zero.threshold) == TRUE) {
## singularity, try to add one variable at a time
print ("inside svd")
curr.mat <- curr.mat[, -(rmVector)]
for (curr.var in nextVar:lastVar) {
orig.dimnames <- dimnames (curr.mat)[[2]]
curr.mat <- cbind (curr.mat, x[, curr.var])
dimnames(curr.mat)[[2]] <- c(orig.dimnames, dimnames(x)[[2]] [curr.var])
svd.out <- svd (curr.mat)
if (any(svd.out\$d < zero.threshold) == TRUE) {
## NOT okay to add this variable, rm the last column
curr.mat <- curr.mat[, -(ncol(curr.mat))]
}
}
}

nextVar <- lastVar + 1

} else {
## there is no variable to be removed OR exhausted all data
stopVar <- 1
}
currIter <- currIter + 1
}
print (paste(currIter, ": explored up to variable ## ", nextVar-1, sep=""))

## print out selected genes if iterateBMA is over
if (stopVar == 1) {
selectedGenes <- which(ret.bic.glm\$probne0 >= thresProbne0)
}

list(curr.mat=curr.mat, stopVar=stopVar, nextVar=nextVar)
}

iterateBMAinit <- function (x, maxNvar = 30) {

maxNvar <- min (maxNvar, ncol(x))
curr.mat <- x[, 1:maxNvar]
stopVar <- 0
nextVar <- maxNvar + 1

## make sure the curr.mat is not singular
svd.out <- svd (curr.mat)

## if a variable causes singularity --> a linear combination of others
##           --> just ignore this variable
if (any(svd.out\$d < zero.threshold) == TRUE) {
## start by adding 1 variable at a time
curr.mat <- as.matrix(x[, 1])
dimnames (curr.mat)[[2]] <- as.vector(dimnames(x)[[2]][1], mode="list")
curr.var <- 2
while ((ncol(curr.mat) < maxNvar) && (curr.var <= ncol(x))) {
orig.dimnames <- dimnames (curr.mat)[[2]]
curr.mat <- cbind (curr.mat, x[, curr.var])
dimnames(curr.mat)[[2]] <- c(orig.dimnames, dimnames(x)[[2]] [curr.var])
print (paste("iterateBMAinit: add 1 column --> total ", ncol(curr.mat), sep=""))
svd.out <- svd (curr.mat)
if (any(svd.out\$d < zero.threshold) == TRUE) {
## NOT okay to add this variable, rm the last column
orig.dimnames.2 <- dimnames (curr.mat)[[2]]
orig.dimnames.1 <- dimnames (curr.mat)[[1]]
col.to.rm <- ncol(curr.mat)
curr.mat <- as.matrix (curr.mat[, -(col.to.rm)])
dimnames(curr.mat) <- list(orig.dimnames.1, orig.dimnames.2[-col.to.rm])
print (paste ("variables dropped: ", ncol(curr.mat)+1, sep=""))
}
curr.var <- curr.var + 1
}
## here, ncol(curr.mat) >= maxNvar OR curr.var > ncol(x)
nextVar <- curr.var
}

list(curr.mat=curr.mat, stopVar=stopVar, nextVar=nextVar)
}

## parameters: nbest: input to bicglm
##             sortedA: matrix of independent variables, sorted by columns
##                  expected G * C, all of our data
##             y: vector of response
##             thresProbne0: threshold to rm probne0, default = 1%
##	      maxIter: max ## iterations in repeating bic.glm
##             maxNvar: max ## variables to feed into bic.glm, default=30
## returns: an object of class bic.glm if the algorithm finishes
## 	   output from applying bic.glm on selected variables
##	   otherwise, returns -1

iterateBMAglm.wrapper <- function (sortedA, y, nbest=10, maxNvar=30, maxIter=20000, thresProbne0=1) {
## get the top "maxNvar" variables
ret.bma.init <- iterateBMAinit (x=sortedA, maxNvar)

## call bic.glm repeatedly
ret.bma <- iterateBMAglm (x=sortedA, y=y, curr.mat=ret.bma.init\$curr.mat, stopVar=ret.bma.init\$stopVar, nextVar=ret.bma.init\$nextVar, nbest, thresProbne0, maxNvar, maxIter)

if (ret.bma\$stopVar == 1) {
## apply bic.glm again using selected genes
ret.bic.glm <- bic.glm (x=ret.bma\$curr.mat, y=y, maxCol=(maxNvar+1), glm.family="binomial", nbest=nbest)
return (ret.bic.glm)
} else {
return (-1)
}
}

########################################################

iterateBMAglm.train <- function (train.expr.set, train.class, p=100, nbest=10, maxNvar=30, maxIter=20000, thresProbne0=1) {

train.dat <- t(exprs(train.expr.set))
## 1. order all the columns (variables) by BSS/WSS ratios
sorted.vec <- BssWssFast (train.dat, train.class, numClass = 2)
sorted.train.dat <- train.dat[, sorted.vec\$ix[1:p]]

## 2. run iterative bic.glm
## run iterative BMA using logistic regression
ret.bma <- iterateBMAglm.wrapper (sorted.train.dat, y=train.class, nbest=nbest, maxNvar=maxNvar, maxIter=maxIter, thresProbne0=thresProbne0)

# convert gene names in "namesx" and "label"
ret.bma\$namesx <- as.character(sapply (ret.bma\$namesx, convertSingleName, orig.expr.set=train.expr.set))
ret.bma\$label <- as.character(sapply (ret.bma\$label, convertModelName, orig.expr.set=train.expr.set))
ret.bma
}

iterateBMAglm.train.predict <- function (train.expr.set, test.expr.set, train.class, p=100, nbest=10, maxNvar=30, maxIter=20000, thresProbne0=1) {

train.dat <- t(exprs(train.expr.set))
test.dat <- t(exprs(test.expr.set))
## use the training data to select relevant genes
ret.bma <- iterateBMAglm.train (train.expr.set, train.class, p, nbest, maxNvar, maxIter, thresProbne0)

## predict the classes of the test samples using the selected genes
if (nrow(test.dat) > 1) {
curr.test.dat <- test.dat [, ret.bma\$namesx]
} else {    # for LOOCV
curr.test.dat <- matrix(test.dat [, ret.bma\$namesx], ncol=length(ret.bma\$namesx), dimnames=list(NULL, ret.bma\$namesx))
}
y.pred.test <- apply (curr.test.dat, 1, bma.predict, postprobArr=ret.bma\$postprob, mleArr=ret.bma\$mle)
}

iterateBMAglm.train.predict.test <- function (train.expr.set, test.expr.set, train.class, test.class, p=100, nbest=10, maxNvar=30, maxIter=20000, thresProbne0=1) {

train.dat <- t(exprs(train.expr.set))
test.dat <- t(exprs(test.expr.set))
## use the training data to select relevant genes
ret.bma <- iterateBMAglm.train (train.expr.set, train.class, p, nbest, maxNvar, maxIter, thresProbne0)

## predict the classes of the test samples using the selected genes
if (nrow(test.dat) > 1) {
curr.test.dat <- test.dat [, ret.bma\$namesx]
} else {    # for LOOCV
curr.test.dat <- matrix(test.dat [, ret.bma\$namesx], ncol=length(ret.bma\$namesx), dimnames=list(NULL, ret.bma\$namesx))
}
y.pred.test <- apply (curr.test.dat, 1, bma.predict, postprobArr=ret.bma\$postprob, mleArr=ret.bma\$mle)

## compute the number of errors and brier score
curr.table <- table (round(y.pred.test, 0), test.class)
curr.error <- sum (curr.table) - sum(diag(curr.table))
## print (curr.table)
## compute the brier score
## print (round(y.pred.test, 3))
curr.score <- brier.score (y.pred.test, test.class)
## print (curr.score)

selected.genes <- which (ret.bma\$probne0 > 0)
## print (paste("# selected genes = ", length(selected.genes), sep=""))
## print (paste("# selected models = ", length(ret.bma\$postprob), sep=""))
list (num.genes=length(selected.genes), num.model=length(ret.bma\$postprob), num.err=curr.error, brierScore=curr.score)

}
```

## Try the iterativeBMA package in your browser

Any scripts or data that you put into this service are public.

iterativeBMA documentation built on Nov. 8, 2020, 5:14 p.m.