Nothing
fabatch <-
function(x, y, batch, nbf=NULL, minerr=1e-06, probcrossbatch=TRUE, maxiter=100, maxnbf=12) {
if(any(is.na(x)))
stop("Data contains missing values.")
if(!is.factor(batch))
stop("'batch' has to be of class 'factor'.")
if(!is.matrix(x))
stop("'x' has to be of class 'matrix'.")
if(!(is.factor(y) & length(levels(y))==2))
stop("'y' has to be of class 'factor' with two levels.")
if(max(table(batch)) >= ncol(x))
stop("The sample size within each batch has to be smaller than the number of variables.")
if((length(batch)!=length(y)) | (length(y)!=nrow(x)))
stop("At least one pair out of 'x', 'y' and 'batch' is incompatible.")
##require("glmnet")
batches = levels(batch)
nbatches = length(batches)
nvars <- ncol(x)
biggestbatch <- names(table(batch))[which.max(table(batch))]
ylevels = levels(y)
# If a variable is constant in a batch, infinite values
# or missing values will arise, when scaling by batch.
# To avoid this, the following is done:
# Variables, for which this phenomen occurs, are left
# out during scaling and factor estimation and only
# adjusted with respect to their mean.
# Calculate batch-specific standard deviations:
sdb = as.list(rep(0,nbatches))
for (i in 1:nbatches) {
sdb[[i]] = apply(x[batch==batches[i],],2,sd)
}
sdb0=sdb
badvariableslist <- lapply(sdb, function(x) which(x==0))
badvariables <- sort(unique(unlist(badvariableslist)))
goodvariables <- setdiff(1:ncol(x), badvariables)
if(length(badvariables) > 0) {
xsafe <- x
x <- x[,goodvariables]
sdb <- lapply(sdb, function(x) x[goodvariables])
sdb0=sdb
}
# overall variable means:
meanoverall <- colMeans(x)
# First center and scale the variables per batch to apply the Lasso
# for obtaining the probability estimates:
# Remove batch specific means:
if(nbatches > 1) {
adjustmentmod = lm(x~batch)
design = model.matrix(~batch)
adjustmentcoef = coef(adjustmentmod)
xb = x-design%*%adjustmentcoef
adjustmentcoef0 = adjustmentcoef
} else
xb <- scale(x, scale=FALSE)
# Pooled variance:
pooledsds <- apply(xb, 2, sd)
# 'scaledxb' is X centered and scaled per batch:
scaledxb = xb
for (i in 1:nbatches) {
scaledxb[batch==batches[i],] = scale(xb[batch==batches[i],],center=rep(0,ncol(x)),scale=sdb[[i]])
}
# Now lasso is trained on the batch-centered and batch-scaled X and the fitted probability
# of each y to belong to class 1 or 2 calculated - 'proba' (There is probably a problem with overfitting:
# When the probabilities are much more shifted towards 0 and 1 in the training data than in the
# test data, the training and test data might not be that comparable anymore -> Solution might be
# to use cross validation to estimate the probabilities here):
cvmod = glmnet::cv.glmnet(x=as.matrix(scaledxb),y=y,family="binomial",type.measure="deviance", alpha = 0)
lambda.min = cvmod$lambda.min
mod = glmnet::glmnet(x=as.matrix(scaledxb),y=y,family="binomial",lambda=lambda.min, alpha = 0)
if(probcrossbatch) {
if(nbatches > 1) {
proba <- rep(NA, nrow(x))
for(i in 1:nbatches) {
modtemp = glmnet::glmnet(x=as.matrix(scaledxb[batch!=batches[i],]),y=y[batch!=batches[i]],family="binomial",lambda=lambda.min, alpha = 0)
b0temp = modtemp$a0
btemp = modtemp$b
proba[batch==batches[i]] = as.vector(1/(1+exp(-b0temp-as.matrix(scaledxb[batch==batches[i],])%*%btemp))) # mean(ifelse(p1x>0.5,2,1)!=ytest)
}
} else {
warning("Number of Training batches equal to one. Using ordinary cross-validation for preliminary class probability estimation.")
cvmod = glmnet::cv.glmnet(x=as.matrix(scaledxb),y=y,family="binomial",type.measure="deviance", keep=TRUE, lambda=c(lambda.min, lambda.min+(1/1000)*lambda.min), alpha = 0)
proba = cvmod$fit.preval[,1]
}
}
else {
cvmod = glmnet::cv.glmnet(x=as.matrix(scaledxb),y=y,family="binomial",type.measure="deviance", keep=TRUE, lambda=c(lambda.min, lambda.min+(1/1000)*lambda.min), alpha = 0)
proba = cvmod$fit.preval[,1]
}
# 'b0' is intercept from Lasso and 'b' coefficient vector for variables:
b0 = mod$a0
b = mod$b
# Calculate the class-specific mean vectors 'm1' and 'm2':
m1 = apply(scaledxb[y==ylevels[1],],2,mean)
m2 = apply(scaledxb[y==ylevels[2],],2,mean)
nbfvec <- rep(NA, nbatches)
# Determine number of factors if not given:
if (!is.null(nbf)) {
nbfvec <- rep(nbf, times=nbatches)
nbfinput <- NULL
}
else
nbfinput <- nbf
# Calculate the factors on 'cdta' (batch-centered-scaled and class-removed):
falist <- vector("list", nbatches)
criterionall <- list()
scaledxbfa <- scaledxb
for (i in 1:nbatches) {
if(is.null(nbf)) {
##require("mnormt")
maxnbf2 <- min(c(maxnbf, floor(sum(batch==batches[i])/2)))
nbfobj = nbfactors(scale(sweep(scale(scaledxb[batch==batches[i],],center=m1,scale=FALSE), 1, 1-proba[batch==batches[i]], "*") +
sweep(scale(scaledxb[batch==batches[i],],center=m2,scale=FALSE), 1, proba[batch==batches[i]], "*")), maxnbfactors=maxnbf2, minerr=minerr, maxiter=maxiter)
nbfvec[i] <- nbfobj$optimalnbfactors
}
if(is.na(nbfvec[i])) {
warning("There occured an issue in the factor estimation. Number of factors set to zero.")
nbfvec[i] <- 0
}
if(nbfvec[i] > 0) {
# Calculate the factors on 'cdta' (batch-centered-scaled and class-removed):
fa = emfahighdim(sweep(scale(scaledxb[batch==batches[i],],center=m1,scale=FALSE), 1, 1-proba[batch==batches[i]], "*") +
sweep(scale(scaledxb[batch==batches[i],],center=m2,scale=FALSE), 1, proba[batch==batches[i]], "*"),nbf=nbfvec[i],minerr=minerr, maxiter=maxiter)
# Remove the factor influences:
scaledxbfa[batch==batches[i],] = scaledxb[batch==batches[i],] - fa$Factors%*%t(fa$B)
falist[[i]] <- fa
} else {
scaledxbfa[batch==batches[i],] <- scaledxb[batch==batches[i],]
fa <- NULL
}
}
means2batch <- sd2batch <- matrix(nrow=length(levels(batch)), ncol=ncol(scaledxbfa))
# scale again:
for (i in 1:nbatches) {
means2batch[i,] <- colMeans(scaledxbfa[batch==batches[i],])
sd2batch[i,] <- apply(scaledxbfa[batch==batches[i],], 2, sd)
scaledxbfa[batch==batches[i],] = scale(scaledxbfa[batch==batches[i],], center=means2batch[i,], scale=sd2batch[i,])
}
xfa <- sweep(sweep(scaledxbfa, 2, pooledsds, "*"), 2, meanoverall, "+")
if(length(badvariables)>0) {
xbadvar <- xsafe[,badvariables, drop=FALSE]
sdbbad = as.list(rep(0,nbatches))
for (i in 1:nbatches) {
sdbbad[[i]] = apply(xbadvar[batch==batches[i],],2,sd)
}
whichzerosdmat <- t(sapply(sdbbad, function(x) x==0))
sdbbad0to1 <- lapply(sdbbad, function(x) {
x[x==0] <- 1
x
})
pooledsdsbad <- mapply(function(x, y) {
if(!all(y)) {
return(sd(x[batch %in% levels(batch)[which(!y)]]))
}
else
return(1)
}, as.data.frame(xbadvar), as.data.frame(whichzerosdmat))
mubadvar <- colMeans(xbadvar)
xbadvaradj <- matrix(nrow=nrow(xsafe), ncol=length(badvariables), data=mubadvar, byrow=TRUE)
for(i in 1:nbatches)
xbadvaradj[batch==levels(batch)[i],] <- xbadvaradj[batch==levels(batch)[i],,drop=FALSE] + sweep(scale(xbadvar[batch==levels(batch)[i],,drop=FALSE], center=TRUE, scale=sdbbad0to1[[i]]), 2, pooledsdsbad, "*")
xfanew <- matrix(nrow=nrow(xsafe), ncol=ncol(xsafe))
xfanew[,goodvariables] <- xfa
xfanew[,badvariables] <- xbadvaradj
xfa <- xfanew
}
meanvectorwhole <- rep(NA, nvars)
meanvectorwhole[goodvariables] <- meanoverall
if(length(badvariables)>0)
meanvectorwhole[badvariables] <- mubadvar
sdvectorwhole <- rep(NA, nvars)
sdvectorwhole[goodvariables] <- pooledsds
if(length(badvariables)>0)
sdvectorwhole[badvariables] <- 1
params <- list(xadj=xfa, m1=m1,m2=m2,b0=b0,b=b, pooledsds=sdvectorwhole, meanoverall=meanvectorwhole, minerr=minerr,
nbfinput=nbfinput, badvariables=badvariables, nbatches=nbatches, batch=batch, nbfvec=nbfvec)
class(params) <- "fabatch"
return(params)
}
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.