MetabComBat <- function(dat, saminfo, type = "txt",
write = T, covariates = "all", par.prior = T,
filter = F, skip = 0, prior.plots = T) {
# debug: expression_xls='exp.txt';
# sample_info_file='sam.txt'; type='txt'; write=T;
# covariates='all'; par.prior=T; filter=F; skip=0;
# prior.plots=T cat('Reading Sample Information
# File\n') saminfo <-
# read.table(sample_info_file, header=T,
# sep='\t',comment.char='')
if (sum(colnames(saminfo) == "Batch") != 1) {
stop("ERROR: Sample Information File does not have a Batch column!")
return("ERROR: Sample Information File does not have a Batch column!")
}
if (skip > 0) {
geneinfo <- as.matrix(dat[, 1:skip])
dat <- dat[, -c(1:skip)]
} else {
geneinfo = dat[, c(1:4)]
dat <- dat[, -c(1:4)]
}
# print(geneinfo[1:4])
print(dat[1:2, 1:3])
if (filter) {
nfeatures <- nrow(dat)
col <- ncol(dat)/2
present <- apply(dat, 1, filter.absent, filter)
dat <- dat[present, -(2 * (1:col))]
if (skip > 0) {
geneinfo <- geneinfo[present, ]
}
cat("Filtered features absent in more than",
filter, "of samples. features remaining:",
nrow(dat), "; features filtered:", nfeatures -
nrow(dat), "\n")
}
if (any(apply(dat, 2, mode) != "numeric")) {
stop("ERROR: Array expression columns contain non-numeric values! (Check your .xls file for non-numeric values and if this is not the problem, make a .csv file and use the type=csv option")
return("ERROR: Array expression columns contain non-numeric values! (Check your .xls file for non-numeric values and if this is not the problem, make a .csv file and use the type=csv option)")
}
# cnames1<-colnames(dat)
tmp <- match(colnames(dat), saminfo[, 1])
# tmp<-which(cnames1%in%saminfo[,1])
# tmplen<-length(tmp)
# if(tmplen!=length(cnames1)){return('ERROR:
# Sample Information File and Data Array Names are
# not the same!')}
if (any(is.na(tmp))) {
stop("ERROR: Sample Information File and Data Array Names are not the same!")
return("ERROR: Sample Information File and Data Array Names are not the same!")
}
# tmp1 <- match(saminfo[,1],colnames(dat)) saminfo
# <- saminfo[tmp1[!is.na(tmp1)],]
saminfo <- saminfo[tmp, ] ## Bug fixed 01/04/2011\t\t
if (any(covariates != "all")) {
saminfo <- saminfo[, c(1:2, covariates)]
}
design <- design.mat(saminfo)
batches <- list.batch(saminfo)
n.batch <- length(batches)
n.batches <- sapply(batches, length)
n.array <- sum(n.batches)
## Check for missing values
NAs = any(is.na(dat))
if (NAs) {
cat(c("Found", sum(is.na(dat)), "Missing Data Values\n"),
sep = " ")
}
# print(dat[1:2,]) Standardize Data across
# features
cat("Standardizing Data across features\n")
if (!NAs) {
B.hat <- solve(t(design) %*% design) %*% t(design) %*%
t(as.matrix(dat))
} else {
B.hat = apply(dat, 1, Beta.NA, design)
# print(length(B.hat))
# print(dim(B.hat[[1]]))
# save(B.hat,file='bhat.Rda')
B.hat_orig <- B.hat
numfeats <- length(B.hat)/length(n.batches)
# print(numfeats) print(n.batches)
dim(B.hat) <- dim(matrix(0, nrow = length(n.batches),
ncol = numfeats))
} #Standarization Model
# rows: number of batches; columns: number of
# metabs
print("standardization done")
print(dim(B.hat))
grand.mean <- t(n.batches/n.array) %*% B.hat[1:n.batch,
]
if (!NAs) {
var.pooled <- ((dat - t(design %*% B.hat))^2) %*%
rep(1/n.array, n.array)
} else {
var.pooled <- apply(dat - t(design %*% B.hat),
1, var, na.rm = T)
}
stand.mean <- t(grand.mean) %*% t(rep(1, n.array))
if (!is.null(design)) {
tmp <- design
tmp[, c(1:n.batch)] <- 0
stand.mean <- stand.mean + t(tmp %*% B.hat)
}
s.data <- (dat - stand.mean)/(sqrt(var.pooled) %*%
t(rep(1, n.array)))
## Get regression batch effect parameters
cat("Fitting L/S model and finding priors\n")
batch.design <- design[, 1:n.batch]
if (!NAs) {
gamma.hat <- solve(t(batch.design) %*% batch.design) %*%
t(batch.design) %*% t(as.matrix(s.data))
} else {
gamma.hat = apply(s.data, 1, Beta.NA, batch.design)
}
delta.hat <- NULL
for (i in batches) {
delta.hat <- rbind(delta.hat, apply(s.data[,
i], 1, var, na.rm = T))
}
delta.hat <- replace(delta.hat, which(is.na(delta.hat) ==
TRUE), 1)
## Find Priors
gamma.bar <- apply(gamma.hat, 1, mean)
t2 <- apply(gamma.hat, 1, var)
a.prior <- apply(delta.hat, 1, aprior)
b.prior <- apply(delta.hat, 1, bprior)
## Plot empirical and parametric priors
if (prior.plots & par.prior) {
par(mfrow = c(2, 2))
tmp <- density(gamma.hat[1, ])
plot(tmp, type = "l", main = "Density Plot")
xx <- seq(min(tmp$x), max(tmp$x), length = 100)
lines(xx, dnorm(xx, gamma.bar[1], sqrt(t2[1])),
col = 2)
qqnorm(gamma.hat[1, ])
qqline(gamma.hat[1, ], col = 2)
tmp <- density(delta.hat[1, ])
invgam <- 1/rgamma(ncol(delta.hat), a.prior[1],
b.prior[1])
tmp1 <- density(invgam)
plot(tmp, typ = "l", main = "Density Plot",
ylim = c(0, max(tmp$y, tmp1$y)))
lines(tmp1, col = 2)
qqplot(delta.hat[1, ], invgam, xlab = "Sample Quantiles",
ylab = "Theoretical Quantiles")
lines(c(0, max(invgam)), c(0, max(invgam)),
col = 2)
title("Q-Q Plot")
}
## Find EB batch adjustments
gamma.star <- delta.star <- NULL
if (par.prior) {
cat("Finding parametric adjustments\n")
for (i in 1:n.batch) {
# delta.hat[i,]<-replace(delta.hat[i,],which(is.na(delta.hat[i,])==TRUE),1)
temp <- it.sol(s.data[, batches[[i]]],
gamma.hat[i, ], delta.hat[i, ], gamma.bar[i],
t2[i], a.prior[i], b.prior[i])
gamma.star <- rbind(gamma.star, temp[1,
])
delta.star <- rbind(delta.star, temp[2,
])
}
} else {
cat("Finding nonparametric adjustments\n")
for (i in 1:n.batch) {
bad_cols <- which(is.na(delta.hat[i, ]) ==
TRUE)
# delta.hat[i,]<-replace(delta.hat[i,],which(is.na(delta.hat[i,])==TRUE),1)
temp <- int.eprior(as.matrix(s.data[,
batches[[i]]]), gamma.hat[i, ], delta.hat[i,
])
gamma.star <- rbind(gamma.star, temp[1,
])
delta.star <- rbind(delta.star, temp[2,
])
}
}
### Normalize the Data ###
cat("Adjusting the Data\n")
bayesdata <- s.data
j <- 1
for (i in batches) {
bayesdata[, i] <- (bayesdata[, i] - t(batch.design[i,
] %*% gamma.star))/(sqrt(delta.star[j,
]) %*% t(rep(1, n.batches[j])))
j <- j + 1
}
bayesdata <- (bayesdata * (sqrt(var.pooled) %*%
t(rep(1, n.array)))) + stand.mean
if (write) {
output_file <- paste("Adjusted_data_parprior",
par.prior, ".xls", sep = "_")
# print(geneinfo[1:2]) print(bayesdata[1:2,1:4])
# cat(c(colnames(geneinfo),colnames(dat),'\n'),file=output_file,sep='\t')
# suppressWarnings(write.table(cbind(geneinfo,formatC(as.matrix(bayesdata),
# format = 'f')), file=output_file, sep='\t',
# quote=F,row.names=F,col.names=F,append=T))
outdata <- cbind(ProbeID = geneinfo, bayesdata)
write.table(outdata, file = output_file, sep = "\t")
cat("Adjusted data saved in file:", output_file,
"\n")
} else {
return(cbind(geneinfo, bayesdata))
}
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.