"writeBinBlocks" <-
function(x, pattern, maxRows=10000, path = ".",
what=c("double", "integer", "character"), append=FALSE, verbose=TRUE)
{
# Write a vector or matrix (x) into a set of binary block files named
# <pattern>.1, <pattern>.2, ...; each block file holds at most maxRows rows.
# Unless appending, every file starts with a small integer header (see the
# writeBin calls below) followed by the data values.
what <- match.arg(what)
# unless appending, remove any existing block files matching the pattern:
if (!append)
file.remove(dir(path, pattern=paste("^", pattern, "\\.[0-9]+$", sep=""), full.names=TRUE))
n <- dim(x)[1]
if (is.null(n)) n <- length(x)
if (n < maxRows) {
rMat <- t(c(1, n))
} else {
# start/end row indices for each block of at most maxRows rows
sx <- seq(from=1, to=n, by=maxRows)
ex <- seq(from=maxRows, to=n, by=maxRows)
if (rev(ex)[1] != n) ex <- c(ex, n)
if (length(sx) != length(ex))
stop("internal error: block start indices (", paste(sx, collapse=" "),
") and end indices (", paste(ex, collapse=" "), ") do not match")
rMat <- cbind(sx, ex)
}
apply(rMat, 1,
function(lmt, x, pattern, path, maxRows, what, append, verbose)
{
# the block index (derived from the block's last row) determines the file name
i <- ceiling(lmt[2]/maxRows)
filename <- file.path(path, paste(pattern, i, sep="."))
f <- file(filename, "ab")
dim <- dim(x)
xr <- lmt[1]:lmt[2]
if (is.null(dim)) {
x <- x[xr]
# vector header: c(1, number of values in this block)
if (!append) writeBin(as.integer(c(1, length(x))), f)
} else {
x <- x[xr, , drop=FALSE]
# matrix header: c(length(dim), dim), where dim is that of the full matrix
if (!append) writeBin(as.integer(c(length(dim), dim)), f)
}
if (verbose) {
if (append)
cat("Append to file", filename, "with", length(xr), "rows\n")
else
cat("Create file", filename, "with", length(xr), "rows\n")
}
writeBin(switch(what,
double=as.double(x),
integer=as.integer(x),
character=as.character(x)), f)
close(f)
invisible(NULL)
}, x, pattern, path, maxRows, what, append, verbose)
invisible(NULL)
}
"readBinMatrix" <-
function(name, what="double", ncol=1000000, ...)
{
# read one binary block file written by writeBinBlocks back into a matrix
f <- file(name, "rb")
# header: number of dimensions, then the dimensions (the first is taken as nrow)
n <- readBin(f, what="integer", n=1)
nrow <- readBin(f, what="integer", n=n)[1]
# read at most nrow*ncol values; a block file may hold fewer rows than nrow
v <- readBin(f, what=what, n=nrow*ncol)
close(f)
matrix(v, ncol=ncol)
}
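# Illustrative round trip for the two functions above (a sketch only, not part
# of the original code; the demo matrix, file pattern and tempdir() location
# are made up). Wrapped in if (FALSE) so it never runs when this file is sourced.
if (FALSE) {
    m <- matrix(rnorm(25 * 4), ncol = 4)
    writeBinBlocks(m, pattern = "demo", maxRows = 10, path = tempdir())
    # creates demo.1, demo.2 and demo.3 holding 10, 10 and 5 rows respectively
    blk <- readBinMatrix(file.path(tempdir(), "demo.1"), what = "double", ncol = 4)
    dim(blk)   # 10 4
}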
"cypherGenXML2Bin" <-
function(name, pattern, path, mz, tof=FALSE, maxRows=10000, append=TRUE, verbose=TRUE, ...)
{
# read the XML file into memory
if (tof) {
# for tof: calculate mz on the fly
nodeList <-
list(a="spectrum/processingParameters/massCalibration/massCalibrationA/text()",
b="spectrum/processingParameters/massCalibration/massCalibrationB/text()",
t0="spectrum/processingParameters/massCalibration/massCalibrationT0/text()",
u="spectrum/acquisitionInfo/setting/ionSourceVoltage/text()",
Fs="spectrum/acquisitionInfo/setting/digitizerRate/text()",
N="spectrum/tofData/tofDataNumSamples/text()",
time0="spectrum/tofData/tofDataTimeZero/text()",
data="spectrum/tofData/tofDataSamples/text()")
v <- XML::parseXMLPathFile(name, nodeList, default=NA, delimiter=" \t\n,")
names(v) <- names(nodeList)
if (length(v$time0)==0) v$time0 <- 0
lv <- sapply(v, function(x) length(x)!=1)
if (any(lv))
stop ("\n\t", paste(names(nodeList)[lv], collapse=", "),
" cannot be found or is in the wrong format\n")
v <- lapply(v, function(x) as.numeric(unlist(x)[1]))
# time axis of the spectrum, shifted by the calibration offset t0
ts <- seq(from=v$time0-v$t0, to=(v$time0-v$t0)+v$N/v$Fs, by=1/v$Fs)
# time-of-flight mass calibration: m/z = U * a * t^2 * sign(t) + b
v$mz <- v$u*sign(ts)*v$a*ts^2+v$b
} else {
# for processedData: use mz directly
nodeList <-
list(mzdata="spectrum/processedData/processedDataSamples/text()")
vx <- XML::parseXMLPathFile(name, nodeList, default=NA, delimiter=" \t\n,")
if (length(vx[[1]])==0)
stop ("no processed data found in ", name, "; set tof=TRUE to compute m/z from the time-of-flight data.")
v <- vector(length=2, mode="list")
# processed data samples are stored as interleaved (mz, intensity) pairs
v[1:2] <- split(as.numeric(vx[[1]][[1]]), 1:2)
}
# interpolate intensities onto the common mz grid if this spectrum's mz axis differs
if (length(v$mz)!=length(mz) || any(v$mz!=mz)){
v <- approx(x=v$mz, y=v$data, xout=mz,
method="linear", rule=2, f=0.5)$y
} else {
v <- v$data
}
# write mz/data as binary blocks, appending to the table's existing block files
if (!append) {
# for the first spectrum of a table, also store mz as the first column:
v <- cbind(mz, v)
}
writeBinBlocks(v, pattern=pattern, path=path, maxRows=maxRows, what="double",
append=append, verbose=verbose)
}
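# Illustrative call of cypherGenXML2Bin (a sketch only; "spectrum1.xml" and the
# mz grid are hypothetical, and the parseXMLPathFile XML parser must be
# available). With append=FALSE the mz grid is written as the first column.
if (FALSE) {
    mzGrid <- seq(1500, 20000, by = 0.5)
    cypherGenXML2Bin("spectrum1.xml", pattern = "spec", path = tempdir(),
                     mz = mzGrid, tof = TRUE, maxRows = 10000, append = FALSE)
}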
"cypherGenXMList2BinBlocks" <-
function(x, pattern, tof=FALSE, maxRows=10000, maxCols=NULL,
path=".", category=NULL, verbose=1, ...)
{
if (is.R())
stop ("\nNot available in R yet: requires the S-PLUS specific library SPXML\n")
if (length(category)) {
if (length(category)!=length(x))
stop ("length of non-null category must be equal to that of x.")
# a single category carries no grouping information
if (length(unique(category))<=1)
category <- NULL
}
if (length(maxCols) && maxCols <= 0) {
stop ("maxCols must be a positive integer.")
}
nodeList <-
list(a="spectrum/processingParameters/massCalibration/massCalibrationA/text()",
b="spectrum/processingParameters/massCalibration/massCalibrationB/text()",
t0="spectrum/processingParameters/massCalibration/massCalibrationT0/text()",
u="spectrum/acquisitionInfo/setting/ionSourceVoltage/text()",
Fs="spectrum/acquisitionInfo/setting/digitizerRate/text()",
N="spectrum/tofData/tofDataNumSamples/text()",
time0="spectrum/tofData/tofDataTimeZero/text()")
if (verbose>0)
cat("scanning list of XML files ...\n")
mzR <- sapply(x, function(x, nodeList) {
v <- XML::parseXMLPathFile(x, nodeList, default=NA, delimiter=" \t\n,")
names(v) <- names(nodeList)
if (length(v$time0)==0) v$time0 <- 0
lv <- sapply(v, function(x) length(x)!=1)
if (any(lv))
stop ("\n\t", paste(names(nodeList)[lv], collapse=", "),
" cannot be found or is in the wrong format\n")
v <- lapply(v, function(x) as.numeric(unlist(x)[1]))
# m/z values at the two ends of this file's time range
xlim <- c(v$time0-v$t0, (v$time0-v$t0)+v$N/v$Fs)
v$u*sign(xlim)*v$a*xlim^2+v$b
}, nodeList)
# the common mz range is bounded by the largest lower end (file iMin) and the
# smallest upper end (file iMax) across all spectra
iMin <- which.max(mzR[1,])
iMax <- which.min(mzR[2,])
# map back to the time domain of file iMin to build the shared mz grid:
v <- XML::parseXMLPathFile(x[iMin], nodeList, default=NA, delimiter=" \t\n,")
names(v) <- names(nodeList)
v <- lapply(v, function(x) as.numeric(unlist(x)[1]))
ts <- seq(from=v$time0-v$t0, to=sqrt((mzR[2, iMax]/v$u-v$b)/v$a), by=1/v$Fs)
mz <- v$u*sign(ts)*v$a*ts^2+v$b
if(verbose>0)
cat("mz range to be used: ", range(mz), "(", length(mz), "peaks in total)\n")
# now read the files again in a batch; each file's data are written to
# temporary binary block files, and tables record which block belongs where
# derive sample IDs from the file names (assumes Windows-style backslash
# separators; the directory and the ".xml" suffix are stripped):
sampleID <- sapply(strsplit(x, "\\", fixed=TRUE),
function(x) unlist(strsplit(rev(x)[1], ".xml", fixed=TRUE)))
if (length(category))
{
tableID <- tapply(x, category, function(x, maxCols)
{
n <- length(x)
# split each category into balanced groups of at most maxCols columns:
ifelse1(length(maxCols)==0 || n<=maxCols,
rep("", length=n),
sort(rep(1:ceiling(n/maxCols), length=n)))
}, maxCols)
tableID <- sapply(names(tableID),
function(x, tableID)
paste(x, tableID[[x]], sep=""),
tableID)
categoryID <- sapply(tableID, unique)
categoryTable <- data.frame(tableID=unlist(categoryID),
categoryID=rep(names(categoryID), sapply(categoryID, length)),
row.names=NULL)
tableID <- unlist(tableID)
} else {
categoryTable <- NULL
n <- length(x)
tableID <- ifelse1(length(maxCols)==0 || n<=maxCols,
rep("", length=n),
sort(rep(1:ceiling(n/maxCols), length=n)))
}
tableID <- paste(pattern, tableID, sep="")
# each unique tableID will become one SQLite table; large tables are split
# across several binary block files of at most maxRows rows each
sampleTable <- data.frame(sampleID=sampleID, tableID=tableID)
# mz could be used as the primary key in the database, but that is redundant
# here (all tables share the same mz vector), so mz can later be returned as
# a single vector.
# the first file of each table creates its block files; later files append:
dupfile <- duplicated(tableID)
for (i in 1:length(x))
{
if (verbose>1)
cat (x[i], "=>", tableID[i], "\n")
cypherGenXML2Bin(x[i], pattern=tableID[i], path=path,
mz=mz, maxRows=maxRows, tof=tof, append=dupfile[i], verbose=verbose>1, ...)
}
list(sampleTable=sampleTable, categoryTable=categoryTable)
}
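# Illustrative call of cypherGenXMList2BinBlocks (a sketch only; it requires the
# S-PLUS SPXML parser, and the XML directory below is hypothetical). The return
# value maps each sample (file name) to the block table holding its intensities.
if (FALSE) {
    xmlFiles <- dir("C:/data/ciphergen", pattern = "\\.xml$", full.names = TRUE)
    p <- cypherGenXMList2BinBlocks(xmlFiles, pattern = "ms", tof = TRUE,
                                   maxRows = 10000, path = tempdir())
    p$sampleTable     # columns: sampleID, tableID
    p$categoryTable   # NULL unless a category vector was supplied
}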
"binblocks2SQLite" <-
function(conn, path, tablename, columnID,
pattern, what="double", append=FALSE, verbose=TRUE, ...)
{
flist <- dir(path,
pattern=paste("^", pattern, "\\.[0-9]+$", sep=""), full.names=TRUE)
if (length(flist)==0) {
cat ("cannot find any file with pattern", pattern, "in path", path)
return (FALSE)
}
# order the block files by their numeric suffix
oo <- order(sapply(strsplit(flist, "\\."), function(x) as.integer(rev(x)[1])))
flist <- flist[oo]
newConn <- !is(conn, "DBIConnection")
if (newConn) {
dbinit <- DBI::dbDriver("SQLite")
conn <- DBI::dbConnect(dbinit, dbname = conn)
}
ans <- TRUE
if (DBI::dbExistsTable(conn, tablename) && !append) {
ans <- DBI::dbRemoveTable(conn, tablename)
}
# read the block files one by one; the first creates/overwrites the table, the rest append
if (ans) {
ans <- all(sapply(flist,
function(f, conn, tablename, columnID, what, f1, verbose, ...) {
d <- readBinMatrix(f, ncol=length(columnID), what=what, ...)
if (verbose) cat (f, ": Dim=", dim(d), "\n")
colnames(d) <- columnID
DBI::dbWriteTable(conn, tablename, data.frame(d), overwrite=(f==f1),
append=(f!=f1), row.names=FALSE, ...)
}, conn, tablename, columnID, what=what, flist[1], verbose, ...))
}
if(newConn)
DBI::dbDisconnect(conn)
return (ans)
}
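# Illustrative use of binblocks2SQLite (a sketch only; it assumes the demo.*
# block files from the writeBinBlocks example above and that the RSQLite
# package is installed; the database path and column names are made up).
if (FALSE) {
    db <- file.path(tempdir(), "demo.sqlite")
    ok <- binblocks2SQLite(db, path = tempdir(), tablename = "demo",
                           columnID = paste("V", 1:4, sep = ""),
                           pattern = "demo", what = "double")
}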
"importBin2Sqlite" <-
function(conn, path, tablename, sampleTable,
what="double", categoryTable=NULL, prefix="",
verbose=1, ...)
{
if (length(tablename)>1) {
stop("more than one table names are provided, cannot proceed!")
}
if (any(is.na(match(c("sampleID", "tableID"), colnames(sampleTable)))))
stop ("Required columns sampleID and tableID not exist in sampleTable")
newConn <- is.character(conn)
if (newConn) {
dbinit <- dbDriver("SQLite")
conn <- dbConnect(dbinit, dbname = conn)
}
ans <- TRUE
sampleTableName <- paste(tablename, "sample", sep="_")
sampleTable$sampleID <- make.db.names.default(as.character(sampleTable$sampleID))
ans <- dbWriteTable(conn, sampleTableName, sampleTable, append=FALSE, overwrite=TRUE, row.names=FALSE)
if (ans && length(categoryTable)) {
categoryTableName <- paste(tablename, "category", sep="_")
if (!is.data.frame(categoryTable))
stop ("categoryTable must be a data.frame.")
ans <- dbWriteTable(conn, categoryTableName, categoryTable, overwrite=TRUE, append=FALSE, row.names=FALSE)
}
# group samples by the table they belong to
tbprefix <- as.character(sampleTable[["tableID"]])
lf <- split(cbind(sampleTable["sampleID"], pattern=tbprefix), tbprefix)
nmz <- -1
for (i in 1:length(lf))
{
if (ans) {
if (verbose>0)
cat("Importing", names(lf)[i], "\n")
ans <- binblocks2SQLite(conn, path=path,
tablename=names(lf)[i],
columnID=c("mz", as.character(lf[[i]][["sampleID"]])),
pattern=as.character(lf[[i]][["pattern"]][1]),
what=what, verbose=verbose>1)
if (ans) {
ntbrow <- dbGetQuery(conn,
paste("SELECT max(rowid) from", names(lf)[i]))[1,1]
if (verbose>1)
cat ("Read", ntbrow, "rows in", names(lf)[i], "\n")
if (i==1) nmz <- ntbrow
else if (nmz != ntbrow) {
if (verbose>0)
cat ("intensity table", names(lf)[i],
"has a different length (", ntbrow, ") from the mz table (", nmz, ")\n")
ans <- FALSE
dbRemoveTable(conn, names(lf)[i])
}
} else {
cat("error occurred in binblocks2SQLite\n")
dbRemoveTable(conn, names(lf)[i])
return (FALSE)
}
# create an index on mz to speed up range queries:
if (ans) {
if (verbose>0)
cat("Creating index on mz\n")
idx <- paste(names(lf)[i], "mzIndex", sep="_")
rc <- try(dbGetQuery(conn, paste(
"CREATE INDEX", idx,
"ON", names(lf)[i], "( mz )")))
if (is.R())
ans <- !inherits(rc, "try-error")
else
ans <- !inherits(rc, "Error")
}
}
}
if (newConn) {
dbDisconnect(conn)
}
ans
}
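# Illustrative use of importBin2Sqlite (a sketch only; the sample table, block
# files and database path are hypothetical). The block files "ms.1", "ms.2", ...
# in path are expected to hold one mz column plus one column per sample.
if (FALSE) {
    st <- data.frame(sampleID = c("s1", "s2", "s3"), tableID = rep("ms", 3))
    ok <- importBin2Sqlite(conn = file.path(tempdir(), "ms.sqlite"),
                           path = tempdir(), tablename = "ms",
                           sampleTable = st, what = "double", verbose = 1)
}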
"importXMLDir" <-
function (xmldir, dbname, tablename, tof = FALSE, maxRows = 10000, maxCols = NULL,
tmpdir = tempdir(), splitSubdir=TRUE, verbose = 0,
...)
{
f <- dir(xmldir, recursive = TRUE, full.names = TRUE)
# when splitting by subdirectory, use the immediate parent directory
# (Windows-style path separators) as the category of each file
if (splitSubdir)
category <- sapply(strsplit(f, "\\", fixed = TRUE), function(x) rev(x)[2])
else
category <- NULL
dir.create(tmpdir, showWarnings=FALSE)
# remove stale temporary block files from tmpdir:
tmpfiles <- dir(tmpdir,
pattern=paste("^", tablename, "[0-9]*\\.[0-9]+$", sep=""), full.names=TRUE)
if (length(tmpfiles)>0) file.remove(tmpfiles)
if (verbose>0)
cat ("Convert XML files to binary blocks...\n")
p <- cypherGenXMList2BinBlocks(f, pattern = tablename, tof = tof, maxRows = maxRows,
maxCols = maxCols, path = tmpdir, category = category, verbose=verbose-1)
if (verbose>0)
cat ("Import binary blocks into SQLite...\n")
rc <- try(importBin2Sqlite(conn = dbname, path = tmpdir, tablename = tablename,
sampleTable = p$sampleTable, categoryTable = p$categoryTable,
verbose = verbose-1, ...))
if (is.R())
ans <- !inherits(rc, "try-error")
else
ans <- !inherits(rc, "Error")
if (ans) {
tmpfiles <- dir(tmpdir,
pattern=paste("^", tablename, "[0-9]*\\.[0-9]+$", sep=""), full.names=TRUE)
if (length(tmpfiles)>0) {
if (verbose>0) {
cat("Removing", length(tmpfiles), "temporary files\n")
}
file.remove(tmpfiles)
}
}
else {
print(rc)
cat ("An error occurred.\n")
}
ans
}
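# Illustrative end-to-end call of importXMLDir (a sketch only; the XML directory
# is hypothetical and the conversion step requires the S-PLUS SPXML parser).
if (FALSE) {
    importXMLDir("C:/data/ciphergen", dbname = "ms.sqlite", tablename = "ms",
                 tof = TRUE, maxRows = 10000, verbose = 2)
}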