R/rtmsBrukerMCFReader.R

Defines functions mcf_readNamedTableRowBlank mcf_readNumericPrimitiveBlank mcf_readKeyValueTable mcf_readKeyValueRow mcf_readNamedKeyValueRow mcf_readTable mcf_readTableRow mcf_readNamedTableRow mcf_readPrimitiveArray mcf_readPrimitive mcf_readNamedName mcf_checkBlobCodeWord mcf_blobSeek sqlt_parseRecordValue sqlt_parseRecord sqlt_parseBTreeTable sqlt_parseBTreeLeafPage sqlt_parseBTreeInteriorPage sqlt_getPage bin_checkBytes bin_readVarInt wholenumber shrinkBounds retrieveMCFMetadata readMCFCalibration readBrukerMCFIndexFile new_rtmsBrukerMCFReader getBrukerMCFAllMetadata getBrukerMCFMetadata getBrukerMCFIntensities getBrukerMCFSampleSet getBrukerMCFSample getBrukerMCFSpectrum getBrukerMCFIndices getBrukerMCFSpots reopenBrukerMCFReader closeBrukerMCFReader getSampleSet.rtmsBrukerMCFReader getSample.rtmsBrukerMCFReader getSpectrum.rtmsBrukerMCFReader close.rtmsBrukerMCFReader reopen.rtmsBrukerMCFReader reopen.default reopen newBrukerMCFReader

Documented in closeBrukerMCFReader close.rtmsBrukerMCFReader getBrukerMCFAllMetadata getBrukerMCFIndices getBrukerMCFIntensities getBrukerMCFMetadata getBrukerMCFSample getBrukerMCFSampleSet getBrukerMCFSpectrum getBrukerMCFSpots getSample.rtmsBrukerMCFReader getSampleSet.rtmsBrukerMCFReader getSpectrum.rtmsBrukerMCFReader newBrukerMCFReader reopen reopenBrukerMCFReader reopen.rtmsBrukerMCFReader

# rtmsBrukerMCFReader.R

#' Open a Bruker multi-acquisition MCF directory
#'
#' @description
#' Creates an RTMS reader object (of class `rtmsBrukerMCFReader`) which can
#' extract data from a Bruker multi-acquisition directory (extension ".d")
#'
#' @details
#' Currently, RTMS can create reader objects for two binary Bruker data formats,
#' BAF (presumably standing for "Bruker acquisition format") holding data from
#' a single spectrum acquisition, and MCF (probably "multi-acquisition container
#' format") containing data from multiple spectra acquired in a single run.
#' Both formats hold data in a directory marked with the extension ".d". The
#' multi-acquisition MCF format directory contains four essential data files:
#' the main raw data file ending in "_1" with extension ".mcf", a matching
#' index file with extension ".mcf_idx", and a calibration data file ending in
#' "_2" with  extension ".mcf", and the matching calibration index file with
#' extension ".mcf_idx". This function preprocesses the main index file and
#' calibration files so that raw data can be extracted quickly on demand from
#' the main ".mcf" file.
#'
#' An important note: when a MCF multi-acquisition reader is created, it creates
#' an open connection to the raw data file, which allows for quicker processing
#' of many spectra in a single file.  This connection will remain open until
#' the reader is closed with the `close` function.
#'
#' @param mcfdir A directory (usually with the extension ".d") containing data
#' from a Bruker multi-acquisition run. This directory will contain a files with
#' extenssion ".mcf" and matching index files with extension ".mcf_idx" (see
#' Details).
#'
#' @returns A reader object of class `rtmsBrukerMCFReader` with an open
#' connection to the main ".mcf" data file.
#'
#' @export
newBrukerMCFReader <- function(mcfdir) {
	# Locate the relevant data files in the specified Bruker directory:
	#
	# - A main data file, with filename ending in "_1" and extension .mcf
	# - The corresponding SQLite index file, with the same file name but
	#   extension .mcf_idx
	# - A calibration datafile, with an identical file name to the main
	#   data file but ending in "_2", with extension .mcf
	# - The SQLite calibration data index file, with extension .mcf_idx
	mainFile <- dir(mcfdir,"_1.mcf$")
	if (length(mainFile)==0) { stop("Main data file not found in directory.") }
	else { mainFile <- mainFile[1] }
	mainIndex <- paste0(mainFile,"_idx")
	if (!file.exists(paste0(mcfdir,"/",mainIndex))) {
		stop("Main index file not found in directory.")
	}
	calibFile <- sub("_1\\.mcf","_2.mcf",mainFile)
	if (!file.exists(paste0(mcfdir,"/",calibFile))) {
		stop("Calibration data file not found in directory.")
	}
	calibIndex <- paste0(calibFile,"_idx")
	if (!file.exists(paste0(mcfdir,"/",calibIndex))) {
		stop("Calibration index file not found in directory.")
	}
	files <- list(main=mainFile,index=mainIndex,calibration=calibFile,calIndex=calibIndex)

	# Initialize the processing of the multispectrum file by reading in the
	# location of binary data blocks in the main data file and calibration
	# datafile from their corresponding SQLite index files.
	offsetTable <- readBrukerMCFIndexFile(paste0(mcfdir,"/",mainIndex),calibration=FALSE)
	calOffsets <- readBrukerMCFIndexFile(paste0(mcfdir,"/",calibIndex),calibration=TRUE)
	calOffsets$index <- 1:nrow(calOffsets)

	# Locate the relevant calibration data blocks in the calibration data
	# file for each spot measured, and store the calibration data
	spotCalOffsets <- merge(calOffsets,offsetTable[offsetTable$BlobResType==258,"id",drop=FALSE],by.x="toId",by.y="id",sort=FALSE)
	spotCalOffsets <- spotCalOffsets[order(spotCalOffsets$index),]
	spotCalData <- readMCFCalibration(paste0(mcfdir,"/",calibFile),spotCalOffsets)
	spotCalData$Index <- 1:nrow(spotCalData)

	# Use spot calibration data as a spot table
	spotTable <- spotCalData[order(spotCalData$Index),]
	names(spotTable)[names(spotTable)=="toId"] <- "id"

	# Open the main data file
	mcfcon <- file(paste0(mcfdir,"/",mainFile),"rb")

	# Retrieve a full representation of the method metadata, including
	# parameter keys, and lookup table values
	paramBlobOffset <- offsetTable$Offset[3]
	metadata <- retrieveMCFMetadata(mcfcon,paramBlobOffset,0)

	# Returned value is a object with the following fields:
	# dir : ".d" Bruker data directory
	# files : Relevant data files
	# spotTable : Table of spots with well ids, calibration parameters
	# offsetTable : Table of binary offsets in main data file
	# metadata : A two-element list containing metadata declarations and values
	# con : Open connection to main Bruker mcf data file
	return(new_rtmsBrukerMCFReader(dir=mcfdir,
								   files=files,
								   spotTable=spotTable,
								   offsetTable=offsetTable,
								   metadata=metadata,
								   con=mcfcon))
}

#' Reopen a reader object
#'
#' An S3 generic function for reopening reader objects that have been close
#'
#' @param x The object to be re-opened
#'
#' @param ... Other possible arguments for specific object types
#'
#' @returns An object of the same type as `x`
#'
#' @export
reopen <- function(x,...) {
	UseMethod("reopen")
}
#' @export
reopen.default <- function(x,...) {
	invisible(x)
}
#' @param x The reader object to reopen
#' @param ... Included for S3 compatibility
#'
#' @describeIn closeBrukerMCFReader The S3 method `close` for objects of class
#' `rtmsBrukerMCFReader`; calls `closeBrukerMCFReader`
#' @export
reopen.rtmsBrukerMCFReader <- function(x,...) {
	reopenBrukerMCFReader(x)
}
#' @param con The reader object to be closed
#'
#' @describeIn closeBrukerMCFReader The S3 method `reopen` for objects of class
#' `rtmsBrukerMCFReader`; calls `reopenBrukerMCFReader`
#' @export
close.rtmsBrukerMCFReader <- function(con,...) {
	closeBrukerMCFReader(con)
}

#' @param x The MCF reader object
#' @param ... Additional parameters
#'
#' @describeIn getBrukerMCFSpectrum The S3 method `getSpectrum` for objects
#' of class `rtmsBrukerMCFReader`; calls `getBrukerMCFSpectrum`
#' @export
getSpectrum.rtmsBrukerMCFReader <- function(x,...) {
	getBrukerMCFSpectrum(x,...)
}
#' @param x The MCF reader object
#' @param ... Additional parameters
#'
#' @describeIn getBrukerMCFSample The S3 method `getSample` for objects
#' of class `rtmsBrukerMCFReader`; calls `getBrukerMCFSample`
#' @export
getSample.rtmsBrukerMCFReader <- function(x,peaks,...) {
	getBrukerMCFSample(x,peaks,...)
}
#' @param x The MCF reader object
#' @param ... Additional parameters
#'
#' @describeIn getBrukerMCFSampleSet The S3 method `getSample` for objects
#' of class `rtmsBrukerMCFReader`; calls `getBrukerMCFSampleSet`
#' @export
getSampleSet.rtmsBrukerMCFReader <- function(x,peaks,...) {
	getBrukerMCFSampleSet(x,peaks,...)
}

#' Manage an MCF reader file connection
#'
#' Closes the open connection to the main data file in a Bruker MCF reader
#' object.
#'
#' @details
#' Because Bruker MCF directories contain a potentially large number of
#' spectra, reopening a connection to the main data file when reading many
#' spectra or samples from it is inefficient and slow, especially if the file
#' is being accessed over a network connection.  The `rtmsBrukerMCFReader`
#' object therefore maintains an open connection to the main binary data file
#' until it is closed by the user. Of course, the reader object still maintains
#' all the index and calibration data, making it possible to reopen a
#' connection to the MCF directory without all the preprocessing required when
#' first opening.  Unfortunately, taking advantage of this fact is a little
#' tricky.
#'
#' In most cases, R is a functional language, with limited side effects on R
#' objects; so it is difficult to alter the state of reader object without
#' returning it explicitly.  However, one of the few cases where side-effects
#' are quite important is R's management of open file connections, with
#' functions like `close`.  Thus, calling `closeBrukerMCFReader` (and the S3
#' function `close` which wraps it) will in fact close the connection, but will
#' not return an altered copy of the reader object reflecting that it is
#' closed.  So if the user wishes to close a reader object with the possibility
#' of reopening it, they must close the reader AND assign the returned reader
#' object to the relevant name.  This will store the fact the connection has
#' been closed, and allow the `reopen` function to operate correctly.
#'
#' @param reader An RTMS reader object of class `rtmsBrukerMCFReader`
#'
#' @returns The same reader object with a closed connection
#'
#' @export
closeBrukerMCFReader <- function(reader) {
	if (!inherits(reader,"rtmsBrukerMCFReader")) { stop("Parameter 'reader' must be of class 'rtmsBrukerMCFReader'.") }
	if (!is.null(reader$con)) {
		close(reader$con)
		reader$con <- NULL
	}
	invisible(reader)
}
#' @describeIn closeBrukerMCFReader Reopens a file connection to the main
#' binary data file in a Bruker MCF directgry so that data can be extracted
#' @export
reopenBrukerMCFReader <- function(reader) {
	if (!inherits(reader,"rtmsBrukerMCFReader")) { stop("Parameter 'reader' must be of class 'rtmsBrukerMCFReader'.") }
	if (is.null(reader$con) || !inherits(reader$con,"file")) {
		reader$con <- file(paste0(reader$dir,"/",reader$files$main),"rb")
	}
	invisible(reader)
}

#' Get spot names and indices from a Bruker MCF file
#'
#' Assembles a table of all acquisitions in a Bruker MCF file; Bruker
#' measurements are often identified by the metadata parameter "Spot Number",
#' so this function extracts that specific metadata value and joins it with the
#' indices used to pick out spectra in other functions.  Also retrieves the
#' timestamp at which acquisition was taken, if acquisitions must be identified
#' by order.
#'
#' @param reader An openRTMS reader object of class `rtmsBrukerMCFReader`
#'
#' @returns A data.frame with an `Index` column containing the indices of each
#' acquisition (used by other functions such as `getSpectrum` or `getSample`),
#' a `SpotNumber` column containing the "Spot Number" metadata value for each
#' acquisition, and a `Timestampe` column containing the time at which each
#' acquisition was collected by the instrument.
#'
#' @export
getBrukerMCFSpots <- function(reader) {
	if (!inherits(reader,"rtmsBrukerMCFReader")) { stop("Parameter 'reader' must be of class 'rtmsBrukerMCFReader'.") }
	mcfdir <- reader$dir
	mainIndex <- reader$files$index

	# Open index file for decoding
	scon <- file(paste0(mcfdir,"/",mainIndex),"rb")
	on.exit(close(scon),add=TRUE)

	# Locate all B-tree table leaf pages for the main table (which lists the
	# titles, column names, and root pages of all tables and index in the
	# SQLite file.
	mainHandler <- function(values,cellIndex) {
		return(list(name=values[[2]],value=values[[4]]))
	}
	mainTable <- sqlt_parseBTreeTable(scon,1,mainHandler,TRUE)$output

	metaRoot <- as.numeric(mainTable$value[which(mainTable$name=="MetaDataString")])
	metastringHandler <- function(values,cellIndex) {
		valueList <- values
		names(valueList) <- c("GuidA","GuidB","MetadataId","Text")
		# Guid values are 8-byte integers in the file, and hence return two integers
		# each, which must be pasted together to be a single column in a data table
		valueList$id <- paste(c(valueList$GuidA,valueList$GuidB),collapse=" ")
		valueList$GuidA <- NULL
		valueList$GuidB <- NULL
		return(as.data.frame(valueList))
	}
	metasout <- sqlt_parseBTreeTable(scon,metaRoot,metastringHandler,first=FALSE)$output

	wmeta1 <- metasout[which(metasout$MetadataId==64),c("id","Text"),drop=FALSE]
	names(wmeta1)[[2]] <- "SpotNumber"
	wmeta2 <- metasout[which(metasout$MetadataId==34),c("id","Text"),drop=FALSE]
	names(wmeta2)[[2]] <- "Timestamp"
	wmeta <- merge(wmeta1,wmeta2,by="id",all=TRUE)

	spots <- merge(reader$spotTable[c("id","Index")],wmeta,by="id",all.x=TRUE)
	spots <- spots[order(spots$Index),-1,drop=FALSE]
	spots
}

#' @describeIn getBrukerMCFSpots Retrieves a vector of all the indices
#' (beginning with zero) of the acquisitions in the MCF file.  Faster than
#' `getBrukerMCFSpots` but contains no metadata or spot names
#' @export
getBrukerMCFIndices <- function(reader) {
	return(reader$spotTable$Index)
}

#' Extract a spectrum from a Bruker MCF directory
#'
#' Extracts an RTMS spectrum object (of class `rtmsSpectrum`) from a multi-
#' acquisition Bruker MCF directory opened using an RTMS reader object (of
#' class `rtmsBrukerMCFReader`).  A numeric index is used to identify which
#' spectrum should be extracted.
#'
#' @param reader An RTMS reader object of class `rtmsBrukerMCFReader`
#'
#' @param index A single numeric index specifying which acquisition should be
#' extracted
#'
#' @returns An RTMS spectrum object of class `rtmsSpectrum`
#'
#' @export
getBrukerMCFSpectrum <- function(reader,index) {
	if (!inherits(reader,"rtmsBrukerMCFReader")) { stop("Parameter 'reader' must be of class 'rtmsBrukerMCFReader'.") }
	if (index <=0 || index>nrow(reader$spotTable)) {
		stop("Parameter 'index' is out of bounds.")
	}

	fcon <- reader$con
	spotId <- reader$spotTable$id[index]
	fhigh <- reader$spotTable$frequencyHigh[index]
	fwidth <- reader$spotTable$frequencyWidth[index]
	fsize <- reader$spotTable$size[index]
	alpha <- reader$spotTable$alpha[index]
	beta <- reader$spotTable$beta[index]

	massToIndex <- function(p) { return(fsize*(fwidth-(fhigh/(p-alpha/fhigh) + beta))/fwidth) }
	indexToMass <- function(i) { return(fhigh/((fwidth*(fsize-i)/fsize)-beta) + alpha/fhigh) }

	# Move to raw data blob.  From here we will extract all measured spectra
	# values within a given width of the desired peaks, and take their average
	rawdataBlob <- which(reader$offsetTable$id==spotId & reader$offsetTable$BlobResType==258)
	mcf_blobSeek(fcon,reader$offsetTable$Offset[rawdataBlob],reader$offsetTable$OffsetPage[rawdataBlob])

	mcf_checkBlobCodeWord(fcon)
	bin_checkBytes(fcon,charToRaw("\x01\x01"))
	seek(fcon,16,origin="current")
	numNamedEls <- bin_readVarInt(fcon)

	name <- mcf_readNamedName(fcon)
	if (name!="Intensities") { stop("First named element of raw data blob should be 'Intensities'") }
	# Intensities should be an array of 4-byte floats
	bin_checkBytes(fcon,as.raw(3),"Raw spectra must be an array.")
	typeByte <- readBin(fcon,"raw",1)
	if (typeByte==as.raw(32)) { # x20 indicates floats are uncompressed
		compressedSpectrum <- FALSE
		numValues <- bin_readVarInt(fcon)
		spectrum <- readBin(fcon,"double",numValues,size=4,endian="little")
	} else if (typeByte==as.raw(34)) { #x22 indicates floats have been compressed using gzip
		compressedSpectrum <- TRUE
		numBytes <- bin_readVarInt(fcon)
		gzippedBytes <- readBin(fcon,"raw",numBytes)
		unzippedBytes <- memDecompress(gzippedBytes,type="gzip")
		numValues <- length(unzippedBytes)/4
		spectrum <- readBin(unzippedBytes,"double",numValues,size=4,endian="little")
	} else { stop("Raw spectra must be an array of 32-bit floats.") }

	rtmsSpectrum(indexToMass(0:(numValues-1)),spectrum)
}

#' Extract a sample from a Bruker MCF directory
#'
#' Extracts an RTMS sample object (of class `rtmsSample`) from a multi-
#' acquisition Bruker MCF directory opened using an RTMS reader object (of
#' class `rtmsBrukerMCFReader`). A numeric index is used to identify which
#' acquisition the sample should be extracted from.
#'
#' @param reader An RTMS reader object of class `rtmsBrukerMCFReader`
#'
#' @param peaks A list of peak objects of class `rtmsPeak`
#' @param index A single numeric index specifying which acquisition the sample
#' set should be extracted from
#'
#' @returns An RTMS sample object of class `rtmsSample`
#'
#' @export
getBrukerMCFSample <- function(reader,peaks,index) {
	if (!inherits(reader,"rtmsBrukerMCFReader")) { stop("Parameter 'reader' must be of class 'rtmsBrukerMCFReader'.") }
	if (!all(vapply(peaks,function(p) inherits(p,"rtmsPeak"),FALSE))) {
		stop("Parameter 'peaks' must be a list of objects of class 'rtmsPeak'.")
	}
	if (index <=0 || index>nrow(reader$spotTable)) {
		stop("Parameter 'index' is out of bounds.")
	}

	fcon <- reader$con
	spotId <- reader$spotTable$id[index]
	fhigh <- reader$spotTable$frequencyHigh[index]
	fwidth <- reader$spotTable$frequencyWidth[index]
	fsize <- reader$spotTable$size[index]
	alpha <- reader$spotTable$alpha[index]
	beta <- reader$spotTable$beta[index]

	massToIndex <- function(p) { return(fsize*(fwidth-(fhigh/(p-alpha/fhigh) + beta))/fwidth) }
	indexToMass <- function(i) { return(fhigh/((fwidth*(fsize-i)/fsize)-beta) + alpha/fhigh) }

	# Move to extracted peak blob.  The sum of peaks within the
	# tranformed index window is the current assay measure
	peakdataBlob <- which(reader$offsetTable$id==spotId & reader$offsetTable$BlobResType==257)
	mcf_blobSeek(fcon,reader$offsetTable$Offset[peakdataBlob],reader$offsetTable$OffsetPage[peakdataBlob])

	mcf_checkBlobCodeWord(fcon)
	bin_checkBytes(fcon,charToRaw("\x01\x01"))
	seek(fcon,16,origin="current")
	numNamedEls <- bin_readVarInt(fcon)
	seek(fcon,25,origin="current") # Skip over "PeakFinderType" element

	name <- mcf_readNamedName(fcon)
	if (name!="Indexes") { stop("Second named element of peak data blob should be 'Indexes") }
	bin_checkBytes(fcon,charToRaw("\x03\x1F")) # Indexes should be an array of 8-byte floats
	numIndexes <- bin_readVarInt(fcon)
	peakIndexes <- readBin(fcon,"double",numIndexes,size=8,endian="little")
	peakMasses <- indexToMass(peakIndexes)

	name <- mcf_readNamedName(fcon)
	if (name!="Intensities") { stop("Third named element of peak data blob should be 'Intensities") }
	bin_checkBytes(fcon,charToRaw("\x03\x20")) # Intensities should be an array of 4-byte floats
	numIntensities <- bin_readVarInt(fcon)
	if (numIndexes!=numIntensities) { stop("Number of peak indicies should match number of peak intensities") }
	peakIntensities <- readBin(fcon,"double",numIndexes,size=4,endian="little")

	# Move to raw data blob.  From here we will extract all measured spectra
	# values within a given width of the desired peaks, and take their average
	rawdataBlob <- which(reader$offsetTable$id==spotId & reader$offsetTable$BlobResType==258)
	mcf_blobSeek(fcon,reader$offsetTable$Offset[rawdataBlob],reader$offsetTable$OffsetPage[rawdataBlob])

	mcf_checkBlobCodeWord(fcon)
	bin_checkBytes(fcon,charToRaw("\x01\x01"))
	seek(fcon,16,origin="current")
	numNamedEls <- bin_readVarInt(fcon)

	name <- mcf_readNamedName(fcon)
	if (name!="Intensities") { stop("First named element of raw data blob should be 'Intensities'") }
	# Intensities should be an array of 4-byte floats
	bin_checkBytes(fcon,as.raw(3),"Raw spectra must be an array.")
	typeByte <- readBin(fcon,"raw",1)
	if (typeByte==as.raw(32)) { # x20 indicates floats are uncompressed
		compressedSpectrum <- FALSE
		numValues <- bin_readVarInt(fcon)
	} else if (typeByte==as.raw(34)) { #x22 indicates floats have been compressed using gzip
		compressedSpectrum <- TRUE
		numBytes <- bin_readVarInt(fcon)
		gzippedBytes <- readBin(fcon,"raw",numBytes)
		unzippedBytes <- memDecompress(gzippedBytes,type="gzip")
		numValues <- length(unzippedBytes)/4
		spectrum <- readBin(unzippedBytes,"double",numValues,size=4,endian="little")
	} else { stop("Raw spectra must be an array of 32-bit floats.") }

	# For each desired peak, extract the relevant peak piece, maxima and, (if
	# applicable) window piece of the overall spectrum. This can be read directly
	# from the file if the data is uncompressed; otherwise it can be taken from
	# the extracted and decompressed spectrum.
	subsamples <- list()
	for (i in seq_along(peaks)) {
		currentPeak <- peaks[[i]]
		indexBounds <- massToIndex(currentPeak$bounds)
		indexBoundsD <- shrinkBounds(indexBounds)+1
		if (indexBoundsD[1]>numValues || indexBoundsD[2]<1) {
			peakPiece <- rtmsSpectrumPiece(c(),c())
		} else {
			indexboundsD <- pmin(pmax(indexBoundsD,1),numValues)
			currIndexes <- indexBoundsD[1]:indexBoundsD[2]
			if (compressedSpectrum) { currValues <- spectrum[currIndexes] }
			else {
				seek(fcon,4*(indexBoundsD[1]-1),origin="current")
				currValues <- readBin(fcon,"double",diff(indexBoundsD)+1,size=4,endian="little")
				seek(fcon,-4*indexBoundsD[2],origin="current")
			}
			peakPiece <- rtmsSpectrumPiece(indexToMass(currIndexes-1),currValues)
		}
		if (!is.null(currentPeak$window)) {
			indexBounds <- massToIndex(currentPeak$window)
			indexBoundsD <- shrinkBounds(indexBounds)+1
			if (indexBoundsD[1]>numValues || indexBoundsD[2]<1) {
				windowPiece <- rtmsSpectrumPiece(c(),c())
			} else {
				indexboundsD <- pmin(pmax(indexBoundsD,1),numValues)
				currIndexes <- indexBoundsD[1]:indexBoundsD[2]
				if (compressedSpectrum) { currValues <- spectrum[currIndexes] }
				else {
					seek(fcon,4*(indexBoundsD[1]-1),origin="current")
					currValues <- readBin(fcon,"double",diff(indexBoundsD)+1,size=4,endian="little")
					seek(fcon,-4*indexBoundsD[2],origin="current")
				}
				windowPiece <- rtmsSpectrumPiece(indexToMass(currIndexes-1),currValues)
			}
		} else { windowPiece <- NULL }

		relmax <- which(peakMasses>=currentPeak$bounds[1] & peakMasses<=currentPeak$bounds[2])
		maxima <- rtmsSpectrumPiece(peakMasses[relmax],peakIntensities[relmax])
		subsamples[[i]] <- new_rtmsSubsample(peakPiece,maxima,windowPiece)
	}

	new_rtmsSample(subsamples,peaks)
}

#' Extract a sample set from a Bruker MCF directory
#'
#' Extracts an RTMS sample object (of class `rtmsSampleSet`) from a multi-
#' acquisition Bruker MCF directory opened using an RTMS reader object (of
#' class `rtmsBrukerMCFReader`). A vector  numeric indices is used to identify
#' which acquisitions the sample set should be extracted from.
#'
#' @param reader An RTMS reader object of class `rtmsBrukerMCFReader`
#'
#' @param peaks A list of peak objects of class `rtmsPeak`
#' @param indices A vector of numeric indices specifying which acquisitions
#' the sample set should be extracted from
#'
#' @returns An RTMS sample set object of class `rtmsSampleSet`
#'
#' @export
getBrukerMCFSampleSet <- function(reader,peaks,indices) {
	if (!inherits(reader,"rtmsBrukerMCFReader")) { stop("Parameter 'reader' must be of class 'rtmsBrukerMCFReader'.") }
	if (!all(vapply(peaks,function(p) inherits(p,"rtmsPeak"),FALSE))) {
		stop("Parameter 'peaks' must be a list of objects of class 'rtmsPeak'.")
	}
	if (any(indices<=0) || any(indices>nrow(reader$spotTable))) {
		stop("Parameter 'indices' contains values that are out of bounds.")
	}

	fcon <- reader$con

	subsampleset <- vector("list",length(indices))
	for (index in indices) {
		spotId <- reader$spotTable$id[index]
		fhigh <- reader$spotTable$frequencyHigh[index]
		fwidth <- reader$spotTable$frequencyWidth[index]
		fsize <- reader$spotTable$size[index]
		alpha <- reader$spotTable$alpha[index]
		beta <- reader$spotTable$beta[index]

		massToIndex <- function(p) { return(fsize*(fwidth-(fhigh/(p-alpha/fhigh) + beta))/fwidth) }
		indexToMass <- function(i) { return(fhigh/((fwidth*(fsize-i)/fsize)-beta) + alpha/fhigh) }

		# Move to extracted peak blob.  The sum of peaks within the
		# tranformed index window is the current assay measure
		peakdataBlob <- which(reader$offsetTable$id==spotId & reader$offsetTable$BlobResType==257)
		mcf_blobSeek(fcon,reader$offsetTable$Offset[peakdataBlob],reader$offsetTable$OffsetPage[peakdataBlob])

		mcf_checkBlobCodeWord(fcon)
		bin_checkBytes(fcon,charToRaw("\x01\x01"))
		seek(fcon,16,origin="current")
		numNamedEls <- bin_readVarInt(fcon)
		seek(fcon,25,origin="current") # Skip over "PeakFinderType" element

		name <- mcf_readNamedName(fcon)
		if (name!="Indexes") { stop("Second named element of peak data blob should be 'Indexes") }
		bin_checkBytes(fcon,charToRaw("\x03\x1F")) # Indexes should be an array of 8-byte floats
		numIndexes <- bin_readVarInt(fcon)
		peakIndexes <- readBin(fcon,"double",numIndexes,size=8,endian="little")
		peakMasses <- indexToMass(peakIndexes)

		name <- mcf_readNamedName(fcon)
		if (name!="Intensities") { stop("Third named element of peak data blob should be 'Intensities") }
		bin_checkBytes(fcon,charToRaw("\x03\x20")) # Intensities should be an array of 4-byte floats
		numIntensities <- bin_readVarInt(fcon)
		if (numIndexes!=numIntensities) { stop("Number of peak indicies should match number of peak intensities") }
		peakIntensities <- readBin(fcon,"double",numIndexes,size=4,endian="little")

		# Move to raw data blob.  From here we will extract all measured spectra
		# values within a given width of the desired peaks, and take their average
		rawdataBlob <- which(reader$offsetTable$id==spotId & reader$offsetTable$BlobResType==258)
		mcf_blobSeek(fcon,reader$offsetTable$Offset[rawdataBlob],reader$offsetTable$OffsetPage[rawdataBlob])

		mcf_checkBlobCodeWord(fcon)
		bin_checkBytes(fcon,charToRaw("\x01\x01"))
		seek(fcon,16,origin="current")
		numNamedEls <- bin_readVarInt(fcon)

		name <- mcf_readNamedName(fcon)
		if (name!="Intensities") { stop("First named element of raw data blob should be 'Intensities'") }
		# Intensities should be an array of 4-byte floats
		bin_checkBytes(fcon,as.raw(3),"Raw spectra must be an array.")
		typeByte <- readBin(fcon,"raw",1)
		if (typeByte==as.raw(32)) { # x20 indicates floats are uncompressed
			compressedSpectrum <- FALSE
			numValues <- bin_readVarInt(fcon)
		} else if (typeByte==as.raw(34)) { #x22 indicates floats have been compressed using gzip
			compressedSpectrum <- TRUE
			numBytes <- bin_readVarInt(fcon)
			gzippedBytes <- readBin(fcon,"raw",numBytes)
			unzippedBytes <- memDecompress(gzippedBytes,type="gzip")
			numValues <- length(unzippedBytes)/4
			spectrum <- readBin(unzippedBytes,"double",numValues,size=4,endian="little")
		} else { stop("Raw spectra must be an array of 32-bit floats.") }

		# For each desired peak, extract the relevant peak piece, maxima and, (if
		# applicable) window piece of the overall spectrum. This can be read directly
		# from the file if the data is uncompressed; otherwise it can be taken from
		# the extracted and decompressed spectrum.
		subsamples <- list()
		for (i in seq_along(peaks)) {
			currentPeak <- peaks[[i]]
			indexBounds <- massToIndex(currentPeak$bounds)
			indexBoundsD <- shrinkBounds(indexBounds)+1
			if (indexBoundsD[1]>numValues || indexBoundsD[2]<1) {
				peakPiece <- rtmsSpectrumPiece(c(),c())
			} else {
				indexboundsD <- pmin(pmax(indexBoundsD,1),numValues)
				currIndexes <- indexBoundsD[1]:indexBoundsD[2]
				if (compressedSpectrum) { currValues <- spectrum[currIndexes] }
				else {
					seek(fcon,4*(indexBoundsD[1]-1),origin="current")
					currValues <- readBin(fcon,"double",diff(indexBoundsD)+1,size=4,endian="little")
					seek(fcon,-4*indexBoundsD[2],origin="current")
				}
				peakPiece <- rtmsSpectrumPiece(indexToMass(currIndexes-1),currValues)
			}
			if (!is.null(currentPeak$window)) {
				indexBounds <- massToIndex(currentPeak$window)
				indexBoundsD <- shrinkBounds(indexBounds)+1
				if (indexBoundsD[1]>numValues || indexBoundsD[2]<1) {
					windowPiece <- rtmsSpectrumPiece(c(),c())
				} else {
					indexboundsD <- pmin(pmax(indexBoundsD,1),numValues)
					currIndexes <- indexBoundsD[1]:indexBoundsD[2]
					if (compressedSpectrum) { currValues <- spectrum[currIndexes] }
					else {
						seek(fcon,4*(indexBoundsD[1]-1),origin="current")
						currValues <- readBin(fcon,"double",diff(indexBoundsD)+1,size=4,endian="little")
						seek(fcon,-4*indexBoundsD[2],origin="current")
					}
					windowPiece <- rtmsSpectrumPiece(indexToMass(currIndexes-1),currValues)
				}
			} else { windowPiece <- NULL }

			relmax <- which(peakMasses>=currentPeak$bounds[1] & peakMasses<=currentPeak$bounds[2])
			maxima <- rtmsSpectrumPiece(peakMasses[relmax],peakIntensities[relmax])
			subsamples[[i]] <- new_rtmsSubsample(peakPiece,maxima,windowPiece)
		}
		subsampleset[[which(indices==index)]] <- subsamples
	}
	if (!is.null(names(indices))) { names(subsampleset) <- names(indices) }

	new_rtmsSampleSet(subsampleset,peaks)
}

#' Retrieve peak intensities directly from an MCF file
#'
#' The size of mass spectrometric data in general, and Bruker MCF directories
#' specifically, makes the extraction of data a resource intensive and time
#' consuming process. `rtms` as a package is designed to reduce this burden,
#' but pulling a sample set from an MCF file can (in the event of compressed
#' spectra) requires reading nearly all data out of a file, which could take an
#' extremely long time over a network connection.  Since peak intensity
#' (calculated as the sum of local intensity maxima within a given peak width)
#' is one of the most common measurements used in evaluating spectra, and
#' because this measure can be extracted without extracting the full spectra,
#' this function aims to avoid expensive reading time by skipping the creation
#' of a sample set object and calculating peak intensity directly.  Other
#' measurements are not possible, but full spectra do not have to be read, even
#' when spectra are compressed, as local maxima are pre-processed and stored
#' separately in a Bruker MCF file.
#'
#' @param reader An RTMS reader object of class `rtmsBrukerMCFReader`
#'
#' @param peaks A list of peak objects of class `rtmsPeak`
#' @param indices A vector of numeric indices specifying which acquisitions
#' the measurements should be taken from
#'
#' @returns A data frame containing columns specifying the index of each
#' acquisition, the name of each acquistion (if `indices` is a named vector),
#' the peak value of the peak measure, the peak name (if `peaks` is a named
#' list), the measure name (which will always be "PeakIntensity"), and the
#' measured value for that sample and peak. Format matches the output of
#' `measureSampleSet`
#'
#' @export
getBrukerMCFIntensities <- function(reader,peaks,indices) {
	if (!inherits(reader,"rtmsBrukerMCFReader")) { stop("Parameter 'reader' must be of class 'rtmsBrukerMCFReader'.") }
	if (!all(vapply(peaks,function(p) inherits(p,"rtmsPeak"),FALSE))) {
		stop("Parameter 'peaks' must be a list of objects of class 'rtmsPeak'.")
	}
	if (any(indices<=0) || any(indices>nrow(reader$spotTable))) {
		stop("Parameter 'indices' contains values that are out of bounds.")
	}

	fcon <- reader$con

	intensityTable <- data.frame()
	# subsampleset <- vector("list",length(indices))
	for (iter in seq_along(indices)) {
		index <- indices[[iter]]
		spotId <- reader$spotTable$id[index]
		fhigh <- reader$spotTable$frequencyHigh[index]
		fwidth <- reader$spotTable$frequencyWidth[index]
		fsize <- reader$spotTable$size[index]
		alpha <- reader$spotTable$alpha[index]
		beta <- reader$spotTable$beta[index]

		massToIndex <- function(p) { return(fsize*(fwidth-(fhigh/(p-alpha/fhigh) + beta))/fwidth) }
		indexToMass <- function(i) { return(fhigh/((fwidth*(fsize-i)/fsize)-beta) + alpha/fhigh) }

		# Move to extracted peak blob.  The sum of peaks within the
		# tranformed index window is the current assay measure
		peakdataBlob <- which(reader$offsetTable$id==spotId & reader$offsetTable$BlobResType==257)
		mcf_blobSeek(fcon,reader$offsetTable$Offset[peakdataBlob],reader$offsetTable$OffsetPage[peakdataBlob])

		mcf_checkBlobCodeWord(fcon)
		bin_checkBytes(fcon,charToRaw("\x01\x01"))
		seek(fcon,16,origin="current")
		numNamedEls <- bin_readVarInt(fcon)
		seek(fcon,25,origin="current") # Skip over "PeakFinderType" element

		name <- mcf_readNamedName(fcon)
		if (name!="Indexes") { stop("Second named element of peak data blob should be 'Indexes") }
		bin_checkBytes(fcon,charToRaw("\x03\x1F")) # Indexes should be an array of 8-byte floats
		numIndexes <- bin_readVarInt(fcon)
		peakIndexes <- readBin(fcon,"double",numIndexes,size=8,endian="little")
		peakMasses <- indexToMass(peakIndexes)

		name <- mcf_readNamedName(fcon)
		if (name!="Intensities") { stop("Third named element of peak data blob should be 'Intensities") }
		bin_checkBytes(fcon,charToRaw("\x03\x20")) # Intensities should be an array of 4-byte floats
		numIntensities <- bin_readVarInt(fcon)
		if (numIndexes!=numIntensities) { stop("Number of peak indicies should match number of peak intensities") }
		peakIntensities <- readBin(fcon,"double",numIndexes,size=4,endian="little")

		# For each desired peak, extract the relevant maxima of the overall
		# spectrum.
		thisTable <- data.frame(index=index,
								peakValue=vapply(peaks,function(p) p$value,0),
								measure="PeakIntensity",
								value=0)
		if (!is.null(names(indices))) { thisTable$sample <- names(indices)[[iter]] }
		else { thisTable$sample <- as.character(index) }
		if (!is.null(names(peaks))) { thisTable$peakName <- names(peaks) }
		for (i in seq_along(peaks)) {
			currentPeak <- peaks[[i]]
			relmax <- which(peakMasses>=currentPeak$bounds[1] & peakMasses<=currentPeak$bounds[2])
			thisTable$value[[i]] <- sum(peakIntensities[relmax])
		}
		intensityTable <- rbind(intensityTable,thisTable)

	}

	intensityTable
}

#' Retrieve specific metadata values from a Bruker BAF file
#'
#' Retrieves a list of specific metadata values (including instrument data,
#' acquisition parameters, processing and analysis directives, etc.) for a
#' specific acquisition from from a Bruker multi-acquisition BAF directory
#' (represented by an `rtmsBrukerBAFReader` object).
#'
#' @param reader An RTMS reader object of class `rtmsBrukerMCFReader`
#'
#' @param names A character vector of metadata names
#' @param index A single numeric index specifying which acquisition the sample
#' set should be extracted from
#'
#' @returns A named list of values corresponding to the metadata values
#' specified.  All values will be returned as a string, including numeric
#' quantities (with units if appropriate).
#'
#' @export
getBrukerMCFMetadata <- function(reader,names,index) {
	meta <- getBrukerMCFAllMetadata(reader,index)
	filtered <- merge(data.frame(DisplayName=names),meta,sort=FALSE)
	values <- as.list(filtered$Value)
	names(values) <- filtered$DisplayName
	values
}

#' Retrieve all metadata values from a Bruker MCF file
#'
#' Retrieves a table of all metadata values (including instrument data,
#' acquisition parameters, processing and analysis directives, etc.) for a
#' specific acquisition from a Bruker multi-acquisition MCF directory
#' (represented by an `rtmsBrukerMCFReader` object).
#'
#' @param reader An RTMS reader object of class `rtmsBrukerMCFReader`
#'
#' @param index A single numeric index specifying which acquisition the sample
#' set should be extracted from
#'
#' @returns A data frame with all metadata parameters for the acquisition. The
#' data frame will have five columns: `Index`, the numeric index of the
#' acquisition; `PermanentName`, the internal identifier of the parameter in
#' Bruker software systems; `GroupName`, the group of parameters that each
#' value belongs to; `DisplayName`, the string used to specify the parameter to
#' users (i.e. how the parameter would be labelled in a user interface); and
#' `Value`, a character column containing the value of each metadata parameter.
#' Numeric quantities will also be returned as strings, with units if
#' appropriate.
#'
#' @export
getBrukerMCFAllMetadata <- function(reader,index) {
	blobRow <- which(reader$offsetTable$id==reader$spotTable$id[index] &
					 	reader$offsetTable$BlobResType==259)
	dectable <- reader$metadata$declarations
	reptable <- reader$metadata$replacements

	fcon <- reader$con
	metaBlobOffset <- reader$offsetTable$Offset[blobRow]
	metaBlobOffsetPage <- reader$offsetTable$OffsetPage[blobRow]
	mcf_blobSeek(fcon,metaBlobOffset,metaBlobOffsetPage)

	mcf_checkBlobCodeWord(fcon)
	bin_checkBytes(fcon,charToRaw("\x01\x01"))
	seek(fcon,16,origin="current")
	numNamedEls <- bin_readVarInt(fcon)

	thisMetaTable <- data.frame()
	for (neiter in seq_len(numNamedEls)) {
		name <- mcf_readNamedName(fcon)
		if (readBin(fcon,"raw",1)==as.raw(0)) { next }
		else { seek(fcon,-1,origin="current") }

		keyTable <- mcf_readKeyValueTable(fcon)

		if (name=="IntValues") {
			names(keyTable)[names(keyTable)=="Value"] <- "Code"
			keyTable <- merge(keyTable,reptable,all.x=TRUE)
			keyTable$Value[is.na(keyTable$Value)] <- as.character(keyTable$Code[is.na(keyTable$Value)])
			keyTable$Code <- NULL
			keyTable <- merge(dectable,keyTable,all.y=TRUE)
		} else if (name=="StringValues") {
			keyTable <- merge(dectable,keyTable,all.y=TRUE)
		} else if (name=="DoubleValues") {
			keyTable <- merge(dectable,keyTable,all.y=TRUE)
			keyTable$Value <- sprintf(keyTable$DisplayFormat,keyTable$Value)
		} else {
			stop("Other metadata values types not supported.")
		}
		thisMetaTable <- rbind(thisMetaTable,keyTable)
	}
	thisMetaTable$Index <- index
	rel <- thisMetaTable$Unit != ""
	thisMetaTable$Value[rel] <- paste(thisMetaTable$Value[rel],thisMetaTable$Unit[rel])
	thisMetaTable[,c("Index","PermanentName","DisplayName","GroupName","Value")]
}

new_rtmsBrukerMCFReader <- function(dir,files,spotTable,offsetTable,metadata,con=NULL) {
	stopifnot(is.character(dir))
	stopifnot(is.list(files))
	stopifnot(is.data.frame(spotTable))
	stopifnot(is.data.frame(offsetTable))
	stopifnot(is.list(metadata))
	if (!is.null(con)) { stopifnot(inherits(con,"file")) }

	structure(list(dir=dir, files=files, spotTable=spotTable,
				   offsetTable=offsetTable, metadata=metadata, con=con),
			  class="rtmsBrukerMCFReader")
}

readBrukerMCFIndexFile <- function(path,calibration=FALSE) {
	# Extract a table of blob information from an mcf_idx file in the SQLite format

	# Open index file for decoding
	scon <- file(path,"rb")
	on.exit(close(scon),add=TRUE)

	# Locate all B-tree table leaf pages for the main table (which lists the
	# titles, column names, and root pages of all tables and index in the
	# SQLite file.
	mainHandler <- function(valueList,cellIndex) {
		if (valueList[[2]]=="ContainerIndex") {
			if (wholenumber(valueList[[4]])) {
				return(data.frame(name="blobRootPage",value=valueList[[4]]))
			} else { stop("Page record column should be an integer") }
		} else if (valueList[[2]]=="Relations") {
			if (wholenumber(valueList[[4]])) {
				return(data.frame(name="relRootPage",value=valueList[[4]]))
			} else { stop("Page record column should be an integer") }
		} else { return(c()) }
	}
	mainTable <- sqlt_parseBTreeTable(scon,1,mainHandler,TRUE)$output

	containerRoot <- mainTable$value[which(mainTable$name=="blobRootPage")]
	if (length(containerRoot)==0) { stop("ContainerIndex table not found.") }
	containerHandler <- function(values,cellIndex) {
		valueList <- values
		names(valueList) <- c("GuidA","GuidB","BlobResType","Offset","BlobSize")
		valueList[["Index"]] <- cellIndex
		# Guid values are 8-byte integers in the file, and hence return two integers
		# each, which must be pasted together to be a single column in a data table
		valueList$id <- paste(c(valueList$GuidA,valueList$GuidB),collapse=" ")
		valueList$GuidA <- NULL
		valueList$GuidB <- NULL
		valueList[["OffsetPage"]] <- 0
		if (length(valueList$Offset)>1) { valueList$OffsetPage <- valueList$Offset[2]*2 }
		if (valueList$Offset[1]<0) {
			valueList$OffsetPage <- valueList$OffsetPage+1
			valueList$Offset <- valueList$Offset[1]+(2^31)
		} else { valueList$Offset <- valueList$Offset[1] }
		return(as.data.frame(valueList))
	}
	containerdf <- sqlt_parseBTreeTable(scon,containerRoot,containerHandler,FALSE)$output

	if (calibration) {
		relationRoot <- mainTable$value[which(mainTable$name=="relRootPage")]
		relationHandler <- function(values,cellIndex) {
			valueList <- values
			names(valueList) <- c("GuidA","GuidB","ToGuidA","ToGuidB","RelationType")
			# Guid values are 8-byte integers in the file, and hence return two integers
			# each, which must be pasted together to be a single column in a data table
			valueList$id <- paste(c(valueList$GuidA,valueList$GuidB),collapse=" ")
			valueList$GuidA <- NULL
			valueList$GuidB <- NULL
			valueList$toId <- paste(c(valueList$ToGuidA,valueList$ToGuidB),collapse=" ")
			valueList$ToGuidA <- NULL
			valueList$ToGuidB <- NULL
			return(as.data.frame(valueList))
		}
		relationdf <- sqlt_parseBTreeTable(scon,relationRoot,relationHandler,FALSE)$output

		containerdf <- merge(containerdf,relationdf,all.x=TRUE)
		containerdf <- containerdf[order(containerdf$Index),c("Index","id","toId","RelationType","BlobResType","Offset","OffsetPage","BlobSize")]
	} else {
		containerdf <- containerdf[order(containerdf$Index),c("Index","id","BlobResType","Offset","OffsetPage","BlobSize")]
	}
	return(containerdf)
}

readMCFCalibration <- function(path,offsetTable) {
	# Open the full Bruker calibration data file for reading
	fsize <- file.size(path)
	fcon <- file(path,"rb")
	rawfile <- readBin(fcon,"raw",fsize+1000)
	close(fcon)

	fcon <- rawConnection(rawfile,"rb")
	on.exit(close(fcon),add=TRUE)

	caltab <- data.frame()
	for (iter in seq_len(nrow(offsetTable))) {
		seek(fcon,offsetTable$Offset[iter])

		# Process blob header
		mcf_checkBlobCodeWord(fcon)
		bin_checkBytes(fcon,charToRaw("\x01\x01"))
		seek(fcon,16,origin="current")
		numNamedEls <- bin_readVarInt(fcon)

		# First named element should be "Transformator" a Bruker list that contains the list of
		# calibration parameters
		name <- mcf_readNamedName(fcon)
		if (name!="Transformator") { stop("First named element of parameter blob should be 'Transformator'") }
		bin_checkBytes(fcon,charToRaw("\x01")) # Transformator should be a named list of parameters
		seek(fcon,16,origin="current")

		# Convert calibration mode 4 to calibration mode 5 for simplicity
		calrow <- mcf_readNamedTableRow(fcon)
		if (calrow$calibMode==4) {
			calrow$alpha <- 0
			calrow$beta <- -calrow$beta
		}
		calrow$toId <- offsetTable$toId[iter]
		caltab <- rbind(caltab,as.data.frame(calrow))
	}
	caltab
}

retrieveMCFMetadata <- function(fcon,offset,offsetPage=0) {
	mcf_blobSeek(fcon,offset,offsetPage)

	# Process blob header
	mcf_checkBlobCodeWord(fcon)
	bin_checkBytes(fcon,charToRaw("\x01\x01"))
	seek(fcon,16,origin="current")
	numNamedEls <- bin_readVarInt(fcon)

	# First named element should be "Declarations" a Bruker data table that includes
	# the integer keys of all metadata elements
	name <- mcf_readNamedName(fcon)
	if (name!="Declarations") { stop("First named element of parameter blob should be 'Declarations'") }
	outTable <- mcf_readTable(fcon)

	# First named element should be "Replacements" a Bruker link table that contains
	# lookup tables for enumerated parameter values
	name <- mcf_readNamedName(fcon)
	if (name!="Replacements") { stop("Second named element of parameter blob should be 'Replacements'") }
	bin_checkBytes(fcon,charToRaw("\x02\x01")) # Declarations should be a index of parameter replacements
	seek(fcon,16,origin="current")
	numReps <- bin_readVarInt(fcon)

	valTable <- data.frame()
	for (nriter in seq_len(numReps)) {
		bin_checkBytes(fcon,as.raw(1))
		seek(fcon,16,origin="current")
		bin_checkBytes(fcon,as.raw(3))
		if (nriter==1) {
			nextName <- mcf_readNamedName(fcon)
		} else { bin_checkBytes(fcon,as.raw(11)) }
		codes <- mcf_readPrimitiveArray(fcon)
		bin_checkBytes(fcon,as.raw(10))
		values <- mcf_readPrimitiveArray(fcon)
		bin_checkBytes(fcon,as.raw(9))
		bin_checkBytes(fcon,as.raw(37))
		key <-  readBin(fcon,"integer",1,size=4,endian="little")
		if (length(codes)!=length(values)) {
			stop("Replacement codes must be matched by the same number of values")
		}
		if (length(codes>0)) {
			valTable <- rbind(valTable,data.frame(Key=key,Code=codes,Value=as.character(values)))
		}
	}

	return(list(declarations=outTable,replacements=valTable))
}

# Helper function to convert a floating-point range of indexes to an integer range
shrinkBounds <- function(bounds) { return(c(ceiling(bounds[1]),floor(bounds[2]))) }

# Checks if a data file represents a whole number
wholenumber <- function(dat) { return(is.numeric(dat) && round(dat)==dat) }

bin_readVarInt <- function(fcon,endian="little") {
	# A varint represents an integer with anywhere from one to nine bytes,
	# with each byte representing a "digit" of the integer in base-128, and each byte
	# but the last with its highest bit set to 1.  For example, the number 13 would just
	# be represented by a single byte "0D", but the number 290 (being equal to 2*128 + 34)
	# would be represented by "22 82" ( 34 in hex, 2 + 128 in hex ) in little endian form
	# and "02 A2" ( 2 in hex, 34 + 128 in hex) in big endian form
	nums <- c()
	for (iter in 1:8) {
		nextByte <- readBin(fcon,"integer",1,size=1,signed=FALSE)
		if (nextByte>127) { nextNum <- nextByte-128 }
		else { nextNum <- nextByte }
		if (endian=="little") { nums <- c(nums,nextNum) }
		else { nums <- c(nextNum,nums) }
		if (nextByte<=127) { break }
	}
	return(sum(nums*(128^(0:(length(nums)-1)))))
}
bin_checkBytes <- function(fcon,bytes,message="Invalid bytes") {
	# Check that the next string of bytes in a file matches a given set
	if (!all(readBin(fcon,"raw",length(bytes))==bytes)) { stop(message) }
}

sqlt_getPage <- function(scon,page) {
	# SQLite files are broken into pages, which we assume here have length
	# 1024; this function jumps to the location of a given page, loads the page
	# into memory and returns a raw connection to that data; it optionally jumps
	# to a given offset within that page
	seek(scon,(page-1)*1024)
	rawpage <- readBin(scon,"raw",1024)
	rawcon <- rawConnection(rawpage,"rb")
	return(rawcon)
}
sqlt_parseBTreeInteriorPage <- function(pagecon,upper) {
	seek(pagecon,2,origin="current")
	numCells <- readBin(pagecon,"integer",1,size=2,signed=FALSE,endian="big")
	cellStart <- readBin(pagecon,"integer",1,size=2,signed=FALSE,endian="big")
	seek(pagecon,1,origin="current")
	finalPage <- readBin(pagecon,"integer",size=4,endian="big")
	cellPointers <- readBin(pagecon,"integer",numCells,size=2,signed=FALSE,endian="big")

	newpagedf <- data.frame(Page=rep(NA,numCells),Upper=rep(NA,numCells),Type=rep("None",numCells))
	for (citer in seq_len(numCells)) {
		seek(pagecon,cellPointers[citer])
		newpagedf$Page[[citer]] <- readBin(pagecon,"integer",size=4,endian="big")
		newpagedf$Upper[[citer]] <- bin_readVarInt(pagecon,"big")
	}
	return(rbind(newpagedf,data.frame(Page=finalPage,Upper=upper,Type="None")))
}
sqlt_parseBTreeLeafPage <- function(pagecon,handler) {
	if (is.null(handler)) { return(c()) }
	seek(pagecon,2,origin="current")
	numCells <- readBin(pagecon,"integer",1,size=2,signed=FALSE,endian="big")
	cellStart <- readBin(pagecon,"integer",1,size=2,signed=FALSE,endian="big")
	seek(pagecon,1,origin="current")
	cellPointers <- readBin(pagecon,"integer",numCells,size=2,signed=FALSE,endian="big")

	rows <- rep(list(c()),numCells)
	for (citer in seq_len(numCells)) {
		seek(pagecon,cellPointers[citer])
		cellSize <- bin_readVarInt(pagecon,"big")
		cellIndex <- bin_readVarInt(pagecon,"big")
		valueList <- sqlt_parseRecord(pagecon)
		rows[[citer]] <- handler(valueList,cellIndex)
	}
	return(as.data.frame(do.call(rbind,rows)))
}
sqlt_parseBTreeTable <- function(scon,rootpage,handler,first=FALSE) {

	pastTheFirst <- !first
	pagedf <- data.frame(Page=rootpage,Upper=Inf,Type="None")
	outputdf <- data.frame()
	while (TRUE) {
		if (!any(pagedf$Type=="None")) { break }
		curpageind <- which(pagedf$Type=="None")[1]
		curpage <- pagedf$Page[curpageind]
		pagecon <- sqlt_getPage(scon,curpage)
		if (!pastTheFirst) {
			seek(pagecon,100)
			pastTheFirst <- TRUE
		}
		pageType <- readBin(pagecon,"integer",1,size=1,signed=FALSE)
		if (pageType==5) {
			pagedf <- rbind(pagedf,sqlt_parseBTreeInteriorPage(pagecon,pagedf$Upper[curpageind]))
			pagedf <- pagedf[-curpageind,]
		} else if (pageType==13) {
			outputdf <- rbind(outputdf,sqlt_parseBTreeLeafPage(pagecon,handler))
			pagedf$Type[curpageind] <- "Leaf"
		} else if (pageType!=5) {
			close(pagecon)
			stop("Page must be b-tree leaf or interior page")
		}
		close(pagecon)
	}
	pagedf <- pagedf[order(pagedf$Upper),]
	return(list(pages=pagedf,output=outputdf))
}
sqlt_parseRecord <- function(scon) {
	# In an SQLite data file, a row of a data table is converted to a string of bytes.
	# The byte string begins with a header of varints; one varint specifying the total
	# size of the header in bytes, and the remaining varints encoding the data types of
	# the elements of the record.  Then each element of the record is included in order
	output <- list()
	headerSize <- bin_readVarInt(scon,"big")
	headerRemaining <- headerSize-1
	dataTypes <- c()
	while (headerRemaining>0) {
		nextCode <- bin_readVarInt(scon,"big")
		dataTypes <- c(dataTypes,nextCode)
		headerRemaining <- headerRemaining-max(ceiling(log(nextCode,base=128)),1)
	}

	for (dtiter in 1:length(dataTypes)) {
		output[[dtiter]] <- sqlt_parseRecordValue(scon,dataTypes[dtiter])
	}
	return(output)
}
sqlt_parseRecordValue <- function(scon,type) {
	# Each possible data type in an SQLite record stream is represented by an integer.
	# The most important data types are represented by
	#   - 1, 2, 3, 4, 5, and 6, representing one, two, three, four, six, and eight
	#     byte signed integers, respectively
	#   - 7, representing an 8-byte (64 bit) floating point number
	#   - Even numbers greater than 12, representing a string of raw binary data
	#   - Odd numbers greater than 13, representing a variable-length string
	# Because R cannot reliably represent 8-byte integer values, data types 5 and 6
	# (representing six and eight-byte integers) actually return two four-byte integers
	# which can be handled later
	if (type==0) { return(NA) }
	else if (type<0 || !wholenumber(type)) { stop("Unsupported record data type") }
	else if (type==8) { return(0) }
	else if (type==9) { return(1) }
	else if (type==12) { return(as.raw(c())) }
	else if (type==13) { return("") }
	else if (type==1) { return(readBin(scon,"integer",1,size=1,signed=TRUE)) }
	else if (type==2) { return(readBin(scon,"integer",1,size=2,signed=TRUE,endian="big")) }
	else if (type==3) {
		tempraw <- c(as.raw(0),readBin(scon,"raw",3))
		return(readBin(tempraw,"integer",1,size=4,endian="big"))
	} else if (type==4) { return(readBin(scon,"integer",1,size=4,endian="big")) }
	else if (type==5) {
		tempraw <- c(as.raw(c(0,0)),readBin(scon,"raw",6))
		return(rev(readBin(tempraw,"integer",2,size=4,endian="big")))
	} else if (type==6) { return(rev(readBin(scon,"integer",2,size=4,endian="big"))) }
	else if (type==7) { return(readBin(scon,"double",1,size=8,endian="big")) }
	else if (type==10 || type==11) { stop("Unsupported record data type") }
	else if ((type%%2)==0) { return(readBin(scon,"raw",(type-12)/2)) }
	else { return(rawToChar(readBin(scon,"raw",(type-13)/2))) }
}

mcf_blobSeek <- function(scon,offset,page) {
	# A hacked little survival function.  It turns out R does not consistently
	# work with integers larger than 2147483647 (2^31-1); most of the offsets in
	# the Bruker raw data file go into the billions, so passing those offsets to
	# seek is not possible. This function represents an offset in terms of "pages"
	# of size 2^31, and then smaller offsets within those pages, which allows us
	# to traverse a 10 GB file
	seek(scon,0)
	if (page>0) {
		for (piter in 1:page) { seek(scon,2^31,origin="current") }
	}
	seek(scon,offset,origin="current")
}
mcf_checkBlobCodeWord <- function(fcon) {
	# Check the beginning of a Bruker raw data file blob for the code word
	# "C0DEAFFE" or "code monkey" in German.
	bin_checkBytes(fcon,charToRaw("\xC0\xDE\xAF\xFE"),"Invalid code word.")
}
mcf_readNamedName <- function(fcon) {
	# In a Bruker raw data file, a named element is marked by four bytes, counting down
	# from "FF FF FF FF", so the first byte must be greater than 127, and the next three
	# bytes must be "FF FF FF" (unless a blob contains more than 127 elements, which we
	# assume is impossible).  The next byte must be "0F", marking the beginning of the
	# name string, followed by a byte representing the length of the name string.  The
	# string itself can be read in ASCII from the next set of bytes corresponding to the
	# string length.
	if (!(readBin(fcon,"integer",1,size=1,signed=FALSE)>127)) { stop("Invalid object name.") }
	bin_checkBytes(fcon,charToRaw("\xFF\xFF\xFF\x0F"),"Invalid object name.")
	namelen <- readBin(fcon,"integer",1,size=1,signed=FALSE)
	return(rawToChar(readBin(fcon,"raw",namelen)))
}
mcf_readPrimitive <- function(fcon) {
	# Extracts primitive data types from the stream of a Bruker raw data file.
	# Primitive data types here mean integers, floats and strings. Each primitive type is
	# marked by a single byte:
	#   - x27 and x28 represent eight-byte integers (probably one signed and the other unsigned)
	#   - x25 and x26 represent four-byte integers (again probably one signed and the other unsigned)
	#   - x20 represents a four-byte (32 bit) floating point number
	#   - x1F represents an eight-byte (64 bit) floating point number
	#   - x2A represents an ASCII string, with a following varint specifying the number of characters
	# Other data type byte markers not handled by this function is x03, which represents an array of
	# items, which is then followed by bytes specifying that type and a varint specifying the number
	# of elements, and x01, which appears to allow specifying a custom data type, often a named list
	dataType <- readBin(fcon,"integer",1,size=1,signed=FALSE)
	if (dataType %in% c(39,40)) { return(readBin(fcon,"integer",1,size=8,endian="little")) }
	else if (dataType %in% c(37,38)) { return(readBin(fcon,"integer",1,size=4,endian="little")) }
	else if (dataType==32) { return(readBin(fcon,"double",1,size=4,endian="little")) }
	else if (dataType==31) { return(readBin(fcon,"double",1,size=8,endian="little")) }
	else if (dataType==0) { return(NULL) }
	else if (dataType==42) {
		stringLength <- bin_readVarInt(fcon)
		return(rawToChar(readBin(fcon,"raw",stringLength)))
	} else { stop(sprintf("Invalid primitive data type %d.",dataType)) }
}
mcf_readPrimitiveArray <- function(fcon) {
	# Reads an array of primitive data values using the same type marking as
	# the previous function.  Arrays are marked by an array byte (x03), a type
	# byte, and a varint listing the number of values to be extracted
	bin_checkBytes(fcon,as.raw(3)) # Arrays are marked with x03
	dataType <- readBin(fcon,"integer",1,size=1,signed=FALSE)
	numEls <- bin_readVarInt(fcon)
	if (numEls==0) { return(c()) }
	outVec <- rep(NA,numEls)
	for (viter in seq_along(outVec)) {
		if (dataType %in% c(39,40)) { outVec[viter] <- readBin(fcon,"integer",1,size=8,endian="little") }
		else if (dataType %in% c(37,38)) { outVec[viter] <- readBin(fcon,"integer",1,size=4,endian="little") }
		else if (dataType==32) { outVec[viter] <- readBin(fcon,"double",1,size=4,endian="little") }
		else if (dataType==31) { outVec[viter] <- readBin(fcon,"double",1,size=8,endian="little") }
		else if (dataType==0) {  }
		else if (dataType==42) {
			stringLength <- bin_readVarInt(fcon)
			outVec[viter] <- rawToChar(readBin(fcon,"raw",stringLength))
		} else { stop(sprintf("Invalid primitive data type %d.",dataType)) }
	}
	return(outVec)
}
mcf_readNamedTableRow <- function(fcon) {
	# In a data table in a Bruker raw data file, there is no separate row specifying column names
	# Instead, the first row of the table *may* contained named elements rather than bare primitives
	# so the names along with their values should be extracted as a list.
	#
	# The format of a table row is as follows:
	#   - A byte specifying the number (N) of columns in the row (its possible this is a varint and
	#     I've simply never seen a table with more than 127 columns, but we'll ignore that for now)
	#   - N elements:
	#      - Either:
	#        - a byte specifying the index of the element in the row (offset to begin at 2 for some
	#          reason, or
	#        - the name of the element
	#      - The primitive element itself
	#
	# The structure is not dissimilar to R lists, in which elements can be specified either by
	# their position, or by a name
	numRowCells <- readBin(fcon,"integer",1,size=1,signed=FALSE)
	output <- list()
	for (rciter in 1:numRowCells) {
		nextByte <- readBin(fcon,"integer",1,size=1,signed=FALSE)
		if (nextByte<=127) {
			if (nextByte!=(rciter+1)) { stop("Invalid row cell index") }
			name <- as.character(nextByte)
		} else {
			seek(fcon,-1,origin="current")
			name <- mcf_readNamedName(fcon)
		}
		prim <- mcf_readPrimitive(fcon)
		if (nextByte<=127) { output[[rciter]] <- prim }
		else { output[[name]] <- prim }
	}
	return(output)
}
mcf_readTableRow <- function(fcon,namevec=NULL) {
	# All rows but the first row (see previous function) in a data table contain unnamed
	# primitive elements.  Ideally the name of each element will have been extracted from
	# the first row, and can be passed to this function to apply the same names to the
	# elements of this row
	numRowCells <- readBin(fcon,"integer",1,size=1,signed=FALSE)
	if (is.null(namevec)) { namevec <- rep("",numRowCells) }
	else if (length(namevec)!=numRowCells) {
		stop("Name vector must contain the same number of elements as the table row")
	}
	output <- list()
	for (rciter in 1:numRowCells) {
		nextByte <- readBin(fcon,"integer",1,size=1,signed=FALSE)
		if (nextByte!=(rciter+1)) { stop("Invalid row cell index") }
		prim <- mcf_readPrimitive(fcon)
		if (namevec[rciter]=="") { output[[rciter]] <- prim }
		else { output[[namevec[rciter]]] <- prim }
	}
	return(output)
}
mcf_readTable <- function(fcon) {
	# Must be marked as an array of similar objects
	bin_checkBytes(fcon,charToRaw("\x03\x01"))
	seek(fcon,16,origin="current")
	numRows <- bin_readVarInt(fcon)

	outTable <- data.frame()
	for (riter in seq_len(numRows)) {
		if (riter==1) {
			curRow <- mcf_readNamedTableRow(fcon)
			namevec <- names(curRow)
			outTable <- rbind(outTable,as.data.frame(curRow))
		} else {
			curRow <- mcf_readTableRow(fcon,namevec)
			outTable <- rbind(outTable,as.data.frame(curRow))

		}
	}
	outTable
}
mcf_readNamedKeyValueRow <- function(fcon) {
	# A key-value table is similar to a generic Bruker raw data file table, with the
	# added constraint that each row must contain two elements, an integer named "Key"
	# and a primitive named "Value"
	bin_checkBytes(fcon,as.raw(2),"Key-value row must contain two elements")
	output <- list()
	for (rciter in 1:2) {
		nextByte <- readBin(fcon,"integer",1,size=1,signed=FALSE)
		if (nextByte<=127) {
			if (nextByte!=(rciter+1)) { stop("Invalid row cell index") }
			if (rciter==1) { name <- "Key" }
			else { name <- "Value" }
		} else {
			seek(fcon,-1,origin="current")
			name <- mcf_readNamedName(fcon)
			if ((rciter==1 && name!="Key") || (rciter==2 && name!="Value")) {
				stop("Names in a key-value row must be 'Key' and 'Value'")
			}
		}
		prim <- mcf_readPrimitive(fcon)
		if (rciter==1) {
			if (!is.integer(prim)) { stop("Key in key-value row must be an integer.") }
			output$Key <- prim
		} else { output$Value <- prim }
	}
	return(output)
}
mcf_readKeyValueRow <- function(fcon) {
	# Extracts an unnamed later row from a Bruker raw data file key-value table
	bin_checkBytes(fcon,as.raw(2),"Key-value row must contain two elements")
	output <- list()
	for (rciter in 1:2) {
		nextByte <- readBin(fcon,"integer",1,size=1,signed=FALSE)
		if (nextByte!=(rciter+1)) { stop("Invalid row cell index") }
		prim <- mcf_readPrimitive(fcon)
		if (rciter==1) {
			if (!is.integer(prim)) { stop("Key in key-value row must be an integer.") }
			output$Key <- prim
		} else { output$Value <- prim }
	}
	return(output)
}
mcf_readKeyValueTable <- function(fcon) {
	# Must be marked as an array of similar objects
	bin_checkBytes(fcon,charToRaw("\x03\x01"))
	seek(fcon,16,origin="current")
	numRows <- bin_readVarInt(fcon)

	keyTable <- data.frame()
	for (riter in 1:numRows) {
		if (riter==1) { curRow <- mcf_readNamedKeyValueRow(fcon) }
		else { curRow <- mcf_readKeyValueRow(fcon) }
		keyTable <- rbind(keyTable,as.data.frame(curRow))
	}
	keyTable
}
mcf_readNumericPrimitiveBlank <- function(fcon) {
	# This function calculates the correct number of bytes to read based on
	# the next data-type byte in a given file stream, useful for skipping
	# over a large stream of similar data elements
	# Obviously, as strings are variable in length, this function cannot be
	# applied to them
	dataType <- readBin(fcon,"integer",1,size=1,signed=FALSE)
	if (dataType%in%c(31,39,40)) { return(8) }
	else if (dataType%in%c(32,37,38)) { return(4) }
	else { stop(sprintf("Invalid primitive data type %d.",dataType)) }
}
mcf_readNamedTableRowBlank <- function(fcon) {
	# Like the previous function, determines the number of bytes that make
	# up each later row in a Bruker raw data file, so that they can be
	# skipped quickly.  This function assumes that all table elements are
	# numeric primitives
	numRowCells <- readBin(fcon,"integer",1,size=1,signed=FALSE)
	count <- 1
	for (rciter in 1:numRowCells) {
		nextByte <- readBin(fcon,"integer",1,size=1,signed=FALSE)
		count <- count+1
		if (nextByte<=127) {
			if (nextByte!=(rciter+1)) { stop("Invalid row cell index") }
		} else {
			seek(fcon,-1,origin="current")
			mcf_readNamedName(fcon)
		}
		primSize <- mcf_readNumericPrimitiveBlank(fcon)
		count <- count+1+primSize
		readBin(fcon,"raw",primSize)
	}
	return(count)
}

Try the rtms package in your browser

Any scripts or data that you put into this service are public.

rtms documentation built on June 7, 2023, 5:59 p.m.