#' MassBank-record Parser
#'
#' Can parse MassBank-records(only V2)
#'
#' @aliases parseMassBank
#' @usage parseMassBank(Files)
#' @param Files A path to the plaintext-record that should be read
#' @return The \code{mbWorkspace} that the plaintext-record creates.
#' @seealso \code{\link{validate}}
#' @author Erik Mueller
#' @examples \dontrun{
#' parseMassBank("filepath_to_records/RC00001.txt")
#' }
#' @export
parseMbRecord <- function(filename, readAnnotation = TRUE)
{
fileConnection <- file(filename)
record <- readLines(fileConnection)
close(fileConnection)
# This is verbatim Erik's extractor, with mb@compiled_ok[[i]] subbed by recData,
# and the peak / annotation extractor output slightly modified.
# Added: check for "NA" entries in SMILES, IUPAC, FORMULA
recData <- list()
recData[['ACCESSION']] <- substring(grep('ACCESSION:',record, value = TRUE, fixed = TRUE),12)
recData[['RECORD_TITLE']] <- substring(grep('RECORD_TITLE:',record, value = TRUE),12)
recData[['DATE']] <- format(as.Date(substring(grep('DATE:',record, value = TRUE, fixed = TRUE),7), format = "%Y.%m.%d"), "%Y.%m.%d")
recData[['AUTHORS']] <- substring(grep('AUTHORS:',record, value = TRUE, fixed = TRUE),10)
recData[['LICENSE']] <- substring(grep('LICENSE:',record, value = TRUE, fixed = TRUE),10)
recData[['COPYRIGHT']] <- substring(grep('COPYRIGHT:',record, value = TRUE, fixed = TRUE),12)
##publication <- substring(grep('PUBLICATION:',record, fixed = TRUE),14)
##if(length(publication) > 0){
#mb@compiled_ok[[i]][['PUBLICATION']] <- publication
##}
##The list of comments is handled differently
##in RMassBank, but the flattening should work anyway, if I'm correct(RMassBank uses internal values for comments)
commentlist <- list()
commentlist <- as.list(substring(grep('COMMENT:',record, value = TRUE, fixed = TRUE),10))
recData[['COMMENT']] <- list()
recData[['COMMENT']] <- commentlist
chnames <- list()
chnames <- as.list(substring(grep('CH$NAME:',record, value = TRUE, fixed = TRUE),10))
recData[['CH$NAME']] <- chnames
recData[['CH$COMPOUND_CLASS']] <- substring(grep('CH$COMPOUND_CLASS:',record, value = TRUE, fixed = TRUE),20)
recData[['CH$FORMULA']] <- substring(grep('CH$FORMULA:',record, value = TRUE, fixed = TRUE),13)
recData[['CH$EXACT_MASS']] <- as.numeric(substring(grep('CH$EXACT_MASS:',record, value = TRUE, fixed = TRUE),16))
recData[['CH$SMILES']] <- substring(grep('CH$SMILES:',record, value = TRUE, fixed = TRUE),12)
recData[['CH$IUPAC']] <- substring(grep('CH$IUPAC:',record, value = TRUE, fixed = TRUE),11)
##Again: Flattening this should be no Problem, although the structure is different -
##RMassBank names every type of link, but this isn't necessary here since we're only
##reading, not creating. If that's a problem, I'll change it.
links <- list()
links <- as.list(substring(grep('CH$LINK:',record, value = TRUE, fixed = TRUE),10))
recData[['CH$LINK']] <- links
##SP$ will be included later since it's kind of rarely used
recData[['AC$INSTRUMENT']] <- substring(grep('AC$INSTRUMENT:',record, value = TRUE, fixed = TRUE),16)
recData[['AC$INSTRUMENT_TYPE']] <- substring(grep('AC$INSTRUMENT_TYPE:',record, value = TRUE, fixed = TRUE),21)
##Get the Subvalues just like in RMassBank
##RECORD VERSION SPECIFIC READING INCLUDED
##This could convert Version 1 -> Version 2 if used right,
##Although I have no idea how well it'd do that
##I'll have to find the old specifications to do this right, until then it should only kind of work
##well enough to do some tests
Version <- 2
ac_ms <- list()
ac_ms[['MS_TYPE']] <- substring(grep('AC$MASS_SPECTROMETRY: MS_TYPE',record, value = TRUE, fixed = TRUE),31)
if(length(ac_ms[['MS_TYPE']]) == 0){
ac_ms[['MS_TYPE']] <- substring(grep('AC$ANALYTICAL_CONDITION: MS_TYPE',record, value = TRUE, fixed = TRUE),34)
Version <- 1
}
if(Version == 1){
##This not a real tag anymore(according to the specifications) but RMassBank still writes it...?
##I'll include it for the case that I'm reading V1-records
ac_ms[['IONIZATION']] <- substring(grep('AC$MASS_SPECTROMETRY: IONIZATION',record, value = TRUE, fixed = TRUE),34)
ac_ms[['ION_MODE']] <- substring(grep('AC$ANALYTICAL_CONDITION: MODE',record, value = TRUE, fixed = TRUE),31)
} else{
ac_ms[['ION_MODE']] <- substring(grep('AC$MASS_SPECTROMETRY: ION_MODE',record, value = TRUE, fixed = TRUE),32)
##Some of the following are part of the (optional) specification, but NOT in RMassBank(!)
##This is just for the sake of completeness
ac_ms[['COLLISION_ENERGY']] <- substring(grep('AC$MASS_SPECTROMETRY: COLLISION_ENERGY',record, value = TRUE, fixed = TRUE),40)
ac_ms[['COLLISION_GAS']] <- substring(grep('AC$MASS_SPECTROMETRY: COLLISION_GAS',record, value = TRUE, fixed = TRUE),37)
ac_ms[['DATE']] <- substring(grep('AC$MASS_SPECTROMETRY: DATE',record, value = TRUE, fixed = TRUE),28)
ac_ms[['DESOLVATION_GAS_FLOW']] <- substring(grep('AC$MASS_SPECTROMETRY: DESOLVATION_GAS_FLOW',record, value = TRUE, fixed = TRUE),44)
ac_ms[['DESOLVATION_TEMPERATURE']] <- substring(grep('AC$MASS_SPECTROMETRY: DESOLVATION_TEMPERATURE',record, value = TRUE, fixed = TRUE),47)
ac_ms[['IONIZATION_ENERGY']] <- substring(grep('AC$MASS_SPECTROMETRY: IONIZATION_ENERGY',record, value = TRUE, fixed = TRUE),41)
ac_ms[['LASER']] <- substring(grep('AC$MASS_SPECTROMETRY: LASER',record, value = TRUE, fixed = TRUE),29)
ac_ms[['MATRIX']] <- substring(grep('AC$MASS_SPECTROMETRY: MATRIX',record, value = TRUE, fixed = TRUE),30)
ac_ms[['MASS_ACCURACY']] <- substring(grep('AC$MASS_SPECTROMETRY: MASS_ACCURACY',record, value = TRUE, fixed = TRUE),37)
ac_ms[['REAGENT_GAS']] <- substring(grep('AC$MASS_SPECTROMETRY: REAGENT_GAS',record, value = TRUE, fixed = TRUE),35)
ac_ms[['SCANNING']] <- substring(grep('AC$MASS_SPECTROMETRY: SCANNING',record, value = TRUE, fixed = TRUE),32)
##These are in RMassBank, but not part of the specification?
##I think I'm misreading something...
#ac_ms[['FRAGMENTATION_MODE']] <- msmsdata$info$mode
#ac_ms[['PRECURSOR_TYPE']] <- precursor_types[spec$mode]
#ac_ms[['RESOLUTION']] <- msmsdata$info$res
ac_lc <- list();
ac_lc[['CAPILLARY_VOLTAGE']] <- substring(grep('AC$CHROMATOGRAPHY: CAPILLARY_VOLTAGE',record, value = TRUE, fixed = TRUE),36)
ac_lc[['COLUMN_NAME']] <- substring(grep('AC$CHROMATOGRAPHY: COLUMN_NAME',record, value = TRUE, fixed = TRUE),32)
ac_lc[['COLUMN_TEMPERATURE']] <- substring(grep('AC$CHROMATOGRAPHY: COLUMN_TEMPERATURE',record, value = TRUE, fixed = TRUE),39)
ac_lc[['FLOW_GRADIENT']] <- substring(grep('AC$CHROMATOGRAPHY: FLOW_GRADIENT',record, value = TRUE, fixed = TRUE),34)
ac_lc[['FLOW_RATE']] <- substring(grep('AC$CHROMATOGRAPHY: FLOW_RATE',record, value = TRUE, fixed = TRUE),30)
ac_lc[['RETENTION_TIME']] <- substring(grep('AC$CHROMATOGRAPHY: RETENTION_TIME',record, value = TRUE, fixed = TRUE),35)
ac_lc[['SOLVENT A']] <- substring(grep('AC$CHROMATOGRAPHY: SOLVENT A',record, value = TRUE, fixed = TRUE),30)
ac_lc[['SOLVENT B']] <- substring(grep('AC$CHROMATOGRAPHY: SOLVENT B',record, value = TRUE, fixed = TRUE),30)
ms_fi <- list()
ms_fi[['BASE_PEAK']] <- as.double(substring(grep('MS$FOCUSED_ION: BASE_PEAK',record, value = TRUE, fixed = TRUE),27))
ms_fi[['PRECURSOR_M/Z']] <- substring(grep('MS$FOCUSED_ION: PRECURSOR_M/Z',record, value = TRUE, fixed = TRUE),31)
ms_fi[['PRECURSOR_TYPE']] <- substring(grep('MS$FOCUSED_ION: PRECURSOR_TYPE',record, value = TRUE, fixed = TRUE),32)
if(ac_ms[['MS_TYPE']] == 'MS2'){
ms_fi[['PRECURSOR_M/Z']] <- as.double(ms_fi[['PRECURSOR_M/Z']])
}
}
namesAcms <- names(ac_ms)
namesAclc <- names(ac_lc)
namesMsfi <- names(ms_fi)
for(k in 1:length(ac_ms)){
if(length(ac_ms[[namesAcms[k]]]) == 0){
ac_ms[[namesAcms[k]]] <- NA
}
}
for(k in 1:length(ac_lc)){
if(length(ac_lc[[namesAclc[k]]]) == 0){
ac_lc[[namesAclc[k]]] <- NA
}
}
for(k in 1:length(ms_fi)){
if(length(ms_fi[[namesMsfi[k]]]) == 0){
ms_fi[[namesMsfi[k]]] <- NA
}
}
recData[['AC$MASS_SPECTROMETRY']] <- list()
recData[['AC$MASS_SPECTROMETRY']] <- ac_ms
recData[['AC$CHROMATOGRAPHY']] <- list()
recData[['AC$CHROMATOGRAPHY']] <- ac_lc
recData[['MS$FOCUSED_ION']] <- list()
recData[['MS$FOCUSED_ION']] <- ms_fi
##Can currently only read annotations of the type "m/z num {formula mass error(ppm)}"
##and'll only read it properly if there is only one annotation
##the strange conversion of the data.frames is there so RMassBank can actually write it again
PKannotationStart <- grep('PK$ANNOTATION:',record, fixed = TRUE) + 1
numpeak <- grep('PK$NUM_PEAK:',record, fixed = TRUE)
##Extract the peaks and write the data into a data.frame
PKStart <- grep('PK$PEAK:',record, fixed = TRUE) + 1
endslash <- tail(grep('//',record, fixed = TRUE),1)
if(PKStart < endslash){
splitted <- strsplit(record[PKStart:(endslash-1)]," ")
PKPeak <- matrix(nrow = endslash - PKStart, ncol = 3)
for(k in 1:length(splitted)){
splitted[[k]] <- splitted[[k]][which(splitted[[k]] != "")]
PKPeak[k,] <- splitted[[k]]
}
PKPeak <- as.data.frame(PKPeak, stringsAsFactors = FALSE)
PKPeak[] <- lapply(PKPeak, type.convert)
colnames(PKPeak) <- c("mz", "intensity", "intrel")
}
# If an annotation is present, read it and fill it into the PKPeak DF
if(length(PKannotationStart) > 0 && readAnnotation == TRUE){
if(PKannotationStart < numpeak){
splitted <- strsplit(record[PKannotationStart:(numpeak-1)]," ")
PKannotation <- matrix(nrow = numpeak - PKannotationStart, ncol = 5)
for(k in 1:length(splitted)){
splitted[[k]] <- splitted[[k]][which(splitted[[k]] != "")]
PKannotation[k,] <- splitted[[k]]
}
PKannotation <- as.data.frame(PKannotation, stringsAsFactors = FALSE)
PKannotation[] <- lapply(PKannotation, type.convert)
colnames(PKannotation) <- c("mz","formula", "formulaCount", "mzCalc", "dppm")
PKannotation$formula <- as.character(PKannotation$formula)
PKPeak <- merge(PKPeak, PKannotation, by="mz", all.x=TRUE)
}
}
namesComp <- names(recData)
for(k in 1:length(recData)){
if(length(recData[[namesComp[k]]]) == 0){
recData[[namesComp[k]]] <- NA
}
}
if(recData[["CH$SMILES"]] == "NA")
recData[["CH$SMILES"]] <- character(0)
# NA in IUPAC is fine because
# 1) it will not be processed by RMB
# 2) it's great to recognize unknown cpds (and then group by name)
# if(recData[["IUPAC"]] == "NA")
# recData[["IUPAC"]] <-
if(recData[["CH$FORMULA"]] == "NA")
recData[["CH$FORMULA"]] <- ""
# Compose the RmbSpectrum2
sp <- new("RmbSpectrum2")
sp <- addProperty(sp, "intrel", "numeric")
# set the data in the spectrum
sp <- setData(sp, PKPeak)
sp@info <- recData
# Read back from Info what was put there from RmbSpectrum2 vanilla
sp@info$res <- sp@info[['AC$MASS_SPECTROMETRY']][['RESOLUTION']]
sp@info$ce <- sp@info[['AC$MASS_SPECTROMETRY']][['COLLISION_ENERGY']]
sp@info$mode <- sp@info[['AC$MASS_SPECTROMETRY']][['FRAGMENTATION_MODE']]
sp@collisionEnergy <- as.numeric(gsub("([0-9]+).*", "\\1", sp@info$ce, perl=TRUE))
sp@precursorMz <- as.numeric(sp@info[["MS$FOCUSED_ION"]][['PRECURSOR_M/Z']])
sp@rt <- as.numeric(gsub("([0-9]+).*", "\\1", sp@info[["AC$CHROMATOGRAPHY"]][['RETENTION_TIME']], perl=TRUE))
# note: the compound info is read out from @info if reading multiple files
print(paste("Read", filename))
flush.console()
return(sp)
}
# Todo: for now, only MS2 are assumed. Also, if input is mixed pos/neg,
# output will be strange.
parseMbRecords <- function(files)
{
# parse all together
parsedFiles <- lapply(files, parseMbRecord)
# tease out single compounds
cpdList <- unlist(lapply(parsedFiles, function(f) f@info[["CH$IUPAC"]]))
cpdNoId <- which(cpdList == "NA")
cpdList[cpdNoId] <- unlist(lapply(parsedFiles, function(f) f@info[["CH$NAME"]][[1]]))
cpdSpectra <- split(parsedFiles, cpdList)
lspectra <- lapply(cpdSpectra, function(sps)
{
cpd <- new("RmbSpectraSet")
# Order by collision energies?
# Guess order from ACCESSION
spOrder <- order(unlist(lapply(sps, function(sp) (sp@info[["ACCESSION"]]))))
cpd@children <- as(sps[spOrder], "SimpleList")
# Select one spectrum to get compound data from:
sp <- sps[[1]]
cpd@mz <- as.numeric(sp@info[["MS$FOCUSED_ION"]][['PRECURSOR_M/Z']])
cpd@mode <- names(RMassBank:::.precursorTypes)[which(RMassBank:::.precursorTypes ==
sp@info[["MS$FOCUSED_ION"]][['PRECURSOR_TYPE']])]
cpd@name <- sp@info[["CH$NAME"]][[1]]
cpd@formula <- sp@info[['CH$FORMULA']]
cpd@smiles <- sp@info[['CH$SMILES']]
# Guess ID from ACCESSION
acc <- sp@info[["ACCESSION"]]
cpd@id <- substr(acc, 3, 6)
cpd
})
spectra <- as(lspectra,"SimpleList")
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.