RMassBank: Workflow to process tandem MS files and build MassBank records

Documented in addPeaks annotator.default createMolfile exportMassbank flatten gatherData gatherDataBabel gatherDataUnknown gatherPubChem loadInfolist loadInfolists makeMollist mbWorkflow .parseTitleString readMbdata resetInfolists

# Script for writing MassBank files

#testtest change
#' Load MassBank compound information lists
#' 
#' Loads MassBank compound information lists (i.e. the lists which were created
#' in the first two steps of the MassBank \code{\link{mbWorkflow}} and
#' subsequently edited by hand.).
#' 
#' \code{resetInfolists} clears the information lists, i.e. it creates a new
#' empty list in \code{mbdata_archive}. \code{loadInfolist} loads a single CSV
#' file, whereas \code{loadInfolists} loads a whole directory.
#' 
#' @aliases loadInfolists loadInfolist resetInfolists
#' @usage loadInfolists(mb, path)
#' 
#'  loadInfolist(mb, fileName)
#' 
#'  resetInfolists(mb)
#' @param path Directory in which the namelists reside. All CSV files in this
#' directory will be loaded.
#' @param fileName A single namelist to be loaded.
#' @param mb The \code{mbWorkspace} to load/reset the lists in.
#' @return The new workspace with loaded/reset lists.
#' @author Michael Stravs
#' @examples
#' 
#' #
#' \dontrun{mb <- resetInfolists(mb)
#' 	mb <- loadInfolist(mb, "my_csv_infolist.csv")}
#' 
#' @export
loadInfolists <- function(mb, path)
{
  archivefiles <- list.files(path, ".csv", full.names=TRUE)
  for(afile in archivefiles)
    mb <- loadInfolist(mb, afile)
  return(mb)
}

# Load an "infolist". This loads a CSV file which should contain the entries
# edited and controlled by hand. All compound infos from fileName are added into the
# global mbdata_archive. Entries with a cpdID which was already present, are substituted
# by new entries from the fileName file.
#' @export
loadInfolist <- function(mb, fileName)
{
  # Prime a new infolist if it doesn't exist
  if(ncol(mb@mbdata_archive) == 0)
    mb <- resetInfolists(mb)
  mbdata_new <- read.csv(fileName, sep=",", stringsAsFactors=FALSE)
  # Legacy check for loading the Uchem format files.
  # Even if dbname_* are not used downstream of here, it's still good to keep them
  # for debugging reasons.
  n <- colnames(mbdata_new)
  cols <- c("id","dbcas","dataused")
  
  # Check if comma-separated or semicolon-separated
  d <- setdiff(cols, n)
  if(length(d)>0){
		mbdata_new <- read.csv2(fileName, stringsAsFactors=FALSE)
		n <- colnames(mbdata_new)
		d2 <- setdiff(cols, n)
		if(length(d2) > 0){
			stop("Some columns are missing in the infolist.")
		}
	}
  if("dbname_d" %in% colnames(mbdata_new))
  {
    colnames(mbdata_new)[[which(colnames(mbdata_new)=="dbname_d")]] <- "dbname"
    # dbname_e will be dropped because of the select= in the subset below.
  }
  if("COMMENT.EAWAG_UCHEM_ID" %in% colnames(mbdata_new))
    colnames(mbdata_new)[[which(colnames(mbdata_new)== "COMMENT.EAWAG_UCHEM_ID")]] <-
      "COMMENT.ID"
  
  # Clear from padding spaces and NAs
  mbdata_new <- as.data.frame(x = t(apply(mbdata_new, 1, function(r) 
    {
    # Substitute empty spaces by real NA values
    r[which(r == "")] <- NA
    # Trim spaces (in all non-NA fields)
    r[which(!is.na(r))] <- sub(pattern = "^ *([^ ]+) *$", replacement = "\\1", x = r[which(!is.na(r))])
    return(r)
  })), stringsAsFactors = FALSE)
  # use only the columns present in mbdata_archive, no other columns added in excel
  colNames <- colnames(mb@mbdata_archive)
  commentColNames <- colnames(mbdata_new)[grepl(x = colnames(mbdata_new), pattern = "^COMMENT\\.(?!CONFIDENCE)(?!ID)", perl = TRUE)]
  colNames <- c(colNames, commentColNames)

  ## The read infolists might not have all required / expected columns
  missingColNames <- colNames[! colNames %in% colnames(mbdata_new)]
  if (length(missingColNames >0)) {
    missingCols <- matrix(NA, ncol=length(missingColNames))
    colnames(missingCols) <- missingColNames
    mbdata_new <- cbind(mbdata_new, missingCols)
  }
    
  mbdata_new <- mbdata_new[, colNames]
  # substitute the old entires with the ones from our files
  # then find the new (previously inexistent) entries, and rbind them to the table
  new_entries <- setdiff(mbdata_new$id, mb@mbdata_archive$id)
  old_entries <- intersect(mbdata_new$id, mb@mbdata_archive$id)
  
  for(colname in colnames(mb@mbdata_archive))
    mb@mbdata_archive[, colname] <- as.character(mb@mbdata_archive[, colname])
  
  for(entry in old_entries)
    mb@mbdata_archive[mb@mbdata_archive$id == entry,] <- mbdata_new[mbdata_new$id == entry,]
  mb@mbdata_archive <- rbind(mb@mbdata_archive, mbdata_new[mbdata_new$id==new_entries,])
  
  for(colname in colnames(mb@mbdata_archive))
    mb@mbdata_archive[, colname] <- as.factor(mb@mbdata_archive[, colname])
  
  return(mb)
}


# Resets the mbdata_archive to an empty version.
#' @export
resetInfolists <- function(mb) 
{    
	mb@mbdata_archive <-
			structure(list(X = integer(0), id = integer(0), dbcas = character(0), 
							dbname = character(0), dataused = character(0), COMMENT.CONFIDENCE = character(0), 
							COMMENT.ID = integer(0), CH.NAME1 = character(0), 
							CH.NAME2 = character(0), CH.NAME3 = character(0), CH.NAME4 = character(0), CH.NAME5 = character(0), CH.COMPOUND_CLASS = character(0), 
							CH.FORMULA = character(0), CH.EXACT_MASS = numeric(0), CH.SMILES = character(0), 
							CH.IUPAC = character(0), CH.LINK.CAS = character(0), CH.LINK.CHEBI = integer(0), 
							CH.LINK.HMDB = character(0), CH.LINK.KEGG = character(0), CH.LINK.LIPIDMAPS = character(0), 
							CH.LINK.PUBCHEM = character(0), CH.LINK.INCHIKEY = character(0), 
							CH.LINK.CHEMSPIDER = integer(0), CH.LINK.COMPTOX = character(0)), .Names = c("X", "id", "dbcas", 
							"dbname", "dataused", "COMMENT.CONFIDENCE", "COMMENT.ID", 
              "CH.NAME1", "CH.NAME2", "CH.NAME3", "CH.NAME4", "CH.NAME5", "CH.COMPOUND_CLASS", "CH.FORMULA", 
							"CH.EXACT_MASS", "CH.SMILES", "CH.IUPAC", "CH.LINK.CAS", "CH.LINK.CHEBI", 
							"CH.LINK.HMDB", "CH.LINK.KEGG", "CH.LINK.LIPIDMAPS", "CH.LINK.PUBCHEM",
							"CH.LINK.INCHIKEY", "CH.LINK.CHEMSPIDER", "CH.LINK.COMPTOX"), row.names = integer(0), class = "data.frame")
	if(getOption("RMassBank")$include_sp_tags)
	{
	  mb@mbdata_archive["SP.SAMPLE"] <- character(0)
	}
	return(mb)
	
}

# The workflow function, i.e. (almost) the only thing you actually need to call.
# See below for explanation of steps.
#' MassBank record creation workflow
#' 
#' Uses data generated by \code{\link{msmsWorkflow}} to create MassBank records.
#' 
#' See the vignette \code{vignette("RMassBank")} for detailed informations about the usage.
#' 
#' Steps:
#' 
#' Step 1: Find which compounds don't have annotation information yet. For these
#' 		 compounds, pull information from several databases (using gatherData).
#' 
#' Step 2: If new compounds were found, then export the infolist.csv and stop the workflow.
#' 		Otherwise, continue.
#' 
#' Step 3: Take the archive data (in table format) and reformat it to MassBank tree format.
#' 
#' Step 4: Compile the spectra. Using the skeletons from the archive data, create
#'   MassBank records per compound and fill them with peak data for each spectrum.
#'   Also, assign accession numbers based on scan mode and relative scan no.
#' 
#' Step 5: Convert the internal tree-like representation of the MassBank data into
#'  flat-text string arrays (basically, into text-file style, but still in memory)
#' 
#' Step 6: For all OK records, generate a corresponding molfile with the structure
#'   of the compound, based on the SMILES entry from the MassBank record. (This molfile
#'   is still in memory only, not yet a physical file)
#' 
#' Step 7: If necessary, generate the appropriate subdirectories, and actually write
#'   the files to disk.
#' 
#' Step 8: Create the list.tsv in the molfiles folder, which is required by MassBank
#'   to attribute substances to their corresponding structure molfiles. 
#' 
#' @param steps Which steps in the workflow to perform.
#' @param infolist_path A path where to store newly downloaded compound informations,
#' 			which should then be manually inspected.
#' @param mb The \code{mbWorkspace} to work in.
#' @param gatherData A variable denoting whether to retrieve information using several online databases \code{gatherData= "online"}
#' or to use the local babel installation \code{gatherData= "babel"}. Note that babel is used either way, if a directory is given 
#' in the settings. This setting will be ignored if retrieval is set to "standard"
#' @param filter If \code{TRUE}, the peaks will be filtered according to the standard processing workflow in RMassBank - 
#' only the best formula for a peak is retained, and only peaks passing multiplicity filtering are retained. If FALSE, it is assumed
#' that the user has already done filtering, and all peaks in the spectrum should be printed in the record (with or without formula.)
#' @return The processed \code{mbWorkspace}.
#' @seealso \code{\link{mbWorkspace-class}}
#' @author Michael A. Stravs, Eawag <michael.stravs@@eawag.ch>
#' @examples \dontrun{
#' 		mb <- newMbWorkspace(w) # w being a msmsWorkspace
#' 		mb <- loadInfolists(mb, "D:/myInfolistPath")
#' 		mb <- mbWorkflow(mb, steps=c(1:3), "newinfos.csv")
#' 		
#' }
#' @export
mbWorkflow <- function(mb, steps=c(1,2,3,4,5,6,7,8), infolist_path="./infolist.csv", gatherData = "online", filter = TRUE)
{
    # Step 1: Find which compounds don't have annotation information yet. For these
    # compounds, pull information from CTS (using gatherData).
    if(1 %in% steps)
    {
        mbdata_ids <- lapply(selectSpectra(mb@spectra, "found", "object"), function(spec) spec@id)
                message("mbWorkflow: Step 1. Gather info from several databases")
      # Which IDs are not in mbdata_archive yet?
      new_ids <- setdiff(as.numeric(unlist(mbdata_ids)), mb@mbdata_archive$id)
      mb@mbdata <- lapply(new_ids, function(id) 
      {
            if(findLevel(id,TRUE) == "standard"){
            if(gatherData == "online"){
                    
                d <- gatherData(id)
            } 
            if(gatherData == "babel"){
                    # message("mbWorkflow: Step 1. Gather info using babel")
                d <- gatherDataBabel(id)
            }
        } else{
                # message("mbWorkflow: Step 1. Gather no info - Unknown structure")
                d <- gatherDataUnknown(id, mb@spectra[[1]]@mode, retrieval=findLevel(id,TRUE))
        }
		message(paste(id, ": ", d$dataused, sep=''))
        return(d)
      })
  }
  # Step 2: If new compounds were found, then export the infolist.csv and stop the workflow.
  # Otherwise, continue!
  if(2 %in% steps)
  {
	message("mbWorkflow: Step 2. Export infolist (if required)")
    if(length(mb@mbdata)>0)
    {
      mbdata_mat <- flatten(mb@mbdata)
      write.csv(as.data.frame(mbdata_mat),infolist_path, na="")
            message(paste("The file", infolist_path, "was generated with new compound information. Please check and edit the table, and add it to your infolist folder."))
      return(mb)
    }
    else
      message("No new data added.")
  }
  # Step 3: Take the archive data (in table format) and reformat it to MassBank tree format.
  if(3 %in% steps)
  {
	message("mbWorkflow: Step 3. Data reformatting")
    mb@mbdata_relisted <- apply(mb@mbdata_archive, 1, readMbdata)
  }
  # Step 4: Compile the spectra! Using the skeletons from the archive data, create
  # MassBank records per compound and fill them with peak data for each spectrum.
  # Also, assign accession numbers based on scan mode and relative scan no.
  if(4 %in% steps)
  {
	  message("mbWorkflow: Step 4. Spectra compilation")
	  mb@compiled <- lapply(
			  selectSpectra(mb@spectra, "found", "object"),
			  function(r) {
				  message(paste("Compiling: ", r@name, sep=""))
				  mbdata <- mb@mbdata_relisted[[which(mb@mbdata_archive$id == as.numeric(r@id))]]
				  if(filter)
            res <- buildRecord(r, mbdata=mbdata, additionalPeaks=mb@additionalPeaks, filter = filterOK & best)
				  else
				    res <- buildRecord(r, mbdata=mbdata, additionalPeaks=mb@additionalPeaks)
          return(res)
			  })
	  # check which compounds have useful spectra
	  mb@ok <- which(!is.na(mb@compiled) & !(lapply(mb@compiled, length)==0))
        #mb@ok <- which(!is.na(mb@compiled) & !(lapply(mb@compiled, length)==0))
	  mb@problems <- which(is.na(mb@compiled))
	  mb@compiled_ok <- mb@compiled[mb@ok]
    mb@compiled_notOk <- mb@compiled[!mb@ok]
  }
  # Step 5: Convert the internal tree-like representation of the MassBank data into
  # flat-text string arrays (basically, into text-file style, but still in memory)
  if(5 %in% steps)
  {
	message("mbWorkflow: [Legacy Step 5. Flattening records] ignored")
    #mb@mbfiles <- lapply(mb@compiled_ok, function(cpd) toMassbank(cpd, mb@additionalPeaks))
    #mb@mbfiles_notOk <- lapply(mb@compiled_notOk, function(c) lapply(c, toMassbank))
  }
  # Step 6: For all OK records, generate a corresponding molfile with the structure
  # of the compound, based on the SMILES entry from the MassBank record. (This molfile
  # is still in memory only, not yet a physical file)
  if(6 %in% steps)
  {
    if(RMassBank.env$export.molfiles){
      message("mbWorkflow: Step 6. Generate molfiles")
      mb@molfile <- lapply(mb@compiled_ok, function(c) createMolfile(as.numeric(c@id)))
    } else
      warning("RMassBank is configured not to export molfiles (RMassBank.env$export.molfiles). Step 6 is therefore ignored.")
    }
  # Step 7: If necessary, generate the appropriate subdirectories, and actually write
  # the files to disk.
  if(7 %in% steps)
  {
	message("mbWorkflow: Step 7. Generate subdirs and export")
        
        ## create folder
        filePath_recData_valid   <- file.path(getOption("RMassBank")$annotations$entry_prefix, "recdata")
        filePath_recData_invalid <- file.path(getOption("RMassBank")$annotations$entry_prefix, "recdata_invalid")
        filePath_molData         <- file.path(getOption("RMassBank")$annotations$entry_prefix, "moldata")
        
        if(!file.exists(filePath_recData_valid)) if(!dir.create(filePath_recData_valid,recursive=TRUE))  stop(paste("Could not create folder", filePath_recData_valid))
        if(RMassBank.env$export.molfiles)
          if(!file.exists(filePath_molData)) if(!dir.create(filePath_molData,recursive=TRUE))  stop(paste("Could not create folder", filePath_molData))
        if(RMassBank.env$export.invalid & length(mb@mbfiles_notOk) > 0)
          if(!file.exists(filePath_recData_invalid)) if(!dir.create(filePath_recData_invalid,recursive=TRUE))  stop(paste("Could not create folder", filePath_recData_invalid))
        
        if(length(mb@molfile) == 0)
            mb@molfile <- as.list(rep(x = NA, times = length(mb@compiled_ok)))
        
        ## export valid spectra
        for(cnt in seq_along(mb@compiled_ok)){
            exportMassbank_recdata(
                mb@compiled_ok[[cnt]], 
                recDataFolder = filePath_recData_valid
            )
            if(RMassBank.env$export.molfiles)
              exportMassbank_moldata(
                mb@compiled_ok[[cnt]], 
                molfile = mb@molfile[[cnt]], 
                molDataFolder = filePath_molData
              )
        }
        
        ## export invalid spectra
            for(cnt in seq_along(mb@compiled_notOk))
                exportMassbank_recdata(
                    compiled = mb@mbfiles_notOk[[cnt]], 
                    recDataFolder = filePath_recData_invalid
                )
  }
  # Step 8: Create the list.tsv in the molfiles folder, which is required by MassBank
  # to attribute substances to their corresponding structure molfiles.
  if(8 %in% steps)
  {
        if(RMassBank.env$export.molfiles){
          message("mbWorkflow: Step 8. Create list.tsv")
          makeMollist(compiled = mb@compiled_ok)
        } else
            warning("RMassBank is configured not to export molfiles (RMassBank.env$export.molfiles). Step 8 is therefore ignored.")
  }
  return(mb)
}


# Calls openbabel and converts the SMILES code string (or retrieves the SMILES code from
# the ID, and then calls openbabel) to create a molfile in text format.
# If fileName is given, the file is directly stored. Otherwise, it is returned as a 
# character array.
#' Create MOL file for a chemical structure
#' 
#' Creates a MOL file (in memory or on disk) for a compound specified by the
#' compound ID or by a SMILES code.
#' 
#' The function invokes OpenBabel (and therefore needs a correctly set
#' OpenBabel path in the RMassBank settings), using the SMILES code retrieved
#' with \code{findSmiles} or using the SMILES code directly. The current
#' implementation of the workflow uses the latter version, reading the SMILES
#' code directly from the MassBank record itself.
#' 
#' @usage createMolfile(id_or_smiles, fileName = FALSE)
#' @param id_or_smiles The compound ID or a SMILES code.
#' @param fileName If the filename is set, the file is written directly to disk
#' using the specified filename. Otherwise, it is returned as a text array.
#' @return A character array containing the MOL/SDF format file, ready to be
#' written to disk.
#' @author Michael Stravs
#' @seealso \code{\link{findSmiles}}
#' @references OpenBabel: \url{http://openbabel.org}
#' @examples
#' 
#' # Benzene:
#' \dontrun{
#' createMolfile("C1=CC=CC=C1")
#' }
#' 
#' @export
createMolfile <- function(id_or_smiles, fileName = FALSE)
{
	.checkMbSettings()
	babeldir <- getOption("RMassBank")$babeldir
    
	if(!is.numeric(id_or_smiles)){
		smiles <- id_or_smiles
    } else{
        if(findLevel(id_or_smiles,TRUE) != "standard"){
            return(c(" ","$$$$"))
        }
		smiles <- findSmiles(id_or_smiles)
    }
    # if no babeldir was set, get the result from cactus.
	if(is.na(babeldir))
	{
		res <- getCactus(smiles, "sdf")
		
		if(any(is.na(res))){
			res <- getPcSDF(smiles)
		}
		if(any(is.na(res))){
			stop("Pubchem and Cactus both seem to be down.")
		}
		if(is.character(fileName))
			writeLines(res, fileName)
	}
	# otherwise use the better-tested OpenBabel toolkit.
	else
	{
		if(!is.character(fileName))
			cmd <- paste(babeldir, "babel -ismi -osdf -d -b --gen2D", sep='')
		else
			cmd <- paste(babeldir, "babel -ismi -osdf ", fileName , " -d -b --gen2D", sep='')
		res <- system(cmd, intern=TRUE, input=smiles, ignore.stderr=TRUE)
		# If we wrote to a file, read it back as return value.
		if(is.character(fileName))
			res <- readLines(fileName)
	} 
  #return(c(" ","$$$$"))
	return(res)
}



# Retrieve annotation data for a compound, from the internet service Pubchem
#' Retrieve supplemental annotation data from Pubchem
#' 
#' Retrieves annotation data for a compound from the internet service Pubchem 
#' based on the inchikey generated by babel or Cactus
#' 
#' The data retrieved is the Pubchem CID, a synonym from the Pubchem database,
#' the IUPAC name (using the preferred if available) and a Chebi link
#' 
#' @usage gatherPubChem(key)
#' @param key An Inchi-Key
#' @return Returns a list with 4 slots:
#' \code{PcID} The Pubchem CID
#' \code{Synonym} An arbitrary synonym for the compound name
#' \code{IUPAC} A IUPAC-name (preferred if available)
#' \code{Chebi} The identification number of the chebi database
#' @author Erik Mueller
#' @seealso \code{\link{mbWorkflow}}
#' @references Pubchem REST:
#' \url{https://pubchem.ncbi.nlm.nih.gov/pug_rest/PUG_REST.html}
#' Chebi:
#' \url{http://www.ebi.ac.uk/chebi}
#' @examples
#' 
#' # Gather data for compound ID 131
#' \dontrun{gatherPubChem("QEIXBXXKTUNWDK-UHFFFAOYSA-N")}
#' 
#' @export
gatherPubChem <- function(key){
	
	PubChemData <- list()
	
	##Trycatches are there because pubchem has connection issues 1 in 50 times.
	##Write NA into the respective fields if something goes wrong with the conenction or the data.
	
	##Retrieve Pubchem CID
	tryCatch(
		PubChemData$PcID <- getPcId(key),
		error=function(e){
		PubChemData$PcID <<- NA
	})
	
	##Retrieve a synonym to the name
	tryCatch(
		PubChemData$Synonym <- getPcSynonym(key),
		error=function(e){
		PubChemData$Synonym <<- NA
	})
	
	##Retrieve the IUPAC-name
	tryCatch(
		PubChemData$IUPAC <- getPcIUPAC(key),
		error=function(e){
		PubChemData$IUPAC <<- NA
	})
	
	##Retrieve the Chebi-ID
	tryCatch(
		PubChemData$Chebi <- getPcCHEBI(key),
		error=function(e){
		PubChemData$Chebi <<- NA
	})
	
	return(PubChemData)
}

# Retrieve annotation data for a compound, from the internet services Cactvs, Pubchem, Chemspider and CTS.
#' Retrieve annotation data
#' 
#' Retrieves annotation data for a compound from the internet services CTS, Pubchem, Chemspider and
#' Cactvs, based on the SMILES code and name of the compounds stored in the
#' compound list.
#' 
#' Composes the "upper part" of a MassBank record filled with chemical data
#' about the compound: name, exact mass, structure, CAS no., links to PubChem,
#' KEGG, ChemSpider.  The instrument type is also written into this block (even
#' if not strictly part of the chemical information). Additionally, index
#' fields are added at the start of the record, which will be removed later:
#' \code{id, dbcas, dbname} from the compound list, \code{dataused} to indicate
#' the used identifier for CTS search (\code{smiles} or \code{dbname}).
#' 
#' Additionally, the fields \code{ACCESSION} and \code{RECORD_TITLE} are
#' inserted empty and will be filled later on.
#' 
#' @usage gatherData(id)
#' @aliases gatherData
#' @param id The compound ID.
#' @return Returns a list of type \code{list(id= \var{compoundID}, ...,
#' 'ACCESSION' = '', 'RECORD_TITLE' = '', )} etc. %% ...
#' @author Michael Stravs
#' @seealso \code{\link{mbWorkflow}}
#' @references Chemical Translation Service:
#' \url{http://uranus.fiehnlab.ucdavis.edu:8080/cts/homePage} 
#' cactus Chemical Identifier Resolver: 
#' \url{http://cactus.nci.nih.gov/chemical/structure}
#' MassBank record format:
#' \url{http://www.massbank.jp/manuals/MassBankRecord_en.pdf}
#' Pubchem REST:
#' \url{https://pubchem.ncbi.nlm.nih.gov/pug_rest/PUG_REST.html}
#' Chemspider InChI conversion:
#' \url{https://www.chemspider.com/InChI.asmx}
#' @examples
#' 
#' # Gather data for compound ID 131
#' \dontrun{gatherData(131)}
#' 
#' @export
gatherData <- function(id)
{ 
	##Preamble: Is a babeldir supplied?
	##If yes, use it
	
	.checkMbSettings()
	usebabel=TRUE
	babeldir <- getOption("RMassBank")$babeldir
	
	if(is.na(babeldir)){
		usebabel=FALSE
	}
	
	
	##Get all useful information from the local "database" (from the CSV sheet)
	
	smiles <- findSmiles(id)
	mass <- findMass(smiles)
	dbcas <- findCAS(id)
	dbname <- findName(id)
	if(is.na(dbname)) dbname <- ""
	if(is.na(dbcas)) dbcas <- ""
	iupacName <- dbname
	synonym <- dbname
	formula <- findFormula(id)
	
	##Convert SMILES to InChI key via Cactvs or babel. CTS doesn't "interpret" the SMILES per se,
	##it just matches identical known SMILES, so we need to convert to a "searchable" and
	##standardized format beforehand. Other databases are able to interpret the smiles.
	
	if(usebabel){
		cmdinchikey <- paste0(babeldir, 'obabel -:"',smiles,'" ', '-oinchikey')
		inchikey_split <- system(cmdinchikey, intern=TRUE, input=smiles, ignore.stderr=TRUE)
	} else{
		inchikey <- getCactus(smiles, 'stdinchikey')
		if(!is.na(inchikey)){
			##Split the "InChiKey=" part off the key
			inchikey_split <- strsplit(inchikey, "=", fixed=TRUE)[[1]][[2]]
		} else{
		    inchikey_split <- getPcInchiKey(smiles)
		}
	}
	
	##Use Pubchem to retrieve information
	PcInfo <- gatherPubChem(inchikey_split)
	
	if(!is.null(PcInfo$Synonym) & !is.na(PcInfo$Synonym)){
		synonym <- PcInfo$Synonym
	}
	
	if(!is.null(PcInfo$IUPAC) & !is.na(PcInfo$IUPAC)){
		iupacName <- PcInfo$IUPAC
	}
	
	##Get Chemspider-ID
	csid <- getCSID(inchikey_split)
	
	if(is.na(csid)){
		##Get ChemSpider ID from Cactus if the Chemspider page is down
		csid <- getCactus(inchikey_split, 'chemspider_id')
	}
	
	##Get CompTox
	comptox <- getCompTox(inchikey_split)
	
	if(is.null(comptox)){
	  comptox <- NA
	}
	
	##Use CTS to retrieve information
	CTSinfo <- getCtsRecord(inchikey_split)
		
	if((CTSinfo[1] == "Sorry, we couldn't find any matching results") || is.null(CTSinfo[1]))
	{
		CTSinfo <- NA
	}
	
	##List the names
	if(iupacName == ""){
		warning(paste0("Compound ID ",id,": no IUPAC name could be identified."))
	}

	if(toupper(dbname) == toupper(synonym)){
		synonym <- dbname
	}
	
	if(toupper(dbname) == toupper(iupacName)){
		iupacName <- dbname
	}
	
	if(toupper(synonym) == toupper(iupacName)){
		synonym <- iupacName
	}
	
	names <- as.list(unique(c(dbname, synonym, iupacName)))
	
	##If no name is found, it must be supplied in one way or another
	if(all(sapply(names, function(x) x == ""))){
		stop("RMassBank wasn't able to extract a usable name for this compound from any database. Please supply a name manually.")
	}
	
	# Start to fill the MassBank record.
	# The top 4 entries will not go into the final record; they are used to identify
	# the record and also to facilitate manual editing of the exported record table.
	mbdata <- list()
	mbdata[['id']] <- id
	mbdata[['dbcas']] <- dbcas
	mbdata[['dbname']] <- dbname
	mbdata[['dataused']] <- "smiles"
	mbdata[['ACCESSION']] <- ""
	mbdata[['RECORD_TITLE']] <- ""
	mbdata[['DATE']] <- format(Sys.Date(), "%Y.%m.%d")
	mbdata[['AUTHORS']] <- getOption("RMassBank")$annotations$authors
	mbdata[['LICENSE']] <- getOption("RMassBank")$annotations$license
	mbdata[['COPYRIGHT']] <- getOption("RMassBank")$annotations$copyright
	# Confidence annotation and internal ID annotation.
	# The ID of the compound will be written like:
	# COMMENT: EAWAG_UCHEM_ID 1234
	# if annotations$internal_id_fieldname is set to "EAWAG_UCHEM_ID"
	mbdata[["COMMENT"]] <- list()
  if(findLevel(id) == "0"){
	mbdata[["COMMENT"]][["CONFIDENCE"]] <- getOption("RMassBank")$annotations$confidence_comment
	} else{
        level <- findLevel(id)
        if(level %in% c("1","1a")){
             mbdata[["COMMENT"]][["CONFIDENCE"]] <- "Reference Standard (Level 1)"
        }
        if(level == c("2")){
             mbdata[["COMMENT"]][["CONFIDENCE"]] <- "Probable structure, tentative identification (Level 2)"
        }
        if(level == c("2a")){
             mbdata[["COMMENT"]][["CONFIDENCE"]] <- "Probable structure via library match, tentative identification (Level 2a)"
        }
        if(level == c("2b")){
             mbdata[["COMMENT"]][["CONFIDENCE"]] <- "Probable structure via diagnostic evidence, tentative identification (Level 2b)"
        }
        if(level == c("3")){
             mbdata[["COMMENT"]][["CONFIDENCE"]] <- "Tentative identification only (Level 3)"
        }
        if(level == c("3a")){
             mbdata[["COMMENT"]][["CONFIDENCE"]] <- "Tentative identification: most likely structure (Level 3)"
        }
        if(level == c("3b")){
             mbdata[["COMMENT"]][["CONFIDENCE"]] <- "Tentative identification: isomers possible (Level 3)"
        }
        if(level == c("3c")){
             mbdata[["COMMENT"]][["CONFIDENCE"]] <- "Tentative identification: substance class known (Level 3)"
        }
        if(level == c("3d")){
             mbdata[["COMMENT"]][["CONFIDENCE"]] <- "Tentative identification: best match only (Level 3)"
        }
        if(level == c("4")){
             mbdata[["COMMENT"]][["CONFIDENCE"]] <- "Tentative identification: molecular formula only (Level 4)"
        }
        if(level == c("5")){
             mbdata[["COMMENT"]][["CONFIDENCE"]] <- "Tentative identification: structure and formula unknown (Level 5)"
        }
	}
	
	mbdata[["COMMENT"]][["ID"]] = id
  
  ## add generic COMMENT information
  rowIdx <- which(.listEnvEnv$listEnv$compoundList$ID == id)
  properties      <- colnames(.listEnvEnv$listEnv$compoundList)
  properties2     <- gsub(x = properties, pattern = "^COMMENT ", replacement = "")
  theseProperties <- grepl(x = properties, pattern = "^COMMENT ")
  theseProperties <- theseProperties & (!(unlist(.listEnvEnv$listEnv$compoundList[rowIdx, ]) == "NA" | is.na(unlist(.listEnvEnv$listEnv$compoundList[rowIdx, ]))))
  mbdata[["COMMENT"]][properties2[theseProperties]] <- unlist(.listEnvEnv$listEnv$compoundList[rowIdx, theseProperties])
  
	# here compound info starts
	mbdata[['CH$NAME']] <- names
	# Currently we use a fixed value for Compound Class, since there is no useful
	# convention of what should go there and what shouldn't, and the field is not used
	# in search queries.
	mbdata[['CH$COMPOUND_CLASS']] <- getOption("RMassBank")$annotations$compound_class
	mbdata[['CH$FORMULA']] <- formula
	mbdata[['CH$EXACT_MASS']] <- mass
	mbdata[['CH$SMILES']] <- smiles
	
	if(usebabel){
		cmdinchi <- paste0(babeldir, 'obabel -:"',smiles,'" ', '-oinchi')
		mbdata[['CH$IUPAC']] <- system(cmdinchi, intern=TRUE, input=smiles, ignore.stderr=TRUE)
	} else{
		mbdata[['CH$IUPAC']] <- getCactus(smiles, "stdinchi")
	}
	

	
	# Add all CH$LINK fields present in the compound datasets
	link <- list()
	# CAS
	if(!is.na(CTSinfo[1])){
		if("CAS" %in% CTS.externalIdTypes(CTSinfo))
		{
			# Prefer database CAS if it is also listed in the CTS results.
			# otherwise take the shortest one.
			cas <- CTS.externalIdSubset(CTSinfo,"CAS")
			if(dbcas %in% cas)
				link[["CAS"]] <- dbcas
			else
				link[["CAS"]] <- cas[[which.min(nchar(cas))]]
		} else{
			if(dbcas != ""){
				link[["CAS"]] <- dbcas
			}
		}
	} else{
		if(dbcas != ""){
			link[["CAS"]] <- dbcas
		}
	}
	
	
	# CHEBI
	if(is.na(PcInfo$Chebi[1])){
		if(!is.na(CTSinfo[1])){
			if("ChEBI" %in% CTS.externalIdTypes(CTSinfo))
			{
				# Cut off front "CHEBI:" if present
				chebi <- CTS.externalIdSubset(CTSinfo,"ChEBI")
				chebi <- chebi[[which.min(nchar(chebi))]]
				chebi <- strsplit(chebi,":")[[1]]
				link[["CHEBI"]] <- chebi[[length(chebi)]]
			}
		}
	} else{
		chebi <- PcInfo$Chebi
		chebi <- chebi[[which.min(nchar(chebi))]]
		chebi <- strsplit(chebi,":")[[1]]
		link[["CHEBI"]] <- chebi[[length(chebi)]]
	}
	# HMDB
	if(!is.na(CTSinfo[1])){
		if("Human Metabolome Database" %in% CTS.externalIdTypes(CTSinfo))
			link[["HMDB"]] <- CTS.externalIdSubset(CTSinfo,"HMDB")[[1]]
		# KEGG
		if("KEGG" %in% CTS.externalIdTypes(CTSinfo))
			link[["KEGG"]] <- CTS.externalIdSubset(CTSinfo,"KEGG")[[1]]
		# LipidMAPS
		if("LipidMAPS" %in% CTS.externalIdTypes(CTSinfo))
			link[["LIPIDMAPS"]] <- CTS.externalIdSubset(CTSinfo,"LipidMAPS")[[1]]
	}
	# PubChem CID
	if(is.na(PcInfo$PcID[1])){
		if(!is.na(CTSinfo[1])){
			if("PubChem CID" %in% CTS.externalIdTypes(CTSinfo))
			{
				pc <- CTS.externalIdSubset(CTSinfo,"PubChem CID")
				link[["PUBCHEM"]] <- paste0(min(pc))
			}
		}
	} else{
		link[["PUBCHEM"]] <- PcInfo$PcID[1]
	}
	
	
	if(!is.null(link[["PUBCHEM"]])){
		if(substr(link[["PUBCHEM"]],1,4) != "CID:"){
			link[["PUBCHEM"]] <- paste0("CID:", link[["PUBCHEM"]])
		}
	}
	
	link[["INCHIKEY"]] <- inchikey_split
	link[["COMPTOX"]] <- comptox
	if(length(csid)>0) if(any(!is.na(csid))) link[["CHEMSPIDER"]] <- min(as.numeric(as.character(csid[!is.na(csid)])))
	mbdata[['CH$LINK']] <- link
		
	return(mbdata)  
}

# Retrieve annotation data for a compound, using only babel
#' Retrieve annotation data
#' 
#' Retrieves annotation data for a compound by using babel,
#' based on the SMILES code and name of the compounds stored in the
#' compound list.
#' 
#' Composes the "upper part" of a MassBank record filled with chemical data
#' about the compound: name, exact mass, structure, CAS no..  
#' The instrument type is also written into this block (even
#' if not strictly part of the chemical information). Additionally, index
#' fields are added at the start of the record, which will be removed later:
#' \code{id, dbcas, dbname} from the compound list.
#' 
#' Additionally, the fields \code{ACCESSION} and \code{RECORD_TITLE} are
#' inserted empty and will be filled later on.
#' 
#' This function is an alternative to gatherData, in case CTS is down or if information
#' on one or more of the compounds in the compound list are sparse
#'
#' @usage gatherDataBabel(id)
#' @param id The compound ID.
#' @return Returns a list of type \code{list(id= \var{compoundID}, ...,
#' 'ACCESSION' = '', 'RECORD_TITLE' = '', )} etc. %% ...
#' @author Michael Stravs, Erik Mueller
#' @seealso \code{\link{mbWorkflow}}
#' @references MassBank record format:
#' \url{http://www.massbank.jp/manuals/MassBankRecord_en.pdf}
#' @examples
#' 
#' # Gather data for compound ID 131
#' \dontrun{gatherDataBabel(131)}
#' 
#' @export
gatherDataBabel <- function(id){
		.checkMbSettings()
		babeldir <- getOption("RMassBank")$babeldir
		smiles <- findSmiles(id)
			
		
		# if no babeldir was set, throw an error that says that either CTS or babel have to be used
		if(is.na(babeldir))
		{
			stop("No babeldir supplied; It is currently not possible to convert the information without either babel or CTS")
		} else {
			###Babel conversion
			cmdinchikey <- paste0(babeldir, 'obabel -:"',smiles,'" ', '-oinchikey')
			inchikey <- system(cmdinchikey, intern=TRUE, input=smiles, ignore.stderr=TRUE)
			cmdinchi <- paste0(babeldir, 'obabel -:"',smiles,'" ', '-oinchi')
			inchi <- system(cmdinchi, intern=TRUE, input=smiles, ignore.stderr=TRUE)
			
			##Read from Compoundlist
			smiles <- findSmiles(id)
			mass <- findMass(smiles)
			dbcas <- findCAS(id)
			dbname <- findName(id)
			if(is.na(dbname)) dbname <- ""
			if(is.na(dbcas)) dbcas <- ""
			formula <- findFormula(id)
			
			##Create 
			mbdata <- list()
			mbdata[['id']] <- id
			mbdata[['dbcas']] <- dbcas
			mbdata[['dbname']] <- dbname
			mbdata[['dataused']] <- "smiles"
			mbdata[['ACCESSION']] <- ""
			mbdata[['RECORD_TITLE']] <- ""
			mbdata[['DATE']] <- format(Sys.Date(), "%Y.%m.%d")
			mbdata[['AUTHORS']] <- getOption("RMassBank")$annotations$authors
			mbdata[['LICENSE']] <- getOption("RMassBank")$annotations$license
			mbdata[['COPYRIGHT']] <- getOption("RMassBank")$annotations$copyright
			# Confidence annotation and internal ID annotation.
			# The ID of the compound will be written like:
			# COMMENT: EAWAG_UCHEM_ID 1234
			# if annotations$internal_id_fieldname is set to "EAWAG_UCHEM_ID"
			mbdata[["COMMENT"]] <- list()
			if(findLevel(id) == "0"){
			mbdata[["COMMENT"]][["CONFIDENCE"]] <- getOption("RMassBank")$annotations$confidence_comment
            } else{
                level <- findLevel(id)
                if(level %in% c("1","1a")){
                     mbdata[["COMMENT"]][["CONFIDENCE"]] <- "Reference Standard (Level 1)"
                }
                if(level == c("2")){
                     mbdata[["COMMENT"]][["CONFIDENCE"]] <- "Probable structure, tentative identification (Level 2)"
                }
                if(level == c("2a")){
                     mbdata[["COMMENT"]][["CONFIDENCE"]] <- "Probable structure via library match, tentative identification (Level 2a)"
                }
                if(level == c("2b")){
                     mbdata[["COMMENT"]][["CONFIDENCE"]] <- "Probable structure via diagnostic evidence, tentative identification (Level 2b)"
                }
                if(level == c("3")){
                     mbdata[["COMMENT"]][["CONFIDENCE"]] <- "Tentative identification only (Level 3)"
                }
                if(level == c("3a")){
                     mbdata[["COMMENT"]][["CONFIDENCE"]] <- "Tentative identification: most likely structure (Level 3)"
                }
                if(level == c("3b")){
                     mbdata[["COMMENT"]][["CONFIDENCE"]] <- "Tentative identification: isomers possible (Level 3)"
                }
                if(level == c("3c")){
                     mbdata[["COMMENT"]][["CONFIDENCE"]] <- "Tentative identification: substance class known (Level 3)"
                }
                if(level == c("3d")){
                     mbdata[["COMMENT"]][["CONFIDENCE"]] <- "Tentative identification: best match only (Level 3)"
                }
                if(level == c("4")){
                     mbdata[["COMMENT"]][["CONFIDENCE"]] <- "Tentative identification: molecular formula only (Level 4)"
                }
                if(level == c("5")){
                     mbdata[["COMMENT"]][["CONFIDENCE"]] <- "Tentative identification: structure and formula unknown (Level 5)"
                }
            }
			mbdata[["COMMENT"]][["ID"]] <- id

			# here compound info starts
			mbdata[['CH$NAME']] <- as.list(dbname)
			
			# Currently we use a fixed value for Compound Class, since there is no useful
			# convention of what should go there and what shouldn't, and the field is not used
			# in search queries.
			mbdata[['CH$COMPOUND_CLASS']] <- getOption("RMassBank")$annotations$compound_class
			mbdata[['CH$FORMULA']] <- formula
			mbdata[['CH$EXACT_MASS']] <- mass
			mbdata[['CH$SMILES']] <- smiles
			mbdata[['CH$IUPAC']] <- inchi
			
			link <- list()
			if(dbcas != "")
			link[["CAS"]] <- dbcas
			link[["INCHIKEY"]] <- inchikey
			mbdata[['CH$LINK']] <- link
		}
		return(mbdata)
}

# Retrieve annotation data for a compound, using only babel
#' Retrieve annotation data
#' 
#' Retrieves annotation data for an unknown compound by using basic information present
#'
#' Composes the "upper part" of a MassBank record filled with chemical data
#' about the compound: name, exact mass, structure, CAS no..  
#' The instrument type is also written into this block (even
#' if not strictly part of the chemical information). Additionally, index
#' fields are added at the start of the record, which will be removed later:
#' \code{id, dbcas, dbname} from the compound list.
#' 
#' Additionally, the fields \code{ACCESSION} and \code{RECORD_TITLE} are
#' inserted empty and will be filled later on.
#' 
#' This function is used to generate the data in case a substance is unknown,
#' i.e. not enough information is present to derive anything about formulas or links
#'
#' @usage gatherDataUnknown(id, mode, retrieval)
#' @param id The compound ID.
#' @param mode \code{"pH", "pNa", "pM", "pNH4", "mH", "mM", "mFA"} for different ions 
#' 			([M+H]+, [M+Na]+, [M]+, [M+NH4]+, [M-H]-, [M]-, [M+FA]-).
#' @param retrieval A value that determines whether the files should be handled either as "standard",
#' if the compoundlist is complete, "tentative", if at least a formula is present or "unknown"
#' if the only know thing is the m/z
#' @return Returns a list of type \code{list(id= \var{compoundID}, ...,
#' 'ACCESSION' = '', 'RECORD_TITLE' = '', )} etc. %% ...
#' @author Michael Stravs, Erik Mueller
#' @seealso \code{\link{mbWorkflow}}
#' @references MassBank record format:
#' \url{http://www.massbank.jp/manuals/MassBankRecord_en.pdf}
#' @examples
#' 
#' # Gather data for compound ID 131
#' \dontrun{gatherDataUnknown(131,"pH")}
#' 
#' @export
gatherDataUnknown <- function(id, mode, retrieval){
    .checkMbSettings()
    
    ##Read from Compoundlist
    smiles <- ""
    if(retrieval == "unknown"){
        mass <- findMass(id, "unknown", mode)
        formula <- ""
    }    
    if(retrieval == "tentative"){
        mass <- findMass(id, "tentative", mode)
        formula <- findFormula(id, "tentative")
    }
    dbcas <- NA
    dbname <- findName(id)
    if(is.na(dbname)) dbname <- paste("Unknown ID:",id)
    if(is.na(dbcas)) dbcas <- ""
    

    
    ##Create 
    mbdata <- list()
    mbdata[['id']] <- id
    mbdata[['dbcas']] <- dbcas
    mbdata[['dbname']] <- dbname
    mbdata[['dataused']] <- "none"
    mbdata[['ACCESSION']] <- ""
    mbdata[['RECORD_TITLE']] <- ""
    mbdata[['DATE']] <- format(Sys.Date(), "%Y.%m.%d")
    mbdata[['AUTHORS']] <- getOption("RMassBank")$annotations$authors
    mbdata[['LICENSE']] <- getOption("RMassBank")$annotations$license
    mbdata[['COPYRIGHT']] <- getOption("RMassBank")$annotations$copyright
    # Confidence annotation and internal ID annotation.
    # The ID of the compound will be written like:
    # COMMENT: EAWAG_UCHEM_ID 1234
    # if annotations$internal_id_fieldname is set to "EAWAG_UCHEM_ID"
    mbdata[["COMMENT"]] <- list()
    if(findLevel(id) == "0"){
    mbdata[["COMMENT"]][["CONFIDENCE"]] <- getOption("RMassBank")$annotations$confidence_comment
	} else{
        level <- findLevel(id)
        if(level %in% c("1","1a")){
             mbdata[["COMMENT"]][["CONFIDENCE"]] <- "Reference Standard (Level 1)"
        }
        if(level == c("2")){
             mbdata[["COMMENT"]][["CONFIDENCE"]] <- "Probable structure, tentative identification (Level 2)"
        }
        if(level == c("2a")){
             mbdata[["COMMENT"]][["CONFIDENCE"]] <- "Probable structure via library match, tentative identification (Level 2a)"
        }
        if(level == c("2b")){
             mbdata[["COMMENT"]][["CONFIDENCE"]] <- "Probable structure via diagnostic evidence, tentative identification (Level 2b)"
        }
        if(level == c("3")){
             mbdata[["COMMENT"]][["CONFIDENCE"]] <- "Tentative identification only (Level 3)"
        }
        if(level == c("3a")){
             mbdata[["COMMENT"]][["CONFIDENCE"]] <- "Tentative identification: most likely structure (Level 3)"
        }
        if(level == c("3b")){
             mbdata[["COMMENT"]][["CONFIDENCE"]] <- "Tentative identification: isomers possible (Level 3)"
        }
        if(level == c("3c")){
             mbdata[["COMMENT"]][["CONFIDENCE"]] <- "Tentative identification: substance class known (Level 3)"
        }
        if(level == c("3d")){
             mbdata[["COMMENT"]][["CONFIDENCE"]] <- "Tentative identification: best match only (Level 3)"
        }
        if(level == c("4")){
             mbdata[["COMMENT"]][["CONFIDENCE"]] <- "Tentative identification: molecular formula only (Level 4)"
        }
        if(level == c("5")){
             mbdata[["COMMENT"]][["CONFIDENCE"]] <- "Tentative identification: structure and formula unknown (Level 5)"
        }
    }
    mbdata[["COMMENT"]][["ID"]] <- id

    # here compound info starts
    mbdata[['CH$NAME']] <- as.list(dbname)
    
    # Currently we use a fixed value for Compound Class, since there is no useful
    # convention of what should go there and what shouldn't, and the field is not used
    # in search queries.
    mbdata[['CH$COMPOUND_CLASS']] <- getOption("RMassBank")$annotations$compound_class
    mbdata[['CH$FORMULA']] <- formula
    mbdata[['CH$EXACT_MASS']] <- mass
    mbdata[['CH$SMILES']] <- ""
    mbdata[['CH$IUPAC']] <- ""
    
    link <- list()
    mbdata[['CH$LINK']] <- link

    return(mbdata)
}

# Flatten the internal tree-like representation of MassBank data to a flat table.
# Note that this limits us, in that the fields should be constant over all records!
# Therefore, e.g. the fixed number of 3 names which may be filled.
# If anybody has a cooler solution, I'll be happy to hear from you :)
#
# Note: the records from gatherData have additional information which is discarded, like
# author, copyright etc. They will be re-filled automatically when reading the file.
#' Flatten, or re-read, MassBank header blocks
#' 
#' \code{flatten} converts a list of MassBank compound information sets (as
#' retrieved by \code{\link{gatherData}}) to a flat table, to be exported into
#' an \link[=loadInfolist]{infolist}. \code{readMbdata} reads a single record
#' from an infolist flat table back into a MassBank (half-)entry.
#' 
#' Neither the flattening system itself nor the implementation are particularly
#' fantastic, but since hand-checking of records is a necessary evil, there is
#' currently no alternative (short of coding a complete GUI for this and
#' working directly on the records.)
#' 
#' @aliases flatten readMbdata
#' @usage flatten(mbdata) 
#' 
#' readMbdata(row)
#' @param mbdata A list of MassBank compound information sets as returned from
#' \code{\link{gatherData}}.
#' @param row One row of MassBank compound information retrieved from an
#' infolist.
#' @return \code{flatten} returns a matrix (not a data frame) to be written to
#' CSV.
#' 
#' \code{readMbdata} returns a list of type \code{list(id= \var{compoundID},
#' ..., 'ACCESSION' = '', 'RECORD_TITLE' = '', )} etc.
#' @author Michael Stravs
#' @seealso \code{\link{gatherData}},\code{\link{loadInfolist}}
#' @references MassBank record format:
#' \url{http://www.massbank.jp/manuals/MassBankRecord_en.pdf}
#' @examples \dontrun{
#' 	# Collect some data to flatten
#' 	ids <- c(40,50,60,70)
#'  data <- lapply(ids, gatherData)
#'  # Flatten the data trees to a table
#'  flat.table <- flatten(data)
#'  # reimport the table into a tree
#'  data.reimported <- apply(flat.table, 1, readMbdata)
#' }
#' 
#' @export
#' 
flatten <- function(mbdata)
{
  .checkMbSettings()
  
  colNames     <- names(unlist(mbdata[[1]]))
  commentNames <- colNames[grepl(x = colNames, pattern = "^COMMENT\\.")]
  
  colList <- c(
              "id",
              "dbcas",
              "dbname",
              "dataused",
              commentNames,
              #"COMMENT.CONFIDENCE",
              # Note: The field name of the internal id field is replaced with the real name
              # at "compilation" time. Therefore, functions DOWNSTREAM from compileRecord() 
              # must use the full name including the info from options("RMassBank").
              #"COMMENT.ID",
              "CH$NAME1",
              "CH$NAME2",
              "CH$NAME3",
              "CH$NAME4",
              "CH$NAME5",
              "CH$COMPOUND_CLASS",
              "CH$FORMULA",
              "CH$EXACT_MASS",
              "CH$SMILES",
              "CH$IUPAC",
              "CH$LINK.CAS",
              "CH$LINK.CHEBI",
              "CH$LINK.HMDB",
              "CH$LINK.KEGG",
              "CH$LINK.LIPIDMAPS",
              "CH$LINK.PUBCHEM",
              "CH$LINK.INCHIKEY",
              "CH$LINK.CHEMSPIDER",
	          "CH$LINK.COMPTOX"
	          )
  # make an empty data frame with the right length
  rows <- length(mbdata)
  cols <- length(colList)
  mbframe <- matrix(data=NA, nrow=rows, ncol=cols)
  colnames(mbframe) <- colList
  #browser()
  for(row in 1:rows)
  {
    # fill in all the data into the dataframe: all columns which 
    # a) exist in the target dataframe and b) exist in the (unlisted) MB record
    # are written into the dataframe.
    data <- unlist(mbdata[[row]])
	# bugfix for the case of only one name
	if(!("CH$NAME1" %in% names(data)))
		data[["CH$NAME1"]] <- data[["CH$NAME"]]
    datacols <- intersect(colList, names(data))
    mbframe[row,datacols] <- data[datacols]
  }
  return(mbframe)
  
}

# Read data from a flat-table MassBank record row and feed it into a
# MassBank tree-like record. Also, prime the ACCESSION and RECORD_TITLE fields in the
# correct position in the record.
#' @export
readMbdata <- function(row)
{
  .checkMbSettings()
  
  # Listify the table row. Lists are just cooler to work with :)
  row <- as.list(row)
  
  mbdata <- list()
  # Accession and title are added empty for now, to have them in the right place.
  # Constants are read from the options or generated.
  mbdata[['ACCESSION']] <- ""
  mbdata[['RECORD_TITLE']] <- ""
  mbdata[['DATE']] <- format(Sys.Date(), "%Y.%m.%d")
  mbdata[['AUTHORS']] <- getOption("RMassBank")$annotations$authors
  mbdata[['LICENSE']] <- getOption("RMassBank")$annotations$license
  mbdata[['COPYRIGHT']] <- getOption("RMassBank")$annotations$copyright
  if(getOption("RMassBank")$annotations$publication!="") {
    mbdata[['PUBLICATION']] <- getOption("RMassBank")$annotations$publication
  }
  commentNames <- names(row)[grepl(x = names(row), pattern = "^COMMENT\\.")]
  commentNames <- commentNames[!is.na(row[commentNames])]
  
  # Read all determined fields from the file
  # This is not very flexible, as you can see...
    colList <- c(
              commentNames,
              #"COMMENT.CONFIDENCE",
              #"COMMENT.ID",
              "CH$NAME1",
              "CH$NAME2",
              "CH$NAME3",
              "CH$NAME4",
              "CH$NAME5",
              "CH$COMPOUND_CLASS",
              "CH$FORMULA",
              "CH$EXACT_MASS",
              "CH$SMILES",
              "CH$IUPAC",
              "CH$LINK.CAS",
              "CH$LINK.CHEBI",
              "CH$LINK.HMDB",
              "CH$LINK.KEGG",
              "CH$LINK.LIPIDMAPS",
              "CH$LINK.PUBCHEM",
              "CH$LINK.INCHIKEY",
              "CH$LINK.CHEMSPIDER",
              "CH$LINK.COMPTOX")
  mbdata[["COMMENT"]] = list()
  #mbdata[["COMMENT"]][["CONFIDENCE"]] <- row[["COMMENT.CONFIDENCE"]]
  # Again, our ID field. 
  #mbdata[["COMMENT"]][["ID"]] <- row[["COMMENT.ID"]]
  mbdata[["COMMENT"]][gsub(x = commentNames, pattern = "^COMMENT\\.", replacement = "")] <- row[commentNames]
  
  names = c(row[["CH.NAME1"]], row[["CH.NAME2"]], row[["CH.NAME3"]], row[["CH.NAME4"]], row[["CH.NAME5"]])
  names = names[which(!is.na(names))]
  
  names <- gsub("'", "`", names) 
  mbdata[["CH$NAME"]] = names
  mbdata[["CH$COMPOUND_CLASS"]] = row[["CH.COMPOUND_CLASS"]]
  mbdata[["CH$FORMULA"]] = row[["CH.FORMULA"]]
  mbdata[["CH$EXACT_MASS"]] = row[["CH.EXACT_MASS"]]
  mbdata[["CH$SMILES"]] = row[["CH.SMILES"]]
  mbdata[["CH$IUPAC"]] = row[["CH.IUPAC"]]
  # Add all links and then eliminate the NA values from the tree.
  link = list()
  link[["CAS"]] = row[["CH.LINK.CAS"]]
  link[["CHEBI"]] = row[["CH.LINK.CHEBI"]]
  link[["HMDB"]] = row[["CH.LINK.HMDB"]]
  link[["KEGG"]] = row[["CH.LINK.KEGG"]]
  link[["LIPIDMAPS"]] = row[["CH.LINK.LIPIDMAPS"]]
  link[["PUBCHEM"]] = row[["CH.LINK.PUBCHEM"]]
  link[["INCHIKEY"]] = row[["CH.LINK.INCHIKEY"]]
  link[["CHEMSPIDER"]] = row[["CH.LINK.CHEMSPIDER"]]
  link[["COMPTOX"]] = row[["CH.LINK.COMPTOX"]]
  link[which(is.na(link))] <- NULL
  mbdata[["CH$LINK"]] <- link

    ## SP$SAMPLE
  if(all(nchar(row[["SP.SAMPLE"]]) > 0, row[["SP.SAMPLE"]] != "NA", !is.na(row[["SP.SAMPLE"]]), na.rm = TRUE))
    mbdata[['SP$SAMPLE']] <- row[["SP.SAMPLE"]]


  
  return(mbdata)
  
}

#' Generate peak annotation from peaklist
#' 
#' Generates the PK$ANNOTATION entry from the peaklist obtained. This function is
#' overridable by using the "annotator" option in the settings file.
#' 
#' @param annotation A peak list to be annotated. Contains columns:
#' \code{"cpdID","formula","mzFound" ,"scan","mzCalc","dppm",
#'      "dbe","mz","int","formulaCount","parentScan","fM_factor","dppmBest",
#'     "formulaMultiplicity","intrel","mzSpec"}
#' 
#' @param type The ion type to be added to annotated formulas ("+" or "-" usually)
#' 
#' @return The annotated peak table. Table \code{colnames()} will be used for the
#' 		titles (preferrably don't use spaces in the column titles; however no format is
#' 		strictly enforced by the MassBank data format.
#' 
#' @examples 
#' \dontrun{
#' annotation <- annotator.default(annotation)
#' }
#' @author Michele Stravs, Eawag <stravsmi@@eawag.ch>
#' @export
annotator.default <- function(annotation, formulaTag)
{
  if(!is.null(formulaTag))
    type <- formulaTag
  else
    type <- ""
  
  annotation <- annotation[!is.na(annotation$formula),,drop=FALSE]
  annotation <- annotation[annotation$formula != "",,drop=FALSE]
  
  annotation$formula <- paste(annotation$formula, rep(type, length(annotation$formula)), sep='')
  # Select the right columns and name them correctly for output.
  annotation <- annotation[,c("mz","formula", "formulaCount", "mzCalc", "dppm")]
  colnames(annotation) <- c("m/z", "tentative_formula", "formula_count", "mass", "error(ppm)")
  
  return(annotation)
}

#' Parse record title
#' 
#' Parses a title for a single MassBank record using the title format
#' specified in the option titleFormat. Internally used, not exported.
#' 
#' If the option is not set, a standard title format is used (for record definition
#' version 1 or 2).
#' 
#' @usage .parseTitleString(mbrecord)
#' @param mbrecord A MassBank record in list format, as returned from
#' 	\code{\link{gatherSpectrum}}.
#' @return A string with the title.
#' @author Michael Stravs, Eawag
#' @seealso \code{\link{compileRecord}}
#' @references MassBank record format:
#' \url{http://www.massbank.jp/manuals/MassBankRecord_en.pdf}
#' @examples
#' \dontrun{
#' 		# used in compileRecord()
#' 		title <- .parseTitleString(mbrecord)
#' }
#' 
#' 
#' 
.parseTitleString <- function(mbrecord)
{
	
	varlist <- getOption("RMassBank")$titleFormat
	
	# Set the standard title format.
	if(is.null(varlist))
	{
		if(getOption("RMassBank")$use_version == 2)
		{
			varlist <- c(
					"{CH$NAME}",
					"{AC$INSTRUMENT_TYPE}",
					"{AC$MASS_SPECTROMETRY: MS_TYPE}",
					"CE: {RECORD_TITLE_CE}",
					"R={AC$MASS_SPECTROMETRY: RESOLUTION}",
					"{MS$FOCUSED_ION: PRECURSOR_TYPE}"
			)
		}
		else
		{
			varlist <- c(
					"{CH$NAME}",
					"{AC$INSTRUMENT_TYPE}",
					"{AC$ANALYTICAL_CONDITION: MS_TYPE}",
					"CE: {RECORD_TITLE_CE}",
					"R={AC$ANALYTICAL_CONDITION: RESOLUTION}",
					"{MS$FOCUSED_ION: PRECURSOR_TYPE}"
			)
		}
	}
  
	
	# Extract a {XXX} argument from each title section.
	# check that every title has one and only one match
	args <- regexec("\\{(.*)\\}", varlist)
	arglist <- regmatches(varlist, args)
	if(any(unlist(lapply(arglist, length)) != 2))
		stop("Title format is incorrectly specified: a section with not exactly 1 parameters")
	
	parsedVars <- lapply(varlist, function(var)
			{
				# Extract the specified parameter inside the {}.
				# I.e. from a string like "R={BLA: BLUB}" return "BLA: BLUB"
				args <- regexec("\\{(.*)\\}", var)
				arg <- regmatches(var, args)[[1]][[2]]
				# Split the parameter by colon if necessary
				splitVar <- strsplit(arg, ": ")[[1]]
				# Read the parameter value from the record
				if(length(splitVar) == 2)
					replaceVar <- mbrecord[[splitVar[[1]]]][[splitVar[[2]]]]
				else if(length(splitVar) ==  1)
					replaceVar <- mbrecord[[splitVar]]
				else
					stop(paste(
									"Title format is incorrectly specified:", var)
					)
				# Fix problems: NULL returns
				if(is.null(replaceVar))
					replaceVar <- ""
				# Fix problems: Names will have >= 1 match. Take the first
				if(length(replaceVar) > 1)
					replaceVar <- replaceVar[[1]]
                
                # Fix problems: Unknowns might have no name
                if(!length(replaceVar)){
                    replaceVar <- ""
                }
                
				# Substitute the parameter value into the string
				parsedVar <- sub("\\{(.*)\\}", replaceVar, var)	
				return(parsedVar)
			})
	title <- paste(parsedVars, collapse="; ")
	return(title)
}


# This converts the tree-like list (as obtained e.g. from compileRecord())
# into a plain text array, which can then be dumped to a file suitable for 
# MassBank upload.
#' Write MassBank record into character array
#' 
#' Writes a MassBank record in list format to a text array.
#' 
#' The function is a general conversion tool for the MassBank format; i.e. the
#' field names are not fixed. \code{mbdata} must be a named list, and the
#' entries can be as follows: \itemize{
#'  \item A single text line:
#' 
#' \code{'CH\$EXACT_MASS' = '329.1023'}
#' 
#'  is written as
#' 
#'  \code{CH\$EXACT_MASS: 329.1023} 
#' \item A character array:
#' 
#'  \code{'CH\$NAME' = c('2-Aminobenzimidazole', '1H-Benzimidazol-2-amine')} 
#' 
#' is written as
#' 
#' \code{CH\$NAME: 2-Aminobenzimidazole}
#' 
#' \code{CH\$NAME: 1H-Benzimidazol-2-amine}
#' 
#' \item A named list of strings: 
#' 
#' 	\code{'CH\$LINK' = list('CHEBI' = "27822", "KEGG" = "C10901")} 
#' 
#' is written as 
#' 
#' \code{CH\$LINK: CHEBI 27822}
#' 
#' \code{CH\$LINK: KEGG C10901} 
#' 
#' \item A data frame (e.g. the peak table) is written as specified in
#' the MassBank record format (Section 2.6.3): the column names are used as
#' headers for the first line, all data rows are printed space-separated. 
#' }
#' 
#' @usage toMassbank(mbdata)
#' @param mbdata A MassBank record in list format.
#' @return The result is a text array, which is ready to be written to the disk
#' as a file.
#' @note The function iterates over the list item names. \bold{This means that
#' duplicate entries in \code{mbdata} are (partially) discarded!} The correct
#' way to add them is by making a character array (as specified above): Instead
#' of \code{'CH\$NAME' = 'bla', 'CH\$NAME' = 'blub'} specify \code{'CH\$NAME' =
#' c('bla','blub')}.
#' @author Michael Stravs
#' @seealso \code{\link{compileRecord}}, \code{\link{mbWorkflow}}
#' @references MassBank record format:
#' \url{http://www.massbank.jp/manuals/MassBankRecord_en.pdf}
#' @examples
#' \dontrun{
#' # Read just the compound info skeleton from the Internet for some compound ID
#' id <- 35
#' mbdata <- gatherData(id)
#' #' # Export the mbdata blocks to line arrays 
#' # (there is no spectrum information, just the compound info...)
#' mbtext <- toMassbank(mbdata)
#' }
#' 
#' 
#' @export
setGeneric("toMassbank", function(o, ...) standardGeneric("toMassbank"))

#' @export
setMethod("toMassbank", "RmbSpectraSet", function(o, addAnnotation = getOption("RMassBank")$add_annotation)
    {
      lapply(o@children, function(s) toMassbank(s, addAnnotation))
    })

#' @export
setMethod("toMassbank", "RmbSpectrum2", function(o, addAnnotation = getOption("RMassBank")$add_annotation)
    {
      .toMassbank(o, addAnnotation)
    })

.toMassbank <- function (s, addAnnotation = getOption("RMassBank")$add_annotation)
{
  
  peaks <- getData(s)
  # check that peaks were normalized
  if(!("intrel" %in% colnames(peaks)))
  {
    s <- normalize(s, slot="intrel")
    peaks <- getData(s)
  }  
  
  # Keep only peaks with relative intensity >= 1 o/oo, since the MassBank record
  # makes no sense otherwise. Also, keep only the columns needed in the output.
  peaks <- peaks[ peaks$intrel >= 1,,drop=FALSE]	
  
  peaks$mz <- round(peaks$mz, 4)
  # Also format the other values, which are used in the annotation
  peaks$dppm <- round(peaks$dppm, 2)
  peaks$mzCalc <- round(peaks$mzCalc, 4)
  peaks$intensity <- round(peaks$intensity, 1)
  
  # Get polarity from Spectrum2 now!
  formulaTag <- ""
  if(s@polarity == 1) formulaTag <- "+"
  if(s@polarity == 0) formulaTag <- "-"
  # if polarity is -1, leave it unspecified. the "specs" seem to be 1 for +, 0 for - and -1 for ???
  # (when reading mzML I often get -1, when reading mzXML I get 1 and 0 respectively)
  
  annotator <- getOption("RMassBank")$annotator
  if(is.null(annotator))
    annotator <- "annotator.default"
  
  annotation <- do.call(annotator, list(annotation= peaks, formulaTag = formulaTag))
  
  peaks <- peaks[,c("mz", "intensity", "intrel")]
  peaks <- unique(peaks)
  # Name the columns correctly.
  colnames(peaks) <- c("m/z", "int.", "rel.int.")
  peaknum <- nrow(peaks)
  
  mbdata <- s@info
  
  mbdata[["PK$SPLASH"]] <- list(SPLASH = getSplash(peaks[,c("m/z", "int.")]))
  
  # Annotation:
  if(addAnnotation && (nrow(annotation) > 0))
    mbdata[["PK$ANNOTATION"]] <- annotation
  
  # Peak table
  mbdata[["PK$NUM_PEAK"]] <- peaknum
  mbdata[["PK$PEAK"]] <- peaks
  
  # mbf is an array of lines and count is the line counter.
  # Very old-school, but it works. :)
  mbf <- character(0)
  count <- 1
  lapply(names(mbdata), function(entry)
    {
      # If entry is a char line, add it to the file.
      # If it is a named sublist, add each subentry with name
      # If it is an unnamed sublist, add each subentry without name
      # if it is a dataframe, write in PEAKS mode
    
      # Note: this is were I liked "lapply" a little too much. "for" would
      # be more idiomatic, and wouldn't need the <<- assignments.
      
      # Data frame: table mode. A header line and one space-separated line for
      # each data frame row.
      if(is.data.frame(mbdata[[entry]]))
      {
        mbf[[count]] <<- paste(entry,": " ,
                               paste(colnames(mbdata[[entry]]), collapse=" "),
                               sep='')
        count <<- count+1
        for(row in 1:nrow(mbdata[[entry]]))
        {
          mbf[[count]] <<- paste("  ", 
                                 paste(mbdata[[entry]][row,],collapse=" "), 
                                 sep="")
          count <<- count+1
        }
        #browser()
      }
      # List with named items: Write every entry like CH$LINK: CAS 12-345-678
      else if(is.list(mbdata[[entry]]) & !is.null(names(mbdata[[entry]])))
      {
        
        lapply(names(mbdata[[entry]]), function(subentry)
        {
          if(subentry != "SPLASH"){
            mbf[[count]] <<- paste(entry,": ",subentry, " ", mbdata[[entry]][[subentry]], sep='')
          } else {
            mbf[[count]] <<- paste(entry,": ", mbdata[[entry]][[subentry]], sep='')
          }
          #print(mbf)
          count <<- count + 1
        })
      }
      # Array (or list) of unnamed items: Write every entry like CH$NAME: Paracetamol
      # (iterative entry without subindices)
      else if (length(mbdata[[entry]]) > 1 & is.null(names(mbdata[[entry]])))
      {
        lapply(mbdata[[entry]], function(subentry)
        {
          mbf[[count]] <<- paste(entry,": ",subentry, sep='')
          #print(mbf)
          count <<- count + 1
        })   
      }
      # Length is 1: just write the entry like PK$NUM_PEAKS: 131
      else
      {
        mbf[[count]] <<- paste(entry,": ",mbdata[[entry]], sep='')
        count <<- count + 1
      }
    }
    ) # End of lapply block (per child spectrum)
  # Add mandatory EOF marker
  mbf[[count]] <- "//"
  return(mbf)
}

# Exports compiled and massbanked spectra, with their associated molfiles, to physical files.
# "compiled" is still used here, because we need an accessible accession number.
# In the plain text arrays, the accession number is already "hidden".
# compiled: is ONE "compiled" entry, i.e. ONE compound with e.g. 14 spectra.
# files: is a return value from lapply(toMassbank), i.e. contains 14 plain-text arrays
#  (for a 14-spectra method)
# molfile: a molfile from createMolfile
#' Export internally stored MassBank data to files
#' 
#' Exports MassBank recfile data arrays and corresponding molfiles to physical
#' files on hard disk, for one compound.
#' 
#' The data from \code{compiled} is still used here, because it contains the
#' "visible" accession number. In the plain-text format contained in
#' \code{files}, the accession number is not "accessible" anymore since it's in
#' the file.
#' 
#' @usage exportMassbank(compiled, files, molfile)
#' @param compiled Is ONE "compiled" entry, i.e. ONE compound with e.g. 14
#' spectra, as returned from \code{\link{compileRecord}}.
#' @param files A n-membered array (usually a return value from
#' \code{lapply(\link{toMassbank})}), i.e. contains n plain-text arrays with
#' MassBank records.
#' @param molfile A molfile from \code{\link{createMolfile}}
#' @return No return value.
#' @note An improvement would be to write the accession numbers into
#' \code{names(compiled)} and later into \code{names(files)} so \code{compiled}
#' wouldn't be needed here anymore. (The compound ID would have to go into
#' \code{names(molfile)}, since it is also retrieved from \code{compiled}.)
#' @author Michael Stravs
#' @seealso \code{\link{createMolfile}}, \code{\link{compileRecord}},
#' \code{\link{toMassbank}}, \code{\link{mbWorkflow}}
#' @references MassBank record format:
#' \url{http://www.massbank.jp/manuals/MassBankRecord_en.pdf}
#' @examples
#' \dontrun{
#' 		compiled <- compileRecord(record, mbdata, refilteredRcSpecs)
#' 		mbfiles <- toMassbank(compiled)
#' 		molfile <- createMolfile(compiled[[1]][["CH$SMILES"]])
#' 		exportMassbank(compiled, mbfiles, molfile)
#' }
#' 
#' @export
exportMassbank <- function(compiled, molfile = NULL)
{
  exportMassbank_recdata(
    compiled,   
    recDataFolder = file.path(getOption("RMassBank")$annotations$entry_prefix, "recdata")
  )
  if(!is.null(molfile)) {
    exportMassbank_moldata(
      compiled,
      molfile,
      molDataFolder = file.path(getOption("RMassBank")$annotations$entry_prefix, "moldata")
    )
    }
}

exportMassbank_recdata <- function(compiled, recDataFolder)
{
  #mb@mbfiles <- lapply(mb@compiled_ok, function(cpd) toMassbank(cpd, mb@additionalPeaks))
  
  files <- toMassbank(compiled)
  names(files) <- lapply(compiled@children, function(c) c@info[["ACCESSION"]] )
  
  molnames <- c()
  for(file in seq_len(length(files)))
  {
    # Read the accession no. from the corresponding "compiled" entry
    filename <- names(files)[[file]]
    # use this accession no. as filename
    filename <- paste(filename, ".txt", sep="")
    filePath <- file.path(recDataFolder,filename)
    write(files[[file]], filePath)
  }
}

exportMassbank_moldata <- function(compiled, molfile, molDataFolder)
{
  # Use internal ID for naming the molfiles
  if(findLevel(compiled@id,TRUE)=="standard"){
    molname <- sprintf("%04d", as.numeric(compiled@id))
    molname <- paste(molname, ".mol", sep="")
    write(molfile, file.path(molDataFolder,molname))
  }
}





# Makes a list.tsv with molfile -> massbank ch$name attribution.

#' Write list.tsv file
#' 
#' Makes a list.tsv file in the "moldata" folder.
#' 
#' Generates the list.tsv file which is needed by MassBank to connect records with
#' their respective molfiles. The first compound name is linked to a mol-file with
#' the compound ID (e.g. 2334.mol for ID 2334).
#' 
#' @param compiled A list of compiled spectra (in tree-format, as returned by \code{compileRecord}).
#' @return No return value.
#' @author Michael A. Stravs, Eawag <michael.stravs@@eawag.ch>
#' @examples \dontrun{
#' 		compiled <- compileRecord(record, mbdata, refilteredRcSpecs)
#' 		# a list.tsv for only one record:
#' 		clist <- list(compiled)
#' 		makeMollist(clist)
#' }
#' @export
makeMollist <- function(compiled)
{
  # For every "compiled" entry (here, compiled is not one "compiled" entry but the total
  # list of all compiled spectra), extract the uppermost CH$NAME and the ID (from the
  # first spectrum.) Make the ID into 0000 format.
    
  tsvlist <- t(sapply(compiled, function(entry)
    {
    name <- entry@children[[1]]@info[["CH$NAME"]][[1]]
    id <- sprintf("%04d", as.numeric(entry@id))
    molfilename <- paste(id,".mol",sep='')
    return(c(name,molfilename))
  }))
    
    IDs <- sapply(compiled, function(entry) return( sprintf("%04d", as.numeric(
                      entry@id))))
    level <- sapply(IDs, findLevel, compact=TRUE)
    validentries <- which(level == "standard")
  # Write the file with the 
    write.table(tsvlist[validentries,], 
              paste(getOption("RMassBank")$annotations$entry_prefix,"/moldata/list.tsv", sep=''),
              quote = FALSE,
              sep="\t",
              row.names=FALSE,
              col.names=FALSE
              )
}


# Load a dataframe or file into additional_peaks (or add additional points in there.)
# The columns cpdID, scan, mzFound, int, OK are mandatory. OK=1 means that the peaks
# will be added into the spectrum. mzFound and int will be taken for the table.
# No annotation will be written.
# Add peaks to the spectra by hand

#' Add additional peaks to spectra
#' 
#' Loads a table with additional peaks to add to the MassBank spectra. Required
#' columns are \code{cpdID, scan, int, mzFound, OK}.
#' 
#' All peaks with OK=1 will be included in the spectra.
#' 
#' @usage addPeaks(mb, filename_or_dataframe)
#' @param mb The \code{mbWorkspace} to load the peaks into.
#' @param filename_or_dataframe Filename of the csv file, or name of the R
#' dataframe containing the peaklist.
#' @return The \code{mbWorkspace} with loaded additional peaks.
#' @author Michael Stravs
#' @seealso \code{\link{mbWorkflow}}
#' @examples
#' 
#' 	\dontrun{addPeaks("myrun_additionalPeaks.csv")}
#' 
#' @export 
addPeaks <- function(mb, filename_or_dataframe)
{
	
	errorvar <- 0
	currEnvir <- environment()
	d <- 1
	
	if(is.data.frame(filename_or_dataframe))
		df <- filename_or_dataframe
	else
	tryCatch(
		df <- read.csv(filename_or_dataframe),
		error=function(e){
		currEnvir$errorvar <- 1
	})
	# I change your heuristic fix to another heuristic fix, because I will have to test for a column name change...
	
	if(!errorvar){
	
		if(ncol(df) < 2)
			df <- read.csv(filename_or_dataframe, sep=";")
		# here: the column int was renamed to intensity, and we need to be able to read old files. sorry.
		if(!("intensity" %in% colnames(df)) & ("int" %in% colnames(df)))
			df$intensity <- df$int
		
		cols <- c("cpdID", "scan", "mzFound", "intensity", "OK")
		n <- colnames(df)
		# Check if comma-separated or semicolon-separated
		d <- setdiff(cols, n)
		if(length(d)>0){
			stop("Some columns are missing in the additional peak list. Needs at least cpdID, scan, mzFound, intensity, OK.")
		}
	}
	
	culled_df <- df[,c("cpdID", "scan", "mzFound", "intensity", "OK")]
	
	
	if(nrow(mb@additionalPeaks) == 0)
		mb@additionalPeaks <- culled_df
	else
		mb@additionalPeaks <- rbind(mb@additionalPeaks, culled_df)
	return(mb)
}



gatherDataMinimal.cpd <- function(cpd){
  
  ##Read from Compoundlist
  if(length(cpd@smiles) == 1) smiles <- cpd@smiles
  else
    smiles <- ""
  
  ##Create 
  mbdata <- list()
  mbdata[['ACCESSION']] <- ""
  mbdata[['RECORD_TITLE']] <- ""
  mbdata[['DATE']] <- format(Sys.Date(), "%Y.%m.%d")
  # Confidence annotation and internal ID annotation.
  # The ID of the compound will be written like:
  # COMMENT: EAWAG_UCHEM_ID 1234
  # if annotations$internal_id_fieldname is set to "EAWAG_UCHEM_ID"
  if(length(cpd@id) > 0)
    mbdata[["COMMENT"]][["ID"]] <- cpd@id
  
  # here compound info starts
  mbdata[['CH$NAME']] <- cpd@name
  
  # Currently we use a fixed value for Compound Class, since there is no useful
  # convention of what should go there and what shouldn't, and the field is not used
  # in search queries.
  mbdata[['CH$FORMULA']] <- cpd@formula
  mbdata[['CH$EXACT_MASS']] <- round(findMz.formula(cpd@formula, "")$mzCenter, 4)
  
  if(cpd@smiles != "")
    mbdata[['CH$SMILES']] <- cpd@smiles
  
  link <- list()
  mbdata[['CH$LINK']] <- link

  return(mbdata)
}



gatherDataMinimal.spectrum <- function(spectrum){
  
  ##Read from Compoundlist
  if(length(cpd@smiles) == 1) smiles <- cpd@smiles
  else
    smiles <- ""
  
  ##Create 
  mbdata <- list()
  mbdata[['ACCESSION']] <- ""
  mbdata[['RECORD_TITLE']] <- ""
  mbdata[['DATE']] <- format(Sys.Date(), "%Y.%m.%d")
  # Confidence annotation and internal ID annotation.
  # The ID of the compound will be written like:
  # COMMENT: EAWAG_UCHEM_ID 1234
  # if annotations$internal_id_fieldname is set to "EAWAG_UCHEM_ID"
  
  # here compound info starts
  mbdata[['CH$NAME']] <- paste("parent", spectrum@precursorMz, "at RT", spectrum@rt, "- CE", spectrum@collisionEnergy) 
  
  # Currently we use a fixed value for Compound Class, since there is no useful
  # convention of what should go there and what shouldn't, and the field is not used
  # in search queries.
  
  return(mbdata)
}
sneumann/RMassBank documentation built on Oct. 20, 2020, 3:19 p.m.
rdrr.io home R language documentation Run R code online
CRAN packages Bioconductor packages R-Forge packages GitHub packages
Note that we can't provide technical support on individual packages. You should contact the package authors for that.
sneumann/RMassBank
Workflow to process tandem MS files and build MassBank records

R/createMassBank.R
In sneumann/RMassBank: Workflow to process tandem MS files and build MassBank records

Documented in addPeaks annotator.default createMolfile exportMassbank flatten gatherData gatherDataBabel gatherDataUnknown gatherPubChem loadInfolist loadInfolists makeMollist mbWorkflow .parseTitleString readMbdata resetInfolists

R Package Documentation

Browse R Packages

We want your feedback!

sneumann/RMassBank Workflow to process tandem MS files and build MassBank records

R/createMassBank.R In sneumann/RMassBank: Workflow to process tandem MS files and build MassBank records

Documented in addPeaks annotator.default createMolfile exportMassbank flatten gatherData gatherDataBabel gatherDataUnknown gatherPubChem loadInfolist loadInfolists makeMollist mbWorkflow .parseTitleString readMbdata resetInfolists

R Package Documentation

Browse R Packages

We want your feedback!

sneumann/RMassBank
Workflow to process tandem MS files and build MassBank records

R/createMassBank.R
In sneumann/RMassBank: Workflow to process tandem MS files and build MassBank records