#' MS data conversion
#'
#' Conversion of MS analysis files between several open and closed data formats.
#'
#' @param algorithm Either \code{"pwiz"} (implemented by \command{msConvert} of
#' ProteoWizard), \code{"openms"} (implemented by \command{FileConverter} of
#' OpenMS) or \code{"bruker"} (implemented by DataAnalysis).
#'
#' @templateVar what \code{convertMSFiles} (except if \code{algorithm="bruker"})
#' @template uses-multiProc
#'
#'
#' @name convertMSFiles
NULL
# Lookup table that maps every known MS data format to the file extension(s)
# (without leading dot) its analyses may carry. Vendor formats are listed
# first, followed by the open formats.
MSFileExtensions <- function()
{
    vendorExts <- list(thermo = "raw",
                       bruker = c("d", "yep", "baf", "fid"),
                       agilent = "d",
                       ab = "wiff",
                       waters = "raw")
    openExts <- list(mzXML = "mzXML",
                     mzML = "mzML")
    return(c(vendorExts, openExts))
}
# Tells whether an analysis of the given format with the given file extension
# is stored as a directory instead of a regular file. Both arguments are
# expected to be scalars (the scalar operators && and || are used).
MSFileFormatIsDir <- function(format, ext)
{
    # UNDONE: is agilent .d also a directory?
    isBrukerDir <- format == "bruker" && ext == "d"
    isWatersDir <- format == "waters" && ext == "raw"
    return(isBrukerDir || isWatersDir)
}
#' @details \code{MSFileFormats} returns a \code{character} with all supported
#' input formats (see below).
#' @param vendor If \code{TRUE} only vendor formats are returned.
#' @rdname convertMSFiles
#' @export
MSFileFormats <- function(algorithm = "pwiz", vendor = FALSE)
{
    checkmate::assertChoice(algorithm, c("pwiz", "openms", "bruker"))
    checkmate::assertFlag(vendor)

    # input formats that can be handled by each conversion algorithm
    formats <- switch(algorithm,
                      pwiz = names(MSFileExtensions()),
                      openms = c("mzXML", "mzML"),
                      bruker = "bruker")

    if (!vendor)
        return(formats)

    # only vendor formats requested: drop the open formats
    return(setdiff(formats, c("mzXML", "mzML")))
}
# Filters a vector of candidate analysis paths on whether they are a file or
# directory, as required by the format(s) in 'from'.
#
# For every path the formats from 'from' with a (case insensitively) matching
# file extension are looked up. The path is
# - removed if no format matches its extension,
# - kept if the matching formats disagree on whether it should be a directory,
# - otherwise kept only if being a directory (or not) is as MSFileFormatIsDir()
#   reports for the matching formats.
#
# Returns the subset of 'files' that passed.
filterMSFileDirs <- function(files, from)
{
    if (length(files) == 0)
        return(files)

    candidateExts <- MSFileExtensions()[from]

    isValidAnalysis <- function(path)
    {
        pathExt <- tools::file_ext(path)

        # per format: the extensions that equal the extension of this path
        matchedExts <- pruneList(lapply(candidateExts, function(f) f[tolower(f) %in% tolower(pathExt)]),
                                 checkEmptyElements = TRUE)
        if (length(matchedExts) == 0)
            return(FALSE)

        expectDir <- mapply(names(matchedExts), matchedExts, SIMPLIFY = TRUE,
                            FUN = function(format, exts) sapply(exts, MSFileFormatIsDir, format = format))
        if (!allSame(expectDir))
            return(TRUE) # can be either

        pathIsDir <- file.info(path, extra_cols = FALSE)$isdir
        if (all(expectDir)) pathIsDir else !pathIsDir
    }

    keep <- sapply(files, isValidAnalysis)
    return(files[keep])
}
# Lists the analyses of the given format(s) that are stored directly inside
# the given directories.
#
# dirs: character vector with directories to search (duplicates are ignored).
# from: name(s) of the input format(s), used to index MSFileExtensions().
#
# Returns the full paths of the analyses found, ordered by the position of
# their directory in 'dirs' and filtered by filterMSFileDirs().
listMSFiles <- function(dirs, from)
{
    dirs <- normalizePath(unique(dirs), mustWork = FALSE, winslash = "/")
    allExts <- unique(unlist(MSFileExtensions()[from]))

    # The alternatives are grouped so that the anchors apply to every
    # extension; without the group "$" only binds to the last alternative and
    # names like "x.raw.bak" would match as well.
    extPattern <- paste0("^.+\\.(", paste0(allExts, collapse = "|"), ")$")
    files <- list.files(dirs, full.names = TRUE, pattern = extPattern, ignore.case = TRUE)

    # try to get back to the original directory order
    files <- files[order(match(dirname(files), dirs))]

    return(filterMSFileDirs(files, from))
}
# Looks up the paths of analyses by their name (file name without extension).
#
# files: analysis names to look for.
# paths: directories that are searched with listMSFiles().
# from: input format(s) that are considered.
# mustExist: if TRUE an error is raised when any analysis could not be found.
#
# Returns the paths of the analyses that were found; analyses that were not
# found are left out of the result.
getMSFilePaths <- function(files, paths, from, mustExist = FALSE)
{
    candidates <- listMSFiles(paths, from)
    candidateNames <- tools::file_path_sans_ext(basename(candidates))

    if (mustExist)
    {
        notFound <- files[!(files %in% candidateNames)]
        if (length(notFound) > 0)
            stop(sprintf("The following analyses are not found with a correct data format (valid: %s): %s",
                         paste0(from, collapse = ", "),
                         paste0(notFound, collapse = ", ")),
                 call. = FALSE)
    }

    return(candidates[match(files, candidateNames, nomatch = 0)])
}
# Converts analyses with the msconvert tool of ProteoWizard.
#
# inFiles, outFiles: character vectors with the input analyses and their
#   destination paths (matched by position).
# to: output format; handed to msconvert as "--<to>".
# centroid: FALSE to disable centroiding. Any other value puts a "peakPicking"
#   filter in front of the other filters; a character value (e.g. "vendor") is
#   appended to that filter.
# filters: NULL or a character vector; each element is passed with "--filter".
# extraOpts: NULL or a character vector with additional msconvert arguments.
# PWizBatchSize: with a value of 1 (or only one input file) a separate
#   msconvert command is queued for each analysis. With 0 all analyses are
#   converted by a single command, for other values the analyses are divided
#   by splitInBatches().
#
# Stops if msconvert cannot be found or, when batches are used, if the output
# files are not all located in the same directory. Returns NULL invisibly.
convertMSFilesPWiz <- function(inFiles, outFiles, to, centroid, filters, extraOpts, PWizBatchSize)
{
    if (centroid != FALSE)
    {
        if (is.null(filters))
            filters <- character()
        filters <- c(paste("peakPicking", if (is.character(centroid)) centroid else ""), filters)
    }

    mainArgs <- paste0("--", to)
    if (!is.null(filters))
    {
        # Interleave "--filter" with every filter. unlist(lapply()) is used
        # instead of sapply(): the latter returns an empty list for a
        # zero-length 'filters', which would turn mainArgs into a list.
        filterArgs <- unlist(lapply(filters, function(f) c("--filter", f)), use.names = FALSE)
        mainArgs <- c(mainArgs, filterArgs)
    }
    if (!is.null(extraOpts))
        mainArgs <- c(mainArgs, extraOpts)

    pwpath <- findPWizPath()
    if (is.null(pwpath) || !file.exists(file.path(pwpath, paste0("msconvert", if (Sys.info()[["sysname"]] == "Windows") ".exe" else ""))))
        stop("Could not find ProteoWizard. You may set its location in the patRoon.path.pwiz option. See ?patRoon for more details.")
    msc <- file.path(pwpath, "msconvert")

    if (PWizBatchSize != 1 && length(inFiles) > 1)
    {
        outDir <- dirname(outFiles)
        if (!allSame(outDir)) # UNDONE?
            stop("If PWizBatchSize>1 then all output files must go to the same directory.")
        outDir <- outDir[1]

        if (PWizBatchSize == 0)
            batches <- list(seq_along(inFiles))
        else
            batches <- splitInBatches(seq_along(inFiles), PWizBatchSize)

        cmdQueue <- lapply(seq_along(batches), function(bi)
        {
            # msconvert reads the analyses of this batch from a list file (-f)
            input <- tempfile("msconvert")
            cat(inFiles[batches[[bi]]], sep = "\n", file = input)
            logf <- paste0("pwiz-batch_", bi, ".txt")
            # UNDONE: unlike PWizBatchSize==1 we don't (can't) set output file names here, is this a problem?
            return(list(logFile = logf, command = msc, args = c("-f", input, "-o", outDir, mainArgs)))
        })
    }
    else
    {
        cmdQueue <- lapply(seq_along(inFiles), function(fi)
        {
            basef <- basename(tools::file_path_sans_ext(inFiles[fi]))
            logf <- paste0("pwiz-", basef, ".txt")
            return(list(logFile = logf, command = msc,
                        args = c(inFiles[fi], "--outfile", outFiles[fi],
                                 "-o", dirname(outFiles[fi]), mainArgs)))
        })
    }

    executeMultiProcess(cmdQueue, function(cmd) {}, logSubDir = "convert")

    invisible(NULL)
}
# Converts analyses with the FileConverter tool of OpenMS.
#
# inFiles, outFiles: character vectors with the input analyses and their
#   destination paths (matched by position).
# to: output format (not used here; the output paths are passed as given).
# extraOpts: NULL or a character vector with extra FileConverter arguments.
#
# One command is queued per analysis and the queue is run with
# executeMultiProcess(). Returns NULL invisibly.
convertMSFilesOpenMS <- function(inFiles, outFiles, to, extraOpts)
{
    converter <- getExtDepPath("openms", "FileConverter")

    cmdQueue <- lapply(seq_along(inFiles), function(i)
    {
        anaName <- basename(tools::file_path_sans_ext(inFiles[i]))
        # a NULL extraOpts simply vanishes when combined with c()
        list(logFile = paste0("openms-", anaName, ".txt"),
             command = converter,
             args = c("-in", inFiles[i], "-out", outFiles[i], extraOpts))
    })

    executeMultiProcess(cmdQueue, function(cmd) {}, logSubDir = "convert")

    invisible(NULL)
}
# Converts analyses by exporting them from Bruker DataAnalysis.
#
# inFiles, outFiles: character vectors with the input analyses and their
#   destination paths (matched by position).
# to: output format; "mzXML" selects the mzXML export constant, any other
#   value the mzML one.
# centroid: if TRUE line spectra are exported, otherwise profile spectra.
#
# A warning is given for every analysis that could not be opened in
# DataAnalysis. Returns NULL invisibly.
convertMSFilesBruker <- function(inFiles, outFiles, to, centroid)
{
    # expConstant <- if (to == "mzXML") DAConstants$daMzXML else if (to == "mzData") DAConstants$daMzData else DAConstants$daMzML
    expConstant <- if (to == "mzXML") DAConstants$daMzXML else DAConstants$daMzML
    expSpecConstant <- if (centroid) DAConstants$daLine else DAConstants$daProfile

    DA <- getDAApplication()
    hideDAInScope()

    fCount <- length(inFiles)
    prog <- openProgBar(0, fCount)
    # finalize the progress bar when done, also if an export fails halfway
    on.exit(close(prog), add = TRUE)

    for (i in seq_len(fCount))
    {
        ind <- getDAFileIndex(DA, inFiles[i], NULL)
        if (ind == -1)
            warning(paste("Failed to open file in DataAnalysis:", inFiles[i]))
        else
            DA[["Analyses"]][[ind]]$Export(outFiles[i], expConstant, expSpecConstant)
        setTxtProgressBar(prog, i)
    }

    setTxtProgressBar(prog, fCount)

    invisible(NULL)
}
#' @details \code{convertMSFiles} converts the data format of an analysis to
#' another. It uses tools from
#' \href{http://proteowizard.sourceforge.net/}{ProteoWizard}
#' (\command{msConvert} command), \href{http://www.openms.de/}{OpenMS}
#' (\command{FileConverter} command) or Bruker DataAnalysis to perform the
#' conversion. Supported input and output formats include \file{.mzXML},
#' \file{.mzML} and several vendor formats, depending on which algorithm is
#' used.
#'
#' @param files,dirs The \code{files} argument should be a \code{character}
#' vector with input files. If \code{files} contains directories and
#' \code{dirs=TRUE} then files from these directories are also considered. An
#' alternative method to specify input files is by the \code{anaInfo}
#' argument. If the latter is specified \code{files} may be \code{NULL}.
#' @param outPath A character vector specifying directories that should be used
#' for the output. Will be re-cycled if necessary. If \code{NULL}, output
#' directories will be kept the same as the input directories.
#' @param anaInfo An \link[=analysis-information]{analysis info table} used to
#' retrieve input files. Either this argument or \code{files} (or both) should
#' be set (\emph{i.e.} not \code{NULL}).
#' @param from Input format (see below). These are used to find analyses when
#' \code{dirs=TRUE} or \code{anaInfo} is set.
#' @param to Output format: \code{"mzXML"} or \code{"mzML"}.
#' @param overWrite Should existing destination file be overwritten
#' (\code{TRUE}) or not (\code{FALSE})?
#' @param centroid Set to \code{TRUE} to enable centroiding (not supported if
#' \code{algorithm="openms"}). In addition, when \code{algorithm="pwiz"} the
#' value may be \code{"vendor"} to perform centroiding with the vendor
#' algorithm or \code{"cwt"} to use ProteoWizard's wavelet algorithm.
#' @param filters When \code{algorithm="pwiz"}: a \code{character} vector
#' specifying one or more filters. The elements of the specified vector are
#' directly passed to the \code{--filter} option (see
#' \href{http://proteowizard.sourceforge.net/tools/filters.html}{here}).
#' @param extraOpts A \code{character} vector specifying any extra commandline
#' parameters passed to \command{msConvert} or \command{FileConverter}. Set to
#' \code{NULL} to ignore. For options: see
#' \href{https://abibuilder.informatik.uni-tuebingen.de/archive/openms/Documentation/release/latest/html/TOPP_FileConverter.html}{FileConverter}
#' and
#' \href{http://proteowizard.sourceforge.net/tools/msconvert.html}{msConvert}.
#' @param PWizBatchSize When \code{algorithm="pwiz"}: the number of analyses to
#' process by a single call to \command{msConvert}. Usually a value of one is
#' most efficient. Set to zero to run all analyses at once from a single
#' call.
#'
#' @section Conversion formats: Possible output formats (\code{to} argument) are
#' \code{mzXML} and \code{mzML}.
#'
#' Possible input formats (\code{from} argument) depend on the algorithm that
#' was chosen and may include:
#'
#' \itemize{
#'
#' \item \code{thermo}: Thermo \file{.RAW} files (only
#' \code{algorithm="pwiz"}).
#'
#' \item \code{bruker}: Bruker \file{.d}, \file{.yep}, \file{.baf} and
#' \file{.fid} files (only \code{algorithm="pwiz"} or
#' \code{algorithm="bruker"}).
#'
#' \item \code{agilent}: Agilent \file{.d} files (only
#' \code{algorithm="pwiz"}).
#'
#' \item \code{ab}: AB Sciex \file{.wiff} files (only
#' \code{algorithm="pwiz"}).
#'
#' \item \code{waters}: Waters \file{.RAW} files (only
#' \code{algorithm="pwiz"}).
#'
#' \item \code{mzXML}/\code{mzML}: Open format \file{.mzXML}/\file{.mzML}
#' files (only \code{algorithm="pwiz"} or \code{algorithm="openms"}).
#'
#' }
#'
#' Note that the actual supported file formats of ProteoWizard depend on how
#' it was installed (see
#' \href{http://proteowizard.sourceforge.net/formats/index.html}{here}).
#'
#' @examples \dontrun{
#' # Use FileConverter of OpenMS to convert between open mzXML/mzML format
#' convertMSFiles("standard-1.mzXML", to = "mzML", algorithm = "openms")
#'
#' # Convert all Thermo .RAW files in the analyses/raw directory to mzML and
#' # store the files in analyses/mzml. During conversion files are centroided by
#' # the peakPicking filter and only MS 1 data is kept.
#' convertMSFiles("analyses/raw", "analyses/mzml", dirs = TRUE, from = "thermo",
#' centroid = "vendor", filters = "msLevel 1")
#' }
#'
#' @references \insertRef{Rst2016}{patRoon} \cr\cr
#' \insertRef{Chambers2012}{patRoon}
#'
#' @rdname convertMSFiles
#' @export
convertMSFiles <- function(files = NULL, outPath = NULL, dirs = TRUE,
                           anaInfo = NULL, from = NULL, to = "mzML",
                           overWrite = FALSE, algorithm = "pwiz",
                           centroid = algorithm != "openms",
                           filters = NULL, extraOpts = NULL, PWizBatchSize = 1)
{
    ac <- checkmate::makeAssertCollection()
    checkmate::assertCharacter(files, min.len = 1, min.chars = 1, null.ok = !is.null(anaInfo), add = ac)
    checkmate::assertCharacter(outPath, min.chars = 1, min.len = 1, null.ok = TRUE, add = ac)
    assertCanCreateDirs(outPath, add = ac)
    checkmate::assertFlag(dirs, add = ac)
    checkmate::assertChoice(to, c("mzXML", "mzML"), add = ac) # UNDONE: enough for now?
    checkmate::assertFlag(overWrite, add = ac)
    checkmate::assertChoice(algorithm, c("pwiz", "openms", "bruker"), add = ac)
    checkmate::assert(checkmate::checkFlag(centroid),
                      checkmate::checkChoice(centroid, c("vendor", "cwt")),
                      .var.name = "centroid")
    checkmate::assertCharacter(filters, min.chars = 1, null.ok = TRUE, add = ac)
    checkmate::assertCharacter(extraOpts, null.ok = TRUE, add = ac)
    checkmate::assertCount(PWizBatchSize, add = ac)
    checkmate::reportAssertions(ac)

    if (centroid != FALSE && algorithm == "openms")
        stop("Centroiding with OpenMS is currently not supported.")
    else if ((centroid == "vendor" || centroid == "cwt") && algorithm != "pwiz")
        stop("Vendor/cwt centroiding is only supported when algorithm=\"pwiz\"")

    if (dirs || !is.null(anaInfo)) # from arg needs to be used?
    {
        if (algorithm == "pwiz")
            from <- checkmate::matchArg(from, c("thermo", "bruker", "agilent", "ab", "waters", "mzXML", "mzML"),
                                        several.ok = FALSE)
        else if (algorithm == "openms")
            from <- checkmate::matchArg(from, c("mzXML", "mzML"), several.ok = FALSE)
        else # bruker
            from <- checkmate::matchArg(from, "bruker")

        if (from == to)
            warning("Input and output formats are the same")
    }

    anaInfo <- assertAndPrepareAnaInfo(anaInfo, from, null.ok = !is.null(files))

    if (!is.null(files))
    {
        if (dirs)
        {
            # dir.exists() gives FALSE (not NA) for non-existing paths, so
            # these stay in 'files' and are reported as missing further below
            inputDirs <- files[dir.exists(files)]

            # filter out analyses "files" (are actually directories). The mask
            # is derived from and applied to the same vector; vapply() makes
            # sure an (empty) logical vector is obtained if there are no
            # directories at all.
            isAnalysisDir <- vapply(tools::file_ext(inputDirs), MSFileFormatIsDir, logical(1),
                                    format = from, USE.NAMES = FALSE)
            inputDirs <- inputDirs[!isAnalysisDir]

            # replace the directories by the analyses that are stored in them
            dirFiles <- listMSFiles(inputDirs, from)
            files <- union(dirFiles, setdiff(files, inputDirs))
        }
    }
    else
        files <- character()

    if (!is.null(anaInfo))
    {
        afiles <- getMSFilePaths(anaInfo$analysis, anaInfo$path, from)
        files <- union(files, afiles)
    }

    if (is.null(outPath))
        outPath <- dirname(files)

    mkdirp(outPath)

    # NOTE: use normalizePath() here to convert to backslashes on Windows: needed by msconvert
    outPath <- normalizePath(rep(outPath, length.out = length(files)), mustWork = TRUE)
    files <- normalizePath(files, mustWork = FALSE) # no mustWork, file existence will be checked later

    basef <- basename(tools::file_path_sans_ext(files))
    output <- normalizePath(file.path(outPath, paste0(basef, ".", to)),
                            mustWork = FALSE)

    # only convert analyses that exist and whose output is absent or may be overwritten
    keepFiles <- vapply(seq_along(files), function(fi)
    {
        if (!file.exists(files[fi]))
            printf("Skipping non-existing input analysis %s\n", files[fi])
        else if (!overWrite && file.exists(output[fi]))
            printf("Skipping existing output analysis %s\n", output[fi])
        else
            return(TRUE)
        return(FALSE)
    }, logical(1), USE.NAMES = FALSE)

    if (any(keepFiles))
    {
        files <- files[keepFiles]
        output <- output[keepFiles]

        if (algorithm == "pwiz")
            convertMSFilesPWiz(files, output, to, centroid, filters, extraOpts, PWizBatchSize)
        else if (algorithm == "openms")
            convertMSFilesOpenMS(files, output, to, extraOpts)
        else # bruker
            convertMSFilesBruker(files, output, to, centroid)
    }
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.