#' get raw ms metadata
#'
#' figures out which metadata function to run, and returns back the metadata
#' generated by it.
#'
#' @param in_file the file to use
#'
#' @export
#' @return list
get_raw_ms_metadata = function(in_file){
is_mzml = regexpr("*.mzML", in_file)
if (is_mzml != -1) {
raw_metadata = get_mzml_metadata(in_file)
} else {
stop("Unsupported format for extracting metadata!")
}
raw_metadata
}
#' extract mzML header
#'
#' @param mzml_file the mzML file to get the header from
#'
#' @export
get_mzml_header = function(mzml_file){
file_con = file(mzml_file, open = "r")
file_content = vector("character", 200)
is_run = FALSE
i_line = 1
while (!is_run && (i_line < length(file_content))) {
file_content[i_line] = readLines(file_con, n = 1)
is_run = grepl("<run", file_content[i_line], ignore.case = TRUE)
i_line = i_line + 1
}
close(file_con)
file_content = file_content[1:(i_line - 1)]
if (sum(grepl("indexedmzML", file_content)) > 0) {
out_content = c(file_content, "</run>", "</mzML>", "</indexedmzML>")
} else {
out_content = c(file_content, "</run>", "</mzML>")
}
paste(out_content, collapse = "\n")
}
#' get mzML metadata
#'
#' @param mzml_file the mzML file to get metadata from
#'
#' @export
get_mzml_metadata = function(mzml_file){
mzml_header = get_mzml_header(mzml_file)
xml_doc = XML::xmlTreeParse(mzml_header, useInternalNodes = TRUE)
ns = XML::xmlNamespaceDefinitions(XML::xmlRoot(xml_doc), recursive = TRUE, simplify = TRUE)
missing_name = which(names(ns) %in% "")
names(ns)[missing_name] = "d1"
mz_metanodes = XML::getNodeSet(xml_doc, "/d1:mzML", ns)
if (length(mz_metanodes) == 0) {
mz_metanodes = XML::getNodeSet(xml_doc, "/d1:indexedmzML/d1:mzML", ns)
}
mz_meta = list()
tmp_attr = unclass(XML::xmlAttrs(mz_metanodes[[1]]))
attr(tmp_attr, "namespaces") = NULL
mz_meta[["mzML"]][[".attrs"]] = tmp_attr
other_nodes_2_get = c("cvList", "fileDescription",
"referenceableParamGroupList",
"softwareList",
"instrumentConfigurationList",
"dataProcessingList")
other_nodes = XML::xmlChildren(mz_metanodes[[1]])
other_list = lapply(other_nodes, XML::xmlToList)
mz_meta = c(mz_meta, other_list[other_nodes_2_get])
null_meta = purrr::map_lgl(mz_meta, is.null)
mz_meta = mz_meta[!null_meta]
mz_meta[["run"]][[".attrs"]] = XML::xmlAttrs(mz_metanodes[[1]][["run"]])
mz_meta = .remove_attrs(mz_meta)
mz_meta_frame = .to_data_frame(mz_meta)
#mz_meta_frame$run$scanPolarity = .get_scan_polarity(other_list$run$spectrumList)
mz_meta_frame$run$startTimeStamp = gsub("T", " ", mz_meta_frame$run$startTimeStamp)
mz_meta_frame
}
#' export metadata to json
#'
#' export the list metadata to a json string
#'
#' @param meta_list a list of metadata
#'
#' @export
meta_export_json = function(meta_list){
jsonlite::toJSON(meta_list, pretty = TRUE, auto_unbox = TRUE)
}
#' transform to data frame
#'
#' @param in_list the list of xml nodes to work on
#'
.to_data_frame = function(in_list){
if (class(in_list) == "list") {
out_list = lapply(in_list, .to_data_frame)
} else if (class(in_list) == "character") {
if (!is.null(names(in_list))) {
out_list = as.data.frame(t(as.matrix(in_list)))
} else {
out_list = in_list
}
}
#print(out_list)
out_list
}
#' remove attributes
#'
#' removes a list entry called ".attrs" from a list, and makes them first level
#' partners
#'
#' @param in_list the list to work on
#'
.remove_attrs = function(in_list){
if (class(in_list) == "list") {
out_list = in_list
list_names = names(out_list)
if (".attrs" %in% list_names) {
tmp_attrs = out_list[[".attrs"]]
name_attrs = names(tmp_attrs)
if (sum(name_attrs %in% list_names) == 0) {
for (i_name in name_attrs) {
out_list[[i_name]] = tmp_attrs[[i_name]]
}
out_list[[".attrs"]] = NULL
}
} else {
out_list = lapply(out_list, .remove_attrs)
}
# still need to check the rest of the pieces of the list!
out_list = lapply(out_list, .remove_attrs)
} else {
out_list = in_list
}
out_list
}
#' get_scan_mode
#'
#' takes a list from xmlToList for "run" and looks at whether all scans are positive, negative, or mixed
#'
#' @param spectrum_list the list of spectra
#'
.get_scan_polarity = function(spectrum_list){
spectrum_list[[".attrs"]] = NULL
scan_data = lapply(spectrum_list, function(in_spectrum){
cv_loc = which(names(in_spectrum) %in% "cvParam")
cv_data = unlist(in_spectrum[cv_loc])
scan_polarity = grep("scan", cv_data, value = TRUE)
scan_polarity
})
scan_polarity = as.character(unique(scan_data))
if ((length(scan_polarity) == 1) && (grepl("positive", scan_polarity))) {
out_polarity = "positive"
} else if ((length(scan_polarity) == 1) && (grepl("negative", scan_polarity))) {
out_polarity = "negative"
} else {
out_polarity = "mixed"
}
out_polarity
}
#' json mzML to data.frame
#'
#' Given a json file or list of lists, return a data.frame with the most important
#' bits of the data.
#'
#' @param in_file the file to read from
#'
#' @export
#'
#'
#' @return data.frame
#'
json_mzML_2_df = function(in_file) {
if (inherits(in_file, "character") && file.exists(in_file)) {
in_list = jsonlite::fromJSON(in_file, simplifyVector = FALSE)
} else if (inherits(in_file, "list")) {
in_list = in_file
}
out_df = purrr::map_df(in_list, function(list_entry){
data.frame(mzml_id = list_entry$mzML$id,
sample_id = gsub(".raw", "", basename(list_entry$file$raw$file)),
instrument_serial = list_entry$run$instrument$serial,
instrument_model = list_entry$run$instrument$model,
start_time = as.POSIXct(list_entry$run$startTimeStamp),
polarity = list_entry$run$scanPolarity,
raw_file = list_entry$file$raw$saved_path,
sha1 = list_entry$file$raw$sha1,
mzml_file = list_entry$file$mzml$saved_path,
original_path = paste(unlist(list_entry$file$raw$original_path), collapse = ";"),
stringsAsFactors = FALSE
)
})
out_df
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.