#' get raw ms metadata
#'
#' figures out which metadata function to run, and returns back the metadata
#' generated by it.
#'
#' @param in_file the file to use
#'
#' @export
#' @return list
get_raw_ms_metadata <- function(in_file){
is_mzml <- regexpr("*.mzML", in_file)
if (is_mzml != -1) {
raw_metadata <- get_mzml_metadata(in_file)
} else {
stop("Unsupported format for extracting metadata!")
}
raw_metadata
}
#' extract mzML header
#'
#' @param mzml_file the mzML file to get the header from
#'
#' @export
get_mzml_header = function(mzml_file){
file_con = file(mzml_file, open = "r")
file_content = vector("character", 200)
is_run = FALSE
i_line = 1
while (!is_run && (i_line < length(file_content))) {
file_content[i_line] = readLines(file_con, n = 1)
is_run = grepl("<run", file_content[i_line], ignore.case = TRUE)
i_line = i_line + 1
}
close(file_con)
file_content = file_content[1:(i_line - 1)]
if (sum(grepl("indexedmzML", file_content)) > 0) {
out_content = c(file_content, "</run>", "</mzML>", "</indexedmzML>")
} else {
out_content = c(file_content, "</run>", "</mzML>")
}
paste(out_content, collapse = "\n")
}
#' get mzML metadata
#'
#' @param mzml_file the mzML file to get metadata from
#'
#' @importFrom XML xmlTreeParse xmlNamespaceDefinitions xmlRoot getNodeSet xmlAttrs xmlChildren xmlToList
#' @export
get_mzml_metadata <- function(mzml_file){
mzml_header = get_mzml_header(mzml_file)
xml_doc <- XML::xmlTreeParse(mzml_header, useInternalNodes = TRUE)
ns <- XML::xmlNamespaceDefinitions(XML::xmlRoot(xml_doc), recursive = TRUE, simplify = TRUE)
missing_name = which(names(ns) %in% "")
names(ns)[missing_name] <- "d1"
mz_metanodes <- XML::getNodeSet(xml_doc, "/d1:mzML", ns)
if (length(mz_metanodes) == 0) {
mz_metanodes <- XML::getNodeSet(xml_doc, "/d1:indexedmzML/d1:mzML", ns)
}
mz_meta <- list()
tmp_attr <- unclass(XML::xmlAttrs(mz_metanodes[[1]]))
attr(tmp_attr, "namespaces") <- NULL
mz_meta[["mzML"]][[".attrs"]] <- tmp_attr
other_nodes_2_get <- c("cvList", "fileDescription",
"referenceableParamGroupList",
"softwareList",
"instrumentConfigurationList",
"dataProcessingList")
other_nodes <- XML::xmlChildren(mz_metanodes[[1]])
other_list <- lapply(other_nodes, XML::xmlToList)
mz_meta <- c(mz_meta, other_list[other_nodes_2_get])
null_meta = purrr::map_lgl(mz_meta, is.null)
mz_meta = mz_meta[!null_meta]
mz_meta[["run"]][[".attrs"]] <- XML::xmlAttrs(mz_metanodes[[1]][["run"]])
mz_meta <- .remove_attrs(mz_meta)
mz_meta_frame <- .to_data_frame(mz_meta)
#mz_meta_frame$run$scanPolarity <- .get_scan_polarity(other_list$run$spectrumList)
mz_meta_frame$run$startTimeStamp <- gsub("T", " ", mz_meta_frame$run$startTimeStamp)
mz_meta_frame
}
#' export metadata to json
#'
#' export the list metadata to a json string
#'
#' @param meta_list a list of metadata
#'
#' @importFrom jsonlite toJSON
#' @export
meta_export_json <- function(meta_list){
jsonlite::toJSON(meta_list, pretty = TRUE, auto_unbox = TRUE)
}
#' transform to data frame
#'
#' @param in_list the list of xml nodes to work on
#'
.to_data_frame <- function(in_list){
if (class(in_list) == "list") {
out_list <- lapply(in_list, .to_data_frame)
} else if (class(in_list) == "character") {
if (!is.null(names(in_list))) {
out_list <- as.data.frame(t(as.matrix(in_list)))
} else {
out_list <- in_list
}
}
#print(out_list)
out_list
}
#' remove attributes
#'
#' removes a list entry called ".attrs" from a list, and makes them first level
#' partners
#'
#' @param in_list the list to work on
#'
.remove_attrs <- function(in_list){
if (class(in_list) == "list") {
out_list <- in_list
list_names <- names(out_list)
if (".attrs" %in% list_names) {
tmp_attrs <- out_list[[".attrs"]]
name_attrs <- names(tmp_attrs)
if (sum(name_attrs %in% list_names) == 0) {
for (i_name in name_attrs) {
out_list[[i_name]] <- tmp_attrs[[i_name]]
}
out_list[[".attrs"]] <- NULL
}
} else {
out_list <- lapply(out_list, .remove_attrs)
}
# still need to check the rest of the pieces of the list!
out_list <- lapply(out_list, .remove_attrs)
} else {
out_list <- in_list
}
out_list
}
#' get_scan_mode
#'
#' takes a list from xmlToList for "run" and looks at whether all scans are positive, negative, or mixed
#'
#' @param spectrum_list the list of spectra
#'
.get_scan_polarity <- function(spectrum_list){
spectrum_list[[".attrs"]] <- NULL
scan_data <- lapply(spectrum_list, function(in_spectrum){
cv_loc <- which(names(in_spectrum) %in% "cvParam")
cv_data <- unlist(in_spectrum[cv_loc])
scan_polarity <- grep("scan", cv_data, value = TRUE)
scan_polarity
})
scan_polarity <- as.character(unique(scan_data))
if ((length(scan_polarity) == 1) && (grepl("positive", scan_polarity))) {
out_polarity <- "positive"
} else if ((length(scan_polarity) == 1) && (grepl("negative", scan_polarity))) {
out_polarity <- "negative"
} else {
out_polarity <- "mixed"
}
out_polarity
}
#' json mzML to data.frame
#'
#' Given a json file or list of lists, return a data.frame with the most important
#' bits of the data.
#'
#' @param in_file the file to read from
#'
#' @export
#'
#' @importFrom purrr map_df
#'
#' @return data.frame
#'
json_mzML_2_df <- function(in_file) {
if (inherits(in_file, "character") && file.exists(in_file)) {
in_list <- jsonlite::fromJSON(in_file, simplifyVector = FALSE)
} else if (inherits(in_file, "list")) {
in_list <- in_file
}
out_df <- purrr::map_df(in_list, function(list_entry){
data.frame(mzml_id = list_entry$mzML$id,
sample_id = gsub(".raw", "", basename(list_entry$file$raw$file)),
instrument_serial = list_entry$run$instrument$serial,
instrument_model = list_entry$run$instrument$model,
start_time = as.POSIXct(list_entry$run$startTimeStamp),
polarity = list_entry$run$scanPolarity,
raw_file = list_entry$file$raw$saved_path,
sha1 = list_entry$file$raw$sha1,
mzml_file = list_entry$file$mzml$saved_path,
original_path = paste(unlist(list_entry$file$raw$original_path), collapse = ";"),
stringsAsFactors = FALSE
)
})
out_df
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.