library(xml2) mzml_file <- "/home/rmflight/Music/big_data_notbackup/CESB_rawdata/replication_study/cancer_positive_mode/mzml_full_conversion/SBC110871exopos.mzML"
xml_doc <- read_xml(mzml_file) ns <- xml_ns(xml_doc)
mz_data <- xml_find_first(xml_doc, "/d1:indexedmzML/d1:mzML", ns) write_mz_json <- function(mzml_file){ # function for transforming attributes to a json string xml_attrs_2_json <- function(node_attrs){ node_strs <- lapply(names(node_attrs), function(in_node){ paste0('"', in_node, '": ', '"', node_attrs[in_node], '"') }) node_strs <- paste(node_strs, collapse = ",\n") node_strs } # start processing the document mz_doc <- read_xml(mzml_file) mz_ns <- xml_ns(mz_doc) mz_path <- c("indexedmzML", "mzML") which_namespace <- which.min(which(grepl("psi", mz_ns))) ns_pre <- names(mz_ns)[which_namespace] path_with_pre <- paste0("/", ns_pre, ":", mz_path) path_collapsed <- paste(path_with_pre, collapse = "") mz_node <- xml_find_first(mz_doc, path_collapsed, mz_ns) # now start processing the children mz_children <- xml_children(mz_node) in_nodes <- mz_children[1:6] # delete this line after debugging # function to turn the other nodes into json json_other_nodes <- function(json_string, in_nodes, print_node_name = TRUE){ node_names <- xml_name(in_nodes) node_class <- class(in_nodes) if (node_class == "xml_nodeset") { subset_type <- "deep" node_length <- length(in_nodes) length_match <- length(unique(node_names)) == node_length } else if (node_class == "xml_node") { subset_type <- "shallow" length_match <- TRUE node_length <- 1 } # if there is no repetition, we create an object # otherwise we will start an array # careful here, length of node is zero, subsetting node is bad if (length_match) { for (i_node in seq_len(node_length)) { if (subset_type == "deep") { use_node <- in_nodes[[i_node]] } else if (subset_type == "shallow") { use_node <- in_nodes } if (print_node_name) { print(xml_name(use_node)) json_string <- paste0(json_string, '{\n"', xml_name(use_node), '": ') } else { json_string <- paste0(json_string, '{\n') } use_attrs <- xml_attrs(use_node) if (length(use_attrs) > 0) { json_string <- paste0(json_string, xml_attrs_2_json(use_attrs)) } use_children <- xml_children(use_node) if (length(use_children) > 0) { json_string <- json_other_nodes(json_string, use_children) } else { json_string <- paste0(json_string, '},\n') } } } else { uniq_nodes <- unique(node_names) n_name <- length(uniq_nodes) if (n_name == 1) { print(uniq_nodes) json_string <- paste0(json_string, '\n"', uniq_nodes, '": [') for (j_node in seq_along(in_nodes)) { json_string <- json_other_nodes(json_string, in_nodes[[j_node]], FALSE) } json_string <- paste0(json_string, ']\n') } else { xml_name(in_nodes) <- make.names(node_names, unique = TRUE) json_string <- json_other_nodes(json_string, in_nodes) } } json_string } } sample_id <- xml_attr(mz_data, "id", ns)
cv_nodes <- xml_children(xml_find_first(xml_doc, "/d1:indexedmzML/d1:mzML/d1:cvList", ns))
file_description <- xml_children(xml_find_first(xml_doc, "/d1:indexedmzML/d1:mzML/d1:fileDescription", ns))
ref_param <- xml_children(xml_find_first(xml_doc, "/d1:indexedmzML/d1:mzML/d1:referenceableParamGroupList", ns))
software_list <- xml_children(xml_find_first(xml_doc, "/d1:indexedmzML/d1:mzML/d1:softwareList", ns))
instrument_configuration_list <- xml_children( xml_find_first(xml_doc, "/d1:indexedmzML/d1:mzML/d1:instrumentConfigurationList", ns))
data_processing <- xml_children( xml_find_first(xml_doc, "/d1:indexedmzML/d1:mzML/d1:dataProcessingList", ns) )
run_data <- xml_find_first(xml_doc, "/d1:indexedmzML/d1:mzML/d1:run", ns) run_data2 <- list(name = xml_name(run_data), attrs = xml_attrs(run_data))
children_path_list <- list(cv = c("indexedmzML", "mzML", "cvList"), file_description = c("indexedmzML", "mzML", "fileDescription"), ref_param = c("indexedmzML", "mzML", "referenceableParamGroupList"), software_list = c("indexedmzML", "mzML", "softwareList"), instrument_configuration_list = c("indexedmzML", "mzML", "instrumentConfigurationList"), data_processing = c("indexedmzML", "mzML", "dataProcessingList")) node_only_path_list <- list(mz_data = c("indexedmzML", "mzML"), run_data = c("indexedmzML", "mzML", "run")) spectrum_cvparam_path <- c("indexedmzML", "mzML", "run", "spectrumList", "spectrum", "cvParam")
namespace_obj <- ns namespace_str <- "psi" path_def <- children_path_list[["cv"]] make_xml_path <- function(path_def, namespace_obj, namespace_str = "psi"){ which_namespace <- which.min(which(grepl(namespace_str, namespace_obj))) ns_pre <- names(namespace_obj)[which_namespace] path_with_pre <- paste0("/", ns_pre, ":", path_def) path_collapsed <- paste(path_with_pre, collapse = "") path_collapsed }
child_paths <- lapply(children_path_list, make_xml_path, ns) child_nodes <- lapply(child_paths, function(in_path){ xml_children(xml_find_first(xml_doc, in_path, ns)) })
get_nested_children <- function(xml_obj){ print(xml_name(xml_obj)) obj_children <- xml_children(xml_obj) out_obj <- vector("list", 3) out_obj[[1]] <- xml_name(xml_obj) out_obj[[2]] <- xml_attrs(xml_obj) if (length(obj_children) > 0){ out_obj[[3]] <- lapply(obj_children, get_nested_children) } out_obj } all_children <- lapply(child_nodes, function(x){ lapply(x, get_nested_children) })
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.