library(xml2)

mzml_file <- "/home/rmflight/Music/big_data_notbackup/CESB_rawdata/replication_study/cancer_positive_mode/mzml_full_conversion/SBC110871exopos.mzML"
xml_doc <- read_xml(mzml_file)
ns <- xml_ns(xml_doc)
mz_data <- xml_find_first(xml_doc, "/d1:indexedmzML/d1:mzML", ns)

write_mz_json <- function(mzml_file){
  # function for transforming attributes to a json string
  xml_attrs_2_json <- function(node_attrs){
    node_strs <- lapply(names(node_attrs), function(in_node){
      paste0('"', in_node, '": ', '"', node_attrs[in_node], '"')
    })
    node_strs <- paste(node_strs, collapse = ",\n")
    node_strs
  }

  # start processing the document
  mz_doc <- read_xml(mzml_file)
  mz_ns <- xml_ns(mz_doc)

  mz_path <- c("indexedmzML", "mzML")
  which_namespace <- which.min(which(grepl("psi", mz_ns)))
  ns_pre <- names(mz_ns)[which_namespace]

  path_with_pre <- paste0("/", ns_pre, ":", mz_path)
  path_collapsed <- paste(path_with_pre, collapse = "")

  mz_node <- xml_find_first(mz_doc, path_collapsed, mz_ns)

    # now start processing the children
  mz_children <- xml_children(mz_node)

  in_nodes <- mz_children[1:6] # delete this line after debugging

  # function to turn the other nodes into json
  json_other_nodes <- function(json_string, in_nodes, print_node_name = TRUE){
    node_names <- xml_name(in_nodes)

    node_class <- class(in_nodes)

    if (node_class == "xml_nodeset") {
      subset_type <- "deep"
      node_length <- length(in_nodes)
      length_match <- length(unique(node_names)) == node_length
    } else if (node_class == "xml_node") {
      subset_type <- "shallow"
      length_match <- TRUE
      node_length <- 1
    }

    # if there is no repetition, we create an object
    # otherwise we will start an array
    # careful here, length of node is zero, subsetting node is bad
    if (length_match) {
      for (i_node in seq_len(node_length)) {
        if (subset_type == "deep") {
          use_node <- in_nodes[[i_node]]
        } else if (subset_type == "shallow") {
          use_node <- in_nodes
        }

        if (print_node_name) {
          print(xml_name(use_node))
          json_string <- paste0(json_string, '{\n"', xml_name(use_node), '": ')
        } else {
          json_string <- paste0(json_string, '{\n')
        }

        use_attrs <- xml_attrs(use_node)
        if (length(use_attrs) > 0) {
          json_string <- paste0(json_string, xml_attrs_2_json(use_attrs))
        }
        use_children <- xml_children(use_node)
        if (length(use_children) > 0) {
          json_string <- json_other_nodes(json_string, use_children)
        } else {
          json_string <- paste0(json_string, '},\n')
        }
      }
    } else {
      uniq_nodes <- unique(node_names)
      n_name <- length(uniq_nodes)
      if (n_name == 1) {
        print(uniq_nodes)
        json_string <- paste0(json_string, '\n"', uniq_nodes, '": [')
        for (j_node in seq_along(in_nodes)) {
          json_string <- json_other_nodes(json_string, in_nodes[[j_node]], FALSE)
        }
        json_string <- paste0(json_string, ']\n')
      } else {
        xml_name(in_nodes) <- make.names(node_names, unique = TRUE)
        json_string <- json_other_nodes(json_string, in_nodes)
      }
    }
    json_string
  }

}

sample_id <- xml_attr(mz_data, "id", ns)
cv_nodes <- xml_children(xml_find_first(xml_doc, "/d1:indexedmzML/d1:mzML/d1:cvList", ns))
file_description <- xml_children(xml_find_first(xml_doc, "/d1:indexedmzML/d1:mzML/d1:fileDescription", ns))
ref_param <- xml_children(xml_find_first(xml_doc, "/d1:indexedmzML/d1:mzML/d1:referenceableParamGroupList", ns))
software_list <- xml_children(xml_find_first(xml_doc, "/d1:indexedmzML/d1:mzML/d1:softwareList", ns))
instrument_configuration_list <- xml_children(
  xml_find_first(xml_doc, "/d1:indexedmzML/d1:mzML/d1:instrumentConfigurationList", ns))
data_processing <- xml_children(
  xml_find_first(xml_doc, "/d1:indexedmzML/d1:mzML/d1:dataProcessingList", ns)
)
run_data <- xml_find_first(xml_doc, "/d1:indexedmzML/d1:mzML/d1:run", ns)
run_data2 <- list(name = xml_name(run_data),
                  attrs = xml_attrs(run_data))
children_path_list <- list(cv = c("indexedmzML", "mzML", "cvList"),
                           file_description = c("indexedmzML", "mzML", "fileDescription"),
                           ref_param = c("indexedmzML", "mzML", "referenceableParamGroupList"),
                           software_list = c("indexedmzML", "mzML", "softwareList"),
                           instrument_configuration_list = c("indexedmzML", "mzML",
                                                             "instrumentConfigurationList"),
                           data_processing = c("indexedmzML", "mzML", "dataProcessingList"))

node_only_path_list <- list(mz_data = c("indexedmzML", "mzML"),
                            run_data = c("indexedmzML", "mzML", "run"))

spectrum_cvparam_path <- c("indexedmzML", "mzML", "run", "spectrumList", "spectrum", "cvParam")
namespace_obj <- ns
namespace_str <- "psi"
path_def <- children_path_list[["cv"]]
make_xml_path <- function(path_def, namespace_obj, namespace_str = "psi"){
  which_namespace <- which.min(which(grepl(namespace_str, namespace_obj)))
  ns_pre <- names(namespace_obj)[which_namespace]

  path_with_pre <- paste0("/", ns_pre, ":", path_def)
  path_collapsed <- paste(path_with_pre, collapse = "")

  path_collapsed
}
child_paths <- lapply(children_path_list, make_xml_path, ns)
child_nodes <- lapply(child_paths, function(in_path){
  xml_children(xml_find_first(xml_doc, in_path, ns))
})
get_nested_children <- function(xml_obj){
  print(xml_name(xml_obj))
  obj_children <- xml_children(xml_obj)
  out_obj <- vector("list", 3)
  out_obj[[1]] <- xml_name(xml_obj)
  out_obj[[2]] <- xml_attrs(xml_obj)
  if (length(obj_children) > 0){
    out_obj[[3]] <- lapply(obj_children, get_nested_children)
  } 
  out_obj
}


all_children <- lapply(child_nodes, function(x){
  lapply(x, get_nested_children)
})


MoseleyBioinformaticsLab/FTMS.peakCharacterization documentation built on April 27, 2022, 3:32 a.m.