R/file_metadata.R

Defines functions json_mzML_2_df .get_scan_polarity .remove_attrs .to_data_frame meta_export_json get_mzml_metadata get_mzml_header get_raw_ms_metadata

Documented in get_mzml_header get_mzml_metadata get_raw_ms_metadata .get_scan_polarity json_mzML_2_df meta_export_json .remove_attrs .to_data_frame

#' get raw ms metadata
#'
#' figures out which metadata function to run, and returns back the metadata
#' generated by it.
#'
#' @param in_file the file to use
#'
#' @export
#' @return list
get_raw_ms_metadata <- function(in_file){
  is_mzml <- regexpr("*.mzML", in_file)

  if (is_mzml != -1) {
    raw_metadata <- get_mzml_metadata(in_file)
  } else {
    stop("Unsupported format for extracting metadata!")
  }
  raw_metadata
}

#' extract mzML header
#'
#' @param mzml_file the mzML file to get the header from
#'
#' @export
get_mzml_header = function(mzml_file){
  file_con = file(mzml_file, open = "r")

  file_content = vector("character", 200)

  is_run = FALSE
  i_line = 1
  while (!is_run && (i_line < length(file_content))) {
    file_content[i_line] = readLines(file_con, n = 1)
    is_run = grepl("<run", file_content[i_line], ignore.case = TRUE)
    i_line = i_line + 1
  }
  close(file_con)
  file_content = file_content[1:(i_line - 1)]

  if (sum(grepl("indexedmzML", file_content)) > 0) {
    out_content = c(file_content, "</run>", "</mzML>", "</indexedmzML>")
  } else {
    out_content = c(file_content, "</run>", "</mzML>")
  }
  paste(out_content, collapse = "\n")
}

#' get mzML metadata
#'
#' @param mzml_file the mzML file to get metadata from
#'
#' @importFrom XML xmlTreeParse xmlNamespaceDefinitions xmlRoot getNodeSet xmlAttrs xmlChildren xmlToList
#' @export
get_mzml_metadata <- function(mzml_file){
  mzml_header = get_mzml_header(mzml_file)
  xml_doc <- XML::xmlTreeParse(mzml_header, useInternalNodes = TRUE)
  ns <- XML::xmlNamespaceDefinitions(XML::xmlRoot(xml_doc), recursive = TRUE, simplify = TRUE)
  missing_name = which(names(ns) %in% "")
  names(ns)[missing_name] <- "d1"

  mz_metanodes <- XML::getNodeSet(xml_doc, "/d1:mzML", ns)

  if (length(mz_metanodes) == 0) {
    mz_metanodes <- XML::getNodeSet(xml_doc, "/d1:indexedmzML/d1:mzML", ns)
  }

  mz_meta <- list()
  tmp_attr <- unclass(XML::xmlAttrs(mz_metanodes[[1]]))

  attr(tmp_attr, "namespaces") <- NULL
  mz_meta[["mzML"]][[".attrs"]] <- tmp_attr

  other_nodes_2_get <- c("cvList", "fileDescription",
                         "referenceableParamGroupList",
                         "softwareList",
                         "instrumentConfigurationList",
                         "dataProcessingList")

  other_nodes <- XML::xmlChildren(mz_metanodes[[1]])
  other_list <- lapply(other_nodes, XML::xmlToList)

  mz_meta <- c(mz_meta, other_list[other_nodes_2_get])
  null_meta = purrr::map_lgl(mz_meta, is.null)
  mz_meta = mz_meta[!null_meta]

  mz_meta[["run"]][[".attrs"]] <- XML::xmlAttrs(mz_metanodes[[1]][["run"]])

  mz_meta <- .remove_attrs(mz_meta)

  mz_meta_frame <- .to_data_frame(mz_meta)

  #mz_meta_frame$run$scanPolarity <- .get_scan_polarity(other_list$run$spectrumList)

  mz_meta_frame$run$startTimeStamp <- gsub("T", " ", mz_meta_frame$run$startTimeStamp)

  mz_meta_frame
}

#' export metadata to json
#'
#' export the list metadata to a json string
#'
#' @param meta_list a list of metadata
#'
#' @importFrom jsonlite toJSON
#' @export
meta_export_json <- function(meta_list){
  jsonlite::toJSON(meta_list, pretty = TRUE, auto_unbox = TRUE)
}

#' transform to data frame
#'
#' @param in_list the list of xml nodes to work on
#'
.to_data_frame <- function(in_list){
  if (class(in_list) == "list") {
    out_list <- lapply(in_list, .to_data_frame)
  } else if (class(in_list) == "character") {
    if (!is.null(names(in_list))) {
      out_list <- as.data.frame(t(as.matrix(in_list)))
    } else {
      out_list <- in_list
    }

  }
  #print(out_list)
  out_list
}

#' remove attributes
#'
#' removes a list entry called ".attrs" from a list, and makes them first level
#' partners
#'
#' @param in_list the list to work on
#'
.remove_attrs <- function(in_list){
  if (class(in_list) == "list") {
    out_list <- in_list
    list_names <- names(out_list)

    if (".attrs" %in% list_names) {
      tmp_attrs <- out_list[[".attrs"]]

      name_attrs <- names(tmp_attrs)

      if (sum(name_attrs %in% list_names) == 0) {
        for (i_name in name_attrs) {
          out_list[[i_name]] <- tmp_attrs[[i_name]]
        }
        out_list[[".attrs"]] <- NULL
      }
    } else {
      out_list <- lapply(out_list, .remove_attrs)
    }
    # still need to check the rest of the pieces of the list!
    out_list <- lapply(out_list, .remove_attrs)
  } else {
    out_list <- in_list
  }
  out_list
}

#' get_scan_mode
#'
#' takes a list from xmlToList for "run" and looks at whether all scans are positive, negative, or mixed
#'
#' @param spectrum_list the list of spectra
#'
.get_scan_polarity <- function(spectrum_list){
  spectrum_list[[".attrs"]] <- NULL
  scan_data <- lapply(spectrum_list, function(in_spectrum){
    cv_loc <- which(names(in_spectrum) %in% "cvParam")
    cv_data <- unlist(in_spectrum[cv_loc])
    scan_polarity <- grep("scan", cv_data, value = TRUE)

    scan_polarity
  })

  scan_polarity <- as.character(unique(scan_data))

  if ((length(scan_polarity) == 1) && (grepl("positive", scan_polarity))) {
    out_polarity <- "positive"
  } else if ((length(scan_polarity) == 1) && (grepl("negative", scan_polarity))) {
    out_polarity <- "negative"
  } else {
    out_polarity <- "mixed"
  }
  out_polarity
}

#' json mzML to data.frame
#'
#' Given a json file or list of lists, return a data.frame with the most important
#' bits of the data.
#'
#' @param in_file the file to read from
#'
#' @export
#'
#' @importFrom purrr map_df
#'
#' @return data.frame
#'
json_mzML_2_df <- function(in_file) {
  if (inherits(in_file, "character") && file.exists(in_file)) {
    in_list <- jsonlite::fromJSON(in_file, simplifyVector = FALSE)
  } else if (inherits(in_file, "list")) {
    in_list <- in_file
  }

  out_df <- purrr::map_df(in_list, function(list_entry){
    data.frame(mzml_id = list_entry$mzML$id,
               sample_id = gsub(".raw", "", basename(list_entry$file$raw$file)),
               instrument_serial = list_entry$run$instrument$serial,
               instrument_model = list_entry$run$instrument$model,
               start_time = as.POSIXct(list_entry$run$startTimeStamp),
               polarity = list_entry$run$scanPolarity,
               raw_file = list_entry$file$raw$saved_path,
               sha1 = list_entry$file$raw$sha1,
               mzml_file = list_entry$file$mzml$saved_path,
               original_path = paste(unlist(list_entry$file$raw$original_path), collapse = ";"),
               stringsAsFactors = FALSE
    )
  })
  out_df
}
MoseleyBioinformaticsLab/FTMS.peakCharacterization documentation built on April 27, 2022, 3:32 a.m.