R/read_msdata.R

Defines functions read_MRMkit_results read_MRMkit_raw_area_CSV read_table_wide read_long_table_csv read_MassHunterCSV_wide read_MassHunterCSV

Documented in read_long_table_csv read_MassHunterCSV read_MassHunterCSV_wide read_MRMkit_raw_area_CSV read_MRMkit_results read_table_wide

#' Read and convert an Agilent MassHunter Quant CSV result file
#'
#' @param filename File path of MassHunter Quant CSV file
#' @param silent Suppress messages
#'
#' @return A tibble in the long format
#' @export
#'
#' @importFrom stats na.omit setNames
#' @importFrom utils tail
#' @importFrom glue glue
#' @importFrom tidyselect vars_select_helpers
#'

read_MassHunterCSV <- function(filename, silent = FALSE) {

  if(!silent) print(glue::glue("Reading [{basename(filename)}] ..."))
  # if (shiny::isRunning())
  #   incProgress(1 / length(n_datafiles), detail = paste0(", basename(file)))
  #
  # Read Agilent MassHunter Quant Export file (CSV)
  suppressWarnings(
    datWide <-
      readr::read_csv(
        file = filename,
        col_names = FALSE,
        na = c("#N/A", "NULL"),
        trim_ws = TRUE,
        col_types = readr::cols(.default = "c"),
        locale = readr::locale(encoding = 'ISO-8859-1'), num_threads = 4,progress = TRUE
      ))
  warnings_datWide = readr::problems(datWide)



  # Remove text that is not required and remove dot chars that interfere later with the conversion wide to long
  # ToDo: Convert to tidyverse functions
  datWide[1, ] <-
    lapply(datWide[1, ], function(y)
      gsub(" Results", "", y))
  datWide[1, ] <-
    lapply(datWide[1, ], function(y)
      gsub(" Method", "", y))
  datWide[2, ] <- lapply(datWide[2, ], function(y)
    gsub("\\. ", "", y))
  datWide[2, ] <- lapply(datWide[2, ], function(y)
    gsub("\\.", "", y))
  datWide[2, ] <- lapply(datWide[2, ], function(y)
    gsub("/", "", y))


  datWide <- datWide |> dplyr::add_row(.after = 1)
  datWide[1, ] <- tibble::tibble(A = datWide[1,] |> unlist() |> dplyr::na_if("")) |>  tidyr::fill("A") |> unlist() |> as.list()

  datWide[1, ] <- replace(datWide[1,], stringr::str_detect(string = datWide[1,] , pattern = "AA"),"")
  datWide[2, ] <- replace(datWide[1,], !stringr::str_detect(string = datWide[1,] , pattern = "AA"),"")
  datWide[1, ] <- tibble::tibble(A = datWide[1,] |> unlist() |> dplyr::na_if("")) |>  tidyr::fill("A") |> unlist() |> as.list()

  # Concatenate rows
  datWide[1, ] <- paste(datWide[1, ], datWide[2, ], sep = " ") |> stringr::str_squish() |> as.list()
  datWide <- datWide[-2, ]


  # Fill in transition name in empty columns (different parameters of the same transition)
  # Dealing with exported "Qualifier" results: Adds the corresponding quantifier name (the first column of the group) with the Tag "QUAL" and the transition in front e.g. "Sph d18:0 [QUAL 302.3>266.2]"


  # Remove columns with no column names, as they are undefined (seems to be only Outlier Summary and Quantitation Message Summary)
  datWide <- datWide |> dplyr::select(-tidyselect::where(~ .x[2] == ""))

  # Concatenate rows containing parameters + transitions to the form parameter.sample and parameter.transition headers. This will later be converted with reshape()
  colnames(datWide) <- paste(datWide[2, ], datWide[1, ], sep = "\t")
  datWide <- datWide[-1:-2, ]


  # Rename some known column header names and remove columns that are not needed or not informative.

  new_colnames = c(
    "DataFileName" = "Data File\tSample",
    "SampleName" = "Name\tSample",
    "AcqTimeStamp" = "AcqDate-Time\tSample",
    "SampleType" = "Type\tSample",
    "VialPosition" = "Pos\tSample",
    "InjVolume" = "Vol\tSample",
    "Dilution" = "Dil\tSample",
    "SampleGroup" = "Sample Group\tSample",
    "InstrumentName" = "Instrument Name\tSample",
    "InstrumentType" = "Instrument Type\tSample",
    "AcqMethodFile" = "Acq. Method File\tSample",
    "AcqMethodPath" = "Acq. Method Path\tSample",
    "DataPath" = "Data Path\tSample",
    "QuantitationMessage" = "Quantitation Message\tSample",
    "NameCompound" = "Name\tCompound"
  )

  datWide <- datWide %>% dplyr::rename(dplyr::any_of(new_colnames))

  if("QuantitationMessage" %in% names(datWide))  stop("Field 'Quantitation Message' currently not supported: Please re-export your data in MH without this field.")
  if("NameCompound" %in% names(datWide)) stop("Compound table format is currently not supported. Please re-export your data in MH with compounds as columns.")
  if(! "DataFileName" %in% names(datWide)) stop("Unknown format or corrupt data file. Please try re-export from MH.")
  if (nrow(warnings_datWide)> 0) stop("Unknown format, or data file is corrupt. Please try re-export from MH.")
  # Remove ".Sample" from remaining sample description headers and remove known unused columns
  datWide <-
    datWide[, !(names(datWide) %in% c("NA\tSample", "Level\tSample", "Sample"))]
  names(datWide) <- sub("\\\tSample", "", names(datWide))

  keep.cols <- names(datWide) %in% c("")
  datWide <- datWide [!keep.cols]



  # Transform wide to long (tidy) table format
  # ------------------------------------------



  datWide <- datWide %>%
    dplyr::mutate(
      dplyr::across(.cols = dplyr::any_of(c("AcqTimeStamp")),
                    .fns = \(x) lubridate::parse_date_time(x, c("mdy_HM", "dmy_HM", "ymd_HM", "ydm_HM", "mdy_HM %p", "dmy_HM %p", "ymd_HM %p", "ydm_HM %p"))),
      dplyr::across(.cols = dplyr::any_of(c("SampleName")),
                    .fns = stringr::str_squish)
    )

  # obtain list with column names of the columns that define transition values (e.g. "RT Cer d16:0/18:0"). Delimuter is currently tab (\t)
  param_transition_names <-
    colnames(datWide[, -1:-tail(grep("\\\t", colnames(datWide), invert =  TRUE), 1)])


  # Obtain long table of all param-transition combinations, split param and compund name and then spread values of different param as columns
  datLong <- datWide %>%
    dplyr::mutate(RUN_ID = dplyr::row_number(), .before = 1) |>
    tidyr::pivot_longer(cols = tidyselect::all_of(param_transition_names), names_pattern = "(.*)\t(.*)$", names_to = c("Param", "FEATURE_NAME")) %>%
    tidyr::pivot_wider(names_from = "Param" ,values_from = "value")

  # Convert types of knows parameters and fields in the data set
  # ------------------------------------------------------------

  datLong <- datLong %>%
    dplyr::mutate(
      dplyr::across(.cols = dplyr::any_of(c("RT", "Area", "Height","FWHM","Width","SN","IntStart","IntEnd",
                                            "Symmetry","InjVolume", "Precursor Ion", "Product Ion", "Collision Energy")),
                    .fns = \(x) as.numeric(stringr::str_replace(x, ",", "."))),
      dplyr::across(.cols = dplyr::any_of(c("MI")),
                    .fns = as.logical),
      dplyr::across(.cols = dplyr::any_of(c("Ion Polarity")),
                    .fns = as.factor)
    )

  new_colnames <- c(DataName = "SampleName", PRECURSOR_MZ = "Precursor Ion", PRODUCT_MZ = "Product Ion",
                    CollisionEnergy = "Collision Energy", IonPolarity = "Ion Polarity")

  datLong <- datLong %>%
    dplyr::rename(dplyr::any_of(new_colnames)) %>%
    dplyr::mutate(ANALYSIS_ID = stringr::str_replace(.data$DataFileName, "\\.d", ""), .before = "DataName") %>%
    dplyr::mutate(dplyr::across(tidyselect::where(is.character), stringr::str_squish))

  if(!silent) {
    print(glue::glue("Imported {length(unique(datLong$DataFileName))} samples with {length(unique(datLong$FEATURE_NAME))} transitions. \n"))
  }



  datLong
}


#' Reads a wide CSV file with Feature Intensities
#'
#' @param file File name and path of the MassHunter Quant CSV file
#' @param field Peak parameter (e.g. Area, RT)
#' @param silent Suppress messages
#'
#' @return A tibble in the long format
#' @export
#'
#' @importFrom stats na.omit setNames
#' @importFrom utils tail
#' @importFrom tidyselect vars_select_helpers

#'
#' @examples
#' library(SLINGtools)
#'
#' data_file_path <- system.file("extdata",
#'   "Testdata_Lipidomics_MHQuant_Detailed.csv", package = "SLINGtools")
#' d_area <- read_MassHunterCSV_wide(data_file_path, field = "Area")
#' d_area
#'
#'
read_MassHunterCSV_wide <- function(file, field, silent = FALSE) {

  sample_def_cols = c(
    "DataFileName",
    "SampleName",
    "AcqTimeStamp",
    "SampleType",
    "VialPosition",
    "InjVolume",
    "SampleType"
  )
  d <- read_MassHunterCSV(file, silent) %>%
    dplyr::select(tidyselect::any_of(sample_def_cols), "FEATURE_NAME", {{field}})

  d %>% tidyr::pivot_wider(names_from = "FEATURE_NAME", values_from = {{field}})
}



#' Reads a long CSV file with Feature Intensities
#'
#' @param file File name and path of a plain long-format CSV file
#' @param field Peak parameter (e.g. Area, RT)
#' @param silent Suppress messages
#'
#' @return A tibble in the long format
#' @export
#'
#'
#'
#'
read_long_table_csv <- function(file, field, silent = FALSE) {

  sample_def_cols = c(
    "DataFileName",
    "SampleName",
    "AcqTimeStamp",
    "SampleType",
    "VialPosition",
    "InjVolume",
    "SampleType",
    "RUN_ID",
    "ANALYSIS_ID",
    "FEATURE_NAME",
    "Area",
    "RT",
    "FWHM",
    "SN"
  )

  d <- readr::read_csv(file, col_names = TRUE, trim_ws = TRUE, progress = TRUE) %>%
    dplyr::select(tidyselect::any_of(sample_def_cols), "FEATURE_NAME", Intensity = {{field}})

  d
}


#' Reads a wide CSV or XLSX sheet with Feature Values
#'
#' @param data MidarExperiment()
#' @param file File name and path of a plain wide-format CSV or XLS file
#' @param field Peak parameter (e.g. Area, RT)
#' @param sheet Sheet name
#' @param silent Suppress messages
#' @importFrom rlang :=
#'
#' @return A tibble in the long format
#' @export
#'
read_table_wide <- function(data, file, field, sheet = "", silent = FALSE) {

  if (!field %in% c("Intensity", "normIntensity", "Concentration", "RT", "FWHM")) stop("Field can only be one of: Intensity, normIntensity, Concentration or RT")

  var_field <- rlang::ensym(field)

  ext <- fs::path_ext(file)
  if(ext == "csv")
    d <- readr::read_csv(file, col_names = TRUE, trim_ws = TRUE, progress = TRUE, na = c("n/a", "N/A"))
  else if(ext == "xls" | ext == "xlsx"){
    if(sheet == "") stop("Please define sheet name via the `sheet` parameter")
    d <- readxl::read_excel(path = file, sheet = sheet, trim_ws = TRUE, progress = TRUE, na = c("n/a", "N/A"))
  }
  else
    stop("Invalid file format. Only CSV, XLS and XLSX supported.")

  d <- d |> dplyr::mutate(RUN_ID = row_number(),.before = 1)

  d <- d |> tidyr::pivot_longer(cols = -1:-2, names_to = "FEATURE_NAME" , values_to = field) %>%
    dplyr::rename(ANALYSIS_ID = 2) %>%
    mutate({{field}} := as.numeric(!!var_field))

  data@dataset_orig <- d
  data

}




#' Read and convert an Agilent MassHunter Quant CSV result file
#'
#' @param filename File path of MassHunter Quant CSV file
#' @param use_mrmkit_normdata use raw or MRMkit-normalized data
#' @param silent Suppress messages
#'
#' @return A tibble in the long format
#' @export
#'
#' @importFrom stats na.omit setNames
#' @importFrom utils tail
#' @importFrom glue glue
#' @importFrom readr read_csv
#' @importFrom tidyselect vars_select_helpers

#'
#' @examples
#' library(SLINGtools)
#'
#' data_file_path <- system.file("extdata",
#'   "Testdata_Lipidomics_MHQuant_Detailed.csv", package = "SLINGtools")
#' d <- read_MassHunterCSV(data_file_path)
#' d
#'
#'
read_MRMkit_raw_area_CSV<- function(filename, use_mrmkit_normdata = FALSE, silent = FALSE) {

  sep <- if_else(fs::path_ext(filename) == "csv", ",", "\t")

  d_mrmkit_raw <- readr::read_delim(filename, delim = sep, col_types = readr::cols(.default = "c"),
                                       col_names = TRUE, trim_ws = TRUE)

  # Extract MRMkit's "QC" info
  d_mrmkit_featureinfo <- d_mrmkit_raw %>%
    dplyr::filter(.data$name %in% c("Q1", "Q3", "RT", "D-ratio")) %>%
    tidyr::pivot_longer(-.data$name, names_to = "FEATURE_NAME", values_to = "value") %>%
    tidyr::pivot_wider(names_from = "name" ,values_from = "value") %>%
    dplyr::mutate(FEATURE_NAME = dplyr::if_else(stringr::str_detect(.data$FEATURE_NAME, "RT"), stringr::str_squish(stringr::str_extract(.data$FEATURE_NAME, ".*(?= RT)")),.data$FEATURE_NAME)) %>%
    dplyr::rename(PRECURSOR_MZ = .data$Q1, PRODUCT_MZ = .data$Q3) %>%
    dplyr::mutate(dplyr::across(dplyr::any_of(c("PRECURSOR_MZ", "PRODUCT_MZ", "RT")), as.numeric))


  d_mrmkit_data <- d_mrmkit_raw %>%
    dplyr::filter(!.data$name %in% c("Q1", "Q3", "RT", "D-ratio")) %>%
    dplyr::mutate(RUN_ID = dplyr::row_number(), .before = .data$name) %>%
    tidyr::pivot_longer(-.data$RUN_ID:-.data$name, names_to = "FEATURE_NAME", values_to = "Intensity") %>%
    dplyr::rename(ANALYSIS_ID  = .data$name) %>%
    dplyr::mutate(ANALYSIS_ID = stringr::str_squish(stringr::str_replace(.data$ANALYSIS_ID, stringr::fixed(".mzML"), ""))) %>%
    dplyr::mutate(FEATURE_NAME = dplyr::if_else(stringr::str_detect(.data$FEATURE_NAME, "RT"), stringr::str_squish(stringr::str_extract(.data$FEATURE_NAME, ".*(?= RT)")),.data$FEATURE_NAME)) %>%
    dplyr::mutate(dplyr::across(.data$Intensity, as.numeric)) %>%
    dplyr::left_join(d_mrmkit_featureinfo, by = "FEATURE_NAME") %>%
    dplyr::relocate(.data$Intensity, .after = dplyr::last_col()) %>%
    dplyr::mutate(dplyr::across(tidyselect::where(is.character), stringr::str_squish))

  if(use_mrmkit_normdata)
    d_mrmkit_data |> filter(str_detect(.data$ANALYSIS_ID, "^norm_"))
  else
    d_mrmkit_data |> filter(!str_detect(.data$ANALYSIS_ID, "^norm_"))

}



#' Reads a long CSV file with Feature Intensities
#'
#' @param data MidarExperiment object
#' @param raw_area_file file path of MRMkit csv output with raw peak areas
#' @param final_results_file file path of MRMkit csv output with final processed normalized peak areas
#' @param silent No comments printed
#' @return A tibble in the long format
#' @export
#'
#'
#'
#'
read_MRMkit_results <- function(data, raw_area_file, final_results_file, silent = FALSE) {

  sample_def_cols = c(
    "DataFileName",
    "SampleName",
    "AcqTimeStamp",
    "SampleType",
    "VialPosition",
    "InjVolume",
    "SampleType",
    "RUN_ID",
    "ANALYSIS_ID",
    "FEATURE_NAME",
    "Area",
    "RT",
    "FWHM",
    "SN"
  )

  d_raw <- read_MRMkit_raw_area_CSV(raw_area_file)
  data@dataset_orig <- d_raw

  sep <- if_else(fs::path_ext(final_results_file) == "csv", ",", "\t")

  if (final_results_file !=""){
    d_results <- readr::read_delim(final_results_file, delim = sep, col_names = TRUE, trim_ws = TRUE, progress = TRUE, show_col_types = FALSE) %>%
      dplyr::filter(!is.na(.data$type))%>%
      dplyr::mutate(filename = str_remove(.data$filename, "\\.mzML")) %>%
      dplyr::rename(ANALYSIS_ID = .data$filename) %>%
      #dplyr::mutate(RUN_ID = row_number(), .before = ANALYSIS_ID) %>%
      dplyr::select(-tidyselect::any_of(c("batch", "type"))) %>%
      tidyr::pivot_longer(-.data$ANALYSIS_ID, names_to = "FEATURE_NAME", values_to = "normIntensity")

    data@dataset <- data@dataset_orig %>% left_join(d_results, by=c("ANALYSIS_ID", "FEATURE_NAME"))

  }
  data
}
SLINGhub/SLINGtools documentation built on April 20, 2023, 5:09 a.m.