# Global knitr chunk options for this vignette: 8 x 8 figures saved under
# 'figures/temp/1_post_processing/', and warnings left out of the output
knitr::opts_chunk$set(fig.width = 8,
                      fig.height = 8,
                      fig.path = 'figures/temp/1_post_processing/',
                      warning = FALSE)

This vignette describes the post-processing of the proteomics datasets (mainly re-formatting) and of the metabolomics datasets (normalization, including filtering based on quality metrics and signal drift correction, followed by log2 transformation; see the 'Normalization of the dataMatrix' section for details). The input datasets are read from the '1_processed' repository, and the post-processed datasets are exported into the '2_post_processed' repository.

Note: The MultiDataSet/ExpressionSet Bioconductor framework is used throughout this vignette.

Mice IDs

# Read the content of the processed directory as a single multi-dataset
# object, without producing a report
processed.mset <- phenomis::reading(
  ProMetIS::processed_dir.c(),
  output.c = "set",
  report.c = "none"
)

# The sample metadata of the 'preclinical' set is used below as the reference
# table of mice (its row names serve as the mouse identifiers); the 'id'
# column is removed from it
preclin.eset <- processed.mset[["preclinical"]]
mice_id.df <- Biobase::pData(preclin.eset)
mice_id.df[["id"]] <- NULL

Proteomics

Set names and input files

# For each proteomics set, locate the Excel workbook holding its tables: the
# '.xlsx' files of the set's sub-directory in the processed directory, minus
# those whose file name contains 'ProMetIS' or 'prometis'.
# The extension pattern is escaped and anchored ("\\.xlsx$"): as a regular
# expression, the former ".xlsx" matched any character followed by 'xlsx'
# anywhere in the name (e.g. 'notes_xlsx.txt').
# 'FUN.VALUE = character(1)' makes the call fail unless exactly one file
# remains per set. The result is indexed by set name in the following chunks.
proteo_files.vc <- vapply(ProMetIS::proteo_sets.vc(),
                          function(set.c) {
                            files.vc <- list.files(file.path(ProMetIS::processed_dir.c(), set.c),
                                                   pattern = "\\.xlsx$", full.names = TRUE)
                            files.vc[!grepl("(ProMetIS|prometis)", basename(files.vc))]
                          }, FUN.VALUE = character(1))

# Empty container to which one ExpressionSet per proteomics set is added below
proteo.mset <- MultiDataSet::createMultiDataSet()

MultiDataSet containing the data matrices

# Build one ExpressionSet per proteomics set from the first sheet of its
# workbook and store it in 'proteo.mset' under the set name
for (set.c in ProMetIS::proteo_sets.vc()) {

  # dataMatrix: the first column of the sheet provides the row names and is
  # then dropped from the table
  sheet1.df <- as.data.frame(
    readxl::read_excel(proteo_files.vc[set.c], sheet = 1),
    stringsAsFactors = FALSE
  )
  rownames(sheet1.df) <- sheet1.df[, 1]
  sheet1.df[, 1] <- NULL

  # the remaining columns are converted to a numeric matrix
  data.mn <- as.matrix(sheet1.df)
  rm(sheet1.df)
  mode(data.mn) <- "numeric"

  # ExpressionSet titled with the set name
  eset <- Biobase::ExpressionSet(
    assayData = data.mn,
    experimentData = Biobase::MIAME(title = set.c)
  )
  stopifnot(methods::validObject(eset))

  # (over)writing the entry of this set in the MultiDataSet
  proteo.mset <- MultiDataSet::add_eset(
    proteo.mset,
    eset,
    dataset.type = set.c,
    GRanges = NA,
    overwrite = TRUE,
    warnings = FALSE
  )
}

sampleMetadata

# Attach the sample metadata (second sheet of the workbook) to each
# proteomics ExpressionSet
for (set.c in ProMetIS::proteo_sets.vc()) {

  eset <- proteo.mset[[set.c]]
  sample_names.vc <- Biobase::sampleNames(eset)

  # row names: first column of the sheet, with every '.' replaced by '_'
  proteo_pda.df <- as.data.frame(
    readxl::read_excel(proteo_files.vc[set.c], sheet = 2),
    stringsAsFactors = FALSE
  )
  rownames(proteo_pda.df) <- gsub(".", "_", proteo_pda.df[, 1], fixed = TRUE)

  # liver only: the metadata rows and the samples of the data matrix must be
  # the same set of names (whatever their order)
  if (set.c == "proteomics_liver") {
    stopifnot(identical(sort(rownames(proteo_pda.df)),
                        sort(sample_names.vc)))
  }

  # rows are selected and ordered according to the samples of the data matrix
  # NOTE(review): no explicit name check is performed for the other sets;
  # presumably a mismatch is caught by the validity check below - confirm
  proteo_pda.df <- proteo_pda.df[sample_names.vc, ]

  Biobase::pData(eset) <- proteo_pda.df
  stopifnot(methods::validObject(eset))

  # (over)writing the entry of this set in the MultiDataSet
  proteo.mset <- MultiDataSet::add_eset(
    proteo.mset,
    eset,
    dataset.type = set.c,
    GRanges = NA,
    overwrite = TRUE,
    warnings = FALSE
  )
}

variableMetadata

# Attach the variable metadata (third sheet of the workbook) to each
# proteomics ExpressionSet, and add a 'uniprot_id' column
for (set.c in ProMetIS::proteo_sets.vc()) {

  eset <- proteo.mset[[set.c]]

  # row names: first column of the sheet; they must be identical (names and
  # order) to the feature names of the data matrix
  proteo_fda.df <- as.data.frame(readxl::read_excel(proteo_files.vc[set.c],
                                                    sheet = 3),
                                 stringsAsFactors = FALSE)
  rownames(proteo_fda.df) <- proteo_fda.df[, 1]
  stopifnot(identical(rownames(proteo_fda.df),
                      Biobase::featureNames(eset)))

  # 'uniprot_id': second '|'-separated field of the accession (NA when the
  # accession contains no '|'); presumably accessions follow the
  # 'db|ID|entry' layout - confirm.
  # 'vapply' guarantees a character vector whatever the input, whereas
  # 'sapply' returns a list on empty input
  proteo_fda.df[, "uniprot_id"] <- vapply(proteo_fda.df[, "accession"],
                                          function(access.c)
                                            unlist(strsplit(access.c,
                                                            split = "|",
                                                            fixed = TRUE))[2],
                                          FUN.VALUE = character(1),
                                          USE.NAMES = FALSE)

  Biobase::fData(eset) <- proteo_fda.df
  stopifnot(methods::validObject(eset))

  # (over)writing the entry of this set in the MultiDataSet
  proteo.mset <- MultiDataSet::add_eset(proteo.mset,
                                        eset,
                                        dataset.type = set.c,
                                        GRanges = NA,
                                        overwrite = TRUE,
                                        warnings = FALSE)
}

Renaming features and re-ordering samples

# For each proteomics set: rename the features, discard the pools (liver),
# order the samples by number, and rename the samples with the row names of
# 'mice_id.df'
for (set.c in ProMetIS::proteo_sets.vc()) {

  eset <- proteo.mset[[set.c]]
  fdata.df <- Biobase::fData(eset)
  is_liver.l <- set.c == "proteomics_liver"

  # changing variable IDs to names: '<accession field 2>_<short description>'
  access_id.vc <- vapply(fdata.df[, "accession"],
                         function(access.c)
                           unlist(strsplit(access.c, split = "|",
                                           fixed = TRUE))[2],
                         FUN.VALUE = character(1),
                         USE.NAMES = FALSE)
  short_desc.vc <- vapply(fdata.df[, "description"],
                          function(desc.c) {
                            # a missing or empty description contributes an
                            # empty string (the closure formerly returned
                            # NULL, which 'paste0' turned into the literal
                            # text "NULL" in the feature name)
                            if (is.na(desc.c) || desc.c == "")
                              return("")
                            # keeping the text before the first '  OS=' tag,
                            # shortened to 24 characters + '.' when longer
                            # than 25 characters
                            desc.c <- unlist(strsplit(desc.c, split = "  OS="))[1]
                            if (nchar(desc.c) > 25)
                              desc.c <- paste0(substr(desc.c, 1, 24), ".")
                            desc.c
                          },
                          FUN.VALUE = character(1),
                          USE.NAMES = FALSE)
  feat_names.vc <- paste0(access_id.vc, "_", short_desc.vc)
  stopifnot(!any(duplicated(feat_names.vc)))
  Biobase::featureNames(eset) <- feat_names.vc

  if (is_liver.l) {
    # discarding pools
    eset <- eset[, !grepl("(p|P)ool", Biobase::pData(eset)[, "sample name"])]
    # re-ordering samples by the number stored in the 'sample name' column
    eset <- eset[, order(as.integer(Biobase::pData(eset)[, "sample name"]))]
  } else {
    # sample names are reduced to their second '_'-separated field, which is
    # then used as the sample number for re-ordering
    Biobase::sampleNames(eset) <- vapply(Biobase::sampleNames(eset),
                                         function(samp.c)
                                           unlist(strsplit(samp.c, split = "_"))[2],
                                         FUN.VALUE = character(1),
                                         USE.NAMES = FALSE)
    eset <- eset[, order(as.integer(Biobase::sampleNames(eset)))]
  }

  # renaming samples: one-letter code of the condition followed by the sample
  # number; the condition labels differ between the liver and the other sets
  cond_map.vc <- if (is_liver.l) {
    c(mx2 = "X", wt = "W", lat = "L")
  } else {
    c(Mx2 = "X", CONT = "W", LAT = "L")
  }
  cond.vc <- as.character(Biobase::pData(eset)[, "Condition"])
  cond_code.vc <- unname(cond_map.vc[cond.vc])
  # an unknown (or missing) condition is reported explicitly instead of
  # silently producing a malformed sample name
  if (anyNA(cond_code.vc))
    stop("Unknown 'Condition' value(s) in '", set.c, "': ",
         paste(unique(cond.vc[is.na(cond_code.vc)]), collapse = ", "),
         call. = FALSE)
  samp_num.vc <- if (is_liver.l) {
    Biobase::pData(eset)[, "sample name"]
  } else {
    Biobase::sampleNames(eset)
  }
  Biobase::sampleNames(eset) <- paste0(cond_code.vc, samp_num.vc)

  # matching with the reference mice: the first 4 characters of the row names
  # of 'mice_id.df' must be identical (values and order) to the sample names
  mice_konum.vc <- substr(rownames(mice_id.df), 1, 4)
  mice_koid.vc <- rownames(mice_id.df)
  if (set.c == "proteomics_plasma") {
    # plasma: restricting the reference mice to those present in the set
    stopifnot(all(Biobase::sampleNames(eset) %in% mice_konum.vc))
    mice_koid.vc <- mice_koid.vc[mice_konum.vc %in% Biobase::sampleNames(eset)]
    mice_konum.vc <- mice_konum.vc[mice_konum.vc %in% Biobase::sampleNames(eset)]
  }

  stopifnot(identical(Biobase::sampleNames(eset),
                      mice_konum.vc))

  # final sample names: full row names of 'mice_id.df'
  Biobase::sampleNames(eset) <- mice_koid.vc
  stopifnot(methods::validObject(eset))

  # (over)writing the entry of this set in the MultiDataSet
  proteo.mset <- MultiDataSet::add_eset(proteo.mset,
                                        eset,
                                        dataset.type = set.c,
                                        GRanges = NA,
                                        overwrite = TRUE,
                                        warnings = FALSE)
}

Adding meta data (gene, mouse_id, sex)

# Prepend the columns of 'mice_id.df' to the sample metadata of each
# proteomics set
for (set.c in ProMetIS::proteo_sets.vc()) {

  eset <- proteo.mset[[set.c]]

  pdata.df <- Biobase::pData(eset)
  # columns of the current sample metadata that are not already provided by
  # 'mice_id.df'
  extra_cols.vc <- setdiff(Biobase::varLabels(eset), colnames(mice_id.df))

  # 'drop = FALSE' keeps both parts as data frames even when a single column
  # is selected; otherwise the column collapses to a vector and loses its
  # name in the combined table
  Biobase::pData(eset) <- cbind.data.frame(mice_id.df[Biobase::sampleNames(eset), ,
                                                      drop = FALSE],
                                           pdata.df[, extra_cols.vc,
                                                    drop = FALSE],
                                           stringsAsFactors = FALSE)
  stopifnot(methods::validObject(eset))

  # (over)writing the entry of this set in the MultiDataSet
  proteo.mset <- MultiDataSet::add_eset(proteo.mset,
                                        eset,
                                        dataset.type = set.c,
                                        GRanges = NA,
                                        overwrite = TRUE,
                                        warnings = FALSE)

}

Variable metadata: including new mice ids in the column names (count, abundance, etc.)

# Rewrite the column names of the variable metadata so that they carry the
# new sample names
for (set.c in ProMetIS::proteo_sets.vc()) {

  eset <- proteo.mset[[set.c]]

  new_names.vc <- Biobase::sampleNames(eset)
  samp_meta.df <- Biobase::pData(eset)
  labels.vc <- Biobase::fvarLabels(eset)

  # samples are processed one after the other, each iteration working on the
  # labels already modified by the previous ones
  for (k in seq_along(new_names.vc)) {

    # former sample identifier: 'id' value without the 'abundance_' text
    old_id.c <- gsub("abundance_", "", samp_meta.df[k, "id"])

    # labels containing this identifier after at least one other character
    hits.vi <- grep(paste0(".+", old_id.c), labels.vc)

    # text to be replaced by the new sample name: the literal 'mgf' for the
    # liver set, the former identifier itself for the plasma set
    target.c <- switch(set.c,
                       proteomics_liver = "mgf",
                       proteomics_plasma = old_id.c)

    labels.vc[hits.vi] <- gsub(target.c, new_names.vc[k], labels.vc[hits.vi])
  }

  Biobase::fvarLabels(eset) <- labels.vc

  stopifnot(methods::validObject(eset))

  # (over)writing the entry of this set in the MultiDataSet
  proteo.mset <- MultiDataSet::add_eset(
    proteo.mset,
    eset,
    dataset.type = set.c,
    GRanges = NA,
    overwrite = TRUE,
    warnings = FALSE
  )

}

Re-ordering metadata ('supp_' columns at the end)

# Internal ProMetIS helper applied to the whole proteomics MultiDataSet for
# the '2_post_processed' step; presumably selects and re-orders the metadata
# columns as announced by the section title - confirm against the helper
proteo.mset <- ProMetIS:::metadata_select(proteo.mset, step.c = "2_post_processed")

Saving

# Write each post-processed proteomics set into its own sub-directory
for (set.c in ProMetIS::proteo_sets.vc()) {

  eset <- proteo.mset[[set.c]]

  # an 'id' extra column has been automatically created in the sampleMetadata by
  # 'MultiDataSet' and must be removed before saving
  pdata.df <- Biobase::pData(eset)
  pdata.df[, "id"] <- NULL
  Biobase::pData(eset) <- pdata.df

  # output directory: the post-processed directory in which the data
  # directory part is replaced by the relative path
  # '../../ProMetIS/inst/extdata'.
  # 'fixed = TRUE' because the pattern is a file path, not a regular
  # expression: characters such as '(', '+' or '[' in the path would
  # otherwise prevent the match and leave the path unchanged
  phenomis::writing(eset,
                    file.path(gsub(ProMetIS::data_dir.c(),
                                   "../../ProMetIS/inst/extdata",
                                   ProMetIS::post_processed_dir.c(),
                                   fixed = TRUE),
                              set.c),
                    overwrite.l = TRUE)
}

Metabolomics

Loading

# Read the processed directory as a single multi-dataset object, restricted
# to the metabolomics sets and without producing a report
metabo.mset <- phenomis::reading(ProMetIS::processed_dir.c(),
                                 subsets.vc = ProMetIS::metabo_sets.vc(),
                                 output.c = "set",
                                 report.c = "none")

Normalization of the dataMatrix

The workflow consists of the following steps:

| step | liver_c18hypersil_pos | liver_hilic_neg | plasma_c18hypersil_pos | plasma_hilic_neg | plasma_c18acquity_pos | plasma_c18acquity_neg |
|-----------|-----------|-----------|-----------|-----------|-----------|-----------|
| 0.4 \<= RT \<= 22 | | | | | X | X |
| blank/samp \<= 0.33 | X | X | X | X | X | X |
| poolDil: cor >= 0.7 | X | X | X | X | | |
| drift loess, span=1 | none | none | pool | pool | sample | sample |
| NA \<= 20% & var > 0 | X | X | X | X | X | X |
| pool CV \<= 0.3 | X | X | X | X | X | X |
| poolCV/sampCV \<= 1 | X | X | X | X | X | X |
| corr, dRT, dmz | X | X | X | X | X | X |

# Internal ProMetIS helper applied to the whole metabolomics MultiDataSet;
# presumably implements the steps of the table above, with the per-set drift
# correction selected by the "prometis" option - confirm against the helper
metabo.mset <- ProMetIS:::.metabo_postprocessing(metabo.mset = metabo.mset,
                                                 drift_correct.c = "prometis")

Formatting and ordering sample names

# Internal ProMetIS helper receiving the metabolomics MultiDataSet and the
# reference table of mice; presumably renames and orders the samples
# according to 'mice_id.df', as done above for proteomics - confirm
metabo.mset <- ProMetIS:::.format_metabonames(metabo.mset = metabo.mset,
                                              mice_id.df = mice_id.df)

Log2 transformation

# No transformation method is specified: the call relies on the default of
# 'phenomis::transforming', presumably log2 as stated by the section title -
# confirm against the phenomis documentation
metabo.mset <- phenomis::transforming(metabo.mset, report.c = "none")

Re-ordering metadata ('supp_' columns at the end)

# Internal ProMetIS helper applied to the whole metabolomics MultiDataSet for
# the '2_post_processed' step; presumably selects and re-orders the metadata
# columns as announced by the section title - confirm against the helper
metabo.mset <- ProMetIS:::metadata_select(metabo.mset,
                                          step.c = "2_post_processed")

Saving

# Write each post-processed metabolomics set into its own sub-directory
for (set.c in ProMetIS::metabo_sets.vc()) {

  eset <- metabo.mset[[set.c]]

  # an 'id' extra column has been automatically created in the sampleMetadata by
  # 'MultiDataSet' and must be removed before saving
  pdata.df <- Biobase::pData(eset)
  pdata.df[, "id"] <- NULL
  Biobase::pData(eset) <- pdata.df

  # output directory: the post-processed directory in which the data
  # directory part is replaced by the relative path
  # '../../ProMetIS/inst/extdata'.
  # 'fixed = TRUE' because the pattern is a file path, not a regular
  # expression: characters such as '(', '+' or '[' in the path would
  # otherwise prevent the match and leave the path unchanged
  phenomis::writing(eset,
                    file.path(gsub(ProMetIS::data_dir.c(),
                                   "../../ProMetIS/inst/extdata",
                                   ProMetIS::post_processed_dir.c(),
                                   fixed = TRUE),
                              set.c),
                    overwrite.l = TRUE)

}


IFB-ElixirFr/ProMetIS documentation built on May 21, 2024, 8:02 p.m.