# Global knitr chunk options for this vignette: 8 x 8 figures written under
# 'figures/temp/1_post_processing/', with the 'warning' chunk option disabled.
knitr::opts_chunk$set(
  fig.width = 8,
  fig.height = 8,
  fig.path = "figures/temp/1_post_processing/",
  warning = FALSE
)
This vignette describes the post-processing of the proteomics datasets (mainly re-formatting) and of the metabolomics datasets (normalization, including filtering based on quality metrics and signal drift correction — see the 'Normalization of the dataMatrix' section for details — followed by log2 transformation). The input datasets are read from the '1_processed' repository, while the post-processed datasets are exported into the '2_post_processed' repository.
Note: The MultiDataSet/ExpressionSet Bioconductor framework is used throughout this vignette.
# Read the processed datasets (output.c = "set", no report) and keep the
# sample metadata of the 'preclinical' set as the reference table of mice.
processed.mset <- phenomis::reading(
  ProMetIS::processed_dir.c(),
  output.c = "set",
  report.c = "none"
)

preclin.eset <- processed.mset[["preclinical"]]

# reference sample metadata, without its 'id' column
mice_id.df <- Biobase::pData(preclin.eset)
mice_id.df[, "id"] <- NULL
# Locate the Excel workbook of each proteomics set inside the processed
# directory. Files whose base name contains 'ProMetIS' or 'prometis' are
# skipped. The result is a character vector named by set (vapply() keeps the
# set names because its input is an unnamed character vector).
proteo_files.vc <- vapply(
  ProMetIS::proteo_sets.vc(),
  function(set.c) {
    # 'pattern' is a regular expression: escape the dot and anchor at the end
    # so that only genuine '.xlsx' extensions are matched
    files.vc <- list.files(
      file.path(ProMetIS::processed_dir.c(), set.c),
      pattern = "\\.xlsx$",
      full.names = TRUE
    )
    files.vc <- files.vc[!grepl("(ProMetIS|prometis)", basename(files.vc))]
    # exactly one workbook is expected per set: fail with an explicit message
    # instead of the generic vapply() length error
    if (length(files.vc) != 1) {
      stop("Expected exactly one '.xlsx' file for set '", set.c,
           "' but found ", length(files.vc), ".",
           call. = FALSE)
    }
    files.vc
  },
  FUN.VALUE = character(1)
)

# empty container, filled set by set in the following steps
proteo.mset <- MultiDataSet::createMultiDataSet()
# dataMatrix: build one ExpressionSet per proteomics set from the first sheet
# of its workbook and store it in 'proteo.mset'.
for (set.c in ProMetIS::proteo_sets.vc()) {

  sheet1.df <- as.data.frame(
    readxl::read_excel(proteo_files.vc[set.c], sheet = 1),
    stringsAsFactors = FALSE
  )

  # the first column provides the row names and is then dropped
  rownames(sheet1.df) <- sheet1.df[[1]]
  sheet1.df[[1]] <- NULL

  # numeric matrix of the remaining columns
  data.mn <- as.matrix(sheet1.df)
  rm(sheet1.df)
  mode(data.mn) <- "numeric"

  eset <- Biobase::ExpressionSet(
    assayData = data.mn,
    experimentData = Biobase::MIAME(title = set.c)
  )
  stopifnot(methods::validObject(eset))

  proteo.mset <- MultiDataSet::add_eset(
    proteo.mset,
    eset,
    dataset.type = set.c,
    GRanges = NA,
    overwrite = TRUE,
    warnings = FALSE
  )

}
# sampleMetadata: attach the second sheet of each workbook as the pData of the
# corresponding ExpressionSet.
for (set.c in ProMetIS::proteo_sets.vc()) {

  eset <- proteo.mset[[set.c]]
  sample_names.vc <- Biobase::sampleNames(eset)

  proteo_pda.df <- as.data.frame(
    readxl::read_excel(proteo_files.vc[set.c], sheet = 2),
    stringsAsFactors = FALSE
  )

  # row names come from the first column, with every '.' replaced by '_'
  rownames(proteo_pda.df) <- gsub(".", "_", proteo_pda.df[, 1], fixed = TRUE)

  # for the liver set only, the metadata rows and the samples must be the
  # same set of names (order is not checked here)
  if (set.c == "proteomics_liver") {
    stopifnot(identical(sort(rownames(proteo_pda.df)),
                        sort(sample_names.vc)))
  }

  # align the metadata rows on the sample order of the ExpressionSet
  # (no strict identity check is performed after this re-ordering)
  proteo_pda.df <- proteo_pda.df[sample_names.vc, ]

  Biobase::pData(eset) <- proteo_pda.df
  stopifnot(methods::validObject(eset))

  proteo.mset <- MultiDataSet::add_eset(
    proteo.mset,
    eset,
    dataset.type = set.c,
    GRanges = NA,
    overwrite = TRUE,
    warnings = FALSE
  )

}
# variableMetadata: attach the third sheet of each workbook as the fData of
# the corresponding ExpressionSet, and add a 'uniprot_id' column extracted
# from the 'accession' column.
for (set.c in ProMetIS::proteo_sets.vc()) {

  eset <- proteo.mset[[set.c]]

  proteo_fda.df <- as.data.frame(
    readxl::read_excel(proteo_files.vc[set.c], sheet = 3),
    stringsAsFactors = FALSE
  )

  # the first column must list the features in the same order as the eset
  rownames(proteo_fda.df) <- proteo_fda.df[, 1]
  stopifnot(identical(rownames(proteo_fda.df),
                      Biobase::featureNames(eset)))

  # second '|'-separated field of the accession (NA when there is none);
  # vapply() guarantees a character vector whatever the input
  proteo_fda.df[, "uniprot_id"] <- vapply(
    proteo_fda.df[, "accession"],
    function(access.c) {
      unlist(strsplit(access.c, split = "|", fixed = TRUE))[2]
    },
    FUN.VALUE = character(1),
    USE.NAMES = FALSE
  )

  Biobase::fData(eset) <- proteo_fda.df
  stopifnot(methods::validObject(eset))

  proteo.mset <- MultiDataSet::add_eset(
    proteo.mset,
    eset,
    dataset.type = set.c,
    GRanges = NA,
    overwrite = TRUE,
    warnings = FALSE
  )

}
# Final feature and sample names of each proteomics set:
#  1. features are renamed '<2nd accession field>_<shortened description>';
#  2. pools are discarded (liver set only);
#  3. samples are ordered by their integer number;
#  4. samples are renamed '<X|W|L><number>' according to 'Condition';
#  5. sample names are checked against, then replaced by, the row names of
#     'mice_id.df'.
for (set.c in ProMetIS::proteo_sets.vc()) {

  eset <- proteo.mset[[set.c]]
  is_liver.l <- set.c == "proteomics_liver"

  # changing variable IDs to names
  fdata.df <- Biobase::fData(eset)

  access_id.vc <- vapply(
    fdata.df[, "accession"],
    function(access.c) {
      unlist(strsplit(access.c, split = "|", fixed = TRUE))[2]
    },
    FUN.VALUE = character(1),
    USE.NAMES = FALSE
  )

  desc.vc <- vapply(
    fdata.df[, "description"],
    function(desc.c) {
      if (is.na(desc.c) || desc.c == "") {
        # NOTE(review): the previous sapply()-based code returned NULL here,
        # which paste0() rendered as the literal string "NULL"; the same
        # placeholder is kept so that feature names are unchanged -- confirm
        # whether another placeholder is preferred
        return("NULL")
      }
      # keep the text before ' OS=' and truncate it to 24 characters + '.'
      desc.c <- unlist(strsplit(desc.c, split = " OS="))[1]
      if (nchar(desc.c) > 25) {
        desc.c <- paste0(substr(desc.c, 1, 24), ".")
      }
      desc.c
    },
    FUN.VALUE = character(1),
    USE.NAMES = FALSE
  )

  feat_names.vc <- paste0(access_id.vc, "_", desc.vc)
  stopifnot(!any(duplicated(feat_names.vc)))
  Biobase::featureNames(eset) <- feat_names.vc

  # discarding pools
  if (is_liver.l) {
    eset <- eset[, !grepl("(p|P)ool", Biobase::pData(eset)[, "sample name"])]
  }

  # re-ordering samples
  if (is_liver.l) {
    eset <- eset[, order(as.integer(Biobase::pData(eset)[, "sample name"]))]
  } else {
    # the sample number is the second '_'-separated field of the sample name
    Biobase::sampleNames(eset) <- vapply(
      Biobase::sampleNames(eset),
      function(samp.c) unlist(strsplit(samp.c, split = "_"))[2],
      FUN.VALUE = character(1),
      USE.NAMES = FALSE
    )
    eset <- eset[, order(as.integer(Biobase::sampleNames(eset)))]
  }

  # renaming samples: one-letter prefix from 'Condition' + sample number
  # (the condition labels differ between the liver set and the other sets)
  if (is_liver.l) {
    cond_map.vc <- c(mx2 = "X", wt = "W", lat = "L")
    samp_num.vc <- Biobase::pData(eset)[, "sample name"]
  } else {
    cond_map.vc <- c(Mx2 = "X", CONT = "W", LAT = "L")
    samp_num.vc <- Biobase::sampleNames(eset)
  }
  cond.vc <- as.character(Biobase::pData(eset)[, "Condition"])
  prefix.vc <- unname(cond_map.vc[cond.vc])
  # fail early on an unknown condition instead of building invalid names
  if (anyNA(prefix.vc)) {
    stop("Unexpected 'Condition' value(s) in set '", set.c, "': ",
         paste(unique(cond.vc[is.na(prefix.vc)]), collapse = ", "),
         call. = FALSE)
  }
  Biobase::sampleNames(eset) <- paste0(prefix.vc, samp_num.vc)

  # matching with the reference mice: the first 4 characters of the row names
  # of 'mice_id.df' are compared to the sample names built above
  mice_konum.vc <- substr(rownames(mice_id.df), 1, 4)
  mice_koid.vc <- rownames(mice_id.df)

  if (set.c == "proteomics_plasma") {
    # for the plasma set, the reference is restricted to the samples present
    stopifnot(all(Biobase::sampleNames(eset) %in% mice_konum.vc))
    in_set.vl <- mice_konum.vc %in% Biobase::sampleNames(eset)
    mice_koid.vc <- mice_koid.vc[in_set.vl]
    mice_konum.vc <- mice_konum.vc[in_set.vl]
  }

  stopifnot(identical(Biobase::sampleNames(eset), mice_konum.vc))

  Biobase::sampleNames(eset) <- mice_koid.vc

  stopifnot(methods::validObject(eset))

  proteo.mset <- MultiDataSet::add_eset(
    proteo.mset,
    eset,
    dataset.type = set.c,
    GRanges = NA,
    overwrite = TRUE,
    warnings = FALSE
  )

}
# Prepend the reference mouse metadata ('mice_id.df') to the sample metadata
# of each proteomics set; columns already present in 'mice_id.df' are taken
# from the reference only.
for (set.c in ProMetIS::proteo_sets.vc()) {

  eset <- proteo.mset[[set.c]]

  # sample metadata columns which are not provided by the reference table
  specific.vc <- setdiff(Biobase::varLabels(eset), colnames(mice_id.df))

  # 'drop = FALSE' keeps both pieces as data frames (and hence their column
  # names) even when a single column is selected
  Biobase::pData(eset) <- cbind.data.frame(
    mice_id.df[Biobase::sampleNames(eset), , drop = FALSE],
    Biobase::pData(eset)[, specific.vc, drop = FALSE],
    stringsAsFactors = FALSE
  )

  stopifnot(methods::validObject(eset))

  proteo.mset <- MultiDataSet::add_eset(
    proteo.mset,
    eset,
    dataset.type = set.c,
    GRanges = NA,
    overwrite = TRUE,
    warnings = FALSE
  )

}
# Rename the per-sample columns of the variable metadata so that they carry
# the current sample names. Samples are processed one after the other and each
# step works on the labels as modified by the previous ones.
for (set.c in ProMetIS::proteo_sets.vc()) {

  eset <- proteo.mset[[set.c]]

  samp.vc <- Biobase::sampleNames(eset)
  pdata.df <- Biobase::pData(eset)
  fvar.vc <- Biobase::fvarLabels(eset)

  for (i in seq_along(samp.vc)) {

    # identifier of the i-th sample: its 'id' value without 'abundance_'
    samp_id.c <- gsub("abundance_", "", pdata.df[i, "id"])

    # labels containing this identifier preceded by at least one character
    hit.vi <- grep(paste0(".+", samp_id.c), fvar.vc)

    # text replaced by the sample name: 'mgf' for the liver set, the
    # identifier itself for the plasma set
    old.c <- switch(set.c,
                    proteomics_liver = "mgf",
                    proteomics_plasma = samp_id.c)

    fvar.vc[hit.vi] <- gsub(old.c, samp.vc[i], fvar.vc[hit.vi])

  }

  Biobase::fvarLabels(eset) <- fvar.vc

  stopifnot(methods::validObject(eset))

  proteo.mset <- MultiDataSet::add_eset(
    proteo.mset,
    eset,
    dataset.type = set.c,
    GRanges = NA,
    overwrite = TRUE,
    warnings = FALSE
  )

}
# Select the metadata of the '2_post_processed' step, then write each
# proteomics set to the post-processed directory of the package sources.
proteo.mset <- ProMetIS:::metadata_select(proteo.mset,
                                          step.c = "2_post_processed")

# Output directory: the data directory prefix is replaced by the relative path
# to 'inst/extdata'. 'fixed = TRUE' makes gsub() treat the directory path as
# plain text rather than as a regular expression (paths may contain '.', '(',
# '+', ...). Computed once since it does not depend on the set.
proteo_out_dir.c <- gsub(ProMetIS::data_dir.c(),
                         "../../ProMetIS/inst/extdata",
                         ProMetIS::post_processed_dir.c(),
                         fixed = TRUE)

for (set.c in ProMetIS::proteo_sets.vc()) {

  eset <- proteo.mset[[set.c]]

  # an 'id' extra column has been automatically created in the sampleMetadata
  # by 'MultiDataSet' and must be removed before saving
  pdata.df <- Biobase::pData(eset)
  pdata.df[, "id"] <- NULL
  Biobase::pData(eset) <- pdata.df

  phenomis::writing(eset,
                    file.path(proteo_out_dir.c, set.c),
                    overwrite.l = TRUE)

}
# Read the processed metabolomics sets only (restricted through 'subsets.vc'),
# with output.c = "set" and no report.
metabo.mset <- phenomis::reading(
  ProMetIS::processed_dir.c(),
  subsets.vc = ProMetIS::metabo_sets.vc(),
  output.c = "set",
  report.c = "none"
)
The workflow consists of the following steps:
| step | liver_c18hypersil_pos | liver_hilic_neg | plasma_c18hypersil_pos | plasma_hilic_neg | plasma_c18acquity_pos | plasma_c18acquity_neg |
|-----------|-----------|-----------|-----------|-----------|-----------|-----------|
| 0.4 \<= RT \<= 22 | | | | | X | X |
| blank/samp \<= 0.33 | X | X | X | X | X | X |
| poolDil: cor >= 0.7 | X | X | X | X | | |
| drift loess, span=1 | none | none | pool | pool | sample | sample |
| NA \<= 20% & var > 0 | X | X | X | X | X | X |
| pool CV \<= 0.3 | X | X | X | X | X | X |
| poolCV/sampCV \<= 1 | X | X | X | X | X | X |
| corr, dRT, dmz | X | X | X | X | X | X |
# Metabolomics post-processing, applied in sequence to 'metabo.mset':

# 1. package-internal post-processing with the 'prometis' drift correction
metabo.mset <- ProMetIS:::.metabo_postprocessing(
  metabo.mset = metabo.mset,
  drift_correct.c = "prometis"
)

# 2. formatting of the names, using the reference mice metadata
metabo.mset <- ProMetIS:::.format_metabonames(
  metabo.mset = metabo.mset,
  mice_id.df = mice_id.df
)

# 3. transformation of the intensities with the default settings of
#    phenomis::transforming (log2 according to the vignette introduction)
metabo.mset <- phenomis::transforming(
  metabo.mset,
  report.c = "none"
)

# 4. selection of the metadata of the '2_post_processed' step
metabo.mset <- ProMetIS:::metadata_select(
  metabo.mset,
  step.c = "2_post_processed"
)
# Write each post-processed metabolomics set to the post-processed directory
# of the package sources.

# Output directory: the data directory prefix is replaced by the relative path
# to 'inst/extdata'. 'fixed = TRUE' makes gsub() treat the directory path as
# plain text rather than as a regular expression (paths may contain '.', '(',
# '+', ...). Computed once since it does not depend on the set.
metabo_out_dir.c <- gsub(ProMetIS::data_dir.c(),
                         "../../ProMetIS/inst/extdata",
                         ProMetIS::post_processed_dir.c(),
                         fixed = TRUE)

for (set.c in ProMetIS::metabo_sets.vc()) {

  eset <- metabo.mset[[set.c]]

  # an 'id' extra column has been automatically created in the sampleMetadata
  # by 'MultiDataSet' and must be removed before saving
  pdata.df <- Biobase::pData(eset)
  pdata.df[, "id"] <- NULL
  Biobase::pData(eset) <- pdata.df

  phenomis::writing(eset,
                    file.path(metabo_out_dir.c, set.c),
                    overwrite.l = TRUE)

}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.