knitr::opts_chunk$set(fig.width = 8, fig.height = 8, fig.path = 'figures/temp/1_post_processing/', message = FALSE, warning = FALSE)
mice_id.df <- read.table(file.path(LATMX2::processed_dir.c(), "mice_id.tsv"), row.names = 1, header = TRUE, sep = "\t") mice_id.vc <- rownames(mice_id.df) mice_num.vc <- substr(mice_id.vc, 2, 4) stopifnot(identical(as.integer(mice_num.vc), sort(as.integer(mice_num.vc))))
clinic_setname.c <- "clinics" # table from PHENOMIN (47 * 1098) clinic_raw.df <- read.table(file.path(LATMX2::processed_dir.c(), clinic_setname.c, "phenomin.tsv"), check.names = FALSE, comment.char = "", header = TRUE, quote = "", sep = "\t") clinic_proc_names.vc <- colnames(clinic_raw.df) colnames(clinic_raw.df) <- make.names(colnames(clinic_raw.df), unique = TRUE) # checking sample order stopifnot(identical(clinic_raw.df[, "mouse_nb"], sort(clinic_raw.df[, "mouse_nb"])))
# restricting to the 42 samples analyzed in ProMetIS clinic_raw.df <- clinic_raw.df[as.character(clinic_raw.df[, "mouse_nb"]) %in% mice_num.vc, ] stopifnot(identical(as.character(clinic_raw.df[, "mouse_nb"]), mice_num.vc)) rownames(clinic_raw.df) <- mice_id.vc # sampleMetadata (42 * 1098) clinic_pda.df <- clinic_raw.df # labeling the 'supplementary' information (to be kept at the end of the # sampleMetadata and variableMetadata tables) supp.vi <- which(!(colnames(clinic_pda.df) %in% colnames(mice_id.df))) stopifnot(length(supp.vi) == 1094) colnames(clinic_pda.df)[supp.vi] <- paste0("supp_", colnames(clinic_pda.df)[supp.vi])
# initial dataMatrix (42 * 1091) variable.vi <- which(!(colnames(clinic_raw.df) %in% c(colnames(mice_id.df), c("mouse_id", "genotype", "project")))) stopifnot(length(variable.vi) == 1091) clinic_dat.df <- clinic_raw.df[, variable.vi] dat_proc_names.vc <- clinic_proc_names.vc[variable.vi] rm(clinic_raw.df) rm(clinic_proc_names.vc) ## discarding variables which cannot be converted to numerics ### removing 1 variable with all values identical except 1 (for "W627m") ### note that all 'Test.de.Tolérance.au.Glucose' tests have a 'NA' value for this mouse only discard_imagerie.vl <- grepl("Imagerie.du.squelette.par.rayons.X.Digits.integrity", colnames(clinic_dat.df)) stopifnot(sum(discard_imagerie.vl) == 1) ### removing date of sacrifice discard_sacrifice.vl <- grepl("Date.of.sacrifice", colnames(clinic_dat.df)) stopifnot(sum(discard_sacrifice.vl) == 2) ### removing 471 columns with NA or "" values only discard_navoid.vl <- apply(clinic_dat.df, 2, function(y) { unique.vc <- unique(as.character(y)) unique.vc <- unique.vc[!is.na(unique.vc) & (unique.vc != "")] length(unique.vc) < 2 }) stopifnot(sum(discard_navoid.vl) == 478) discard_feat.vl <- discard_imagerie.vl | discard_sacrifice.vl | discard_navoid.vl stopifnot(sum(discard_feat.vl) == 481) clinic_dat.df <- clinic_dat.df[, !discard_feat.vl] dat_proc_names.vc <- dat_proc_names.vc[!discard_feat.vl] ## keeping numerics and factors with nlevels > 2 clinic_class.vc <- sapply(clinic_dat.df, data.class) stopifnot(identical(unique(clinic_class.vc), c("numeric", "factor"))) ## selecting numerics (678) keep_numeric.vl <- clinic_class.vc == "numeric" stopifnot(sum(keep_numeric.vl) == 571) ## selecting factors with 2 levels (45) keep_factor2.vl <- logical(ncol(clinic_dat.df)) for (j in which(clinic_class.vc == "factor")) { clinic_dat.f <- clinic_dat.df[, j] if (nlevels(clinic_dat.f) == 2) { clinic_dat.df[, j] <- as.integer(clinic_dat.df[, j]) keep_factor2.vl[j] <- TRUE } } stopifnot(sum(keep_factor2.vl) == 1) ## aggregating selections (725) keep_aggreg.vl <- keep_numeric.vl | keep_factor2.vl stopifnot(sum(keep_aggreg.vl) == 572) clinic_dat.mn <- as.matrix(clinic_dat.df[, keep_aggreg.vl]) dat_proc_names.vc <- dat_proc_names.vc[keep_aggreg.vl] ## removing variables with > 20% NA (including 100% NA in females) ## or with variance < 1e-5 keep_notna_zerovar.vl <- ProMetIA:::.filter_na_zerovar(clinic_dat.mn, na_thresh.n = 0.2, var_thresh.n = 1e-5) stopifnot(sum(keep_notna_zerovar.vl) == 213) clinic_dat.mn <- clinic_dat.mn[, keep_notna_zerovar.vl] dat_proc_names.vc <- dat_proc_names.vc[keep_notna_zerovar.vl] stopifnot(identical(dim(clinic_dat.mn), as.integer(c(42, 213))))
# variableMetadata (238 * 2) category.vc <- sapply(colnames(clinic_dat.mn), function(name.c) unlist(strsplit(name.c, split = ".", fixed = TRUE))[1]) category.vc <- gsub("Auditory", "Auditory and PPI", gsub("Clinical", "Clinical chemistry", gsub("Grip", "Grip test", gsub("Hot", "Hot plate", gsub("Modified", "SHIRPA", gsub("Open", "Open field", gsub("Pavlovian", "Pavlovian fear cond.", gsub("Shock", "Shock threshold", gsub("Test", "Glucose tolerance test", gsub("Ultrafocus", "Body composition", category.vc)))))))))) full.vc <- sapply(colnames(clinic_dat.mn), function(name.c) { name.vc <- unlist(strsplit(name.c, split = ".", fixed = TRUE)) if (length(name.vc) == 1) { return("") } else { name.vc <- name.vc[-1] name.vc <- name.vc[name.vc != ""] return(paste(name.vc, collapse = "_")) } }) clinic_fda.df <- data.frame(row.names = colnames(clinic_dat.mn), initial_names = dat_proc_names.vc, category = category.vc, full_info = full.vc, stringsAsFactors = FALSE) stopifnot(identical(dim(clinic_fda.df), as.integer(c(213, 3))))
# converting to ExpressionSet clinic.eset <- Biobase::ExpressionSet(assayData = t(clinic_dat.mn), phenoData = Biobase::AnnotatedDataFrame(data = clinic_pda.df), featureData = Biobase::AnnotatedDataFrame(data = clinic_fda.df), experimentData = Biobase::MIAME(title = clinic_setname.c)) stopifnot(methods::validObject(clinic.eset)) print(clinic.eset)
phenomis::writing(clinic.eset, file.path("../../LATMX2/inst/extdata/2_post_processed", clinic_setname.c), overwrite.l = TRUE)
proteo_files.vc <- vapply(LATMX2::proteo_sets.vc(), function(set.c) { file.c <- list.files(file.path(LATMX2::processed_dir.c(), set.c), pattern = ".xlsx", full.names = TRUE) stopifnot(length(file.c) == 1) file.c }, FUN.VALUE = character(1)) proteo.mset <- MultiDataSet::createMultiDataSet()
for (set.c in LATMX2::proteo_sets.vc()) { # dataMatrix data.df <- as.data.frame(readxl::read_excel(proteo_files.vc[set.c], sheet = 1), stringsAsFactors = FALSE) rownames(data.df) <- data.df[, 1] data.df[, 1] <- NULL data.mn <- as.matrix(data.df) rm(data.df) mode(data.mn) <- "numeric" eset <- Biobase::ExpressionSet(assayData = data.mn, experimentData = Biobase::MIAME(title = set.c)) stopifnot(methods::validObject(eset)) proteo.mset <- MultiDataSet::add_eset(proteo.mset, eset, dataset.type = set.c, GRanges = NA, overwrite = TRUE, warnings = FALSE) }
for (set.c in LATMX2::proteo_sets.vc()) { eset <- proteo.mset[[set.c]] proteo_pda.df <- as.data.frame(readxl::read_excel(proteo_files.vc[set.c], sheet = 2), stringsAsFactors = FALSE) rownames(proteo_pda.df) <- gsub(".", "_", proteo_pda.df[, 1], fixed = TRUE) sample_names.vc <- Biobase::sampleNames(eset) if (set.c == "proteomics_liver") { stopifnot(identical(sort(rownames(proteo_pda.df)), sort(sample_names.vc))) proteo_pda.df <- proteo_pda.df[sample_names.vc, ] } stopifnot(identical(rownames(proteo_pda.df), sample_names.vc)) colnames(proteo_pda.df) <- paste0("supp_", colnames(proteo_pda.df)) Biobase::pData(eset) <- proteo_pda.df stopifnot(methods::validObject(eset)) proteo.mset <- MultiDataSet::add_eset(proteo.mset, eset, dataset.type = set.c, GRanges = NA, overwrite = TRUE, warnings = FALSE) }
for (set.c in LATMX2::proteo_sets.vc()) { eset <- proteo.mset[[set.c]] proteo_fda.df <- as.data.frame(readxl::read_excel(proteo_files.vc[set.c], sheet = 3), stringsAsFactors = FALSE) rownames(proteo_fda.df) <- proteo_fda.df[, 1] stopifnot(identical(rownames(proteo_fda.df), Biobase::featureNames(eset))) proteo_fda.df[, "uniprot_id"] <- sapply(proteo_fda.df[, "accession"], function(access.c) unlist(strsplit(access.c, split = "|", fixed = TRUE))[2], USE.NAMES = FALSE) supp.vi <- which(!(colnames(proteo_fda.df) %in% c("accession", "description", "uniprot_id"))) colnames(proteo_fda.df)[supp.vi] <- paste0("supp_", colnames(proteo_fda.df)[supp.vi]) Biobase::fData(eset) <- proteo_fda.df stopifnot(methods::validObject(eset)) proteo.mset <- MultiDataSet::add_eset(proteo.mset, eset, dataset.type = set.c, GRanges = NA, overwrite = TRUE, warnings = FALSE) }
for (set.c in LATMX2::proteo_sets.vc()) { eset <- proteo.mset[[set.c]] # changing variable IDs to names feat_names.vc <- paste0(sapply(Biobase::fData(eset)[, "accession"], function(access.c) unlist(strsplit(access.c, split = "|", fixed = TRUE))[2]), "_", sapply(Biobase::fData(eset)[, "description"], function(desc.c) { if (!is.na(desc.c) && desc.c != "") { desc.c <- unlist(strsplit(desc.c, split = " OS="))[1] if (nchar(desc.c) > 25) desc.c <- paste0(substr(desc.c, 1, 24), ".") return(desc.c) } })) stopifnot(!any(duplicated(feat_names.vc))) Biobase::featureNames(eset) <- feat_names.vc # discarding pools if (set.c == "proteomics_liver") eset <- eset[, !grepl("(p|P)ool", Biobase::pData(eset)[, "supp_sample name"])] # re-ordering samples if (set.c == "proteomics_liver") { eset <- eset[, order(as.integer(Biobase::pData(eset)[, "supp_sample name"]))] } else { Biobase::sampleNames(eset) <- sapply(Biobase::sampleNames(eset), function(samp.c) unlist(strsplit(samp.c, split = "_"))[2], USE.NAMES = FALSE) eset <- eset[, order(as.integer(Biobase::sampleNames(eset)))] } # renaming samples if (set.c == "proteomics_liver") { Biobase::sampleNames(eset) <- paste0(sapply(Biobase::pData(eset)[, "supp_Condition"], function(cond.c) switch(cond.c, mx2 = "X", ctrl = "W", lat = "L")), Biobase::pData(eset)[, "supp_sample name"]) } else { Biobase::sampleNames(eset) <- paste0(sapply(Biobase::pData(eset)[, "supp_Condition"], function(cond.c) switch(cond.c, Mx2 = "X", CONT = "W", LAT = "L")), Biobase::sampleNames(eset)) } mice_konum.vc <- substr(mice_id.vc, 1, 4) mice_koid.vc <- mice_id.vc if (set.c == "proteomics_plasma") { stopifnot(all(Biobase::sampleNames(eset) %in% mice_konum.vc)) mice_koid.vc <- mice_koid.vc[mice_konum.vc %in% Biobase::sampleNames(eset)] mice_konum.vc <- mice_konum.vc[mice_konum.vc %in% Biobase::sampleNames(eset)] } stopifnot(identical(Biobase::sampleNames(eset), mice_konum.vc)) Biobase::sampleNames(eset) <- mice_koid.vc stopifnot(methods::validObject(eset)) proteo.mset <- MultiDataSet::add_eset(proteo.mset, eset, dataset.type = set.c, GRanges = NA, overwrite = TRUE, warnings = FALSE) }
for (set.c in LATMX2::proteo_sets.vc()) { eset <- proteo.mset[[set.c]] Biobase::pData(eset) <- cbind.data.frame(mice_id.df[Biobase::sampleNames(eset), ], Biobase::pData(eset), stringsAsFactors = FALSE) stopifnot(methods::validObject(eset)) proteo.mset <- MultiDataSet::add_eset(proteo.mset, eset, dataset.type = set.c, GRanges = NA, overwrite = TRUE, warnings = FALSE) }
for (set.c in LATMX2::proteo_sets.vc()) { eset <- proteo.mset[[set.c]] eset <- ProMetIA::format_imputation(eset) stopifnot(methods::validObject(eset)) proteo.mset <- MultiDataSet::add_eset(proteo.mset, eset, dataset.type = set.c, GRanges = NA, overwrite = TRUE, warnings = FALSE) }
for (set.c in LATMX2::proteo_sets.vc()) { eset <- proteo.mset[[set.c]] samp.vc <- Biobase::sampleNames(eset) pdata.df <- Biobase::pData(eset) fvar.vc <- Biobase::fvarLabels(eset) for (i in seq_along(samp.vc)) { samp.c <- samp.vc[i] samp_id.c <- gsub("abundance_", "", pdata.df[i, "id"]) fvar_id.vi <- grep(paste0("^supp_.+", samp_id.c), fvar.vc) fvar.vc[fvar_id.vi] <- gsub(switch(set.c, proteomics_liver = "mgf", proteomics_plasma = samp_id.c), samp.c, fvar.vc[fvar_id.vi]) } Biobase::fvarLabels(eset) <- fvar.vc stopifnot(methods::validObject(eset)) proteo.mset <- MultiDataSet::add_eset(proteo.mset, eset, dataset.type = set.c, GRanges = NA, overwrite = TRUE, warnings = FALSE) }
proteo.mset <- ProMetIA::ordermeta_mset(proteo.mset)
for (set.c in LATMX2::proteo_sets.vc()) { eset <- proteo.mset[[set.c]] # an 'id' extra column has been automatically created in the sampleMetadata by # 'MultiDataSet' and must be removed before saving pdata.df <- Biobase::pData(eset) pdata.df[, "id"] <- NULL Biobase::pData(eset) <- pdata.df phenomis::writing(eset, file.path("../../LATMX2/inst/extdata/2_post_processed", set.c), overwrite.l = TRUE) }
metabo.mset <- phenomis::reading(file.path(LATMX2::processed_dir.c()), subsets.vc = LATMX2::metabo_sets.vc(), report.c = "none")
for (set.c in LATMX2::metabo_sets.vc()) { eset <- metabo.mset[[set.c]] if (grepl("(hyper|hilic)", set.c)) { if (grepl("c18hyper", set.c)) { # setting NA values to 0 exprs.mn <- Biobase::exprs(eset) exprs.mn[is.na(exprs.mn)] <- ifelse(set.c == "metabolomics_liver_c18hyper-pos", 0, 1) Biobase::exprs(eset) <- exprs.mn } # computing quality metrics eset <- phenomis::inspecting(eset, title.c = set.c) # discarding features with mean intensity in samples / mean intensity in pools < 3 eset <- eset[Biobase::fData(eset)[, "blankMean_over_sampleMean"] <= 0.33, ] # discarding features with a Pearson correlation with pool dilution < 0.7 eset <- eset[Biobase::fData(eset)[, "poolDil_cor"] >= 0.7, ] } else if (grepl("acqui", set.c)) { # discarding features eluted before 0.4 min or after 22 min eset <- eset[Biobase::fData(eset)[, "rt"] >= 0.4 & Biobase::fData(eset)[, "rt"] <= 22, ] # discarding features with mean intensity in samples / mean intensity in pools < 2 (or 4) fold.i <- ifelse(set.c == "metabolomics_plasma_c18acqui-pos", 2, 4) eset <- eset[Biobase::fData(eset)[, "fold"] >= fold.i, ] eset <- eset[Biobase::fData(eset)[, "tstat"] <= 0, ] } else stop("Unknown metabolomics dataset name.") # discarding blanks and diluted pools (QCs) eset <- eset[, Biobase::pData(eset)[, "sampleType"] %in% c("sample", "pool")] # signal drift correction if (set.c == "metabolomics_liver_c18hyper-pos") { reference.c <- "none" } else if (set.c == "metabolomics_plasma_c18hyper-pos") { reference.c <- "pool" } else if (set.c == "metabolomics_liver_hilic-neg") { reference.c <- "none" } else if (set.c == "metabolomics_plasma_hilic-neg") { reference.c <- "none" } else if (set.c == "metabolomics_plasma_c18acqui-pos") { reference.c <- "sample" } else if (set.c == "metabolomics_plasma_c18acqui-neg") { reference.c <- "pool" } else stop() if (reference.c != "none") eset <- phenomis::correcting(eset, reference.c = reference.c, title.c = set.c) # updating quality metrics eset <- phenomis::inspecting(eset, title.c = set.c) # discarding features with a coefficient of variation in pools > 30% eset <- eset[Biobase::fData(eset)[, "pool_CV"] <= 0.3, ] if (grepl("acqui", set.c)) { # discarding features with a ratio 'coefficient of variation in samples / coefficient of variation in pools' < 0.8 eset <- eset[Biobase::fData(eset)[, "poolCV_over_sampleCV"] <= 1.25, ] } # discarding 'pool' samples (i.e. QCs) eset <- eset[, Biobase::pData(eset)[, "sampleType"] != "pool"] if (grepl("acqui", set.c)) { # discarding the few features containing NA (generated after the drift correction) eset <- eset[apply(Biobase::exprs(eset), 1, function(feat.vn) !any(is.na(feat.vn))), ] # discarding features highly correlated to others from the same pcgroup with higher mean intensity eset <- ProMetIA::filter_pcgroup(eset) if (set.c == "metabolomics_plasma_c18acqui-pos") { # discarding features with 90% quantile < 5000 eset <- eset[apply(Biobase::exprs(eset), 1, function(feat.vn) quantile(feat.vn, 0.9)) >= 5000, ] } } stopifnot(methods::validObject(eset)) metabo.mset <- MultiDataSet::add_eset(metabo.mset, eset, dataset.type = set.c, GRanges = NA, overwrite = TRUE, warnings = FALSE) }
metabo.mset <- ProMetIA::format_metabonames(metabo.mset, mice_id.df = mice_id.df, mice_id.vc = mice_id.vc, mice_num.vc = mice_num.vc)
metabo.mset <- phenomis::transforming(metabo.mset)
metabo.mset <- ProMetIA::ordermeta_mset(metabo.mset)
for (set.c in LATMX2::metabo_sets.vc()) { eset <- metabo.mset[[set.c]] # an 'id' extra column has been automatically created in the sampleMetadata by # 'MultiDataSet' and must be removed before saving pdata.df <- Biobase::pData(eset) pdata.df[, "id"] <- NULL Biobase::pData(eset) <- pdata.df phenomis::writing(eset, file.path("../../LATMX2/inst/extdata/2_post_processed", set.c), overwrite.l = TRUE) }
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.