knitr::opts_chunk$set(fig.width = 8,
                      fig.height = 8,
                      fig.path = 'figures/temp/1_post_processing/',
                      message = FALSE,
                      warning = FALSE)

Mice IDs

mice_id.df <- read.table(file.path(LATMX2::processed_dir.c(),
                                   "mice_id.tsv"),
                         row.names = 1,
                         header = TRUE, sep = "\t")
mice_id.vc <- rownames(mice_id.df)
mice_num.vc <- substr(mice_id.vc, 2, 4)
stopifnot(identical(as.integer(mice_num.vc),
                    sort(as.integer(mice_num.vc))))

Clinics

Loading

clinic_setname.c <- "clinics"

# table from PHENOMIN (47 * 1098)
clinic_raw.df <- read.table(file.path(LATMX2::processed_dir.c(),
                                      clinic_setname.c,
                                      "phenomin.tsv"),
                            check.names = FALSE, comment.char = "",
                            header = TRUE, quote = "", sep = "\t")
clinic_proc_names.vc <- colnames(clinic_raw.df)
colnames(clinic_raw.df) <- make.names(colnames(clinic_raw.df), unique = TRUE)

# checking sample order
stopifnot(identical(clinic_raw.df[, "mouse_nb"],
                    sort(clinic_raw.df[, "mouse_nb"])))

sampleMetadata

# restricting to the 42 samples analyzed in ProMetIS

clinic_raw.df <- clinic_raw.df[as.character(clinic_raw.df[, "mouse_nb"]) %in% mice_num.vc, ]
stopifnot(identical(as.character(clinic_raw.df[, "mouse_nb"]), mice_num.vc))
rownames(clinic_raw.df) <- mice_id.vc

# sampleMetadata (42 * 1098)

clinic_pda.df <- clinic_raw.df

# labeling the 'supplementary' information (to be kept at the end of the
# sampleMetadata and variableMetadata tables)

supp.vi <- which(!(colnames(clinic_pda.df) %in% colnames(mice_id.df)))
stopifnot(length(supp.vi) == 1094)
colnames(clinic_pda.df)[supp.vi] <- paste0("supp_",
                                           colnames(clinic_pda.df)[supp.vi])

dataMatrix

# initial dataMatrix (42 * 1091)

variable.vi <-  which(!(colnames(clinic_raw.df) %in% c(colnames(mice_id.df),
                                                       c("mouse_id",
                                                         "genotype",
                                                         "project"))))
stopifnot(length(variable.vi) == 1091)
clinic_dat.df <- clinic_raw.df[, variable.vi]
dat_proc_names.vc <- clinic_proc_names.vc[variable.vi]

rm(clinic_raw.df)
rm(clinic_proc_names.vc)

## discarding variables which cannot be converted to numerics
### removing 1 variable with all values identical except 1 (for "W627m")
### note that all 'Test.de.Tolérance.au.Glucose' tests have a 'NA' value for this mouse only

discard_imagerie.vl <- grepl("Imagerie.du.squelette.par.rayons.X.Digits.integrity",
                             colnames(clinic_dat.df))
stopifnot(sum(discard_imagerie.vl) == 1)

### removing date of sacrifice
discard_sacrifice.vl <- grepl("Date.of.sacrifice", colnames(clinic_dat.df))
stopifnot(sum(discard_sacrifice.vl) == 2)

### removing 471 columns with NA or "" values only
discard_navoid.vl <- apply(clinic_dat.df, 2,
                           function(y) {
                             unique.vc <- unique(as.character(y))
                             unique.vc <- unique.vc[!is.na(unique.vc) & (unique.vc != "")]
                             length(unique.vc) < 2
                           })
stopifnot(sum(discard_navoid.vl) == 478)

discard_feat.vl <- discard_imagerie.vl |
  discard_sacrifice.vl |
  discard_navoid.vl
stopifnot(sum(discard_feat.vl) == 481)

clinic_dat.df <- clinic_dat.df[, !discard_feat.vl]
dat_proc_names.vc <- dat_proc_names.vc[!discard_feat.vl]

## keeping numerics and factors with nlevels > 2

clinic_class.vc <- sapply(clinic_dat.df, data.class)
stopifnot(identical(unique(clinic_class.vc), c("numeric", "factor")))

## selecting numerics (678)
keep_numeric.vl <- clinic_class.vc == "numeric"
stopifnot(sum(keep_numeric.vl) == 571)

## selecting factors with 2 levels (45)
keep_factor2.vl <- logical(ncol(clinic_dat.df))
for (j in which(clinic_class.vc == "factor")) {
  clinic_dat.f <- clinic_dat.df[, j]
  if (nlevels(clinic_dat.f) == 2) {
    clinic_dat.df[, j] <- as.integer(clinic_dat.df[, j])
    keep_factor2.vl[j] <- TRUE
  }
}
stopifnot(sum(keep_factor2.vl) == 1)

## aggregating selections (725)
keep_aggreg.vl <- keep_numeric.vl |
  keep_factor2.vl
stopifnot(sum(keep_aggreg.vl) == 572)
clinic_dat.mn <- as.matrix(clinic_dat.df[, keep_aggreg.vl])
dat_proc_names.vc <- dat_proc_names.vc[keep_aggreg.vl]

## removing variables with > 20% NA (including 100% NA in females)
## or with variance < 1e-5
keep_notna_zerovar.vl <- ProMetIA:::.filter_na_zerovar(clinic_dat.mn,
                                                       na_thresh.n = 0.2,
                                                       var_thresh.n = 1e-5)

stopifnot(sum(keep_notna_zerovar.vl) == 213)
clinic_dat.mn <- clinic_dat.mn[, keep_notna_zerovar.vl]
dat_proc_names.vc <- dat_proc_names.vc[keep_notna_zerovar.vl]

stopifnot(identical(dim(clinic_dat.mn), as.integer(c(42, 213))))

variableMetadata

# variableMetadata (238 * 2)
category.vc <- sapply(colnames(clinic_dat.mn),
                      function(name.c)
                        unlist(strsplit(name.c,
                                        split = ".",
                                        fixed = TRUE))[1])
category.vc <- gsub("Auditory", "Auditory and PPI",
                    gsub("Clinical", "Clinical chemistry",
                         gsub("Grip", "Grip test",
                              gsub("Hot", "Hot plate",
                                   gsub("Modified", "SHIRPA",
                                        gsub("Open", "Open field",
                                             gsub("Pavlovian", "Pavlovian fear cond.",
                                                  gsub("Shock", "Shock threshold",
                                                       gsub("Test", "Glucose tolerance test",
                                                            gsub("Ultrafocus", "Body composition",
                                                                 category.vc))))))))))
full.vc <- sapply(colnames(clinic_dat.mn),
                  function(name.c) {
                    name.vc <- unlist(strsplit(name.c,
                                               split = ".",
                                               fixed = TRUE))
                    if (length(name.vc) == 1) {
                      return("")
                    } else {
                      name.vc <- name.vc[-1]
                      name.vc <- name.vc[name.vc != ""]
                      return(paste(name.vc, collapse = "_"))
                    }
                  })
clinic_fda.df <- data.frame(row.names = colnames(clinic_dat.mn),
                            initial_names = dat_proc_names.vc,
                            category = category.vc,
                            full_info = full.vc,
                            stringsAsFactors = FALSE)
stopifnot(identical(dim(clinic_fda.df), as.integer(c(213, 3))))

ExpressionSet

# converting to ExpressionSet
clinic.eset <- Biobase::ExpressionSet(assayData = t(clinic_dat.mn),
                                      phenoData = Biobase::AnnotatedDataFrame(data = clinic_pda.df),
                                      featureData = Biobase::AnnotatedDataFrame(data = clinic_fda.df),
                                      experimentData = Biobase::MIAME(title = clinic_setname.c))
stopifnot(methods::validObject(clinic.eset))

print(clinic.eset)

Saving (not run)

phenomis::writing(clinic.eset,
                  file.path("../../LATMX2/inst/extdata/2_post_processed",
                            clinic_setname.c),
                  overwrite.l = TRUE)

Proteomics

Set names and input files

proteo_files.vc <- vapply(LATMX2::proteo_sets.vc(),
                          function(set.c) {
                            file.c <- list.files(file.path(LATMX2::processed_dir.c(), set.c),
                                                 pattern = ".xlsx", full.names = TRUE)
                            stopifnot(length(file.c) == 1)
                            file.c
                          }, FUN.VALUE = character(1))

proteo.mset <- MultiDataSet::createMultiDataSet()

MultiDataSet containing the data matrices

for (set.c in LATMX2::proteo_sets.vc()) {

  # dataMatrix
  data.df <- as.data.frame(readxl::read_excel(proteo_files.vc[set.c],
                                              sheet = 1),
                           stringsAsFactors = FALSE)
  rownames(data.df) <- data.df[, 1]
  data.df[, 1] <- NULL
  data.mn <- as.matrix(data.df)
  rm(data.df)
  mode(data.mn) <- "numeric"

  eset <- Biobase::ExpressionSet(assayData = data.mn,
                                 experimentData = Biobase::MIAME(title = set.c))
  stopifnot(methods::validObject(eset))

  proteo.mset <- MultiDataSet::add_eset(proteo.mset,
                                        eset,
                                        dataset.type = set.c,
                                        GRanges = NA,
                                        overwrite = TRUE,
                                        warnings = FALSE)
}

sampleMetadata

for (set.c in LATMX2::proteo_sets.vc()) {

  eset <- proteo.mset[[set.c]]

  proteo_pda.df <- as.data.frame(readxl::read_excel(proteo_files.vc[set.c],
                                                    sheet = 2),
                                 stringsAsFactors = FALSE)
  rownames(proteo_pda.df) <- gsub(".", "_",
                                  proteo_pda.df[, 1], fixed = TRUE)
  sample_names.vc <- Biobase::sampleNames(eset)
  if (set.c == "proteomics_liver") {
    stopifnot(identical(sort(rownames(proteo_pda.df)),
                        sort(sample_names.vc)))
    proteo_pda.df <- proteo_pda.df[sample_names.vc, ]
  }
  stopifnot(identical(rownames(proteo_pda.df),
                      sample_names.vc))

  colnames(proteo_pda.df) <- paste0("supp_", colnames(proteo_pda.df))

  Biobase::pData(eset) <- proteo_pda.df
  stopifnot(methods::validObject(eset))

  proteo.mset <- MultiDataSet::add_eset(proteo.mset,
                                        eset,
                                        dataset.type = set.c,
                                        GRanges = NA,
                                        overwrite = TRUE,
                                        warnings = FALSE)
}

variableMetadata

for (set.c in LATMX2::proteo_sets.vc()) {

  eset <- proteo.mset[[set.c]]

  proteo_fda.df <- as.data.frame(readxl::read_excel(proteo_files.vc[set.c],
                                                    sheet = 3),
                                 stringsAsFactors = FALSE)
  rownames(proteo_fda.df) <- proteo_fda.df[, 1]
  stopifnot(identical(rownames(proteo_fda.df),
                      Biobase::featureNames(eset)))

  proteo_fda.df[, "uniprot_id"] <- sapply(proteo_fda.df[, "accession"],
                                          function(access.c)
                                            unlist(strsplit(access.c,
                                                            split = "|",
                                                            fixed = TRUE))[2],
                                          USE.NAMES = FALSE)

  supp.vi <- which(!(colnames(proteo_fda.df) %in% c("accession",
                                                    "description",
                                                    "uniprot_id")))
  colnames(proteo_fda.df)[supp.vi] <- paste0("supp_",
                                             colnames(proteo_fda.df)[supp.vi])

  Biobase::fData(eset) <- proteo_fda.df
  stopifnot(methods::validObject(eset))

  proteo.mset <- MultiDataSet::add_eset(proteo.mset,
                                        eset,
                                        dataset.type = set.c,
                                        GRanges = NA,
                                        overwrite = TRUE,
                                        warnings = FALSE)
}

Renaming features and re-ordering samples

for (set.c in LATMX2::proteo_sets.vc()) {

  eset <- proteo.mset[[set.c]]

  # changing variable IDs to names
  feat_names.vc <- paste0(sapply(Biobase::fData(eset)[, "accession"],
                                 function(access.c)
                                   unlist(strsplit(access.c, split = "|",
                                                   fixed = TRUE))[2]),
                          "_",
                          sapply(Biobase::fData(eset)[, "description"],
                                 function(desc.c) {
                                   if (!is.na(desc.c) && desc.c != "") {
                                     desc.c <- unlist(strsplit(desc.c, split = "  OS="))[1]
                                     if (nchar(desc.c) > 25)
                                       desc.c <- paste0(substr(desc.c, 1, 24), ".")
                                     return(desc.c)
                                   }
                                 }))
  stopifnot(!any(duplicated(feat_names.vc)))
  Biobase::featureNames(eset) <- feat_names.vc

  # discarding pools
  if (set.c == "proteomics_liver")
    eset <- eset[, !grepl("(p|P)ool", Biobase::pData(eset)[, "supp_sample name"])]

  # re-ordering samples
  if (set.c == "proteomics_liver") {
    eset <- eset[, order(as.integer(Biobase::pData(eset)[, "supp_sample name"]))]
  } else {
    Biobase::sampleNames(eset) <- sapply(Biobase::sampleNames(eset),
                                                 function(samp.c) unlist(strsplit(samp.c, split = "_"))[2], USE.NAMES = FALSE)
    eset <- eset[, order(as.integer(Biobase::sampleNames(eset)))]
  }

  # renaming samples
  if (set.c == "proteomics_liver") {
    Biobase::sampleNames(eset) <- paste0(sapply(Biobase::pData(eset)[, "supp_Condition"],
                                                        function(cond.c)
                                                          switch(cond.c,
                                                                 mx2 = "X",
                                                                 ctrl = "W",
                                                                 lat = "L")),
                                                 Biobase::pData(eset)[, "supp_sample name"])
  } else {
    Biobase::sampleNames(eset) <- paste0(sapply(Biobase::pData(eset)[, "supp_Condition"],
                                                        function(cond.c)
                                                          switch(cond.c,
                                                                 Mx2 = "X",
                                                                 CONT = "W",
                                                                 LAT = "L")),
                                                 Biobase::sampleNames(eset))
  }

  mice_konum.vc <- substr(mice_id.vc, 1, 4)
  mice_koid.vc <- mice_id.vc
  if (set.c == "proteomics_plasma") {
    stopifnot(all(Biobase::sampleNames(eset) %in% mice_konum.vc))
    mice_koid.vc <- mice_koid.vc[mice_konum.vc %in% Biobase::sampleNames(eset)]
    mice_konum.vc <- mice_konum.vc[mice_konum.vc %in% Biobase::sampleNames(eset)]
  }

  stopifnot(identical(Biobase::sampleNames(eset),
                      mice_konum.vc))

  Biobase::sampleNames(eset) <- mice_koid.vc
  stopifnot(methods::validObject(eset))

  proteo.mset <- MultiDataSet::add_eset(proteo.mset,
                                        eset,
                                        dataset.type = set.c,
                                        GRanges = NA,
                                        overwrite = TRUE,
                                        warnings = FALSE)
}

Adding meta data (gene, mouse_nb, sex)

for (set.c in LATMX2::proteo_sets.vc()) {

  eset <- proteo.mset[[set.c]]

  Biobase::pData(eset) <- cbind.data.frame(mice_id.df[Biobase::sampleNames(eset), ],
                                           Biobase::pData(eset),
                                           stringsAsFactors = FALSE)
  stopifnot(methods::validObject(eset))

  proteo.mset <- MultiDataSet::add_eset(proteo.mset,
                                        eset,
                                        dataset.type = set.c,
                                        GRanges = NA,
                                        overwrite = TRUE,
                                        warnings = FALSE)

}

formatting the 'imputation' metrics

  for (set.c in LATMX2::proteo_sets.vc()) {

    eset <- proteo.mset[[set.c]]

    eset <- ProMetIA::format_imputation(eset)
    stopifnot(methods::validObject(eset))

    proteo.mset <- MultiDataSet::add_eset(proteo.mset,
                                          eset,
                                          dataset.type = set.c,
                                          GRanges = NA,
                                          overwrite = TRUE,
                                          warnings = FALSE)

  }

Variable metadata: including new mice ids in the column names (count, abundance, etc.)

for (set.c in LATMX2::proteo_sets.vc()) {

  eset <- proteo.mset[[set.c]]

  samp.vc <- Biobase::sampleNames(eset)
  pdata.df <- Biobase::pData(eset)
  fvar.vc <- Biobase::fvarLabels(eset)

  for (i in seq_along(samp.vc)) {
    samp.c <- samp.vc[i]
    samp_id.c <- gsub("abundance_", "", pdata.df[i, "id"])
    fvar_id.vi <- grep(paste0("^supp_.+", samp_id.c), fvar.vc)
    fvar.vc[fvar_id.vi] <- gsub(switch(set.c,
                                       proteomics_liver = "mgf",
                                       proteomics_plasma = samp_id.c),
                                samp.c, fvar.vc[fvar_id.vi])
  }

  Biobase::fvarLabels(eset) <- fvar.vc

  stopifnot(methods::validObject(eset))

  proteo.mset <- MultiDataSet::add_eset(proteo.mset,
                                        eset,
                                        dataset.type = set.c,
                                        GRanges = NA,
                                        overwrite = TRUE,
                                        warnings = FALSE)

}

Re-ordering metadata ('supp_' columns at the end)

proteo.mset <- ProMetIA::ordermeta_mset(proteo.mset)

Saving (not run)

for (set.c in LATMX2::proteo_sets.vc()) {

  eset <- proteo.mset[[set.c]]

  # an 'id' extra column has been automatically created in the sampleMetadata by
  # 'MultiDataSet' and must be removed before saving
  pdata.df <- Biobase::pData(eset)
  pdata.df[, "id"] <- NULL
  Biobase::pData(eset) <- pdata.df

  phenomis::writing(eset,
                  file.path("../../LATMX2/inst/extdata/2_post_processed",
                            set.c),
                  overwrite.l = TRUE)
}

Metabolomics (in progress)

Loading

metabo.mset <- phenomis::reading(file.path(LATMX2::processed_dir.c()),
                                 subsets.vc = LATMX2::metabo_sets.vc(),
                                 report.c = "none")

Normalization of the dataMatrix

for (set.c in LATMX2::metabo_sets.vc()) {

  eset <- metabo.mset[[set.c]]

  if (grepl("(hyper|hilic)", set.c)) {

    if (grepl("c18hyper", set.c)) {

      # setting NA values to 0
      exprs.mn <- Biobase::exprs(eset)
      exprs.mn[is.na(exprs.mn)] <- ifelse(set.c == "metabolomics_liver_c18hyper-pos", 0, 1)
      Biobase::exprs(eset) <- exprs.mn

    }

    # computing quality metrics
    eset <- phenomis::inspecting(eset, title.c = set.c)

    # discarding features with mean intensity in samples / mean intensity in pools < 3
    eset <- eset[Biobase::fData(eset)[, "blankMean_over_sampleMean"] <= 0.33, ]

    # discarding features with a Pearson correlation with pool dilution < 0.7
    eset <- eset[Biobase::fData(eset)[, "poolDil_cor"] >= 0.7, ]


  } else if (grepl("acqui", set.c)) {

    # discarding features eluted before 0.4 min or after 22 min
    eset <- eset[Biobase::fData(eset)[, "rt"] >= 0.4 & Biobase::fData(eset)[, "rt"] <= 22, ]

    # discarding features with mean intensity in samples / mean intensity in pools < 2 (or 4)
    fold.i <- ifelse(set.c == "metabolomics_plasma_c18acqui-pos", 2, 4)

    eset <- eset[Biobase::fData(eset)[, "fold"] >= fold.i, ]

    eset <- eset[Biobase::fData(eset)[, "tstat"] <= 0, ]

  } else
    stop("Unknown metabolomics dataset name.")

  # discarding blanks and diluted pools (QCs)
  eset <- eset[, Biobase::pData(eset)[, "sampleType"] %in% c("sample", "pool")]

  # signal drift correction
  if (set.c == "metabolomics_liver_c18hyper-pos") {
    reference.c <- "none"
  } else if (set.c == "metabolomics_plasma_c18hyper-pos") {
    reference.c <- "pool"
  } else if (set.c == "metabolomics_liver_hilic-neg") {
    reference.c <- "none"
  } else if (set.c == "metabolomics_plasma_hilic-neg") {
    reference.c <- "none"
  } else if (set.c == "metabolomics_plasma_c18acqui-pos") {
    reference.c <- "sample"
  } else if (set.c == "metabolomics_plasma_c18acqui-neg") {
    reference.c <- "pool"
  } else
    stop()

  if (reference.c != "none")
    eset <- phenomis::correcting(eset, reference.c = reference.c, title.c = set.c)

  # updating quality metrics
  eset <- phenomis::inspecting(eset, title.c = set.c)

  # discarding features with a coefficient of variation in pools > 30% 
  eset <- eset[Biobase::fData(eset)[, "pool_CV"] <= 0.3, ]

  if (grepl("acqui", set.c)) {

    # discarding features with a ratio 'coefficient of variation in samples / coefficient of variation in pools' < 0.8
    eset <- eset[Biobase::fData(eset)[, "poolCV_over_sampleCV"] <= 1.25, ]

  }

  # discarding 'pool' samples (i.e. QCs)
  eset <- eset[, Biobase::pData(eset)[, "sampleType"] != "pool"]

  if (grepl("acqui", set.c)) {

    # discarding the few features containing NA (generated after the drift correction)
    eset <- eset[apply(Biobase::exprs(eset), 1, function(feat.vn) !any(is.na(feat.vn))), ]

    # discarding features highly correlated to others from the same pcgroup with higher mean intensity
    eset <- ProMetIA::filter_pcgroup(eset)

    if (set.c == "metabolomics_plasma_c18acqui-pos") {

      # discarding features with 90% quantile < 5000
      eset <- eset[apply(Biobase::exprs(eset), 1,
                         function(feat.vn) quantile(feat.vn, 0.9)) >= 5000, ]

    }

  }

  stopifnot(methods::validObject(eset))

  metabo.mset <- MultiDataSet::add_eset(metabo.mset,
                                        eset,
                                        dataset.type = set.c,
                                        GRanges = NA,
                                        overwrite = TRUE,
                                        warnings = FALSE)

}

Formatting and ordering sample names

metabo.mset <- ProMetIA::format_metabonames(metabo.mset,
                                            mice_id.df = mice_id.df,
                                            mice_id.vc = mice_id.vc,
                                            mice_num.vc = mice_num.vc)

Log2 transformation

metabo.mset <- phenomis::transforming(metabo.mset)

Re-ordering metadata ('supp_' columns at the end)

metabo.mset <- ProMetIA::ordermeta_mset(metabo.mset)

Saving (not run)

for (set.c in LATMX2::metabo_sets.vc()) {

  eset <- metabo.mset[[set.c]]

  # an 'id' extra column has been automatically created in the sampleMetadata by
  # 'MultiDataSet' and must be removed before saving
  pdata.df <- Biobase::pData(eset)
  pdata.df[, "id"] <- NULL
  Biobase::pData(eset) <- pdata.df

  phenomis::writing(eset,
                  file.path("../../LATMX2/inst/extdata/2_post_processed",
                            set.c),
                  overwrite.l = TRUE)

}


ProMetIS/ProMetIA documentation built on March 6, 2020, 2:11 a.m.