knitr::opts_chunk$set(
  collapse = TRUE,
  comment = "#>",
  warning = FALSE,
  message = TRUE,
  out.width = "100%"
)

Data preparation

We just use the dataset which are from this step.

library(tidymass)
load("data_cleaning/POS/object_pos")
load("data_cleaning/NEG/object_neg")

Change batch to character.

object_pos <- 
  object_pos %>% 
  activate_mass_dataset(what = "sample_info") %>% 
  dplyr::mutate(batch = as.character(batch))

object_neg <- 
  object_neg %>% 
  activate_mass_dataset(what = "sample_info") %>% 
  dplyr::mutate(batch = as.character(batch))
object_pos
object_neg

Data quality assessment before data cleaning

Here, we can use the massqc package to assess the data quality.

We can just use the massqc_report() function to get a html format quality assessment report.

massqc::massqc_report(object = object_pos,
                      path = "data_cleaning/POS/data_quality_before_data_cleaning")

A html format report and pdf figures will be placed in the folder data_cleaning/POS/data_quality_before_data_cleaning/Report.

The html report is below:

Negative mode.

massqc::massqc_report(object = object_neg, 
                      path = "data_cleaning/NEG/data_quality_before_data_cleaning")

The PCA score plot is used to show the batch effect of positive and negative dataset.

Positive mode:

Negative mode:

We can see that no matter in positive and negative mode, batch effect is serious.

Remove noisy metabolic features

Remove variables which have MVs in more than 20% QC samples and in at lest 50% samples in control group or case group.

Positive mode

object_pos %>% 
  activate_mass_dataset(what = "sample_info") %>% 
  dplyr::count(group)

MV percentage in QC samples.

show_variable_missing_values(object = object_pos %>% 
                               activate_mass_dataset(what = "sample_info") %>% 
                               filter(class == "QC"), 
                             percentage = TRUE) +
  scale_size_continuous(range = c(0.01, 2))
qc_id =
  object_pos %>%
  activate_mass_dataset(what = "sample_info") %>%
  filter(class == "QC") %>%
  pull(sample_id)

control_id =
  object_pos %>%
  activate_mass_dataset(what = "sample_info") %>%
  filter(group == "Control") %>%
  pull(sample_id)

case_id =
  object_pos %>%
  activate_mass_dataset(what = "sample_info") %>%
  filter(group == "Case") %>%
  pull(sample_id)

object_pos =
  object_pos %>%
  mutate_variable_na_freq(according_to_samples = qc_id) %>%
  mutate_variable_na_freq(according_to_samples = control_id) %>%
  mutate_variable_na_freq(according_to_samples = case_id)

head(extract_variable_info(object_pos))

Remove variables.

object_pos <- 
  object_pos %>% 
  activate_mass_dataset(what = "variable_info") %>%
  filter(na_freq < 0.2 & (na_freq.1 < 0.5 | na_freq.2 < 0.5))
object_pos

Only 5,101 variables left.

Negative mode

object_neg %>% 
  activate_mass_dataset(what = "sample_info") %>% 
  dplyr::count(group)

MV percentage in QC samples.

show_variable_missing_values(object = object_neg %>% 
                               activate_mass_dataset(what = "sample_info") %>% 
                               filter(class == "QC"), 
                             percentage = TRUE) +
  scale_size_continuous(range = c(0.01, 2))
qc_id =
  object_neg %>%
  activate_mass_dataset(what = "sample_info") %>%
  filter(class == "QC") %>%
  pull(sample_id)

control_id =
  object_neg %>%
  activate_mass_dataset(what = "sample_info") %>%
  filter(group == "Control") %>%
  pull(sample_id)

case_id =
  object_neg %>%
  activate_mass_dataset(what = "sample_info") %>%
  filter(group == "Case") %>%
  pull(sample_id)

object_neg =
  object_neg %>%
  mutate_variable_na_freq(according_to_samples = qc_id) %>%
  mutate_variable_na_freq(according_to_samples = control_id) %>%
  mutate_variable_na_freq(according_to_samples = case_id)

head(extract_variable_info(object_neg))

Remove variables.

object_neg <- 
  object_neg %>% 
  activate_mass_dataset(what = "variable_info") %>%
  filter(na_freq < 0.2 & (na_freq.1 < 0.5 | na_freq.2 < 0.5))
object_neg

4104 features left.

Filter outlier samples

We can use the detect_outlier() to find the outlier samples.

More information about how to detect outlier samples can be found here.

Positive mode

massdataset::show_sample_missing_values(object = object_pos,
                                        color_by = "group",
                                        order_by = "injection.order",
                                        percentage = TRUE) +
  theme(axis.text.x = element_text(size = 2)) +
  scale_size_continuous(range = c(0.1, 2)) +
  ggsci::scale_color_aaas()

Detect outlier samples.

outlier_samples =
  object_pos %>%
  `+`(1) %>% 
  log(2) %>%
  scale() %>%
  detect_outlier()

outlier_samples
outlier_table <-
extract_outlier_table(outlier_samples)

outlier_table %>% 
  head()

outlier_table %>% 
  apply(1, function(x){
    sum(x)
  }) %>% 
  `>`(0) %>% 
  which()

No outlier samples in positive mode.

Negative mode

massdataset::show_sample_missing_values(object = object_neg,
                                        color_by = "group",
                                        order_by = "injection.order",
                                        percentage = TRUE) +
  theme(axis.text.x = element_text(size = 2)) +
  scale_size_continuous(range = c(0.1, 2)) +
  ggsci::scale_color_aaas()

Detect outlier samples.

outlier_samples =
  object_neg %>%
  `+`(1) %>% 
  log(2) %>%
  scale() %>%
  detect_outlier()

outlier_samples
outlier_table <-
extract_outlier_table(outlier_samples)

outlier_table %>% 
  head()

outlier_table %>% 
  apply(1, function(x){
    sum(x)
  }) %>% 
  `>`(0) %>% 
  which()

No outlier samples in negative mode.

Missing value imputation

get_mv_number(object_pos)
object_pos <- 
  impute_mv(object = object_pos, method = "knn")

get_mv_number(object_pos)
get_mv_number(object_neg)

object_neg <- 
  impute_mv(object = object_neg, method = "knn")

get_mv_number(object_neg)

Data normalization and integration

Positive mode

object_pos <- 
  normalize_data(object_pos, method = "median")

object_pos2 <- 
  integrate_data(object_pos, method = "subject_median")
object_pos2 %>% 
  `+`(1) %>% 
  log(2) %>% 
  massqc::massqc_pca(color_by = "batch", line = FALSE)

Negative mode

object_neg <- 
  normalize_data(object_neg, method = "median")

object_neg2 <- 
  integrate_data(object_neg, method = "subject_median")
object_neg2 %>% 
  `+`(1) %>% 
  log(2) %>% 
  massqc::massqc_pca(color_by = "batch", line = FALSE)

Save data for subsequent analysis.

save(object_pos2, file = "data_cleaning/POS/object_pos2")
save(object_neg2, file = "data_cleaning/NEG/object_neg2")

Session information

sessionInfo()


tidymass/tidymass documentation built on April 5, 2022, 2:10 a.m.