knitr::opts_chunk$set( collapse = TRUE, comment = "#>", warning = FALSE, message = TRUE, out.width = "100%" )
We just use the dataset which are from this step.
library(tidymass) load("data_cleaning/POS/object_pos") load("data_cleaning/NEG/object_neg")
Change batch to character.
object_pos <- object_pos %>% activate_mass_dataset(what = "sample_info") %>% dplyr::mutate(batch = as.character(batch)) object_neg <- object_neg %>% activate_mass_dataset(what = "sample_info") %>% dplyr::mutate(batch = as.character(batch))
object_pos
object_neg
Here, we can use the massqc
package to assess the data quality.
We can just use the massqc_report()
function to get a html format quality assessment report.
massqc::massqc_report(object = object_pos, path = "data_cleaning/POS/data_quality_before_data_cleaning")
A html format report and pdf figures will be placed in the folder data_cleaning/POS/data_quality_before_data_cleaning/Report
.
The html report is below:
Negative mode.
massqc::massqc_report(object = object_neg, path = "data_cleaning/NEG/data_quality_before_data_cleaning")
The PCA score plot is used to show the batch effect of positive and negative dataset.
Positive mode:
Negative mode:
We can see that no matter in positive and negative mode, batch effect is serious.
Remove variables which have MVs in more than 20% QC samples and in at lest 50% samples in control group or case group.
object_pos %>% activate_mass_dataset(what = "sample_info") %>% dplyr::count(group)
MV percentage in QC samples.
show_variable_missing_values(object = object_pos %>% activate_mass_dataset(what = "sample_info") %>% filter(class == "QC"), percentage = TRUE) + scale_size_continuous(range = c(0.01, 2))
qc_id = object_pos %>% activate_mass_dataset(what = "sample_info") %>% filter(class == "QC") %>% pull(sample_id) control_id = object_pos %>% activate_mass_dataset(what = "sample_info") %>% filter(group == "Control") %>% pull(sample_id) case_id = object_pos %>% activate_mass_dataset(what = "sample_info") %>% filter(group == "Case") %>% pull(sample_id) object_pos = object_pos %>% mutate_variable_na_freq(according_to_samples = qc_id) %>% mutate_variable_na_freq(according_to_samples = control_id) %>% mutate_variable_na_freq(according_to_samples = case_id) head(extract_variable_info(object_pos))
Remove variables.
object_pos <- object_pos %>% activate_mass_dataset(what = "variable_info") %>% filter(na_freq < 0.2 & (na_freq.1 < 0.5 | na_freq.2 < 0.5)) object_pos
Only 5,101 variables left.
object_neg %>% activate_mass_dataset(what = "sample_info") %>% dplyr::count(group)
MV percentage in QC samples.
show_variable_missing_values(object = object_neg %>% activate_mass_dataset(what = "sample_info") %>% filter(class == "QC"), percentage = TRUE) + scale_size_continuous(range = c(0.01, 2))
qc_id = object_neg %>% activate_mass_dataset(what = "sample_info") %>% filter(class == "QC") %>% pull(sample_id) control_id = object_neg %>% activate_mass_dataset(what = "sample_info") %>% filter(group == "Control") %>% pull(sample_id) case_id = object_neg %>% activate_mass_dataset(what = "sample_info") %>% filter(group == "Case") %>% pull(sample_id) object_neg = object_neg %>% mutate_variable_na_freq(according_to_samples = qc_id) %>% mutate_variable_na_freq(according_to_samples = control_id) %>% mutate_variable_na_freq(according_to_samples = case_id) head(extract_variable_info(object_neg))
Remove variables.
object_neg <- object_neg %>% activate_mass_dataset(what = "variable_info") %>% filter(na_freq < 0.2 & (na_freq.1 < 0.5 | na_freq.2 < 0.5)) object_neg
4104 features left.
We can use the detect_outlier()
to find the outlier samples.
More information about how to detect outlier samples can be found here.
massdataset::show_sample_missing_values(object = object_pos, color_by = "group", order_by = "injection.order", percentage = TRUE) + theme(axis.text.x = element_text(size = 2)) + scale_size_continuous(range = c(0.1, 2)) + ggsci::scale_color_aaas()
Detect outlier samples.
outlier_samples = object_pos %>% `+`(1) %>% log(2) %>% scale() %>% detect_outlier() outlier_samples
outlier_table <- extract_outlier_table(outlier_samples) outlier_table %>% head() outlier_table %>% apply(1, function(x){ sum(x) }) %>% `>`(0) %>% which()
No outlier samples in positive mode.
massdataset::show_sample_missing_values(object = object_neg, color_by = "group", order_by = "injection.order", percentage = TRUE) + theme(axis.text.x = element_text(size = 2)) + scale_size_continuous(range = c(0.1, 2)) + ggsci::scale_color_aaas()
Detect outlier samples.
outlier_samples = object_neg %>% `+`(1) %>% log(2) %>% scale() %>% detect_outlier() outlier_samples
outlier_table <- extract_outlier_table(outlier_samples) outlier_table %>% head() outlier_table %>% apply(1, function(x){ sum(x) }) %>% `>`(0) %>% which()
No outlier samples in negative mode.
get_mv_number(object_pos) object_pos <- impute_mv(object = object_pos, method = "knn") get_mv_number(object_pos)
get_mv_number(object_neg) object_neg <- impute_mv(object = object_neg, method = "knn") get_mv_number(object_neg)
object_pos <- normalize_data(object_pos, method = "median") object_pos2 <- integrate_data(object_pos, method = "subject_median")
object_pos2 %>% `+`(1) %>% log(2) %>% massqc::massqc_pca(color_by = "batch", line = FALSE)
object_neg <- normalize_data(object_neg, method = "median") object_neg2 <- integrate_data(object_neg, method = "subject_median")
object_neg2 %>% `+`(1) %>% log(2) %>% massqc::massqc_pca(color_by = "batch", line = FALSE)
Save data for subsequent analysis.
save(object_pos2, file = "data_cleaning/POS/object_pos2") save(object_neg2, file = "data_cleaning/NEG/object_neg2")
sessionInfo()
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.