# Global knitr chunk options used for every chunk in this document.
knitr::opts_chunk$set(
  out.width = "100%", # figure width in the rendered output
  message = TRUE,     # keep messages in the output
  warning = FALSE,    # leave warnings out of the output
  comment = "#>",     # prefix put in front of printed results
  collapse = TRUE     # show source and results in a single block
)

Data preparation

After the raw data processing, peak tables for positive and negative mode have been generated.

Next, we need to get the peak table and sample information and organize them as mass_dataset class objects.

library(tidymass)
library(tidyverse)

Positive mode

Load object.

# load() restores the saved objects into the current environment; the next
# line relies on the file providing one called `object`.
load(file = "mzxml_ms1_data/POS/Result/object")
object_pos <- object

# Auto-print the positive-mode object.
object_pos

Read sample information.

# Sample information table for the positive-mode samples.
sample_info_pos <-
  readr::read_csv(file = "sample_info/sample_info_pos.csv")

# Preview the first rows.
sample_info_pos %>%
  head()

Add sample_info_pos to object_pos

# Sample information stored in the object before the update.
object_pos %>%
  extract_sample_info() %>%
  head()

# Remove the existing group/class/injection.order columns from the sample
# information (presumably because sample_info_pos supplies them -- confirm
# against the CSV).
object_pos <-
  object_pos %>%
  activate_mass_dataset(what = "sample_info") %>%
  dplyr::select(-c("group", "class", "injection.order"))

# Join the external sample information onto the object's sample information,
# matching rows on sample_id.
object_pos <-
  object_pos %>%
  activate_mass_dataset(what = "sample_info") %>%
  left_join(sample_info_pos,
            by = "sample_id")

# Sample information after the update.
object_pos %>%
  extract_sample_info() %>%
  head()

Save object_pos in a new folder named data_cleaning.

# Create the output folder (parents included, silent if it already exists)
# and store the updated object in it.
dir.create(
  path = "data_cleaning/POS",
  recursive = TRUE,
  showWarnings = FALSE
)
save(object_pos, file = "data_cleaning/POS/object_pos")

# Auto-print the object and its dimensions.
object_pos
dim(object_pos)

# Number of samples per class.
dplyr::count(activate_mass_dataset(object_pos, what = "sample_info"), class)

# Number of samples per group.
dplyr::count(activate_mass_dataset(object_pos, what = "sample_info"), group)

# Number of samples per batch.
dplyr::count(activate_mass_dataset(object_pos, what = "sample_info"), batch)

So for positive mode, we have 259 samples and 10,149 variables. 220 subject samples and 39 QC samples. 110 control samples and 110 case samples. Two batches in total, 112 samples in batch 1 and 147 in batch 2.

Next, we can get the peak distribution plot of positive mode.

# Add 1 to the object and take the base-10 logarithm before plotting, then
# narrow the point size range.
show_mz_rt_plot(log(object_pos + 1, 10)) +
  scale_size_continuous(range = c(0.01, 2))

We can explore the missing values (mvs) in positive mode data.

# Total number of missing values in the positive-mode object.
get_mv_number(object = object_pos)

785,821 mvs in total.

# Missing value counts per sample (first rows only).
head(get_mv_number(object = object_pos, by = "sample"))

Missing value number in each sample.

# Missing value counts per variable (first rows only).
head(get_mv_number(object = object_pos, by = "variable"))

Missing value number in each variable.

We can use the figure to show the missing value information.

# Plot the missing values of the positive-mode object as percentages,
# without column names.
show_missing_values(
  object = object_pos,
  percentage = TRUE,
  show_column_names = FALSE
)

Show the mvs in samples.

# Missing values per sample, shown as percentages.
show_sample_missing_values(
  object = object_pos,
  percentage = TRUE
)

Show the mvs in variables.

# Missing values per variable, shown as percentages; x-axis text and ticks
# are switched off and the point size range is narrowed.
show_variable_missing_values(object = object_pos,
                             percentage = TRUE,
                             show_x_text = FALSE,
                             show_x_ticks = FALSE) +
  scale_size_continuous(range = c(0.01, 1))

Negative mode

Load object.

# load() restores the saved objects into the current environment; the next
# line relies on the file providing one called `object`.
load(file = "mzxml_ms1_data/NEG/Result/object")
object_neg <- object

# Auto-print the negative-mode object.
object_neg

Read sample information.

# Sample information table for the negative-mode samples.
sample_info_neg <- readr::read_csv(file = "sample_info/sample_info_neg.csv")

# Preview the first rows.
sample_info_neg %>%
  head()

Add sample_info_neg to object_neg

# Sample information stored in the object before the update.
head(extract_sample_info(object_neg))

# Remove the existing group/class/injection.order columns from the sample
# information (presumably because sample_info_neg supplies them -- confirm
# against the CSV).
object_neg <- object_neg %>%
  activate_mass_dataset(what = "sample_info") %>%
  dplyr::select(-c("group", "class", "injection.order"))

# Join the external sample information onto the object's sample information,
# matching rows on sample_id.
object_neg <- object_neg %>%
  activate_mass_dataset(what = "sample_info") %>%
  left_join(sample_info_neg, by = "sample_id")

# Sample information after the update.
head(extract_sample_info(object_neg))

Save object_neg in a new folder named data_cleaning.

# Create the output folder (parents included, silent if it already exists)
# and store the updated object in it.
dir.create(
  path = "data_cleaning/NEG",
  recursive = TRUE,
  showWarnings = FALSE
)
save(object_neg, file = "data_cleaning/NEG/object_neg")

# Auto-print the object and its dimensions.
object_neg
dim(object_neg)

# Number of samples per class.
dplyr::count(activate_mass_dataset(object_neg, what = "sample_info"), class)

# Number of samples per group.
dplyr::count(activate_mass_dataset(object_neg, what = "sample_info"), group)

# Number of samples per batch.
dplyr::count(activate_mass_dataset(object_neg, what = "sample_info"), batch)

So for negative mode, we have 259 samples and 8,804 variables. 220 subject samples and 39 QC samples. 110 control samples and 110 case samples. Two batches in total, 112 samples in batch 1 and 147 in batch 2.

Next, we can get the peak distribution plot of negative mode.

# Add 1 to the object and take the base-10 logarithm before plotting, then
# narrow the point size range.
show_mz_rt_plot(log(object_neg + 1, 10)) +
  scale_size_continuous(range = c(0.01, 2))

We can explore the missing values (mvs) in negative mode data.

# Total number of missing values in the negative-mode object.
get_mv_number(object = object_neg)

748,253 mvs in total.

# Missing value counts per sample (first rows only).
head(get_mv_number(object = object_neg, by = "sample"))

Missing value number in each sample.

# Missing value counts per variable (first rows only).
head(get_mv_number(object = object_neg, by = "variable"))

Missing value number in each variable.

We can use the figure to show the missing value information.

# Plot the missing values of the negative-mode object as percentages,
# without column names.
show_missing_values(
  object = object_neg,
  percentage = TRUE,
  show_column_names = FALSE
)

Show the mvs in samples.

# Missing values per sample, shown as percentages.
show_sample_missing_values(
  object = object_neg,
  percentage = TRUE
)

Show the mvs in variables.

# Missing values per variable, shown as percentages; x-axis text and ticks
# are switched off and the point size range is narrowed.
show_variable_missing_values(
  object = object_neg,
  percentage = TRUE,
  show_x_text = FALSE,
  show_x_ticks = FALSE
) +
  scale_size_continuous(range = c(0.01, 1))

Conclusion

From this exploration, we have a brief summary of our data. Next, we will use the masscleaner package to clean the data.

Session information

# Record the R version and the attached/loaded packages used for this document.
sessionInfo()


tidymass/tidymass documentation built on April 5, 2022, 2:10 a.m.