knitr::opts_chunk$set(echo = TRUE)

1. Introduction

The batch effect correction analysis module performs an automatic or specific correction on the peak data table generated by peak picking step. Multiple correction methods have been embeded inside. They include combat, WaveICA, EigenMS, QC_RLSC, ANCOVA, RUV_random, RUV_2, RUVseq series (RUV_s, RUV_r and RUV_g), NOMIS, CCMN. The detailed limitation and mathmatic mechanism of different methods have been illustrated in our manuscript. Here, two cases will be provided to show users a step to step workflow for this moduel.

2. Data preparation

The peak table generated by FormatPeakList function needs to be manually prepared by hand to supplements the corresponding information below.

·1st column The samples name;

·2nd column Sample groups/classes information, if there are any QC samples, please mark them as QC;

·3rd column Sample injection order, if no related information, please leave them empty;

·4th column Batch information, please provide consistent characters within the same batch;

·other column Features' intensity should be provided. If any internal standards, please mark the feature with "IS".

·NOTE: Please provide as more information as possible. If some information ommitted, please leave the columns empty.

Here is an example table.
library(knitr)
d<-kable(read.csv("vignette_figures/bc.csv",header = T))
d

3. Case 1 - A simulated sample

3.1 Data Downloading

# Use Google API for data downloading here. 
# Please "install.packages('googledrive')" and "install.packages('httpuv')"first.
library(googledrive);
temp <- tempfile(fileext = ".csv")
# Please authorize your google account to access the data
dl <- drive_download(
  as_id("1vObn7HfgtZ2E6tPOyQyxRLineUZB1zeG"), path = temp, overwrite = TRUE)
cat('File downloaded:
  * BC_example.csv
Saved locally as:
  * /tmp/RtmpIBzO2x/file11c61acc1747.csv')

3.2 Library Package

# Load the MetaboAnalystR package
library("MetaboAnalystR")

3.3 Initializing mSet Object

mSet <- InitDataObjects("pktable", "utils", FALSE)
cat('[1] "MetaboAnalyst R objects initialized ..."')

3.4 Data Reading

## we set samples in "row" according to the table format. If your samples are in column, set it as "col".
mSet <- Read.BatchCSVdata(mSet, dl$local_path, "row")

3.5 Automatic Correction

mSet <- PerformBatchCorrection(mSet)
cat('[1] "Correcting using the Automatically method !"
[1] "Correcting with Combat..."
Standardizing Data across genes
[1] "Correcting with WaveICA..."
[1] "Correcting with EigenMS..."
[1] "Correcting with QC-RLSC..."
[1] "Correcting with ANCOVA..."
[1] "Correcting with RUV-random..."
[1] "Correcting with RUV-2..."
[1] "Correcting with RUVs..."
[1] "Correcting with RUVr..."
Loading required package: edgeR
Loading required package: limma
[1] "Correcting with RUVg..."
[1] "Correcting with NOMIS..."
[1] "Correcting with CCMN..."
[1] "Best results generated by  combat  !"')
# Show the distances between batches
mSet$dataSet$interbatch_dis
cat('combat_edata    WaveICA_edata    EigenMS_edata    QC_RLSC_edata     ANCOVA_edata RUV_random_edata      RUV_2_edata      RUV_s_edata      RUV_r_edata 
        9.367183        10.099356        13.105851        10.978753        11.470403        10.163140        10.842479        22.628060        11.146660 
     RUV_g_edata      NOMIS_edata       CCMN_edata 
       11.018191        11.196554        11.104648 ')
knitr::include_graphics("vignette_figures/Example1_BC PCAdpi300.png",dpi = 480)
knitr::include_graphics("vignette_figures/Example1_BC Trenddpi300.png",dpi = 480)

4. Case 2 - IBD Benchmark Data

This Benchmark data is a large batch data with over 600 samples. The data file size is over 70 MB. This part is designed for repeating our published results. Running of this part may take over 30min to finish. You are strongly recommonded to use your own data instead of this large file to avoid the consuming of patience.

4.1 Data Downloading

# Use Google API for data downloading peak feature here. 
# Please "install.packages('googledrive')" and "install.packages('httpuv')"first.
library(googledrive);
temp <- tempfile(fileext = ".csv")
# Please authorize your google account to access the data
dl1 <- drive_download(
  as_id("1wEh2P81J_xFWJs5y4mq98-FsjxJ5wmBO"), path = temp, overwrite = TRUE)
# Use Google API for data downloading meta data here. 
# Please "install.packages('googledrive')" and "install.packages('httpuv')"first.
library(googledrive);
temp <- tempfile(fileext = ".csv")
# Please authorize your google account to access the data
dl2 <- drive_download(
  as_id("1IOF2pCYrd0sbmFt3x9XOcwYj6XEA2z1n"), path = temp, overwrite = TRUE)
cat('File downloaded:
  * metaboanalyst_input.csv
Saved locally as:
  * /tmp/RtmpIBzO2x/file11c64ad429b2.csv')

4.2 Library Package

# Load the MetaboAnalystR package
library("MetaboAnalystR")

4.3 Data Filtering and Normalization

mSet<-InitDataObjects("pktable", "stat", FALSE)
cat('Starting Rserve:
 /usr/lib/R/bin/R CMD /home/xialab/R/x86_64-pc-linux-gnu-library/3.6/Rserve/libs//Rserve --no-save 


R version 3.6.3 (2020-02-29) -- "Holding the Windsock"
Copyright (C) 2020 The R Foundation for Statistical Computing
Platform: x86_64-pc-linux-gnu (64-bit)

R is free software and comes with ABSOLUTELY NO WARRANTY.
You are welcome to redistribute it under certain conditions.',
"Type 'license()' or 'licence()' for distribution details.

  Natural language support but running in an English locale

R is a collaborative project with many contributors.
Type 'contributors()' for more information and
'citation()' on how to cite R or R packages in publications.

Type 'demo()' for some demos, 'help()' for on-line help, or
'help.start()' for an HTML browser interface to help.
Type 'q()' to quit R.",

'##> SOCK_ERROR: bind error #98(address already in use)
Rserv started in daemon mode.
[1] "MetaboAnalyst R objects initialized ..."')
mSet<-Read.TextData(mSet, dl1$local_path, "col", "disc")
mSet<-SanityCheckData(mSet)
mSet<-ReplaceMin(mSet);
mSet<-FilterVariable(mSet, "iqr", "F", 25)
mSet<-PreparePrenormData(mSet)
mSet<-Normalization(mSet, "MedianNorm", "LogNorm", "NULL", ratio=FALSE, ratioNum=20)
cat('[1] "Successfully passed sanity check!"                                                                    
 [2] "Samples are not paired."                                                                              
 [3] "4 groups were detected in samples."                                                                   
 [4] "Only English letters, numbers, underscore, hyphen and forward slash (/) are allowed."                 
 [5] "<font color=\"orange\">Other special characters or punctuations (if any) will be stripped off.</font>"
 [6] "All data values are numeric."                                                                         
 [7] "A total of 744120 (14.4%) missing values were detected."                                              
 [8] "<u>By default, these values will be replaced by a small value.</u>"                                   
 [9] "Click <b>Skip</b> button if you accept the default practice"                                          
[10] "Or click <b>Missing value imputation</b> to use other methods" ')
cat('[1] " Further feature filtering based on Interquantile Range Reduced to 5000 features based on Interquantile Range"
[1] " Further feature filtering based on Interquantile Range Reduced to 5000 features based on Interquantile Range"')
### Data Orgnization
normalized_set <- mSet[["dataSet"]][["norm"]]
ordered_normalized_set <- normalized_set[ order(row.names(normalized_set)), ]
### import metadata
meta_data <- read.csv(dl2$local_path)
new_normalized_set <- cbind(meta_data[,2:4], ordered_normalized_set);
write.csv(new_normalized_set,file = "new_normalized_set.csv")
#perform PCA
mSet <- PCA.Anal(mSet)
mSet <- PlotPCAPairSummary(mSet, "pca_pair_0_", "png", 72, width=NA, 5)
mSet <- PlotPCAScree(mSet, "pca_scree_0_", "png", 72, width=NA, 5)
mSet <- PlotPCA2DScore(mSet, "pca_score2d_0_", "png", 72, width=NA,style = 2, 1,2,0.95,0,0)
knitr::include_graphics("vignette_figures/IBD_BC_pca_score2d_0_dpi72.png",dpi = 120)

4.4 Initializing mSet Object

mSet <- InitDataObjects("pktable", "utils", FALSE)
cat('[1] "MetaboAnalyst R objects initialized ..."')

4.5 Data Reading

## we set samples in "row" according to the table format. If your samples are in column, set it as "col".
mSet <- Read.BatchCSVdata(mSet, "new_normalized_set.csv", "row")

4.6 Automatic Correction

mSet <- PerformBatchCorrection(mSet)
cat('[1] "Correcting using the Automatically method !"
[1] "Correcting with Combat..."
Standardizing Data across genes
[1] "Correcting with WaveICA..."
[1] "Correcting with EigenMS..."
[1] "Correcting with QC-RLSC..."
Cluster size 2216 broken into 551 1665 
Done cluster 551 
Cluster size 1665 broken into 1025 640 
Done cluster 1025 
Done cluster 640 
Done cluster 1665 
[1] "Correcting with ANCOVA..."
[1] "Best results generated by  EigenMS  !"')
# Show the distances between batches
mSet$dataSet$interbatch_dis
cat('combat_edata WaveICA_edata EigenMS_edata QC_RLSC_edata  ANCOVA_edata 
     39.42686      38.32145      24.18179      36.58138      38.38796  ')
knitr::include_graphics("vignette_figures/Example2_image_BC PCAdpi300.png",dpi = 480)
knitr::include_graphics("vignette_figures/Example2_image_BC Trenddpi300.png",dpi = 480)

4.7 Data visualization - PCA plotting with groups

## remove the batch column in the corrected result
data_corrected <- read.csv("/home/xialab/Documents/MetaboAnalyst_batch_data.csv")
data_corrected_new <- data_corrected[,-3]
write.csv(data_corrected_new,file = "MetaboAnalyst_batch_data_stats.csv",row.names =  F)
mSet<-InitDataObjects("pktable", "stat", FALSE)
mSet<-Read.TextData(mSet, "MetaboAnalyst_batch_data_stats.csv", "row", "disc")
mSet<-SanityCheckData(mSet)
mSet<-ReplaceMin(mSet);
mSet<-FilterVariable(mSet, "iqr", "F", 25)
mSet<-PreparePrenormData(mSet)
mSet<-Normalization(mSet, "NULL", "NULL", "NULL", ratio=FALSE, ratioNum=20) # No need to normalize again
mSet <- PCA.Anal(mSet)
mSet <- PlotPCAPairSummary(mSet, "pca_pair_0_", "png", 72, width=NA, 5)
mSet <- PlotPCAScree(mSet, "pca_scree_0_", "png", 72, width=NA, 5)
mSet <- PlotPCA2DScore(mSet, "pca_score2d_0_", "png", 72, width=NA, style = 2, 1,2,0.95,0,0)
knitr::include_graphics("vignette_figures/IBD_BCfree_pca_score2d_0_dpi72.png",dpi = 120)
# ============ END OF THIS VIGNETTE =============


xia-lab/MetaboAnalystR3.0 documentation built on May 6, 2020, 11:03 p.m.