proBatch.R
In proBatch: Tools for Diagnostics and Corrections of Batch Effects in Proteomics

## ----global_options, include=FALSE--------------------------------------------
knitr::opts_chunk$set(echo=TRUE, warning=FALSE, message=FALSE, fig.pos = 'h')

## ----setup, include = FALSE---------------------------------------------------
chooseCRANmirror(graphics=FALSE, ind=1)
knitr::opts_chunk$set(
  collapse = TRUE,
  comment = '#>'
)

options(tinytex.verbose = TRUE)


## ----batch_workflow, include = TRUE, fig.align = 'center', echo=FALSE, fig.cap='proBatch in batch correction workflow', out.width = '50%'----
knitr::include_graphics('Batch_effects_workflow_staircase.png')

## ----dependencies, eval = FALSE-----------------------------------------------
#  bioc_deps <- c("GO.db", "impute", "preprocessCore", "pvca","sva" )
#  cran_deps <- c("corrplot", "data.table", "ggplot2", "ggfortify","lazyeval",
#                 "lubridate", "pheatmap", "reshape2","readr", "rlang",
#                 "tibble", "dplyr", "tidyr", "wesanderson","WGCNA")
#  
#  if (!requireNamespace("BiocManager", quietly = TRUE))
#      install.packages("BiocManager")
#  BiocManager::install(bioc_deps)
#  install.packages(cran_deps)

## ----install_proBatch, fig.show='hold', eval = FALSE--------------------------
#  #install the development version from GitHub:
#  install.packages('devtools')
#  devtools::install_github('symbioticMe/proBatch', build_vignettes = TRUE)

## ----load_packages------------------------------------------------------------
require(dplyr)
require(tibble)
require(ggplot2)

## ----col_names----------------------------------------------------------------
feature_id_col = 'peptide_group_label'
measure_col = 'Intensity'
sample_id_col = 'FullRunName'
essential_columns = c(feature_id_col, measure_col, sample_id_col)

## ----tech_bio_cols------------------------------------------------------------
technical_factors = c('MS_batch', 'digestion_batch')
biological_factors = c('Strain', 'Diet', 'Sex')
biospecimen_id_col = 'EarTag'

## -----------------------------------------------------------------------------
batch_col = 'MS_batch'

## ----load_data, fig.show='hold'-----------------------------------------------
library(proBatch)
data('example_proteome', 'example_sample_annotation', 'example_peptide_annotation', 
     package = 'proBatch')

## ----date_to_order, fig.show='hold'-------------------------------------------
generated_sample_annotation <- date_to_sample_order(example_sample_annotation,
                                          time_column = c('RunDate','RunTime'),
                                          new_time_column = 'generated_DateTime',
                                          dateTimeFormat = c('%b_%d', '%H:%M:%S'),
                                          new_order_col = 'generated_order',
                                          instrument_col = NULL)
library(knitr)
kable(generated_sample_annotation[1:5,] %>%
  select(c('RunDate', 'RunTime', 'order', 'generated_DateTime', 'generated_order')))

## ----pep_annotation, fig.show='hold'------------------------------------------
generated_peptide_annotation <- create_peptide_annotation(example_proteome, 
                                        feature_id_col = 'peptide_group_label',
                                        protein_col = 'Protein')

## ----reduce_proteome----------------------------------------------------------
example_proteome = example_proteome %>% select(one_of(essential_columns))
gc()

## ----long_to_matrix-----------------------------------------------------------
example_matrix <- long_to_matrix(example_proteome, 
                                 feature_id_col = 'peptide_group_label',
                                 measure_col = 'Intensity', 
                                 sample_id_col = 'FullRunName')

## -----------------------------------------------------------------------------
log_transformed_matrix <- log_transform_dm(example_matrix, 
                                           log_base = 2, offset = 1)

## ---- fig.show='hold'---------------------------------------------------------
color_list <- sample_annotation_to_colors(example_sample_annotation, 
                                          factor_columns = c('MS_batch', 'digestion_batch',
                                                                'EarTag', 'Strain', 
                                                                'Diet', 'Sex'),
                                          numeric_columns = c('DateTime','order'))

## ----plot_mean, fig.show='hold', fig.width=5, fig.height=2--------------------
plot_sample_mean(log_transformed_matrix, example_sample_annotation, order_col = 'order', 
                 batch_col = batch_col, color_by_batch = TRUE, ylimits = c(12, 16.5),
                 color_scheme = color_list[[batch_col]])

## ----plot_boxplots, fig.show='hold', fig.width=10, fig.height=5---------------
log_transformed_long <- matrix_to_long(log_transformed_matrix)
batch_col = 'MS_batch'
plot_boxplot(log_transformed_long, example_sample_annotation, 
             batch_col = batch_col, color_scheme = color_list[[batch_col]])

## ----median_normalization_log, fig.show='hold'--------------------------------
median_normalized_matrix = normalize_data_dm(log_transformed_matrix, 
                                             normalize_func = 'medianCentering')

## ----median_normalization_raw, fig.show='hold'--------------------------------
median_normalized_matrix = normalize_data_dm(example_matrix, 
                                             normalize_func = 'medianCentering', 
                                             log_base = 2, offset = 1)

## ----quantile_norm, fig.show='hold'-------------------------------------------
quantile_normalized_matrix = normalize_data_dm(log_transformed_matrix,
                                               normalize_func = 'quantile')

## ----plot_mean_normalized, fig.show='hold', fig.width=5, fig.height=2---------
plot_sample_mean(quantile_normalized_matrix, example_sample_annotation, 
                 color_by_batch = TRUE, ylimits = c(12, 16), 
                 color_scheme = color_list[[batch_col]])

## ----plot_hierarchical, fig.show='hold', fig.width=10, fig.height=5-----------
selected_annotations <- c('MS_batch',  'digestion_batch', 'Diet')

#Plot clustering between samples 
plot_hierarchical_clustering(quantile_normalized_matrix, 
                             sample_annotation = example_sample_annotation,
                             color_list = color_list, 
                             factors_to_plot = selected_annotations,
                             distance = 'euclidean', agglomeration = 'complete',
                             label_samples = FALSE)

## ----plot_heatmap, fig.show='hold', fig.width=10, fig.height=11---------------
plot_heatmap_diagnostic(quantile_normalized_matrix, example_sample_annotation, 
             factors_to_plot = selected_annotations, 
             cluster_cols = TRUE, 
             color_list = color_list,
             show_rownames = FALSE, show_colnames = FALSE)

## ----plot_PCA, fig.show='hold', fig.width=10, fig.height=8.5------------------
pca1 = plot_PCA(quantile_normalized_matrix, example_sample_annotation, color_by = 'MS_batch', 
              plot_title = 'MS batch', color_scheme = color_list[['MS_batch']])
pca2 = plot_PCA(quantile_normalized_matrix, example_sample_annotation, color_by = 'digestion_batch', 
         plot_title = 'Digestion batch', color_scheme = color_list[['digestion_batch']])
pca3 = plot_PCA(quantile_normalized_matrix, example_sample_annotation, color_by = 'Diet',  
         plot_title = 'Diet', color_scheme = color_list[['Diet']])
pca4 = plot_PCA(quantile_normalized_matrix, example_sample_annotation, color_by = 'DateTime',  
         plot_title = 'DateTime', color_scheme = color_list[['DateTime']])

library(ggpubr)
ggarrange(pca1, pca2, pca3, pca4, ncol = 2, nrow = 2)

## ----plot_PCA_spec, fig.show='hold', fig.width=5, fig.height=5----------------
pca_spec = plot_PCA(quantile_normalized_matrix, example_sample_annotation, 
                    color_by = 'digestion_batch', 
         plot_title = 'Digestion batch')
pca_spec

## ----plot_PVCA, fig.show='hold', fig.width=5, fig.height=5--------------------
plot_PVCA(quantile_normalized_matrix, example_sample_annotation, 
                  technical_factors = technical_factors,
                  biological_factors = biological_factors)

## ----plot_spikeIn, fig.show='hold'--------------------------------------------
quantile_normalized_long <- matrix_to_long(quantile_normalized_matrix)
plot_spike_in(quantile_normalized_long, example_sample_annotation, 
              peptide_annotation = example_peptide_annotation,
              protein_col = 'Gene', spike_ins = 'BOVINE_A1ag', 
              plot_title = 'Spike-in BOVINE protein peptides',
              color_by_batch = TRUE, color_scheme = color_list[[batch_col]])


## ----loess_fit, fig.show='hold', fig.width=5, fig.height=2.4------------------
loess_fit_df <- adjust_batch_trend_df(quantile_normalized_long, example_sample_annotation)

## ----loess_30, fig.show='hold', fig.width=5, fig.height=2.4-------------------
loess_fit_30 <- adjust_batch_trend_df(quantile_normalized_long, example_sample_annotation, 
                                      span = 0.3)

plot_with_fitting_curve(feature_name = '10231_QDVDVWLWQQEGSSK_2', 
                        fit_df = loess_fit_30, fit_value_col = 'fit',
                        df_long = quantile_normalized_long, 
                        sample_annotation = example_sample_annotation, 
                        color_by_batch = TRUE, color_scheme = color_list[[batch_col]], 
                        plot_title = 'Span = 30%')

## ----loess_70, fig.show='hold', fig.width=5, fig.height=2.4-------------------
loess_fit_70 <- adjust_batch_trend_df(quantile_normalized_long, example_sample_annotation, 
                                      span = 0.7)
plot_with_fitting_curve(feature_name = '10231_QDVDVWLWQQEGSSK_2', 
                        fit_df = loess_fit_70, fit_value_col = 'fit',
                        df_long = quantile_normalized_long, 
                        sample_annotation = example_sample_annotation, 
                        color_by_batch = TRUE, color_scheme = color_list[[batch_col]],
                        plot_title = 'Span = 70%')

## ----median_batch, fig.show='hold', fig.width=3, fig.height=2.4---------------
peptide_median_df <- center_feature_batch_medians_df(loess_fit_df, example_sample_annotation)
plot_single_feature(feature_name = '10231_QDVDVWLWQQEGSSK_2', df_long = peptide_median_df, 
            sample_annotation = example_sample_annotation, measure_col = 'Intensity',
            plot_title = 'Feature-level Median Centered')

## ----comBat, fig.show='hold'--------------------------------------------------
comBat_df <- correct_with_ComBat_df(loess_fit_df, example_sample_annotation)

## ----combat_result, fig.show='hold',  fig.width=3, fig.height=2.4-------------
plot_single_feature(feature_name = '10231_QDVDVWLWQQEGSSK_2', 
                    df_long = loess_fit_df, 
                    sample_annotation = example_sample_annotation, 
                    plot_title = 'Loess Fitted')
plot_single_feature(feature_name = '10231_QDVDVWLWQQEGSSK_2', 
                    df_long =  comBat_df, 
                    sample_annotation = example_sample_annotation, 
                    plot_title = 'ComBat corrected')

## ----batch_corr_general, fig.show='hold'--------------------------------------
batch_corrected_df <- correct_batch_effects_df(df_long = quantile_normalized_long,
                                               sample_annotation = example_sample_annotation, 
                                               discrete_func = 'ComBat', 
                                               continuous_func = 'loess_regression',
                                               abs_threshold = 5, pct_threshold = 0.20)
batch_corrected_matrix = long_to_matrix(batch_corrected_df)

## ----setup_corr_heatmap, fig.show='hold', fig.height=5, fig.width=8-----------
earTags <- c('ET1524', 'ET2078', 'ET1322', 'ET1566', 'ET1354', 'ET1420', 'ET2154',
             'ET1515', 'ET1506', 'ET2577', 'ET1681', 'ET1585', 'ET1518', 'ET1906')

replicate_filenames = example_sample_annotation %>%
  filter(MS_batch %in% c('Batch_2', 'Batch_3')) %>%
  filter(EarTag %in% earTags) %>% 
  pull(!!sym('FullRunName'))

## ---- fig.show='hold', fig.width=10, fig.height=11----------------------------
p1_exp = plot_sample_corr_heatmap(log_transformed_matrix, 
                                  samples_to_plot = replicate_filenames, 
                                  plot_title = 'Correlation of selected samples')

## -----------------------------------------------------------------------------
breaksList <- seq(0.7, 1, by = 0.01) # color scale of pheatmap 
heatmap_colors = colorRampPalette(
  rev(RColorBrewer::brewer.pal(n = 7, name = 'RdYlBu')))(length(breaksList) + 1)

## ----corr_samples_heatmap, fig.show='hold', fig.height=2.12, fig.width=3.35----
# Plot the heatmap with annotations for the chosen samples
factors_to_show = c(batch_col, biospecimen_id_col)

p1 = plot_sample_corr_heatmap(log_transformed_matrix, 
                              samples_to_plot = replicate_filenames,
                              sample_annotation = example_sample_annotation,
                              factors_to_plot = factors_to_show,
                              plot_title = 'Log transformed correlation matrix of 
                              \nselected replicated samples', 
                              color_list = color_list,
                              heatmap_color = heatmap_colors, breaks = breaksList, 
                              cluster_rows= FALSE, cluster_cols=FALSE,fontsize = 4,
                              annotation_names_col = TRUE, annotation_legend = FALSE, 
                              show_colnames = FALSE)

p2 = plot_sample_corr_heatmap(batch_corrected_matrix, 
                              samples_to_plot = replicate_filenames,
                              sample_annotation = example_sample_annotation,
                              factors_to_plot = factors_to_show,
                               plot_title = 'Batch Corrected
                              \nselected replicated samples', 
                              color_list = color_list,
                              heatmap_color = heatmap_colors, breaks = breaksList, 
                              cluster_rows= FALSE, cluster_cols=FALSE,fontsize = 4,
                              annotation_names_col = TRUE, annotation_legend = FALSE, 
                              show_colnames = FALSE)
library(gridExtra)
grid.arrange(grobs = list(p1$gtable, p2$gtable), ncol=2)

## ----corr_samples_distrib, fig.show='hold', fig.width=3.2, fig.height=3.5-----
sample_cor_raw <- plot_sample_corr_distribution(log_transformed_matrix,
                                                 example_sample_annotation,
                                                 #repeated_samples = replicate_filenames,
                                                 batch_col = 'MS_batch', 
                                                 biospecimen_id_col = 'EarTag', 
                                                 plot_title = 'Correlation of samples (raw)',
                                                 plot_param = 'batch_replicate')
raw_corr = sample_cor_raw + 
  theme(axis.text.x = element_text(angle = 45, hjust = 1)) + 
  ylim(0.7,1) + xlab(NULL)

sample_cor_batchCor <- plot_sample_corr_distribution(batch_corrected_matrix,
                                                     example_sample_annotation, 
                                                     batch_col = 'MS_batch', 
                                                     plot_title = 'Batch corrected',
                                                     plot_param = 'batch_replicate')
corr_corr = sample_cor_batchCor + 
  theme(axis.text.x = element_text(angle = 45, hjust = 1)) + 
  ylim(0.7, 1) + xlab(NULL)

library(gtable)
library(grid)
g2 <- ggplotGrob(raw_corr)
g3 <- ggplotGrob(corr_corr)
g <- cbind(g2, g3, size = 'first')
grid.draw(g)

## ----correlation_of_peptides, eval = FALSE------------------------------------
#  peptide_cor_raw <- plot_peptide_corr_distribution(log_transformed_matrix,
#                                                    example_peptide_annotation,
#                                                    protein_col = 'Gene',
#                                                    plot_title = 'Peptide correlation (raw)')
#  
#  peptide_cor_batchCor <- plot_peptide_corr_distribution(batch_corrected_matrix,
#                                                         example_peptide_annotation,
#                                                         protein_col = 'Gene',
#                                                         plot_title = 'Peptide correlation (batch corrected)')
#  g2 <- ggplotGrob(peptide_cor_raw+
#    ylim(c(-1, 1)))
#  g3 <- ggplotGrob(peptide_cor_batchCor+
#    ylim(c(-1, 1)))
#  g <- cbind(g2, g3, size = 'first')
#  grid.draw(g)

## ----sessionInfo, eval=TRUE---------------------------------------------------
sessionInfo()

## ----citation-----------------------------------------------------------------
citation('proBatch')