R/proteome_wide_diagnostics.R
In proBatch: Tools for Diagnostics and Corrections of Batch Effects in Proteomics

Documented in calculate_PVCA plot_heatmap_diagnostic plot_heatmap_generic plot_hierarchical_clustering plot_PCA plot_PVCA plot_PVCA.df prepare_PVCA_df

#' cluster the data matrix to visually inspect which confounder dominates
#'
#' @inheritParams proBatch
#' @param distance distance metric used for clustering
#' @param agglomeration agglomeration methods as used by \code{hclust}
#' @param label_samples if \code{TRUE} sample IDs (column names of 
#' \code{data_matrix}) will be printed
#' @param label_font size of the font. Is active if \code{label_samples} is 
#' \code{TRUE}, ignored otherwise
#' @param fill_the_missing numeric value determining how  missing values 
#' should be substituted. If \code{NULL}, features with missing values are 
#' excluded.
#' @param ... other parameters of \code{plotDendroAndColors} from 
#' \code{WGCNA} package
#'
#' @return No return
#' @examples
#' 
#' selected_batches = example_sample_annotation$MS_batch %in% 
#'                                               c('Batch_1', 'Batch_2')
#' selected_samples = example_sample_annotation$FullRunName[selected_batches]
#' test_matrix = example_proteome_matrix[,selected_samples]
#' 
#' hierarchical_clustering_plot <- plot_hierarchical_clustering(
#' example_proteome_matrix, example_sample_annotation,
#' factors_to_plot = c('MS_batch', 'Diet', 'DateTime'),
#' color_list = NULL,  
#' distance = "euclidean", agglomeration = 'complete',
#' label_samples = FALSE)
#' 
#' #with defined color scheme:
#' color_list <- sample_annotation_to_colors (example_sample_annotation, 
#' factor_columns = c('MS_batch', "Strain", "Diet", "digestion_batch"),
#' numeric_columns = c('DateTime', 'order'))
#' hierarchical_clustering_plot <- plot_hierarchical_clustering(
#' example_proteome_matrix, example_sample_annotation,
#' factors_to_plot = c('MS_batch', "Strain", 'DateTime', "digestion_batch"),
#' color_list = color_list,  
#' distance = "euclidean", agglomeration = 'complete',
#' label_samples = FALSE)
#' 
#' @export
#'
#' @seealso \code{\link[stats]{hclust}}, 
#'   \code{\link{sample_annotation_to_colors}},
#'   \code{\link[WGCNA]{plotDendroAndColors}}
plot_hierarchical_clustering  <- function(data_matrix, sample_annotation,
                                          sample_id_col = 'FullRunName',
                                          color_list = NULL,
                                          factors_to_plot = NULL,
                                          fill_the_missing = 0,
                                          distance = "euclidean",
                                          agglomeration = 'complete',
                                          label_samples = TRUE, label_font = .2,
                                          filename = NULL, 
                                          width = 38, height = 25, 
                                          units = c('cm','in','mm'), 
                                          plot_title = NULL,
                                          ...){

  df_long = matrix_to_long(data_matrix, sample_id_col = sample_id_col)
  df_long = check_sample_consistency(sample_annotation, sample_id_col, df_long, 
                                     merge = FALSE)
  data_matrix = long_to_matrix(df_long, sample_id_col = sample_id_col)
  rm(df_long)
  
  warning_message <- 'Hierarchical clustering cannot operate with missing values
                      in the matrix'
  data_matrix = handle_missing_values(data_matrix, warning_message, 
                                      fill_the_missing)
  
  dist_matrix = dist(t(as.matrix(data_matrix)), method = distance)
  hierarchical_clust = hclust(dist_matrix, method = agglomeration)
  if (label_samples){
    cex.dendroLabels = label_font
    if (ncol(data_matrix) > 80){
      warning('Too many samples, adjust the font with `label_font` argument or
              remove labels by setting `label_samples = FALSE` in 
              function call')
    }
  } else{
    cex.dendroLabels = 0.9
  }
  
  factors_without_colors = setdiff(factors_to_plot, names(color_list))
  if(length(factors_without_colors) > 0){
    warning('color_list for samples annotation not defined, inferring 
             automatically. Numeric/factor columns are guessed, for more 
            controlled color mapping use sample_annotation_to_colors()')
    color_list_new <- sample_annotation_to_colors(sample_annotation, 
                                                sample_id_col = sample_id_col,
                                                factor_columns = factors_without_colors, 
                                                numeric_columns = NULL)
    color_list = c(color_list, color_list_new)
  }
  
  
  if (length(setdiff(names(color_list), factors_to_plot)) > 0 && 
      !is.null(factors_to_plot)){
    color_list = color_list[factors_to_plot]
  }
  
  #transform color list to color_df
  sample_annotation = sample_annotation %>% 
    filter(!!sym(sample_id_col) %in% colnames(data_matrix))
  color_df = color_list_to_df(color_list, sample_annotation, sample_id_col)
  
  if (is.null(filename)){
    plotDendroAndColors(hierarchical_clust, color_df, rowTextAlignment = 'left',
                        main = plot_title,
                        hang = -0.1, addGuide = TRUE, dendroLabels = FALSE, 
                        cex.dendroLabels = cex.dendroLabels,
                        ...)
  } else {
    
    units_adjusted = adjust_units(units, width, height)
    units = units_adjusted$unit
    width = units_adjusted$width
    height = units_adjusted$height
    
    if (is.na(width)){
      width = 7
    }
    if (is.na(height)){
      height = 7
    }
    if (file_ext(filename) == 'pdf'){
      pdf(file = filename, width = width, height = height, title = plot_title)
    } else if(file_ext(filename) == 'png'){
      png(filename = filename, width = width, height = height, units = units, 
          res = 300)
    } else{
      stop('currently only pdf and png extensions for filename are implemented')
    }
    
    plotDendroAndColors(hierarchical_clust, color_df, rowTextAlignment = 'left',
                        main = plot_title,
                        hang = -0.1, addGuide = TRUE, dendroLabels = FALSE, 
                        cex.dendroLabels = cex.dendroLabels,
                        ...)
    
    dev.off()
  }
}

#' Plot the heatmap of samples (cols) vs features (rows)
#'
#' @inheritParams proBatch
#' @param factors_to_plot vector of technical and biological factors to be 
#' plotted in this diagnostic plot (assumed to be present in 
#' \code{sample_annotation})
#' @param fill_the_missing numeric value that the missing values are
#'   substituted with, or \code{NULL} if features with missing values are to be 
#'   excluded.
#' @param color_for_missing special color to make missing values. 
#' Usually black or white, depending on \code{heatmap_color}
#' @param heatmap_color vector of colors used in heatmap (typicall a gradient)
#' @param cluster_rows boolean value determining if rows 
#' should be clustered
#' @param cluster_cols boolean value determining if columns 
#' should be clustered
#' @param factors_of_feature_ann vector of factors that characterize features, 
#' as listed in \code{peptide_annotation}
#' @param color_list_features list, as returned by 
#' \code{sample_annotation_to_colors}, 
#' but mapping \code{peptide_annotation} where each item contains a color vector 
#' for each factor to be mapped to the color.
#' @param ... other parameters of \code{link[pheatmap]{pheatmap}}
#' 
#' @return object returned by \code{link[pheatmap]{pheatmap}}
#' @export
#' 
#' @examples 
#' 
#' log_transformed_matrix = log_transform_dm(example_proteome_matrix)
#' heatmap_plot <- plot_heatmap_diagnostic(log_transformed_matrix, 
#' example_sample_annotation, 
#' factors_to_plot = c("MS_batch",  "digestion_batch", "Diet", 'DateTime'), 
#' cluster_cols = TRUE, cluster_rows = FALSE,
#' show_rownames = FALSE, show_colnames = FALSE)
#' 
#' color_list <- sample_annotation_to_colors (example_sample_annotation, 
#' factor_columns = c('MS_batch','EarTag', "Strain", 
#' "Diet", "digestion_batch", "Sex"),
#' numeric_columns = c('DateTime', 'order'))
#' 
#' log_transformed_matrix = log_transform_dm(example_proteome_matrix)
#' heatmap_plot <- plot_heatmap_diagnostic(log_transformed_matrix, 
#' example_sample_annotation, 
#' factors_to_plot = c("MS_batch",  "digestion_batch", "Diet", 'DateTime'), 
#' cluster_cols = TRUE, cluster_rows = FALSE,
#' color_list = color_list,
#' show_rownames = FALSE, show_colnames = FALSE)
#'
#' @seealso \code{\link{sample_annotation_to_colors}}, 
#' \code{\link[pheatmap]{pheatmap}}
#' 
#' @name plot_heatmap_diagnostic
plot_heatmap_diagnostic <- function(data_matrix, sample_annotation = NULL, 
                                    sample_id_col = 'FullRunName',
                                    factors_to_plot = NULL,
                                    fill_the_missing = -1, 
                                    color_for_missing = 'black',
                                    heatmap_color = colorRampPalette(
                                      rev(brewer.pal(n = 7, 
                                                     name = "RdYlBu")))(100),
                                    cluster_rows = TRUE, cluster_cols = FALSE,
                                    color_list = NULL,
                                    peptide_annotation = NULL,
                                    feature_id_col = 'peptide_group_label',
                                    factors_of_feature_ann = c('KEGG_pathway',
                                                               'evolutionary_distance'),
                                    color_list_features = NULL,
                                    filename = NULL, width = 7, height = 7, 
                                    units = c('cm','in','mm'), 
                                    plot_title = NULL,
                                    ...){
  
  df_long = matrix_to_long(data_matrix, sample_id_col = sample_id_col)
  df_long = check_sample_consistency(sample_annotation, sample_id_col, df_long, 
                                     merge = FALSE)
  data_matrix = long_to_matrix(df_long, sample_id_col = sample_id_col)
  rm(df_long)
  
  #infer the color scheme for sample annotation (cols)
  if(is.null(color_list) && !is.null(sample_annotation)){
    warning('color_list for samples (cols) not defined, inferring automatically.
            Numeric/factor columns are guessed, for more controlled color 
            mapping use sample_annotation_to_colors()')
    color_list = sample_annotation_to_colors(sample_annotation = sample_annotation, 
                                             sample_id_col = sample_id_col, 
                                             factor_columns = factors_to_plot,
                                             numeric_columns = NULL,
                                             guess_factors = TRUE)
  }
  
  #infer the color scheme for feature annotation (rows)
  if(is.null(color_list_features) && !is.null(peptide_annotation)){
    warning('color_list_features for features (rows) not defined, inferring 
            automatically. Numeric/factor columns are guessed, for more 
            controlled color mapping use sample_annotation_to_colors()')
    color_list_features = sample_annotation_to_colors(peptide_annotation, 
                                                      sample_id_col = feature_id_col, 
                                                      factor_columns = factors_of_feature_ann,
                                                      numeric_columns = NULL,
                                                      guess_factors = TRUE)
  }
  
  p <- plot_heatmap_generic(data_matrix, 
                            column_annotation_df = sample_annotation,
                            row_annotation_df = peptide_annotation, 
                            fill_the_missing = fill_the_missing, 
                            col_ann_id_col = sample_id_col,
                            row_ann_id_col = feature_id_col,
                            columns_for_cols = factors_to_plot,
                            columns_for_rows = factors_of_feature_ann,
                            cluster_rows = cluster_cols, 
                            cluster_cols = cluster_cols,
                            annotation_color_cols = color_list,
                            annotation_color_rows = color_list_features,
                            heatmap_color = heatmap_color,
                            color_for_missing = color_for_missing,
                            filename = filename, width = width, height = width, 
                            units = units, 
                            plot_title = plot_title,
                            ...)
  return(p)
}

#' Plot the heatmap
#'
#' @inheritParams proBatch
#' @param data_matrix the matrix of data to be plotted
#' @param column_annotation_df data frame annotating columns of 
#' \code{data_matrix}
#' @param row_annotation_df data frame annotating rows of \code{data_matrix}
#' @param col_ann_id_col column of \code{column_annotation_df} whose values are
#' unique identifiers of columns in \code{data_matrix}
#' @param row_ann_id_col column of \code{row_annotation_df} whose values are 
#' unique identifiers of rows in \code{data_matrix}
#' @param columns_for_cols vector of factors (columns) of 
#' \code{column_annotation_df}
#' that will be mapped to color annotation of heatmap columns
#' @param columns_for_rows vector of factors (columns) of 
#' \code{row_annotation_df}
#' that will be mapped to color annotation of heatmap rows
#' @param cluster_rows boolean: whether the rows should be clustered
#' @param cluster_cols boolean: whether the rows should be clustered
#' @param annotation_color_cols list of color vectors for column annotation,
#' for each factor to be plotted; for factor-like variables a named vector 
#' (names should correspond to the levels of factors). Advisable to supply here
#' color list returned by \code{sample_annotation_to_colors}
#' @param annotation_color_rows list of color vectors for row annotation,
#' for each factor to be plotted; for factor-like variables a named vector 
#' (names should correspond to the levels of factors). Advisable to supply here
#' color list returned by \code{sample_annotation_to_colors}
#' @param fill_the_missing numeric value that the missing values are
#'   substituted with, or \code{NULL} if features with missing values are to be 
#'   excluded.
#' @param color_for_missing special color to make missing values. 
#' Usually black or white, depending on \code{heatmap_color}
#' @param heatmap_color vector of colors used in heatmap (typicall a gradient)
#' @param ... other parameters of \code{link[pheatmap]{pheatmap}}
#'
#' @return pheatmap-type object
#' @export
#' 
#' @examples 
#' 
#' p <- plot_heatmap_generic(log_transform_dm(example_proteome_matrix), 
#' column_annotation_df = example_sample_annotation,
#' columns_for_cols = c("MS_batch",  "digestion_batch", "Diet", 'DateTime'),
#' plot_title = 'test_heatmap',
#' show_rownames = FALSE, show_colnames = FALSE)
#' 
plot_heatmap_generic <- function(data_matrix, 
                                 column_annotation_df = NULL,
                                 row_annotation_df = NULL, 
                                 col_ann_id_col = 'FullRunName',
                                 row_ann_id_col = 'peptide_group_label',
                                 columns_for_cols = c('MS_batch','Diet', 
                                                      'DateTime','order'),
                                 columns_for_rows = c('KEGG_pathway',
                                                      'WGCNA_module',
                                                      'evolutionary_distance'),
                                 cluster_rows = FALSE, cluster_cols = TRUE,
                                 annotation_color_cols = NULL,
                                 annotation_color_rows = NULL,
                                 fill_the_missing = -1, 
                                 color_for_missing = 'black',
                                 heatmap_color = colorRampPalette(
                                   rev(brewer.pal(n = 7, 
                                                  name = "RdYlBu")))(100),
                                 filename = NULL, width = 7, height = 7, 
                                 units = c('cm','in','mm'), 
                                 plot_title = NULL,
                                 ...){
  
  #deal with the missing values
  warning_message <- 'Heatmap cannot operate with missing values in the matrix'
  data_matrix = handle_missing_values(data_matrix, warning_message, 
                                      fill_the_missing)
  if (is.null(fill_the_missing) & any(is.na(data_matrix)) &
      (!cluster_rows | !cluster_cols) ){
    message('With NAs removed, clustering of heatmap will work, 
              specify: cluster_rows = T, cluster_cols = T')
  }
  
  if(!is.null(fill_the_missing)){
    heatmap_color = c(color_for_missing, heatmap_color)
  }
  
  annotation_col = NA
  annotation_row = NA
  if(!is.null(column_annotation_df)){
    if(!is.null(columns_for_cols)){
      annotation_col = column_annotation_df %>% 
        select(one_of(col_ann_id_col, columns_for_cols))
    } else {
      annotation_col = column_annotation_df
    }
    annotation_col = annotation_col %>%
      mutate_if(is.POSIXct, as.numeric) %>%
      remove_rownames %>% 
      column_to_rownames(var=col_ann_id_col)  
  }
  
  if(!is.null(row_annotation_df)){
    if(!is.null(columns_for_rows)){
      annotation_row = row_annotation_df %>% 
        select(one_of(row_ann_id_col, columns_for_rows))
    } else {
      annotation_row = row_annotation_df
    }
    
    annotation_row = annotation_row %>%
      mutate_if(is.POSIXct, as.numeric) %>%
      remove_rownames %>% 
      column_to_rownames(var=row_ann_id_col)
  }
  
  if(is.null(column_annotation_df) && is.null(row_annotation_df)){
    warning("annotation_row and annotation_col are not specified for heatmap 
            (annotation of rows/cols such as sample annotation will not be plotted)")
  }
  
  if(!identical(annotation_col, NA) & (is.data.frame(annotation_col) | is.matrix(annotation_col))){
    if (!setequal(rownames(annotation_col), colnames(data_matrix))){
      warning('coloring by column annotation will not work: annotation rownames do not match data matrix column names')
    }
  }
  
  if(!identical(annotation_row, NA) & (is.data.frame(annotation_row) | is.matrix(annotation_row))){
    if (!setequal(rownames(annotation_row), rownames(data_matrix))){
      warning('coloring by row annotation will not work: annotation rownames do not match data matrix column names')
    }
  }
  
  if(is.null(plot_title)){
    plot_title = NA
  }
  if(is.null(filename)){
    filename = NA
  }
  units_adjusted = adjust_units(units, width, height)
  units = units_adjusted$unit
  width = units_adjusted$width
  height = units_adjusted$height
  
  if(is.null(annotation_color_cols) && is.null(annotation_color_rows)){
    annotation_color_list = NA
  } else {
    annotation_color_list = c(annotation_color_cols, annotation_color_rows)
  }
  
  p <- pheatmap(data_matrix, 
                cluster_rows = cluster_rows, cluster_cols = cluster_cols,
                color = heatmap_color,
                annotation_col = annotation_col, 
                annotation_row = annotation_row, 
                annotation_colors = annotation_color_list,
                filename = filename, width = width, height = height,
                main = plot_title, ...)
  return(p)
}

#' Calculate variance distribution by variable
#' 
#' @inheritParams proBatch
#' @param factors_for_PVCA vector of factors from \code{sample_annotation}, that
#'   are used in PVCA analysis
#' @param pca_threshold the percentile value of the minimum amount of the
#'   variabilities that the selected principal components need to explain
#' @param variance_threshold the percentile value of weight each of the factors
#'   needs to explain (the rest will be lumped together)
#' @param fill_the_missing numeric value determining how  missing values 
#' should be substituted. If \code{NULL}, features with missing values are 
#' excluded.
#' @return data frame of weights of Principal Variance Components
#' @export
#' 
#' @examples
#' 
#' matrix_test <- example_proteome_matrix[1:150, ]
#' pvca_df <- calculate_PVCA(matrix_test, example_sample_annotation, 
#' factors_for_PVCA = c('MS_batch', 'digestion_batch',"Diet", "Sex", "Strain"),
#' pca_threshold = .6, variance_threshold = .01, fill_the_missing = -1)
calculate_PVCA <- function(data_matrix, sample_annotation, 
                           feature_id_col = 'peptide_group_label',
                           sample_id_col = 'FullRunName',
                           factors_for_PVCA = c('MS_batch', 'digestion_batch',
                                                "Diet", "Sex", "Strain"),
                           pca_threshold = .6, variance_threshold = .01,
                           fill_the_missing = -1) {
  
  df_long = matrix_to_long(data_matrix, sample_id_col = sample_id_col)
  df_long = check_sample_consistency(sample_annotation, sample_id_col, df_long,
                                     batch_col = NULL, order_col = NULL, 
                                     facet_col = NULL, merge = FALSE)
  data_matrix = long_to_matrix(df_long, sample_id_col = sample_id_col)
  
  sample_annotation = sample_annotation   %>% 
    select(one_of(c(sample_id_col, factors_for_PVCA))) %>%
    mutate_if(is.POSIXct, as.numeric) %>%
    as.data.frame()%>%
    column_to_rownames(var = sample_id_col)
  
  data_matrix = check_feature_id_col_in_dm(feature_id_col, data_matrix)
  
  warning_message <- 'PVCA cannot operate with missing values in the matrix'
  data_matrix = handle_missing_values(data_matrix, warning_message, 
                                      fill_the_missing)
  
  covrts.annodf = Biobase::AnnotatedDataFrame(data=sample_annotation)
  data_matrix = data_matrix[, rownames(sample_annotation)]
  expr_set = Biobase::ExpressionSet(assayData = data_matrix, 
                                    phenoData = covrts.annodf)
  pvcaAssess = pvcaBatchAssess(expr_set, factors_for_PVCA, 
                               threshold = pca_threshold)
  pvcaAssess_df = data.frame(weights = as.vector(pvcaAssess$dat),
                             label = pvcaAssess$label,
                             stringsAsFactors = FALSE)
  
  label_of_small = sprintf('Below %1.0f%%', 100*variance_threshold)
  if (sum(pvcaAssess_df$weights < variance_threshold) > 1){
    small_weights <- pvcaAssess_df$weights < variance_threshold
    pvca_res_small = sum(pvcaAssess_df$weights[small_weights])
    pvca_res = pvcaAssess_df[pvcaAssess_df$weights >= variance_threshold, ]
    pvca_res_add = data.frame(weights = pvca_res_small, label = label_of_small)
    pvca_res = rbind(pvca_res, pvca_res_add)
  } else {
    pvca_res = pvcaAssess_df
  }
  return(pvca_res)
}

#' Plot variance distribution by variable
#' 
#' @inheritParams proBatch
#' @param technical_factors vector \code{sample_annotation} column names that 
#' are technical covariates
#' @param biological_factors vector \code{sample_annotation} column names, that
#'   are biologically meaningful covariates
#' @param colors_for_bars four-item color vector, specifying colors for the
#'   following categories: c('residual', 'biological', 'biol:techn',
#'   'technical')
#' @param pca_threshold the percentile value of the minimum amount of the
#'   variabilities that the selected principal components need to explain
#' @param variance_threshold the percentile value of weight each of the 
#'  covariates needs to explain (the rest will be lumped together)
#' @param fill_the_missing numeric value determining how  missing values 
#' should be substituted. If \code{NULL}, features with missing values are 
#' excluded.
#' If \code{NULL}, features with missing values are excluded.
#'
#' @return \code{ggplot} object with the plot
#' @export
#'
#' @examples 
#' matrix_test <- example_proteome_matrix[1:150, ]
#' pvca_plot <- plot_PVCA(matrix_test, example_sample_annotation, 
#' technical_factors = c('MS_batch', 'digestion_batch'),
#' biological_factors = c("Diet", "Sex", "Strain"))
#' 
#' \dontrun{
#' pvca_plot <- plot_PVCA(matrix_test, example_sample_annotation, 
#' technical_factors = c('MS_batch', 'digestion_batch'),
#' biological_factors = c("Diet", "Sex", "Strain"), 
#' filename = 'test_PVCA.png', width = 28, height = 22, units = 'cm')
#' }
#' 
#' @seealso \code{\link{sample_annotation_to_colors}}, 
#' \code{\link[ggplot2]{ggplot}}
plot_PVCA <- function(data_matrix, sample_annotation,
                      feature_id_col = 'peptide_group_label',
                      sample_id_col = 'FullRunName',
                      technical_factors = c('MS_batch', 'instrument'),
                      biological_factors = c('cell_line','drug_dose'),
                      fill_the_missing = -1,
                      pca_threshold = .6, variance_threshold = .01,
                      colors_for_bars = NULL,
                      filename = NULL, width = NA, height = NA, 
                      units = c('cm','in','mm'),
                      plot_title = NULL,
                      theme = 'classic'){
  pvca_res = prepare_PVCA_df(data_matrix = data_matrix, 
                             sample_annotation = sample_annotation,
                             feature_id_col = feature_id_col,
                             sample_id_col = sample_id_col,
                             technical_factors = technical_factors,
                             biological_factors = biological_factors,
                             fill_the_missing = fill_the_missing,
                             pca_threshold = pca_threshold, 
                             variance_threshold = variance_threshold)
  
  gg = plot_PVCA.df(pvca_res = pvca_res)
  return(gg)
}

#' prepare the weights of Principal Variance Components 
#'
#' @inheritParams proBatch
#' @param technical_factors vector \code{sample_annotation} column names that 
#' are technical covariates
#' @param biological_factors vector \code{sample_annotation} column names, that
#'   are biologically meaningful covariates
#' @param pca_threshold the percentile value of the minimum amount of the
#'   variabilities that the selected principal components need to explain
#' @param variance_threshold the percentile value of weight each of the 
#'  covariates needs to explain (the rest will be lumped together)
#' @param fill_the_missing numeric value determining how  missing values 
#' should be substituted. If \code{NULL}, features with missing values are 
#' excluded.
#' If \code{NULL}, features with missing values are excluded.
#'
#' @return data frame with weights and factors, combined in a way ready for plotting
#' @export
#'
#' @examples
#' matrix_test <- example_proteome_matrix[1:150, ]
#' pvca_df_res <- prepare_PVCA_df(matrix_test, example_sample_annotation, 
#' technical_factors = c('MS_batch', 'digestion_batch'),
#' biological_factors = c("Diet", "Sex", "Strain"), 
#' pca_threshold = .6, variance_threshold = .01, fill_the_missing = -1)
prepare_PVCA_df <- function(data_matrix, sample_annotation,
                            feature_id_col = 'peptide_group_label',
                            sample_id_col = 'FullRunName',
                            technical_factors = c('MS_batch', 'instrument'),
                            biological_factors = c('cell_line','drug_dose'),
                            fill_the_missing = -1,
                            pca_threshold = .6, variance_threshold = .01){
  
  factors_for_PVCA = c(technical_factors, biological_factors)
  
  pvca_res = calculate_PVCA(data_matrix, sample_annotation, 
                            feature_id_col = feature_id_col,
                            sample_id_col = sample_id_col,
                            factors_for_PVCA = factors_for_PVCA,
                            pca_threshold = pca_threshold, 
                            variance_threshold = variance_threshold,
                            fill_the_missing = fill_the_missing)
  
  tech_interactions = expand.grid(technical_factors, 
                                  technical_factors) %>%
    mutate(tech_interactions = paste(Var1, Var2, sep = ':')) %>%
    pull(tech_interactions)
  biol_interactions = expand.grid(biological_factors, 
                                  biological_factors) %>%
    mutate(biol_interactions = paste(Var1, Var2, sep = ':')) %>%
    pull(biol_interactions)
  
  label_of_small = sprintf('Below %1.0f%%', 100*variance_threshold)
  technical_factors = c(technical_factors, tech_interactions)
  biological_factors = c(biological_factors, biol_interactions)
  pvca_res = pvca_res %>% mutate(category = ifelse(label %in% technical_factors, 
                                                   'technical',
                                                   ifelse(label %in% biological_factors, 
                                                          'biological',
                                                          ifelse(label %in% c(label_of_small, 'resid'), 
                                                                 'residual', 'biol:techn'))))
  
  pvca_res = pvca_res %>%
    arrange(desc(weights)) %>%
    arrange(label == label_of_small) %>%
    arrange(label == 'resid')
  return(pvca_res)
}

#' plot PVCA, when the analysis is completed
#'
#' @inheritParams proBatch
#' @param pvca_res data frame of weights of Principal Variance Components, result
#' of \code{calculate_PVCA}
#' @param colors_for_bars four-item color vector, specifying colors for the
#'   following categories: c('residual', 'biological', 'biol:techn',
#'   'technical')
#'   
#' @return \code{ggplot} object with bars as weights, colored by bio/tech factors
#' @export
#'
#' @examples
#' matrix_test <- example_proteome_matrix[1:150, ]
#' pvca_df_res <- prepare_PVCA_df(matrix_test, example_sample_annotation, 
#' technical_factors = c('MS_batch', 'digestion_batch'),
#' biological_factors = c("Diet", "Sex", "Strain"), 
#' pca_threshold = .6, variance_threshold = .01, fill_the_missing = -1)
#' colors_for_bars = c('grey', 'green','blue','red')
#' names(colors_for_bars) = c('residual', 'biological','biol:techn','technical')
#' 
#' pvca_plot <- plot_PVCA.df(pvca_df_res, colors_for_bars)
plot_PVCA.df <- function(pvca_res,
                      colors_for_bars = NULL,
                      filename = NULL, width = NA, height = NA, 
                      units = c('cm','in','mm'),
                      plot_title = NULL,
                      theme = 'classic'){
  pvca_res = pvca_res %>%
    mutate(label = factor(label, levels=label))
  
  y_title = 'Weighted average proportion variance'
  gg  = ggplot(pvca_res, aes(x = label, y = weights, fill = category))+
    geom_bar(stat = 'identity', color = 'black')+
    ylab(y_title)
  
  
  if(is.null(colors_for_bars)){
    colors_for_bars = c('grey', wes_palettes$Rushmore[3:5])
    names(colors_for_bars) = c('residual', 'biological', 
                               'biol:techn', 'technical')
    
  } else {
    if (length(colors_for_bars) != 4){
      color_names = paste(c('residual', 'biological', 'biol:techn', 
                            'technical'), collapse = ' ')
      warning(sprintf('four colors for: %s were expected', color_names))
    }
  }
  gg = gg + scale_fill_manual(values = colors_for_bars)
  
  if (!is.null(plot_title)){
    gg = gg + ggtitle(plot_title)
  }
  
  #Change the theme
  if(!is.null(theme) && theme == 'classic'){
    gg = gg + theme_classic()
  }else{
    message("plotting with default ggplot theme, only theme = 'classic' 
            implemented")
  }
  
  save_ggplot(filename, units, width, height, gg)
  
  gg = gg +
    theme(axis.title.x = NULL, 
          axis.text.x = element_text(angle = 90, hjust = 1, vjust = .5))+
    xlab(NULL)+
    theme(text = element_text(size=15))+
    guides(fill=guide_legend(override.aes=list(color=NA), title=NULL))
  
  return(gg)
}

#' plot PCA plot
#'
#' @inheritParams proBatch
#' @param color_by column name (as in \code{sample_annotation}) to color by
#' @param PC_to_plot principal component numbers for x and y axis
#' @param fill_the_missing numeric value determining how  missing values 
#' should be substituted. If \code{NULL}, features with missing values are 
#' excluded.
#' If \code{NULL}, features with missing values are excluded.
#'
#' @return ggplot scatterplot colored by factor levels of column specified in
#'   \code{factor_to_color}
#' @export
#'
#' @examples 
#' pca_plot <- plot_PCA(example_proteome_matrix, example_sample_annotation, 
#' color_by = 'MS_batch', plot_title = "PCA colored by MS batch")
#' pca_plot <- plot_PCA(example_proteome_matrix, example_sample_annotation, 
#' color_by = 'DateTime', plot_title = "PCA colored by DateTime")
#' 
#' color_list <- sample_annotation_to_colors (example_sample_annotation, 
#' factor_columns = c('MS_batch', 'digestion_batch'),
#' numeric_columns = c('DateTime','order'))
#' pca_plot <- plot_PCA(example_proteome_matrix, example_sample_annotation, 
#' color_by = 'DateTime', color_scheme = color_list[['DateTime']])
#' 
#' \dontrun{
#' pca_plot <- plot_PCA(example_proteome_matrix, example_sample_annotation, 
#' color_by = 'DateTime', plot_title = "PCA colored by DateTime",
#' filename = 'test_PCA.png', width = 14, height = 9, units = 'cm')
#' }
#' 
#' @seealso \code{\link[ggfortify]{autoplot.pca_common}}, 
#' \code{\link[ggplot2]{ggplot}}
plot_PCA <- function(data_matrix, sample_annotation,
                     feature_id_col = 'peptide_group_label',
                     sample_id_col = 'FullRunName',
                     color_by = 'MS_batch',
                     PC_to_plot = c(1,2), fill_the_missing = -1,
                     color_scheme = 'brewer',
                     filename = NULL, width = NA, height = NA, 
                     units = c('cm','in','mm'),
                     plot_title = NULL,
                     theme = 'classic'){
  
  df_long = matrix_to_long(data_matrix, sample_id_col = sample_id_col)
  df_long = check_sample_consistency(sample_annotation, sample_id_col, df_long, 
                                     batch_col = color_by, order_col = NULL, 
                                     facet_col = NULL, merge = FALSE)
  data_matrix = long_to_matrix(df_long, sample_id_col = sample_id_col)
  
  data_matrix = check_feature_id_col_in_dm(feature_id_col, data_matrix)
  
  warning_message <- 'PCA cannot operate with missing values in the matrix'
  data_matrix = handle_missing_values(data_matrix, warning_message, 
                                      fill_the_missing)
  
  pr_comp_res <- prcomp(t(data_matrix))
  gg = autoplot(pr_comp_res, data = sample_annotation,
                x = PC_to_plot[1], y = PC_to_plot[2])
  
  
  #add colors
  
  if(length(color_by) > 1){
    warning('Coloring by the first column specified')
    color_by = color_by[1]
  } #TODO: create the ggpubr graph with multiple panels, colored by factors
  gg = color_by_factor(color_by_batch = TRUE, 
                       batch_col = color_by, gg = gg, 
                       color_scheme = color_scheme, 
                       sample_annotation = sample_annotation,
                       fill_or_color = 'color')
  
  #Add the title
  if(!is.null(plot_title)) {
    gg = gg + ggtitle(plot_title)+
      theme(plot.title = element_text(face = 'bold',hjust = .5))
  }
  
  #Change the theme
  if(!is.null(theme) && theme == 'classic'){
    gg = gg + theme_classic()
  }else{
    message("plotting with default ggplot theme, only theme = 'classic' 
            implemented")
  }
  
  save_ggplot(filename, units, width, height, gg)
  
  return(gg)
}
Any scripts or data that you put into this service are public.
proBatch documentation built on Nov. 8, 2020, 4:55 p.m.
rdrr.io home R language documentation Run R code online
CRAN packages Bioconductor packages R-Forge packages GitHub packages
Note that we can't provide technical support on individual packages. You should contact the package authors for that.
proBatch
Tools for Diagnostics and Corrections of Batch Effects in Proteomics

R/proteome_wide_diagnostics.R
In proBatch: Tools for Diagnostics and Corrections of Batch Effects in Proteomics

Defines functions plot_PCA plot_PVCA.df prepare_PVCA_df plot_PVCA calculate_PVCA plot_heatmap_generic plot_heatmap_diagnostic plot_hierarchical_clustering

Documented in calculate_PVCA plot_heatmap_diagnostic plot_heatmap_generic plot_hierarchical_clustering plot_PCA plot_PVCA plot_PVCA.df prepare_PVCA_df

Try the proBatch package in your browser

R Package Documentation

Browse R Packages

We want your feedback!

proBatch Tools for Diagnostics and Corrections of Batch Effects in Proteomics

R/proteome_wide_diagnostics.R In proBatch: Tools for Diagnostics and Corrections of Batch Effects in Proteomics

Defines functions plot_PCA plot_PVCA.df prepare_PVCA_df plot_PVCA calculate_PVCA plot_heatmap_generic plot_heatmap_diagnostic plot_hierarchical_clustering

Documented in calculate_PVCA plot_heatmap_diagnostic plot_heatmap_generic plot_hierarchical_clustering plot_PCA plot_PVCA plot_PVCA.df prepare_PVCA_df

Try the proBatch package in your browser

R Package Documentation

Browse R Packages

We want your feedback!

proBatch
Tools for Diagnostics and Corrections of Batch Effects in Proteomics

R/proteome_wide_diagnostics.R
In proBatch: Tools for Diagnostics and Corrections of Batch Effects in Proteomics