In ningzhibin/rmdocpu: Metalab report upon provided templates

htmltools::img(src = "https://raw.githubusercontent.com/ningzhibin/rmdocpu/master/inst/rmd/iMetaReport.png", 
               alt = 'logo', 
               style = 'position:absolute; top:0; right:0; padding:10px;width:150px;height:150px;')

knitr::opts_chunk$set(echo = FALSE,warning = FALSE, message = FALSE, cache = FALSE)

# enviroment setup
source("https://raw.githubusercontent.com/ningzhibin/rmdocpu/master/inst/subfunctions_general.r")
source("https://raw.githubusercontent.com/ningzhibin/rmdocpu/master/inst/subfunctions_general_update.r")

library(tidyverse)
library(ggplot2)
library(d3heatmap)
library(plotly)

# other reqruied package:
#  DT

#library(gplots)

# local usage test and debug
# rmarkdown::render("MQ_report_peptides.Rmd", params = list(input_datatable =  your_readin_tbl))
# or with meta information
# rmarkdown::render("MQ_report_peptides.Rmd", params = list(input_datatable =  your_readin_tbl, meta_table = meta_table_input))


# todo meta check

peptide.txt  <- read.delim("peptides.txt", header = TRUE,check.names = FALSE, stringsAsFactors = FALSE)


#meta_table <- read.delim("metadata.txt", header = TRUE, check.names = FALSE, stringsAsFactors = FALSE) # test with meta file

tidy_peptides <- function(peptide.txt){ 

  peptide_sequence <- peptide.txt$Sequence # only keep the first one

  # do the row wise filtering
  index_contaminant <- grep("\\+", peptide.txt$`Potential contaminant`) # note that + is a special character
  index_reverse <- grep("\\+", peptide.txt$Reverse)
  index_to_remove <- c(index_contaminant,index_reverse)

  if(length(index_to_remove) >0){ # some times there are no rows to remove
      peptide.txt <- peptide.txt[-index_to_remove,] # filtered table
      peptide_sequence <- peptide_sequence[-index_to_remove] # filtered ids
  }

  n_contaminant <- length(index_contaminant)
  n_reversed <- length(index_reverse)


  # extra the intensity column matrix
  if(any(grepl("LFQ intensity ", colnames(peptide.txt)))){ # if there are LFQ intensity columns, take out the LFQ columns
    intensity_columns <- peptide.txt[,grep("LFQ intensity ", colnames(peptide.txt))]
    colnames(intensity_columns) <- gsub("LFQ intensity ", "", colnames(intensity_columns))

  }else{ # otherwise take out intensity column, even if there is one column, without any experiment desgin

    intensity_columns <-   peptide.txt[,grep("Intensity ", colnames(peptide.txt)),drop =  FALSE]
    colnames(intensity_columns)<-gsub("Intensity ", "", colnames(intensity_columns))

  }

  return(list("intensity_matrix" = intensity_columns,
              "peptide_sequence" =peptide_sequence,
              "n_contaminant" = n_contaminant,
              "n_reversed" = n_reversed,
              "score" = peptide.txt$Score,
              "Charges" =peptide.txt$Charges,
              "length" = peptide.txt$Length,
              "misscleavage" = peptide.txt$"Missed cleavages"

              ))

}

# input

peptide.txt <- params$input_datatable


# Note: The folling analysis with meta info assumes that
# 1st columns as sample name, 2nd column as experiment name, 3rd column and after as grouping

meta_table <- params$meta_table

# tidy and process:

peptide_tidyed <- tidy_peptides(peptide.txt)

df_intensity <- peptide_tidyed$intensity_matrix
sparsity <- rowSums(df_intensity > 0) # here sparsity is number of present values
index_all_na_rows <- which(sparsity == 0)
df_intensity <- df_intensity[-index_all_na_rows,,drop = FALSE]
peptide_sequence <- peptide_tidyed$peptide_sequence[-index_all_na_rows]


# check meta_table

if( is.null(meta_table)){ 

  meta_info <- "* **No meta information provided**"

}else if (any(is.na(meta_table))){ 

  meta_info <-  "* **The meta information provided has missing values, please check again**"

}else if(!(all(as.vector(data.frame$rawfile) %in% meta_table[,1]) && all( meta_table[,1] %in% as.vector(data.frame$rawfile)))){
  meta_info <-  "* **The raw files in meta information provided do not match the raw file names in the summary.txt, please check again**"

}else{
 meta_info <-   c("* **Groups: **",unique(meta_table[,3]))  
}

Intro

This report provides some basic description of the peptide identificaiton from database search. Users can use this to quickly check the overal quality of the experiment Users can download the clean peptide quantification matrix for downstream analysis

Take-home figures

Number of contaminant: r peptide_tidyed$n_contaminant
Number of reversed: r peptide_tidyed$n_reversed
Number of qualified peptides: r nrow(peptide_tidyed$intensity_matrix)
Number of quailfied peptide without intensity information(0 intensities): r length(index_all_na_rows)
Number of experiment: r ncol(df_intensity)
All experiments: r colnames(df_intensity)

r if( !is.null(meta_table)){ c("* **Meta/grouping info provided: **",unique(meta_table[,3]))}else{ "* **No meta information provided**"}

Peptide Charge States

peptide_charge <- as.data.frame(table(peptide_tidyed$Charges))
colnames(peptide_charge) = c("Charge_state", "Freq")


p<- ggplot2::ggplot(data = peptide_charge)+
  geom_col(aes(x = Charge_state,y = Freq))+
  theme_bw()
plotly::ggplotly(p)

#plot_ly(peptide_charge, x = ~Charge_state, y = ~Freq, type = "bar") %>% add_markers()

Peptide Length

peptide_length <- as.data.frame(table(peptide_tidyed$length))
colnames(peptide_length) = c("peptide_length", "Freq")


p<- ggplot2::ggplot(data = peptide_length)+
  geom_col(aes(x = peptide_length,y = Freq))+
  theme_bw()
plotly::ggplotly(p)

#plot_ly(peptide_charge, x = ~Charge_state, y = ~Freq, type = "bar") %>% add_markers()

Peptide Score distribution

df_score <- data.frame(Score  = peptide_tidyed$score)


p<- ggplot2::ggplot(data=df_score, aes(Score)) +
  geom_histogram(aes(y=..density..),color= "black",fill="white")+
  geom_density(alpha=.2, fill="#FF6666")+
  theme_bw()

plotly::ggplotly(p)

Peptide Intensity distribution

In most cases, label-free quantification provides a decent way for metaproteomics profiling.

Sparisty Profile

Presence of valid intensity across samples

df_sparsity <- as.data.frame(table(sparsity))
df_sparsity_dec <- df_sparsity[order(df_sparsity$sparsity,decreasing = TRUE),]
df_sparsity_dec$sparsity <- factor(df_sparsity_dec$sparsity, levels = df_sparsity_dec$sparsity)


p<- ggplot2::ggplot(data = df_sparsity_dec)+
  geom_col(aes(x = sparsity,y = Freq))+
  xlab("Presence in the Peptide Intensity Matrix")+
  theme_bw()
plotly::ggplotly(p)

Sparsity cummulative curve

Figure shows how many peptides have more than N presence, which helps to set the presence cutoff

#df_sparsity_dec <- df_sparsity[order(df_sparsity$sparsity,decreasing = TRUE),]

df_sparsity_dec$cumsum <-  cumsum(df_sparsity_dec$Freq)

# redefine the sparisty sequence, in order to keep the order in the plot, This is the best way as far as I know
#df_sparsity_dec$sparsity <- factor(df_sparsity_dec$sparsity, levels = df_sparsity_dec$sparsity)


p<- ggplot2::ggplot(data = df_sparsity_dec)+
  geom_col(aes(x = sparsity,y = cumsum))+
  xlab("Presence in the Peptide Intensity Matrix")+
  theme_bw()
plotly::ggplotly(p)

Intensity disitribution acrross samples/experiments

#note: in this log10 intenisty matrix, 0 is converted into infinity, therefore will not show in the box plot, and does not affect the distribution
df_intensity_log10 <-log10(df_intensity)

data_matrix_log10_melt<-reshape2::melt(as.matrix(df_intensity_log10))

colnames(data_matrix_log10_melt) <- c("Proteins", "Samples", "Log10(Inensity)")

p<-ggplot(data_matrix_log10_melt)+
    geom_boxplot(aes(x = Samples, y = `Log10(Inensity)`)) +
    theme_bw()+
    theme(axis.text.x = element_text(angle = 90, hjust = 1))


plotly::ggplotly(p)

# sometimes the quantity of pepetide is too large, only show limited number of rows

if(ncol(df_intensity) > 1){ # only if there are more than 1 columns in experssion matrix 

  sparsity <- rowSums(df_intensity > 0) 

  index_P50 <- which(sparsity >= ncol(df_intensity)/2)
  index_P100 <- which(sparsity == ncol(df_intensity))

  if(length(index_P100) < 1000){ # if Q100 has less than 1000 rows, then plot Q50 on heatmap
    index_keep <- index_P50
  }else{
    index_keep <- index_P100
  }


}

r if(ncol(df_intensity) > 1){ "# Overall Expression Profile"}

r if(ncol(df_intensity) > 1){ "## Heatmap"}

r if(ncol(df_intensity) > 1){if(length(index_P100) < 1000){ "Heatmap of Presence50" }else{ "Heatmap of Presence 100" } }

# sometimes the quantity of pepetide is too large, only show limited number of rows

if(ncol(df_intensity) > 1){ # only if there are more than 1 columns in experssion matrix 

  df_intensity_heatmap_log10 <-log10(df_intensity[index_keep, , drop = FALSE]+1)
  rownames(df_intensity_heatmap_log10) <- peptide_sequence[index_keep]
  d3heatmap(df_intensity_heatmap_log10,show_grid = FALSE, color = "OrRd")

}

r if(ncol(df_intensity) > 3){"# PCA Analysis"}

if(ncol(df_intensity) > 3){ # otherwise, no point doing PCA

  df_intensity_log10 <- log10(df_intensity+1)

  if(is.null(meta_table)){

    #pca <-PCA_wrapper_prcomp2(data_matrix = as.matrix(df_intensity_log10), inputation = TRUE)
    PCA_result <- prcomp(t(df_intensity_log10))
    loading <- as.data.frame(PCA_result$x)
    #plot(loading$PC1, loading$PC2)
    #plot_ly(data = loading, x = ~PC1, y = ~PC2) %>% add_markers() %>% add_text(text = row.names(loading))

    p1 <- plot_ly(loading, x = ~PC1, y = ~PC2, z = ~PC3) %>%
    add_markers() %>%
    add_text(text = row.names(loading))



    # for screen plot
    sd <- PCA_result$sde
    var <- sd^2
    var.percent <- var/sum(var) * 100

    PCs <- paste("PC", 1:length(var.percent))
    df_scree <- data.frame(PC  = factor(PCs, levels = PCs), ratio =var.percent)


    p2 <- plot_ly(data = df_scree,x = ~PC, y = ~ratio, type = "bar") %>%
      layout(title = "Scree Plot of Principle components", xaxis = list(title = 'Principle Component'), yaxis = list(title = 'Variance(%)'))

    htmltools::tagList(list(p1, p2)) 

  }else{

    # do pca analysis with meta groupign information

    pca <-PCA_wrapper_prcomp2(data_matrix = as.matrix(df_intensity_log10), data_meta =  meta_table[,c(2,3)], inputation = FALSE)


    # plotting
    print(pca$pca_scree_plot)

      cat("Scree plot ##  shows the performance of the PCA analysis, the more percentage the top procomponets accounts, the better separation")

    print(pca$pca_component_plot)

      cat("2d PCA Component plot ##")

    print(pca$pca_confidence)

      cat("2d PCA Component plot ## with confidence boundaries")

    print(pca$pca_component_plot_kmeans)

      cat("2d PCA Component plot with K-means grouping, which is non-supervised grouping ")

    pca$pca_component_plot_3d_interactive

  }

}

r if(length(unique(meta_table[,3])) >=2 && all(table(meta_table[,3]) >=2)) {"# ANOVA TEST \n**Peptides with significiant(p < 0.05) change between any groups**"}

# will do this block while: 1, more than (includeing) two meta/groups, 2, each group has more than (includeing) 2 samples.

if(length(unique(meta_table[,3])) >=2  && all(table(meta_table[,3]) >=2)){

pvalues_anova <- matrix_PostHoc(df_intensity,meta_table[,3])

df_intensity_p <- cbind(pvalues_anova, df_intensity)

rownames(df_intensity_p) <- peptide_sequence

df_intensity_p_filtered <- df_intensity_p[which(!is.na(df_intensity_p$p_PostHoc_pairs)),] 

DT::datatable(df_intensity_p_filtered, extensions = 'Buttons',options = list(dom = "Blfrtip",scrollX=TRUE, buttons = list('copy', 'print', list(
        extend = 'collection',
        buttons = c('csv', 'excel', 'pdf'),
        text = 'Download'
      ))))

}

Download the neat table

The table is clean peptides expression table, with reversed and contaminant removed. The values are the LFQ(with label free quantification turned on) or raw protein intensity from Maxquant output. You can download/export and start from this table for downstream analysis using the (i)Metalab faminly apps.

The talbe can be further visualized by our shiny apps shiny.imetalab.ca

df_intensity <- as.data.frame(df_intensity)

rownames(df_intensity) <- peptide_sequence

DT::datatable(df_intensity, extensions = 'Buttons',options = list(dom = "Blfrtip",scrollX=TRUE, buttons = list('copy', 'print', list(
        extend = 'collection',
        buttons = c('csv', 'excel', 'pdf'),
        text = 'Download'
      ))))