htmltools::img(src = "https://raw.githubusercontent.com/ningzhibin/rmdocpu/master/inst/rmd/iMetaReport.png", alt = 'logo', style = 'position:absolute; top:0; right:0; padding:10px;width:150px;height:150px;')
knitr::opts_chunk$set(echo = FALSE,warning = FALSE, message = FALSE, cache = FALSE) # enviroment setup source("https://raw.githubusercontent.com/ningzhibin/rmdocpu/master/inst/subfunctions_general.r") source("https://raw.githubusercontent.com/ningzhibin/rmdocpu/master/inst/subfunctions_general_update.r") library(tidyverse) library(ggplot2) library(d3heatmap) library(plotly) # other reqruied package: # DT #library(gplots) # local usage test and debug # rmarkdown::render("MQ_report_peptides.Rmd", params = list(input_datatable = your_readin_tbl)) # or with meta information # rmarkdown::render("MQ_report_peptides.Rmd", params = list(input_datatable = your_readin_tbl, meta_table = meta_table_input)) # todo meta check
peptide.txt <- read.delim("peptides.txt", header = TRUE,check.names = FALSE, stringsAsFactors = FALSE) #meta_table <- read.delim("metadata.txt", header = TRUE, check.names = FALSE, stringsAsFactors = FALSE) # test with meta file
tidy_peptides <- function(peptide.txt){ peptide_sequence <- peptide.txt$Sequence # only keep the first one # do the row wise filtering index_contaminant <- grep("\\+", peptide.txt$`Potential contaminant`) # note that + is a special character index_reverse <- grep("\\+", peptide.txt$Reverse) index_to_remove <- c(index_contaminant,index_reverse) if(length(index_to_remove) >0){ # some times there are no rows to remove peptide.txt <- peptide.txt[-index_to_remove,] # filtered table peptide_sequence <- peptide_sequence[-index_to_remove] # filtered ids } n_contaminant <- length(index_contaminant) n_reversed <- length(index_reverse) # extra the intensity column matrix if(any(grepl("LFQ intensity ", colnames(peptide.txt)))){ # if there are LFQ intensity columns, take out the LFQ columns intensity_columns <- peptide.txt[,grep("LFQ intensity ", colnames(peptide.txt))] colnames(intensity_columns) <- gsub("LFQ intensity ", "", colnames(intensity_columns)) }else{ # otherwise take out intensity column, even if there is one column, without any experiment desgin intensity_columns <- peptide.txt[,grep("Intensity ", colnames(peptide.txt)),drop = FALSE] colnames(intensity_columns)<-gsub("Intensity ", "", colnames(intensity_columns)) } return(list("intensity_matrix" = intensity_columns, "peptide_sequence" =peptide_sequence, "n_contaminant" = n_contaminant, "n_reversed" = n_reversed, "score" = peptide.txt$Score, "Charges" =peptide.txt$Charges, "length" = peptide.txt$Length, "misscleavage" = peptide.txt$"Missed cleavages" )) }
# input peptide.txt <- params$input_datatable # Note: The folling analysis with meta info assumes that # 1st columns as sample name, 2nd column as experiment name, 3rd column and after as grouping meta_table <- params$meta_table
# tidy and process: peptide_tidyed <- tidy_peptides(peptide.txt) df_intensity <- peptide_tidyed$intensity_matrix sparsity <- rowSums(df_intensity > 0) # here sparsity is number of present values index_all_na_rows <- which(sparsity == 0) df_intensity <- df_intensity[-index_all_na_rows,,drop = FALSE] peptide_sequence <- peptide_tidyed$peptide_sequence[-index_all_na_rows] # check meta_table if( is.null(meta_table)){ meta_info <- "* **No meta information provided**" }else if (any(is.na(meta_table))){ meta_info <- "* **The meta information provided has missing values, please check again**" }else if(!(all(as.vector(data.frame$rawfile) %in% meta_table[,1]) && all( meta_table[,1] %in% as.vector(data.frame$rawfile)))){ meta_info <- "* **The raw files in meta information provided do not match the raw file names in the summary.txt, please check again**" }else{ meta_info <- c("* **Groups: **",unique(meta_table[,3])) }
This report provides some basic description of the peptide identificaiton from database search. Users can use this to quickly check the overal quality of the experiment Users can download the clean peptide quantification matrix for downstream analysis
Number of contaminant: r peptide_tidyed$n_contaminant
Number of reversed: r peptide_tidyed$n_reversed
Number of qualified peptides: r nrow(peptide_tidyed$intensity_matrix)
Number of quailfied peptide without intensity information(0 intensities): r length(index_all_na_rows)
Number of experiment: r ncol(df_intensity)
All experiments: r colnames(df_intensity)
r if( !is.null(meta_table)){ c("* **Meta/grouping info provided: **",unique(meta_table[,3]))}else{ "* **No meta information provided**"}
peptide_charge <- as.data.frame(table(peptide_tidyed$Charges)) colnames(peptide_charge) = c("Charge_state", "Freq") p<- ggplot2::ggplot(data = peptide_charge)+ geom_col(aes(x = Charge_state,y = Freq))+ theme_bw() plotly::ggplotly(p) #plot_ly(peptide_charge, x = ~Charge_state, y = ~Freq, type = "bar") %>% add_markers()
peptide_length <- as.data.frame(table(peptide_tidyed$length)) colnames(peptide_length) = c("peptide_length", "Freq") p<- ggplot2::ggplot(data = peptide_length)+ geom_col(aes(x = peptide_length,y = Freq))+ theme_bw() plotly::ggplotly(p) #plot_ly(peptide_charge, x = ~Charge_state, y = ~Freq, type = "bar") %>% add_markers()
df_score <- data.frame(Score = peptide_tidyed$score) p<- ggplot2::ggplot(data=df_score, aes(Score)) + geom_histogram(aes(y=..density..),color= "black",fill="white")+ geom_density(alpha=.2, fill="#FF6666")+ theme_bw() plotly::ggplotly(p)
In most cases, label-free quantification provides a decent way for metaproteomics profiling.
Presence of valid intensity across samples
df_sparsity <- as.data.frame(table(sparsity)) df_sparsity_dec <- df_sparsity[order(df_sparsity$sparsity,decreasing = TRUE),] df_sparsity_dec$sparsity <- factor(df_sparsity_dec$sparsity, levels = df_sparsity_dec$sparsity) p<- ggplot2::ggplot(data = df_sparsity_dec)+ geom_col(aes(x = sparsity,y = Freq))+ xlab("Presence in the Peptide Intensity Matrix")+ theme_bw() plotly::ggplotly(p)
Figure shows how many peptides have more than N presence, which helps to set the presence cutoff
#df_sparsity_dec <- df_sparsity[order(df_sparsity$sparsity,decreasing = TRUE),] df_sparsity_dec$cumsum <- cumsum(df_sparsity_dec$Freq) # redefine the sparisty sequence, in order to keep the order in the plot, This is the best way as far as I know #df_sparsity_dec$sparsity <- factor(df_sparsity_dec$sparsity, levels = df_sparsity_dec$sparsity) p<- ggplot2::ggplot(data = df_sparsity_dec)+ geom_col(aes(x = sparsity,y = cumsum))+ xlab("Presence in the Peptide Intensity Matrix")+ theme_bw() plotly::ggplotly(p)
#note: in this log10 intenisty matrix, 0 is converted into infinity, therefore will not show in the box plot, and does not affect the distribution df_intensity_log10 <-log10(df_intensity) data_matrix_log10_melt<-reshape2::melt(as.matrix(df_intensity_log10)) colnames(data_matrix_log10_melt) <- c("Proteins", "Samples", "Log10(Inensity)") p<-ggplot(data_matrix_log10_melt)+ geom_boxplot(aes(x = Samples, y = `Log10(Inensity)`)) + theme_bw()+ theme(axis.text.x = element_text(angle = 90, hjust = 1)) plotly::ggplotly(p)
# sometimes the quantity of pepetide is too large, only show limited number of rows if(ncol(df_intensity) > 1){ # only if there are more than 1 columns in experssion matrix sparsity <- rowSums(df_intensity > 0) index_P50 <- which(sparsity >= ncol(df_intensity)/2) index_P100 <- which(sparsity == ncol(df_intensity)) if(length(index_P100) < 1000){ # if Q100 has less than 1000 rows, then plot Q50 on heatmap index_keep <- index_P50 }else{ index_keep <- index_P100 } }
r if(ncol(df_intensity) > 1){ "# Overall Expression Profile"}
r if(ncol(df_intensity) > 1){ "## Heatmap"}
r if(ncol(df_intensity) > 1){if(length(index_P100) < 1000){ "Heatmap of Presence50" }else{ "Heatmap of Presence 100" } }
# sometimes the quantity of pepetide is too large, only show limited number of rows if(ncol(df_intensity) > 1){ # only if there are more than 1 columns in experssion matrix df_intensity_heatmap_log10 <-log10(df_intensity[index_keep, , drop = FALSE]+1) rownames(df_intensity_heatmap_log10) <- peptide_sequence[index_keep] d3heatmap(df_intensity_heatmap_log10,show_grid = FALSE, color = "OrRd") }
r if(ncol(df_intensity) > 3){"# PCA Analysis"}
if(ncol(df_intensity) > 3){ # otherwise, no point doing PCA df_intensity_log10 <- log10(df_intensity+1) if(is.null(meta_table)){ #pca <-PCA_wrapper_prcomp2(data_matrix = as.matrix(df_intensity_log10), inputation = TRUE) PCA_result <- prcomp(t(df_intensity_log10)) loading <- as.data.frame(PCA_result$x) #plot(loading$PC1, loading$PC2) #plot_ly(data = loading, x = ~PC1, y = ~PC2) %>% add_markers() %>% add_text(text = row.names(loading)) p1 <- plot_ly(loading, x = ~PC1, y = ~PC2, z = ~PC3) %>% add_markers() %>% add_text(text = row.names(loading)) # for screen plot sd <- PCA_result$sde var <- sd^2 var.percent <- var/sum(var) * 100 PCs <- paste("PC", 1:length(var.percent)) df_scree <- data.frame(PC = factor(PCs, levels = PCs), ratio =var.percent) p2 <- plot_ly(data = df_scree,x = ~PC, y = ~ratio, type = "bar") %>% layout(title = "Scree Plot of Principle components", xaxis = list(title = 'Principle Component'), yaxis = list(title = 'Variance(%)')) htmltools::tagList(list(p1, p2)) }else{ # do pca analysis with meta groupign information pca <-PCA_wrapper_prcomp2(data_matrix = as.matrix(df_intensity_log10), data_meta = meta_table[,c(2,3)], inputation = FALSE) # plotting print(pca$pca_scree_plot) cat("Scree plot ## shows the performance of the PCA analysis, the more percentage the top procomponets accounts, the better separation") print(pca$pca_component_plot) cat("2d PCA Component plot ##") print(pca$pca_confidence) cat("2d PCA Component plot ## with confidence boundaries") print(pca$pca_component_plot_kmeans) cat("2d PCA Component plot with K-means grouping, which is non-supervised grouping ") pca$pca_component_plot_3d_interactive } }
r if(length(unique(meta_table[,3])) >=2 && all(table(meta_table[,3]) >=2)) {"# ANOVA TEST \n**Peptides with significiant(p < 0.05) change between any groups**"}
# will do this block while: 1, more than (includeing) two meta/groups, 2, each group has more than (includeing) 2 samples. if(length(unique(meta_table[,3])) >=2 && all(table(meta_table[,3]) >=2)){ pvalues_anova <- matrix_PostHoc(df_intensity,meta_table[,3]) df_intensity_p <- cbind(pvalues_anova, df_intensity) rownames(df_intensity_p) <- peptide_sequence df_intensity_p_filtered <- df_intensity_p[which(!is.na(df_intensity_p$p_PostHoc_pairs)),] DT::datatable(df_intensity_p_filtered, extensions = 'Buttons',options = list(dom = "Blfrtip",scrollX=TRUE, buttons = list('copy', 'print', list( extend = 'collection', buttons = c('csv', 'excel', 'pdf'), text = 'Download' )))) }
The table is clean peptides expression table, with reversed and contaminant removed. The values are the LFQ(with label free quantification turned on) or raw protein intensity from Maxquant output. You can download/export and start from this table for downstream analysis using the (i)Metalab faminly apps.
The talbe can be further visualized by our shiny apps shiny.imetalab.ca
df_intensity <- as.data.frame(df_intensity) rownames(df_intensity) <- peptide_sequence DT::datatable(df_intensity, extensions = 'Buttons',options = list(dom = "Blfrtip",scrollX=TRUE, buttons = list('copy', 'print', list( extend = 'collection', buttons = c('csv', 'excel', 'pdf'), text = 'Download' ))))
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.