htmltools::img(src = "https://raw.githubusercontent.com/ningzhibin/rmdocpu/master/inst/rmd/iMetaReport.png", alt = 'logo', style = 'position:absolute; top:0; right:0; padding:10px;width:150px;height:150px;')
source("https://raw.githubusercontent.com/ningzhibin/rmdocpu/master/inst/subfunctions_general.r") source("https://raw.githubusercontent.com/ningzhibin/rmdocpu/master/inst/subfunctions_general_update.r") # enviroment setup knitr::opts_chunk$set(echo = FALSE,warning = FALSE, message = FALSE, cache = FALSE) library(tidyverse) library(ggplot2) library(d3heatmap) library(plotly) #library(gplots) #library(corrplot) #library(DT) # for interactive data table display # local usage test and debug # rmarkdown::render("MQ_report_proteinGroups.Rmd", params = list(input_datatable = your_readin_tbl)) # or with meta information # rmarkdown::render("MQ_report_proteinGroups.Rmd", params = list(input_datatable = your_readin_tbl, meta = your_meta_table))
tidy_proteingroups <- function(proteinGroups){ # extract the primary protein ID, protein.ids_split <- strsplit(as.vector(proteinGroups$"Protein IDs"), ";| ") # this is a list of list of split names ; for maxquant result, space( ) for Kai's open-search result protein_primary_ids <- unlist(lapply(protein.ids_split, function(x) x[1])) # only keep the first one #rownames(proteinGroups) <- protein_primary_ids # rename the rownames of the matrix # do the row wise filtering index_contaminant <- grep("\\+",proteinGroups[,grep( "ontaminant", colnames(proteinGroups))]) # note that + is a special character index_reverse <- grep("\\+", proteinGroups$Reverse) index_to_remove <- c(index_contaminant,index_reverse) if(length(index_to_remove) >0){ proteinGroups <- proteinGroups[-index_to_remove,] # filtered table protein_primary_ids <- protein_primary_ids[-index_to_remove] # filtered ids } n_contaminant <- length(index_contaminant) n_reversed <- length(index_reverse) # extra the intensity column matrix if(any(grepl("LFQ intensity ", colnames(proteinGroups)))){ # if there are LFQ intensity columns, take out the LFQ columns intensity_columns <- proteinGroups[,grep("LFQ intensity ", colnames(proteinGroups))] colnames(intensity_columns)<-gsub("LFQ intensity ", "", colnames(intensity_columns)) }else{ # otherwise take out intensity column, even only one column intensity_columns <- proteinGroups[,grep("Intensity ", colnames(proteinGroups)),drop = FALSE] colnames(intensity_columns)<-gsub("Intensity ", "", colnames(intensity_columns)) } return(list("intensity_matrix" = intensity_columns, "n_contaminant" = n_contaminant, "n_reversed" = n_reversed, "n_unique_peptides" = proteinGroups$"Unique peptides", "score" = proteinGroups$Score, "protein_primary_ids" =protein_primary_ids )) }
# input if(is.null(params$input_datatable)){ # test with local test with local files in the same dir, proteinGroups.txt <- read.delim("final_proteins.tsv", header = TRUE,check.names = FALSE, stringsAsFactors = FALSE) }else{ # opencpu render from data table by parametrized input proteinGroups.txt <- params$input_datatable } # Note: The folling analysis with meta info assumes that # 1st columns as sample name, 2nd column as experiment name, 3rd column and after as grouping meta_table <- params$meta_table # if there is any null value in the meta colum, set the meta as NULL if(any(is.na(meta_table$meta1))){ meta_table <- NULL } #meta_table <- read.delim("metadata.txt", header = TRUE, check.names = FALSE, stringsAsFactors = FALSE) # test with meta file
# tidy proteingroups_tidyed <- tidy_proteingroups(proteinGroups.txt) rm(proteinGroups.txt) # process df_intensity <- proteingroups_tidyed$intensity_matrix sparsity <- rowSums(df_intensity > 0) # here sparsity is number of present values index_all_na_rows <- which(sparsity == 0) df_intensity <- df_intensity[-index_all_na_rows,,drop = FALSE] protein_primary_ids <- proteingroups_tidyed$protein_primary_ids[-index_all_na_rows]
This report provides some basic description of the proteingroups identificaiton from maxquant database search. The report is based on the proteinGroups.txt, without defined experimental design/grouping information Users can use this to quickly check the overal quality of the experiment
Number of contaminant: r proteingroups_tidyed$n_contaminant
Number of reversed: r proteingroups_tidyed$n_reversed
Number of qualified proteingroups: r nrow(proteingroups_tidyed$intensity_matrix)
Number of quailfied proteingroups without intensity information: r length(index_all_na_rows)
Number of experiment: r ncol(df_intensity)
All experiments: r colnames(df_intensity)
r if( !is.null(meta_table)){ c("* **Meta/grouping info provided: **",unique(meta_table[,3]))}else{ "* **No meta information provided**"}
peptide_summary <- as.data.frame(table(proteingroups_tidyed$n_unique_peptides)) colnames(peptide_summary) = c("Unique_Peptides", "Freq") p<- ggplot2::ggplot(data = peptide_summary)+ geom_col(aes(x = Unique_Peptides,y = Freq))+ theme_bw() plotly::ggplotly(p)
df_score <- data.frame(Score = proteingroups_tidyed$score) rm(proteingroups_tidyed) p<- ggplot2::ggplot(data=df_score, aes(Score)) + geom_histogram(aes(y=..density..),color= "black",fill="white")+ geom_density(alpha=.2, fill="#FF6666")+ theme_bw() plotly::ggplotly(p)
Columns starting with "LFQ_intensity_" will be selected if exist (meaning LFQ option was checked for label free quantification, which is deliberately processed by Maxquant already), otherwise Intensity_ columns will be used instead for protein experession.
df_sparsity <- as.data.frame(table(sparsity)) p<- ggplot2::ggplot(data = df_sparsity)+ geom_col(aes(x = sparsity,y = Freq))+ xlab("Presence in the Protein Intensity Matrix")+ theme_bw() plotly::ggplotly(p)
#note: in this log10 intenisty matrix, 0 is converted into infinity, therefore will not show in the box plot, and does not affect the distribution df_intensity_log10 <-log10(df_intensity) data_matrix_log10_melt<-reshape2::melt(as.matrix(df_intensity_log10)) colnames(data_matrix_log10_melt) <- c("Proteins", "Samples", "Log10(Inensity)") p<-ggplot(data_matrix_log10_melt)+ geom_boxplot(aes(x = Samples, y = `Log10(Inensity)`)) + theme_bw()+ theme(axis.text.x = element_text(angle = 90, hjust = 1)) plotly::ggplotly(p) rm(p) rm(data_matrix_log10_melt)
r if(ncol(df_intensity) > 1){ "# Overall Expression Profile"}
r if(ncol(df_intensity) > 1){ "## Heatmap of Top200 intensity proteins"}
# only if there is experiment desgin and number of columns in protein experssion value is more than 1, if(ncol(df_intensity) > 1){ sparsity <- rowSums(df_intensity > 0) index_Q100 <- which(sparsity == ncol(df_intensity)) # index_Q100 is the index of Q100 proteins if(length(index_Q100) >200){ topN = 200 df_intensity$total_intensity<- rowSums(df_intensity) # add total intensity column df_intensity_heatmap <- dplyr::top_n(df_intensity, topN, total_intensity) # top 100 rows df_intensity_heatmap <- df_intensity_heatmap[, 1: (ncol(df_intensity_heatmap)-1)] }else{ #topN = length(index_Q100) df_intensity_heatmap <- df_intensity[index_Q100, , drop = FALSE] # the Q100 peptide matrix #rownames(df_intensity_log10) <- protein_primary_ids } df_intensity_heatmap_log10 <-log10(df_intensity_heatmap+1) d3heatmap::d3heatmap(df_intensity_heatmap_log10,show_grid = FALSE, color = "OrRd") }
r if(ncol(df_intensity) > 3 && nrow(df_intensity) > 5 ){ "## PCA Analysis"}
if(ncol(df_intensity) > 3){ # otherwise, no point to do PCA df_intensity_log10 <- log10(df_intensity+1) if(is.null(meta_table)){ #pca <-PCA_wrapper_prcomp2(data_matrix = as.matrix(df_intensity_log10), inputation = TRUE) PCA_result <- prcomp(t(df_intensity_log10)) loading <- as.data.frame(PCA_result$x) #plot(loading$PC1, loading$PC2) #plot_ly(data = loading, x = ~PC1, y = ~PC2) %>% add_markers() %>% add_text(text = row.names(loading)) p1 <- plot_ly(loading, x = ~PC1, y = ~PC2, z = ~PC3) %>% add_markers() %>% add_text(text = row.names(loading)) # for screen plot sd <- PCA_result$sde var <- sd^2 var.percent <- var/sum(var) * 100 PCs <- paste("PC", 1:length(var.percent)) df_scree <- data.frame(PC = factor(PCs, levels = PCs), ratio =var.percent) p2 <- plot_ly(data = df_scree,x = ~PC, y = ~ratio, type = "bar") %>% layout(title = "Scree Plot of Principle components", xaxis = list(title = 'Principle Component'), yaxis = list(title = 'Variance(%)')) htmltools::tagList(list(p1, p2)) }else{ # do pca analysis with meta groupign information pca <-PCA_wrapper_prcomp2(data_matrix = as.matrix(df_intensity_log10), data_meta = meta_table[,c(2,3)], inputation = TRUE) # plotting print(pca$pca_scree_plot) cat("Scree plot ## shows the performance of the PCA analysis, the more percentage the top procomponets accounts, the better separation") print(pca$pca_component_plot) cat("2d PCA Component plot ##") print(pca$pca_confidence) cat("2d PCA Component plot ## with confidence boundaries") print(pca$pca_component_plot_kmeans) cat("2d PCA Component plot with K-means grouping, which is non-supervised grouping ") pca$pca_component_plot_3d_interactive } }
r if(length(unique(meta_table[,3])) >=2 && all(table(meta_table[,3]) >=2)) {"# ANOVA TEST \n**Proteins with significiant(p < 0.05) change between any groups**"}
# will do this block while: 1, more than (includeing) two meta/groups, 2, each group has more than (includeing) 2 samples. if(length(unique(meta_table[,3])) >=2 && all(table(meta_table[,3]) >=2)){ pvalues_anova <- matrix_PostHoc(df_intensity,meta_table[,3]) df_intensity_p <- cbind(pvalues_anova, df_intensity) rownames(df_intensity_p) <- protein_primary_ids df_intensity_p_filtered <- df_intensity_p[which(!is.na(df_intensity_p$p_PostHoc_pairs)),] DT::datatable(df_intensity_p_filtered, extensions = 'Buttons',options = list(dom = "Blfrtip",scrollX=TRUE, buttons = list('copy', 'print', list( extend = 'collection', buttons = c('csv', 'excel', 'pdf'), text = 'Download' )))) }
The table is clean proteingroups expression table, with reversed and contaminant removed. Only the first protein id was used to represent the proteingroups. The values are the LFQ(label free quantification turned on) or raw protein intensity. The talbe can be further visualized by our shiny apps shiny.imetalab.ca
df_intensity <- as.data.frame(df_intensity) rownames(df_intensity) <- protein_primary_ids DT::datatable(df_intensity, extensions = 'Buttons',options = list(dom = "Blfrtip",scrollX=TRUE, buttons = list('copy', 'print', list( extend = 'collection', buttons = c('csv', 'excel', 'pdf'), text = 'Download' ))))
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.