knitr::opts_chunk$set(fig.cap = NULL, fig.path = params$output_figure) library(knitr) library(data.table) library(ggplot2) library(ggrepel) library(GGally) library(umap) library(FactoMineR) library(factoextra) library(corrplot) library(viridis) library(ggpubr) library(Hmisc) library(plotly) library(stringr) library(bit64)
#assert that all the stuff we need is there. stopifnot(exists("expdes")) stopifnot(exists("prot")) stopifnot(exists("prot_int")) if (file.exists(file.path(upload_folder,"evidence.txt"))){ evidence <- generic_mq_table_reader(upload_folder, 'evidence.txt') }
This QC Report is designed to help Scientists quickly assess the several aspect of experiment quality, There are 4 categories in the QC report:
More about the QC Report
\newpage
expdes <- expdes[,c("condition", "experiment", "reporter_channel", "replicate")] kable(expdes, col.names = c("Condition", "MQ-Experiment", "Channel", "Replicate"))
\newpage
PCA is an unsupervised machine learning algorithm that clusters your data, showing us which replicates were more or less similar. PCA creates new dimensions that collapse the overall variance in your data into fewer dimensions. For a healthy experiment we expect: If replicates don’t cluster together, it is possible that the variance between the replicates is greater than the variance between the experimental conditions. It is also possible that the difference between the experimental conditions is encoded in other dimensions not represented here. A scree plot shows the amount of variance explained by each dimension extracted by PCA. A high degree of variance in the first few dimensions may suggest large differences between your samples.Experiment Health Details
\hfill\break
# Prepare PCA data table # limiting the data by number of missing values as per input parameters: # limitImputedRatio # checkBy pca_dt <- prot_int[, .( id, condition, replicate, log2NIntNorm, run_id )] pca_dt <- dcast(pca_dt, condition + replicate ~ id, value.var = "log2NIntNorm") num_dimensions = min(pca_dt[, .N-1], 10) res.pca <- PCA(pca_dt[, 3:ncol(pca_dt)], graph = FALSE, ncp = num_dimensions) eig.val <- get_eigenvalue(res.pca) eig.val <- data.table(dims = rownames(eig.val), eig.val) samples.pca <- get_pca_ind(res.pca) samples.coord <- data.table(pca_dt[, 1:2], samples.pca$coord) samples.coord[, Run := str_c(condition, replicate, sep = " - ")] p <- ggplot(samples.coord, aes(x = Dim.1, y=Dim.2, colour=condition, fill = condition, label=Run)) + stat_ellipse(geom = "polygon", alpha=0.1) + geom_point(size = 3, alpha = 0.7) + theme_minimal() + scale_x_continuous(str_c("PCA 1 - ", eig.val[dims == "Dim.1", round(variance.percent,1)], "%")) + scale_y_continuous(str_c("PCA 2 - ", eig.val[dims == "Dim.2", round(variance.percent,1)], "%")) if (output_format == "pdf"){ p = p + geom_label_repel(aes(label = Run, fill = NULL), box.padding = 0.35, point.padding = 0.5, segment.color = 'grey50', show.legend = FALSE) p } else { ggplotly(p, tooltip = c("label")) %>% config(displayModeBar = T, modeBarButtons = list(list('toImage')), displaylogo = F) }
scree_plot <- ggplot(eig.val[1:num_dimensions], aes(x=reorder(dims, -`variance.percent`), y=`variance.percent`)) + geom_bar(stat="identity", fill = "skyblue2") + theme_minimal() + geom_text_repel(aes(label=(round(`variance.percent`,1))), direction = 'y') + scale_x_discrete("PCA components") + scale_y_continuous("% Variance") + ggtitle("Scree plot") if (output_format == "pdf"){ scree_plot } else { ggplotly(scree_plot, tooltip = c("y")) %>% config(displayModeBar = T, modeBarButtons = list(list('toImage')), displaylogo = F) }
name_of_column <- colnames(prot)[colnames(prot) == "adj.P.Val"] if (!identical(name_of_column, character(0)) && prot[get(name_of_column) <= 0.05, .N] > 1) { DE_ids <- prot[get(name_of_column) <= 0.05, unique(id)] pca_dt <- prot_int[id %in% DE_ids, .( id, condition, replicate, log2NIntNorm )] pca_dt <- dcast(pca_dt, condition + replicate ~ id, value.var = "log2NIntNorm") num_dimensions = min(pca_dt[, .N-1], 10) res.pca <- PCA(pca_dt[, 3:ncol(pca_dt)], graph = FALSE, ncp = num_dimensions) eig.val <- get_eigenvalue(res.pca) eig.val <- data.table(dims = rownames(eig.val), eig.val) samples.pca <- get_pca_ind(res.pca) samples.coord <- data.table(pca_dt[, 1:2], samples.pca$coord) samples.coord[, Run := str_c(condition, replicate, sep = " - ")] p <- ggplot(samples.coord, aes(x = Dim.1, y=Dim.2, colour=condition, fill=condition, label=Run)) + stat_ellipse(geom = "polygon", alpha=0.1) + geom_point(size = 3, alpha = 0.7) + theme_minimal() + scale_x_continuous(str_c("PCA 1 - ", eig.val[dims == "Dim.1", round(variance.percent,1)], "%")) + scale_y_continuous(str_c("PCA 2 - ", eig.val[dims == "Dim.2", round(variance.percent,1)], "%")) + ggtitle(str_c("PCA plot using the ", length(DE_ids), " DE proteins")) if (output_format == "pdf"){ p = p + geom_label_repel(aes(label = Run, fill = NULL), box.padding = 0.35, point.padding = 0.5, segment.color = 'grey50', show.legend = FALSE) p } else { ggplotly(p, tooltip = c("label")) %>% config(displayModeBar = T, modeBarButtons = list(list('toImage')), displaylogo = F) } }
\newpage
The Coefficient of Variation or Relative Standard Deviation, is calculated by the ratio of the standard deviation to the mean. It is used to measure the precision of a measure, in this case protein/peptide intensity. Here the CV is calculated within experimental conditions separately.
Coefficient of Variation Details
\hfill\break
cvdt <- prot_int[Imputed == 0][, countRep := .N, by = .(id, condition)] cvdt[, countRepMax := max(countRep), by = .(id, condition)] cvdt[, ReplicatePC := countRep/countRepMax] cvdt[, intensity := as.double(intensity)] cvdt <- cvdt[ReplicatePC >= 0.5, .(cv = sd(intensity)/mean(intensity)), by = .(id, condition)] p <- ggplot(cvdt, aes(x=cv, fill=condition, colour=condition)) + geom_density(alpha=0.4) + theme_minimal() + scale_x_continuous("% CV", labels = scales::percent) + ggtitle("Proteins - Reporter Intensity CV") if (output_format == "pdf"){ p } else { ggplotly(p) %>% config(displayModeBar = T, modeBarButtons = list(list('toImage')), displaylogo = F) }
# this chunk is very expensive on small datasets... int_corr_dt <- dcast.data.table( prot_int[Imputed == 0], id ~ condition + replicate, value.var = "log2NIntNorm", sep = "_" ) DT_corMatrix <- rcorr(as.matrix(int_corr_dt[, 2:ncol(int_corr_dt)])) DT_corMatrix <- DT_corMatrix$r DT_corMatrix[DT_corMatrix <= -1] = -1 DT_corMatrix[DT_corMatrix >= 1] = 1 #corrplot(DT_corMatrix, type = "upper", title = "Correlation Matrix", tl.cex = 0.5, mar = c(1,0,1.5,0)) #if (num_files > 10) { # corrplot(DT_corMatrix, type = "upper", title = "Correlation Matrix", tl.cex = 0.5, mar = c(1,0,1.5,0)) # #} else { # corrplot(DT_corMatrix, type = "upper", title = "Correlation Matrix", tl.cex = 0.5, mar = c(1,0,1.5,0), #addCoef.col = "white", number.cex = .5) # #} name_of_column <- colnames(prot)[colnames(prot) == "adj.P.Val"] if (!identical(name_of_column, character(0)) && prot[get(name_of_column) <= 0.05, .N] > 4) { # DE_ids <- prot[get(name_of_column) <= 0.05, unique(ProteinGroupId)] DE_ids <- prot[get(name_of_column) <= 0.05, unique(id)] int_corr_dt <- int_corr_dt[id %in% DE_ids] DT_corMatrix <- rcorr(as.matrix(int_corr_dt[, 2:ncol(int_corr_dt)])) DT_corMatrix <- DT_corMatrix$r DT_corMatrix[DT_corMatrix <= -1] = -1 DT_corMatrix[DT_corMatrix >= 1] = 1 corrplot(DT_corMatrix, type = "upper", title = "Correlation Matrix - DE peptides", tl.cex = 0.5, mar = c(1,0,1.5,0), addCoef.col = "white", number.cex = .5) } int_corr_dt <- dcast.data.table( prot_int[Imputed == 0], id ~ condition + replicate, value.var = "log2NIntNorm", sep = "_" )
\newpage
The quantitative analysis relies on the comparison of intensities corresponding to the same protein/peptide across multiple samples; the ideal situation is when a protein is measured in all the samples (100%). In the first plot we show a frequency plot that measures the percentage of samples where each protein/peptide was detected. The presence of a large number of proteins/peptides at low percentage values could signify an issue with the sample acquisition. In the second plot we show the percentage of proteins/peptides detected in each LC-MS/MS run. Ideally all the samples should be clustered together, the presence of samples at low percentage values could signify an issue during the acquisition of that specific sample. If the samples are clustered depending on the experimental condition it could signify a systematic bias.Data Completeness Details
\hfill\break
# need to understand what replicate PC means for prot_int with TMT num_files <- nrow(expdes) dt <- unique(prot_int[Imputed == 0, .(ReplicatePC = .N/num_files), by = .(id)]) ymax <- max(dt[, .N, by = .(ReplicatePC)][, N]) ymax <- 0.01 * ymax + ymax p <- ggplot(dt, aes(x = ReplicatePC)) + annotate('rect', xmin = 0.7, xmax = 1.05, ymin = 0, ymax = ymax, alpha=0.2) + geom_histogram(binwidth = max(0.1, round(1/max(num_files), 2)), fill="skyblue2") + theme_minimal() + scale_x_continuous("Percentage of measurements per protein", labels = scales::percent, limits = c(0, 1.15), breaks = seq(0, 1, 0.1)) + annotate('text', x = 0.5, y = 0.8*ymax, label=str_c("Number of proteins with\nmissing values <= 30%:\n ", dt[ReplicatePC >= 0.7, .N])) if (output_format == "pdf"){ p } else { ggplotly(p, tooltip = c("y")) %>% config(displayModeBar = T, modeBarButtons = list(list('toImage')), displaylogo = F) }
dt <- prot_int[Imputed == 0, .(`% measured values` = .N/prot_int[, length(unique(id))]), by = .(run_id, condition, replicate)] dt[, Run := str_c(condition, replicate, sep = " - ")] dt[, `% measured values` := 100*(round(`% measured values`, 2))] p<- ggplot(dt, aes(y = `% measured values`, x = Run, color = condition, fill = condition)) + geom_bar(stat ="identity", position = "dodge")+ theme_minimal() + theme(axis.text.x = element_text(angle = 90, vjust = 0.2), panel.grid.major.y = element_blank(), panel.border = element_blank(), axis.ticks.y = element_blank() ) + scale_x_discrete("Experiment - Channel") if (output_format == "pdf"){ p } else { ggplotly(p, tooltip = c("y")) %>% config(displayModeBar = T, modeBarButtons = list(list('toImage')), displaylogo = F) }
\newpage
Accurate quantitative analysis relies on consistent sample preparation. Digestion efficiency is one source of variability between samples. This plot shows the proportion of missed cleavage events in each sample. The ideal scenario would be to have a very low proportion of missed cleavages values greater than 0 and for the proportions to be consistent across samples. Any discrepancy between samples or experimental conditions could indicate issues in the sample preparation.Missed Cleavages Details
if (exists("evidence")){ dt <- copy(evidence) dt <- dt[, .(percent = .N), by = .(`missed cleavages`, `raw file`)] dt[, `missed cleavages` := as.factor(`missed cleavages`)] p <- ggplot(dt, aes(x = `raw file`, y = percent, fill=`missed cleavages`, label=`raw file`)) + geom_bar(stat="identity", position=position_fill(reverse = TRUE)) + theme_minimal() + scale_y_continuous("% missed c leavages", labels = scales::percent) + theme(axis.text.x = element_text(angle = 90, vjust = 0.2), panel.grid.major.y = element_blank(), panel.border = element_blank(), axis.ticks.y = element_blank(), legend.position = "top" ) + labs(fill = "Missed\nCleavages") if (output_format == "pdf"){ p } else { ggplotly(p, tooltip = c("y")) %>% config(displayModeBar = T, modeBarButtons = list(list('toImage')), displaylogo = F) } } else { cat("Please upload your evidence.txt file to include Missed Cleavage Statistics in this QC report.") }
\newpage
PIF or "Parent Ion Fraction" indicates the fraction the target peak makes up of the total intensity in the inclusion window.Parent Ion Fraction Details
if (exists("evidence")){ evidence$`raw file` <- as.character(evidence$`raw file`) p <- ggplot(evidence, aes(x = `raw file`, y = pif, fill = '#AEC6CF', colour = '#AEC6CF')) + geom_boxplot() + #geom_violin() + theme_minimal() + #coord_cartesian(ylim = c(0, boxplot.stats(fdt[, fwhm])$stats[ 5])) + theme(axis.text.x = element_text(angle = 90, vjust = 0.2), panel.grid.major.y = element_blank(), panel.border = element_blank(), axis.ticks.y = element_blank(), legend.position = "none" ) + scale_x_discrete("Raw File") + scale_y_continuous("Precursor Ion Fraction") if (output_format == "pdf"){ p } else { p # ggplotly(p, tooltip = c("y")) %>% config(displayModeBar = T, # modeBarButtons = list(list('toImage')), # displaylogo = F) } } else { cat("Please upload your evidence.txt file to include Precursor Ion Fraction Statistics in this QC report.") }
\newpage
Identifications of PSM’s (peptide spectral matches), peptides and proteins are a measure of the number of inferences we make at each of those levels as a result of each run. Low counts in a run may suggest a systematic flaw in the experiment that needs to be addressed prior to interpretation.Identifications Details
\hfill\break
# ignore fractions for now. dt <- copy(prot_int[Imputed == 0]) dt[is.na(condition), condition := "Library"] #if (dt[, any(fraction > 0)]) { #dt <- dt[, .N, by = .(condition, Replicate, fraction)] #dt[, Run := str_c(condition, Replicate, fraction, sep = " - ")] # #} else { dt <- dt[, .N, by = .(condition, replicate)] dt[, Run := str_c(condition, replicate, sep = " - ")] #} p <- ggplot(dt, aes(x = as.factor(Run), y = N, color = condition, label=replicate)) + geom_segment( aes(x=as.factor(Run), xend=as.factor(Run), y=0, yend=N), color="skyblue") + geom_point(size=2, alpha=0.9) + coord_flip() + theme_minimal() + scale_x_discrete("Condtion + Replicate") + scale_y_continuous("# Protein Measurements") + theme(axis.text.x = element_text(angle = 90, vjust = 0.2), panel.grid.major.y = element_blank(), panel.border = element_blank(), axis.ticks.y = element_blank() ) if (output_format == "pdf"){ p } else { ggplotly(p, tooltip = c("label", "y")) %>% config(displayModeBar = T, modeBarButtons = list(list('toImage')), displaylogo = F) }
\newpage
During post-processing of your maxquant output, MD performs the following steps: FoldChange statistics are then calculated using Limma with data that is free of missing values and systematic bias.Normalisation and Imputation Details
tmp <- prot_int tmp$Imputed <- as.character(prot_int$Imputed) p <- ggplot(tmp, aes(x=log2NIntNorm, fill=Imputed, colour = Imputed)) + geom_density(alpha=0.4) + theme_minimal() + ggtitle("Intensity (Imputed vs Not)") if (output_format == "pdf"){ p } else { ggplotly(p) %>% config(displayModeBar = T, modeBarButtons = list(list('toImage')), displaylogo = F) }
p <- ggplot(prot_int[Imputed==F], aes(x=interaction(reporter_channel, experiment), y=log2NInt, fill=condition, colour=condition)) + geom_boxplot() + theme_minimal() + theme(axis.text.x = element_text(angle = 90, vjust = 0.2), panel.grid.major.y = element_blank(), panel.border = element_blank(), axis.ticks.y = element_blank() ) + scale_x_discrete("Channel.Run") + scale_y_continuous("Log2 Reporter Intensity") fig <- ggplotly(p, tooltip = c("y"))%>% config(displayModeBar = T, modeBarButtons = list(list('toImage')), displaylogo = F) fig$x$data <- lapply(fig$x$data, FUN = function(x){ x$marker$outliercolor = x$line$color # When creating plot p with ggplot if you specify fill = cut use x$fill$color instead of $line$color x$marker$color = x$line$color # When creating plot p with ggplot if you specify fill = cut use x$fill$color instead $line$color x$marker$line = x$line$color # When creating plot p with ggplot if you specify fill = cut use x$fill$color instead $line$color return(x) }) if (output_format == "pdf"){ p } else { p #fig }
p <- ggplot(prot_int, aes(x=interaction(reporter_channel, experiment), y=log2NIntNorm, fill=condition, colour=condition)) + geom_boxplot() + theme_minimal() + theme(axis.text.x = element_text(angle = 90, vjust = 0.2), panel.grid.major.y = element_blank(), panel.border = element_blank(), axis.ticks.y = element_blank() ) + scale_x_discrete("Channel.Run") + scale_y_continuous("Normalized Log2 Reporter Intensity") fig <- ggplotly(p, tooltip = c("y"))%>% config(displayModeBar = T, modeBarButtons = list(list('toImage')), displaylogo = F) fig$x$data <- lapply(fig$x$data, FUN = function(x){ x$marker$outliercolor = x$line$color # When creating plot p with ggplot if you specify fill = cut use x$fill$color instead of $line$color x$marker$color = x$line$color # When creating plot p with ggplot if you specify fill = cut use x$fill$color instead $line$color x$marker$line = x$line$color # When creating plot p with ggplot if you specify fill = cut use x$fill$color instead $line$color return(x) }) if (output_format == "pdf"){ p } else { p #fig }
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.