options(width = 100, digits = 4) knitr::opts_chunk$set(collapse = TRUE, comment = "#>", echo = TRUE, cache = FALSE, prompt = FALSE, tidy = TRUE, comment = NA, message = FALSE, warning = FALSE, tidy = TRUE, tidy.opts = list(width.cutoff = 60), fig.width = 8, fig.height = 5, dev = "png")
On this page, the analysis workflow presented in the main text is applied on each of the four example datasets on the protein group level. Additionally, two more datasets were downloaded. The same analyses are also performed on both precursor- and protein-levels on these additional datasets.
library(tidyverse) library(protDP) dfList <- seq(1, 5, 2) lineColours <- RColorBrewer::brewer.pal(3, "Dark2")
LFQ intensities summarised by MaxLFQ [@cox2014accurate] were extracted from the DIA-NN [@demichev2020dia] output on the protein group level. The log2-transformation is applied to LFQ intensities.
data("datasetA") dat <- log2(datasetA$prot) dim(dat)
The overall proportion of missing data on the protein level is equal to
sum(is.na(dat)) / length(dat)
hyeProt <- gatherResults(dat) for (i in 1:length(dfList)) { if (i == 1) plotEmpSplines(hyeProt$nuis, X = hyeProt$splineFits_params0[[i]]$X, hyeProt$splineFits[[i]]$params, lineCol = lineColours[i], point.cex = 0.15, ylim = c(0, 1.04)) if (i > 1) plotEmpSplines(hyeProt$nuis, X = hyeProt$splineFits_params0[[i]]$X, hyeProt$splineFits[[i]]$params, lineCol = lineColours[i], newPlot = FALSE) } legend("bottomright", legend = paste("df = ", dfList), col = lineColours, lwd = 2, lty = 1, cex = 0.8)
ggplot(slice(hyeProt$devs, 2:4), aes(x = df, y = percDevReduced)) + geom_point() + geom_line() + geom_text(aes(label = signif(percDevReduced, 2)), vjust = -0.8) + scale_x_continuous(breaks = c(1, 3, 5)) + labs(x = "d.f.", y = "Reduced deviance(%)") + ylim(0.90, 1) + theme_classic()
plotEmpSplines(hyeProt$nuis, X = hyeProt$splineFits_params0[[1]]$X, hyeProt$cappedLinearFit$params, capped = TRUE, lineCol = lineColours[1], ylim = c(0, 1.04), point.cex = 0.15)
The estimated parameters are
round(hyeProt$cappedLinearFit$params, 2)
We see that the estimated $\alpha$ value is 1.
plotDPC(hyeProt$dpcFit, jitter.amount = NULL, point.cex = 0.15, ylim = c(0, 1.04))
The estimated parameters for the detection probability curve are
round(hyeProt$dpcFit$beta, 2)
LFQ intensities summarised by MaxLFQ [@cox2014accurate] were extracted from the DIA-NN [@demichev2020dia] report for protein groups. The log2-transformation is first applied to LFQ intensities.
data("datasetB") dat <- log2(datasetB$prot) dim(dat)
The overall proportion of missing data on the protein-level is
sum(is.na(dat)) / length(dat)
scProt <- gatherResults(dat, b1.upper = Inf) for (i in 1:length(dfList)) { if (i == 1) plotEmpSplines(scProt$nuis, X = scProt$splineFits_params0[[i]]$X, scProt$splineFits[[i]]$params, lineCol = lineColours[i], add.jitter = FALSE, point.cex = 0.15) if (i > 1) plotEmpSplines(scProt$nuis, X = scProt$splineFits_params0[[i]]$X, scProt$splineFits[[i]]$params, lineCol = lineColours[i], newPlot = FALSE) } legend("bottomright", legend = paste("df = ", dfList), col = lineColours, lwd = 2, lty = 1, cex = 0.8)
ggplot(slice(scProt$devs, 2:4), aes(x = df, y = percDevReduced)) + geom_point() + geom_line() + geom_text(aes(label = signif(percDevReduced, 2)), vjust = -0.8) + scale_x_continuous(breaks = c(1, 3, 5)) + labs(x = "d.f.", y = "Reduced deviance(%)") + ylim(0.90, 1) + theme_classic()
plotEmpSplines(scProt$nuis, X = scProt$splineFits_params0[[1]]$X, scProt$cappedLinearFit$params, capped = TRUE, lineCol = lineColours[1], point.cex = 0.15)
Estimated parameters are
round(scProt$cappedLinearFit$params, 2)
plotDPC(scProt$dpcFit, add.jitter = FALSE, point.cex = 0.1)
Parameters of the detection probability curve are as follows:
round(scProt$dpcFit$beta, 2)
For the protein group level analysis, we use the proteinGroups.txt
file from the MaxQuant output. The LFQ intensities are first log2-transformed.
data("datasetC") dat <- log2(datasetC$prot) dim(dat)
The overall proportion of missingness in protein group level data is
sum(is.na(dat)) / length(dat)
hepg2Prot <- gatherResults(dat) for (i in 1:length(dfList)) { if (i == 1) plotEmpSplines(hepg2Prot$nuis, X = hepg2Prot$splineFits_params0[[i]]$X, hepg2Prot$splineFits[[i]]$params, lineCol = lineColours[i], jitter.amount = 1/ncol(dat)/2) if (i > 1) plotEmpSplines(hepg2Prot$nuis, X = hepg2Prot$splineFits_params0[[i]]$X, hepg2Prot$splineFits[[i]]$params, lineCol = lineColours[i], newPlot = FALSE) } legend("bottomright", legend = paste("df = ", dfList), col = lineColours, lwd = 2, lty = 1, cex = 0.8)
ggplot(slice(hepg2Prot$devs, 2:4), aes(x = df, y = percDevReduced)) + geom_point() + geom_line() + geom_text(aes(label = signif(percDevReduced, 2)), vjust = -0.8) + scale_x_continuous(breaks = c(1, 3, 5)) + labs(x = "d.f.", y = "Reduced deviance(%)") + ylim(0.90, 1) + theme_classic()
plotEmpSplines(hepg2Prot$nuis, X = hepg2Prot$splineFits_params0[[1]]$X, hepg2Prot$cappedLinearFit$params, capped = TRUE, lineCol = lineColours[1], jitter.amount = 1/ncol(dat)/2)
The estimated parameters are
round(hepg2Prot$cappedLinearFit$params, 2)
plotDPC(hepg2Prot$dpcFit, jitter.amount = 1/ncol(dat)/2)
Parameters of the detection probability curve are
round(hepg2Prot$dpcFit$beta, 2)
For the protein group level analysis, we use the proteinGroups.txt
file from the MaxQuant output downloaded from the ProteomeXchange Consortium via the PRIDE partner repository with the dataset identifier PXD014777.
data("datasetD") dat <- log2(datasetD$prot) dim(dat)
The overall proportion of missing data in the protein group level data is
sum(is.na(dat)) / length(dat)
ddaPlasmaProt <- gatherResults(dat) for (i in 1:length(dfList)) { if (i == 1) plotEmpSplines(ddaPlasmaProt$nuis, X = ddaPlasmaProt$splineFits_params0[[i]]$X, ddaPlasmaProt$splineFits[[i]]$params, lineCol = lineColours[i], add.jitter = FALSE) if (i > 1) plotEmpSplines(ddaPlasmaProt$nuis, X = ddaPlasmaProt$splineFits_params0[[i]]$X, ddaPlasmaProt$splineFits[[i]]$params, lineCol = lineColours[i], newPlot = FALSE) } legend("bottomright", legend = paste("df = ", dfList), col = lineColours, lwd = 2, lty = 1, cex = 0.8)
ggplot(slice(ddaPlasmaProt$devs, 2:4), aes(x = df, y = percDevReduced)) + geom_point() + geom_line() + geom_text(aes(label = signif(percDevReduced, 2)), vjust = -0.8) + scale_x_continuous(breaks = c(1, 3, 5)) + labs(x = "d.f.", y = "Reduced deviance(%)") + ylim(0.90, 1) + theme_classic()
plotEmpSplines(ddaPlasmaProt$nuis, X = ddaPlasmaProt$splineFits_params0[[1]]$X, ddaPlasmaProt$cappedLinearFit$params, capped = TRUE, lineCol = lineColours[1], add.jitter = FALSE)
Estimated parameters are
round(ddaPlasmaProt$cappedLinearFit$params, 2)
plotDPC(ddaPlasmaProt$dpcFit, add.jitter = FALSE)
Parameters for the fitted detection probability curve are
round(ddaPlasmaProt$dpcFit$beta, 2)
Cryopreserved left ventricular myocardium samples from the human hearts were analysed. MS data were acquired in DIA mode and analysed by Spectronaunt v12 with a DDA spectral library generated from the pooled sample [@li2020core]. Details on sample preparation, LC-MS/MS workflow and data processing steps including the generation of the spectial library can be found in @li2020core. Here we consider the healthy donor heart samples. Both precursor- and protein group-level data are log2-transformed before analysis.
data("shbheart") shbheart_prec <- shbheart$prec dim(shbheart_prec) shbheart_prot <- shbheart$prot dim(shbheart_prot)
The overall proportion of missing data on the precursor-level is
sum(is.na(shbheart_prec)) / length(shbheart_prec)
While the overall proportion of missingness on the protein group-level is
sum(is.na(shbheart_prot)) / length(shbheart_prot)
The analysis workflow presented in the manuscript is applied on the dataset on both precursor- and protein group-level data:
res <- list(prec = gatherResults(shbheart_prec), prot = gatherResults(shbheart_prot, b0.upper = Inf))
par(mfrow = c(2, 1)) for (res_i in 1:2) { eachRes <- res[[res_i]] for (i in 1:length(dfList)) { if (i == 1) plotEmpSplines(eachRes$nuis, X = eachRes$splineFits_params0[[i]]$X, eachRes$splineFits[[i]]$params, lineCol = lineColours[i], jitter.amount = 1/ncol(shbheart_prec)/2, point.cex = 0.15) if (i > 1) plotEmpSplines(eachRes$nuis, X = eachRes$splineFits_params0[[i]]$X, eachRes$splineFits[[i]]$params, lineCol = lineColours[i], newPlot = FALSE) } title(sub = c("Precursor-level", "Protein-level")[res_i]) legend("bottomright", legend = paste("df = ", dfList), col = lineColours, lwd = 2, lty = 1, cex = 0.8) }
devPlot1 <- ggplot(slice(res[[1]]$devs, 2:4), aes(x = df, y = percDevReduced)) + geom_point() + geom_line() + geom_text(aes(label = signif(percDevReduced, 2)), vjust = -0.8) + scale_x_continuous(breaks = c(1, 3, 5)) + labs(x = "d.f.", y = "Reduced deviance(%)") + ylim(0.90, 1) + theme_classic() + ggtitle("Precursor-level") devPlot2 <- ggplot(slice(res[[2]]$devs, 2:4), aes(x = df, y = percDevReduced)) + geom_point() + geom_line() + geom_text(aes(label = signif(percDevReduced, 2)), vjust = -0.8) + scale_x_continuous(breaks = c(1, 3, 5)) + labs(x = "d.f.", y = "Reduced deviance(%)") + ylim(0.90, 1) + theme_classic() + ggtitle("Protein-level") gridExtra::grid.arrange(devPlot1, devPlot2, ncol = 2)
par(mfrow = c(2, 1)) for (res_i in 1:2) { eachRes <- res[[res_i]] plotEmpSplines(eachRes$nuis, X = eachRes$splineFits_params0[[1]]$X, eachRes$cappedLinearFit$params, capped = TRUE, lineCol = lineColours[1], jitter.amount = 1/ncol(shbheart_prec)/2, point.cex = 0.15) title(sub = c("Precursor-level", "Protein-level")[res_i]) }
With estimated parameters on precursor-level data being:
round(res[["prec"]]$cappedLinearFit$params, 2)
and estimated parameters on protein-level data being:
round(res[["prot"]]$cappedLinearFit$params, 2)
Estimated $\alpha$ values are equal to 1 on both precursor- and protein group-level data.
par(mfrow = c(2, 1)) for (res_i in 1:2) { eachRes <- res[[res_i]] plotDPC(eachRes$dpcFit, jitter.amount = 1/ncol(shbheart_prec)/2, point.cex = 0.15) title(sub = c("Precursor-level", "Protein-level")[res_i]) }
With estimated parameters on precursor-level data being:
round(res[["prec"]]$dpcFit$beta, 2)
and estimated parameters on protein-level data being:
round(res[["prot"]]$dpcFit$beta, 2)
Three concentrations of UPS1 (25 fmol, 10 fmol and 5 fmol) were spiked in yeast extract [@giai2016calibration]. This is a DDA dataset and MS raw data were processed by MaxQuant. Here we look at the dataset which compares 25 fmol to 10 fmol spiked-ins. Processed data were downloaded from ProteomeXchange Consortium via the PRIDE partner repository with the dataset identifier PXD002370. For peptide-level data, we use the peptides.txt
file and for the protein-level analysis, we use the proteinGroups.txt
file from the MaxQuant output. Log2-transformation is first applied before analysis.
data("ratio2.5") usp1_prec <- log2(ratio2.5$prec) dim(usp1_prec) usp1_prot <- log2(ratio2.5$prot) dim(usp1_prot)
The overall proportion of missing data for the precursor-level data is
sum(is.na(usp1_prec)) / length(usp1_prec)
The overall proportion of missing data for the protein-level data is
sum(is.na(usp1_prot)) / length(usp1_prot)
res <- list(prec = gatherResults(usp1_prec, b0.upper = Inf), prot = gatherResults(usp1_prot, b0.upper = Inf))
par(mfrow = c(2, 1)) for (res_i in 1:2) { eachRes <- res[[res_i]] for (i in 1:length(dfList)) { if (i == 1) plotEmpSplines(eachRes$nuis, X = eachRes$splineFits_params0[[i]]$X, eachRes$splineFits[[i]]$params, lineCol = lineColours[i], jitter.amount = 1/ncol(usp1_prec)/2, point.cex = 0.15, ylim = c(0, 1.08)) if (i > 1) plotEmpSplines(eachRes$nuis, X = eachRes$splineFits_params0[[i]]$X, eachRes$splineFits[[i]]$params, lineCol = lineColours[i], newPlot = FALSE) } title(sub = c("Precursor-level", "Protein-level")[res_i]) legend("bottomright", legend = paste("df = ", dfList), col = lineColours, lwd = 2, lty = 1, cex = 0.8) }
devPlot1 <- ggplot(slice(res[[1]]$devs, 2:4), aes(x = df, y = percDevReduced)) + geom_point() + geom_line() + geom_text(aes(label = signif(percDevReduced, 2)), vjust = -0.8) + scale_x_continuous(breaks = c(1, 3, 5)) + labs(x = "d.f.", y = "Reduced deviance(%)") + ylim(0.90, 1) + theme_classic() + ggtitle("Precursor-level") devPlot2 <- ggplot(slice(res[[2]]$devs, 2:4), aes(x = df, y = percDevReduced)) + geom_point() + geom_line() + geom_text(aes(label = signif(percDevReduced, 2)), vjust = -0.8) + scale_x_continuous(breaks = c(1, 3, 5)) + labs(x = "d.f.", y = "Reduced deviance(%)") + ylim(0.90, 1) + theme_classic() + ggtitle("Protein-level") gridExtra::grid.arrange(devPlot1, devPlot2, ncol = 2)
The spline with 1 degree of freedom contributes the majority to the total amount of reduced deviance in both precursor- and protein group-level data.
par(mfrow = c(2, 1)) for (res_i in 1:2) { eachRes <- res[[res_i]] plotEmpSplines(eachRes$nuis, X = eachRes$splineFits_params0[[1]]$X, eachRes$cappedLinearFit$params, capped = TRUE, lineCol = lineColours[1], jitter.amount = 1/ncol(usp1_prec)/2, point.cex = 0.15, ylim = c(0, 1.08)) title(sub = c("Precursor-level", "Protein-level")[res_i]) }
With estimated parameters on precursor-level data being:
round(res[["prec"]]$cappedLinearFit$params, 2)
and estimated parameters on protein-level data being:
round(res[["prot"]]$cappedLinearFit$params, 2)
Estimated $\alpha$ values are 1 for both precursor- and protein group-level data.
par(mfrow = c(2, 1)) for (res_i in 1:2) { eachRes <- res[[res_i]] plotDPC(eachRes$dpcFit, jitter.amount = 1/ncol(usp1_prec)/2, point.cex = 0.15, ylim = c(0, 1.08)) title(sub = c("Precursor-level", "Protein-level")[res_i]) }
With estimated parameters for the fitted detection probability curve for precursor-level data being:
round(res$prec$dpcFit$beta, 2)
and estimated parameters for the fitted detection probability curve for the protein group-level data being:
round(res$prot$dpcFit$beta, 2)
sessionInfo()
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.