BiocStyle::markdown()
library("RforProteomics")
library("BiocManager")
library("protViz")
library("BiocManager")
library("DT")
library("mzR")
library("MSnbase")
library("knitr")
library("rpx")
library("xtable")
library("RColorBrewer")
library("MALDIquant")
library("MALDIquantForeign")
library("pRoloc")
library("pRolocdata")
library("msmsTests")
library("msmsEDA")
library("e1071")

Introduction

This vignette illustrates existing \R{} and Bioconductor infrastructure for the visualisation of mass spectrometry and proteomics data. The code details the visualisations presented in

Gatto L, Breckels LM, Naake T, Gibb S. Visualisation of proteomics data using R and Bioconductor. Proteomics. 2015 Feb 18. doi: 10.1002/pmic.201400392. PubMed PMID: 25690415.

NB: I you are interested in R packages for mass spectrometry-based proteomics and metabolomics, see also the R for Mass Spectrometry initiative packages and the tutorial book

References

Relevant packages

library("RforProteomics")
pp <- proteomicsPackages()
msp <- massSpectrometryPackages()

There are currently r nrow(pp) Proteomics and r nrow(msp) MassSpectrometry packages in Bioconductor version r as.character(BiocManager::version()). Other non-Bioconductor packages are described in the r Biocexptpkg("RforProteomics") vignette (open it in R with RforProteomics() or read it online.)

DT::datatable(pp)
DT::datatable(msp)

Ascombe's quartet

kable(anscombe, format = "html")
tab <- matrix(NA, 5, 4)
colnames(tab) <- 1:4
rownames(tab) <- c("var(x)", "mean(x)",
                   "var(y)", "mean(y)",
                   "cor(x,y)")

for (i in 1:4)
    tab[, i] <- c(var(anscombe[, i]),
                  mean(anscombe[, i]),
                  var(anscombe[, i+4]),
                  mean(anscombe[, i+4]),
                  cor(anscombe[, i], anscombe[, i+4]))
kable(tab)

While the residuals of the linear regression clearly indicate fundamental differences in these data, the most simple and straightforward approach is visualisation to highlight the fundamental differences in the datasets.

ff <- y ~ x

mods <- setNames(as.list(1:4), paste0("lm", 1:4))

par(mfrow = c(2, 2), mar = c(4, 4, 1, 1))
for (i in 1:4) {
    ff[2:3] <- lapply(paste0(c("y","x"), i), as.name)
    plot(ff, data = anscombe, pch = 19, xlim = c(3, 19), ylim = c(3, 13))
    mods[[i]] <- lm(ff, data = anscombe)
    abline(mods[[i]])
}
kable(sapply(mods, residuals))

The MA plot example

The following code chunk connects to the PXD000001 data set on the ProteomeXchange repository and fetches the mzTab file. After missing values filtering, we extract relevant data (log2 fold-changes and log10 mean expression intensities) into data.frames.

library("rpx")
px1 <- PXDataset("PXD000001")
mztab <- pxget(px1, "F063721.dat-mztab.txt")

library("MSnbase")
## here, we need to specify the (old) mzTab version 0.9
qnt <- readMzTabData(mztab, what = "PEP", version = "0.9")
sampleNames(qnt) <- reporterNames(TMT6)
qnt <- filterNA(qnt)
## may be combineFeatuers

spikes <- c("P02769", "P00924", "P62894", "P00489")
protclasses <- as.character(fData(qnt)$accession)
protclasses[!protclasses %in% spikes] <- "Background"


madata42 <- data.frame(A = rowMeans(log(exprs(qnt[, c(4, 2)]), 10)),
                       M = log(exprs(qnt)[, 4], 2) - log(exprs(qnt)[, 2], 2),
                       data = rep("4vs2", nrow(qnt)),
                       protein = fData(qnt)$accession,
                       class = factor(protclasses))

madata62 <- data.frame(A = rowMeans(log(exprs(qnt[, c(6, 2)]), 10)),
                       M = log(exprs(qnt)[, 6], 2) - log(exprs(qnt)[, 2], 2),
                       data = rep("6vs2", nrow(qnt)),
                       protein = fData(qnt)$accession,
                       class = factor(protclasses))


madata <- rbind(madata42, madata62)

The traditional plotting system

par(mfrow = c(1, 2))
plot(M ~ A, data = madata42, main = "4vs2",
     xlab = "A", ylab = "M", col = madata62$class)
plot(M ~ A, data = madata62, main = "6vs2",
     xlab = "A", ylab = "M", col = madata62$class)

lattice

library("lattice")
latma <- xyplot(M ~ A | data, data = madata,
                groups = madata$class,
                auto.key = TRUE)
print(latma)

ggplot2

library("ggplot2")
ggma <- ggplot(aes(x = A, y = M, colour = class), data = madata,
               colour = class) +
    geom_point() +
    facet_grid(. ~ data)
print(ggma)

Customization

library("RColorBrewer")
bcols <- brewer.pal(4, "Set1")
cls <- c("Background" = "#12121230",
         "P02769" = bcols[1],
         "P00924" = bcols[2],
         "P62894" = bcols[3],
         "P00489" = bcols[4])
ggma2 <- ggplot(aes(x = A, y = M, colour = class),
                data = madata) + geom_point(shape = 19) +
    facet_grid(. ~ data) + scale_colour_manual(values = cls) +
    guides(colour = guide_legend(override.aes = list(alpha = 1)))
print(ggma2)

The MAplot method for MSnSet instances

MAplot(qnt, cex = .8)

An interactive r CRANpkg("shiny") app for MA plots

This (now outdated and deprecated) app is based on Mike Love's shinyMA application, adapted for a proteomics data. A screen shot is displayed below.

See the excellent r CRANpkg("shiny") web page for tutorials and the Mastering Shiny book for details on shiny.

Volcano plots

Below, using the r Biocpkg("msmsTest") package, we load a example MSnSet data with spectral couting data (from the r Biocpkg("msmsEDA") package) and run a statistical test to obtain (adjusted) p-values and fold-changes.

library("msmsEDA")
library("msmsTests")
data(msms.dataset)
## Pre-process expression matrix
e <- pp.msms.data(msms.dataset)
## Models and normalizing condition
null.f <- "y~batch"
alt.f <- "y~treat+batch"
div <- apply(exprs(e), 2, sum)
## Test
res <- msms.glm.qlll(e, alt.f, null.f, div = div)
lst <- test.results(res, e, pData(e)$treat, "U600", "U200 ", div,
                    alpha = 0.05, minSpC = 2, minLFC = log2(1.8),
                    method = "BH")

Here, we produce the volcano plot by hand, with the plot function. In the second plot, we limit the x axis limits and add grid lines.

plot(lst$tres$LogFC, -log10(lst$tres$p.value))
plot(lst$tres$LogFC, -log10(lst$tres$p.value),
     xlim = c(-3, 3))
grid()

Below, we use the res.volcanoplot function from the r Biocpkg("msmsTests") package. This functions uses the sample annotation stored with the quantitative data in the MSnSet object to colour the samples according to their phenotypes.

## Plot
res.volcanoplot(lst$tres,
                max.pval = 0.05,
                min.LFC = 1,
                maxx = 3,
                maxy = NULL,
                ylbls = 4)

A PCA plot

Using the counts.pca function from the r Biocpkg("msmsEDA") package:

library("msmsEDA")
data(msms.dataset)
msnset <- pp.msms.data(msms.dataset)
lst <- counts.pca(msnset, wait = FALSE)

It is also possible to generate the PCA data using the prcomp. Below, we extract the coordinates of PC1 and PC2 from the counts.pca result and plot them using the plot function.

pcadata <- lst$pca$x[, 1:2]
head(pcadata)
plot(pcadata[, 1], pcadata[, 2],
     xlab = "PCA1", ylab = "PCA2")
grid()

Plotting with R

plotfuns <- rbind(c("scatterplots", "plot", "xyplot", "geom_point"),
                  c("histograms", "hist", "histgram", "geom_histogram"),
                  c("density plots", "plot(density())", "densityplot", "geom_density"),
                  c("boxplots", "boxplot", "bwplot", "geom_boxplot"),
                  c("violin plots", "vioplot::vioplot", "bwplot(..., panel = panel.violin)", "geom_violin"),
                  c("line plots", "plot, matplot", "xyploy, parallelplot", "geom_line"),
                  c("bar plots", "barplot", "barchart", "geom_bar"),
                  c("pie charts", "pie", "", "geom_bar with polar coordinates"),
                  c("dot plots", "dotchart", "dotplot", "geom_point"),
                  c("stip plots", "stripchart", "stripplot", "goem_point"),
                  c("dendrogramms", "plot(hclust())", "latticeExtra package", "ggdendro package"),
                  c("heatmaps", "image, heatmap", "levelplot", "geom_tile"))


colnames(plotfuns) <- c("plot type", "traditional", "lattice", "ggplot2")
kable(plotfuns)

Below, we are going to use a data from the r Biocexptpkg("pRolocdata") to illustrate the plotting functions.

library("pRolocdata")
data(tan2009r1)

Scatter plots

See the MA and volcano plot examples.

The default plot type is p, for points. Other important types are l for lines and h for histogram (see below).

Historams and density plots

We extract the (normalised) intensities of the first sample

x <- exprs(tan2009r1)[, 1]

and plot the distribution with a histogram and a density plot next to each other on the same figure (using the mfrow par plotting paramter)

par(mfrow = c(1, 2))
hist(x)
plot(density(x))

Box plots and violin plots

we first extract the r nrow(tan2009r1) proteins by r ncol(tan2009r1) samples data matrix and plot the sample distributions next to each other using boxplot and beanplot (from the r CRANpkg("beanplot") package).

library("beanplot")
x <- exprs(tan2009r1)
par(mfrow = c(2, 1))
boxplot(x)
beanplot(x[, 1], x[, 2], x[, 3], x[, 4], log = "")

Line plots

below, we produce line plots that describe the protein quantitative profiles for two sets of proteins, namely er and mitochondrial proteins using matplot.

we need to transpose the matrix (with t) and set the type to both (b), to display points and lines, the colours to red and steel blue, the point characters to 1 (an empty point) and the line type to 1 (a solid line).

er <- fData(tan2009r1)$markers == "ER"
mt <- fData(tan2009r1)$markers == "mitochondrion"

par(mfrow = c(2, 1))
matplot(t(x[er, ]), type = "b", col = "red", pch = 1, lty = 1)
matplot(t(x[mt, ]), type = "b", col = "steelblue", pch = 1, lty = 1)

In the last section, about spatial proteomics, we use the specialised plotDist function from the r Biocpkg("pRoloc") package to generate such figures.

Bar and dot charts

To illustrate bar and dot charts, we cound the number of proteins in the respective class using table.

x <- table(fData(tan2009r1)$markers)
x
par(mfrow = c(1, 2))
barplot(x)
dotchart(as.numeric(x))

Heatmaps

The easiest to produce a complete heatmap is with the heatmap function:

heatmap(exprs(tan2009r1))

To produce the a heatmap without the dendrograms, one can use the image function on a matrix or the specialised version for MSnSet objects from the r Biocpkg("MSnbase") package.

par(mfrow = c(1, 2))
x <- matrix(1:9, ncol = 3)
image(x)
image(tan2009r1)

See also r CRANpkg("gplots")'s heatmap.2 function and the r Biocpkg("Heatplus") Bioconductor package for more advanced heatmaps and the r CRANpkg("corrplot") package for correlation matrices.

Dendrograms

The easiest way to produce and plot a dendrogram is:

d <- dist(t(exprs(tan2009r1))) ## distance between samples
hc <- hclust(d) ## hierarchical clustering
plot(hc) ## visualisation

See also r CRANpkg("dendextend") and this post to illustrate r CRANpkg("latticeExtra") and r CRANpkg("ggdendro").

Venn diagrams

Visualising mass spectrometry data

Direct access to the raw data

library("mzR")
mzf <- pxget(px1,
             "TMT_Erwinia_1uLSike_Top10HCD_isol2_45stepped_60min_01-20141210.mzML")
ms <- openMSfile(mzf)

hd <- header(ms)
ms1 <- which(hd$msLevel == 1)

rtsel <- hd$retentionTime[ms1] / 60 > 30 & hd$retentionTime[ms1] / 60 < 35
lout <- matrix(NA, ncol = 10, nrow = 8)
lout[1:2, ] <- 1
for (ii in 3:4)
    lout[ii, ] <- c(2, 2, 2, 2, 2, 2, 3, 3, 3, 3)
lout[5, ] <- rep(4:8, each = 2)
lout[6, ] <- rep(4:8, each = 2)
lout[7, ] <- rep(9:13, each = 2)
lout[8, ] <- rep(9:13, each = 2)
i <- ms1[which(rtsel)][1]
j <- ms1[which(rtsel)][2]
ms2 <- (i+1):(j-1)
layout(lout)
par(mar=c(4,2,1,1))
plot(chromatogram(ms)[[1]], type = "l")
abline(v = hd[i, "retentionTime"], col = "red")
par(mar = c(3, 2, 1, 0))
plot(peaks(ms, i), type = "l", xlim = c(400, 1000))
legend("topright", bty = "n",
       legend = paste0(
           "Acquisition ", hd[i, "acquisitionNum"],  "\n",
           "Retention time ", formatRt(hd[i, "retentionTime"])))
abline(h = 0)
abline(v = hd[ms2, "precursorMZ"],
       col = c("#FF000080",
           rep("#12121280", 9)))
par(mar = c(3, 0.5, 1, 1))
plot(peaks(ms, i), type = "l", xlim = c(521, 522.5), yaxt = "n")
abline(h = 0)
abline(v = hd[ms2, "precursorMZ"], col = "#FF000080")
par(mar = c(2, 2, 0, 1))
for (ii in ms2) {
    p <- peaks(ms, ii)
    plot(p, xlab = "", ylab = "", type = "h", cex.axis = .6)
    legend("topright",
           legend = paste0("Prec M/Z\n", round(hd[ii, "precursorMZ"], 2)),
           bty = "n", cex = .8)
}
M2 <- MSmap(ms, i:j, 100, 1000, 1, hd)
plot3D(M2)

MS barcoding

par(mar=c(4,1,1,1))
image(t(matrix(hd$msLevel, 1, nrow(hd))),
      xlab="Retention time",
      xaxt="n", yaxt="n", col=c("black","steelblue"))
k <- round(range(hd$retentionTime) / 60)
nk <- 5
axis(side=1, at=seq(0,1,1/nk), labels=seq(k[1],k[2],k[2]/nk))

Animation

The following animation scrolls over 5 minutes of retention time for a MZ range between 521 and 523.

library("animation")
an1 <- function() {
    for (i in seq(0, 5, 0.2)) {
        rtsel <- hd$retentionTime[ms1] / 60 > (30 + i) &
            hd$retentionTime[ms1] / 60 < (35 + i)
        M <- MSmap(ms, ms1[rtsel], 521, 523, .005, hd)
        M@map[msMap(M) == 0] <- NA
        print(plot3D(M, rgl = FALSE))
    }
}

saveGIF(an1(), movie.name = "msanim1.gif")
knitr::include_graphics("./figures/msanim1.gif")

The code chunk below scrolls of a slice of retention times while keeping the retention time constant between 30 and 35 minutes.

an2 <- function() {
    for (i in seq(0, 2.5, 0.1)) {
        rtsel <- hd$retentionTime[ms1] / 60 > 30 & hd$retentionTime[ms1] / 60 < 35
        mz1 <- 520 + i
        mz2 <- 522 + i
        M <- MSmap(ms, ms1[rtsel], mz1, mz2, .005, hd)
        M@map[msMap(M) == 0] <- NA
        print(plot3D(M, rgl = FALSE))
    }
}

saveGIF(an2(), movie.name = "msanim2.gif")
knitr::include_graphics("./figures/msanim2.gif")

The r Biocpkg("MSnbase") infrastructure

library("MSnbase")
data(itraqdata)
itraqdata2 <- pickPeaks(itraqdata, verbose = FALSE)
plot(itraqdata[[25]], full = TRUE, reporters = iTRAQ4)
par(oma = c(0, 0, 0, 0))
par(mar = c(4, 4, 1, 1))
plot(itraqdata2[[25]], itraqdata2[[28]], sequences = rep("IMIDLDGTENK", 2))

The r CRANpkg("protViz") package

library("protViz")
data(msms)

fi <- fragmentIon("TAFDEAIAELDTLNEESYK")
fi.cyz <- as.data.frame(cbind(c=fi[[1]]$c, y=fi[[1]]$y, z=fi[[1]]$z))

p <- peakplot("TAFDEAIAELDTLNEESYK",
              spec = msms[[1]],
              fi = fi.cyz,
              itol = 0.6,
              ion.axes = FALSE)

The peakplot function return the annotation of the MSMS spectrum that is plotted:

str(p)

Preprocessing of MALDI-MS spectra

The following code chunks demonstrate the usage of the mass spectrometry preprocessing and plotting routines in the r CRANpkg("MALDIquant") package. r CRANpkg("MALDIquant") uses the traditional graphics system. Therefore r CRANpkg("MALDIquant") overloads the traditional functions plot, lines and points for its own data types. These data types represents spectrum and peak lists as S4 classes. Please see the r CRANpkg("MALDIquant") vignette and the corresponding website for more details.

After loading some example data a simple plot draws the raw spectrum.

library("MALDIquant")

data("fiedler2009subset", package="MALDIquant")

plot(fiedler2009subset[[14]])

After some preprocessing, namely variance stabilization and smoothing, we use lines to draw our baseline estimate in our processed spectrum.

transformedSpectra <- transformIntensity(fiedler2009subset, method = "sqrt")
smoothedSpectra <- smoothIntensity(transformedSpectra, method = "SavitzkyGolay")

plot(smoothedSpectra[[14]])
lines(estimateBaseline(smoothedSpectra[[14]]), lwd = 2, col = "red")

After removing the background removal we could use plot again to draw our baseline corrected spectrum.

rbSpectra <- removeBaseline(smoothedSpectra)
plot(rbSpectra[[14]])

detectPeaks returns a MassPeaks object that offers the same traditional graphics functions. The next code chunk demonstrates how to mark the detected peaks in a spectrum.

cbSpectra <- calibrateIntensity(rbSpectra, method = "TIC")
peaks <- detectPeaks(cbSpectra, SNR = 5)

plot(cbSpectra[[14]])
points(peaks[[14]], col = "red", pch = 4, lwd = 2)

Additional there is a special function labelPeaks that allows to draw the M/Z values above the corresponding peaks. Next we mark the 5 top peaks in the spectrum.

plot(cbSpectra[[14]])
points(peaks[[14]], col = "red", pch = 4, lwd = 2)
top5 <- intensity(peaks[[14]]) %in% sort(intensity(peaks[[14]]),
                                         decreasing = TRUE)[1:5]
labelPeaks(peaks[[14]], index = top5, avoidOverlap = TRUE)

Often multiple spectra have to be recalibrated to be comparable. Therefore r CRANpkg("MALDIquant") warps the spectra according to so called reference or landmark peaks. For debugging the determineWarpingFunctions function offers some warping plots. Here we show only the last 4 plots:

par(mfrow = c(2, 2))
warpingFunctions <-
    determineWarpingFunctions(peaks,
                              tolerance = 0.001,
                              plot = TRUE,
                              plotInteractive = TRUE)
par(mfrow = c(1, 1))
warpedSpectra <- warpMassSpectra(cbSpectra, warpingFunctions)
warpedPeaks <- warpMassPeaks(peaks, warpingFunctions)

In the next code chunk we visualise the need and the effect of the recalibration.

sel <- c(2, 10, 14, 16)
xlim <- c(4180, 4240)
ylim <- c(0, 1.9e-3)
lty <- c(1, 4, 2, 6)

par(mfrow = c(1, 2))
plot(cbSpectra[[1]], xlim = xlim, ylim = ylim, type = "n")

for (i in seq(along = sel)) {
  lines(peaks[[sel[i]]], lty = lty[i], col = i)
  lines(cbSpectra[[sel[i]]], lty = lty[i], col = i)
}

plot(cbSpectra[[1]], xlim = xlim, ylim = ylim, type = "n")

for (i in seq(along = sel)) {
  lines(warpedPeaks[[sel[i]]], lty = lty[i], col = i)
  lines(warpedSpectra[[sel[i]]], lty = lty[i], col = i)
}
par(mfrow = c(1, 1))

The code chunks above generate plots that are very similar to the figure 7 in the corresponding paper "Visualisation of proteomics data using R". Please find the code to exactly reproduce the figure at: https://github.com/sgibb/MALDIquantExamples/blob/master/R/createFigure1_color.R

Genomic and protein sequences

These visualisations originate from the Pbase Pbase-data and mapping vignettes.

Mass spectrometry imaging

The following code chunk downloads a MALDI imaging dataset from a mouse kidney shared by Adrien Nyakas and Stefan Schurch and generates a plot with the mean spectrum and three slices of interesting M/Z regions.

library("MALDIquant")
library("MALDIquantForeign")

spectra <- importBrukerFlex("http://files.figshare.com/1106682/MouseKidney_IMS_testdata.zip", verbose = FALSE)

spectra <- smoothIntensity(spectra, "SavitzkyGolay",  halfWindowSize = 8)
spectra <- removeBaseline(spectra, method = "TopHat", halfWindowSize = 16)
spectra <- calibrateIntensity(spectra, method = "TIC")
avgSpectrum <- averageMassSpectra(spectra)
avgPeaks <- detectPeaks(avgSpectrum, SNR = 5)

avgPeaks <- avgPeaks[intensity(avgPeaks) > 0.0015]

oldPar <- par(no.readonly = TRUE)
layout(matrix(c(1,1,1,2,3,4), nrow = 2, byrow = TRUE))
plot(avgSpectrum, main = "mean spectrum",
     xlim = c(3000, 6000), ylim = c(0, 0.007))
lines(avgPeaks, col = "red")
labelPeaks(avgPeaks, cex = 1)

par(mar = c(0.5, 0.5, 1.5, 0.5))
plotMsiSlice(spectra,
             center = mass(avgPeaks),
             tolerance = 1,
             plotInteractive = TRUE)
par(oldPar)
knitr::include_graphics("./figures/mqmsi-1.png")

An interactive r CRANpkg("shiny") app for Imaging mass spectrometry

There is also an interactive MALDIquant IMS shiny app for demonstration purposes. A screen shot is displayed below. To start the application:

library("shiny")
runGitHub("sgibb/ims-shiny")
knitr::include_graphics("./figures/ims-shiny.png")

Spatial proteomics

library("pRoloc")
library("pRolocdata")

data(tan2009r1)

## these params use class weights
fn <- dir(system.file("extdata", package = "pRoloc"),
          full.names = TRUE, pattern = "params2.rda")
load(fn)

setStockcol(NULL)
setStockcol(paste0(getStockcol(), 90))

w <- table(fData(tan2009r1)[, "pd.markers"])
(w <- 1/w[names(w) != "unknown"])
tan2009r1 <- svmClassification(tan2009r1, params2,
                               class.weights = w,
                               fcol = "pd.markers")
ptsze <- exp(fData(tan2009r1)$svm.scores) - 1
lout <- matrix(c(1:4, rep(5, 4)), ncol = 4, nrow = 2)
layout(lout)
cls <- getStockcol()
par(mar = c(4, 4, 1, 1))
plotDist(tan2009r1[which(fData(tan2009r1)$PLSDA == "mitochondrion"), ],
         markers = featureNames(tan2009r1)[which(fData(tan2009r1)$markers.orig == "mitochondrion")],
         mcol = cls[5])
legend("topright", legend = "mitochondrion", bty = "n")
plotDist(tan2009r1[which(fData(tan2009r1)$PLSDA == "ER/Golgi"), ],
         markers = featureNames(tan2009r1)[which(fData(tan2009r1)$markers.orig == "ER")],
         mcol = cls[2])
legend("topright", legend = "ER", bty = "n")
plotDist(tan2009r1[which(fData(tan2009r1)$PLSDA == "ER/Golgi"), ],
         markers = featureNames(tan2009r1)[which(fData(tan2009r1)$markers.orig == "Golgi")],
         mcol = cls[3])
legend("topright", legend = "Golgi", bty = "n")
plotDist(tan2009r1[which(fData(tan2009r1)$PLSDA == "PM"), ],
         markers = featureNames(tan2009r1)[which(fData(tan2009r1)$markers.orig == "PM")],
         mcol = cls[8])
legend("topright", legend = "PM", bty = "n")
plot2D(tan2009r1, fcol = "svm", cex = ptsze, method = "kpca")
addLegend(tan2009r1, where = "bottomleft", fcol = "svm", bty = "n")

See the pRoloc-tutorial vignette (pdf) from the r Biocpkg("pRoloc") package for details about spatial proteomics data analysis and visualisation.

Session information

print(sessionInfo(), locale = FALSE)


lgatto/RforProteomics documentation built on May 10, 2023, 11:51 p.m.