knitr::opts_chunk$set(
    collapse = TRUE,
    comment = "#>"
)

Introduction

This is a step by step tutorial on how to assess and/or correct signal drift and batch effects within/across a multi-batch direct infusion mass spectrometry (DIMS) dataset. The same approach can be used on liquid chromatography mass spectrometry (LCMS) peak table as well.

Deeper details on how the algorithm works are detailed in \@ref(algorithm)

Installation

You should have R version 4.0.0 or above and Rstudio installed to be able to run this notebook.

Execute following commands from the R terminal.

install.packages("gridExtra")

if (!requireNamespace("BiocManager", quietly = TRUE))
    install.packages("BiocManager")
BiocManager::install("pmp")

Load the required libraries into the R environment

library(S4Vectors)
library(SummarizedExperiment)
library(pmp)
library(ggplot2)
library(reshape2)
library(gridExtra)

Data set

In this tutorial we will be using an direct infusion mass spectrometry (DIMS) data set consisting of 172 samples measured across 8 batches and is included in pmp package as SummarizedExperiemnt class object MTBLS79. More detailed description of the data set is available from @kirwan2014, MTBLS79 and R man page.

help ("MTBLS79")
feature_names <- c("70.03364", "133.07379", "146.16519", "163.04515", 
    "174.89483", "200.03196", "207.07818", "221.05062", "240.02445",
    "251.03658", "266.01793", "304.99115", "321.07923", "338.98131", 
    "376.03962", "393.35878", "409.05716", "430.24353", "451.01086",
    "465.14937")
summary(t(SummarizedExperiment::assay(MTBLS79[feature_names, ])))
#number of samples:
ncol(MTBLS79)
#Batches:
unique(MTBLS79$Batch)
#Sample classes:
unique(MTBLS79$Class)

Explorative example

A more detailed overview and guidelines on strategies for quality control of mass spectrometry assays is detailed in recent work by @broadhurst2018.

To evaluate if the data needs correction, it is common practice to examine the relative standard deviation (RSD) of the quality control (QC) samples and biological samples. RSD% is also sometimes referred to as the coefficient of variation (CV). An RSD% for the QC samples below 20-30% is commonly used as an acceptable level of technical variation where signal correction is not required.

The following code calculates and plots the RSD% values of the features within the data set.

#  separate the LCMS data from the meta data
data(MTBLS79)
data <- SummarizedExperiment::assay(MTBLS79[feature_names, ])
class <- SummarizedExperiment::colData(MTBLS79)$Class
batch <- SummarizedExperiment::colData(MTBLS79)$Batch
order <- c(1:ncol(data))

# get index of QC samples
QChits <- which(class == "QC")

# small function to calculate RSD%
FUN <- function(x) sd(x, na.rm=TRUE) / mean(x, na.rm=TRUE) * 100

# RSD% of biological and QC samples within all 8 batches:
out <- matrix(ncol=2, nrow=nrow(data))
colnames(out) <- c("Sample","QC")
rownames(out) <- rownames(data)

# for each feature calculate RSD% for the samples and the QCs
for (i in 1:nrow(data)) {
    out[i, 1] <- FUN(data[i, -QChits]) # for samples
    out[i, 2] <- FUN(data[i, QChits]) # for QCs
}

# prepare data for plotting
plotdata <- melt(data.frame(out), variable.name="Class", value.name="RSD")
plotdata$feature <- rownames(data)

plotdata$RSD <- round(plotdata$RSD,0)
plotdata$feature <- factor(plotdata$feature, ordered=TRUE,
    levels=unique(plotdata$feature))

# plot
ggplot(data=plotdata, aes(x=Class, y=feature, fill=RSD)) + 
    geom_tile() + 
    geom_text(aes(label=RSD)) +
    scale_fill_gradient2(low="black", mid="white", high="red")

A violin plot is a useful way of summarising the RSD% over all samples/QCs in the data set. Note a very high QC sample RSD% value for feature '409.05716'.

ggplot(data=plotdata, aes(x=Class, y=RSD, fill=Class)) +
    geom_violin(draw_quantiles=c(0.25,0.5,0.75)) +
    ylab("RSD%") + 
    theme(panel.background=element_blank())

The plots indicates that there are some features with a QC RSD% lower than 30%, which is a commonly accepted threshold, but for some features QC RSD % exceeds 30% and is similar to the signal variation of the biological samples. We can calculate similar statistics per batch and visualise the results with a box plot.

# prepare some matrices to store the results
RSDQC <- matrix(ncol=8, nrow=nrow(data))
RSDsample <- matrix(ncol=8, nrow=nrow(data))
colnames(RSDQC) <- unique(batch)
colnames(RSDsample) <- unique(batch)

rownames(RSDQC) <- rownames(data)
rownames(RSDsample) <- rownames(data)

# for each feature
for (i in 1:nrow(data)) {
    # for each batch
    for (nb in 1:8) {
        # RSD% of QCs in this batch
        RSDQC[i, nb] <- FUN(data[i, which(class == "QC" & batch == nb)])
        # RSD% of samples in this batch
        RSDsample[i, nb] <- FUN(data[i, which(!class == "QC" & batch == nb)])
    }
}

# prepare results for plotting
plotdataQC <- melt(as.data.frame(RSDQC), variable.name="batch",
    value.name="RSD")
plotdataQC$Class <- "QC"

plotdataBio <- melt(as.data.frame(RSDsample), variable.name="batch",
    value.name="RSD")
plotdataBio$Class <- "Sample"

plotdata <- rbind(plotdataQC, plotdataBio)

plotdata$Class <- as.factor(plotdata$Class)

# plot
ggplot(data=plotdata, aes(x=Class, y=RSD, fill=Class)) + geom_boxplot() +
    facet_wrap(~ batch, ncol=3) +
    ylab("RSD%") +
    theme(panel.background=element_blank())

Summary of RSD% of QC samples

summary(RSDQC)

Summary of RSD% of biological samples

summary(RSDsample)

From the above we can conclude that for every analytical batch RSD% tends to be higher in the analytical samples than it is in the QC samples for all 20 measured features. A few outlier QC samples can be observed.

An alternative measure of QC and biological sample variability is the so called d-ratio, which indicates if the technical variation within the QC samples exceeds the biological variation within biological samples.

# prepare a list of colours for plotting
manual_color = c("#386cb0", "#ef3b2c", "#7fc97f", "#fdb462", "#984ea3", 
    "#a6cee3", "#778899", "#fb9a99", "#ffff33")

# Function to calculate median absolute deviation
DRatfun <- function(samples, qcs) mad(qcs) / mad(samples)

# prepare matrix for dratio output
dratio <- matrix(ncol=8, nrow=nrow(data))
colnames(dratio) <- unique(batch)
rownames(dratio) <- rownames(data)

# calculate dratio for each feature, per batch
for (i in 1:nrow(dratio)){
    for (nb in 1:8) {
        dratio[i, nb] <- DRatfun(samples=data[i, which(!class == "QC" &
        batch == nb)], qcs=data[i, which(class == "QC" & batch == nb)])
    }
}

# prepare data for plotting
dratio <- as.data.frame(round(dratio, 2))

plotdata2 <- melt(dratio, variable.name="batch")
plotdata2$index <- rownames(data)
plotdata2$index <- factor(plotdata2$index, ordered=TRUE,
    levels=unique(plotdata2$index))

ggplot(data=plotdata2, aes(x=index, y=value, color=batch)) +
    geom_point(size=2) +
    xlab("index") + ylab("D-ratio") +
    geom_hline(yintercept=1) + theme(panel.background=element_blank()) +
    scale_color_manual(values=manual_color) +
    theme(axis.text.x=element_text(angle=90))

The D-ratio is a convenient measure to assess if technical variation in the QC samples (MAD QC) is higher than the variation within the biological samples (MAD sample). Ratio values close to, or higher than 1 indicate that technical variation of the measured feature is higher than the biological variation and therefore should be treated carefully during interperation of the data set. The colors in the figure above indicate different analytical batches. In the example above we can see that feature '409.05716' has a d-ratio value above 1 in four batches, while for features '70.0336' and '393.35878' the D-ratio is reproducibly low within all eight batches.

Principal components analysis (PCA) can be used to check common trends in data. Let's inspect the scores of the first two principal components and color samples by batch and class. For PCA model data should be normalised and missing values should be replaced using imputation, followed by data scaling. We will use PQN method to normalise data, KNN for missing value imputation and finally glog scaling method using other functions from pmp package. A more detailed description on data filtering is detailed in @guida2016.

pca_data <- MTBLS79[feature_names, ]

pca_data <- pqn_normalisation(pca_data, classes=class, qc_label="QC")
pca_data <- mv_imputation(pca_data, method="KNN", k=5, rowmax=0.5,
    colmax=0.5, maxp=NULL, check_df=FALSE)
pca_data <- glog_transformation(pca_data, classes=class, qc_label="QC")

pca_data <- prcomp(t(SummarizedExperiment::assay(pca_data)), center=TRUE, 
    scale=FALSE)
exp_var_pca <- round(((pca_data$sdev^2)/sum(pca_data$sdev^2)*100)[1:2], 2)

plots <- list()

plotdata <- data.frame(PC1=pca_data$x[, 1], PC2=pca_data$x[, 2],
    batch=as.factor(batch), class=class)

plots[[1]] <- ggplot(data=plotdata, aes(x=PC1, y=PC2, col=batch)) +
    geom_point(size=2) +
    theme(panel.background=element_blank()) +
    scale_color_manual(values=manual_color) +
    ggtitle("PCA scores, before correction") +
    xlab(paste0("PC1 (", exp_var_pca[1] ," %)")) +
    ylab(paste0("PC2 (", exp_var_pca[2] ," %)"))

plots[[2]] <- ggplot(data=plotdata, aes(x=PC1, y=PC2, col=class)) +
    geom_point(size=2) +
    theme(panel.background=element_blank()) +
    scale_color_manual(values=manual_color) +
    ggtitle("PCA scores plot, before correction") +
    xlab(paste0("PC1 (", exp_var_pca[1] ," %)")) +
    ylab(paste0("PC2 (", exp_var_pca[2] ," %)"))

grid.arrange(ncol=2, plots[[1]], plots[[2]])

Left side plot above clearly shows that samples measured in batches 7 and 8 are differentiating from bathes 1 to 6. On the right hand side plot the seperation between samples classes is still visible, but also seperation between measurement batches is clearly visible across PC2 axis.

Correlation between signal intensity and measurment order of QC samples

Alternatively, trends in measured signal related to measurement order could indicate if signal and/or batch correction is required. The plot below illustrates the measured signal of QC samples across all 8 batches. To be able to compare all 20 features measured at different signal ranges, the data will be scaled to unit variance (UV).

# autoscale the QC data
QCdata <- data[ ,QChits]
QCdata2 <- as.data.frame(scale(t(QCdata), scale=TRUE, center=TRUE))

# prepare the data for plotting
plotdata <- melt(QCdata2, value.name="intensity")
plotdata$index <- rep(1:nrow(QCdata2), ncol(QCdata2))

plotdata$batch <- as.factor(batch[QChits])

# plot
ggplot(data=plotdata, aes(x=index, y=intensity, col=batch)) +
    geom_point(size=2) +
    facet_wrap(~ variable, ncol=4) +
    theme(panel.background=element_blank()) +
    scale_color_manual(values=manual_color)

This figure indicates that there is some fluctuation in the measured signal across the eight batches, and that some features are following a similar pattern , i.e. they are correlated. We can create a similar plot to the one above including linear regression fit between measured data points.

ggplot(data=plotdata, aes(x=index, y=intensity, col=batch)) + 
    geom_point(size=2) +
    facet_wrap(~ variable, ncol=4) +
    geom_smooth(method="lm", se=TRUE, colour="black") +
    theme(panel.background=element_blank()) +
    scale_color_manual(values=manual_color)

The plot above indicates that some trends can be observed. It is possible to calculate actual correlation values within QC samples for each measured feature, and we will use Kendall's tau statistic to estimate a rank-based measure of association.

sampleorder <- c(1:ncol(QCdata))

correlations <- matrix(ncol=2, nrow=nrow(data))
rownames(correlations) <- rownames(data)
colnames(correlations) <- c("tau","p.value")
correlations <- as.data.frame(correlations)

for (coln in 1:nrow(data)) {
    stat <- cor.test(sampleorder, QCdata[coln, ], method="kendall")
    correlations$tau[coln] <- stat$estimate
    correlations$p.value[coln] <- stat$p.value
}

correlations

While most of the calculated tau values and corresponding p-values indicate that there is not a strong trend between measurement order and the measured QC sample signal, some of the values don't match the trends we can observe in figure above. For features "133.07379" and "376.03962" calculated correlation values are aroun 0.6 and in figure above intensity increasing tren can be observed. For other features correlation values are relatively low, but in figure above clear trends in signal changes in batches 7 and 8 can be osberved.

Alternatively it is possible to calculate correlation statistics per batch and visualise the results.

correlations <- matrix(ncol=8, nrow=nrow(data))
rownames(correlations) <- rownames(data)
colnames(correlations) <- unique(batch)

QCbatch <- batch[QChits]

for (coln in 1:nrow(data)) {
    for (bch in 1:8) {
        sampleorder <- scale(c(1:length(which(QCbatch==bch))),
        center=TRUE, scale=TRUE)

        if ((length(sampleorder) - 
        length(which(is.na(QCdata[coln, which(QCbatch==bch)])))) >= 3){
            correlations[coln, bch] <- cor.test(sampleorder,
            QCdata[coln, which(QCbatch==bch)], method="kendall")$estimate
        }
    }
}

round(correlations, 2)
plotdata <- as.data.frame(correlations)
plotdata$feature <- rownames(plotdata)

plotdata <- melt(plotdata, variable.name="batch")
plotdata$feature <- factor(plotdata$feature, ordered=TRUE,
    levels = unique(plotdata$feature))

ggplot(data=plotdata, aes(x=batch, y=feature, fill=value)) + 
    geom_tile() + scale_fill_gradient2()

Figure above indicates that there are significant acquisition order related trends within some batches. Fore example, bathes 4 and 6 for features '451.01081' and '409.05716'.

Using regression to estimate signal trends and variability within QC sample

It is possible to apply univariate regression to the QC sample measurement order and signal intensity, to estimate correlation and spread (R2) of the measured data points.

sampleorder <- c(1:ncol(QCdata))

regressionout <- matrix(ncol=3, nrow=nrow(data))
rownames(regressionout) <- rownames(data)
colnames(regressionout) <- c("R2.adj","coefficient","p.value")
regressionout <- as.data.frame(regressionout)

for (coln in 1:nrow(data)) {
    tempdat <- data.frame(x=sampleorder, y=QCdata[coln, ])
    stat <- lm(x ~ y, data=tempdat)
    stat <- summary(stat)

    regressionout$R2.adj[coln] <- stat$adj.r.squared  
    regressionout$coefficient[coln] <- stat$coefficients[2,1]
    regressionout$p.value[coln] <- stat$coefficients[2,4]
}

regressionout

And regression statistics per batch.

regPerBatch <- matrix(ncol=8, nrow=nrow(data))
rownames(regPerBatch) <- rownames(data)
colnames(regPerBatch) <- unique(batch)

QCbatch <- MTBLS79$Batch[QChits]

for (coln in 1:nrow(data)) {
    for (bch in 1:8) {
        sampleorder <- c(1:length(which(QCbatch == bch)))
        tempdat <- data.frame(x=sampleorder, y=QCdata[coln, 
            which(QCbatch==bch)])
        stat <- lm(x ~ y, data=tempdat)
        stat <- summary(stat)
        regPerBatch[coln,bch] <- stat$adj.r.squared
    }
}

round(regPerBatch,2)
plotdata <- as.data.frame(regPerBatch)
plotdata$feature <- rownames(plotdata)

plotdata <- melt(plotdata, variable.name="batch")
plotdata$feature <- factor(plotdata$feature, ordered=TRUE, 
    levels=unique(plotdata$feature))

ggplot(data=plotdata, aes(x=batch, y=feature, fill=value)) + 
    geom_tile() + scale_fill_gradient2()

Example of signal and batch correction for a single feature {#algorithm}

Let's have a closer look to '451.01086' measured feature and how signal correction can be applied.

data <- data.frame(data=
    as.vector(SummarizedExperiment::assay(MTBLS79["451.01086", ])), batch=batch,
    class=factor(class, ordered=TRUE))
data$order <- c(1:nrow(data))
data$batch <- as.factor(data$batch)

ggplot(data=data, aes(x=order, y=log(data,10), col=batch, shape=class)) +
    geom_point(size=2) + theme(panel.background=element_blank()) +
    scale_color_manual(values=manual_color)

In this plot circles and squares represent biological samples and triangles are the QC samples. Analytical batches are represented by colours. Differences in measured intensities can be observed between analytical batches.

Similar plot for QC samples only

QCdata <- data[data$class == "QC",]

ggplot(data=QCdata, aes(x=order, y=log(data,10), col=batch, shape=class)) +
    geom_point(size=2) + theme(panel.background=element_blank()) +
    scale_color_manual(values=manual_color, drop=FALSE) +
    scale_shape_manual(values=c(16, 17, 15), drop=FALSE)

This figure indicates that there is signal drift present within each analytical batch and between analytical batches for this feature. Let's have a look at the RSD% for all QC samples and QC samples within each analytical batch.

FUN <- function(x) sd(x, na.rm=TRUE)/mean(x, na.rm=TRUE) * 100

# RSD% of biological and QC samples within all 6 batches:
out <- c(NA,NA)
names(out) <- c("Biological","QC")
out[1] <-FUN(data$data[-QChits])
out[2] <-FUN(data$data[QChits])
out
# RSD% per batch:
out <- matrix(ncol=8,nrow=2)
colnames(out) <- unique(batch)
rownames(out) <- c("Biological","QC")
for (i in 1:8) {
    out[1, i] <- FUN(data$data[which(!class=="QC" & batch==i)])
    out[2, i] <- FUN(data$data[which(class=="QC" & batch==i)])
}

out

From the outputs above it's clear that variance of measured QC sample intensities between batches is high (RSD% = 50), and is relatively high within batches 1 to 5. For batch 3 QC variation exceeds that of the biological samples.

Signal/batch correction using smoothed spline fitting

We will apply QC-RSC signal correction method as it is described in @kirwan2013.

The first step involves extracting QC sample data

qcData <- data$data[class == "QC"]
qc_batch <- batch[class == "QC"]
qc_order <- order[class == "QC"]

qcData

Note that the QC data has 1 missing value. Smoothed spline regression doesn't support missing values, so the workaround is to apply missing value imputation or remove the NA values from input to the smoothed spline fit function (which we will do here). We recommend at least 4 QC values be present per batch for the fit to be reliable.

The next step involves applying the smoothed spline fit function to the QC sample data within each batch. We will look at the data for batch 6 in detail.

nbatch <- unique(qc_batch)

nb <- 6

# Sample measurement order
x <- qc_order[qc_batch==nbatch[nb]]

# Measured peak intensity or area
y <- qcData[qc_batch==nbatch[nb]]
y

In this example, signal for 1 QC sample wasn't measured, so these samples need to be removed. The smoothed spline regression input will look like this:

NAhits <- which(is.na(y))
if (length(NAhits)>0) {
    x <- x[-c(NAhits)]
    y <- y[-c(NAhits)]
    rbind(x,y)
}

We will apply a log transformation to the data before fitting

y <- log((y + sqrt(y^2)) / 2)
y

Fit a smoothed cubic spline using internal cross-validation for parameter estimation

sp.obj <- smooth.spline(x, y, cv=TRUE) 
sp.obj

out <- rbind(y,sp.obj$y)
row.names(out) <- c("measured","fitted")
out

Now the smoothed spline fit is used to predict values for the biological sample for the current batch.

valuePredict=predict(sp.obj, order[batch==nb])

plotchr <- as.numeric(data$class)

# reverse the log transformation to convert the predictions back to the
# original scale
valuePredict$y <- exp(valuePredict$y)

plotdata <- data.frame(measured=data$data[batch==nb], fitted=valuePredict$y,
    Class=class[batch==nb], order=order[batch==nb])
plotdata2 <- melt(plotdata, id.vars=c("Class","order"), value.name="intensity",
    variable.name="data")

ggplot(data=plotdata2, aes(x=order, y=log(intensity,10), color=data,
    shape=Class)) + geom_point(size=2) +
    theme(panel.background=element_blank()) +
    scale_color_manual(values=manual_color) +
    scale_shape_manual(values=c(16, 17, 15), drop=FALSE)

The figure above shows the original data points in blue and the fitted data in red. Triangles represent QC samples.

The next step in signal correction is to "flatten" the fitted curve to correct for signal drift. This can usually be done by subtracting the fitted values from the actual measured values for each feature. To avoid getting negative values we will add the median value of the feature to the corrected data.

fitmedian <- median(plotdata$measured, na.rm=TRUE)
plotdata$corrected_subt <- (plotdata$measured - plotdata$fitted) + fitmedian

plotdata2 <- melt(plotdata, id.vars=c("Class","order"), 
    value.name="intensity", variable.name="data")

plotdata_class <- as.character(plotdata2$Class)
plotdata_class[plotdata_class == "S"] <- "Sample"
plotdata_class[plotdata_class == "C"] <- "Sample"
plotdata2$Class <- factor(plotdata_class)

ggplot(data=plotdata2, aes(x=order, y=intensity, color=data, shape=Class)) +
    geom_point(size=2) + theme(panel.background=element_blank()) + 
    scale_color_manual(values=manual_color) +
    facet_grid(Class ~ .) +
    scale_shape_manual(values=c(17, 16), drop=FALSE)

An alternative to subtraction of the fitted values is to divide them by the median of the fit and use the resulting coefficients to correct the data points. The same general relative trends should be observed in either case.

plotdata$corrected_div <- plotdata$measured/(plotdata$fitted/fitmedian)

plotdata3 <- plotdata[,c("Class", "order", "corrected_subt", "corrected_div")]

plotdata3 <- melt(plotdata3, id.vars=c("Class","order"), 
    value.name="intensity", variable.name="data")

plotdata_class <- as.character(plotdata3$Class)
plotdata_class[plotdata_class=="S"] <- "Sample"
plotdata_class[plotdata_class=="C"] <- "Sample"
plotdata3$Class <- factor(plotdata_class)

ggplot(data=plotdata3, aes(x=order, y=intensity, color=data, shape=Class)) +
    geom_point(size=2) + theme(panel.background=element_blank()) +
    scale_color_manual(values=manual_color) +
    geom_smooth(se=FALSE) + facet_grid(Class ~ .)

So far we have applied signal correction for data points within one analytical batch. The code below will perform the same steps for each of the 8 batches.

outl <- rep(NA, nrow(data))

for (nb in 1:length(nbatch)){
    # assigning sample injection order for a batch to 'x', and corresponding 
    # intensities to 'y'
    x <- qc_order[qc_batch == nbatch[nb]]
    y <- qcData[qc_batch == nbatch[nb]]

    # remove measurements with missing values
    NAhits <- which(is.na(y))
    if (length(NAhits) > 0) {
        x <- x[-c(NAhits)]
        y <- y[-c(NAhits)]
    }

    # require at least 4 data points for QC fit
    if (length(y) >= 4) {
        range <- c(batch == nbatch[nb])
        # Order is a vector of sample indices for the current batch
        outl[range] <- pmp:::splineSmoother(x=x, y=y, newX=order[range], 
        log=TRUE, a=1, spar=0)

        # If less than 5 data points are present, return empty values   
    } else {
        range <- c(batch == nbatch[nb])
        outl[range] <- rep(NA, nrow(data))[range]
    }
}

plotdata <- data.frame(measured=data$data, fitted=outl, Class=class, 
    batch=batch, order=c(1:nrow(data)))
plotdata2 <- melt(plotdata, id.vars=c("batch","Class","order"),
    value.name="intensity", variable.name="data")

ggplot(data=plotdata2, aes(x=order, y=log(intensity,10), 
    color=data, shape=Class)) + geom_point(alpha=0.5, size=2) +
    theme(panel.background=element_blank()) + 
    scale_color_manual(values=manual_color)

After smoothed spline fit per each batch is calculated, we can apply signal correction within each batch.

# median intensity value is used to adjust batch effect

mpa <- rep(NA, nrow(data))

for (bch in 1:8) {
    mpa[batch==bch] <- median(data$data[batch==bch], na.rm=TRUE)
}


QC_fit <- outl/mpa

# and correct actual data
res <- data$data/QC_fit

# correct data using subtratcion
res2 <- (data$data -outl) +mpa

plotdata <- data.frame(measured=data$data, corrected_subt=res2, 
    corrected_div=res, Class=class, batch=batch, order=c(1:nrow(data)))
plotdata2 <- melt(plotdata, id.vars=c("batch","Class","order"), 
    value.name="intensity", variable.name="data")

ggplot(data=plotdata2, aes(x=order, y=log(intensity,10), 
    color=data, shape=Class)) + geom_point(alpha=0.2, size=2) +
    theme(panel.background=element_blank()) +
    scale_color_manual(values=manual_color) +
    geom_smooth(se=FALSE) +
    facet_grid(Class ~ .)

The figure above shows the measured data points in blue and the corrected values using subtraction (red) or division(green). Fitted smoothed spline curves of the corrected data over all batches still indicates that there is batch related signal drift in the data. This can be corrected using the "grand median".

First, a grand median is calculated across all batches, and then difference between each batch median and the grand median is subtracted from all the samples in that batch, to remove the difference.

mpa <- rep(NA, nrow(data))

for (bch in 1:8) {
    mpa[batch == bch] <- median(res2[batch == bch], na.rm=TRUE)
}

grandMedian <- median(res2, na.rm=TRUE)

mpa <- mpa - grandMedian

plotdata$corrected_subt <- plotdata$corrected_subt - mpa

mpa <- rep(NA, nrow(data))

for (bch in 1:8) {
    mpa[batch == bch] <- median(res[batch == bch], na.rm=TRUE)
}

grandMedian <- median(res, na.rm=TRUE)

mpa <- mpa - grandMedian

plotdata$corrected_div <- plotdata$corrected_div - mpa

plotdata2 <- melt(plotdata, id.vars=c("batch","Class","order"),
    value.name="intensity", variable.name="data")

ggplot(data=plotdata2, aes(x=order, y=log(intensity,10),
    color=data, shape=Class)) + geom_point(alpha=0.2, size=2) +
    theme(panel.background=element_blank()) +
    scale_color_manual(values=manual_color) +
    geom_smooth(se=FALSE) +
    facet_grid(Class ~ .)

We can calculate RSD% before and after correction.

FUN <- function(x) sd(x, na.rm=TRUE)/mean(x, na.rm=TRUE) * 100

# RSD% of biological and QC samples within all 6 batches:
out <- matrix(nrow=2, ncol=2)
colnames(out) <- c("Biological","QC")
rownames(out) <- c("measured", "corrected")
out[1,1] <-FUN(data$data[-QChits])
out[1,2] <-FUN(data$data[QChits])
out[2,1] <-FUN(res[-QChits])
out[2,2] <-FUN(res[QChits])

round(out, 2)

# RSD% per batch:
out <- matrix(ncol=8,nrow=4)
colnames(out) <- unique(batch)
rownames(out) <- c("Biological","QC","Corrected biological","Corrected QC")
for(i in 1:8) {
    out[1, i] <- FUN(data$data[which(!class=="QC" & batch==i)])
    out[2, i] <- FUN(data$data[which(class=="QC" & batch==i)])
    out[3, i] <- FUN(res[which(!class=="QC" & batch==i)])
    out[4, i] <- FUN(res[which(class=="QC" & batch==i)])
}

round(out, 2)

The data has now been corrected for batch and signal drift effects.

All the steps from example above can be applied to all or susbet of features in the data set using function "QCRSC".

data <- MTBLS79[feature_names,]

class <- MTBLS79$Class
batch <- MTBLS79$Batch
sample_order <- c(1:ncol(data))

corrected_data <- QCRSC(df=data, order=sample_order, batch=batch,
    classes=class, spar=0, minQC=4)

We can calculate RSD% statistics per batch before and after correction and visualise the results with a box plot.

data <- SummarizedExperiment::assay(data)
corrected_data <- SummarizedExperiment::assay(corrected_data)
RSDQC <- matrix(ncol=8, nrow=nrow(data))
RSDsample <- matrix(ncol=8, nrow=nrow(data))
colnames(RSDQC) <- unique(batch)
colnames(RSDsample) <- unique(batch)

RSDQC_corrected <- matrix(ncol=8, nrow=nrow(data))
RSDsample_corrected <- matrix(ncol=8, nrow=nrow(data))
colnames(RSDQC_corrected) <- unique(batch)
colnames(RSDsample_corrected) <- unique(batch)

rownames(RSDQC) <- rownames(data)
rownames(RSDsample) <- rownames(data)
rownames(RSDQC_corrected) <- rownames(data)
rownames(RSDsample_corrected) <- rownames(data)

# for each feature
for (i in 1:nrow(data)) {
    # for each batch
    for (nb in 1:8) {
        # RSD% of QCs in this batch
        RSDQC[i, nb] <- FUN(data[i, which(class == "QC" & batch == nb)])
        # RSD% of samples in this batch
        RSDsample[i, nb] <- FUN(data[i, which(!class == "QC" & batch == nb)])
        # RSD% of QCs in this batch after correction
        RSDQC_corrected[i, nb] <- FUN(corrected_data[i, which(class == "QC"
            & batch == nb)])
        # RSD% of samples in this batch after correction
        RSDsample_corrected[i, nb] <- FUN(corrected_data[i, which(!class == "QC"
            & batch == nb)]) 
    }
}

# prepare results for plotting
plotdataQC <- melt(as.data.frame(RSDQC), variable.name="batch",
    value.name="RSD")
plotdataQC$Class <- "QC"

plotdataBio <- melt(as.data.frame(RSDsample), variable.name="batch",
    value.name="RSD")
plotdataBio$Class <- "Sample"

plotdataQC_corrected <- melt(as.data.frame(RSDQC_corrected),
    variable.name="batch", value.name="RSD")
plotdataQC_corrected$Class <- "QC_corr"

plotdataBio_corrected <- melt(as.data.frame(RSDsample_corrected),
    variable.name="batch", value.name="RSD")
plotdataBio_corrected$Class <- "Sample_corr"

plotdata <- rbind(plotdataQC, plotdataQC_corrected)

plotdata$Class <- as.factor(plotdata$Class)

# plot
ggplot(data=plotdata, aes(x=Class, y=RSD, fill=Class)) + geom_boxplot() +
    facet_wrap(~ batch, ncol=3) +
    ylab("RSD%") +
    theme(panel.background=element_blank()) +
    scale_y_continuous(limits=c(0, 50))

plotdata <- rbind(plotdataBio, plotdataBio_corrected)

plotdata$Class <- as.factor(plotdata$Class)

# plot
ggplot(data=plotdata, aes(x=Class, y=RSD, fill=Class)) + geom_boxplot() +
    facet_wrap(~ batch, ncol=3) +
    ylab("RSD%") +
    theme(panel.background=element_blank())

Or compare the scores plots of principal components analysis (PCA) before and after correction.

# PQN used to normalise data
# KNN for missing value imputation
# glog scaling method 
# A more detailed overview is detailed in
# Di Guida et al, Metabolomics, 12:93, 2016
# https://dx.doi.org/10.1007/s11306-016-1030-9

pca_data <- pqn_normalisation(data, classes=class, qc_label="QC")
pca_data <- mv_imputation(pca_data, method="KNN", k=5, rowmax=0.5,
    colmax=0.5, maxp=NULL, check_df=FALSE)
pca_data <- glog_transformation(pca_data, classes=class, qc_label="QC")

pca_corrected_data <- pqn_normalisation(corrected_data, classes=class,
    qc_label="QC")
pca_corrected_data <- mv_imputation(pca_corrected_data, method="KNN", k=5,
    rowmax=0.5, colmax=0.5, maxp=NULL, check_df=FALSE)
pca_corrected_data <- glog_transformation(pca_corrected_data, 
    classes=class, qc_label="QC")

pca_data <- prcomp(t(pca_data), center=TRUE, scale=FALSE)
pca_corrected_data <- prcomp(t(pca_corrected_data), center=TRUE, scale=FALSE)

# Calculate percentage of explained variance of the first two PC's
exp_var_pca <- round(((pca_data$sdev^2)/sum(pca_data$sdev^2)*100)[1:2],2)
exp_var_pca_corrected <- round(((pca_corrected_data$sdev^2) /
    sum(pca_corrected_data$sdev^2)*100)[1:2],2)

plots <- list()

plotdata <- data.frame(PC1=pca_data$x[, 1], PC2=pca_data$x[, 2], 
    batch=as.factor(batch), class=class)

plots[[1]] <- ggplot(data=plotdata, aes(x=PC1, y=PC2, col=batch)) +
    geom_point(size=2) + theme(panel.background=element_blank()) +
    scale_color_manual(values=manual_color) +
    ggtitle("PCA scores, before correction") +
    xlab(paste0("PC1 (", exp_var_pca[1] ," %)")) +
    ylab(paste0("PC2 (", exp_var_pca[2] ," %)"))

plots[[2]] <- ggplot(data=plotdata, aes(x=PC1, y=PC2, col=class)) +
    geom_point(size=2) + theme(panel.background=element_blank()) +
    scale_color_manual(values=manual_color) +
    ggtitle("PCA scores, before correction") +
    xlab(paste0("PC1 (", exp_var_pca[1] ," %)")) +
    ylab(paste0("PC2 (", exp_var_pca[2] ," %)"))

plotdata_corr <- data.frame(PC1=pca_corrected_data$x[, 1], 
    PC2=pca_corrected_data$x[, 2], batch=as.factor(batch), class=class)

plots[[3]] <- ggplot(data=plotdata_corr, aes(x=PC1, y=PC2, col=batch)) +
    geom_point(size=2) +
    theme(panel.background=element_blank()) +
    scale_color_manual(values=manual_color) +
    ggtitle("PCA scores, after correction") +
    xlab(paste0("PC1 (", exp_var_pca_corrected[1] ," %)")) +
    ylab(paste0("PC2 (", exp_var_pca_corrected[2] ," %)"))

plots[[4]] <- ggplot(data=plotdata_corr, aes(x=PC1, y=PC2, col=class)) +
    geom_point(size=2) +
    theme(panel.background=element_blank()) +
    scale_color_manual(values=manual_color) +
    ggtitle("PCA scores, after correction") +
    xlab(paste0("PC1 (", exp_var_pca_corrected[1] ," %)")) +
    ylab(paste0("PC2 (", exp_var_pca_corrected[2] ," %)"))

grid.arrange(ncol=2, plots[[1]], plots[[2]], plots[[3]], plots[[4]])

Session information

sessionInfo()

References



computational-metabolomics/pmp documentation built on Jan. 28, 2020, 12:05 a.m.