knitr::opts_chunk$set(echo = TRUE)

R Markdown

This is an R Markdown document. Markdown is a simple formatting syntax for authoring HTML, PDF, and MS Word documents. For more details on using R Markdown see http://rmarkdown.rstudio.com.

When you click the Knit button a document will be generated that includes both content as well as the output of any embedded R code chunks within the document. You can embed an R code chunk like this:

# ---------- Preparations ----------
# Load libraries
library(MSnbase)                 # MS features
library(xcms)                    # Swiss army knife for metabolomics
library(RColorBrewer)            # For colors

# Set encoding of files and set locale
options(encoding="UTF-8")
Sys.setlocale(category="LC_ALL", locale="C")

# Data directories
#data_dir <- "./inst/mzML"
data_dir <- system.file("mzML/", package="mtbls297")
# Multicore parallel
nSlaves <- parallel::detectCores(all.tests=FALSE, logical=TRUE)
library(doParallel)
registerDoParallel(nSlaves)
register(DoparParam(), default=TRUE)

# Input variables
polarity <- "positive"
pol <- substr(x=polarity, start=1, stop=3)
ms1_intensity_cutoff <- 16000 #approx. 0.01%
ms2_fragment_intensity_cutoff <- 100
mz_ppm <- 25
# ---------- Peak detection ----------
# Load files
mzml_dir <- data_dir
mzml_files <- list.files(mzml_dir, pattern="*.mzML", recursive=T, full.names=T)
mzml_names <- gsub('(.*)_.*', '\\1', basename(mzml_files))

# Build phenodata
mzml_pheno <- gsub(x=mzml_files, pattern=".*\\d\\d\\d\\d ", replacement="", perl=TRUE)
mzml_pheno <- as.factor(gsub(x=mzml_pheno, pattern=" .*", replacement="", perl=TRUE))
mzml_pheno <- data.frame(sample_name=mzml_names, sample_group=mzml_pheno)

# Import raw data as MSnbase object
raw_data <- readMSData(files=mzml_files, pdata=new("NAnnotatedDataFrame", mzml_pheno), mode="onDisk", centroided=TRUE)
table(msLevel(raw_data))
head(isolationWindowLowerMz(raw_data))
head(isolationWindowUpperMz(raw_data))
head(fData(raw_data)[, c("isolationWindowTargetMZ", "isolationWindowLowerOffset", "isolationWindowUpperOffset", "msLevel", "retentionTime")])

# Get base peak chromatograms
chromas <- chromatogram(raw_data, aggregationFun="max")
# Plot chromatograms based on phenodata groups
sample_group_colors <- brewer.pal(length(unique(mzml_pheno$sample_group)), "Set1")
names(sample_group_colors) <- unique(mzml_pheno$sample_group)
plot(chromas, col=sample_group_colors[raw_data$sample_group])

# Get TICs
tics <- split(tic(raw_data), f=fromFile(raw_data))
boxplot(tics, col=sample_group_colors[raw_data$sample_group], ylab="intensity", main="Total ion current")
# Peak detection in MS1 data
ms1_params <- CentWaveParam(ppm=mz_ppm, mzCenterFun="mean", prefilter=c(3, 100), peakwidth=c(10, 20), snthresh=10, integrate=2,
                                firstBaselineCheck=TRUE, verboseColumns=FALSE, fitgauss=FALSE, roiList=list(), roiScales=numeric())
ms1_data <- findChromPeaks(raw_data, param=ms1_params)

# Per file summary
ms1_summary <- lapply(split.data.frame(chromPeaks(ms1_data), f = chromPeaks(ms1_data)[, "sample"]),
                          FUN = function(z) { c(peak_count = nrow(z), rt = quantile(z[, "rtmax"] - z[, "rtmin"])) } )
ms1_summary <- do.call(rbind, ms1_summary)
rownames(ms1_summary) <- basename(fileNames(ms1_data))
print(ms1_summary)
table(msLevel(ms1_data))

# To get a global overview of the peak detection we can plot the frequency of identified peaks per file along the retention time axis. This allows to identify time periods along the MS run in which a higher number of peaks was identified and evaluate whether this is consistent across files.
plotChromPeakImage(ms1_data, main="Frequency of identified peaks per RT")

# Group peaks
ms1_data <- groupChromPeaks(ms1_data, param=PeakDensityParam(sampleGroups=ms1_data$sample_group, minFraction=0.5, bw=20))

# RT correction
ms1_data <- adjustRtime(ms1_data, param=PeakGroupsParam(minFraction=0.5,smooth="loess",span=0.2,family="gaussian"))

# Plot the difference of raw and adjusted retention times
par(mfrow=c(2,1), mar=c(4.5,4.2,1,0.5))
plot(chromas, col=sample_group_colors[chromas$sample_group], peakType="none", main="Raw chromatograms")
plotAdjustedRtime(ms1_data, col=sample_group_colors[ms1_data$sample_group], main="RT correction")
par(mfrow=c(1,1), mar=c(4,4,4,1), oma=c(0,0,0,0), cex.axis=0.9, cex=0.8)

# Group peaks
ms1_data <- groupChromPeaks(ms1_data, param=PeakDensityParam(sampleGroups=ms1_data$sample_group, minFraction=0.5, bw=20))

# Get integrated peak intensity per feature/sample
print(head(featureValues(ms1_data, value="into")))

# Fill peaks
#ms1_data <- fillChromPeaks(ms1_data, param=FillChromPeaksParam(ppm=35, fixedRt=0, expandRt=4, diffRt=4))
#head(featureValues(ms1_data))
#head(featureSummary(ms1_data, group=ms1_data$sample_group))

# Evaluate grouping
ms1_pca <- prcomp(t(na.omit(log2(featureValues(ms1_data, value="into")))), center=TRUE)
plot(ms1_pca$x[, 1], ms1_pca$x[,2], pch=21, main="PCA: Grouping of samples",
     xlab=paste0("PC1: ", format(summary(ms1_pca)$importance[2, 1] * 100, digits=3), " % variance"),
     ylab=paste0("PC2: ", format(summary(ms1_pca)$importance[2, 2] * 100, digits=3), " % variance"),
     col="darkgrey", bg=sample_group_colors[ms1_data$sample_group], cex=2)
grid()
text(ms1_pca$x[, 1], ms1_pca$x[,2], labels=ms1_data$sample_name, col="darkgrey", pos=3, cex=0.5)

# Show peaks
tail(chromPeaks(ms1_data))
tail(chromPeakData(ms1_data))

# Show process history
processHistory(ms1_data)
# Peak picking in all isolation windows (buckets) of SWATH data
ms2_params <- CentWaveParam(ppm=mz_ppm, mzCenterFun="mean", prefilter=c(3, 100), peakwidth=c(10, 20), snthresh=10, integrate=2,
                                firstBaselineCheck=TRUE, verboseColumns=FALSE, fitgauss=FALSE, roiList=list(), roiScales=numeric())
ms2_data <- findChromPeaksIsolationWindow(ms1_data, param=ms2_params)

# Count the number of peaks within each isolation window
ms2_peaks <- chromPeakData(ms2_data)
table(ms2_peaks$isolationWindow)

# Filter MS2 spectra
swath_data <- filterRt(ms2_data, rt=c(60,1200))

# Inspect number of spectra in MS1 and MS2
table(msLevel(swath_data))
head(fData(swath_data)[, c("isolationWindowTargetMZ", "isolationWindowLowerOffset", "isolationWindowUpperOffset", "msLevel", "retentionTime")])
head(isolationWindowLowerMz(swath_data))
head(isolationWindowUpperMz(swath_data))
table(isolationWindowTargetMz(swath_data))

# Compare MS1 to SWATH
sum(chromPeakData(swath_data)$ms_level == 1)
sum(chromPeakData(swath_data)$ms_level == 2)
table(chromPeakData(swath_data)$isolationWindow)

#install_github("sneumann/xcms", ref="02e4aaf80a6e8c5737d8c41d8e83e09d94fd2bef")
# Define an MS2 spectra for each MS1 peak
swath_spectra <- reconstructChromPeakSpectra(swath_data, minCor=0.5)

# Save MS2 spectra info and convert to peak list
ms2_spectra <- fData(swath_data)[, c("fileIdx", "originalPeaksCount", "totIonCurrent", "retentionTime", "basePeakMZ")]
ms2_spectra$fileIdx <- as.character(unlist(lapply(X=ms2_spectra$fileIdx, FUN=function(x) { mzml_names[x] })))



# ---------- Build feature matrices ----------
# Build feature matrix
ms1_matrix <- featureValues(ms1_data, method="medret", value="into")
colnames(ms1_matrix) <- mzml_names
dim(ms1_matrix)
feat_list <- t(ms1_matrix)

# Build feature summary
ms1_summary <- featureSummary(ms1_data)
ms1_def <- featureDefinitions(ms1_data)

# Missing value imputation
feat_list[is.na(feat_list)] <- 1

# Transform and clean data
feat_list <- log2(feat_list)
feat_list[is.na(feat_list)] <- 0
feat_list[which(feat_list < 0)] <- 0
feat_list[is.infinite(feat_list)] <- 0
#feat_list <- feat_list[!apply(feat_list, MARGIN=1, function(x) max(x,na.rm=TRUE) == min(x,na.rm=TRUE)),]

# Create single 0/1 matrix
bina_list <- feat_list
bina_list[bina_list < log2(ms1_intensity_cutoff)] <- 0
bina_list[bina_list != 0] <- 1
# ---------- Link MS2 spectra to MS1 matrix ----------
# Merge MS/MS spectra in each sample
ms2_spectra <- list()
ms2_spectra <- foreach(i=1:nrow(ms1_def)) %dopar% {
  # Extract spectra for feature
  feature_of_interest <- ms1_def[i, "mzmed"]
  peaks_of_interest <- chromPeaks(swath_data, msLevel=1, mz=feature_of_interest, ppm=mz_ppm)
  swath_spectra_of_interest <- swath_spectra[which(mcols(swath_spectra)$peak_id %in% rownames(peaks_of_interest))]
  merged_swath_spectrum_of_interest <- combineSpectra(swath_spectra_of_interest, mzd=0.05, ppm=50, intensityFun="max")

  # Plot single spectra + merged spectrum, i.e. 1010
  if (FALSE) {
    nonempty_spectra <- NULL
    for (i in 1:length(swath_spectra_of_interest@listData)) {
      if (length(swath_spectra_of_interest@listData[[i]]@precursorMz) > 0)
        if (! is.null(names(swath_spectra_of_interest@listData[[i]]@mz)))
          nonempty_spectra <- c(nonempty_spectra, i)
    }

    par(mfrow=c(3,2), mar=c(4.5,4.2,1,0.5))
    for (i in nonempty_spectra) {
      plot(x=swath_spectra_of_interest@listData[[i]]@mz, y=swath_spectra_of_interest@listData[[i]]@intensity, type="h", xlab="m/z", ylab="intensity", main=paste("Precursor m/z",swath_spectrum_of_interest@listData[[i]]@precursorMz))
      swath_spectra[which(mcols(swath_spectra)$peak_id %in% names(swath_spectra_of_interest@listData[[i]]@mz))]
    }
    par(mfrow=c(1,1), mar=c(4,4,4,1), oma=c(0,0,0,0), cex.axis=0.9, cex=0.8)

    plot(x=merged_swath_spectrum_of_interest@listData[[1]]@mz, y=merged_swath_spectrum_of_interest@listData[[1]]@intensity, type="h", xlab="m/z", ylab="intensity", main=paste("Precursor m/z",merged_swath_spectrum_of_interest@listData[[1]]@precursorMz))
  }

  return(merged_swath_spectrum_of_interest)
}

# Assign matching SWATH spectra to MS1 features
feat_spectra <- list()
for (i in 1:nrow(ms1_def)) {
  feat_rt_med <- ms1_def[i, "rtmed"]
  feat_mz_med <- ms1_def[i, "mzmed"]
  feat_ms2_list <- chromPeaks(swath_data, msLevel=1, mz=feat_mz_med, ppm=mz_ppm)

  if (nrow(feat_ms2_list) > 0) {
    feat_spectra <- swath_spectra[which(mcols(swath_spectra)$peak_id %in% rownames(feat_ms2_list))]
    feat_spectra_merged <- combineSpectra(feat_spectra, mzd=0.05, ppm=50, intensityFun="max")

    if (feat_spectra_merged@listData[[1]]@peaksCount == 0) {
      print(paste0("No spectrum found for #", i, " ", rownames(ms1_def)[i]))
      feat_spectra[[i]] <- list()
      names(feat_spectra)[i] <- ""
    } else if (is.na(feat_spectra_merged@listData[[1]]@mz[1])) {
      print(paste0("Empty spectrum found for #", i, " ", rownames(ms1_def)[i]))
      feat_spectra[[i]] <- list()
      names(feat_spectra)[i] <- ""
    } else {
      feat_spectra[[i]] <- feat_spectra_merged
      names(feat_spectra)[i] <- rownames(ms1_def)[i]
    }
  } else {
    print(paste0("No spectra found for #", i, " ", rownames(ms1_def)[i]))
    feat_spectra[[i]] <- list()
    names(feat_spectra)[i] <- ""
  }
}

print(length(colnames(feat_list)))
print(length(which(colnames(feat_list) %in% names(feat_spectra))))

# Filter MS1 matrix to contain only features that have spectra assigned
feat_list <- feat_list[, which(colnames(feat_list) %in% names(feat_spectra))]
bina_list <- bina_list[, which(colnames(bina_list) %in% names(feat_spectra))]

Including Plots

You can also embed plots, for example:

plot(pressure)

Note that the echo = FALSE parameter was added to the code chunk to prevent printing of the R code that generated the plot.



sneumann/mtbls297 documentation built on July 14, 2020, 12:43 a.m.