## todo:
## generate_connectivity_matrix:
## add biochemical relationships
#' Prepare input for annotation
#'
#' Extract information for annotation from SummarizedExperiment object. For
#' each feature, the median mz value across all samples will be used as
#' measured mz value and the sum of all intensities for the intensity value.
#'
#' @param se
#' \code{\link[SummarizedExperiment]{RangedSummarizedExperiment-class}}
#' object
#'
#' @return
#' data.frame with the following information about measured features:
#' \itemize{
#' \item id: unique identifier
#' \item mz: measured mz value
#' \item intensity: measured intensity
#' \item no.peak.detected: number of detected peaks.
#' }
#'
#' @importFrom SummarizedExperiment assays
#' @importFrom stats median
#' @export
#'
#' @examples
#' data("se.example")
#' dat = prepare_data_for_annotation(se = se.example)
prepare_data_for_annotation <- function(se) {
peak.all = assays(se)$peak.detection
mz.all = assays(se)$mz
int.all = assays(se)$intensity.log2
dat = matrix(nrow = nrow(se),
ncol = 3,
dimnames = list(NULL,
c(
"mz", "intensity", "no.peaks.detected"
)))
for (i in seq_len(nrow(se))) {
## get indices of samples with detected peak
ind = which(peak.all[i,] == 1)
if (length(ind) == 0) {
stop(paste("no peaks detected for feature", i))
}
mz = median(mz.all[i, ind], na.rm = TRUE)
intensity = sum(int.all[i, ind], na.rm = TRUE)
dat[i,] = c(mz, intensity, length(ind))
}
dat = data.frame(id = rownames(se),
dat,
stringsAsFactors = FALSE)
return(dat)
}
#' Find putative theoretical features for each measured feature
#'
#' Identifies all possible theoretical features within an user defined ppm
#' window around the measured mz value of each measured feature. The result
#' includes only those theoretical features that are potential candidates for
#' at least one measured feature and orphan isotopes are removed if no hit is
#' identified for the corresponding monoisotope.
#'
#' @param info.features data.frame with information about theoretical possible
#' features (e.g. as generated by the function
#' \code{\link{chem_formula_2_adducts}}).
#' @param dat data.frame with information about measured features with columns
#' id (= unique identifier), mz (= measured mz value), intensity (= measured
#' intensity).
#' @param ppm Numeric. Resolution of mass spectrometer.
#'
#' @return
#' matrix with measured features in rows and theoretical possible features in
#' columns (1 = measured feature within ppm distance to theoretical feature,
#' 0 = otherwise).
#'
#' @importFrom plyr arrange
#' @export
#'
#' @examples
#' data("se.example")
#' data("info.features")
#'
#' dat = prepare_data_for_annotation(se = se.example)
#' hits.m = find_hits(info.features = info.features,
#' dat = dat,
#' ppm = 20)
find_hits <- function(info.features, dat, ppm) {
if (any(duplicated(dat$id))) {
stop("duplicated ids in dat!")
}
if (any(duplicated(info.features$id))) {
stop("duplicated ids in info.features!")
}
## find overlaps
hits.m = matrix(
0,
nrow = nrow(dat),
ncol = nrow(info.features),
dimnames = list(dat$id,
info.features$id)
)
for (i in seq_len(nrow(hits.m))) {
deltaM = ppm * dat$mz[i] * 1e-06
ind = which(abs(info.features$m.z - dat$mz[i]) <= deltaM)
if (length(ind) > 0)
hits.m[i, ind] = 1
}
## remove theoretical features without any hits
no.hits.c = apply(hits.m, 2, sum)
hits.m = hits.m[, which(no.hits.c > 0)]
## remove orphan isotopes (monoisotope missing)
info.features.iso = info.features[which(info.features$abundance < 100 &
info.features$id %in%
colnames(hits.m)),]
if (nrow(info.features.iso) > 0) {
orphans = NULL
for (i in seq_len(nrow(info.features.iso))) {
mono = info.features[which(
info.features$chemical.form.adduct ==
info.features.iso$chemical.form.adduct[i] &
info.features$adduct.name ==
info.features.iso$adduct.name[i] &
info.features$abundance == 100
), "id"]
if (!(mono %in% colnames(hits.m))) {
orphans = c(orphans, info.features.iso$id[i])
}
}
if (length(orphans) > 0) {
print(paste("removed", length(orphans), "orphan isotopes"))
hits.m = hits.m[, setdiff(colnames(hits.m), orphans)]
}
}
## remove measured features without any hits
no.hits.r = apply(hits.m, 1, sum)
hits.m = hits.m[which(no.hits.r > 0),]
return(hits.m)
}
#' Compute prior probabilities for putative theoretical features
#'
#' Estimates prior probabilities of all possible theoretical features for each
#' measured feature.
#'
#' @param hits.m matrix with information about overlapping theoretical features
#' (e.g. as generated by the function \code{\link{find_hits}}).
#' @param info.features data.frame with information about theoretical possible
#' features (e.g. as generated by the function
#' \code{\link{chem_formula_2_adducts}}).
#' @param dat data.frame with information about measured features with columns
#' id (= unique identifier), mz (= measured mz value), intensity (= measured
#' intensity).
#' @param pk Numeric vector with probabilities of initial confidence of the
#' presence of each theoretical feature.
#' @param ppm Numeric. Resolution of mass spectrometer.
#' @param ppm.unknown Numeric. Resolution assigned to unknown theoretical
#' features (i.e. not contained in info.features).
#' @param pr.limit Numeric. Lowest probability not set to 0 (default = 1e-05).
#'
#' @return
#' matrix with measured features in rows and theoretical possible features in
#' columns. The cell i,j contains the prior probability that measured feature i
#' belongs to theoretical feature j.
#'
#' @export
#'
#' @examples
#' data("se.example")
#' data("info.features")
#'
#' dat = prepare_data_for_annotation(se = se.example)
#' hits.m = find_hits(info.features = info.features,
#' dat = dat,
#' ppm = 20)
#'
#' prior.prob = compute_prior_prob(hits.m = hits.m,
#' info.features = info.features,
#' dat = dat,
#' ppm = 20)
compute_prior_prob <- function(hits.m,
info.features,
dat,
pk = NULL,
ppm,
ppm.unknown = NULL,
pr.limit = 1e-05) {
rownames(info.features) = info.features$id
info.features = info.features[colnames(hits.m),]
rownames(dat) = dat$id
dat = dat[rownames(hits.m),]
prior.m = matrix(
0,
nrow = nrow(hits.m),
ncol = ncol(hits.m),
dimnames = dimnames(hits.m)
)
if (!is.null(ppm.unknown)) {
prior.m = cbind(prior.m,
unknown = rep(0, nrow(prior.m)))
}
if (is.null(pk)) {
pk = rep(1, ncol(prior.m))
} else if (length(pk) != ncol(prior.m)) {
stop(
paste(
"length of pk needs to be equal to number of columns of",
"hits.m (+1 if ppm.unknown is specified)!"
)
)
}
# sigma = (ppm * info.features$m.z) / (2 * 10^6)
sigma = (ppm * dat$mz) / (2 * 10 ^ 6)
for (i in seq_len(nrow(prior.m))) {
d = exp((-0.5 * 1 / (sigma[i] ^ 2)) * (dat$mz[i] - info.features$m.z) ^
2)
## new: restrict to possible hits
d = d * hits.m[i,]
if (!is.null(ppm.unknown)) {
d.unknown = exp((-0.5 * 1 / (sigma[i] ^ 2)) *
(ppm.unknown * dat$mz[i] * 10 ^ (-6)) ^
2)
d = c(d, d.unknown)
}
d = d * pk
d[which(d < pr.limit)] = 0
prior.m[i,] = d / sum(d)
}
## remove column with unknown if not relevant
if (!is.null(ppm.unknown)) {
sum.unknown = sum(prior.m[, "unknown"])
if (sum.unknown == 0) {
print(paste(
"column unknown removed since none of the",
"probabilities is > 0!"
))
prior.m =
prior.m[,-which(colnames(prior.m) == "unknown")]
}
}
return(prior.m)
}
#' Generate connectivity matrices
#'
#' Generates matrices summarizing connections between adducts and isotopes.
#'
#' @param info.features data.frame with information about theoretical possible
#' features (e.g. as generated by the function
#' \code{\link{chem_formula_2_adducts}}).
#' @param type Character. Type of connectivity matrix to generate ("adducts",
#' "isotopes").
#' @param ids.use Character vector. Feature identifiers to be used for
#' connectivity matrix. If NULL connectivity matrix is build using all features
#' in info.features.
#' @param ratios Logical. Should the isotope connectivity matrix contain the
#' expected abundance ratios? (deafult = TRUE).
#'
#' @return
#' matrix with theoretical possible features in rows and columns.
#' In the adduct matrix, the cell i,j contains 1, if features i and j are
#' monoisotopic isotopes and adducts of the same chemical formula and 0
#' otherwise.
#' In the isotope matrix, the cell i,j contains 1, if features i and j are
#' isotopes of the same adduct (or the ratio of the theoretical abundances if
#' ratios = TRUE) and 0 otherwise.
#'
#' @importFrom utils combn
#' @export
#'
#' @examples
#' data("info.features")
#'
#' add.m = generate_connectivity_matrix(info.features = info.features,
#' type = "adducts")
#'
#' iso.m = generate_connectivity_matrix(info.features = info.features,
#' type = "isotopes")
generate_connectivity_matrix <- function(info.features,
type,
ids.use = NULL,
ratios = FALSE) {
if (!is.null(ids.use)) {
info.features = info.features[which(info.features$id %in% ids.use),]
if (nrow(info.features) == 0) {
stop("no features left in info.features!")
}
}
conn.m = matrix(
0,
nrow = nrow(info.features),
ncol = nrow(info.features),
dimnames = list(info.features$id,
info.features$id)
)
if (type == "adducts") {
chem = unique(info.features$chemical.form)
for (c in chem) {
ind = which(info.features$chemical.form == c &
info.features$abundance == 100)
if (length(ind) > 1) {
ind = combn(ind, 2)
ind = rbind(t(ind), t(ind[2:1,]))
conn.m[ind] = 1
}
}
} else if (type == "isotopes") {
adduct = unique(info.features$chemical.form.adduct)
for (a in adduct) {
ind = which(info.features$chemical.form.adduct == a)
if (length(ind) > 1) {
# stop(a)
ind = combn(ind, 2)
if (ratios) {
ind.t = t(ind)
ind.t.rev = t(ind[2:1,])
for (r in seq_len(nrow(ind.t))) {
ab = info.features$abundance[ind.t[r,]]
conn.m[ind.t[r, , drop = FALSE]] = ab[1] / ab[2]
conn.m[ind.t.rev[r, , drop = FALSE]] = ab[2] / ab[1]
}
} else {
ind = rbind(t(ind), t(ind[2:1,]))
conn.m[ind] = 1
}
}
}
} else {
stop(paste("type", type, "not known!"))
}
return(conn.m)
}
#' Compute posterior probabilities for putative theoretical features
#'
#' Estimates posterior probabilities of all possible theoretical features for
#' each measured feature based on prior probabilities and additional
#' information about adduct and isotope connections using a Gibbs sampler. This
#' function is based on code of the internal function IPA.sampler.Add.Iso.Bio
#' in the R package IPA (available on github at
#' \url{https://github.com/francescodc87/IPA}).
#'
#' @param prior.prob matrix with prior probabilities for each measured feature
#' in rows and theoretical features in columns (e.g. as generated by the
#' function \code{\link{compute_prior_prob}}).
#' @param dat data.frame with information about measured features with columns
#' id (= unique identifier), mz (= measured mz value), intensity (= measured
#' intensity).
#' @param add.m matrix with information about adduct connections (e.g. as
#' generated by function \code{\link{generate_connectivity_matrix}}).)
#' @param iso.m matrix with information about isotope connections (e.g. as
#' generated by function \code{\link{generate_connectivity_matrix}}).)
#' @param bio.m matrix with information about biochemical connections (e.g. as
#' generated by function \code{\link{generate_connectivity_matrix}}).)
#' @param delta.add Numeric. Confidence on the information encoded in add.m
#' (smaller value means higher confidence).
#' @param delta.iso Numeric. Confidence on the information encoded in iso.m
#' (smaller value means higher confidence).
#' @param delta.bio Numeric. Confidence on the information encoded in bio.m
#' (smaller value means higher confidence).
#' @param ratio.tol Numeric. Minimum accepted ratio between thereoretical and
#' observed intensity ratios between isotopes (default = 0.8).
#' @param no.its Numeric. Number of iterations to be performed by the Gibbs
#' sampler (default = 1100).
#' @param burn Numeric. Number of initial iterations to be ignored when
#' computing the posterior probabilities (default = 100).
#'
#' @return
#' matrix with measured features in rows and theoretical possible features in
#' columns. The cell i,j contains the posterior probability that measured
#' feature i belongs to theoretical feature j.
#'
#' @export
#'
#' @examples
#' data("se.example")
#' data("info.features")
#'
#' dat = prepare_data_for_annotation(se = se.example)
#' hits.m = find_hits(info.features = info.features,
#' dat = dat,
#' ppm = 20)
#'
#' prior.prob = compute_prior_prob(hits.m = hits.m,
#' info.features = info.features,
#' dat = dat,
#' ppm = 20)
#' add.m = generate_connectivity_matrix(info.features = info.features,
#' type = "adducts")
#'
#' set.seed(20200402)
#' post.prob = compute_posterior_prob(prior.prob = prior.prob,
#' dat = dat,
#' add.m = add.m,
#' delta.add = 0.1)
compute_posterior_prob <- function(prior.prob,
dat,
add.m = NULL,
iso.m = NULL,
bio.m = NULL,
delta.add = 0.5,
delta.iso = 0.5,
delta.bio = 1,
ratio.tol = 0.8,
no.its = 1100,
burn = 100)
{
if (!(all(c("mz", "intensity") %in% colnames(dat))))
{
stop("data.frame dat must contain columns mz and intensity!")
}
if (is.null(add.m))
{
add.m = make_empty_matrix(ids = colnames(prior.prob))
use.add = FALSE
} else
{
add.m = add.m[colnames(prior.prob), colnames(prior.prob)]
use.add = TRUE
}
if (is.null(bio.m))
{
bio.m = make_empty_matrix(ids = colnames(prior.prob))
use.bio = FALSE
} else
{
bio.m = bio.m[colnames(prior.prob), colnames(prior.prob)]
use.bio = TRUE
}
if (is.null(iso.m))
{
iso.m = make_empty_matrix(ids = colnames(prior.prob))
use.iso = FALSE
} else
{
iso.m = iso.m[colnames(prior.prob), colnames(prior.prob)]
use.iso = TRUE
}
## initialization
## random drawing of formula for each feature based on prior probability
sampcomp = apply(prior.prob, 1, multsample)
allsampcomp = matrix(0, no.its, nrow(prior.prob))
## Gibbs sampler
pot.bio <- apply(bio.m[sampcomp,], 2, sum)
pot.add <- apply(add.m[sampcomp,], 2, sum)
pot.iso <- apply(iso.m[sampcomp,], 2, sum)
for (it in seq_len(no.its))
{
ordine <- sample(nrow(prior.prob))
for (thism in ordine)
{
po <- prior.prob[thism,]
p.add <-
pot.add - colSums(add.m[sampcomp[thism], , drop = FALSE])
p.add <- (p.add + delta.add) / sum(p.add + delta.add)
if (use.add)
po = po * p.add
p.bio <-
pot.bio - colSums(bio.m[sampcomp[thism], , drop = FALSE])
p.bio <- (p.bio + delta.bio) / sum(p.bio + delta.bio)
if (use.bio)
po = po * p.bio
p.iso <-
pot.iso - colSums(iso.m[sampcomp[thism], , drop = FALSE])
p.iso <- (p.iso + delta.iso) / sum(p.iso + delta.iso)
if (use.iso)
po = po * p.iso
po <- po / sum(po)
oldval <- sampcomp[thism]
sampcomp[thism] <- multsample(po)
if (oldval != sampcomp[thism])
{
pot.bio <- pot.bio - bio.m[, oldval] + bio.m[, sampcomp[thism]]
pot.add <-
pot.add - add.m[, oldval] + add.m[, sampcomp[thism]]
pot.iso <-
pot.iso - iso.m[, oldval] + iso.m[, sampcomp[thism]]
}
}
allsampcomp[it,] <- sampcomp
# if (v) {
if (it %% 250 == 0)
{
cat(paste0(round((it * 100) / no.its, 1), "%",
"\n"))
}
# }
}
## posterior probability
post.prob <- t(apply(
allsampcomp,
2,
compute_post,
burn = burn,
no.its = no.its,
Nc = ncol(prior.prob)
))
dimnames(post.prob) = dimnames(prior.prob)
return(post.prob)
}
#' Assign measured feature to theoretical features
#'
#' Based on estimated posterior probabilities each measured feature is assigned
#' to the most probable theoretical feature. Measured features are removed if
#' the maximum probability is below the specified cutoff. In rare cases,
#' several measured features are assigned to the same theoretical feature and
#' the measured feature with the largest number of detected peaks or highest
#' intensity will be kept.
#'
#' @param post.prob matrix with prior probabilities for each measured feature
#' in rows and theoretical features in columns (e.g. as generated by the
#' function \code{\link{compute_posterior_prob}}).
#' @param dat data.frame with information about measured features with columns
#' id (= unique identifier), mz (= measured mz value), intensity (= measured
#' intensity).
#' @param cutoff.prob Numeric. Probability cutoff which needs to be reached to
#' keep the assignment of measured and theoretical feature (default = 0.8).
#'
#' @return
#' data.frame with the following information about each assigned feature:
#' \itemize{
#' \item f.meas: identifier of measured feature
#' \item f.theo: identifier of assigned theoretical feature
#' \item prob: posterior probability
#' }
#'
#' @export
#'
#' @examples
#' data("se.example")
#' data("info.features")
#'
#' dat = prepare_data_for_annotation(se = se.example)
#' hits.m = find_hits(info.features = info.features,
#' dat = dat,
#' ppm = 20)
#'
#' prior.prob = compute_prior_prob(hits.m = hits.m,
#' info.features = info.features,
#' dat = dat,
#' ppm = 20)
#' add.m = generate_connectivity_matrix(info.features = info.features,
#' type = "adducts")
#'
#' set.seed(20200402)
#' post.prob = compute_posterior_prob(prior.prob = prior.prob,
#' dat = dat,
#' add.m = add.m,
#' delta.add = 0.1)
#'
#' info.assigned.use = assign_features(post.prob = post.prob,
#' dat = dat)
assign_features <- function(post.prob,
dat,
cutoff.prob = 0.8)
{
info.assigned = t(vapply(
X = seq_len(nrow(post.prob)),
FUN = function(i)
{
ind = which.max(post.prob[i, ])
return(c(
f.meas = rownames(post.prob)[i],
f.theo = colnames(post.prob)[ind],
prob = post.prob[i, ind]
))
},
FUN.VALUE = character(3)
))
info.assigned = data.frame(info.assigned,
stringsAsFactors = FALSE)
info.assigned$prob = as.numeric(info.assigned$prob)
## check theoretical features with multiple assignments
tab = table(info.assigned$f.theo)
if (any(tab > 1))
{
f.theo.mult = names(tab)[tab > 1]
print(
paste(
"selecting unique measured feature for",
length(f.theo.mult),
"theoretical features"
)
)
f.rm = NULL
for (f in f.theo.mult)
{
temp = info.assigned[which(info.assigned$f.theo == f),]
dat.temp = dat[which(dat$id %in% temp$f.meas),]
ind = which(dat.temp$no.peaks.detected ==
max(dat.temp$no.peaks.detected))
if (length(ind) > 1)
{
ind = which(
dat.temp$no.peaks.detected ==
max(dat.temp$no.peaks.detected) &
dat.temp$intensity == max(dat.temp$intensity)
)
if (length(ind) > 1)
{
ind = ind[1]
}
}
f.rm = c(f.rm, temp[-ind, "f.meas"])
}
info.assigned = info.assigned[-which(info.assigned$f.meas %in% f.rm),]
}
return(info.assigned[which(info.assigned$prob >= cutoff.prob),])
}
#' Summarize feature based data into metabolite based data
#'
#' Combine the feature based metabolomics data to metabolite data by summing up
#' the intensitites of all features assigned to a particular metabolite.
#'
#' @param info.assigned data.frame with information about assignment of
#' measured and theoretical features (e.g. as generated by the function
#' \code{\link{assign_features}}).
#' @param info.features data.frame with information about theoretical possible
#' features (e.g. as generated by the function
#' \code{\link{chem_formula_2_adducts}}).
#' @param se
#' \code{\link[SummarizedExperiment]{RangedSummarizedExperiment-class}}
#' object with feature based metabolomics data.
#'
#' @return
#' \code{\link[SummarizedExperiment]{RangedSummarizedExperiment-class}}
#' object with metabolite based metabolomics data.
#'
#' @importFrom SummarizedExperiment assays colData
#' @export
#'
#' @examples
#' data("se.example")
#' data("info.features")
#'
#' dat = prepare_data_for_annotation(se = se.example)
#' hits.m = find_hits(info.features = info.features,
#' dat = dat,
#' ppm = 20)
#'
#' prior.prob = compute_prior_prob(hits.m = hits.m,
#' info.features = info.features,
#' dat = dat,
#' ppm = 20)
#' add.m = generate_connectivity_matrix(info.features = info.features,
#' type = "adducts")
#'
#' set.seed(20200402)
#' post.prob = compute_posterior_prob(prior.prob = prior.prob,
#' dat = dat,
#' add.m = add.m,
#' delta.add = 0.1)
#'
#' info.assigned.use = assign_features(post.prob = post.prob,
#' dat = dat)
#'
#' se.example.met = summarize_metabolites(info.assigned = info.assigned.use,
#' info.features = info.features,
#' se = se.example)
summarize_metabolites <- function(info.assigned,
info.features,
se)
{
info.features.use = info.features[which(info.features$id %in%
info.assigned$f.theo),]
int.f = assays(se)$intensity
int.met = NULL
met = unique(info.features.use$chemical.form)
for (m in met)
{
ids = info.features.use[which(info.features.use$chemical.form == m),
"id", drop = TRUE]
f = info.assigned[which(info.assigned$f.theo %in% ids),
"f.meas", drop = TRUE]
temp = apply(int.f[f, , drop = FALSE], 2, sum, na.rm = TRUE)
int.met = rbind(int.met, temp)
}
rownames(int.met) = met
se.met = SummarizedExperiment(
assays = list(
intensity = int.met,
intensity.log2 = log2(int.met + 1)
),
colData = colData(se)
)
return(se.met)
}
# internal function
# copy of function IPA:::multsample
multsample <- function(P)
{
sample(
seq_len(length(P)),
size = 1,
replace = TRUE,
prob = P
)
}
# internal function
# copy of function IPA:::compute.post
compute_post <- function(allsampcomp, no.its, burn, Nc)
{
tmp <- table(allsampcomp[(burn + 1):no.its]) / (no.its - burn)
out <- rep(0, Nc)
out[as.numeric(names(tmp))] <- tmp
out
}
# internal function
make_empty_matrix <- function(ids)
{
matrix(
0,
nrow = length(ids),
ncol = length(ids),
dimnames = list(ids, ids)
)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.