##' These functions implement the bandle model for dynamic mass spectrometry based
##' spatial proteomics datasets using MCMC for inference, this is an internal sampling
##' function
##'
##' The `diffloc` function generate the sample from the posterior distributions
##' (object or class `bandleParam`) based on an annotated quantitative spatial
##' proteomics datasets (object of class [`MSnbase::MSnSet`]). Both are then
##' passed to the `bandlePredict` function to predict the sub-cellular localisation
##' and compute the differential localisation probability of proteins. See the
##' vignette for examples
##'
##' @title Differential localisation experiments using the bandle method
##' @param objectCond1 A list of [`MSnbase::MSnSet`]s where each is an experimental
##' replicate for the first condition, usually a control
##' @param objectCond2 A list of [`MSnbase::MSnSet`]s where each is an experimental
##' replicate for the second condition, usually a treatment
##' @param fcol The feature meta-data containing marker definitions. Default is
##' `markers`
##' @param hyperLearn Algorithm to learn posterior hyperparameters of the Gaussian
##' processes. Default is `LBFGS` and `MH` for metropolis-hastings is also implemented.
##' @param numIter The number of iterations of the MCMC
##' algorithm. Default is 1000. Though usually much larger numbers are used
##' @param burnin The number of samples to be discarded from the
##' begining of the chain. Default is 100.
##' @param thin The thinning frequency to be applied to the MCMC
##' chain. Default is 5.
##' @param u The prior shape parameter for Beta(u, v). Default is 2
##' @param v The prior shape parameter for Beta(u, v). Default is 10.
##' @param lambda Controls the variance of the outlier component. Default is 1.
##' @param gpParams Object of class `gpParams`. parameters from prior fitting of GPs
##' to each niche to accelerate inference. Default is NULL.
##' @param hyperIter The frequency of MCMC interation to update the hyper-parameters
##' default is 20
##' @param hyperMean The prior mean of the log normal prior of the GP parameters.
##' Default is 0 for each. Order is length-scale, amplitude and noise variance
##' @param hyperSd The prior standard deviation of the log normal prior fo the GP
##' parameters. Default is 1 for each. Order is length-scale, ampliture and noise
##' variance.
##' @param seed The random number seed.
##' @param pg `logical` indicating whether to use polya-gamma prior. Default is
##' `FALSE`.
##' @param pgPrior A matrix generated by pgPrior function. If param pg is TRUE
##' but pgPrior is NULL then a pgPrior is generated on the fly.
##' @param tau The tau parameter for the polya-Gamma prior (is used). Defaults
##' to 0.2
##' @param dirPrior A matrix generated by dirPrior function. Default is NULL
##' and dirPrior is generated on the fly.
##' @param maternCov `logical` indicated whether to use a matern or gaussian
##' covariance. Default is True and matern covariance is used
##' @param PC `logical` indicating whether to use a penalised complexity prior.
##' Default is TRUE.
##' @param nu `integer` indicating the smoothness of the matern prior. Default
##' is 2.
##' @param pcPrior `matrix` with 3 columns indicating the lambda paramters for the
##' penalised complexity prior. Default is null which internally sets
##' the penalised complexity prior to `c(0.5, 3, 100)` for each organelle and the order is
##' length-scale, amplitude and variance. See vignette for more details.
##' @param propSd If MH is used to learn posterior hyperparameters then the proposal
##' standard deviations. A Gaussian random-walk proposal is used.
##' @param numChains `integer` indicating the number of parallel chains to run.
##' Defaults to 4.
##' @return `bandle` returns an instance of class `bandleParams`
##' @md
##' @examples
##' library(pRolocdata)
##' data("tan2009r1")
##' set.seed(1)
##' tansim <- sim_dynamic(object = tan2009r1,
##' numRep = 6L,
##' numDyn = 100L)
##' gpParams <- lapply(tansim$lopitrep,
##' function(x) fitGPmaternPC(x, hyppar = matrix(c(0.5, 1, 100), nrow = 1)))
##' d1 <- tansim$lopitrep
##' control1 <- d1[1:3]
##' treatment1 <- d1[4:6]
##' mcmc1 <- diffLoc(objectCond1 = control1, objectCond2 = treatment1, gpParams = gpParams,
##' fcol = "markers", numIter = 5L, burnin = 1L, thin = 2L)
##'
##' @rdname bandle
diffLoc <- function(objectCond1,
objectCond2,
fcol = "markers",
hyperLearn = "MH",
numIter = 1000,
burnin = 100L,
thin = 5L,
u = 2,
v = 10,
lambda = 1,
gpParams = NULL,
hyperIter = 20,
hyperMean = c(0, 0, 0),
hyperSd = c(1, 1, 1),
seed = NULL,
pg = TRUE,
pgPrior = NULL,
tau = 0.2,
dirPrior = NULL,
maternCov = TRUE,
PC = TRUE,
nu = 2,
pcPrior = NULL,
propSd = c(0.3, 0.1, 0.05)){
# Checks
stopifnot(exprs = {
"ObjectCond1 must be an MSnSet"=is(objectCond1[[1]], "MSnSet")
"ObjectCond2 must be an MSnSet"=is(objectCond2[[1]], "MSnSet")
"hyperLearn must be either MH or LBFGS"=hyperLearn %in% c("MH", "LBFGS")
"numIter must be a numeric"=is(numIter, "numeric")
"burnin must be an integer"=is(burnin, "integer")
"thin must be an integer"=is(thin, "integer")
"burnin must be less than numIter"= burnin < numIter
"u must be numeric"=is(u, "numeric")
"v must be numeric"=is(v, "numeric")
"lambda must be numeric"=is(lambda, "numeric")
"hyperIter must be numeric"=is(hyperIter, "numeric")
"hyperMean must be numeric"=is(hyperMean, "numeric")
"hyperSd must be numeric"=is(hyperSd, "numeric")
"must provide 3 values for hyperMean"=length(hyperMean) == 3
"must provide 3 values for hyperSd" = length(hyperSd) == 3
"tau must be numeric" = is(tau, "numeric")
"nu must be numeric" = is(nu, "numeric")
"propSd must be numeric"=is(propSd, "numeric")
"Must provide 3 values for propSd"=length(propSd) == 3})
# This is required because `biocparrallel` is currently not exporting
# exprs from Biobase when using `SnowParams()`. This generates a
# warning in the build that is currently unavoidable. Removeal will
# cause a failure on windows machines.
suppressMessages(require(Biobase))
# Setting seed manually
if (is.null(seed)) {
message("You haven't provided a seed, you may wish to provide a seed")
}
# Elementary checks
stopifnot(length(hyperMean)==3)
stopifnot(length(hyperSd)==3)
stopifnot(sum(hyperSd > 0)==3)
## Samples to be retained as a result of burnin and thinning
toRetain <- seq.int(burnin + 1L, numIter, thin)
numRetained <- length(toRetain)
# Normalizing data
normCond1 <- lapply(objectCond1, function(x) normalise(x, method = "center.mean"))
normCond2 <- lapply(objectCond2, function(x) normalise(x, method = "center.mean"))
# Bring data together
object_cmb <- c(cond1 = normCond1, cond2 = normCond2)
# numCond should be 2
numCond <- 2
numRepl <- length(objectCond1)
# expressions from data and important summaries
exprs_cmb <- lapply(object_cmb, Biobase::exprs)
M <- lapply(exprs_cmb, colMeans)
V <- lapply(exprs_cmb, function(x) lambda * diag(diag(cov(x))) + diag(rep(10^{-6}, ncol(x))))
# Getting data dimensions
D <- ncol(object_cmb[[1]])
K <- length(getMarkerClasses(object_cmb[[1]], fcol = fcol))
# Set default pc priors
if(is.null(pcPrior)){
pcPrior <- matrix(rep(c(0.5, 3, 100),
each = K), ncol = 3)
}
# construct empirical Bayes Polya-Gamma prior
if (is.null(pgPrior)) {
pgPrior <- pg_prior(normCond1, normCond2, K = K, pgPrior = NULL, fcol = fcol)
}
# Fit GPs to markers
componenthypers <- lapply(object_cmb, function(x) vector(mode = "list", K))
if (maternCov == FALSE) {
res <- lapply(object_cmb, function(x) fitGP(x, fcol = fcol))
} else if ((maternCov == TRUE) & (PC = FALSE) & (is.null(gpParams))) {
res <- lapply(object_cmb, function(x) fitGPmatern(x, fcol = fcol, nu = nu))
} else if ((maternCov == TRUE) & (PC = TRUE) & (is.null(gpParams))) {
res <- lapply(object_cmb, function(x) fitGPmaternPC(x, fcol = fcol, nu = nu, hyppar = pcPrior))
} else {
res <- gpParams
}
# check gpParams and convert to a list from gpParams object
stopifnot("The parameters are not an instance of class gpParams"=
all(vapply(res, is, "gpParams") == "gpParams"))
## easiest thing here is to convert from gpParams object to a list for bandle
## Note: we can amend bandle to work with gpParams objects at a later date if needed
# res <- lapply(res, function(z) list(M = z@M, sigma = z@sigma, params = z@params))
# separate data into known and unknown
unknown_cmb <- lapply(object_cmb, function(x) unknownMSnSet(x, fcol = fcol))
exprsUnknown_cmb <- lapply(unknown_cmb, Biobase::exprs)
exprsKnown <- lapply(object_cmb, function(x) Biobase::exprs(markerMSnSet(x, fcol = fcol)))
# separate conditions allocations
allocKnown <- lapply(object_cmb[c(1, numRepl + 1)], function(x) seq.int(K)[fData(markerMSnSet(x, fcol = fcol))[, fcol]])
numProtein <- lapply(unknown_cmb[c(1, numRepl + 1)], nrow)
# Some storage
alloc <- lapply(numProtein, function(x) matrix(0, nrow = x, ncol = numRetained))
allocOut <- lapply(numProtein, function(x) matrix(0, nrow = x, ncol = numRetained))
outlierprob <- lapply(numProtein, function(x) matrix(0, nrow = x, ncol = numRetained))
weights <- array(0, c(K, K, numRetained))
epsilons <- matrix(0, nrow = 2, ncol = numRetained)
allocprob <- lapply(numProtein, function(x) array(0, c(x, numRetained, K)))
loglikelihoods <- lapply(rep(numProtein, numRepl), function(x) matrix(0, nrow = x, ncol = K))
#random allocation of unknown Proteins, allocations are condition specific
alloctemp <- lapply(numProtein, function(x) sample.int(n = K, size = x, replace = TRUE))
for (i in seq.int(numCond)) {
object_cmb[[i]] <- pRoloc::knnClassification(object_cmb[[i]], k = 10, fcol = fcol)
alloc[[i]][, 1] <- fData(object_cmb[[i]][rownames(unknown_cmb[[i]])])$knn
}
# number of proteins allocated to each component
nkknown <- lapply(object_cmb[c(1, numRepl + 1)], function(x)
table(getMarkers(x, verbose = FALSE, fcol = fcol))[getMarkerClasses(x, fcol = fcol)])
#number initial allocated to outlier component
outlier <- vector(mode = "list", length = 2)
for (i in seq.int(numCond)) {
outlier[[i]] <- allocOut[[i]][,1]
}
sampleGPMean <- lapply(object_cmb, function(x) vector(mode = "list", length = K))
centereddata <- lapply(object_cmb, function(x) vector(mode = "list", length = K))
hypers <- lapply(object_cmb, function(x) vector(mode = "list", length = numRetained))
.hypers <- vector(mode = "list", length = length(object_cmb))
for (i in seq.int(object_cmb)) {
hypers[[i]][[1]] <- res[[i]]@params
.hypers[[i]] <- hypers[[i]][[1]]
}
# intialise polya-gamma auxiliary variables
w <- rep(1, K^2)
# progress bar
pb <- txtProgressBar(min = 0,
max = numIter,
style = 3)
._t <- 0
for(t in seq.int(2, numIter)){
# Between data allocation tally
nk_mat <- diag(nkknown[[1]]) + table(factor(alloctemp[[1]], levels = seq.int(K)),
factor(alloctemp[[2]], levels = seq.int(K)))
# Within data allocation tally
nk <- list(cond1 = rowSums(nk_mat), cond2 = colSums(nk_mat))
if((t %% 1) ==0){
setTxtProgressBar(pb, ._t)
._t <- ._t + 1
}
gpnk_cond1 <- gpnk_cond2 <- vector(mode = "numeric", length = K)
gpnk <- list(gpnk_cond1 = gpnk_cond1, gpnk_cond2 = gpnk_cond2)
for (i in seq.int(object_cmb)) {
j <- ceiling(i/numRepl) # allocs don't change across replicates
for (l in seq.int(K)) {
gpnk[[j]][l] <- sum(alloctemp[[j]]*outlier[[j]] == l) + nkknown[[1]][l]
}
if (maternCov == FALSE) {
centereddata[[i]] <- centeredData(Xknown = exprsKnown[[i]],
BX = allocKnown[[j]],
Xunknown = exprsUnknown_cmb[[i]],
BXun = alloctemp[[j]]*outlier[[j]],
hypers = .hypers[[i]],
nk = gpnk[[j]], tau = seq.int(D), D = D, K)
} else if (maternCov == TRUE) {
centereddata[[i]] <- centeredDatamatern(Xknown = exprsKnown[[i]],
BX = allocKnown[[j]],
Xunknown = exprsUnknown_cmb[[i]],
BXun = alloctemp[[j]]*outlier[[j]],
hypers = .hypers[[i]],
nk = gpnk[[j]], tau = seq.int(D), D = D, K, nu = nu)
}
}
# Polya-Gamma Sampler or Dirichlet weights
if (pg == TRUE) {
pgres <- sample_weights_pg(nk_mat = nk_mat, pgPrior = pgPrior, K = K, w = w, tau = tau)
currentweights <- pgres$currentweights
w <- pgres$w
} else {
if (is.null(dirPrior)){
dirPrior <- diag(rep(1, K)) + matrix(0.05, nrow = K, ncol = K)
}
currentweights <- sample_weights_dir(nk_mat = nk_mat, dirPrior = dirPrior)
}
currentweights <- matrix(currentweights, ncol = K, nrow = K)
#extract noise component from hypers
sigmak <- lapply(hypers, function(x) exp(2 * x[[1]][, 3])) # fixed for the moment
sigmasqrt <- lapply(sigmak, sqrt)
loglikelihoods <- comploglikelist(centereddata, sigmasqrt)
resAllocCond1 <- proteinAllocation(loglikelihoods = loglikelihoods[seq.int(numRepl)],
currentweights = currentweights,
alloctemp = alloctemp,
cond = 1)
resAllocCond2 <- proteinAllocation(loglikelihoods = loglikelihoods[seq.int(1+numRepl, numRepl*numCond)],
currentweights = currentweights,
alloctemp = alloctemp,
cond = 2)
alloctemp <- list(resAllocCond1$alloctemp, resAllocCond2$alloctemp)
loglikelihoods_cond1 <- resAllocCond1$loglikelihoods_comb
loglikelihoods_cond2 <- resAllocCond2$loglikelihoods_comb
# sample epsilon
tau1 <- lapply(outlier, function(x) sum(x == 1) + nrow(exprsKnown[[1]]))
tau2 <- lapply(outlier, function(x) sum(x == 0))
epsilon <- rbeta(n = 2, shape1 = u + unlist(tau2), shape2 = v + unlist(tau1))
allocnotOutprob <- vector(mode = "list")
allocOutprob <- vector(mode = "list")
# Sample outlier allocations first condition 1 then condition 2
# outlier likelihood
outlierlikelihood <- vector(mode = "list")
for (i in seq.int(exprsUnknown_cmb)) {
outlierlikelihood[[i]] <- dmvtCpp(exprsUnknown_cmb[[i]],
mu_ = M[[i]],
sigma_ = V[[i]],
df_ = 4,
log_ = TRUE,
isChol_ = FALSE)
}
resOutCond1 <- outlierAllocationProbs(outlierlikelihood = outlierlikelihood[seq.int(1, numRepl)],
loglikelihoods = loglikelihoods_cond1,
epsilon = epsilon,
alloctemp = alloctemp,
cond = 1)
resOutCond2 <- outlierAllocationProbs(outlierlikelihood = outlierlikelihood[seq.int((1+numRepl),(numRepl*numCond))],
loglikelihoods = loglikelihoods_cond2,
epsilon = epsilon,
alloctemp = alloctemp,
cond = 2)
allocnotOutprob[[1]] <- resOutCond1[[1]]
allocOutprob[[1]] <- resOutCond1[[2]]
allocnotOutprob[[2]] <- resOutCond2[[1]]
allocOutprob[[2]] <- resOutCond2[[2]]
# Sample outliers
allocoutlierprob <- cbind(allocnotOutprob[[1]], allocOutprob[[1]])
outlier[[1]] <- sampleOutlier(allocoutlierprob = allocoutlierprob)
allocoutlierprob <- cbind(allocnotOutprob[[2]], allocOutprob[[2]])
outlier[[2]] <- sampleOutlier(allocoutlierprob = allocoutlierprob)
#update hypers
if((t %% hyperIter) == 0){
if ( isFALSE(maternCov)) {
if(hyperLearn == "LBFGS"){
}else if(hyperLearn == "MH"){
# Between data allocation tally
nk_mat <- diag(nkknown[[1]]) + table(factor(alloctemp[[1]], levels = seq.int(K)),
factor(alloctemp[[2]], levels = seq.int(K)))
# Within data allocation tally
nk <- list(cond1 = rowSums(nk_mat), cond2 = colSums(nk_mat))
for (i in seq.int(object_cmb)) {
for(j in seq.int(K)){
l <- ceiling(i/numRepl) # allocs don't change across replicates
Y <- makeComponent(X = exprsKnown[[i]],
BX = allocKnown[[l]],
Y = exprsUnknown_cmb[[i]],
BY = alloctemp[[l]] * outlier[[l]],
j = j)
componenthypers[[i]][[j]] <- metropolisGP(inith = .hypers[[i]][j,],
X = t(Y),
tau = seq.int(D), nk = nk[[l]][j],
D = D,
niter = 1,
hyperMean = hyperMean,
hyperSd = hyperSd)$h[ ,2]
}
# stores current hyperparameters invisibly
.hypers[[i]] <- matrix(unlist(componenthypers[[i]]), ncol = 3, byrow = TRUE)
}
}
} else if (isTRUE(maternCov) & (hyperLearn == "MH")) {
# Between data allocation tally
nk_mat <- diag(nkknown[[1]]) + table(factor(alloctemp[[1]], levels = seq.int(K)),
factor(alloctemp[[2]], levels = seq.int(K)))
for (i in seq.int(object_cmb)) {
for(j in seq.int(K)) {
l <- ceiling(i/numRepl) # allocs don't change across replicates
Y <- makeComponent(X = exprsKnown[[i]],
BX = allocKnown[[l]],
Y = exprsUnknown_cmb[[i]],
BY = - alloctemp[[l]] * 0,
j = j)
hyperstoupdate <- c(exp(.hypers[[i]][j, seq.int(2)]), exp(2 * .hypers[[i]][j,3]))
.pc_prior <- pcPrior[j, ]
newhypers <- metropolisGPmatern(inith = hyperstoupdate,
X = t(Y),
tau = seq.int(D),
nk = nk[[l]][j],
D = D,
niter = 1,
nu = nu,
hyppar = .pc_prior,
propSd = propSd)$h[ ,2]
componenthypers[[i]][[j]] <- c(log(newhypers[seq.int(2)]), log(newhypers[3])/2)
}
# stores current hyperparameters invisibily
.hypers[[i]] <- matrix(unlist(componenthypers[[i]]), ncol = 3, byrow = TRUE)
}
}
}
## Only store iterations that are going to be retained
if(t %in% toRetain) {
s <- which(t == toRetain) # index of variable to save
for(i in seq.int(object_cmb)) {
hypers[[i]][[s]] <- .hypers[[i]]
}
weights[, , s] <- currentweights
for (i in seq.int(numCond)) {
alloc[[i]][, s] <- alloctemp[[i]]
allocOut[[i]][, s] <- outlier[[i]]
if (i == 1) {
allocprob[[i]][, s, ] <- resAllocCond1$allocprobtemp
} else {
allocprob[[i]][, s, ] <- resAllocCond2$allocprobtemp
}
}
epsilons[, s] <- epsilon
}
}
## construct objects
nicheParam <- list()
.niche <- alloc
.nicheProb <- allocprob
.outlier <- allocOut
## name objects
rownames(hypers[[1]][[1]]) <- getMarkerClasses(object = objectCond1[[1]], fcol = fcol)
colnames(hypers[[1]][[1]]) <- c("length-scale", "amplitude", "sigma")
for (i in seq.int(numCond)){
for (j in seq.int(numRepl)){
if (i == 1){
nicheParam[[j]] <- .nicheParam(dataset = "control",
replicate = as.integer(j),
K = K,
D = D,
method = "bandle",
params = hypers[[j]])
} else{
nicheParam[[numRepl + j]] <- .nicheParam(dataset = "treatment",
replicate = j,
K = K,
D = D,
method = "bandle",
params = hypers[[numRepl + j]])
}
}
}
# set correct dimension names
dimnames(weights)[[1]] <- dimnames(weights)[[2]] <- getMarkerClasses(object = objectCond1[[1]],
fcol = fcol)
rownames(epsilons) <- c("Dataset 1", "Dataset 2")
.niche <- lapply(.niche, function(x){ rownames(x) <- rownames(unknown_cmb[[1]]); x})
.nicheProb <- lapply(.nicheProb, function(x) {
dimnames(x)[[1]] <- rownames(unknown_cmb[[1]]); x})
.nicheProb <- lapply(.nicheProb, function(x) {
dimnames(x)[[3]] <- getMarkerClasses(object = objectCond1[[1]],
fcol = fcol); x})
.outlier <- lapply(.outlier, function(x){ rownames(x) <- rownames(unknown_cmb[[1]]); x})
## construct bandleChains object
.out <- .bandleChain(dataset = "bandleExperiment",
replicates = length(object_cmb),
n = length(toRetain),
K = K,
N = numProtein[[1]],
weights = weights,
epsilons = epsilons,
niche = .niche,
nicheProb = .nicheProb,
outlier = .outlier,
nicheParams = .nicheParams(params = nicheParam))
return(.out)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.