R/bandle-sampler.R

Defines functions diffLoc

Documented in diffLoc

##' These functions implement the bandle model for dynamic mass spectrometry based
##' spatial proteomics datasets using MCMC for inference, this is an internal sampling
##' function
##' 
##' The `diffloc` function generate the sample from the posterior distributions
##' (object or class `bandleParam`) based on an annotated quantitative spatial
##' proteomics datasets (object of class [`MSnbase::MSnSet`]). Both are then
##' passed to the `bandlePredict` function to predict the sub-cellular localisation
##' and compute the differential localisation probability of proteins. See the
##' vignette for examples
##' 
##' @title Differential localisation experiments using the bandle method
##' @param objectCond1 A list of [`MSnbase::MSnSet`]s where each is an experimental
##' replicate for the first condition, usually a control
##' @param objectCond2 A list of [`MSnbase::MSnSet`]s where each is an experimental
##' replicate for the second condition, usually a treatment
##' @param fcol The feature meta-data containing marker definitions. Default is
##' `markers`
##' @param hyperLearn Algorithm to learn posterior hyperparameters of the Gaussian
##' processes. Default is `LBFGS` and `MH` for metropolis-hastings is also implemented. 
##' @param numIter The number of iterations of the MCMC
##'     algorithm. Default is 1000. Though usually much larger numbers are used
##' @param burnin The number of samples to be discarded from the
##'     begining of the chain. Default is 100.
##' @param thin The thinning frequency to be applied to the MCMC
##'     chain.  Default is 5.
##' @param u The prior shape parameter for Beta(u, v). Default is 2
##' @param v The prior shape parameter for Beta(u, v). Default is 10.
##' @param lambda Controls the variance of the outlier component. Default is 1.
##' @param gpParams Object of class `gpParams`. parameters from prior fitting of GPs
##' to each niche to accelerate inference. Default is NULL.
##' @param hyperIter The frequency of MCMC interation to update the hyper-parameters
##' default is 20
##' @param hyperMean The prior mean of the log normal prior of the GP parameters.
##' Default is 0 for each. Order is length-scale, amplitude and noise variance
##' @param hyperSd The prior standard deviation of the log normal prior fo the GP
##' parameters. Default is 1 for each. Order is length-scale, ampliture and noise
##'  variance.
##' @param seed The random number seed.
##' @param pg `logical` indicating whether to use polya-gamma prior. Default is
##'  `FALSE`.
##' @param pgPrior A matrix generated by pgPrior function. If param pg is TRUE
##' but pgPrior is NULL then a pgPrior is generated on the fly.
##' @param tau The tau parameter for the polya-Gamma prior (is used). Defaults
##' to 0.2
##' @param dirPrior A matrix generated by dirPrior function. Default is NULL
##'  and dirPrior is generated on the fly.
##' @param maternCov `logical` indicated whether to use a matern or gaussian
##' covariance. Default is True and matern covariance is used
##' @param PC `logical` indicating whether to use a penalised complexity prior.
##' Default is TRUE.
##' @param nu `integer` indicating the smoothness of the matern prior. Default
##' is 2.
##' @param pcPrior `matrix` with 3 columns indicating the lambda paramters for the
##' penalised complexity prior. Default is null which internally sets
##' the penalised complexity prior to `c(0.5, 3, 100)` for each organelle and the order is 
##' length-scale, amplitude and variance. See vignette for more details.
##' @param propSd If MH is used to learn posterior hyperparameters then the proposal
##' standard deviations. A Gaussian random-walk proposal is used.
##' @param numChains `integer` indicating the number of parallel chains to run.
##' Defaults to 4.
##' @return  `bandle` returns an instance of class `bandleParams`
##' @md
##' @examples
##' library(pRolocdata)
##' data("tan2009r1")
##' set.seed(1)
##' tansim <- sim_dynamic(object = tan2009r1, 
##'                     numRep = 6L,
##'                    numDyn = 100L)
##' gpParams <- lapply(tansim$lopitrep, 
##' function(x) fitGPmaternPC(x, hyppar = matrix(c(0.5, 1, 100), nrow = 1)))
##' d1 <- tansim$lopitrep
##' control1 <- d1[1:3]
##' treatment1 <- d1[4:6]
##' mcmc1 <- diffLoc(objectCond1 = control1, objectCond2 = treatment1, gpParams = gpParams,
##'                                      fcol = "markers", numIter = 5L, burnin = 1L, thin = 2L)
##' 
##' @rdname bandle
diffLoc <- function(objectCond1,
                    objectCond2,
                    fcol = "markers",
                    hyperLearn = "MH",
                    numIter = 1000,
                    burnin = 100L,
                    thin = 5L,
                    u = 2,
                    v = 10,
                    lambda = 1,
                    gpParams = NULL,
                    hyperIter = 20,
                    hyperMean = c(0, 0, 0),
                    hyperSd = c(1, 1, 1),
                    seed = NULL,
                    pg = TRUE,
                    pgPrior = NULL,
                    tau = 0.2,
                    dirPrior = NULL,
                    maternCov = TRUE,
                    PC = TRUE,
                    nu = 2,
                    pcPrior = NULL,
                    propSd = c(0.3, 0.1, 0.05)){
    
    # Checks
    stopifnot(exprs = {
              "ObjectCond1 must be an MSnSet"=is(objectCond1[[1]], "MSnSet")
              "ObjectCond2 must be an MSnSet"=is(objectCond2[[1]], "MSnSet")
              "hyperLearn must be either MH or LBFGS"=hyperLearn %in% c("MH", "LBFGS")
              "numIter must be a numeric"=is(numIter, "numeric")
              "burnin must be an integer"=is(burnin, "integer")
              "thin must be an integer"=is(thin, "integer")
              "burnin must be less than numIter"= burnin < numIter
              "u must be numeric"=is(u, "numeric")
              "v must be numeric"=is(v, "numeric")
              "lambda must be numeric"=is(lambda, "numeric")
              "hyperIter must be numeric"=is(hyperIter, "numeric")
              "hyperMean must be numeric"=is(hyperMean, "numeric")
              "hyperSd must be numeric"=is(hyperSd, "numeric")
              "must provide 3 values for hyperMean"=length(hyperMean) == 3
              "must provide 3 values for hyperSd" = length(hyperSd) == 3
              "tau must be numeric" = is(tau, "numeric")
              "nu must be numeric" = is(nu, "numeric")
              "propSd must be numeric"=is(propSd, "numeric")
              "Must provide 3 values for propSd"=length(propSd) == 3})
    
    # This is required because `biocparrallel` is currently not exporting 
    # exprs from Biobase when using `SnowParams()`. This generates a 
    # warning in the build that is currently unavoidable. Removeal will
    # cause a failure on windows machines. 
    suppressMessages(require(Biobase))
    
    
    # Setting seed manually
    if (is.null(seed)) {
        message("You haven't provided a seed, you may wish to provide a seed")
    }

    # Elementary checks
    stopifnot(length(hyperMean)==3)
    stopifnot(length(hyperSd)==3)
    stopifnot(sum(hyperSd > 0)==3)
    
    ## Samples to be retained as a result of burnin and thinning
    toRetain <- seq.int(burnin + 1L, numIter, thin)
    numRetained <- length(toRetain)
    
    # Normalizing data
    normCond1 <- lapply(objectCond1,  function(x) normalise(x, method = "center.mean"))
    normCond2 <- lapply(objectCond2,  function(x) normalise(x, method = "center.mean"))
    
    # Bring data together
    object_cmb <- c(cond1 = normCond1, cond2 = normCond2)
    
    # numCond should be 2
    numCond <- 2
    numRepl <- length(objectCond1)
    

    # expressions from data and important summaries
    exprs_cmb <- lapply(object_cmb, Biobase::exprs)
    M <- lapply(exprs_cmb, colMeans)
    V <- lapply(exprs_cmb, function(x) lambda * diag(diag(cov(x))) + diag(rep(10^{-6}, ncol(x))))
    
    
    # Getting data dimensions
    D <- ncol(object_cmb[[1]])
    K <- length(getMarkerClasses(object_cmb[[1]], fcol = fcol))
    
    # Set default pc priors
    if(is.null(pcPrior)){
        pcPrior <- matrix(rep(c(0.5, 3, 100),
                   each = K), ncol = 3)
    }
    
    # construct empirical Bayes Polya-Gamma prior
    if (is.null(pgPrior)) {
        pgPrior <- pg_prior(normCond1, normCond2, K = K, pgPrior = NULL, fcol = fcol)
    }
    
    # Fit GPs to markers
    componenthypers <- lapply(object_cmb, function(x) vector(mode = "list", K))
    if (maternCov == FALSE) {
        res <- lapply(object_cmb, function(x) fitGP(x, fcol = fcol))
    } else if ((maternCov == TRUE) & (PC = FALSE) & (is.null(gpParams))) {
        res <- lapply(object_cmb, function(x) fitGPmatern(x, fcol = fcol, nu = nu))
    } else if ((maternCov == TRUE) & (PC = TRUE) & (is.null(gpParams))) {
        res <- lapply(object_cmb, function(x) fitGPmaternPC(x, fcol = fcol, nu = nu, hyppar = pcPrior))  
    } else {
        res <- gpParams
    }
    
    # check gpParams and convert to a list from gpParams object
    stopifnot("The parameters are not an instance of class gpParams"= 
                all(vapply(res, is, "gpParams") == "gpParams"))
    ## easiest thing here is to convert from gpParams object to a list for bandle
    ## Note: we can amend bandle to work with gpParams objects at a later date if needed
    # res <-  lapply(res, function(z) list(M = z@M, sigma = z@sigma, params = z@params))
    
    
    # separate data into known and unknown
    unknown_cmb <- lapply(object_cmb, function(x) unknownMSnSet(x, fcol = fcol))
    exprsUnknown_cmb <- lapply(unknown_cmb, Biobase::exprs)
    exprsKnown <- lapply(object_cmb, function(x) Biobase::exprs(markerMSnSet(x, fcol = fcol)))
    
    
    # separate conditions allocations
    allocKnown <- lapply(object_cmb[c(1, numRepl + 1)], function(x) seq.int(K)[fData(markerMSnSet(x, fcol = fcol))[, fcol]])
    numProtein <- lapply(unknown_cmb[c(1, numRepl + 1)], nrow)
    

    # Some storage
    alloc <- lapply(numProtein, function(x) matrix(0, nrow = x, ncol = numRetained))
    allocOut <- lapply(numProtein, function(x) matrix(0, nrow = x, ncol = numRetained))
    outlierprob <- lapply(numProtein, function(x) matrix(0, nrow = x, ncol = numRetained))
    weights <- array(0, c(K, K, numRetained))
    epsilons <- matrix(0, nrow = 2, ncol = numRetained)
    allocprob <- lapply(numProtein, function(x) array(0, c(x, numRetained, K)))
    loglikelihoods <- lapply(rep(numProtein, numRepl), function(x) matrix(0, nrow = x, ncol = K))
    
    #random allocation of unknown Proteins, allocations are condition specific
    alloctemp <- lapply(numProtein, function(x) sample.int(n = K, size = x, replace = TRUE))
    for (i in seq.int(numCond)) {
        object_cmb[[i]] <- pRoloc::knnClassification(object_cmb[[i]], k = 10, fcol = fcol)
        alloc[[i]][, 1] <- fData(object_cmb[[i]][rownames(unknown_cmb[[i]])])$knn
    }
    # number of proteins allocated to each component
    nkknown <- lapply(object_cmb[c(1, numRepl + 1)], function(x)
        table(getMarkers(x, verbose = FALSE, fcol = fcol))[getMarkerClasses(x, fcol = fcol)])
    
    #number initial allocated to outlier component
    outlier <- vector(mode = "list", length = 2)
    for (i in seq.int(numCond)) {
        outlier[[i]] <- allocOut[[i]][,1]
    }
    
    sampleGPMean <- lapply(object_cmb, function(x) vector(mode = "list", length = K))
    centereddata <- lapply(object_cmb, function(x) vector(mode = "list", length = K))
    hypers <- lapply(object_cmb, function(x) vector(mode = "list", length = numRetained))
    .hypers <- vector(mode = "list", length = length(object_cmb))
    
    for (i in seq.int(object_cmb)) {
        hypers[[i]][[1]] <- res[[i]]@params
        .hypers[[i]] <- hypers[[i]][[1]]
    }
    # intialise polya-gamma auxiliary variables
    w <- rep(1, K^2)
    
    # progress bar
    pb <- txtProgressBar(min = 0,
                         max = numIter,
                         style = 3)
    ._t <- 0
    
    for(t in seq.int(2, numIter)){
    
          
        # Between data allocation tally
        nk_mat <- diag(nkknown[[1]]) + table(factor(alloctemp[[1]], levels = seq.int(K)),
                                             factor(alloctemp[[2]], levels = seq.int(K)))
        
        # Within data allocation tally
        nk <- list(cond1 = rowSums(nk_mat), cond2 = colSums(nk_mat))
        
        if((t %% 1) ==0){
            setTxtProgressBar(pb, ._t)
            ._t <- ._t + 1
        }
        
        gpnk_cond1 <- gpnk_cond2 <- vector(mode = "numeric", length = K)
        gpnk <- list(gpnk_cond1 = gpnk_cond1, gpnk_cond2 = gpnk_cond2)
        
        for (i in seq.int(object_cmb)) {
            
            j <- ceiling(i/numRepl) # allocs don't change across replicates
            for (l in seq.int(K)) {
                gpnk[[j]][l] <- sum(alloctemp[[j]]*outlier[[j]] == l) + nkknown[[1]][l]
            }
            if (maternCov == FALSE) {
                centereddata[[i]] <- centeredData(Xknown = exprsKnown[[i]],
                                                  BX = allocKnown[[j]],
                                                  Xunknown = exprsUnknown_cmb[[i]], 
                                                  BXun = alloctemp[[j]]*outlier[[j]],
                                                  hypers = .hypers[[i]], 
                                                  nk = gpnk[[j]], tau = seq.int(D), D = D, K)
            } else if (maternCov == TRUE) {  
                centereddata[[i]] <- centeredDatamatern(Xknown = exprsKnown[[i]],
                                                        BX = allocKnown[[j]],
                                                        Xunknown = exprsUnknown_cmb[[i]], 
                                                        BXun = alloctemp[[j]]*outlier[[j]],
                                                        hypers = .hypers[[i]], 
                                                        nk = gpnk[[j]], tau = seq.int(D), D = D, K, nu = nu)
            }
        }
        
        # Polya-Gamma Sampler or Dirichlet weights
        if (pg == TRUE) {
            pgres <- sample_weights_pg(nk_mat = nk_mat, pgPrior = pgPrior, K = K, w = w, tau = tau)
            currentweights <- pgres$currentweights
            w <- pgres$w
        } else {
            if (is.null(dirPrior)){
                dirPrior <- diag(rep(1, K)) + matrix(0.05, nrow = K, ncol = K)
            }
            currentweights <- sample_weights_dir(nk_mat = nk_mat, dirPrior = dirPrior)
        }
        
        currentweights <- matrix(currentweights, ncol = K, nrow = K)

        #extract noise component from hypers
        sigmak <- lapply(hypers, function(x) exp(2 * x[[1]][, 3])) # fixed for the moment
        sigmasqrt <- lapply(sigmak, sqrt)
        loglikelihoods <- comploglikelist(centereddata, sigmasqrt)
        
        resAllocCond1 <- proteinAllocation(loglikelihoods = loglikelihoods[seq.int(numRepl)],
                                           currentweights = currentweights,
                                           alloctemp = alloctemp,
                                           cond = 1)
        resAllocCond2 <- proteinAllocation(loglikelihoods = loglikelihoods[seq.int(1+numRepl, numRepl*numCond)],
                                           currentweights = currentweights,
                                           alloctemp = alloctemp,
                                           cond = 2)
        
        alloctemp <- list(resAllocCond1$alloctemp, resAllocCond2$alloctemp)
        loglikelihoods_cond1 <- resAllocCond1$loglikelihoods_comb
        loglikelihoods_cond2 <- resAllocCond2$loglikelihoods_comb
        
        # sample epsilon
        tau1 <- lapply(outlier, function(x) sum(x == 1) + nrow(exprsKnown[[1]]))
        tau2 <- lapply(outlier, function(x) sum(x == 0))
        epsilon <- rbeta(n = 2, shape1 = u + unlist(tau2), shape2 = v + unlist(tau1))
        
        
        allocnotOutprob <- vector(mode = "list")
        allocOutprob <- vector(mode = "list")
        # Sample outlier allocations first condition 1 then condition 2
        # outlier likelihood
        outlierlikelihood <- vector(mode = "list")
        for (i in seq.int(exprsUnknown_cmb)) {
            outlierlikelihood[[i]] <- dmvtCpp(exprsUnknown_cmb[[i]],
                                              mu_ = M[[i]],
                                              sigma_ = V[[i]],
                                              df_ = 4,
                                              log_ = TRUE,
                                              isChol_ = FALSE)
        }
        
        resOutCond1 <- outlierAllocationProbs(outlierlikelihood = outlierlikelihood[seq.int(1, numRepl)],
                                              loglikelihoods = loglikelihoods_cond1,
                                              epsilon = epsilon,
                                              alloctemp = alloctemp,
                                              cond = 1)
        resOutCond2 <- outlierAllocationProbs(outlierlikelihood = outlierlikelihood[seq.int((1+numRepl),(numRepl*numCond))],
                                              loglikelihoods = loglikelihoods_cond2,
                                              epsilon = epsilon,
                                              alloctemp = alloctemp,
                                              cond = 2)
        
        allocnotOutprob[[1]] <- resOutCond1[[1]]
        allocOutprob[[1]] <- resOutCond1[[2]]
        
        allocnotOutprob[[2]] <- resOutCond2[[1]]
        allocOutprob[[2]] <- resOutCond2[[2]]
        
        
        # Sample outliers
        allocoutlierprob <- cbind(allocnotOutprob[[1]], allocOutprob[[1]])
        outlier[[1]] <- sampleOutlier(allocoutlierprob = allocoutlierprob)
        
        allocoutlierprob <- cbind(allocnotOutprob[[2]], allocOutprob[[2]])
        outlier[[2]] <- sampleOutlier(allocoutlierprob = allocoutlierprob)
        
        #update hypers
        if((t %% hyperIter) == 0){
            if ( isFALSE(maternCov)) {  
                if(hyperLearn == "LBFGS"){
                }else if(hyperLearn == "MH"){
                    # Between data allocation tally
                    nk_mat <- diag(nkknown[[1]]) + table(factor(alloctemp[[1]], levels = seq.int(K)),
                                                         factor(alloctemp[[2]], levels = seq.int(K)))
                    
                    # Within data allocation tally
                    nk <- list(cond1 = rowSums(nk_mat), cond2 = colSums(nk_mat)) 
                    
                    for (i in seq.int(object_cmb)) {
                        for(j in seq.int(K)){
                            
                            l <- ceiling(i/numRepl) # allocs don't change across replicates
                            Y <- makeComponent(X = exprsKnown[[i]],
                                               BX = allocKnown[[l]],
                                               Y = exprsUnknown_cmb[[i]],
                                               BY = alloctemp[[l]] * outlier[[l]],
                                               j = j)
                            componenthypers[[i]][[j]] <- metropolisGP(inith = .hypers[[i]][j,],
                                                                      X = t(Y),
                                                                      tau = seq.int(D), nk = nk[[l]][j],
                                                                      D = D,
                                                                      niter = 1,
                                                                      hyperMean = hyperMean,
                                                                      hyperSd = hyperSd)$h[ ,2]
                            
                            
                        }
                        # stores current hyperparameters invisibly
                        .hypers[[i]] <- matrix(unlist(componenthypers[[i]]), ncol = 3, byrow = TRUE)
                    }  
                }
            } else if (isTRUE(maternCov) & (hyperLearn == "MH")) {
                # Between data allocation tally
                nk_mat <- diag(nkknown[[1]]) + table(factor(alloctemp[[1]], levels = seq.int(K)),
                                                     factor(alloctemp[[2]], levels = seq.int(K)))
                
                for (i in seq.int(object_cmb)) {
                    for(j in seq.int(K)) {
                        l <- ceiling(i/numRepl) # allocs don't change across replicates
                        Y <- makeComponent(X = exprsKnown[[i]],
                                           BX = allocKnown[[l]],
                                           Y = exprsUnknown_cmb[[i]],
                                           BY = - alloctemp[[l]] * 0,
                                           j = j)
                        
                        hyperstoupdate <- c(exp(.hypers[[i]][j, seq.int(2)]), exp(2 * .hypers[[i]][j,3]))
                        
                        .pc_prior <- pcPrior[j, ]
                        newhypers <- metropolisGPmatern(inith = hyperstoupdate,
                                                        X = t(Y),
                                                        tau = seq.int(D),
                                                        nk = nk[[l]][j],
                                                        D = D,
                                                        niter = 1,
                                                        nu = nu,
                                                        hyppar = .pc_prior,
                                                        propSd = propSd)$h[ ,2]
                        
                        componenthypers[[i]][[j]] <- c(log(newhypers[seq.int(2)]), log(newhypers[3])/2)
                    }
                    # stores current hyperparameters invisibily
                    .hypers[[i]] <- matrix(unlist(componenthypers[[i]]), ncol = 3, byrow = TRUE)
                    
                }
            }
        }  
        
        ## Only store iterations that are going to be retained
        if(t %in% toRetain) {
            
            s <- which(t == toRetain) # index of variable to save
            for(i in seq.int(object_cmb)) {
                hypers[[i]][[s]] <- .hypers[[i]]
            }  
            weights[, , s] <- currentweights
            for (i in seq.int(numCond)) {
                alloc[[i]][, s] <- alloctemp[[i]]
                allocOut[[i]][, s] <- outlier[[i]]
                if (i == 1) {
                    allocprob[[i]][, s, ] <- resAllocCond1$allocprobtemp
                } else {
                    allocprob[[i]][, s, ] <- resAllocCond2$allocprobtemp
                }
            }  
            epsilons[, s] <- epsilon
        }  
    }
    
    
    ## construct objects
    nicheParam <- list()
    .niche <- alloc
    .nicheProb <- allocprob
    .outlier <- allocOut
    
    ## name objects
    rownames(hypers[[1]][[1]]) <- getMarkerClasses(object = objectCond1[[1]], fcol = fcol)
    colnames(hypers[[1]][[1]]) <- c("length-scale", "amplitude", "sigma")
    
    for (i in seq.int(numCond)){

        for (j in seq.int(numRepl)){
            if (i == 1){
                nicheParam[[j]] <- .nicheParam(dataset = "control",
                                               replicate = as.integer(j),
                                               K = K,
                                               D = D, 
                                               method = "bandle",
                                               params = hypers[[j]]) 
            } else{
                nicheParam[[numRepl + j]] <- .nicheParam(dataset = "treatment",
                                                         replicate = j,
                                                         K = K,
                                                         D = D, 
                                                         method = "bandle",
                                                         params = hypers[[numRepl + j]])     
            }    
        }
    }
    
    # set correct dimension names
    dimnames(weights)[[1]] <- dimnames(weights)[[2]] <- getMarkerClasses(object = objectCond1[[1]],
                                                                         fcol = fcol)
    rownames(epsilons) <- c("Dataset 1", "Dataset 2")
    .niche <-  lapply(.niche, function(x){ rownames(x) <- rownames(unknown_cmb[[1]]); x})
    .nicheProb <- lapply(.nicheProb, function(x) {
                                        dimnames(x)[[1]] <- rownames(unknown_cmb[[1]]); x})
    .nicheProb <- lapply(.nicheProb, function(x) {
        dimnames(x)[[3]] <- getMarkerClasses(object = objectCond1[[1]],
                                             fcol = fcol); x})
    .outlier <- lapply(.outlier, function(x){ rownames(x) <- rownames(unknown_cmb[[1]]); x})
    
    ## construct bandleChains object
    .out <- .bandleChain(dataset = "bandleExperiment",
                         replicates = length(object_cmb),
                         n = length(toRetain),
                         K = K,
                         N = numProtein[[1]], 
                         weights = weights,
                         epsilons = epsilons,
                         niche = .niche,
                         nicheProb = .nicheProb,
                         outlier = .outlier,
                         nicheParams = .nicheParams(params = nicheParam))
    
    return(.out)
    
}
ococrook/bandle documentation built on July 13, 2022, 5:54 a.m.