R/test_ind_origin.R

#' Test of independent origin hypothesis for a phylogenetic model with a binary trait.
#'
#' Runs a Gibbs sampler for a two-state CMTC 
#' on a set of phylogenetic trees, with data 
#' at the tips of the trees. Uses MCMC output to 
#' compute Bayes factor of the hypothesis N_{0->1} <= k
#' 
#' @param inputTrees \code{multiPhylo} object containing a list of trees.
#' @param rootDist Numeric vector of size two with probabilities of the 
#' two states at the root of the tree.
#' @param traitData Vector of trait values, coded as 0 and 1.
#' @param initLambda01 Initial value of the 0 -> 1 rate.
#' @param initLambda10 Initial value of the 1 -> 0 rate.
#' @param priorAlpha01 Shape parameter of the Gamma prior on the 0 -> 1 rate.
#' @param priorBeta01 Rate parameter of the Gamma prior on the 0 -> 1 rate.
#' @param priorAlpha10 Shape parameter of the Gamma prior on the 1 -> 0 rate.
#' @param priorBeta10 Rate parameter of the Gamma prior on the 0 -> 1 rate.
#' @param mcmcSize Total number of MCMC iterations. 
#' @param mcmcBurnin Number of initial iterations to ignore.
#' @param mcmcSubsample Integer controlling how many MCMC iterations to save. 
#' For example, \code{mcmcSubsample = 10} saves every tenth iteration after ignoreing the first \code{mcmcBurnin} iterations.
#' @param testThreshold Threshold k in the independent origin hypotheseis N_{0->1} <= k
#'
#' @return List with two elements: the input trees and a matrix of MCMC output.
#' The columns of MCMC output matrix are Iteration number, Index of a tree 
#' that was sampled at this iteration, Log posterior, 
#' 0 -> 1 rate, 1 -> 0 rate, Number of 0 -> 1 jumps, Number of 1 -> 0 jumps,
#' Time spent in state 0, Time spent in state 1

testIndOrigin <- function(inputTrees, rootDist=c(0,1), traitData, initLambda01, initLambda10, priorAlpha01, priorBeta01, priorAlpha10, priorBeta10, mcmcSize, mcmcBurnin, mcmcSubsample, mcSize, testThreshold=0){
  
  
  ## Prepare trees and trait data  for Rcpp code
  cat("pre-processing trees and trait data", "\n")
  
  if (prod(traitData == 0 | traitData == 1 | traitData == -1) != 1)
    stop("Error: \"traitData\" must be a vector with entries equal to 0 or 1 or -1")
  if (!("multiPhylo" %in% class(inputTrees)))
    stop("Error: object \"inputTrees\" is not of class \"multiPhylo\"")
  
  treeNum = length(inputTrees)
  
  ## Allocate memory for an array to hold tree edge matrices
  treeEdges = array(0, dim=c(dim(inputTrees[[1]]$edge),treeNum))
  
  ## Allocate memory for a matrix to hold branch lengths
  treeBranchLengths = matrix(0, dim(inputTrees[[1]]$edge)[1], treeNum)
  
  ## Allocate memory for a matrix to hold data vectors (reordered for each tree)
  treeTraits = matrix(0, length(inputTrees[[1]]$tip.label), treeNum)
  
  for (i in 1:treeNum){
    
    if (is.null(inputTrees[[i]]$edge.length))
      stop("Error: All input trees must have branch lengths")
    
    if (!is.rooted(inputTrees[[i]]))
      stop("Error: All input trees must be rooted")
    
    ## reorder the edges in the "pruningwise" order
    tempTree = reorder(inputTrees[[i]], order = "pr")
    
    treeEdges[,,i] = tempTree$edge
    treeBranchLengths[,i] = tempTree$edge.length
    treeTraits[,i] = traitData
    
    ## reorder data on tips to match the order of the my.tree phylo object
    if (!is.null(names(traitData))){
      if(!any(is.na(match(names(traitData), tempTree$tip.label)))){
        treeTraits[,i] = traitData[tempTree$tip.label]
      }else{
        warning('the names of argument "inputTrees" and the names of the tip labels
did not match: the former were ignored in the analysis.')
      }
    }        
  }
        
  ## Run Gibbs sampler that iterates between drawing from the full conditional of missing data
  ## and drawing from the full conditional of model parameters (rates 0->1 and 1->0)
  
  cat("running Gibbs sampler", "\n")
  
  mcmcOut = twoStatePhyloGibbsSampler(treeEdges, dim(treeEdges), treeBranchLengths, rootDist, treeTraits, initLambda01, initLambda10, priorAlpha01, priorBeta01, priorAlpha10,
                                priorBeta10, mcmcSize, mcmcBurnin, mcmcSubsample)

  colnames(mcmcOut) = c("iter", "treeIndex", "logPost", "lambda01", "lambda10", "n01", "n10", "t0", "t1")
  
  cat("Computing posterior probabilities", "\n")
  
  ## compute the posterior probability Pr(N_01 <= testThreshold | data) 
  postProbs = numeric(dim(mcmcOut)[1])
  
  for (i in 1:length(postProbs)){
    postProbs[i] = sum(exp(treeConvolveTest(inputTrees[[mcmcOut[i,"treeIndex"]]], treeTraits[,mcmcOut[i,"treeIndex"]], mcmcOut[i,"lambda01"], mcmcOut[i,"lambda10"], rootDist[1], testThreshold)[,"posterior"]))
  }
  
  postProbEst = mean(postProbs)
  
  cat("Computing prior probabilities", "\n")  
  
  ## Produce a sample from the priors of lambda01 and lambda10
  priorSampleLambda01 = rgamma(mcSize, shape=priorAlpha01, rate=priorBeta01)
  priorSampleLambda10 = rgamma(mcSize, shape=priorAlpha10, rate=priorBeta10)
  
  ## compute the prior probability Pr(N_01 <= testThreshold) 
  priorProbs = numeric(mcSize)
  
  for (i in 1:mcSize){
    sampledTreeIndex = sample(c(1:treeNum),1)
    priorProbs[i] = sum(exp(treeConvolveTest(inputTrees[[sampledTreeIndex]], treeTraits[,sampledTreeIndex], priorSampleLambda01[i], priorSampleLambda10[i], rootDist[1], testThreshold)[,"prior"]))
  }
  
  priorProbEst = mean(priorProbs)
  
  bfVector = numeric(5)
  names(bfVector) = c(paste("Pr(N01<=",as.character(testThreshold),")",sep=""), paste("Pr(N01<=",as.character(testThreshold),"|data)",sep=""), paste("BF for N01<=",as.character(testThreshold),sep=""), "log10(BF)", "2xlog_e(BF)")
  bfVector[1] = priorProbEst
  bfVector[2] = postProbEst
  bfVector[3] = (postProbEst/(1-postProbEst))/(priorProbEst/(1-priorProbEst))
  bfVector[4] = log10(postProbEst) - log10(1-postProbEst) - log10(priorProbEst) + log10(1-priorProbEst)
  bfVector[5] = 2*(log(postProbEst) - log(1-postProbEst) - log(priorProbEst) + log(1-priorProbEst))  
  
  returnList = list(treeList=inputTrees, mcmcOutput=mcmcOut, priorJumpProbs=priorProbs,
                    postJumpProbs=postProbs, probsAndBFs=bfVector)
  
  class(returnList) = "indorigin"
  
  return(returnList)
}

getBF = function(indOriginResults){
  if (!("indorigin" %in% class(indOriginResults)))
    stop("Error: object \"indOriginResults\" is not of class \"indorigin\"")
  
  return(indOriginResults$probsAndBFs[3:5])
}

getPriorProb = function(indOriginResults){
  if (!("indorigin" %in% class(indOriginResults)))
    stop("Error: object \"indOriginResults\" is not of class \"indorigin\"")
  
  returnVector = numeric(3)
  names(returnVector) = c("PriorProbEst", "PriorProbSd", "PriorProbConfInt")
  
  est = indOriginResults$probsAndBFs[1]
  names(est) = NULL
  mcSd = sd(indOriginResults$priorJumpProbs)/sqrt(length(indOriginResults$priorJumpProbs))
  confInt = c(est-1.96*mcSd,est+1.96*mcSd)
  
  return(list(PriorProbEst=est, PriorProbSd=mcSd, PriorProbConfInt=confInt))
}

getPostProb = function(indOriginResults){
  if (!("indorigin" %in% class(indOriginResults)))
    stop("Error: object \"indOriginResults\" is not of class \"indorigin\"")
  
  est = indOriginResults$probsAndBFs[2]
  names(est) = NULL
  mcmcPostJumpProbs = coda::as.mcmc(indOriginResults$postJumpProbs)
  
  mcSd = sd(indOriginResults$postJumpProbs)/sqrt(coda::effectiveSize(mcmcPostJumpProbs))
  names(mcSd) = NULL
  confInt = c(est-1.96*mcSd,est+1.96*mcSd)
  names(confInt) = c("2.5%", "97.5%")
  
  return(list(PostProbEst=est, PostProbSd=mcSd, PostProbConfInt=confInt))
}

plotPosterior = function(indOriginResults, trace=FALSE){
  if (!("indorigin" %in% class(indOriginResults)))
    stop("Error: object \"indOriginResults\" is not of class \"indorigin\"")
  
mcmcTable = coda::mcmc(indOriginResults$mcmcOutput[,c("lambda01", "lambda10", "n01", "n10")], thin=as.integer(dim(indOriginResults$mcmcOutput)[1]))
plot(mcmcTable, trace=trace)

}

AvereragePriorProbabilities = function(inputTrees, rootDist=c(0,1), traitData, initLambda01, initLambda10, testThreshold=0){
  
  ## Prepare trees and trait data  for Rcpp code
  cat("pre-processing trees and trait data", "\n")
  
  if (!("multiPhylo" %in% class(inputTrees)))
    stop("Error: object \"inputTrees\" is not of class \"multiPhylo\"")
  
  treeNum = length(inputTrees)
  
  ## Allocate memory for an array to hold tree edge matrices
  treeEdges = array(0, dim=c(dim(inputTrees[[1]]$edge),treeNum))
  
  ## Allocate memory for a matrix to hold branch lengths
  treeBranchLengths = matrix(0, dim(inputTrees[[1]]$edge)[1], treeNum)
  
  ## Allocate memory for a matrix to hold data vectors (reordered for each tree)
  treeTraits = matrix(0, length(inputTrees[[1]]$tip.label), treeNum)
  
  for (i in 1:treeNum){
    
    if (is.null(inputTrees[[i]]$edge.length))
      stop("Error: All input trees must have branch lengths")
    
    if (!is.rooted(inputTrees[[i]]))
      stop("Error: All input trees must be rooted")
    
    ## reorder the edges in the "pruningwise" order
    tempTree = reorder(inputTrees[[i]], order = "pr")
    
    treeEdges[,,i] = tempTree$edge
    treeBranchLengths[,i] = tempTree$edge.length
    treeTraits[,i] = traitData
    
    ## reorder data on tips to match the order of the my.tree phylo object
    if (!is.null(names(traitData))){
      if(!any(is.na(match(names(traitData), tempTree$tip.label)))){
        treeTraits[,i] = traitData[tempTree$tip.label]
      }else{
        warning('the names of argument "inputTrees" and the names of the tip labels
                did not match: the former were ignored in the analysis.')
      }
      }        
    }
  
  ## Run Gibbs sampler that iterates between drawing from the full conditional of missing data
  ## and drawing from the full conditional of model parameters (rates 0->1 and 1->0)
  
  cat("running Gibbs sampler", "\n")
  
  priorResults = AverageTreesConvolve(treeEdges, dim(treeEdges), treeBranchLengths, treeTraits, initLambda01, initLambda10, testThreshold, rootDist[1], "prior", 1)
  
  
  return(priorResults)
}
vnminin/indorigin documentation built on May 3, 2019, 6:17 p.m.