.StoufferCombination<-function(pvalue){
nan.pvalue = pvalue[!is.na(pvalue)]
n = sum(nan.pvalue<=1)
if(n>0){
z.stat = sum(qnorm(1-nan.pvalue))/sqrt(n)
return(1-pnorm(z.stat))
}else{
return(0)
}
}
.bottom.up.unweighted<-function(tree, pvalue, far = 0.10, tau = 0.3){## pvalue is a vector
## if a node A is detected later than another node B, A is less significant than B.
CompareTwoNodes<-function(a,b){
a = as.vector(a)
b = as.vector(b)
if(a[1] > b[1]){return(TRUE)
}
if(a[1] < b[1]){return(FALSE)
}
if(a[1] == b[1]){
if(a[2] > b[2]){return(TRUE)
}else{return(FALSE)
}
}
}
FindLeastSigNode<-function(x){ #x is an n*2 matrix
pos.min = 1
n = nrow(x)
if(n>1){
for(i in 2:n){
if(CompareTwoNodes(x[i,],x[pos.min,])){pos.min = i
}
}
}
return(pos.min)
}
n.node = nrow(tree@node)
max.depth = max(tree@node$level)
if(length(pvalue)!=sum(tree@node$level==1)){
stop("Incorrect dimension of p-values!")
}
if(!is.null(names(pvalue))){
level.one.id = tree@node$id[tree@node$level==1]
if(setequal(level.one.id, names(pvalue))){
pvalue = pvalue[level.one.id]
}else{
stop("Mis-matched symbols between p-values and tree")
}
}
trim.stat = matrix(NA, nrow = n.node, ncol = 2) # when a node is detected, keep the record
cum.rej = 0; last.cut.off = 0
sig.in = array(TRUE,n.node); sig.out = array(FALSE,n.node)
pvalue.in = pvalue.out = dec.level = array(NA, n.node); pvalue.in[1:length(pvalue)]= pvalue
q.spent = array(0, max.depth)
alpha.out = array(NA, max.depth)
nl.out = array(NA, max.depth)
for(i in 1:max.depth){
this.level.set = which(tree@node$level == i)
nl.out[i] = length(this.level.set)
q.this.level = far/n.node*nl.out[i] ### q-spending by proportion
q.spent[i] = q.this.level
this.level.tree = tree@edge[this.level.set]
# removing significant children from previous levels
if(i>1){
pre.sig.nodes = which(sig.in[1:(min(this.level.set)-1)])
this.level.tree = lapply(this.level.tree,function(x)return(setdiff(x,pre.sig.nodes)))
indices = (sapply(this.level.tree,length) > 0)
## swap with its least significant child node
na.nodes = this.level.set[!indices]
for(j in na.nodes){
j.child = tree@edge[[j]]
least.sig.child = j.child[FindLeastSigNode(trim.stat[j.child, ,drop=FALSE])]
sig.out[least.sig.child] = FALSE
dec.level[j] = dec.level[least.sig.child]
dec.level[least.sig.child] = NA
sig.out[j] = TRUE
pvalue.out[j] = pvalue.out[least.sig.child]
trim.stat[j,] = trim.stat[least.sig.child, ]
}
## finish the trimming step
this.level.set = this.level.set[indices]
this.level.tree = this.level.tree[indices]
}
if(length(this.level.set)>0){
if(i>1){
this.level.p = sapply(this.level.tree,function(x)return(.StoufferCombination(pvalue.in[x])))
}else{
this.level.p = pvalue
}
this.level.test.summary = .UnweightedStepdownControl(this.level.p, cum.rej, q.this.level, tau)
this.level.detect = this.level.test.summary$output
sig.in[this.level.set] = sig.out[this.level.set] = this.level.detect
pvalue.out[this.level.set] = this.level.p
dec.level[this.level.set] = i
trim.stat[this.level.set[this.level.detect],1] = i #detected at the i-th level
trim.stat[this.level.set,2] =this.level.p #p value when it is detected
cum.rej = cum.rej + sum(this.level.detect)
last.cut.off = this.level.test.summary$cut.off
alpha.out[i] = last.cut.off
if(sum(!this.level.detect)>0){
pvalue.in[this.level.set[!this.level.detect]]=
(this.level.p[!this.level.detect]-last.cut.off)/(1-last.cut.off)
}
}
}
tree@info$alpha = alpha.out
tree@info$nl = nl.out
tree@info$ql = q.spent
tree@test$reject = sig.out; tree@test$pvalue = pvalue.out; tree@node$dec.level = dec.level
tree@test$num = far
return(tree)
}
.bottom.up.weighted<-function(tree, pvalue, far = 0.10, tau = 0.3, mode = 'one.stage'){
get_weight<-function(hash.id, tree){## calculate the least favorable weights
n.node = nrow(tree@node)
weight = array(1, dim = n.node)
hash.xid = hash.id
subtree<-vector("list",n.node)
this = hash.id[1]
while(this <= n.node){
parent = tree@node$parent[this]
if(parent <= n.node & !parent %in% hash.id){
hash.id = c(hash.id, parent)
}
hash.id = hash.id[-1]
if(length(hash.id)==0){
break
}
if(parent <= n.node){
subtree[[parent]] = c(subtree[[parent]], this)
}
this = hash.id[1]
}
subtree = lapply(subtree, unique)
## compute least favorable weights
LFW<-vector("list",n.node); LFW[hash.xid] = 1
for(i in 1:n.node){
if(length(subtree[[i]]) > 0){
LFW[[i]] = sort(unlist(LFW[subtree[[i]]]), decreasing = TRUE)
LFW[[i]][1] = LFW[[i]][1] + 1
}
}
return(sort(LFW[[n.node]]))
}
n.node = nrow(tree@node)
max.depth = max(tree@node$level)
if(length(pvalue)!=sum(tree@node$level==1)){
stop("Incorrect dimension of p-values!")
}
if(!is.null(names(pvalue))){
level.one.id = tree@node$id[tree@node$level==1]
if(setequal(level.one.id, names(pvalue))){
pvalue = pvalue[level.one.id]
}else{
stop("Mis-matched symbols between p-values and tree")
}
}
q.spent = array(0, max.depth)
alpha.out = array(NA, max.depth)
nl.out = array(NA, max.depth)
if(mode == 'one.stage'){
for(i in 1:max.depth){
nl.out[i] = sum(tree@node$level == i)
q.spent[i] = far/n.node*nl.out[i]
}
}
if(mode == 'stageI.in.two.stage'){
q.spent[1] = far
nl.out = sapply(c(1:max.depth),function(x)return(sum(tree@node$level==x)))
}
if(mode == 'stageII.in.two.stage'){
if(max.depth<2){stop('Two stage analysis requires at least 2 levels.')}
n.node1 = n.node - length(which(tree@node$level == 1))
for(i in 2:max.depth){
q.spent[i] = far/n.node1*length(which(tree@node$level == i))
}
}
cum.rej = 0; last.cut.off = 0
sig.out = array(FALSE,n.node)
pvalue.in = pvalue.out = dec.level = array(NA, n.node); pvalue.in[1:length(pvalue)]= pvalue
for(i in 1:max.depth){
this.level.set = which(tree@node$level == i)
q.this.level = q.spent[i] ### by default q-spending by proportion
this.level.tree = tree@edge[this.level.set]
# removing significant children from previous levels
if(i>1){
## nodes that are significant before testing this level
pre.sig.nodes = which(sig.out[1:(min(this.level.set)-1)])
## nodes to be tested now
this.level.tree = lapply(this.level.tree,function(x)return(setdiff(x,pre.sig.nodes)))
ind = (sapply(this.level.tree,length) > 0)
above.level.set = which(tree@node$level >= i)
xind = (sapply(lapply(tree@edge[above.level.set],function(x)return(setdiff(x,pre.sig.nodes))),length)>0)
xind = (xind | sig.out[above.level.set])
na.nodes = above.level.set[!xind]
sig.out[na.nodes] = TRUE
pvalue.out[na.nodes] = NA
if(length(na.nodes)>0){
# find its least significant offspring on the i-1 th level
last.level.set = intersect(which(tree@node$level == (i-1)),which(sig.out))
for(j in 1:length(na.nodes)){
jind = (tree@node$parent[last.level.set]==na.nodes[j])
pvalue.out[na.nodes[j]] = max(pvalue.out[last.level.set[jind]])
}
dec.level[na.nodes] = i-1
}
if(mode == 'stageI.in.two.stage') next;
cum.rej = cum.rej + sum(!xind)
this.level.set = this.level.set[ind]
this.level.tree = this.level.tree[ind]
}
if(length(this.level.set)>0){
if(i>1){
this.level.p = sapply(this.level.tree,function(x)return(.StoufferCombination(pvalue.in[x])))
}else{
this.level.p = pvalue
}
weight = get_weight(this.level.set, tree)
this.level.test.summary = .WeightedStepdownControl(this.level.p, weight, cum.rej, q.this.level, tau)
this.level.detect = this.level.test.summary$output
sig.out[this.level.set] = this.level.detect
pvalue.out[this.level.set] = this.level.p
dec.level[this.level.set] = i
cum.rej = cum.rej + sum(this.level.detect)
last.cut.off = this.level.test.summary$cut.off
alpha.out[i] = last.cut.off
if(sum(!this.level.detect)>0){
pvalue.in[this.level.set[!this.level.detect]]=
(this.level.p[!this.level.detect]-last.cut.off)/(1-last.cut.off)
}
}
}
tree@info$alpha = alpha.out
tree@info$nl = nl.out
tree@info$ql = q.spent
tree@test$reject = sig.out; tree@test$pvalue = pvalue.out; tree@node$dec.level = dec.level
if(mode=='stageI.in.two.stage'){#1001
tree@test$num = last.cut.off
}else{
tree@test$num = far
}
return(tree)
}
#' Bottom-up Tree Hypothesis Tests
#'
#' This function implements the (one-stage and two-stage) bottom-up approach to testing hypotheses that
#' have a branching tree dependence structure, with false discovery rate control.
#' Our motivating example comes from testing the association between a trait of
#' interest and groups of microbes that have been organized into operational
#' taxonomic units (OTUs) or amplicon sequence variants (ASVs). Given p-values
#' from association tests for each individual OTU or ASV, we would like to know
#' if we can declare that a certain species, genus, or higher taxonomic grouping
#' can be considered to be associated with the trait. If a large proportion of
#' species from that genus influence the trait, we should conclude the genus
#' influences the trait. Conversely if only a few of the species from a genus
#' are non-null, then a better description of the microbes that influence occurrence
#' of the trait is a list of associated species. Finding taxa that can be said
#' to influence a trait in this sense is the first goal of our approach.
#' The second goal is to locate the highest taxa in the tree for which we
#' can conclude many taxa below, but not any ancestors above, influence risk;
#' we refer to such taxa as driver taxa.
#' @param anno.table an \code{n} by \code{l} data.frame that specifies the
#' annotation (i.e., grouping) of \code{n} leaf nodes at \code{l} levels. The
#' \code{l} levels are ordered with the highest level (the root node) first
#' and the lowest level (the leaf nodes) last, so that each row represents a
#' path from the root node to a leaf node.
#' @param pvalue.leaves a vector of p-values at leaf nodes, the names of which
#' should match the values in the last column of \code{anno.table}.
#' @param na.symbol a character string which is to be inpretreted as \code{NA}
#' values. The default is `unknown'.
#' @param far nominal false assignment rate (FAR), the error rate in analogy
#' with the false discovery rate. If \code{far} takes one value, it
#' corresponds to the one-stage bottom-up test with overal FAR control at the
#' specified level. If \code{far} takes a vector of two values, it corresponds
#' to the two-stage bottom-up test with the test of level-1 nodes controlling
#' FAR at the level of the first value of \code{far} and the test of the
#' remaining levels controlling FAR at the level of the second value of
#' \code{far}. The default is 0.1.
#' @param is.weighted a logical value indicating whether the weighted or
#' unweighted test is performed. The default is `TRUE'.
#' @param tau a pre-specified constant to prevent nodes with large p-values from
#' being detected if a large number (say, \code{m}) of null hypotheses can be
#' easily rejected because of very low p-values, in which case \code{q} x
#' \code{m} nodes with large p-values can be said to be detected, while still
#' controlling the overall error rate at level \code{q}. The default is 0.3.
#' @return A list consisting of
#' \item{results.by.level}{a data frame that gives the number of detected nodes at each level along with information on the test for that level.}
#' \item{results.by.node}{a data frame that gives detailed results at each node (i.e., leaf and inner nodes), including the (derived) p-value, the indicator of a detection or not, and the indicator of a driver node or not.}
#' \item{tree}{A tree structure used in the function \code{graphlan}.}
#'
#' @references Li, Y., Satten, GA., Hu Y.-J. "A bottom-up approach to testing hypotheses that have a
#' branching tree dependence structure, with false discovery rate control" XXX(2018)
#'
#' @examples
#' data(IBD)
#'
#' ## one-stage weighted bottom-up test on the IBD data
#' test.1 = bouth(anno.table = IBD$tax.table, pvalue.leaves = IBD$pvalue.otus,
#' na.symbol = "unknown", far = 0.1, is.weighted = TRUE)
#'
#' ## extract all detected nodes
#' test.1$results.by.node[test.1$results.by.node$is.detected, ]
#'
#' ## extract all detected driver nodes
#' test.1$results.by.node[test.1$results.by.node$is.driver,]
#'
#'
#' ## two-stage (weighted) bottom-up test on the IBD data
#' test.2 = bouth(anno.table = IBD$tax.table, pvalue.leaves = IBD$pvalue.otus,
#' na.symbol = "unknown", far = c(0.05, 0.05))
#'
#'
#' @export
bouth<-function(anno.table, pvalue.leaves, na.symbol ='unknown', far = 0.10, tau = 0.3, is.weighted = TRUE){
# this version supports one-stage & two-stage tests
if(is.null(anno.table) || is.null(pvalue.leaves)) stop("Incorrect number of parameters")
if(!is.null(colnames(anno.table))){
level.name = rev(colnames(anno.table))
}else{
level.name = rev(paste("level", c(1:ncol(anno.table)), sep='-'))
}
if(is.null(names(pvalue.leaves))){
names(pvalue.leaves) = anno.table[,ncol(anno.table)]
}
# build a tree object based on the input table
tree = .build.tree(anno.table = anno.table, na.symbol = na.symbol)
if(max(pvalue.leaves)==1){
warning("There are p-values that equal exactly 1.")
pvalue.leaves[pvalue.leaves==1] = 0.5*(1+max(c(0,pvalue.leaves[pvalue.leaves<1])))
}
# one-stage approach
if(length(far)==1){
if(is.weighted){
results = .bottom.up.weighted(tree, pvalue.leaves, far, tau)
}else{
results = .bottom.up.unweighted(tree, pvalue.leaves, far, tau)
}
}
# two-stage approach
if(length(far)==2){
results = .two.stage.test(anno.table, pvalue.leaves, na.symbol, far.leaves = far[1], far.inner = far[2],
tau)
}
options(digits=3)
## create a table for all nodes tested in a bottom-up procedure
n.node = length(results@node$level); max.level = max(results@node$level)
show.node = .query(results, c(1:n.node), na.symbol); colnames(show.node$name.mat) = rev(level.name)
show.pvalue = results@test$pvalue
show.detected = .sig(results)
show.driver = .top(results)
results.by.node = data.frame(show.node$name.mat, show.pvalue, show.detected, show.driver,stringsAsFactors = FALSE)
colnames(results.by.node) = c(rev(level.name),"pvalue", "is.detected", "is.driver")
results.by.node = results.by.node[order(show.node$name.vec),]
rownames(results.by.node) = NULL
## create a table to save characteristics for the bottom-up procedure
detections.per.level = sapply(c(1:max.level),function(x)return(sum(results@node$level[results@test$reject]==x)))
results.by.level = data.frame(c(1:max.level),level.name,
results@info$ql, results@info$alpha,
results@info$nl, detections.per.level,
stringsAsFactors=FALSE)
colnames(results.by.level) = c("level","level.name", "q_l", "pvalue.cutoff", "all.nodes", "detected.nodes")
return(list(tree = results, results.by.node = results.by.node, results.by.level = results.by.level))
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.