R/graph.builder.R

Defines functions graph.builder

Documented in graph.builder

#' Implement a graph builder heuristic by successively adding leaves to an initial graph
#' @param x An object (or list of objects) of class graph.params or fitted.graph (see details)
#' @param leaves.to.add Names of the leaves to successively add (in the given order)
#' @param fstats Object of class fstats that contains estimates of the fstats (see compute.fstats)
#' @param heap.dbic Maximal BIC distance from the best graph to be kept in the heap (heap.dbic=6 by default)
#' @param max.heap.size Maximal number of graphs stored in the heap (max.heap.size=25 by default)
#' @param verbose If TRUE extra information is printed on the terminal 
#' @param ... Some parameters to be passed the function add.leaf called internally
#' @details The input object x needs to be of class graph.params as generated by the function generate.graph.params; or fitted.graph as generated by the functions fit.graph, add.leaf (in the output list element named "fitted.graphs.list") or rooted.nj.builder (in the output element named "best.rooted.tree"). This is to ensure that the matrix describing the structure of the graph (graph slot of these objects) is valid (note that it can be plotted for checks). Hence graph.params objects may have been generated without fstats information (that should be supplied independently to the add.leaf function to obtain information on the fstats involving the candidate leaf defined with the leaf.to.add argument).
#' The functions successively add each leaf given in the leaves.to.add vector to the list of fitted graph stored in a heap using the function add.leaf. For the first iteration (i.e., first tested leaf) the heap consists of the input graph or list of graph x. At each iteration, the function add.leaf is used to test the candidate leaf to each graph from the current heap in turn. A new heap of graphs is then built by each time including the fitted graphs with a BIC less than heap.dbic larger than the best resulting graphs (treating each graph independently). If the final number of graphs in the heap is larger than max.heap.size, the max.heap.size graphs with the lowest BIC are kept in the heap. After testing the latest leaf, graphs with a BIC larger than heap.dbic units of the best graph are discarded from the final list of graphs.
#' In practice, it is recommended to test different orders of inclusion of the leaves (as specified in the vector leaves.to.add)     
#' @return A list with the following elements:
#' \enumerate{
#' \item "n.graphs": The final number of fitted graphs
#' \item "fitted.graphs.list": a list of fitted.graph objects (indexed from 1 to n.graphs and in the same order as the list "graphs") containing the results of fitting of each graph. 
#' \item "best.fitted.graph": The graph (object of class fitted.graph) with the minimal BIC (see function fit.graph) among all the graphs within fitted.graphs.list 
#' \item "bic": a vector of the n.graphs BIC (indexed from 1 to n.graphs and in the same order as the "fitted.graphs.list" list) (see fit.graph details for the computation of the scores). 
#' }
#' @seealso see \code{\link{fit.graph}}, \code{\link{generate.graph.params}} and \code{\link{add.leaf}}.
#' @export
graph.builder <- function(x,leaves.to.add,fstats,heap.dbic=6,max.heap.size=25,verbose=TRUE,...){
  ###Some checks
  ### fstat object
  if(!is.fstats(fstats)){
    stop("fstats must be an object of class fstats (see the function compute.fstats)\n")
  }else{
    if(!fstats@blockjacknife){
      stop("No block-jackknife estimate of estimators s.e. found in the object fstats.\nFunctions compute.fstats must be run with nsnp.per.block>0\n")
    }
    if(nrow(fstats@f3.values)==0){
      stop("No estimates of F3 statistics available in the fstats object.\nFunctions compute.fstats must be run with computeF3=TRUE\n")
    }
  }
  ### x object (or elements of the list)  
  if(is.list(x)){
    for(i in 1:length(x)){
      if(!is.graph.params(x[[i]]) & !is.fitted.graph(x[[i]])){stop(paste0("The element",i,"of the input list x is not an object of class graph.params (see the function generate.graph.params) or fitted.graph (see the function fit.graph)\n"))}  
      if(is.graph.params(x[[i]])){tmp.input.leaves=x[[i]]@leaves}
      if(is.fitted.graph(x[[i]])){tmp.input.leaves=rownames(x[[i]]@fitted.f2.mat)} 
      if(sum(!(tmp.input.leaves%in%fstats@comparisons[["pops"]]))>0){
        stop(paste0("Some leaves of the input graph (element",i," of the list) are not represented in the fstats object\n"))
      }
      if(sum(leaves.to.add %in% tmp.input.leaves)>0){
        stop(paste0("Some candidate leaves (leaves.to.add argument) are already present in the input graph",i,"of the input list)\n"))
      }
    }
  }else{
    if(!is.graph.params(x) & !is.fitted.graph(x)){stop("The input x must be an object of class graph.params (see the function generate.graph.params) or fitted.graph (see the function fit.graph)\n")}  
    if(is.graph.params(x)){tmp.input.leaves=x@leaves}
    if(is.fitted.graph(x)){tmp.input.leaves=rownames(x@fitted.f2.mat)}
    if(sum(!(tmp.input.leaves%in%fstats@comparisons[["pops"]]))>0){
      stop("Some leaves of the input graph are not represented in the fstats object\n")
    }
    if(sum(leaves.to.add %in% tmp.input.leaves)>0){
      stop("Some candidate leaves (leaves.to.add argument) are already present in the input graph\n")
    }
  }
  ##leaves present or not
  if(sum(!(leaves.to.add%in%fstats@comparisons[["pops"]]))){
    stop("Some candidate leaves (leaves.to.add argument) are not represented in the fstats object\n")
  }
  ###End checks 
  
  time1=proc.time()
  if(!is.list(x)){
    cur.tree=list()
    cur.tree[[1]]=x
  }else{cur.tree=x}
  n.g.tot=0
  for(i in 1:length(leaves.to.add)){
    if(verbose){cat("####################\n Adding: ",leaves.to.add[i],"\n")}
    ###on garde les top intra arbre (la liste peut grossir tres vite!!!)
    new.cur=list() ; heap.bic=c() ; n.g.eval=0 ; time2=proc.time()
    for(j in 1:length(cur.tree)){
      tmp.res=add.leaf(x=cur.tree[[j]],fstats,leaf.to.add = leaves.to.add[i],verbose=FALSE,...)
      n.g.eval=n.g.eval+tmp.res$n.g
      D_BIC=tmp.res$bic-min(tmp.res$bic) 
      tmp.sel.tree=which(D_BIC<heap.dbic)
      tmp.n=length(new.cur)
      for(k in 1:length(tmp.sel.tree)){
        new.cur[[tmp.n+k]]=tmp.res$fitted.graphs.list[[tmp.sel.tree[k]]]
        heap.bic=c(heap.bic,tmp.res$bic[tmp.sel.tree[k]])
      }
    }
    ##filtre pour limiter la taille du heap
    if(length(new.cur)>max.heap.size){
      tmp.sel=which(order(heap.bic)<=max.heap.size)
      new.cur=new.cur[tmp.sel]
      heap.bic=heap.bic[tmp.sel]
    }
    cur.tree=new.cur
    time.elapsed=((proc.time()-time2)[3])
    nhours=floor(time.elapsed/3600) ; nminutes=floor((time.elapsed-nhours*3600)/60) ;  nseconds=round(time.elapsed-nhours*3600-nminutes*60)  
    if(verbose){cat(n.g.eval,"graphs evaluated in",nhours,"h",nminutes, "m",nseconds,"s\n")}  
    if(verbose){cat(length(cur.tree),"graphs stored in the heap\n")}
    n.g.tot=n.g.tot+n.g.eval
  }
  #final filter: only those wihthin heap.dbic
  if(length(cur.tree)>1){
    D_BIC=heap.bic-min(heap.bic) 
    tmp.sel.tree=which(D_BIC<heap.dbic)
    cur.tree=cur.tree[tmp.sel.tree]
    heap.bic=heap.bic[tmp.sel.tree]
  }
  if(verbose){cat("\nFinal Number of graphs:",length(heap.bic),"\n(min. BIC=",min(heap.bic),")\n")}  
  
  time.elapsed=((proc.time()-time1)[3])
  nhours=floor(time.elapsed/3600) ; nminutes=floor((time.elapsed-nhours*3600)/60) ;  nseconds=round(time.elapsed-nhours*3600-nminutes*60)  
  if(verbose){cat("\nOverall Analysis Time:",nhours,"h",nminutes, "m",nseconds,"s (",n.g.tot,"graphs evaluated)\n")}  
    
  best.graph=cur.tree[[which.min(heap.bic)]]
  return(list(fitted.graphs.list=cur.tree,best.fitted.graph=cur.tree[[which.min(heap.bic)]],n.graphs=length(cur.tree),bic=heap.bic))
}

Try the poolfstat package in your browser

Any scripts or data that you put into this service are public.

poolfstat documentation built on Sept. 8, 2023, 5:49 p.m.