R/add.leaf.R

Defines functions add.leaf

Documented in add.leaf

#' Test all possible connection of a leaf to a graph with non-admixed and or admixed edges
#' @param x An object of class graph.params or fitted.graph (see details)
#' @param leaf.to.add Name of the leaf to add
#' @param fstats Object of class fstats that contains estimates of the fstats (see compute.fstats)
#' @param only.test.non.admixed.edges If TRUE the function only test non.admixed edges (may be far faster)
#' @param only.test.admixed.edges If TRUE the function only test admixed edges
# #' @param nthreads Number of available threads for parallelization of some part of the parsing (default=1, i.e., no parallelization)
#' @param verbose If TRUE extra information is printed on the terminal
#' @param ... Some parameters to be passed the function fit.graph called internally
#' @details The input object x needs to be of class graph.params (as generated by the function generate.graph.params) or fitted.graph (as generated by the function fit.graph or by the function add.leaf itself in the graphs.fit.res elements of the output list). This is to ensure that the matrix describing the structure of the graph (graph slot of these objects) is valid (note that it can be plotted for checks). Hence graph.params objects may have been generated without fstats information (that should be supplied independently to the add.leaf function to obtain information on the fstats involving the candidate leaf defined with the leaf.to.add argument).
#' By default the function tests all the possible positions of a newly added edge connecting the candidate leaf to the graph with both non-admixed (including a new rooting with the candidate leaf as an outgroup) and admixed edges. If n_e is the the number of non-admixed edges of the original graph, the number of tested graphs for non-admixed edges equals n_e+1. The newly added node is named "N-"{name of the leaf to add} (or with more N if the name already exists). For admixed edges, the number of tested graphs equals n_e*(n_e-1)/2 and for a given tested graph, three nodes named "S-"{name of the leaf to add}, "S1-"{name of the leaf to add} and "S2-"{name of the leaf to add} (or with more S if the name already exists) are added and the admixture proportions are named with a letter (A to Z depending on the number of admixed nodes already present in the graph). 
#' @return A list with the following elements:
#' \enumerate{
#' \item "n.graphs": The number of tested graphs
#' \item "fitted.graphs.list": a list of fitted.graph objects (indexed from 1 to n.graphs and in the same order as the list "graphs") containing the results of fitting of each graph. 
#' \item "best.fitted.graph": The graph (object of class fitted.graph) with the minimal BIC (see function fit.graph) among all the graphs within fitted.graphs.list 
#' \item "bic": a vector of the n.graphs BIC (indexed from 1 to n.graphs and in the same order as the "fitted.graphs.list" list) (see fit.graph details for the computation of the scores). 
#' }
#' @seealso see \code{\link{fit.graph}} and \code{\link{generate.graph.params}}.
#' @export
add.leaf<-function(x,leaf.to.add,fstats,only.test.non.admixed.edges=FALSE,only.test.admixed.edges=FALSE,verbose=TRUE,...){
  if(verbose){
    print.extra.info=TRUE
    verbose=FALSE #pour eviter conflits avec ...
  }else{print.extra.info=FALSE}
  
  # if(nthreads>1){
  #   tmp.ncores=detectCores()
  #   if(nthreads>tmp.ncores){nthreads=tmp.ncores}
  #   # options(cores=nthreads)
  #   # registerDoParallel()  ;  getDoParWorkers()  
  #   parallel=TRUE
  # }else{parallel=FALSE}
  ###Some checks
  if(!is.graph.params(x) & !is.fitted.graph(x)){stop("The input x must be an object of class graph.params (see the function generate.graph.params) or fitted.graph (see the function fit.graph)\n")}
  if(!is.fstats(fstats)){
    stop("fstats must be an object of class fstats (see the function compute.fstats)\n")
    }else{
      if(!fstats@blockjacknife){
         stop("No block-jackknife estimate of estimators s.e. found in the object fstats.\nFunctions compute.fstats must be run with nsnp.per.block>0\n")
        }
     if(nrow(fstats@f3.values)==0){
       stop("No estimates of F3 statistics available in the fstats object.\nFunctions compute.fstats must be run with computeF3=TRUE\n")
     }
      if(is.graph.params(x)){tmp.input.leaves=x@leaves}
      if(is.fitted.graph(x)){tmp.input.leaves=rownames(x@fitted.f2.mat)}      
      if(sum(!(tmp.input.leaves%in%fstats@comparisons[["pops"]]))>0){
        stop("Some leaves of the input graph are not represented in the fstats object\n")
      }
      if(!(leaf.to.add%in%fstats@comparisons[["pops"]])){
        stop("The candidate leaf (leaf.to.add argument) is not represented in the fstats object\n")
      }
      if(leaf.to.add %in% tmp.input.leaves){
        stop("The candidate leaf (leaf.to.add argument) is already present in the input graph\n")
      }
    }

  ##########################  
  time1=proc.time()
  if(print.extra.info){cat("Constructing graph for all possible configurations\n")}
  n.input.non.adm.edges=sum(nchar(x@graph[,3])==0)
  ##unadmixed graphs
  n.non.adm.tests=n.adm.tests=0 #intialisation necessaire
  if(!only.test.admixed.edges){
   root=unique(x@graph[,2][!(x@graph[,2]%in%x@graph[,1])])
   new.node.name=paste0("I-",leaf.to.add)
   while(new.node.name %in% tmp.input.leaves){new.node.name=paste0("I",new.node.name)} #in case of...
   
   n.eval=n.input.non.adm.edges+1 
   if(print.extra.info){cat(" ",n.eval,"possible graphs connecting",leaf.to.add,"with a non-admixed edge\n")}
   cdt.edges.idx=which(nchar(x@graph[,3])==0) #les non admixed
   proposed.graphs.unadmixed=list() 
   for(e in 1:length(cdt.edges.idx)){
       ii=cdt.edges.idx[e]
       tmp.graph<-rbind(c(leaf.to.add,new.node.name,""),c(new.node.name,x@graph[ii,2],""),
                        c(x@graph[ii,1],new.node.name,""),x@graph[-ii,])
       n.non.adm.tests=n.non.adm.tests+1
       proposed.graphs.unadmixed[[n.non.adm.tests]]=tmp.graph
    }
   ##test with a new root
   tmp.graph<-rbind(c(leaf.to.add,new.node.name,""),c(root,new.node.name,""),x@graph)
   n.non.adm.tests=n.non.adm.tests+1
   proposed.graphs.unadmixed[[n.non.adm.tests]]=tmp.graph
  }
  ##admixed graphs  
  if(!only.test.non.admixed.edges){
   new.node.name.S1=paste0("S1-",leaf.to.add)
   while(new.node.name.S1 %in% tmp.input.leaves){new.node.name.S1=paste0("S",new.node.name.S1)} #in case of...
   new.node.name.S2=paste0("S2-",leaf.to.add)
   while(new.node.name.S2 %in% tmp.input.leaves){new.node.name.S2=paste0("S",new.node.name.S2)} #in case of... 
   new.node.name.S=paste0("S-",leaf.to.add)
   while(new.node.name.S %in% tmp.input.leaves){new.node.name.S=paste0("S",new.node.name.S)} #in case of... 
   tmp.sel=nchar(x@graph[,3])>0 
   if(sum(tmp.sel)>0){n.adm.nodes=length(unique(x@graph[tmp.sel,1]))}else{n.adm.nodes=0}
   if(n.adm.nodes<25){
     #do not include I as a name because bug when evaluating with yacas (i.e., interpret as complex)
     adm.par.name=LETTERS[-9][n.adm.nodes+1] #rawToChar(as.raw(65+n.adm.nodes))
   }else{#unlikely but never know
     adm.par.name=paste0(LETTERS[-9][n.adm.nodes%%25+1],letters[-9][floor(n.adm.nodes/25)])
   }  
   while(length(grep(adm.par.name,x@graph[,3]))>0){adm.par.name=paste0("n",adm.par.name)} #in case of...   
   n.eval=n.input.non.adm.edges*(n.input.non.adm.edges-1)/2 - 1 #il faut retirer evenement impliquant les 2 root edges
   if(print.extra.info){cat(" ",n.eval,"possible graphs connecting",leaf.to.add,"with admixed edges\n")}   
   edges.pair.idx=t(combn(1:nrow(x@graph),2))
   #on retire  la position impliquant les 2 root edges: =>systeme singulier
   #on repart du graphe pour etre sur avoir les bons index des root edges
   root.edges.idx=which(x@graph[,2]==unique(x@graph[,2][!(x@graph[,2]%in%x@graph[,1])])) 
   tmp.keep=(edges.pair.idx[,1]%in%root.edges.idx + edges.pair.idx[,2]%in%root.edges.idx )<2
   edges.pair.idx=edges.pair.idx[tmp.keep,]
   #on retire les edges admixed
   tmp.keep=(nchar(x@graph[edges.pair.idx[,1],3]) + nchar(x@graph[edges.pair.idx[,2],3]))==0 
   edges.pair.idx=edges.pair.idx[tmp.keep,]
   proposed.graphs.admixed=list()    
   for(e in 1:nrow(edges.pair.idx)){
     ii<-edges.pair.idx[e,1] ; jj<-edges.pair.idx[e,2]
     tmp.graph<-rbind(c(new.node.name.S,new.node.name.S1,adm.par.name),
                      c(new.node.name.S,new.node.name.S2,paste0("(1-",adm.par.name,")")),
                      c(leaf.to.add,new.node.name.S,""),
                      c(new.node.name.S1,x@graph[ii,2],""),c(new.node.name.S2,x@graph[jj,2],""),
                      c(x@graph[ii,1],new.node.name.S1,""),c(x@graph[jj,1],new.node.name.S2,""),
                      x@graph[-c(ii,jj),])
      n.adm.tests=n.adm.tests+1
      proposed.graphs.admixed[[n.adm.tests]]=tmp.graph
   }
   }
  ##########################
  proposed.graphs=list()
  n.g=0
  if(n.non.adm.tests>0){
   for(i in 1:n.non.adm.tests){
     n.g=n.g+1
     proposed.graphs[[n.g]]=proposed.graphs.unadmixed[[i]]
   }
    rm(proposed.graphs.unadmixed) ; gc()
  }
  if(n.adm.tests>0){
    for(i in 1:n.adm.tests){
      n.g=n.g+1
      proposed.graphs[[n.g]]=proposed.graphs.admixed[[i]]
    }
    rm(proposed.graphs.admixed) ; gc()
  }  

  ##########################  
  if(print.extra.info){cat("\nFitting the",n.g,"eligible graphs\n")}  
  bics=c() ;  fit.res=list()
  # if(parallel){
  #   #Rk: impossible de faire une progression bar clean. La seule solution qui marche (cf lignes commentees) impose de faire outfile="" et entraine l'ecriture de plein de trucs a l'ecran impossible a virer (meme avec des suppressMessage car ils utilisent cat) notamment les Loading package de touts les packages pour chacun des threads
  #   cl<-makeCluster(nthreads)
  #   registerDoParallel(cl)  
  #    fit.res=foreach(i = 1:n.g,.packages = "tcltk") %dopar% {      
  #      if(!exists("pb")){pb <- tkProgressBar(title=paste0("Thread",i), min=1, max=n.g)}
  #      setTkProgressBar(pb, i,label=paste("currently tested graph",i)) #solution: https://gist.github.com/andrie/d4f9afdf1c9166fa906f
  #      tmp.graph.params<-generate.graph.params(proposed.graphs[[i]],fstats=fstats,verbose=FALSE)
  #      out.fit<-fit.graph(tmp.graph.params,verbose=FALSE,...)
  #      rm(tmp.graph.params)
  #      return(out.fit) 
  #     # rm(out.fit)
  #     # gc() ; closeAllConnections()
  #   }
  #   stopCluster(cl)
  # }else{
  if(print.extra.info){ pb <- progress_bar$new(format = " [:bar] :percent eta: :eta",total = n.g, clear = FALSE, width= 60)}
   for(i in 1:n.g){
    tmp.graph.params<-generate.graph.params(proposed.graphs[[i]],fstats=fstats,verbose=FALSE)
    tmp.res<-fit.graph(tmp.graph.params,verbose=FALSE,...)
    if(is.null(tmp.res)){
       fit.res[i]<-list(NULL) #gestion des cas ou overfitting
    }else{
       fit.res[[i]]<-tmp.res}
    rm(tmp.graph.params,tmp.res)
    if(print.extra.info){pb$tick()}
   }
  if(print.extra.info){pb$terminate()}
  
  #retrait des null au cas ou
  null.fit=unlist(lapply(fit.res,is.null))
  if(sum(null.fit)>0){
    if(print.extra.info){cat("NOTE:",sum(null.fit),"graph configurations out of",n.g,"could not be fitted (singular model)\n")}
    fit.res=fit.res[!null.fit]
    n.g=length(fit.res)
  }
  for(i in 1:n.g){bics=c(bics,fit.res[[i]]@bic)}

  if(print.extra.info){
  time.elapsed=((proc.time()-time1)[3])
  nhours=floor(time.elapsed/3600) ; nminutes=floor((time.elapsed-nhours*3600)/60) ;  nseconds=round(time.elapsed-nhours*3600-nminutes*60)  
  cat("\nOverall Analysis Time:",nhours,"h",nminutes, "m",nseconds,"s\n")  
  }
  ###############
  best.graph=fit.res[[which.min(bics)]]
  return(list(fitted.graphs.list=fit.res,best.fitted.graph=best.graph,n.graphs=n.g,bic=bics))
}

Try the poolfstat package in your browser

Any scripts or data that you put into this service are public.

poolfstat documentation built on Sept. 8, 2023, 5:49 p.m.