R/extractBranches.R

Defines functions extractBranches

Documented in extractBranches

#' Extracts LD clusters for linkage disequilibrium network analysis (LDna).
#'
#' Finds clusters of highly correlated loci (LD-clusters) by considering each branch in single-linkage clustering trees as a separate LD-cluster. The single-linkage can be plotted with the brances representing LD-clusters indicated.
#' 
#' In this implementation of defining LD-clusters, only the parameter \code{min.edges} needs to be specified. Low values of \code{min.edges} lead to "bushes" with many small "twigs" in the tree, each corresponding to a separate LD-cluster. These clusters contain fewer but more highly correlated sets of loci. Conversely, high values lead to fewer "thick" branches i.e. larger (in terms of the number of loci) but less strongly correlated LD-clusters.
#' \cr
#' \cr
#' The single-linkage clustering tree visualizing branches/LD-clusters can be printed by \code{plot.tree}=TRUE. 
#' \cr
#' \cr
#' With low values for \code{min.edges} an additional parameter \code{merge.min} can be used to group "branches" that merge above this threshold into a single LD-cluster; clusters that merge at high LD-threshold are so correlated, it makes no sense to regard them as separate LD-clusters.
#' 
#' This function works only with \code{\link{LDnaRaw}} as shown in the examples
#' 
#' @param ldna Output from \code{\link{LDnaRaw}}
#' @param min.edges Minimum number of edges for a cluster that is shown as a branch in a tree.
#' @param merge.min Is the correlation threshold at which a clade is considered a separate LD-cluster, even if it contains more than two branches.
#' @param plot.tree If \code{TRUE} (default), plots tree.
#' @param cores Number of cores to use.
#' @keywords extractBranches
#' @seealso \code{\link{LDnaRaw}}
#' @author Petri Kemppainen \email{petrikemppainen2@@gmail.com}, Christopher Knight \email{Chris.Knight@@manchester.ac.uk}
#' @return Returns a list containing a vectors of locus names belonging to a given cluster (branch)
#' @examples
#' data(LDna)
#' 
#' ldna <- LDnaRaw(r2.baimaii_subs)
#' 
#' clusters <- extractBranches(ldna,min.edges=20,merge.min=0.8, plot.tree=TRUE) # default values
#' clusters
#' 
#' # with lower threshold for min.edges
#' clusters <- extractBranches(ldna,min.edges=5,merge.min=0.8, plot.tree=TRUE) # default values
#' clusters
#' 
#' # with lower threshold for min.edges
#' clusters <- extractBranches(ldna,min.edges=5,merge.min=0.2, plot.tree=TRUE) # default values
#' clusters
#' @export

extractBranches <- function(ldna, min.edges=200, merge.min=0.8, plot.tree=TRUE, cores=1){
  
  # wtf
  # Get file for tree and clusters above min.edges and their lambda values
  tree <- clusterPhylo(ldna, min.edges)
  #plot(tree)
  
  clusterfile <- ldna$clusterfile
  
  
  stats <- data.table(ldna$stats)
  
  
  clusterfile_red <- clusterfile[,colnames(clusterfile) %in% stats[nE>=min.edges,cluster] & colnames(clusterfile) %in% stats[merge_at_below<=merge.min,cluster]]
  
  clusters.out <- colnames(clusterfile_red)
  
  clusters.out <- unique(unlist(sapply(clusters.out, function(x) {
    anc <- stats[cluster %chin% x, parent_cluster]
    cand <- stats[parent_cluster %chin% anc, cluster]
    cand <- cand[!cand %chin% x]
    cand <- cand[cand %chin% clusters.out]
  })))
  
  # # 
  if(length(clusters.out)!=0){
    
    temp <- ldna$clusterfile[,colnames(ldna$clusterfile) %chin% clusters.out]
    if(is.matrix(temp)){
      cols <- ncol(temp)
      #  x<-9
      if(ncol(temp)>2){
        out <- unlist(mclapply(1:(cols-1), function(x){
          if(x<cols-1){
            any(names(which(temp[,x])) %chin% unlist(apply(temp[,(x+1):cols], 2, function(y) names(which(y)))))  
          }else{
            any(names(which(temp[,x])) %chin% names(which(temp[,(x+1)])))
          }
          
        },mc.cores=cores))
        
        SOCs <- c(colnames(temp)[!out], colnames(temp)[cols])  
      }else{
        if(!any(names(which(temp[,1])) %chin% names(which(temp[,2])))) SOCs <- colnames(temp)
      }
      
      
      
    }else{
      SOCs <- clusters.out
    }
  }else{
    stop('No clusters to extract - consider lowering "min.edges" or increasing "merge.min"')
    
  } 
  
  SOCs <- SOCs[!duplicated(SOCs)]
  Ntips <- length(ldna$tree$tip.label)
  
  col <- rep("grey", length(tree$edge))
  
  if(min.edges==0){
    SOCs <- c(SOCs, rownames(clusterfile)[!rownames(clusterfile) %in% unlist(apply(clusterfile[,colnames(clusterfile) %in% SOCs], 2, function(x) names(which(x))))])
  }
  
  
  if(plot.tree){
    distances <- tree$edge.length[tree$edge[,2] %in% which(tree$tip.label %in% SOCs)]
    clusters.temp <- tree$tip.label[tree$edge[,2][tree$edge[,2] %in% which(tree$tip.label %in% SOCs)]]
    keep.col <- clusters.temp[distances > 0]
    col[tree$edge[,2] %in% which(tree$tip.label %in% keep.col)] <- "red"
    col[tree$edge[,2] %in% tree$edge[,1][tree$edge[,2] %in% which(tree$tip.label %in% SOCs[!SOCs %in% keep.col])]] <- "red"
    col.tip <- rep("#00000000", length(tree$tip.label))
    
    col.tip[tree$tip.label %in% SOCs] <- "black"
    plot(tree, show.tip.label=TRUE, edge.width=3, edge.color=col, cex=0.5, tip.color=col.tip, root.edge=TRUE, underscore=TRUE)
    
    roof <- floor(ldna$stats[nE>min.edges,max(merge_at_above, na.rm = TRUE)]*10)
    axis(1, at=c(0,(1:roof)*0.1))
    text(0,1,as.expression(bquote("|E|"[min]*plain("=")* .(min.edges))),  adj = c(0,0))
  }
  
  if(length(SOCs[!grep("_0.",SOCs)])==0){
    clusters <- lapply(SOCs, function(x) names(which(clusterfile_red[,which(colnames(clusterfile_red)==x)])))
    names(clusters) <- SOCs  
  }else{
    clusters2 <- list(SOCs[!grepl("_0.",SOCs)])
    clusters <- c(clusters,clusters2)
    names(clusters) <- SOCs  
  }
  
  return(clusters)
}

clusterPhylo <-  function(ldna, min.edges=0){
  d <- ldna$stats[ldna$stats$nE>=min.edges,]
  
  d$times<-rep(0,dim(d)[1])
  d$offspring<-as.list(rep(NA,dim(d)[1]))
  d$terminal<-rep(0,dim(d)[1])
  d$ancestor<-rep(0,dim(d)[1])
  
  row<-1
  rowOld<-0
  newick<-character()
  
  repeat{
    cluster<-as.character(d$cluster[row])
    d$offspring[[row]]<-which(as.character(d$parent_cluster)==cluster)
    offspNo<-sum(!is.na(d$offspring[[row]]))
    
    if(offspNo==0){d$terminal[row]<-1}
    
    if(d$terminal[row]==1){
      d$ancestor[row]<-rowOld
      if(d$ancestor[row]==0){
        newick<-paste("(",cluster,":0,:0);", sep="")
        break}
      newick<-paste(newick,cluster,":",d$distance[row], sep="")
      row<-d$ancestor[row]
      next}
    
    if(d$times[row]==0){
      newick<-paste(newick,"(", sep="")
      if(offspNo>1){newick<-paste(newick,"(", sep="")}
      d$ancestor[row]<-rowOld
      rowOld<-row
      d$times[row]<-d$times[row]+1
      row<-d$offspring[[row]][1]
      next}
    
    if(d$times[row]>0 & d$times[row]<offspNo){
      newick<-paste(newick,",", sep="")
      if((offspNo-d$times[row])>1){newick<-paste(newick,"(", sep="")}
      rowOld<-row
      d$times[row]<-d$times[row]+1
      row<-d$offspring[[row]][(d$times[row])]
      next}
    
    if(d$times[row]>0 & d$times[row]==offspNo){
      if(offspNo==1){
        newick<-paste(newick,",",cluster,":0):",d$distance[row], sep="")
      }else{
        newick<-paste(newick,paste(rep("):0",(offspNo-1)),sep="", collapse=""),",",cluster,":0):",d$distance[row], sep="", collapse="")            
      }
      if(d$ancestor[row]==0){
        newick<-paste(newick,";", sep="")
        break}
      row<-d$ancestor[row]
      next}
  }  
  tree <- ape::read.tree(text=newick)
}
petrikemppainen/LDna documentation built on April 14, 2024, 6:37 p.m.