R/phylo_builder.R

#' Create a subtree with largest overlap from a species list.
#'
#' phylobuilder creates a subtree with largest overlap from a species list.
#' If species in the species list are not already in the tip label, species will
#' added at the most recent common ancestor at the genus or family level when
#' possible.
#'
#' @param species A vector or matrix containing a species list
#' @param tree a phylogenetic tree (object of class phylo)
#' @seealso \code{\link[phangorn]{add.tips}}
#' @keywords cluster
#' @importFrom ape drop.tip stripLabel label2table getMRCA
#' @importFrom parallel mclapply
#' @importFrom phangorn add.tips
#' @examples
#' library(ape)
#' fdir <- system.file("extdata", package = "phyloregion")
#' comm <- read.community(file.path(fdir, "Korea_PRESAB_sample.csv"))
#' tree <- read.tree(file.path(fdir, "PhytoPhylo.tre"))
#' tree <- phylo_builder(colnames(comm), tree)
#' @export
phylo_builder <- function (species, tree)
{
  taxonomy <- c("species", "genus", "family")
#  ,"suborder", "order", "class", "phylum", "kingdom")
# branching phylomatic_names
  if(is.matrix(species) | is.data.frame(species)) species_list <- species

  if(is.factor(species)) species <- as.character(species)
  if(is.vector(species)) {
    species <- stripLabel(species)
    species_list <- label2table(species)
    species_list$species <- species
  }

  if(is.matrix(species_list) | is.data.frame(species_list)){
    colnames(species_list) <- tolower(colnames(species_list))
    nam <- intersect(taxonomy, colnames(species_list))
    species_list <- species_list[, nam]
  }

  species_list[sapply(species_list, is.factor)] <-
            lapply(species_list[sapply(species_list, is.factor)], as.character)

  if (any(duplicated(species_list$species))){
    warning("Duplicated species detected and removed.")
    species_list <- species_list[!duplicated(species_list$species), ]
  }
  Ori <- species_list
  species_list$species <- gsub(" ", "_", species_list$species)
  tree$tip.label <- gsub(" ", "_", tree$tip.label)
# need to prune away subspecies
#  tree$tip.label <- stripLabel(tree$tip.label)

  species_list$species <- gsub("(^[[:alpha:]])", "\\U\\1", species_list$species,
                         perl = TRUE)
  species_list$genus <- gsub("(^[[:alpha:]])", "\\U\\1", species_list$genus,
                       perl = TRUE)
  if(!is.null(species_list$family )) species_list$family <-
       gsub("(^[[:alpha:]])", "\\U\\1", species_list$family, perl = TRUE)

  T1 <- tree
  add_tip <- which(is.na(match(species_list$species, tree$tip.label)) )

  species_list$add <- rep("tree", nrow(species_list) )
  species_list$add[add_tip] <- "missing"

  if (length(add_tip) > 0){

    unique_genera <- unique(species_list$genus[add_tip])
    tree_genera <- as.character(label2table(T1$tip)$"genus")

    fun <- function(g, g_tree){
      tmp <- match(g_tree, g)
      l <- length(g)
      res <- vector("list", l)
      for(i in seq_along(tmp)){
        a <- tmp[i]
        if(!is.na(a))res[[a]] <- c(res[[a]], i)
      }
      res
    }

    nn_unique <- fun(unique_genera, tree_genera)
    ll_unique <- lengths(nn_unique)

    tmp <- match(species_list$genus[add_tip], unique_genera)
    nn <- nn_unique[tmp]
    ll <- ll_unique[tmp]

    tips <- tree$tip.label
    tips2add <- NULL
    where2add <- NULL
    if(any(ll==1)){
      tmp_num <- unlist(nn[ll==1])
      num <- T1$edge[match(tmp_num, T1$edge[,2]), 1]
      where2add <- c(where2add, num)
      tips2add <- species_list$species[add_tip[ll==1]]
    }
    if(any(ll > 1)){
      tips2add <- c(tips2add, species_list$species[add_tip[ll > 1]] )
      fun2 <- function(x, tree) ifelse(length(x)>1, getMRCA(tree, x), 0)
      num <- mclapply(nn_unique,  fun2 , tree=T1)
      num <- (unlist(num)[tmp])[ll > 1]
      where2add <- c(where2add, unlist(num))
    }
    if(!is.null(where2add)) T1 <- add.tips(T1, tips2add, where = where2add)
    species_list$add[add_tip[ll>0]] <- "genus"
    add_tip <- add_tip[ll==0]

    if(length(add_tip)>0){
      if("family" %in% colnames(species_list)){
        ind <- add_tip [species_list$family[add_tip] %in% T1$node.label]
        if(length(ind)>0){
          num <- match(species_list$family[ind], T1$node.label)
                  + length(T1$tip.label)
          T1 <- add.tips(T1, tips=species_list$species[ind], num)
          species_list$add[ind] <- "family"
        }
      }
    }

  }
  toDrop <- setdiff(T1$tip.label,
                    species_list$species[species_list$add!="missing"])
  T1 <- drop.tip(T1, tip = toDrop)
  attr(T1, "species_list") <- species_list
  T1
}
darunabas/bioregion documentation built on Oct. 27, 2019, 6:57 a.m.