R/preprocess_tree_and_vcf.R

Defines functions subset_tree_and_matrix root_tree build_tree read_in_tree

#' Read in tree
#'
#' If \code{tree} is a string, \code{read_in_tree} reads in the tree as an `ape
#' phylo` object. \code{read_in_tree} checks to see if the object is an `ape
#' phylo` object.
#'
#' @param tree Character or or `ape phylo`. If character: path to a tree file.
#'
#' @return tree: `ape phylo`.
#' @noRd
read_in_tree <- function(tree){
  if (is_file(tree)) {
    # LOAD IN TREE
    tree <- ape::read.tree(tree)
  }
  check_is_tree(tree)
  return(tree)
}

#' Build tree
#'
#' Builds maximum likelihood tree using \code{\link[phangorn]{optim.pml}} with a
#' GTR model of nucleotide substitution. ML R code inspired by:
#' https://www.molecularecologist.com/2016/02/quick-and-dirty-tree-building-in-r
#'
#' @param mat allele matrix where rows are variants and columns are samples
#'   (generated by \code{load_vcf_file})
#'
#' @return maximum likelihood tree generated by
#'   \code{\link[phangorn]{optim.pml}}
#' @noRd
build_tree <- function(mat){
  check_is_this_class(mat, "matrix")

  phydat <- phangorn::phyDat(t(mat))
  # phydat will be empty is there are no rows with just SNPs (A / T/ G/ C/ N)

  error_msg <- "There are not enough SNPs in the variant data to generate a tree, which is necessary when anc = TRUE. Please either supply a different VCF file or set anc = FALSE."
  distmat <- tryCatch(phangorn::dist.ml(phydat),
                                error = function(x) {
                                  error_msg
                                }
  )
  if (is_this_class(distmat, "character")) {
    stop(error_msg)
  }
  nj_tree <- phangorn::NJ(distmat)
  fit <- phangorn::pml(nj_tree, phydat)
  print("Start building tree")
  fitGTR <-
    phangorn::optim.pml(fit, model = "GTR", rearrangement = "stochastic")
  print("Finished building tree")
  return(fitGTR$tree)
}

#' Root tree on outgroup
#'
#' Roots tree based on an outgroup. If the outgroup is provided, roots based on
#' the outgroup. If the outgroup is null and the tree isn't rooted, midpoint
#' roots the tree. If the tree is rooted and no outgroup is provided, returns
#' the tree as is.
#'
#' @param tree Character or phylo. If character: path to a tree file.
#' @param outgroup Character. Tip name of outgroup in phylogeny. If NULL,
#'   midpoint root if not rooted
#'
#' @return tree: phylo. Rooted tree without outgroup.
#' @noRd
root_tree <- function(tree, outgroup = NULL){
  # READ IN TREE
  tree <- read_in_tree(tree)
  check_is_this_class(tree, "phylo")

  # IF OUTGROUP SUPPLIED
  if (!is.null(outgroup)) {
    check_is_this_class(outgroup, "character")

    # ROOT TREE ON OUTGROUP
    tree <- ape::root(tree, outgroup)
    tree <- ape::drop.tip(tree, outgroup)
    return(tree)
  # IF NO OUTGROUP AND TREE IS UNROOTED
  } else if (is.null(outgroup) & !ape::is.rooted(tree)) {
    # MIDPOINT ROOT TREE
    tree <- phangorn::midpoint(tree)
    return(tree)
  }
  return(tree)
}

#' Subset tree and matrix
#'
#' Subsets the tree and variant matrix to include only samples in both. Returns
#' the subsetted tree and matrix.
#'
#' @param tree path to tree file or ape phylo object
#' @param mat variant matrix (columns are samples, rows are variants)
#'
#' @return List with two objects:
#'   \describe{
#'     \item{tree}{\code{ape phylo}.}
#'     \item{mat}{Matrix.}
#'   }
#' @noRd
subset_tree_and_matrix <- function(tree, mat){
  tree <- read_in_tree(tree)
  check_is_this_class(mat, "matrix")
  check_is_this_class(tree, "phylo")

  # return if tree tip labels and variant matrix column names match
  if (setequal(tree$tip.label, colnames(mat))) {
    # order matrix same as tree tip labels
    mat <- mat[, tree$tip.label]
    return(list(tree = tree, mat = mat))
  }
  in_both <- intersect(tree$tip.label, colnames(mat))
  if (length(in_both) < 2) {
    stop(paste0("Not enough samples match between tree & variant matrix. ",
                "Tree tip labels and variant matrix column names must match."))
  }
  drop_from_tree <- setdiff(tree$tip.label, in_both)
  if (length(drop_from_tree) > 0) {
    tree <- ape::drop.tip(tree, drop_from_tree)
    warning(paste("These samples were dropped from the tree:",
                  paste(drop_from_tree, collapse = ", ")))
  }
  drop_from_mat <- setdiff(colnames(mat), in_both)
  if (length(drop_from_mat) > 0) {
    mat <- mat[, in_both]
    warning(paste("These samples were dropped from the matrix:",
                  paste(drop_from_mat, collapse = ", ")))
  }
  mat <- mat[, tree$tip.label]
  return(list(tree = tree, mat = mat))
}

Try the prewas package in your browser

Any scripts or data that you put into this service are public.

prewas documentation built on April 2, 2021, 5:06 p.m.