R/sim_multilocus.R

Defines functions get_child_gts retrieve_child_genetrees get_parent_gts retrieve_parent_genetrees get_tipnames get_tip_labels_tree_list collapse_clade collapse_locus_subtree get_loci sim_mlc sim_multilocus_coal

Documented in collapse_clade collapse_locus_subtree get_child_gts get_loci get_parent_gts get_tip_labels_tree_list get_tipnames retrieve_child_genetrees retrieve_parent_genetrees sim_mlc sim_multilocus_coal

#' Simulates multi-locus coalescent on a given locus tree
#'
#' @description separates a locus tree into loci broken up by
#' duplications  and simulates the coalescent on each loci.
#'
#' @param locus_tree a locus tree from `sim_ltBD` of class `phy`
#' @param effective_pop_size the effective population size
#' @param generation_time unit time per generation (default 1 year per generation)
#' @param mutation_rate number of mutations per unit time
#' @param num_reps number of coalescent simulations per locus
#' @return A list of list of gene trees of length `num_reps` simulated along each locus.
#' The first member of the list is the parent tree, all others are child trees
#'
#' @details
#' This simulation follows the algorithm given in Rasmussen and Kellis 2012.
#' The locus tree is scaled into coalescent units prior to being used.
#' The generation_time parameter default assumes 1 generation
#' per year if the units of the tree are in millions of years.
#' The mutation_rate parameter is by default set to 1e-6 mutations per year
#' (this is totally arbitrary).
#' Also note that the return type is a list of many trees so for
#' sufficiently complicated locus trees with `num_reps` set to a larger
#' value may slow things considerably so use with caution.
#'
#' @examples
#' # first simulate a species tree
#' mu <- 0.5
#' lambda <- 1.0
#' nt <- 6
#' tr <- sim_stBD(sbr = lambda, sdr = mu, numbsim = 1, n_tips = nt)
#' # for a locus tree with 100 genes sampled per locus tree
#' gene_br <- 0.1
#' gene_dr <- 0.02
#' transfer_rate <- 0.2
#' locus_tree <- sim_ltBD(species_tree = tr[[1]],
#'                   gbr = gene_br,
#'                   gdr = gene_dr,
#'                   lgtr = transfer_rate,
#'                   num_loci = 1)
#' effect_popsize <- 1e6
#' gene_tree_obj <- sim_mlc(locus_tree[[1]],
#'                                       effect_popsize,
#'                                       num_reps = 20)
sim_multilocus_coal <- function(locus_tree,
                                effective_pop_size,
                                generation_time = 1,
                                mutation_rate = 1e-6,
                                num_reps) {
    warning("please use sim_mlc() instead of sim_multilocus_coal()", call. = FALSE)
    sim_mlc(locus_tree, effective_pop_size, generation_time, mutation_rate, num_reps)
}
#' @export
#' @rdname sim_multilocus_coal
sim_mlc <- function(locus_tree,
                    effective_pop_size,
                    generation_time = 1,
                    mutation_rate = 1e-6,
                    num_reps) {
    if(effective_pop_size <= 0) {
        stop("'effective_pop_size' must be a strictly positive number")
    }
    if(generation_time <= 0) {
        stop("'generation_time' must be a strictly positive number")
    }
    if(num_reps < 1) {
        stop("'effective_pop_size' must be  at leat 1")
    }
    if(class(locus_tree) != "phylo") {
        stop("'locus_tree' must be an object of class 'phylo")
    }
    # first rescale the locus tree to be not in time but in coalescent units
    # tree is in time units so to get to generations we want

    ipp <- 1

    # split tree into subtrees at each duplication
    locus_trees_by_dup <- get_loci(locus_tree)
    if(class(locus_trees_by_dup) == "phylo") {
        message("This is a locus tree with only one loci")
        big_tree <- treeducken::sim_msc(locus_tree,
                                                      ne = effective_pop_size,
                                                      mutation_rate = mutation_rate,
                                                      generation_time = generation_time,
                                                      num_sampled_individuals = ipp,
                                                      rescale = TRUE,
                                                      num_genes = num_reps)
        gt_list <- list("parent_tree" = big_tree[[1]]$gene.trees, "child_trees" = NULL)
        class(gt_list) <- "mlc_genetrees"
        return(gt_list)
    }
    mlc_df <- list(length = length(locus_trees_by_dup))
    # loop through the locus trees simulate using MSC then prune the loci
    # that have that locus as a subtree to one tip (to simulate that
    # that coalescent
    # event has occurred and won't affect the sims)xz
    little_trees <- list(length = length(num_reps))
    for(i in seq_len(length(locus_trees_by_dup) - 1)) {
        # add artificial root for treeducken

        if(is.null(locus_trees_by_dup[[i]]$root.edge)) {
            locus_trees_by_dup[[i]]$root.edge <- 0.0
        }
        mlc_df[[i]]  <- treeducken::sim_msc(
            locus_trees_by_dup[[i]],
            ne = effective_pop_size,
            mutation_rate = mutation_rate,
            generation_time = generation_time,
            num_sampled_individuals = ipp,
            rescale = TRUE,
            num_genes = num_reps)
        little_trees[[i]] <- mlc_df[[i]][[1]]$gene.trees
        # collapse those loci on all
        locus_trees_by_dup <- collapse_clade(
            locus_trees_by_dup,
            locus_trees_by_dup)
    }
    locus_trees_by_dup[[length(locus_trees_by_dup)]]$root.edge <- locus_tree$root.edge
    big_tree <- treeducken::sim_msc(
        locus_trees_by_dup[[length(locus_trees_by_dup)]],
        ne = effective_pop_size,
        mutation_rate = mutation_rate,
        generation_time = generation_time,
        num_sampled_individuals = ipp,
        num_genes = num_reps)
    gt_list <- list("parent_tree" = big_tree[[1]]$gene.trees, "child_trees" = little_trees)
    class(gt_list) <- "mlc_genetrees"
    gt_list
}
#' Separate a locus tree into loci
#'
#' @details This separates loci based on node labels "D[A-Z]". This is intended
#' to be used internally, but should work with other trees where duplications
#' are marked similarly.
#'
#' @param locus_tree tree of type `phy`
#' @return list of subtrees (with `locus_tree` at the end`)
#' @examples
#' # first simulate a species tree
#' mu <- 0.5
#' lambda <- 1.0
#' nt <- 6
#' tr <- sim_stBD(sbr = lambda, sdr = mu, numbsim = 1, n_tips = nt)
#' # for a locus tree with 100 genes sampled per locus tree
#' gene_br <- 0.1
#' gene_dr <- 0.02
#' transfer_rate <- 0.0
#' locus_tree <- sim_ltBD(species_tree = tr[[1]],
#'                   gbr = gene_br,
#'                   gdr = gene_dr,
#'                   lgtr = transfer_rate,
#'                   num_loci = 1)
#' locus_tree_subtrees <- get_loci(locus_tree[[1]])
#' @export
get_loci <- function(locus_tree) {
    if(class(locus_tree) != "phylo") {
        stop("'locus_tree' must be an object of class 'phylo")
    }
    if(!(any(grep("D[A-Z]", locus_tree$node.label)))) {
        return(locus_tree)
    }

    # split tree into subtrees at each duplication
    loc_trees_subtree_all <- ape::subtrees(locus_tree)
    ul_trees <- unlist(loc_trees_subtree_all, recursive = FALSE)
    node_labels <- ul_trees[which(names(ul_trees) == "node.label")]
    indices_of_dup <- which(node_labels$node.label != "")
    rev(loc_trees_subtree_all[indices_of_dup])

}
#' Collapse a clade into a single tip
#'
#' @details Takes a clade as input and collapses that clade to one tip
#' in all trees in `list_of_subtrees`.
#' @param list_of_subtrees a list of type `multiPhylo`
#' @param locus_to_collapse a subtree found within a subst of `list_of_subtrees`
#' @return multiPhy (list of trees) with the subtree in question collapse
#' @examples
#' lambda <- 1.0
#' mu <- 0.2
#' nt <- 10
#' trees <- sim_stBD(sbr = lambda, sdr = mu, numbsim = 1, n_tips = nt)
#' subtrees_of_trees <- ape::subtrees(trees[[1]])
#' st_of_interest <- subtrees_of_trees[[1]]
#' collapse_st_of_interest <- collapse_locus_subtree(trees, st_of_interest)
#' @export
collapse_locus_subtree <- function(list_of_subtrees,
                                   locus_to_collapse) {
   warning("please use collapse_clade() instead of collapse_locus_subtree()", call. = FALSE)
   collapse_clade(list_of_subtrees, locus_to_collapse)
}
#' @export
#' @rdname collapse_locus_subtree
collapse_clade <- function(list_of_subtrees,
                           locus_to_collapse) {
    tip_labels_subtrees <- get_tipnames(list_of_subtrees)
    tips_to_remove <- locus_to_collapse$tip.label[-1]
    # figure out which loci have those tips

    tree_indices_to_drop <- lapply(tip_labels_subtrees,
                                   function(x) x %in% tips_to_remove)


    tree_indices_to_keep <- which(
        unlist(
            lapply(tree_indices_to_drop, all))
        , useNames = FALSE)
    names(tree_indices_to_keep) <- NULL
    tree_indices_to_drop <- which(
        unlist(
            lapply(tree_indices_to_drop, any
                   , useNames = FALSE)))
    names(tree_indices_to_drop) <- NULL
    drop_from_drop <- intersect(tree_indices_to_drop, tree_indices_to_keep)
    if(length(drop_from_drop) != 0)
        tree_indices_to_drop <- tree_indices_to_drop[-drop_from_drop]
    trees_to_prune <- list_of_subtrees[tree_indices_to_drop]
    pruned_trees <- lapply(trees_to_prune,
                           function(x) ape::drop.tip(x,
                                                     tip = tips_to_remove,
                                                     rooted = TRUE))
    list_of_subtrees[tree_indices_to_drop] <- pruned_trees
    list_of_subtrees
}
#' Get all the tip labels of a `multiPhylo` object
#' @details Retrieves the member "tip.label" from each tree in multi_tree
#' @param multi_tree an object of class `multiPhylo`
#' @return a list of the same length as `multi_tree` with only the tip labels
#' @examples
#' mu <- 0.5
#' lambda <- 1.0
#' nt <- 6
#' tr <- sim_stBD(sbr = lambda, sdr = mu, numbsim = 5, n_tips = nt)
#' tips_of_tr <- get_tip_labels_tree_list(tr)
get_tip_labels_tree_list <- function(multi_tree) {
    warning("please use get_tipnames() instead of get_tip_labels_tree_list()", call. = FALSE)
    get_tipnames(multi_tree)
}
#' @export
#' @rdname get_tip_labels_tree_list
get_tipnames <- function(multi_tree) {

    ul_multi_tree <- unlist(multi_tree, recursive = FALSE)
    ul_multi_tree[which(names(ul_multi_tree) == "tip.label")]
}


#' Retrieve all gene trees of the parent tree from a list generated from sim_mlc
#' @param gene_tree_list A list of length 2: "parent_tree" and "child_trees" both of which are of class "multiPhylo"
#' @return A `multiPhylo` object of only the gene trees generated on the parent subtree
#' @examples
#' #' # first simulate a species tree
#' mu <- 0.5
#' lambda <- 1.0
#' nt <- 6
#' tr <- sim_stBD(sbr = lambda, sdr = mu, numbsim = 1, n_tips = nt)
#' # for a locus tree with 100 genes sampled per locus tree
#' gene_br <- 0.1
#' gene_dr <- 0.02
#' transfer_rate <- 0.2
#' locus_tree <- sim_ltBD(species_tree = tr[[1]],
#'                   gbr = gene_br,
#'                   gdr = gene_dr,
#'                   lgtr = transfer_rate,
#'                   num_loci = 1)
#' effect_popsize <- 1e6
#' gene_tree_obj <- sim_mlc(locus_tree[[1]],
#'                                       effect_popsize,
#'                                       num_reps = 2)
#' parent_trees <- retrieve_parent_genetrees(gene_tree_obj)
retrieve_parent_genetrees <- function(gene_tree_list) {
    warning("please use get_parent_gts() instead of retrieve_parent_gts()", call. = FALSE)
    get_parent_gts(gene_tree_list)
}
#' @export
#' @rdname retrieve_parent_genetrees
get_parent_gts <- function(gene_tree_list) {
    if(class(gene_tree_list) != "mlc_genetrees")
        stop("list is not in the correct format, did you not use `sim_mlc`?")
    return(gene_tree_list$parent_tree)
}
#' Retrieves the gene trees of the child subtrees
#' @param gene_tree_list A list of length 2: "parent_tree" and "child_trees" both of which are of class "multiPhylo"
#' @describeIn retrieve_parent_genetrees Returns a list of objects of class `multiPhylo`
retrieve_child_genetrees <- function(gene_tree_list) {
    warning("please use get_child_gts() instead of ", call. = FALSE )
}
#' @export
#' @rdname retrieve_parent_genetrees
get_child_gts <- function(gene_tree_list) {
    if(class(gene_tree_list) != "mlc_genetrees")
        stop("list is not in the correct format, did you not use `sim_mlc`?")
    return(gene_tree_list$child_trees)
}

Try the treeducken package in your browser

Any scripts or data that you put into this service are public.

treeducken documentation built on March 3, 2021, 1:11 a.m.