#' Assert a Strata object is valid, die on failure
#'
#' @param required A character vector of required data fields
#' @param check_focal logical, if TRUE, then presence of the focal species in
#' the tree is checked
#' @param strata Strata object
is_valid_strata <- function(strata, required=NULL, check_focal=TRUE){
if(class(strata) != "Strata"){
stop("Expected Strata object, got '", class(strata), "'")
}
if(check_focal && !(strata@focal_species %in% strata@tree$tip.label)){
stop("Invalid Strata object, the focal species '", strata@focal_species, "' is not found in the tree")
}
for(field in names(strata@data)){
if(!all(names(strata@data[[field]]) %in% strata@tree$tip.label)){
stop("Invalid Strata object, data field '", field, "' contains species labels that are not in the tree")
}
}
if(!is.null(required)){
missing <- setdiff(required, names(strata@data))
if(length(missing) > 0){
msg <- "Required data field(s) [%s] are missing from Strata object"
msg <- sprintf(msg, paste0(missing, collapse=", "))
stop(msg)
}
}
}
#' Get a diverse subset of species from tree
#'
#' @param tree phylo object
#' @param n number of species to keep (if greater then the number of species in
#' the tree, the tree is returned unchanged)
#' @param weights Numeric vector with length equal to the number of species in
#' the tree. A weight of 1 will have no influence, lower than one means the
#' species is less likely to be selected.
#' @param collapse Should nodes with a single descendent be collapse?
#' @param FUN The diversification algorithm to use
#' @param ... Additional arguments passed to the diversification algorithm function
#' @return phylo object
#' @export
#' @examples
#' data(atree)
#'
#' # default
#' diverse_subtree(atree, 4)
#'
#' # do not to include the 't1' species
#' diverse_subtree(atree, 4, weights=c(t1=0))
diverse_subtree <- function(tree, n, weights=NULL, collapse=FALSE, FUN=.algo1, ...){
if(class(tree) == 'Strata'){
tree <- tree@tree
}
if(n < 1){
stop('Must select at least one species')
}
if(is.null(weights)){
weights <- rep(1, nleafs(tree))
} else {
weights <- weights[tree$tip.label]
weights[is.na(weights)] <- 1
}
n <- min(n, nleafs(tree))
lineages <- lapply(1:nleafs(tree), lineage, type="index", x=tree)
chosen <- FUN(lineages, n, weights, tree, ...)
subtree(tree, chosen, collapse=collapse)
}
.algo1 <- function(lineages, n, weights, tree, ...){
# The depth of every leaf (species) in the tree
# length(depths) == number of species
depths <- sapply(lineages, length)
# The number of times each node in the tree has been traversed
# length(k) == number of nodes (including species)
k <- rep(0, max(unlist(lineages)))
# Selected leafs
chosen <- rep(0, n)
for(i in 1:n){
w <- if(i == 1){
weights
} else {
# Calculate diversity weights: the mean number of times each ancestral
# branch has been traversed.
v <- sapply(lineages, function(x) sum(
k[x[-1]]) + # number of times each ancestral branch has been traversed
1 # constant offset to avoid division by zero
)
# Divide initial weight by the diversity score
weights / v
}
# Select the ID with the highest adjusted weight. Resolve ties by tree
# depth (preferring the deeper leaf).
chosen[i] <- setdiff(order(
w, # order first by adjusted weight
depths, # then by depth in the tree
rev(sort(tree$tip.label)), # then by tip name
decreasing=TRUE
), chosen)[1]
# increment the number of times each node has been traversed
k[lineages[[chosen[i]]][-1]] <- k[lineages[[chosen[i]]][-1]] + 1
}
chosen
}
#' Apply f to each outgroup branch ascending node id
#'
#' @param strata Strata object
#' @param f A function of a phylo object that return a phylo object
#' @param ... Additional arguments passed to f
#' @return Strata object
#' @export
strata_apply <- function(strata, f, ...){
outgroups <- strata_map(strata, f, ...)
merge_trees <- function(trees){
lapply(trees, tree2edgelist) %>%
do.call(what=rbind) %>%
unique %>%
edgelist_to_phylo %>%
ape::collapse.singles()
}
merge_data <- function(x,y){
if(identical(x, list())){
return(y)
}
stopifnot(identical(names(x), names(y)))
z <- list()
for(field in names(x)){
z[[field]] <- c(x[[field]], y[[field]])
}
z
}
if(all(sapply(outgroups, class) == 'phylo')){
strata@tree <- merge_trees(outgroups)
strata@data <- lapply(strata@data, function(x){
x[strata@tree$tip.label]
})
strata
} else if(all(sapply(outgroups, class) == 'Strata')) {
strata@tree <- lapply(outgroups, function(x) x@tree) %>% merge_trees
strata@data <- lapply(outgroups, function(x) x@data) %>% Reduce(f=merge_data, init=list())
strata
} else {
stop("'f' in 'strata_apply' must map to a tree of Strata object")
}
}
#' Apply f to each outgroup branch ascending node id, returning a list
#'
#' @param strata Strata object
#' @param f A function of a phylo object that may return anything
#' @param ... Additional arguments passed to f
#' @return A named list, with names corresponding to phylostrata
#' @export
#' @examples
#' data(arabidopsis_strata)
#' # count the number of species in each stratum
#' unlist(strata_map(arabidopsis_strata, nleafs))
strata_map <- function(strata, f, ...){
is_valid_strata(strata)
# Get list of internal IDs
lin <- lineage(strata@tree, strata@focal_species, type='name')[-1]
outgroups <- lapply(lin, function(ancestor){
sisters(strata@tree, ancestor, type='index') %>%
subtree(x=strata, collapse=FALSE, descend=TRUE, type='index') %>%
f(...)
})
names(outgroups) <- tree_names(strata@tree)[parent(strata@tree, lin, type='index')]
outgroups <- outgroups[!sapply(outgroups, is.null)]
outgroups[[strata@focal_species]] <- f(subtree(strata, strata@focal_species, type='name'), ...)
outgroups
}
#' Add an id to a representative list
#'
#' @param x A Strata or phylo object (all IDs MUST be NCBI taxonomy IDs)
#' @param taxa NCBI taxon ids to add
#' @param ... Additional arguments (not used currently)
#' @return Strata or phylo object
#' @export
#' @name add_taxa
add_taxa <- function(x, taxa, ...){
UseMethod('add_taxa', x)
}
#' @rdname add_taxa
#' @export
add_taxa.Strata <- function(x, taxa, ...){
# TODO: assert that all IDs are NCBI taxonomy IDs
x@tree <- unique(c(taxa, tree_names(x@tree))) %>%
taxizedb::classification() %>%
lineages_to_phylo
x
}
#' @rdname add_taxa
#' @export
add_taxa.phylo <- function(x, taxa, ...){
unique(c(taxa, tree_names(x))) %>%
taxizedb::classification() %>%
lineages_to_phylo
}
#' Infer homology inference based on a hard e-value threshold
#'
#' @param threshold An e-value threshold
#' @param ... Unused
#' @return A function of a dataframe that has the column 'evalue', the function
#' @export
classify_by_evalue <- function(threshold, ...){
function(x){
!is.na(x$evalue) & (x$evalue < threshold)
}
}
.evalue2pvalue <- function(x) { 1 - exp(-1 * x) }
#' Infer homology inference based on a hard p-value threshold
#'
#' @param threshold An e-value threshold
#' @param ... Unused
#' @return A function of a dataframe with column 'evalue'
#' @export
classify_by_pvalue <- function(threshold, ...){
function(x){
p <- .evalue2pvalue(x$evalue)
!is.na(p) & (p < threshold)
}
}
#' Infer homology inference based on an adjusted p-value threshold
#'
#' Here the E-values of sequence similarity against each target species is
#' multiplied by the total number of target species. This is equivalent to what
#' BLAST does in the special case where all proteomes are of equal length.
#'
#' @param threshold An e-value threshold (before adjustment)
#' @param ... Unused
#' @return A function of a dataframe with columns 'evalue' and 'staxid'
#' @export
classify_by_adjusted_pvalue <- function(threshold, ...){
function(x){
adj_threshold <- threshold / length(unique(x$staxid))
p <- .evalue2pvalue(x$evalue)
!is.na(p) & (p < adj_threshold)
}
}
#' Get an ordered factor mapping MRCA taxon IDs (as vector names) to names
#'
#' @param hittable A data.frame with the columns 'ps' and 'mrca'
#' @export
get_mrca_names <- function(hittable){
hittable %>%
dplyr::select(.data$ps, .data$mrca) %>%
dplyr::distinct() %>%
dplyr::arrange(.data$ps) %>%
{ .$mrca } %>%
# convert to name if possible
partial_id_to_name %>%
{ factor(., levels=unname(.)) }
}
#' Get the phylostratum of each query gene
#'
#' @param hittable data.frame with the columns 'qseqid', 'mrca', 'ps', and
#' whichever columns arerequired by 'classifier'
#' @param classifier A function to infer homology between the query gene and a specific subject
#' @param strata_names A factor of MRCA names (scientific names by default, but
#' you can set your own labels) with levels ordered by phylostrata. This must
#' be a named factor, with names corresponding to taxon IDs. It is used to make
#' the \code{mrca_name} column. It will be used mostly in plots and reports;
#' internally the taxon ids are used.
#' @export
stratify <- function(
hittable,
classifier = classify_by_adjusted_pvalue(0.001),
strata_names = get_mrca_names(hittable)
){
orphan_ps <- max(hittable$ps)
orphan_mrca <- levels(strata_names)[length(strata_names)]
hittable[classifier(hittable), ] %>%
dplyr::select(.data$qseqid, .data$mrca, .data$ps) %>%
dplyr::group_by(.data$qseqid) %>%
dplyr::filter(.data$ps == min(.data$ps)) %>%
dplyr::ungroup() %>%
{
strata <- .
rbind(
strata,
# add in queries that have no hit against anything
hittable[!(hittable$qseqid %in% strata$qseqid), ] %>%
dplyr::select(.data$qseqid, .data$mrca, .data$ps) %>%
# take just one entry for each query
dplyr::group_by(.data$qseqid) %>%
head(1) %>%
dplyr::ungroup() %>%
# assign it orphan status
dplyr::mutate(ps = orphan_ps, mrca = orphan_mrca)
)
} %>%
dplyr::distinct() %>%
dplyr::mutate(mrca_name = strata_names[.data$ps])
}
#' Make a Strata object from a list of taxon ids
#'
#'
#' @param focal_species character: the taxonomy ID of the species that was used in the BLAST
#' @param taxids character: list of NCBI taxonomy IDs
#' @param clean logical: Should leafs with descendents be removed? This can occur when
#' both a species and a descendent subspecies are in the lineage set.
#' @return Strata object
#' @export
strata_from_taxids <- function(focal_species, taxids, clean=TRUE){
lineages <- taxizedb::classification(taxids)
tree <- lineages_to_phylo(lineages, clean=clean)
if(! setequal(taxids, tree$tip.label)){
warning("The following subspecies taxa where removed: ",
setdiff(taxids, tree$tip.label))
blastfiles <- blastfiles[taxids %in% tree$tip.label]
taxids <- tree$tip.label
}
Strata(focal_species=focal_species, tree=tree)
}
#' Given a strata table, remove given strata
#'
#' The genes in a stratum that is removed will be reassigned to the parent
#' stratum. If the stratum is the root stratum, the genes will instead be
#' reassigned to the child stratum.
#'
#' Note that removing strata will result in non-monophyletic branches.
#'
#' @param strata_table data.frame with the factored column 'mrca_name'
#' @param strata_names character vector of strata to drop
#' @export
prune_phylostrata <- function(strata_table, strata_names){
if(!('mrca_name' %in% names(strata_table))){
stop("strata_table must have the column mrca_name")
}
if(!is.factor(strata_table$mrca_name)){
stop("strata_table$mrca_name must be a factor ordered by descending age")
}
mrca_levels <- levels(strata_table$mrca_name)
id2name_map <- dplyr::distinct(strata_table[, c('mrca', 'mrca_name')])
indices <- as.integer(strata_table$mrca_name)
indices_to_drop <- intersect(strata_names, mrca_levels) %>%
{which(mrca_levels %in% .)} %>%
sort(decreasing=TRUE)
for(i in indices_to_drop){
indices <- ifelse(indices >= i, indices-1, indices)
mrca_levels <- mrca_levels[-i]
}
# TODO: I am making the mistake of storing the same information several
# times: ps, mrca, and mrca_name. Then everytime I change one I have to
# change them all. I should add a phylostratigrph object that handles all
# this for me.
strata_table$mrca_name <- ifelse(indices > 0, mrca_levels[indices], mrca_levels[1])
strata_table$mrca <- NULL
strata_table <- merge(strata_table, id2name_map)
strata_table$mrca_name <- factor(strata_table$mrca_name, levels=mrca_levels)
strata_table$ps <- as.integer(strata_table$mrca_name)
strata_table
}
#' Standardize two or more phylostrata tables
#'
#' Each table is subset with the union of sequence IDs. Also all strata that
#' are not shared between all input tables are dropped, with the genes
#' reassigned to older strata as needed.
#'
#' @param strata_tables phylostrata tables, all must the column 'mrca_name'
#' @return list of standardized phylostrata tables
#' @export
standardize_strata <- function(strata_tables){
# Get intersection of all sequence IDs
common_ids <- Reduce(
f = function(x,y){intersect(x$qseqid, y$qseqid)},
x = strata_tables[-1],
init = strata_tables[[1]]
)
# Get the phylostrata common to all studies
common_strata <- Reduce(
f = function(x,y){intersect(levels(x$mrca_name), levels(y$mrca_name))},
x = strata_tables[-1],
init = strata_tables[[1]]
)
tuplify(strata_tables) %>%
lapply(function(x){
d <- x$value
d$group <- x$name
d <- subset(d, d$qseqid %in% common_ids)
d <- prune_phylostrata(d, setdiff(levels(d$mrca_name), common_strata))
x$value <- d
x
}) %>%
untuplify
}
#' Convert strata data, tip, and node names to and from NCBI taxonomy IDs
#'
#' @param strata Strata object
#' @param target What to convert, may be 'tip', 'node', or 'all'
#' @param to What to convert to, may be 'id' or 'name'
#' @param FUN a custom function to convert names
#' @return Strata object with new names
#' @export
strata_convert <- function(strata, target='tip', to='id', FUN=NULL){
if(is.null(FUN)){
FUN <- switch(
to,
id = taxizedb::name2taxid,
name = taxizedb::taxid2name,
stop("Illegal 'to' value: must be either 'id' or 'name'")
)
}
if(target == 'tip' || target == 'all'){
strata@tree$tip.label <- FUN(strata@tree$tip.label)
for(item in names(strata@data)){
names(strata@data[[item]]) <- FUN(names(strata@data[[item]]))
}
strata@focal_species <- FUN(strata@focal_species)
}
if(target == 'node' || target == 'all'){
strata@tree$node.label <- FUN(strata@tree$node.label)
}
strata
}
#' Convert vector of mixed IDs and names to a vector of names
#'
#' @param x character vector
#' @return Vector of names
#' @export
partial_id_to_name <- function(x){
is_id <- grepl('^[0-9]+$', x, perl=TRUE)
x[is_id] <- taxizedb::taxid2name(x[is_id])
x
}
#' Get a phylostrata map from a Strata object
#'
#' @param strata Strata object
#' @return data.frame with columns 'species', 'mrca', an 'ps', where 'mrca' is
#' a factor of phylostrata with levels ordered by age
#' @export
get_phylostrata_map <- function(strata){
map <- strata_map(strata, leafs) %>%
tuplify %>%
lapply(function(x){
tibble::data_frame(
species = x$value,
mrca = rep(x$name, length(x$value)),
ps = x$position
)
}) %>%
do.call(what=rbind)
mrca_levels <- map %>%
dplyr::select(.data$mrca, .data$ps) %>%
dplyr::distinct() %>%
dplyr::arrange(.data$ps) %>% {.$mrca}
map$mrca <- factor(map$mrca, levels=mrca_levels)
map
}
#' Sort strata relative to focal species
#'
#' @param strata Strata object
#' @return Strata object with reorded tips
#' @export
sort_strata <- function(strata){
is_valid_strata(strata)
strata@tree <- make_tree_relative_to(strata@tree, strata@focal_species)
strata
}
#' Rename a species in a Strata object
#'
#' @param strata Strata object
#' @param old_name character vector of names that are currently in the Strata object
#' @param new_name character vector of names to replace the old_name vector
#' @return Strata object with the same number of species
#' @examples
#' \dontrun{
#' data(saccharomyces)
#' x <- rename_species(saccharomyces, "Saccharomyces_arboricola", "Sa")
#' is_valid_strata(x)
#'
#' x <- rename_species(saccharomyces,
#' c("Saccharomyces_arboricola", "Saccharomyces_cerevisiae"), c("Sa", "Sc"))
#' is_valid_strata(x)
#' }
rename_species <- function(strata, old_name, new_name){
is_valid_strata(strata)
if(length(old_name) != length(new_name)){
stop("old_name must have the same length as the new_name")
}
if(any(!(old_name %in% strata@tree$tip.label))){
name <- which(!(old_name %in% strata@tree$tip.label))
stop(old_name[name]," is not in the tree")
}
for (i in 1:length(old_name)){
old <- which(strata@tree$tip.label %in% old_name[i])
strata@tree$tip.label[old] <- new_name[i]
for (j in names(strata@data)){
names(strata@data[[j]])[old] <- new_name[i]
}
}
strata
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.