#' Reflect a matrix on its leading diagonal (useful if distance matrix has entries in upper half only)
#'
#' @param matrix square matrix
#'
#' @return square matrix of same dimensions as input
#'
#' @export
#'
#' @examples
#' reflect_on_diagonal(matrix(1:9, ncol = 3))
reflect_on_diagonal <- function(matrix) {
if(dim(matrix)[1] != dim(matrix)[2]) stop("Not a square matrix")
matrix[lower.tri(matrix)] <- t(matrix)[lower.tri(t(matrix))]
diag(matrix) <- 0
return(matrix)
}
#' Get a "phylo" object from a relatedness matrix
#'
#' @param matrix square matrix
#'
#' @return An object of class phylo.
#'
#' @export
#' @importFrom ape plot.phylo
#' @importFrom ape njs
#' @importFrom stats as.dist
#'
#' @examples
#' tree <- get_tree_from_matrix(driver_matrix)
#' library(ape)
#' plot.phylo(tree, show.tip.label = FALSE)
get_tree_from_matrix <- function(matrix) {
if(dim(matrix)[1] != dim(matrix)[2]) stop("Not a square matrix")
matrix <- matrix[, colSums(is.na(matrix)) < nrow(matrix)] # remove any NA columns
matrix <- reflect_on_diagonal(matrix) # copy upper half to lower half, reflected on the leading diagonal
return(njs(as.dist(matrix))) # create tree object
}
#' Get the ancestry of each genotype
#'
#' @param edges Dataframe comprising an adjacency matrix, with column names "Parent" and "Identity"
#'
#' @return Dataframe listing the ancestors of each genotype (one row per Identity value).
#'
#' @export
#' @importFrom ggmuller find_start_node
#' @importFrom ggmuller move_up
#'
#' @examples
#' library(dplyr)
#' phylo <- filter(driver_phylo, CellsPerSample == -1)
#' library(ggmuller)
#' edges <- get_edges(phylo)
#' ancestry(edges)
ancestry <- function(edges) {
start <- find_start_node(edges)
anc <- data.frame(V1 = start)
gens_list <- unique(edges$Identity)
for(gen in 1:length(gens_list)) {
now <- as.numeric(edges[gen, "Identity"])
res <- now
repeat {
if(move_up(edges, now) == now) break
now <- move_up(edges, now)
res <- c(now, res)
}
res <- as.data.frame(t(res))
anc <- bind_rows(anc, res)
}
max_len <- dim(anc)[2]
colnames(anc) <- paste0("Level", 1:max_len)
gens_list <- c(start, gens_list)
anc <- cbind(anc, Identity = gens_list)
rownames(anc) <- NULL
return(anc)
}
#' Find the latest-occurring mutation shared by at least a threshold proportion of cells at a specified time
#'
#' @param anc Dataframe of ancestries, as generated by "ancestry" function
#' @param pop_df Dataframe with column names including "Identity" and "Population"
#' @param threshold Number between 0 and 1; a genotype must exceed this frequency to be considered dominant
#' @param generation Generation at which to make the measurement (default NA corresponds to the final Generation)
#'
#' @return Identity of the dominant genotype.
#'
#' @export
#' @import dplyr
#'
#' @examples
#' library(ggmuller)
#' library(dplyr)
#' phylo <- filter(driver_phylo, CellsPerSample == -1)
#' pop_df <- get_population_df(phylo)
#' edges <- get_edges(phylo)
#' anc <- ancestry(edges)
#' dominant(anc, pop_df, 0.01)
#' gens_list <- unique(pop_df$Generation)
#' dominant(anc, pop_df, 0.01, gens_list[round(length(gens_list) / 2)])
dominant <- function(anc, pop_df, threshold, generation = NA) {
if(is.na(generation)) generation = max(pop_df$Generation, na.rm = TRUE)
pop_subdf <- filter(pop_df, Generation == generation) %>%
select(Identity, Population)
max_len <- dim(anc)[2] - 1
anc <- merge(anc, pop_subdf)
total_pop <- sum(anc$Population)
for(level in 1:max_len) {
col <- paste0("Level", level)
dom <- anc %>% group_by_(col) %>%
filter(sum(Population) / total_pop >= threshold) %>%
select_(col)
dom <- dom[!is.na(dom), ]
if(dim(dom)[1] == 0) break
else res <- max(dom, na.rm = TRUE)
}
return(res)
}
#' Find generations at which sweeps (or at least partial sweeps) occurred
#'
#' @param phylo Dataframe containing phylogeny
#' @param threshold Number between 0 and 1; a genotype must exceed this frequency to be considered dominant
#' @param min_pop Genotypes with populations smaller than this size will be removed before analysis (default 0)
#'
#' @return Vector of generations at which (partial) sweeps occurred.
#'
#' @export
#' @import dplyr
#' @importFrom ggmuller get_population_df
#' @importFrom ggmuller get_edges
#'
#' @examples
#' library(dplyr)
#' phylo <- filter(driver_phylo, CellsPerSample == -1)
#' sweep_times(phylo, threshold = 0.1)
sweep_times <- function(phylo, threshold, min_pop = 0) {
pop_df <- get_population_df(phylo)
pop_df <- filter(pop_df, Population >= min_pop)
edges <- get_edges(phylo)
anc <- ancestry(edges)
res <- vector()
for(generation in unique(pop_df$Generation)) {
dom <- dominant(anc, pop_df, threshold, generation)
if(generation > min(pop_df$Generation, na.rm = TRUE)) if(dom != prev_dom) res <- c(res, generation)
prev_dom <- dom
}
return(res)
}
#' Calculate how the genotype frequency distribution changes over time
#'
#' @param pop_df Dataframe with column names "Identity", "Population" and "Generation"
#' @param lag_type Either "generations" or "proportions" (default "generations")
#' @param breaks Number of breaks for determining lag (used only if lag_type = "proportions"; default 10)
#' @param lag_gens Lag in terms of generations (used only if lag_type = "generations"; default 500)
#'
#' @return For each generation g in pop_df, excluding the first generation, the output vector quantifies
#' the change in genotype frequencies compared to generation g - lag_gens (by summing the squares of the
#' differences). The length of the output sequence is the same as the number of rows in the input dataframe.
#' The first value is always zero.
#'
#' @export
#' @import dplyr
#' @importFrom stats median
#' @importFrom graphics plot
#'
#' @examples
#' library(ggmuller)
#' library(dplyr)
#' phylo <- filter(driver_phylo, CellsPerSample == -1)
#' pop_df <- get_population_df(phylo)
#' sweep_seq1 <- sweep_sequence(pop_df, lag_type = "proportions", breaks = 6)
#' lag_gens <- round(length(unique(pop_df$Generation))/6)
#' sweep_seq2 <- sweep_sequence(pop_df, lag_type = "generations", lag_gens = lag_gens)
#' identical(sweep_seq1, sweep_seq2)
#' sweep_df <- data.frame(y = sweep_seq2, x = (1:length(sweep_seq2))/length(sweep_seq2))
#' plot(y ~ x , data = sweep_df, type = "l")
sweep_sequence <- function(pop_df, lag_type = "generations", breaks = 10, lag_gens = 500) { #*
if(lag_type != "generations" & lag_type != "proportions") stop("Lag type must be 'generations' or 'proportions'")
# remove generations with total population zero:
pop_df <- pop_df %>% group_by(Generation) %>%
filter(sum(Population) > 0) %>%
ungroup()
# add frequency column:
pop_df <- pop_df %>% group_by(Generation) %>%
mutate(Frequency = Population / sum(Population)) %>%
ungroup()
# calculate lag in terms of generations:
if(lag_type == "proportions") lag <- round(length(unique(pop_df$Generation))/breaks)
else lag <- round(lag_gens / median(diff(unique(pop_df$Generation))))
lag <- max(lag, 1)
# calculate differences between frequencies separated by lag generations:
pop_df <- group_by(pop_df, Identity) %>%
mutate(diff = (Frequency - lag(Frequency, n = lag))^2)
# find sum of differences for each generation:
#pop_df <- filter(pop_df, Generation >= min(pop_df$Generation, na.rm = TRUE) + (max(pop_df$Generation, na.rm = TRUE) - min(pop_df$Generation, na.rm = TRUE))/breaks)
sum_df <- group_by(pop_df, Generation) %>%
summarise(sum_diff = sum(diff, na.rm=TRUE))
return(sum_df$sum_diff)
}
#' Get genotype frequencies in increasing order of size
#'
#' @param pop_df Dataframe with column names "Identity", "Population" and "Generation"
#' @param generation Generation at which to get frequencies (default NA corresponds to final generation)
#'
#' @return vector of genotype frequencies
#'
#' @export
#' @import dplyr
#'
#' @examples
#' library(dplyr)
#' library(ggmuller)
#' phylo <- filter(driver_phylo, CellsPerSample == -1)
#' pop_df <- get_population_df(phylo)
#' dist <- geno_dist(pop_df)
#' barplot(dist, ylim = c(0, 1))
geno_dist <- function(pop_df, generation = NA) {
if(is.na(generation)) generation = max(pop_df$Generation, na.rm = TRUE)
# add frequency column:
pop_df <- pop_df %>% group_by(Generation) %>%
mutate(Frequency = Population / sum(Population)) %>%
ungroup()
pop_subdf <- filter(pop_df, Generation == generation)
pop_subdf <- filter(pop_subdf, Frequency > 0)
dist <- sort(pop_subdf$Frequency)
return(dist)
}
#' Rao's quadratic diversity, using a binary distance measure, converted to the effective number of species
#'
#' @param value vector of trait values
#' @param freq vector of frequencies or abundances
#' @param sigma Cutoff for considering two trait values to be identical
#' @param threshold minimum trait value to include in calculation
#'
#' @return effective number of species
#'
#' @export
#'
#' @examples
#' quadratic_diversity(1:5, 1:5, 1)
quadratic_diversity <- function(value, freq, sigma, threshold = 0.1) {
df <- data.frame(x = value, y = freq)
df <- filter(df, x > threshold, y > 0)
n <- length(df$y)
if(n == 0) return(0)
df$y <- df$y / sum(df$y)
sum <- 0
for(i in 1:n) for(j in 1:n) {
d <- 1 - (abs(df$x[j] - df$x[i]) <= sigma)
sum <- sum + d * df$y[i] * df$y[j]
}
return(1 / (1 - sum))
}
#' Plot a phylogenetic tree
#'
#' @param edges dataframe comprising an adjacency matrix (and optionally including Population column)
#' @param output_dir folder in which to save the image file; if NA then plots are displayed on screen instead
#' @param output_filename name of output image file
#' @param fill fill colour of the nodes (excluding the root, which is always red)
#' @param display whether to display the tree
#'
#' @return either an image file or a plot displyed on screen
#'
#' @export
#' @import Rgraphviz
#' @import dplyr
#' @importFrom grDevices dev.off
#' @importFrom grDevices pdf
#' @importFrom graph ftM2adjM
#'
#' @examples
#' edges1 <- data.frame(Parent = c(0,1,1,2,2,3,3), Identity = 1:7, Population = c(2,10,5,10,20,10,3))
#' plot_tree(edges1)
plot_tree <- function(edges, output_dir = NA, output_filename = NA, fill = "black", display = TRUE) {
elist <- select(edges, Parent, Identity)
elist <- filter(elist, Parent != Identity)
elist <- as.data.frame(elist)
M2 <- ftM2adjM(cbind(elist$Parent, elist$Identity), edgemode = "undirected")
gg <- as(M2, "graphNEL")
if(!"Population" %in% colnames(edges)) edges$Population <- 1
edges$Population <- edges$Population / sum(edges$Population)
node_sizes <- 10*(edges$Population)
names(node_sizes) <- edges$Identity
nAttrs <- list()
attrs <- list()
nAttrs$width <- node_sizes
nAttrs$fillcolor <- c("0" = "red")
nAttrs$fontcolor <- c("0" = "red")
attrs$node$fixedsize <- FALSE
if(fill == "black") attrs$node$fontsize <- 0
attrs$node$fillcolor <- fill
attrs$node$fontcolor <- "black"
attrs$edge$arrowsize <- 0
if(!is.na(output_dir)) pdf(paste0(output_dir, output_filename, ".pdf"), width = 5, height = 5)
if(display) {
Rgraphviz::plot(gg, nodeAttrs = nAttrs, attrs = attrs)
}
if(!is.na(output_dir)) dev.off()
}
#' Calculate the inverse Simpson index
#'
#' @param pops vector of population sizes (or frequencies)
#'
#' @return The inverse Simpson index (effective number of species)
#'
#' @export
#'
#' @examples
#' inv_Simpson_index(c(rep(1, 100)))
inv_Simpson_index <- function(pops) {
pops <- pops / sum(pops)
return(1 / sum(pops*pops))
}
#' Count the number of ancestors of a node in a phylogenetic tree
#'
#' @param edges dataframe comprising an adjacency matrix
#' @param node node for which to calculate the metric
#'
#' @return Number of ancestors
#'
#' @export
#' @import ggmuller
#'
#' @examples
#' edges1 <- data.frame(Parent = c(0,1,1,2,2,3,3), Identity = 1:7, Population = c(2,10,5,10,20,10,3))
#' count_drivers(edges1, 6)
count_drivers <- function(edges, node) {
n <- 1
now <- node
while(now != "0") {
now <- move_up(edges, now)
n <- n + 1
if(n > 20) stop(paste0("Can't find the origin of node ", node))
}
return(n)
}
#' Calculate average number of ancestors per node (n) and diversity (D) for a phylogenetic tree
#'
#' @param data dataframe comprising an adjacency matrix, including Population
#'
#' @return Vector containing average number of ancestors per node (n) and diversity (D)
#'
#' @export
#'
#' @examples
#' edges1 <- data.frame(Parent = c(0,1,1,2,2,3,3), Identity = 1:7, Population = c(2,10,5,10,20,10,3))
#' metrics(edges1)
metrics <- function(data) {
D <- inv_Simpson_index(data$Population)
data$Drivers <- sapply(data$Identity, count_drivers, edges = data)
n <- sum(data$Drivers * data$Population / sum(data$Population))
return(c(n = n, D = D))
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.