R/sweepmetrics.R

Defines functions metrics count_drivers inv_Simpson_index plot_tree quadratic_diversity geno_dist sweep_sequence sweep_times dominant ancestry get_tree_from_matrix reflect_on_diagonal

Documented in ancestry count_drivers dominant geno_dist get_tree_from_matrix inv_Simpson_index metrics plot_tree quadratic_diversity reflect_on_diagonal sweep_sequence sweep_times

#' Reflect a matrix on its leading diagonal (useful if distance matrix has entries in upper half only)
#' 
#' @param matrix square matrix
#' 
#' @return square matrix of same dimensions as input
#' 
#' @export
#' 
#' @examples
#' reflect_on_diagonal(matrix(1:9, ncol = 3))
reflect_on_diagonal <- function(matrix) {
  if(dim(matrix)[1] != dim(matrix)[2]) stop("Not a square matrix")
  matrix[lower.tri(matrix)] <- t(matrix)[lower.tri(t(matrix))]
  diag(matrix) <- 0
  return(matrix)
}

#' Get a "phylo" object from a relatedness matrix
#' 
#' @param matrix square matrix
#' 
#' @return An object of class phylo.
#' 
#' @export
#' @importFrom ape plot.phylo
#' @importFrom ape njs
#' @importFrom stats as.dist
#' 
#' @examples
#' tree <- get_tree_from_matrix(driver_matrix)
#' library(ape)
#' plot.phylo(tree, show.tip.label = FALSE)
get_tree_from_matrix <- function(matrix) {
  if(dim(matrix)[1] != dim(matrix)[2]) stop("Not a square matrix")
  matrix <- matrix[, colSums(is.na(matrix)) < nrow(matrix)] # remove any NA columns
  matrix <- reflect_on_diagonal(matrix) # copy upper half to lower half, reflected on the leading diagonal
  return(njs(as.dist(matrix))) # create tree object
}

#' Get the ancestry of each genotype
#' 
#' @param edges Dataframe comprising an adjacency matrix, with column names "Parent" and "Identity"
#' 
#' @return Dataframe listing the ancestors of each genotype (one row per Identity value).
#' 
#' @export
#' @importFrom ggmuller find_start_node
#' @importFrom ggmuller move_up
#' 
#' @examples
#' library(dplyr)
#' phylo <- filter(driver_phylo, CellsPerSample == -1)
#' library(ggmuller)
#' edges <- get_edges(phylo)
#' ancestry(edges)
ancestry <- function(edges) {
  start <- find_start_node(edges)
  anc <- data.frame(V1 = start)
  gens_list <- unique(edges$Identity)
  for(gen in 1:length(gens_list)) {
    now <- as.numeric(edges[gen, "Identity"])
    res <- now
    repeat {
      if(move_up(edges, now) == now) break
      now <- move_up(edges, now)
      res <- c(now, res)
    }
    res <- as.data.frame(t(res))
    anc <- bind_rows(anc, res)
  }
  max_len <- dim(anc)[2]
  colnames(anc) <- paste0("Level", 1:max_len)
  gens_list <- c(start, gens_list)
  anc <- cbind(anc, Identity = gens_list)
  rownames(anc) <- NULL
  return(anc)
}

#' Find the latest-occurring mutation shared by at least a threshold proportion of cells at a specified time
#' 
#' @param anc Dataframe of ancestries, as generated by "ancestry" function
#' @param pop_df Dataframe with column names including "Identity" and "Population"
#' @param threshold Number between 0 and 1; a genotype must exceed this frequency to be considered dominant
#' @param generation Generation at which to make the measurement (default NA corresponds to the final Generation)
#' 
#' @return Identity of the dominant genotype.
#' 
#' @export
#' @import dplyr
#' 
#' @examples
#' library(ggmuller)
#' library(dplyr)
#' phylo <- filter(driver_phylo, CellsPerSample == -1)
#' pop_df <- get_population_df(phylo)
#' edges <- get_edges(phylo)
#' anc <- ancestry(edges)
#' dominant(anc, pop_df, 0.01)
#' gens_list <- unique(pop_df$Generation)
#' dominant(anc, pop_df, 0.01, gens_list[round(length(gens_list) / 2)])
dominant <- function(anc, pop_df, threshold, generation = NA) {
  if(is.na(generation)) generation = max(pop_df$Generation, na.rm = TRUE)
  pop_subdf <- filter(pop_df, Generation == generation) %>% 
      select(Identity, Population)
  max_len <- dim(anc)[2] - 1
  anc <- merge(anc, pop_subdf)
  total_pop <- sum(anc$Population)
  for(level in 1:max_len) {
    col <- paste0("Level", level)
    dom <- anc %>% group_by_(col) %>% 
      filter(sum(Population) / total_pop >= threshold) %>% 
      select_(col)
    dom <- dom[!is.na(dom), ]
    if(dim(dom)[1] == 0) break
    else res <- max(dom, na.rm = TRUE)
  }
  return(res)
}

#' Find generations at which sweeps (or at least partial sweeps) occurred
#' 
#' @param phylo Dataframe containing phylogeny
#' @param threshold Number between 0 and 1; a genotype must exceed this frequency to be considered dominant
#' @param min_pop Genotypes with populations smaller than this size will be removed before analysis (default 0)
#' 
#' @return Vector of generations at which (partial) sweeps occurred.
#' 
#' @export
#' @import dplyr
#' @importFrom ggmuller get_population_df
#' @importFrom ggmuller get_edges
#' 
#' @examples
#' library(dplyr)
#' phylo <- filter(driver_phylo, CellsPerSample == -1)
#' sweep_times(phylo, threshold = 0.1)
sweep_times <- function(phylo, threshold, min_pop = 0) {
  pop_df <- get_population_df(phylo)
  pop_df <- filter(pop_df, Population >= min_pop)
  edges <- get_edges(phylo)
  anc <- ancestry(edges)
  res <- vector()
  for(generation in unique(pop_df$Generation)) {
    dom <- dominant(anc, pop_df, threshold, generation)
    if(generation > min(pop_df$Generation, na.rm = TRUE)) if(dom != prev_dom) res <- c(res, generation)
    prev_dom <- dom
  }
  return(res)
}

#' Calculate how the genotype frequency distribution changes over time
#' 
#' @param pop_df Dataframe with column names "Identity", "Population" and "Generation"
#' @param lag_type Either "generations" or "proportions" (default "generations")
#' @param breaks Number of breaks for determining lag (used only if lag_type = "proportions"; default 10)
#' @param lag_gens Lag in terms of generations (used only if lag_type = "generations"; default 500)
#' 
#' @return For each generation g in pop_df, excluding the first generation, the output vector quantifies 
#' the change in genotype frequencies compared to generation g - lag_gens (by summing the squares of the 
#' differences). The length of the output sequence is the same as the number of rows in the input dataframe.
#' The first value is always zero.
#' 
#' @export
#' @import dplyr
#' @importFrom stats median
#' @importFrom graphics plot
#' 
#' @examples
#' library(ggmuller)
#' library(dplyr)
#' phylo <- filter(driver_phylo, CellsPerSample == -1)
#' pop_df <- get_population_df(phylo)
#' sweep_seq1 <- sweep_sequence(pop_df, lag_type = "proportions", breaks = 6)
#' lag_gens <- round(length(unique(pop_df$Generation))/6)
#' sweep_seq2 <- sweep_sequence(pop_df, lag_type = "generations", lag_gens = lag_gens)
#' identical(sweep_seq1, sweep_seq2)
#' sweep_df <- data.frame(y = sweep_seq2, x = (1:length(sweep_seq2))/length(sweep_seq2))
#' plot(y ~ x , data = sweep_df, type = "l")
sweep_sequence <- function(pop_df, lag_type = "generations", breaks = 10, lag_gens = 500) { #*
  if(lag_type != "generations" & lag_type != "proportions") stop("Lag type must be 'generations' or 'proportions'")
  # remove generations with total population zero:
  pop_df <- pop_df %>% group_by(Generation) %>% 
    filter(sum(Population) > 0) %>% 
    ungroup()
  # add frequency column:
  pop_df <- pop_df %>% group_by(Generation) %>% 
    mutate(Frequency = Population / sum(Population)) %>% 
    ungroup()
  # calculate lag in terms of generations:
  if(lag_type == "proportions") lag <- round(length(unique(pop_df$Generation))/breaks)
  else lag <- round(lag_gens / median(diff(unique(pop_df$Generation))))
  lag <- max(lag, 1)
  # calculate differences between frequencies separated by lag generations:
  pop_df <- group_by(pop_df, Identity) %>% 
    mutate(diff = (Frequency - lag(Frequency, n = lag))^2)
  # find sum of differences for each generation:
  #pop_df <- filter(pop_df, Generation >= min(pop_df$Generation, na.rm = TRUE) + (max(pop_df$Generation, na.rm = TRUE) - min(pop_df$Generation, na.rm = TRUE))/breaks)
  sum_df <- group_by(pop_df, Generation) %>% 
    summarise(sum_diff = sum(diff, na.rm=TRUE))
  return(sum_df$sum_diff)
}

#' Get genotype frequencies in increasing order of size
#' 
#' @param pop_df Dataframe with column names "Identity", "Population" and "Generation"
#' @param generation Generation at which to get frequencies (default NA corresponds to final generation)
#' 
#' @return vector of genotype frequencies
#' 
#' @export
#' @import dplyr
#' 
#' @examples
#' library(dplyr)
#' library(ggmuller)
#' phylo <- filter(driver_phylo, CellsPerSample == -1)
#' pop_df <- get_population_df(phylo)
#' dist <- geno_dist(pop_df)
#' barplot(dist, ylim = c(0, 1))
geno_dist <- function(pop_df, generation = NA) {
  if(is.na(generation)) generation = max(pop_df$Generation, na.rm = TRUE)
  # add frequency column:
  pop_df <- pop_df %>% group_by(Generation) %>% 
    mutate(Frequency = Population / sum(Population)) %>% 
    ungroup()
  pop_subdf <- filter(pop_df, Generation == generation)
  pop_subdf <- filter(pop_subdf, Frequency > 0)
  dist <- sort(pop_subdf$Frequency)
  return(dist)
}

#' Rao's quadratic diversity, using a binary distance measure, converted to the effective number of species
#' 
#' @param value vector of trait values
#' @param freq vector of frequencies or abundances
#' @param sigma Cutoff for considering two trait values to be identical
#' @param threshold minimum trait value to include in calculation
#' 
#' @return effective number of species
#' 
#' @export
#' 
#' @examples
#' quadratic_diversity(1:5, 1:5, 1)
quadratic_diversity <- function(value, freq, sigma, threshold = 0.1) {
  df <- data.frame(x = value, y = freq)
  
  df <- filter(df, x > threshold, y > 0)
  n <- length(df$y)
  if(n == 0) return(0)
  
  df$y <- df$y / sum(df$y)
  
  sum <- 0
  for(i in 1:n) for(j in 1:n) {
    d <- 1 - (abs(df$x[j] - df$x[i]) <= sigma)
    sum <- sum + d * df$y[i] * df$y[j]
  }
  return(1 / (1 - sum))
}

#' Plot a phylogenetic tree
#' 
#' @param edges dataframe comprising an adjacency matrix (and optionally including Population column)
#' @param output_dir folder in which to save the image file; if NA then plots are displayed on screen instead
#' @param output_filename name of output image file
#' @param fill fill colour of the nodes (excluding the root, which is always red)
#' @param display whether to display the tree
#' 
#' @return either an image file or a plot displyed on screen
#' 
#' @export
#' @import Rgraphviz
#' @import dplyr
#' @importFrom grDevices dev.off
#' @importFrom grDevices pdf
#' @importFrom graph ftM2adjM
#' 
#' @examples
#' edges1 <- data.frame(Parent = c(0,1,1,2,2,3,3), Identity = 1:7, Population = c(2,10,5,10,20,10,3))
#' plot_tree(edges1)
plot_tree <- function(edges, output_dir = NA, output_filename = NA, fill = "black", display = TRUE) {
  elist <- select(edges, Parent, Identity)
  elist <- filter(elist, Parent != Identity)
  elist <- as.data.frame(elist)
  
  M2 <- ftM2adjM(cbind(elist$Parent, elist$Identity), edgemode = "undirected")
  gg <- as(M2, "graphNEL")
  
  if(!"Population" %in% colnames(edges)) edges$Population <- 1
  
  edges$Population <- edges$Population / sum(edges$Population)
  node_sizes <- 10*(edges$Population)
  names(node_sizes) <- edges$Identity
  
  nAttrs <- list()
  attrs <- list()
  nAttrs$width <- node_sizes
  nAttrs$fillcolor <- c("0" = "red")
  nAttrs$fontcolor <- c("0" = "red")
  attrs$node$fixedsize <- FALSE
  if(fill == "black") attrs$node$fontsize <- 0
  attrs$node$fillcolor <- fill
  attrs$node$fontcolor <- "black"
  attrs$edge$arrowsize <- 0
  
  if(!is.na(output_dir)) pdf(paste0(output_dir, output_filename, ".pdf"), width = 5, height = 5)
  if(display) {
    Rgraphviz::plot(gg, nodeAttrs = nAttrs, attrs = attrs)
  }
  if(!is.na(output_dir)) dev.off()
}

#' Calculate the inverse Simpson index
#' 
#' @param pops vector of population sizes (or frequencies)
#' 
#' @return The inverse Simpson index (effective number of species)
#' 
#' @export
#' 
#' @examples
#' inv_Simpson_index(c(rep(1, 100)))
inv_Simpson_index <- function(pops) {
  pops <- pops / sum(pops)
  return(1 / sum(pops*pops))
}

#' Count the number of ancestors of a node in a phylogenetic tree
#' 
#' @param edges dataframe comprising an adjacency matrix
#' @param node node for which to calculate the metric
#' 
#' @return Number of ancestors
#' 
#' @export
#' @import ggmuller
#' 
#' @examples
#' edges1 <- data.frame(Parent = c(0,1,1,2,2,3,3), Identity = 1:7, Population = c(2,10,5,10,20,10,3))
#' count_drivers(edges1, 6)
count_drivers <- function(edges, node) {
  n <- 1
  now <- node
  while(now != "0") {
    now <- move_up(edges, now)
    n <- n + 1
    if(n > 20) stop(paste0("Can't find the origin of node ", node))
  }
  return(n)
}

#' Calculate average number of ancestors per node (n) and diversity (D) for a phylogenetic tree
#' 
#' @param data dataframe comprising an adjacency matrix, including Population
#' 
#' @return Vector containing average number of ancestors per node (n) and diversity (D)
#' 
#' @export
#' 
#' @examples
#' edges1 <- data.frame(Parent = c(0,1,1,2,2,3,3), Identity = 1:7, Population = c(2,10,5,10,20,10,3))
#' metrics(edges1)
metrics <- function(data) {
  D <- inv_Simpson_index(data$Population)
  
  data$Drivers <- sapply(data$Identity, count_drivers, edges = data)
  n <- sum(data$Drivers * data$Population / sum(data$Population))
  return(c(n = n, D = D))
}
robjohnnoble/demonanalysis documentation built on June 30, 2020, 12:47 a.m.