In philliplab/toolmania: Tool Mania

working_dir <- '/home/phillipl/projects/toolmania/code/toolmania'
setwd(working_dir)
library(toolmania)
input_aa_file <- '/home/phillipl/projects/toolmania/data/CAP177prot/all_timesv3_prot.fasta'
input_dna_file <- '/home/phillipl/projects/toolmania/data/CAP177/all_timesv3.fasta'
output_dir <- '/home/phillipl/projects/toolmania/results/pentropy'
opts_chunk$set(echo=FALSE)

library(knitr)
library(devtools)
working_dir <- '/home/phillipl/projects/toolmania/code/toolmania'
setwd(working_dir)
document()
build()
install()
load_all()
knit2html(file.path(working_dir, 'inst/haplotyper.Rmd'))
browseURL(paste('file://', file.path(getwd(),'haplotyper.html'), sep=''))

#' Counts the number of sequences by the number of times they occur
#'
#' If you have 7 sequences of length 1: 'A', 'A', 'C', 'G', 'G', 'T', 'T', 'T',
#' then this function will tell you that:
#' \itemize{
#'   \item{1 sequence occurs 1 time}
#'   \item{2 sequences occur 2 times}
#'   \item{1 sequence occur 3 times}
#' }
#' @param seq_data The input data
#' @export
count_the_counts_of_the_sequences <- function(seq_data){
  table_table <- table(table(seq_data))
  table_table <- data.frame(number_of_copies = as.numeric(attr(table_table,
                                                        'dimnames')[[1]]),
                                number_of_sequences = as.numeric(table_table))
  return(table_table)
}

#' Plot the counts of the counts as produced by
#' \code{\link{count_the_counts_of_the_sequences}}
#' @param table_table The counts of counts
#' @export
plot_the_counts_of_the_counts <- function(table_table){
  p <- ggplot(table_table, aes(x = number_of_sequences,
                              y = number_of_copies)) +
      geom_point() +
      scale_x_log10() +
      scale_y_log10()
  return(p)
}

#' Constructs a reversed log base 10 axis transformation
#'
#' http://stackoverflow.com/questions/11053899/how-to-get-a-reversed-log10-scale-in-ggplot2
#' 
#' Consider putting this in scales and ggplot?
#'
#' @param base The base to use for the log (defaults to exp(1))
#' @export

reverselog_trans <- function(base = exp(1)){
  trans <- function(x) -log(x, base)
  inv <- function(x) base^(-x)
  trans_new(paste0("reverselog-", format(base)), trans, inv, 
            log_breaks(base = base), 
            domain = c(1e-100, Inf))
}

#' Constructs ecdf of counts of the counts of the sequences
#' @param seq_data The input data
#' @export
ecdf_of_counts_of_the_counts <- function(seq_data){
  tt <- count_the_counts_of_the_sequences(seq_data)
  x <- tt[,1]*tt[,2]
  ecdf_of_cc <- data.frame(y = cumsum(rev(x)),
                           x = rev(tt$number_of_copies))
  return(ecdf_of_cc)
}

#' Plots the ecdf produced by \code{\link{ecdf_of_counts_of_the_counts}}
#' @param ecdf_of_cc The empirical cumulative distribution function of the
#' counts of the counts.
#' @export
plot_ecdf_of_counts_of_the_counts <- function(ecdf_of_cc){
  p <- ggplot(ecdf_of_cc, aes(x = x, y = y)) + 
  geom_line() +
  scale_x_continuous(trans = reverselog_trans(base = 10)) +
  scale_y_continuous(limits=c(0, max(ecdf_of_cc$y)))
  return(p)
}

Input Data

seq_dna_data <- readDNAStringSet(input_dna_file)
seq_aa_data <- readAAStringSet(input_aa_file)
seq_data <- seq_aa_data
print(seq_data)

Investigate number of unique sequences

print('Unique DNA sequences:')
print(length(unique(seq_dna_data)))

print('Unique AA sequences:')
print(length(unique(seq_aa_data)))

tt <- count_the_counts_of_the_sequences(seq_dna_data)
p <- plot_the_counts_of_the_counts(tt)
print(p + ggtitle('DNA'))
tt <- count_the_counts_of_the_sequences(seq_aa_data)
p <- plot_the_counts_of_the_counts(tt)
print(p + ggtitle('AA'))

Intepretation of these plots: The x-axis refers to the unique sequences in terms of their AA or nucleotide composition. The y-axis indicates the number of identical copies there are of exactly the same sequence. The dot on the far right shows the number of sequences of which there is only 1 copy. The vertical column on the far left shows all the sequences of which there is only 1 sequence that occur that many times. For example, there is only once sequence of which there is 24356 unique copies. (fictional number)

What percentage of data is covered by sequences with at least x number of copies?

ecdf_of_cc <- ecdf_of_counts_of_the_counts(seq_dna_data)
p <- plot_ecdf_of_counts_of_the_counts(ecdf_of_cc)
print(p + ggtitle('DNA'))
ecdf_of_cc <- ecdf_of_counts_of_the_counts(seq_aa_data)
p <- plot_ecdf_of_counts_of_the_counts(ecdf_of_cc)
print(p + ggtitle('AA'))

Unique sequences at a single time point

seq_aa_2000 <- get_data_of('timepoint', seq_aa_data, "_2000V3_")
tt <- count_the_counts_of_the_sequences(seq_aa_2000)
p <- plot_the_counts_of_the_counts(tt)
print(p + ggtitle('AA @ time 2000'))
ecdf_of_cc <- ecdf_of_counts_of_the_counts(seq_aa_2000)
p <- plot_ecdf_of_counts_of_the_counts(ecdf_of_cc)
print(p + ggtitle('AA @ time 2000'))

seq_aa_4250 <- get_data_of('timepoint', seq_aa_data, "_4250V3_")
tt <- count_the_counts_of_the_sequences(seq_aa_4250)
p <- plot_the_counts_of_the_counts(tt)
print(p + ggtitle('AA @ time 4250'))
ecdf_of_cc <- ecdf_of_counts_of_the_counts(seq_aa_4250)
p <- plot_ecdf_of_counts_of_the_counts(ecdf_of_cc)
print(p + ggtitle('AA @ time 4250'))

Phylogenetic Tree

Plotting a phylogenetic tree of all the sequences does not work since it is too much data. However, it might be useful for finding outliers?

seq_uniq <- unique(seq_aa_2000)
seq_dists <- stringDist(seq_uniq)
big_plot <- bionj(seq_dists)
plot(big_plot, show.tip.label = FALSE, main = 'AA 2000')

seq_uniq <- unique(seq_aa_4250)
seq_dists <- stringDist(seq_uniq)
big_plot <- bionj(seq_dists)
plot(big_plot, show.tip.label = FALSE, main = 'AA 4250')

We can make very very large versions of these plots to make it viewable (after zooming in a lot)

These plots are way too busy. We can use clustering to select a couple of most representative sequence and then plot them.

No idea if what I am doing is technically correct, will need to check the clustering approach with someone more knowledgable.

Use pam to create 20 clusters. Select out the medoids. Construct a tree using bionj.

seq_uniq <- unique(seq_aa_2000)
seq_dists <- stringDist(seq_uniq)
clustered_seqs <- pam(seq_dists, 20)
medoids <- seq_uniq[names(seq_uniq) %in% clustered_seqs$medoids]

seq_dists <- stringDist(medoids)
big_plot <- bionj(seq_dists)
plot(big_plot, show.tip.label = FALSE, main = 'AA 2000')

seq_uniq <- unique(seq_aa_4250)
seq_dists <- stringDist(seq_uniq)
clustered_seqs <- pam(seq_dists, 20)
medoids <- seq_uniq[names(seq_uniq) %in% clustered_seqs$medoids]

seq_dists <- stringDist(medoids)
big_plot <- bionj(seq_dists)
plot(big_plot, show.tip.label = FALSE, main = 'AA 4250')