working_dir <- '/home/phillipl/projects/toolmania/code/toolmania' setwd(working_dir) library(toolmania) input_aa_file <- '/home/phillipl/projects/toolmania/data/CAP177prot/all_timesv3_prot.fasta' input_dna_file <- '/home/phillipl/projects/toolmania/data/CAP177/all_timesv3.fasta' output_dir <- '/home/phillipl/projects/toolmania/results/pentropy' opts_chunk$set(echo=FALSE)
library(knitr) library(devtools) working_dir <- '/home/phillipl/projects/toolmania/code/toolmania' setwd(working_dir) document() build() install() load_all() knit2html(file.path(working_dir, 'inst/haplotyper.Rmd')) browseURL(paste('file://', file.path(getwd(),'haplotyper.html'), sep=''))
#' Counts the number of sequences by the number of times they occur #' #' If you have 7 sequences of length 1: 'A', 'A', 'C', 'G', 'G', 'T', 'T', 'T', #' then this function will tell you that: #' \itemize{ #' \item{1 sequence occurs 1 time} #' \item{2 sequences occur 2 times} #' \item{1 sequence occur 3 times} #' } #' @param seq_data The input data #' @export count_the_counts_of_the_sequences <- function(seq_data){ table_table <- table(table(seq_data)) table_table <- data.frame(number_of_copies = as.numeric(attr(table_table, 'dimnames')[[1]]), number_of_sequences = as.numeric(table_table)) return(table_table) } #' Plot the counts of the counts as produced by #' \code{\link{count_the_counts_of_the_sequences}} #' @param table_table The counts of counts #' @export plot_the_counts_of_the_counts <- function(table_table){ p <- ggplot(table_table, aes(x = number_of_sequences, y = number_of_copies)) + geom_point() + scale_x_log10() + scale_y_log10() return(p) } #' Constructs a reversed log base 10 axis transformation #' #' http://stackoverflow.com/questions/11053899/how-to-get-a-reversed-log10-scale-in-ggplot2 #' #' Consider putting this in scales and ggplot? #' #' @param base The base to use for the log (defaults to exp(1)) #' @export reverselog_trans <- function(base = exp(1)){ trans <- function(x) -log(x, base) inv <- function(x) base^(-x) trans_new(paste0("reverselog-", format(base)), trans, inv, log_breaks(base = base), domain = c(1e-100, Inf)) } #' Constructs ecdf of counts of the counts of the sequences #' @param seq_data The input data #' @export ecdf_of_counts_of_the_counts <- function(seq_data){ tt <- count_the_counts_of_the_sequences(seq_data) x <- tt[,1]*tt[,2] ecdf_of_cc <- data.frame(y = cumsum(rev(x)), x = rev(tt$number_of_copies)) return(ecdf_of_cc) } #' Plots the ecdf produced by \code{\link{ecdf_of_counts_of_the_counts}} #' @param ecdf_of_cc The empirical cumulative distribution function of the #' counts of the counts. #' @export plot_ecdf_of_counts_of_the_counts <- function(ecdf_of_cc){ p <- ggplot(ecdf_of_cc, aes(x = x, y = y)) + geom_line() + scale_x_continuous(trans = reverselog_trans(base = 10)) + scale_y_continuous(limits=c(0, max(ecdf_of_cc$y))) return(p) }
seq_dna_data <- readDNAStringSet(input_dna_file) seq_aa_data <- readAAStringSet(input_aa_file) seq_data <- seq_aa_data print(seq_data)
print('Unique DNA sequences:') print(length(unique(seq_dna_data)))
print('Unique AA sequences:') print(length(unique(seq_aa_data)))
tt <- count_the_counts_of_the_sequences(seq_dna_data) p <- plot_the_counts_of_the_counts(tt) print(p + ggtitle('DNA')) tt <- count_the_counts_of_the_sequences(seq_aa_data) p <- plot_the_counts_of_the_counts(tt) print(p + ggtitle('AA'))
Intepretation of these plots: The x-axis refers to the unique sequences in terms of their AA or nucleotide composition. The y-axis indicates the number of identical copies there are of exactly the same sequence. The dot on the far right shows the number of sequences of which there is only 1 copy. The vertical column on the far left shows all the sequences of which there is only 1 sequence that occur that many times. For example, there is only once sequence of which there is 24356 unique copies. (fictional number)
ecdf_of_cc <- ecdf_of_counts_of_the_counts(seq_dna_data) p <- plot_ecdf_of_counts_of_the_counts(ecdf_of_cc) print(p + ggtitle('DNA')) ecdf_of_cc <- ecdf_of_counts_of_the_counts(seq_aa_data) p <- plot_ecdf_of_counts_of_the_counts(ecdf_of_cc) print(p + ggtitle('AA'))
seq_aa_2000 <- get_data_of('timepoint', seq_aa_data, "_2000V3_") tt <- count_the_counts_of_the_sequences(seq_aa_2000) p <- plot_the_counts_of_the_counts(tt) print(p + ggtitle('AA @ time 2000')) ecdf_of_cc <- ecdf_of_counts_of_the_counts(seq_aa_2000) p <- plot_ecdf_of_counts_of_the_counts(ecdf_of_cc) print(p + ggtitle('AA @ time 2000')) seq_aa_4250 <- get_data_of('timepoint', seq_aa_data, "_4250V3_") tt <- count_the_counts_of_the_sequences(seq_aa_4250) p <- plot_the_counts_of_the_counts(tt) print(p + ggtitle('AA @ time 4250')) ecdf_of_cc <- ecdf_of_counts_of_the_counts(seq_aa_4250) p <- plot_ecdf_of_counts_of_the_counts(ecdf_of_cc) print(p + ggtitle('AA @ time 4250'))
Plotting a phylogenetic tree of all the sequences does not work since it is too much data. However, it might be useful for finding outliers?
seq_uniq <- unique(seq_aa_2000) seq_dists <- stringDist(seq_uniq) big_plot <- bionj(seq_dists) plot(big_plot, show.tip.label = FALSE, main = 'AA 2000')
seq_uniq <- unique(seq_aa_4250) seq_dists <- stringDist(seq_uniq) big_plot <- bionj(seq_dists) plot(big_plot, show.tip.label = FALSE, main = 'AA 4250')
We can make very very large versions of these plots to make it viewable (after zooming in a lot)
These plots are way too busy. We can use clustering to select a couple of most representative sequence and then plot them.
No idea if what I am doing is technically correct, will need to check the clustering approach with someone more knowledgable.
Use pam to create 20 clusters. Select out the medoids. Construct a tree using bionj.
seq_uniq <- unique(seq_aa_2000) seq_dists <- stringDist(seq_uniq) clustered_seqs <- pam(seq_dists, 20) medoids <- seq_uniq[names(seq_uniq) %in% clustered_seqs$medoids] seq_dists <- stringDist(medoids) big_plot <- bionj(seq_dists) plot(big_plot, show.tip.label = FALSE, main = 'AA 2000')
seq_uniq <- unique(seq_aa_4250) seq_dists <- stringDist(seq_uniq) clustered_seqs <- pam(seq_dists, 20) medoids <- seq_uniq[names(seq_uniq) %in% clustered_seqs$medoids] seq_dists <- stringDist(medoids) big_plot <- bionj(seq_dists) plot(big_plot, show.tip.label = FALSE, main = 'AA 4250')
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.