library(knitr)
library(Biostrings)
# Apply a large penalty to scientific notation so numbers in the report are
# printed in fixed notation.
options(scipen=999)
# Extract the 'bbn_dat' element from report_dat. report_dat is not defined in
# this file, so it must already exist in the environment the report runs in.
bbn_dat <- report_dat$bbn_dat
# Put the elements of bbn_dat on the search path so they can be referenced by
# bare name. Presumably this is where bin_seqs (used below) comes from --
# TODO confirm.
# NOTE(review): attach() is generally discouraged because it hides where names
# come from and can mask other objects; consider explicit bbn_dat$... access.
attach(bbn_dat)

The unprocessed bins produced

# Print the current system time into the report output.
Sys.time()

The number of bins

# The number of bins is the number of elements in bin_seqs.
print(length(bin_seqs))

The sizes of the bins

# Size of every bin: the length of each element of bin_seqs.
# vapply() guarantees an integer vector (integer(0) for an empty bin_seqs),
# whereas unlist(lapply(...)) would silently return NULL in that case.
bin_sizes <- vapply(bin_seqs, length, integer(1))
hist(bin_sizes)

# Tabulate the sizes once and reuse the result for both columns: the table
# names are the distinct bin sizes, the table values are how many bins have
# that size.
bin_size_counts <- table(bin_sizes)
kable(data.frame(bin_size = as.numeric(names(bin_size_counts)),
                 num_bins = as.numeric(bin_size_counts)))

How many sequences are duplicated?

# Count the distinct elements of x.
#
# x: any object that unique() and length() can handle.
# Returns the length of unique(x).
uniq_len <- function(x) {
  distinct_items <- unique(x)
  length(distinct_items)
}
# Number of distinct sequences in every bin.
# vapply() enforces one integer per bin and yields integer(0) instead of NULL
# when bin_seqs is empty, unlike unlist(lapply(...)).
unique_bin_sizes <- vapply(bin_seqs, uniq_len, integer(1))

# Scatter plot of unique versus total sequences per bin. The y-axis is scaled
# to the largest total bin size so both axes cover the same range.
plot(unique_bin_sizes ~ bin_sizes,
     ylim = c(0, max(bin_sizes)),
     ylab = 'Number unique sequences',
     xlab = 'Total number of sequences')
# Horizontal reference line at the largest number of unique sequences seen in
# any bin.
abline(h = max(unique_bin_sizes))

Total number of sequences (x-axis) vs Number of unique sequences (y-axis)

# Cross-tabulate the bins: rows are unique-sequence counts and columns are
# total bin sizes, each cut into 5 intervals by cut().
kable(table(cut(unique_bin_sizes, 5), cut(bin_sizes, 5)))

Scatter plot and log-log plot of the bin sizes

# Frequency of each distinct bin size: bin_size is the size, num_bins is the
# number of bins having that size. The tabulation is done inside local() so
# the intermediate table does not leak into the workspace.
loglog <- local({
  size_counts <- table(bin_sizes)
  data.frame(
    bin_size = as.numeric(names(size_counts)),
    num_bins = as.numeric(size_counts)
  )
})

# Same relationship on the raw scale and on the log10-log10 scale.
plot(num_bins ~ bin_size, data = loglog)
plot(log10(num_bins) ~ log10(bin_size), data = loglog)

Categories and count frequencies

# Group the bin sizes into fixed categories; hist() is used only to count the
# bins falling between consecutive breaks, nothing is drawn here.
cats <- hist(bin_sizes,
             breaks = c(0, 1, 2, 3, 4, 10, 20, 400, 1000000),
             plot = FALSE)
counts <- cats$counts

# Label every category as '(lower-upper]', except the last one, which is
# labelled '<lower>+' since it is the open-ended top category.
names(counts) <- local({
  lower_edges <- head(cats$breaks, -1)
  upper_edges <- tail(cats$breaks, -1)
  category_labels <- paste0('(', lower_edges, '-', upper_edges, ']')
  category_labels[length(category_labels)] <-
    paste0(lower_edges[length(lower_edges)], '+')
  category_labels
})

barplot(counts, main = 'Histogram of Bin Sizes')
# Explicit integer row names keep the category labels from being used as the
# row names of the printed table.
kable(data.frame(category = names(counts),
                 count = counts,
                 row.names = seq_along(counts)))


philliplab/MotifBinner documentation built on Sept. 2, 2020, 11:41 a.m.