opts_chunk$set(echo=FALSE)
opts_chunk$set(warning=FALSE)
opts_chunk$set(fig.pos = 'H')

library(MotifBinner2)

if (!exists('result'))
{
  result <- all_results[[grep('processBadPIDs', names(all_results))[1]]]
}
if (class(result) != 'processBadPIDs')
{
  result <- all_results[[grep('processBadPIDs', names(all_results))[1]]]
}
figure_path <- paste('figure_', result$config$op_full_name, '/', sep = '')
fig.cap0 <- "Figure: Number of bins by bin size. The y axis counts the number of bins of the size given on the x-axis. Note the log scale on the y-axis. The vertical red line indicates the consensus cutoff. Note that a few additional QC checks were performed since the previous version of this figure (in the extactPID steps), so slight deviations are expected."
fig.cap1 <- "Figure: Distance from big bins by size of smaller bin. Parent bin sizes were chosen such that each interval contains 25% of all large bins."
fig.cap2 <- "Figure: Probability of small bins resulting from sequencing errors in the PIDs of larger bins. Note that the probability computation is based on a proportion of the size of the small bin (30% by default). Thus the probability only changes if the size of the bin increased enough for that proportion to increase by one. Hence bins of size 1, 2, and 3 are on the same probability curve; likewise bins of size 4, 5 and 6 and so forth."

r result$config$op_full_name

Divides all bins into one of two categories: Small bins and Big bins based on the consensus cutoff concept described in Zhou et al 2015. A computation is performed to determine the likelihood that more than a certain percentage of the contents of a small bin is the result of sequencing errors in a bigger bin. If it is more likely that the contents resulted from sequencing errors in a larger bin's PID than it is for the bin to naturally exist (i.e. it's PID to have been sampled from the pool of available PIDs) then the bin is discarded. If it is unlikely that the prespecified percentage of the bin is the result of sequencing error in the PIDs of a larger bin, then the bin is kept provided that it is larger than a certain minimum size.

The consensus cutoff computed for this dataset is r result$metrics$consensus_cutoff. For the purposes of flagging bins as potential offspring bins, this number was doubles, since the subsequent calculations to flag a bin as offspring is more accurate. Bins smaller than double the consensus cutoff are referred to as small bins while bins larger than double the consensus cutoff are referred to as large bins.

NOTE!! If your largest bin is very large, the scaling in the graphs below may not be appropriate - contact the package maintainer for further information.

result <- all_results[[4]]
result$metrics$consensus_cutoff
names(result)

names(result$metrics$bin_metrics)
names(result$metrics)
logplot <- data.frame(bin_size = as.numeric(names(table(result$metrics$bin_metrics$raw_size))),
                      num_bins = as.numeric(table(result$metrics$bin_metrics$raw_size)))
largest_bin <- max(logplot$num_bins)
p0 <- ggplot(logplot, aes(x = bin_size, y = num_bins)) + 
  geom_point() + 
  scale_y_log10() + 
  labs(x = 'Bin Size',
       y = 'Number of Bins') +
  geom_vline(xintercept = result$metrics$consensus_cutoff, col = 'red') +
  geom_text(data = data.frame(x=1), 
            aes(x = result$metrics$consensus_cutoff, y = largest_bin,
            hjust = 0, vjust = 1, 
            label = round(result$metrics$consensus_cutoff,0)),
            col = 'red')

small_bins <- subset(result$metrics$bin_metrics, !big_enough)
small_bins$off_spring_prob[small_bins$off_spring_prob == 0] <- 0.0000000001
small_bins[small_bins$dist_to_big >= 4,'dist_to_big'] <- '>3'
small_bins$raw_size_trunc <- small_bins$raw_size
small_bins[small_bins$raw_size_trunc >= 4,'raw_size_trunc'] <- '>3'

big_bins <- subset(result$metrics$bin_metrics, big_enough)
big_bin_sizes <- big_bins$raw_size
big_bin_size_breaks <- sort(unique(quantile(big_bin_sizes, c(0, 0.25, 0.5, 0.75, 1))))
big_bin_labs <- NULL
for (i in 1:(length(big_bin_size_breaks)-1)){
  big_bin_labs <- c(big_bin_labs, 
    paste("(", round(big_bin_size_breaks[i],0), " to ", 
          round(big_bin_size_breaks[i+1],0), "]", 
          sep = ""))
}
small_bins$cat_parent_size <- cut(small_bins$parent_size, 
  breaks = big_bin_size_breaks,
  labels = big_bin_labs)

big_bins$cat_raw_size <- cut(big_bins$raw_size, 
  breaks = big_bin_size_breaks,
  labels = big_bin_labs)

small_bin_size_labs <- 
data.frame(value = c('1', '2', '3', '>3'),
           raw_size_lab = c('Small Bin Size = 1',
                        'Small Bin Size = 2',
                        'Small Bin Size = 3',
                        'Small Bin Size > 3'),
           stringsAsFactors = FALSE)
small_bins <- merge(small_bins, small_bin_size_labs, all.x = T, 
                    by.x = 'raw_size_trunc', by.y = 'value')
small_bins$dist_to_big_lab <- paste('Distance between PIDs = ', small_bins$dist_to_big, sep = '')

small_bins_trans <-
  small_bins %>%
  group_by(raw_size_lab, dist_to_big, cat_parent_size) %>%
  summarize(count = n())
small_bins_trans <- data.frame(small_bins_trans)

p1 <-
ggplot(small_bins_trans, aes(y = count, x=dist_to_big, fill = cat_parent_size)) +
  geom_bar(stat='identity') +
  facet_wrap(~ raw_size_lab, scales = 'free_y') +
  ylab('Number of Small Bins') +
  xlab("Distance to Nearest Big Bin") +
  scale_fill_discrete(guide = guide_legend('Parent\nBin Size')) +
  theme(legend.position = "bottom")

p2 <- 
ggplot(subset(small_bins, dist_to_big <= 2), 
       aes(y = off_spring_prob, x = parent_size, col = raw_size)) +
  geom_point() +
  scale_y_log10() +
  facet_wrap( ~ dist_to_big_lab) +
  geom_hline(yintercept = result$metrics$offspring_prob_cutoff) +
  ylab("Probability of being offspring") +
  xlab("Size of closest parent bin") +
  scale_color_gradient("Size of\nSmall\nBin", low = '#52A9ED', high = '#153048')
print(p0)
print(p1)

As the size of the small bins increases, the probability of the same sequencing error in the PID of a large bin yielding that exact same offspring pid decreases. That means that in the previous graph, the distance to the nearest big bin will decrease as the size of the offspring bin increases. Also, since the sampling space of potential PIDs is so large, it is unlikely to sample two PIDs that are within 1 or 2 distance from each other. This means that as the small bins get larger, the likelihood that they are the result of a truly sampled PID, which just amplified poorly increases. Hence it is expected that as the small bin size increases, then proportion of bins of that size that are more than 2 distance from the nearest big bin will increase. Obviously this interpretation is affected by the length of the PID, the number of PIDs sampled and overall sampling depth. Hence if the previous graph does not follow this interpretation, those things must be looked at first.

Note also, that as the size of the small bins increases, then the size of the bins that are close to there bins should increase. This is logical because larger parent bins result in larger offspring bins. This interacts with the distibution of the size of potential parent (a.k.a big) bins. The breaks used to divide the big bins into groups were chosen so that each interval contains 25% of the bins.

print(p2)
kable_summary(result$summary)

timingTable(result)
cat('\n\n---\n\n')


HIVDiversity/MotifBinner2 documentation built on May 6, 2019, 6:44 p.m.