Basic report on the processed bins

library(knitr)
library(Biostrings)
options(scipen=999)
pb_dat <- report_dat$pb_dat
attach(pb_dat)

Arguments

x <- pb_dat
x$pb_out <- NULL
x$seqs <- NULL
print(x)

Compute metrics

metrics <- NULL
i <- 1
for (i in seq_along(pb_out)){
#  print(i)
  input_size <- length(pb_out[[i]]$src) + length(pb_out[[i]]$out)
  output_size <- length(pb_out[[i]]$src)
  out <- pb_out[[i]]$out
  src <- pb_out[[i]]$src
  dmat <- pb_out[[i]]$dmat
  min_dist <- min(as.dist(dmat))
  in_max_dist <- max(pb_out[[i]]$dmat)
  out_dmat <- dmat[match(names(src), colnames(dmat)), 
                   match(names(src), colnames(dmat))]
  out_max_dist <- max(out_dmat)
  min_out_dist <- min(as.dist(out_dmat))
  if (is.null(pb_out[[i]]$alignment)){
    gaps <- NA
    pos_no_mismatch <- NA
    pos_mismatch <- NA
    total_mismatches <- NA
    non_acgt <- NA
    non_acgt1 <- NA
    non_acgt2 <- NA
    non_acgt3 <- NA
    non_acgt4 <- NA
    non_acgt5 <- NA
    align_length <- NA
  } else {
    conMat <- consensusMatrix(pb_out[[i]]$alignment)
    gaps <- sum(conMat[-(1:4),])
    mismatches <- apply(conMat, 2, sum) - apply(conMat, 2, max)
    pos_no_mismatch <- sum(mismatches==0)
    pos_mismatch <- sum(mismatches!=0)
    total_mismatches <- sum(mismatches)
    non_acgt <- sum(consensusMatrix(DNAStringSet(pb_out[[i]]$consensus[[1]]))[-(1:4),])
    non_acgt1 <- sum(consensusMatrix(DNAStringSet(substr(pb_out[[i]]$consensus[[1]], 2, 10000)))[-(1:4),])
    non_acgt2 <- sum(consensusMatrix(DNAStringSet(substr(pb_out[[i]]$consensus[[1]], 3, 10000)))[-(1:4),])
    non_acgt3 <- sum(consensusMatrix(DNAStringSet(substr(pb_out[[i]]$consensus[[1]], 4, 10000)))[-(1:4),])
    non_acgt4 <- sum(consensusMatrix(DNAStringSet(substr(pb_out[[i]]$consensus[[1]], 5, 10000)))[-(1:4),])
    non_acgt5 <- sum(consensusMatrix(DNAStringSet(substr(pb_out[[i]]$consensus[[1]], 6, 10000)))[-(1:4),])
    if (is.na(non_acgt1)) {non_acgt1 <- 0}
    if (is.na(non_acgt2)) {non_acgt2 <- 0}
    if (is.na(non_acgt3)) {non_acgt3 <- 0}
    if (is.na(non_acgt4)) {non_acgt4 <- 0}
    if (is.na(non_acgt5)) {non_acgt5 <- 0}
    align_length <- max(width(pb_out[[i]]$alignment))
  }
  metrics <- rbind(metrics,
    data.frame(input_size = input_size,
               output_size = output_size,
               min_dist = min_dist,
               in_max_dist = in_max_dist,
               out_max_dist = out_max_dist,
               min_out_dist = min_out_dist,
               gaps = gaps,
               pos_no_mismatch = pos_no_mismatch,
               pos_mismatch = pos_mismatch,
               total_mismatches = total_mismatches,
               non_acgt = non_acgt,
               non_acgt1 = non_acgt1,
               non_acgt2 = non_acgt2,
               non_acgt3 = non_acgt3,
               non_acgt4 = non_acgt4,
               non_acgt5 = non_acgt5,
               align_length = align_length))
}
metrics$input_size_bins <- cut(metrics$input_size, breaks =
                               c(0,1,2,3,10,50,100000))
metrics$output_size_bins <- cut(metrics$output_size, breaks =
                               c(0,1,2,3,10,50,100000))

Effects of outlier removal

Before and after bin sizes

par(mfrow=c(2,2))
hist(metrics$input_size)
hist(metrics$output_size)
plot(metrics$output_size ~ metrics$input_size)
abline(a=0, b=1)
par(mfrow=c(1,1))
Where are the small bins coming from?
small_bins <- subset(metrics, output_size < 6)
plot(jitter(small_bins$output_size) ~ jitter(small_bins$input_size))
abline(a=0, b=1)
small_bins <- subset(small_bins, input_size < 10)
plot(jitter(small_bins$output_size) ~ jitter(small_bins$input_size))
abline(a=0, b=1)

Before and after bin distances

par(mfrow=c(2,2))
hist(metrics$in_max_dist)
hist(metrics$out_max_dist)
hist(metrics$min_out_dist)
par(mfrow=c(1,1))

Alignment step

Number of gaps inserted

norm_gap <- (100*metrics$gaps)/(metrics$output_size * metrics$align_length)
hist(norm_gap,
     main = "Histogram of the number of gaps per\n100 characters in the alignment",
     xlab = "Gaps per 100 characters",
     ylab = "Number of alignments")
kable(data.frame(num_gaps = as.numeric(names(table(round(norm_gap,1)))),
                 num_alignments = as.numeric(table(round(norm_gap,1)))))

Number positions with mismatches in the alignment

par(mfrow=c(1,2))
hist(metrics$pos_no_mismatch/metrics$align_length,
     main = "Histogram of percentage of\npositions with no mismatches",
     xlab = "Percentage of Positions",
     ylab = "Number of Alignments")
hist(metrics$pos_mismatch/metrics$align_length,
     main = "Histogram of percentage of\npositions with mismatches",
     xlab = "Percentage of Positions",
     ylab = "Number of Alignments")
par(mfrow=c(1,1))

Total number of mismatches

hist(metrics$total_mismatches/(metrics$output_size * metrics$align_length),
     main = "Percentage of alignment that\nconflicts with consensus sequence",
     xlab = "Percentage of conflicting characters",
     ylab = "Number of Alignments")

NON-ACGT characters in consensus

barplot(table(metrics$non_acgt))
kable(data.frame(num_non_acgt = as.numeric(names(table(metrics$non_acgt))),
                 num_con_seq = as.numeric(table(metrics$non_acgt))))

NON-ACGT characters in consensus chop 1 base left

barplot(table(metrics$non_acgt1))
kable(data.frame(num_non_acgt = as.numeric(names(table(metrics$non_acgt1))),
                 num_con_seq = as.numeric(table(metrics$non_acgt1))))

NON-ACGT characters in consensus chop 2 bases left

barplot(table(metrics$non_acgt2))
kable(data.frame(num_non_acgt = as.numeric(names(table(metrics$non_acgt2))),
                 num_con_seq = as.numeric(table(metrics$non_acgt2))))

NON-ACGT characters in consensus chop 3 bases left

barplot(table(metrics$non_acgt3))
kable(data.frame(num_non_acgt = as.numeric(names(table(metrics$non_acgt3))),
                 num_con_seq = as.numeric(table(metrics$non_acgt3))))

NON-ACGT characters in consensus chop 4 bases left

barplot(table(metrics$non_acgt4))
kable(data.frame(num_non_acgt = as.numeric(names(table(metrics$non_acgt4))),
                 num_con_seq = as.numeric(table(metrics$non_acgt4))))

NON-ACGT characters in consensus chop 5 bases left

barplot(table(metrics$non_acgt5))
kable(data.frame(num_non_acgt = as.numeric(names(table(metrics$non_acgt5))),
                 num_con_seq = as.numeric(table(metrics$non_acgt5))))

What kinds of bins yield consensus sequences with non-acgt characters?

By starting bin size: (Number of non-acgt on the y-axis and bin size on x-axis)

kable( with(metrics, table(non_acgt, input_size_bins)))

By bin size after outliers have been removed: (Number of non-acgt on the y-axis and bin size on x-axis)

kable(with(metrics, table(non_acgt, output_size_bins)))

Letters in the final consensus sequences

let_count <- apply(consensusMatrix(pb_dat$consensuses), 1, sum)
let_count <- let_count[let_count != 0]
kable(data.frame(letter = names(let_count),
                 count = as.numeric(let_count)))
non_acgt_counts <- let_count[!names(let_count) %in% c('A','C','G','T')]
barplot(non_acgt_counts)


philliplab/MotifBinner documentation built on Sept. 2, 2020, 11:41 a.m.