library(knitr) library(Biostrings) options(scipen=999) pb_dat <- report_dat$pb_dat attach(pb_dat)
x <- pb_dat x$pb_out <- NULL x$seqs <- NULL print(x)
metrics <- NULL i <- 1 for (i in seq_along(pb_out)){ # print(i) input_size <- length(pb_out[[i]]$src) + length(pb_out[[i]]$out) output_size <- length(pb_out[[i]]$src) out <- pb_out[[i]]$out src <- pb_out[[i]]$src dmat <- pb_out[[i]]$dmat min_dist <- min(as.dist(dmat)) in_max_dist <- max(pb_out[[i]]$dmat) out_dmat <- dmat[match(names(src), colnames(dmat)), match(names(src), colnames(dmat))] out_max_dist <- max(out_dmat) min_out_dist <- min(as.dist(out_dmat)) if (is.null(pb_out[[i]]$alignment)){ gaps <- NA pos_no_mismatch <- NA pos_mismatch <- NA total_mismatches <- NA non_acgt <- NA non_acgt1 <- NA non_acgt2 <- NA non_acgt3 <- NA non_acgt4 <- NA non_acgt5 <- NA align_length <- NA } else { conMat <- consensusMatrix(pb_out[[i]]$alignment) gaps <- sum(conMat[-(1:4),]) mismatches <- apply(conMat, 2, sum) - apply(conMat, 2, max) pos_no_mismatch <- sum(mismatches==0) pos_mismatch <- sum(mismatches!=0) total_mismatches <- sum(mismatches) non_acgt <- sum(consensusMatrix(DNAStringSet(pb_out[[i]]$consensus[[1]]))[-(1:4),]) non_acgt1 <- sum(consensusMatrix(DNAStringSet(substr(pb_out[[i]]$consensus[[1]], 2, 10000)))[-(1:4),]) non_acgt2 <- sum(consensusMatrix(DNAStringSet(substr(pb_out[[i]]$consensus[[1]], 3, 10000)))[-(1:4),]) non_acgt3 <- sum(consensusMatrix(DNAStringSet(substr(pb_out[[i]]$consensus[[1]], 4, 10000)))[-(1:4),]) non_acgt4 <- sum(consensusMatrix(DNAStringSet(substr(pb_out[[i]]$consensus[[1]], 5, 10000)))[-(1:4),]) non_acgt5 <- sum(consensusMatrix(DNAStringSet(substr(pb_out[[i]]$consensus[[1]], 6, 10000)))[-(1:4),]) if (is.na(non_acgt1)) {non_acgt1 <- 0} if (is.na(non_acgt2)) {non_acgt2 <- 0} if (is.na(non_acgt3)) {non_acgt3 <- 0} if (is.na(non_acgt4)) {non_acgt4 <- 0} if (is.na(non_acgt5)) {non_acgt5 <- 0} align_length <- max(width(pb_out[[i]]$alignment)) } metrics <- rbind(metrics, data.frame(input_size = input_size, output_size = output_size, min_dist = min_dist, in_max_dist = in_max_dist, out_max_dist = out_max_dist, min_out_dist = min_out_dist, gaps = gaps, pos_no_mismatch = pos_no_mismatch, pos_mismatch = pos_mismatch, total_mismatches = total_mismatches, non_acgt = non_acgt, non_acgt1 = non_acgt1, non_acgt2 = non_acgt2, non_acgt3 = non_acgt3, non_acgt4 = non_acgt4, non_acgt5 = non_acgt5, align_length = align_length)) } metrics$input_size_bins <- cut(metrics$input_size, breaks = c(0,1,2,3,10,50,100000)) metrics$output_size_bins <- cut(metrics$output_size, breaks = c(0,1,2,3,10,50,100000))
par(mfrow=c(2,2)) hist(metrics$input_size) hist(metrics$output_size) plot(metrics$output_size ~ metrics$input_size) abline(a=0, b=1) par(mfrow=c(1,1))
small_bins <- subset(metrics, output_size < 6) plot(jitter(small_bins$output_size) ~ jitter(small_bins$input_size)) abline(a=0, b=1) small_bins <- subset(small_bins, input_size < 10) plot(jitter(small_bins$output_size) ~ jitter(small_bins$input_size)) abline(a=0, b=1)
par(mfrow=c(2,2)) hist(metrics$in_max_dist) hist(metrics$out_max_dist) hist(metrics$min_out_dist) par(mfrow=c(1,1))
norm_gap <- (100*metrics$gaps)/(metrics$output_size * metrics$align_length) hist(norm_gap, main = "Histogram of the number of gaps per\n100 characters in the alignment", xlab = "Gaps per 100 characters", ylab = "Number of alignments") kable(data.frame(num_gaps = as.numeric(names(table(round(norm_gap,1)))), num_alignments = as.numeric(table(round(norm_gap,1)))))
par(mfrow=c(1,2)) hist(metrics$pos_no_mismatch/metrics$align_length, main = "Histogram of percentage of\npositions with no mismatches", xlab = "Percentage of Positions", ylab = "Number of Alignments") hist(metrics$pos_mismatch/metrics$align_length, main = "Histogram of percentage of\npositions with mismatches", xlab = "Percentage of Positions", ylab = "Number of Alignments") par(mfrow=c(1,1))
hist(metrics$total_mismatches/(metrics$output_size * metrics$align_length), main = "Percentage of alignment that\nconflicts with consensus sequence", xlab = "Percentage of conflicting characters", ylab = "Number of Alignments")
barplot(table(metrics$non_acgt)) kable(data.frame(num_non_acgt = as.numeric(names(table(metrics$non_acgt))), num_con_seq = as.numeric(table(metrics$non_acgt))))
barplot(table(metrics$non_acgt1)) kable(data.frame(num_non_acgt = as.numeric(names(table(metrics$non_acgt1))), num_con_seq = as.numeric(table(metrics$non_acgt1))))
barplot(table(metrics$non_acgt2)) kable(data.frame(num_non_acgt = as.numeric(names(table(metrics$non_acgt2))), num_con_seq = as.numeric(table(metrics$non_acgt2))))
barplot(table(metrics$non_acgt3)) kable(data.frame(num_non_acgt = as.numeric(names(table(metrics$non_acgt3))), num_con_seq = as.numeric(table(metrics$non_acgt3))))
barplot(table(metrics$non_acgt4)) kable(data.frame(num_non_acgt = as.numeric(names(table(metrics$non_acgt4))), num_con_seq = as.numeric(table(metrics$non_acgt4))))
barplot(table(metrics$non_acgt5)) kable(data.frame(num_non_acgt = as.numeric(names(table(metrics$non_acgt5))), num_con_seq = as.numeric(table(metrics$non_acgt5))))
What kinds of bins yield consensus sequences with non-acgt characters?
By starting bin size: (Number of non-acgt on the y-axis and bin size on x-axis)
kable( with(metrics, table(non_acgt, input_size_bins)))
By bin size after outliers have been removed: (Number of non-acgt on the y-axis and bin size on x-axis)
kable(with(metrics, table(non_acgt, output_size_bins)))
let_count <- apply(consensusMatrix(pb_dat$consensuses), 1, sum) let_count <- let_count[let_count != 0] kable(data.frame(letter = names(let_count), count = as.numeric(let_count))) non_acgt_counts <- let_count[!names(let_count) %in% c('A','C','G','T')] barplot(non_acgt_counts)
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.