knitr::opts_chunk$set(echo = TRUE) library(dplyr) library(odbc) library(bulletsamplr) library(bulletxtrctr) library(ggplot2) library(stringr) library(tidyr) library(dbplyr) library(randomForest)
con <- dbConnect(odbc::odbc(), "bullets", timeout = 10) N <- 300 nshow <- 6
One way around the matching set problem is to divide the barrels up into two groups, and compare sequences assembled from each group.
barrels <- tbl(con, "bullet.slice") %>% collect() %>% mutate(barrel = str_replace_all(id, ".*_(\\d{2})-.*", "\\1")) evens <- filter(barrels, barrel %in% c("02", "04", "06", "08", "10")) odds <- filter(barrels, barrel %in% c("01", "03", "05", "07", "09")) set.seed(2090490723) evenseq <- purrr::map_df(1:N, ~crosscut_assemble(2000, evens), .id = "seq") %>% mutate(seqtype = 'even') oddseq <- purrr::map_df(1:N, ~crosscut_assemble(2000, odds), .id = "seq") %>% mutate(seqtype = 'odd')
Plotting the first r nshow
sequences from each set,
first5 <- bind_rows( filter(evenseq, seq %in% as.character(1:nshow)), filter(oddseq, seq %in% as.character(1:nshow)) ) first5 %>% ggplot(aes(x = x, y = sig, color = seqtype)) + geom_line() + facet_wrap(~seq)
We can now align each sequence:
aligned <- purrr::map2(tidyr::nest(evenseq, -seqtype, -seq)$data, tidyr::nest(oddseq, -seqtype, -seq)$data, ~ sig_align(.x$sig, .y$sig)) aligned_df <- data_frame( ccf = sapply(aligned, function(x) x$ccf), lag = sapply(aligned, function(x) x$lag), df = purrr::map(aligned, ~.$lands) ) first5 <- aligned_df[1:nshow,] %>% mutate(i = 1:n()) %>% unnest() %>% gather(key = seqtype, value = sig, sig1:sig2) first5 %>% ggplot(aes(x = x, y = sig, color = seqtype)) + geom_line() + facet_wrap(~i)
We can also examine the maximum ccf of each aligned sequence:
aligned_df %>% ggplot(aes(x = ccf)) + geom_density() + geom_rug()
We can see that the ccf of each aligned sequence varies between .1 and .6, with the majority of sequences having a ccf of .3 - .4.
Using the aligned sequences, we can calculate striae alignment and features:
striae <- purrr::map(aligned, sig_cms_max, span = 75) striae_df <- data_frame( maxCMS = sapply(striae, function(x) x$maxCMS), ccf = sapply(striae, function(x) x$ccf), lag = sapply(striae, function(x) x$lag), lines = purrr::map(striae, ~.$lines), lands = purrr::map(striae, ~.$lands) ) features <- purrr::map_df(striae, extract_features_all_legacy, resolution = 0.625) features$rfscore <- predict(newdata = features, object = rtrees, type = 'prob')[,2]
The null distribution of random forest score is:
ggplot(data = features, aes(x = rfscore)) + geom_histogram(color = "black") + scale_x_continuous("Random Forest Match Score") + ylab("Number of Sequences") + ggtitle("Null Distribution of Random Forest Match Score")
ggplot(data = features, aes(x = ccf)) + geom_histogram(color = "black") + scale_x_continuous("Cross-Correlation") + ylab("Number of Sequences") + ggtitle("Null Distribution of Cross Correlation")
ggplot(data = features, aes(x = rough_cor)) + geom_histogram(color = "black") + scale_x_continuous("Rough Correlation") + ylab("Number of Sequences") + ggtitle("Null Distribution of Rough Correlation")
ggplot(data = features, aes(x = D)) + geom_histogram(color = "black") + scale_x_continuous("Sequence Distance") + ylab("Number of Sequences") + ggtitle("Null Distribution of Sequence Distance")
ggplot(data = features, aes(x = sd_D)) + geom_histogram(color = "black") + scale_x_continuous("Standard Deviation of Sequence Distance") + ylab("Number of Sequences") + ggtitle("Null Distribution of Standard Deviation of Sequence Distance")
ggplot(data = features, aes(x = matches)) + geom_histogram(color = "black", binwidth = 1) + scale_x_continuous("Matches", breaks = c(0:5)*2) + ylab("Number of Sequences") + ggtitle("Null Distribution of Matches")
ggplot(data = features, aes(x = mismatches)) + geom_histogram(color = "black", binwidth = 1) + scale_x_continuous("Mismatches", breaks = c(0:10)*2) + ylab("Number of Sequences") + ggtitle("Null Distribution of Mismatches")
features %>% ggplot(aes(x = cms)) + geom_histogram(color = 'black', binwidth = 1) + scale_x_continuous("Maximum number of Consecutive Matching Striae", breaks = c(0:5) * 2) + ylab("Number of Sequences") + ggtitle("Null Distribution of Maximum Consecutive Matching Striae")
features %>% ggplot(aes(x = non_cms)) + geom_histogram(color = 'black', binwidth = 1) + scale_x_continuous("Nonconsecutive Matching Striae", breaks = c(0:10) * 2) + ylab("Number of Sequences") + ggtitle("Null Distribution of Nonconsecutive Matching Striae")
features %>% ggplot(aes(x = sum_peaks)) + geom_histogram(color = 'black') + scale_x_continuous("Sum of Aligned Striae") + ylab("Number of Sequences") + ggtitle("Null Distribution of Sum of Aligned Striae")
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.