rev_string <- function(x){ y <- x for (i in 1:length(x)){ y[i] <- intToUtf8(rev(utf8ToInt(x[i]))) } return(y) } library(yasss) rl <- 300 # read length n_rna <- 100 pid_length <- 8 left_primer_prefix <- 'GAGGAGATATG' left_primer_suffix <- 'AGGGACAATTG' right_primer <- 'AGGGACAATTGGAGAAGTGAA' print(rev_string(right_primer))
This vignette demonstrates how YASSS can be used to simulate PID data.
Steps:
Given a piece of HXB2, just run the basic functionality of YASSS to get a quasispecies. (Manipulating the parameters a bit for illustrative purposes)
# env from 6300 to ~7600 # chopped some for primer mats # ty lanl as always HXB2 <- 'TGATCTGTAGTGCTACAGAAAAATTGTGGGTCACAGTCTATTATGGGGTACCTGTGTGGAAGGAAGCAACCACCACTCTATTTTGTGCATCAGATGCTAAAGCATATGATACAGAGGTACATAATGTTTGGGCCACACATGCCTGTGTACCCACAGACCCCAACCCACAAGAAGTAGTATTGGTAAATGTGACAGAAAATTTTAACATGTGGAAAAATGACATGGTAGAACAGATGCATGAGGATATAATCAGTTTATGGGATCAAAGCCTAAAGCCATGTGTAAAATTAACCCCACTCTGTGTTAGTTTAAAGTGCACTGATTTGAAGAATGATACTAATACCAATAGTAGTAGCGGGAGAATGATAATGGAGAAAGGAGAGATAAAAAACTGCTCTTTCAATATCAGCACAAGCATAAGAGGTAAGGTGCAGAAAGAATATGCATTTTTTTATAAACTTGATATAATACCAATAGATAATGATACTACCAGCTATAAGTTGACAAGTTGTAACACCTCAGTCATTACACAGGCCTGTCCAAAGGTATCCTTTGAGCCAATTCCCATACATTATTGTGCCCCGGCTGGTTTTGCGATTCTAAAATGTAATAATAAGACGTTCAATGGAACAGGACCATGTACAAATGTCAGCACAGTACAATGTACACATGGAATTAGGCCAGTAGTATCAACTCAACTGCTGTTAAATGGCAGTCTAGCAGAAGAAGAGGTAGTAATTAGATCTGTCAATTTCACGGACAATGCTAAAACCATAATAGTACAGCTGAACACATCTGTAGAAATTAATTGTACAAGACCCAACAACAATACAAGAAAAAGAATCCGTATCCAGAGAGGACCAGGGAGAGCATTTGTTACAATAGGAAAAATAGGAAATATGAGACAAGCACATTGTAACATTAGTAGAGCAAAATGGAATAACACTTTAAAACAGATAGCTAGCAAATTAAGAGAACAATTTGGAAATAATAAAACAATAATCTTTAAGCAATCCTCAGGAGGGGACCCAGAAATTGTAACGCACAGTTTTAATTGTGGAGGGGAATTTTTCTACTGTAATTCAACACAACTGTTTAATAGTACTTGGTTTAATAGTACTTGGAGTACTGAAGGGTCAAATAACACTGAAGGAAGTGACACAATCACCCTCCCATGCAGAATAAAACAAATTATAAACATGTGGCAGAAAGTAGGAAAAGCAATGTATGCCCCTCCCATCAGTGGACAAATTAGATGTTCATCAAATATTACAGGGCTGCTATTAACAAGAGATGGTGGTAATAGCAACAATGAGTCCGAGATCTTCAGACCTGGAG' #qs = short for quasispecies qs <- sim_pop(ancestors = substr(HXB2, 200, 700), r0 = n_rna, n_gen = 1, n_pop = Inf, mutator = list(fun = "mutator_uniform_fun", args = list(mu = 0.05)), fitness_evaluator = list(fun = "fitness_evaluator_uniform_fun", args = NULL))
pids <- apply(matrix(sample(c('A', 'C', 'G', 'T'), n_rna * pid_length, replace = TRUE), ncol = pid_length, nrow = n_rna), 1, paste0, collapse = '')
the_seq <- qs %>% filter(gen_num == 1) %>% select(the_seq) the_seq <- the_seq[,1,drop=TRUE] primed_seqs <- paste0(left_primer_prefix, pids, left_primer_suffix, the_seq, right_primer)
The RT step
rt_seqs <- sim_pop(ancestors = primed_seqs, r0 = 1, n_gen = 1, n_pop = Inf, mutator = list(fun = "mutator_uniform_fun", args = list(mu = 0.01)), fitness_evaluator = list(fun = "fitness_evaluator_uniform_fun", args = NULL)) rt_seqs <- rt_seqs %>% filter(gen_num == 1)
Couple of rounds of PCR
pcr_seqs <- sim_pop(ancestors = rt_seqs$the_seq, r0 = 5, n_gen = 3, n_pop = Inf, mutator = list(fun = "mutator_uniform_fun", args = list(mu = 0.001)), fitness_evaluator = list(fun = "fitness_evaluator_uniform_fun", args = NULL))
It is important to know for each sequence what was the rt sequence it came from. Thus a function is needed that can traverse up the genealogy to find the ancestor. However, this is likely to be excruciatingly slow due to data.frame indexing issues, so instead, make for each ancestor a list of all sequences that belong to it.
Resolve ancestry
Function to calculate mapping
make_ancestor_child_mapping <- function(genealogy){ full_ancestry <- genealogy %>% filter(gen_num == 1) %>% select(parent_id, id) names(full_ancestry) <- c('pcr_0', 'pcr_1') i <- 2 while (i <= max(genealogy$gen_num)){ curr_gen <- genealogy %>% filter(gen_num == i) %>% select(parent_id, id) full_ancestry <- merge(full_ancestry, curr_gen, by.x = paste0('pcr_', i-1), by.y = 'parent_id') names(full_ancestry)[names(full_ancestry) == 'id'] <- paste0('pcr_', i) i <- i + 1 } return(full_ancestry) }
Attach ancestor information to pcr seqs
full_ancestry <- make_ancestor_child_mapping(pcr_seqs) final_pcr_cycle <- max(pcr_seqs$gen_num) pcr_seqs <- merge(pcr_seqs, full_ancestry, by.y = paste0('pcr_', final_pcr_cycle), by.x = 'id')
Check basics of simulation
The first 5 input seqs - first 100 positions only
char <- 'A' all_dat <- NULL for (i in 1:5){ dat <- consensusMatrix_character((pcr_seqs %>% filter(pcr_0 == i & gen_num == final_pcr_cycle))$the_seq)[1:4,1:100] for (char in c('A', 'C', 'G', 'T')){ #inefficient but prevents tidyr dependency new_dat <- data.frame(id = i, pos = 1:ncol(dat), name = char, value = dat[char,]) all_dat <- rbind(all_dat, new_dat) } } print( ggplot(all_dat, aes(x = pos, fill = name, y = value)) + geom_bar(position='stack', stat = 'identity') + facet_wrap('id', ncol = 1) )
The whole dataset - first 100 positions only
all_dat <- NULL dat <- consensusMatrix_character((pcr_seqs %>% filter(gen_num == final_pcr_cycle))$the_seq)[1:4,1:100] for (char in c('A', 'C', 'G', 'T')){ #inefficient but prevents tidyr dependency new_dat <- data.frame(id = i, pos = 1:ncol(dat), name = char, value = dat[char,]) all_dat <- rbind(all_dat, new_dat) } print( ggplot(all_dat, aes(x = pos, fill = name, y = value)) + geom_bar(position='stack', stat = 'identity') + facet_wrap('id', ncol = 1) )
bin_size_distribution <- data.frame( pcr_0 = 1:n_rna, draw = ceiling(rexp(n_rna, 1/1e1))) pcr_seqs <- pcr_seqs %>% filter(gen_num == final_pcr_cycle) %>% group_by(pcr_0) %>% mutate(sampling_id = sample(1:n(), n())) pcr_seqs <- merge(pcr_seqs, bin_size_distribution, by.x = 'pcr_0', by.y = 'pcr_0') pcr_seqs <- pcr_seqs %>% filter(sampling_id <= draw)
#sanity check x <- pcr_seqs %>% group_by(pcr_0) %>% summarize(min_draw = min(draw), max_draw = max(draw))
Then chop the sequences so that 300 from left goes to left read and 300 from right goes to right read.
rev_string <- function(x){ y <- x for (i in 1:length(x)){ y[i] <- intToUtf8(rev(utf8ToInt(x[i]))) } return(y) } left_seqs <- pcr_seqs[, c('pcr_0', 'id', 'the_seq')] left_seqs$the_seq <- substr(left_seqs$the_seq, 1, rl) right_seqs <- pcr_seqs[, c('pcr_0', 'id', 'the_seq')] right_seqs$the_seq <- rev_string(substr(right_seqs$the_seq, nchar(right_seqs$the_seq) - rl + 1, nchar(right_seqs$the_seq)))
To make FastQ files - for now just perfect qualities
# Really poor implementation. Fix it later left_seqs$qual <- '' for (i in 1:nrow(left_seqs)){ left_seqs$qual <- paste(sample(c('E', 'F', 'G'), nchar(left_seqs$the_seq[i]), replace = TRUE), collapse = '') } right_seqs$qual <- '' for (i in 1:nrow(right_seqs)){ right_seqs$qual <- paste(rep('G', nchar(right_seqs$the_seq[i])), collapse = '') }
Requires the shortreads package. That is a very major dependency - probablly better to write a minimal fastq writer myself.
https://help.basespace.illumina.com/articles/descriptive/fastq-files/
Also basicQC.R in MotifBinner2
Build ShortReadQ objects
library(ShortRead) left_seqs_f <- ShortReadQ( sread = DNAStringSet(left_seqs$the_seq), quality = BStringSet(left_seqs$qual), id = BStringSet(paste('seq', left_seqs$pcr_0, left_seqs$id, sep = '_')) ) right_seqs_f <- ShortReadQ( sread = DNAStringSet(right_seqs$the_seq), quality = BStringSet(right_seqs$qual), id = BStringSet(paste('seq', right_seqs$pcr_0, right_seqs$id, sep = '_')) )
Write to file
writeFastq(left_seqs_f, '/tmp/left.fastq') writeFastq(right_seqs_f, '/tmp/right.fastq')
MotifBinner2 command
Currently fails on step 16 : extractPIDs
Error in $<-.data.frame
(*tmp*
, "total_pid_gaps", value = 0) :
replacement has 1 row, data has 0
Calls: applyOperation -> op -> $<- -> $<-.data.frame
{sh, eval = FALSE}
MotifBinner2.R --non_overlapping --fwd_file=/tmp/left.fastq --fwd_primer_seq=GAGGAGATATGNNNNNNNNAGGGACAATTG --fwd_primer_lens=11,8,11 --fwd_primer_min_score=25 --rev_file=/tmp/right.fastq --rev_primer_seq=AGGGACAATTGGAGAAGTGAA --rev_primer_lens=21 --rev_primer_min_score=21 --fwd_pid_in_which_fragment=2 --rev_pid_in_which_fragment=NULL --output_dir=/tmp/mb2_test/ --base_for_names=mb2_test --ncpu=2 --merged_read_length=25
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.