rev_string <- function(x){
  y <- x
  for (i in 1:length(x)){
    y[i] <- intToUtf8(rev(utf8ToInt(x[i])))
  }
  return(y)
}

library(yasss)
rl <- 300 # read length
n_rna <- 100
pid_length <- 8
left_primer_prefix <- 'GAGGAGATATG'
left_primer_suffix <- 'AGGGACAATTG'
right_primer <- 'AGGGACAATTGGAGAAGTGAA'
print(rev_string(right_primer))

This vignette demonstrates how YASSS can be used to simulate PID data.

Steps:

Simulate quasispecies

Given a piece of HXB2, just run the basic functionality of YASSS to get a quasispecies. (Manipulating the parameters a bit for illustrative purposes)

# env from 6300 to ~7600 # chopped some for primer mats
# ty lanl as always
HXB2 <- 'TGATCTGTAGTGCTACAGAAAAATTGTGGGTCACAGTCTATTATGGGGTACCTGTGTGGAAGGAAGCAACCACCACTCTATTTTGTGCATCAGATGCTAAAGCATATGATACAGAGGTACATAATGTTTGGGCCACACATGCCTGTGTACCCACAGACCCCAACCCACAAGAAGTAGTATTGGTAAATGTGACAGAAAATTTTAACATGTGGAAAAATGACATGGTAGAACAGATGCATGAGGATATAATCAGTTTATGGGATCAAAGCCTAAAGCCATGTGTAAAATTAACCCCACTCTGTGTTAGTTTAAAGTGCACTGATTTGAAGAATGATACTAATACCAATAGTAGTAGCGGGAGAATGATAATGGAGAAAGGAGAGATAAAAAACTGCTCTTTCAATATCAGCACAAGCATAAGAGGTAAGGTGCAGAAAGAATATGCATTTTTTTATAAACTTGATATAATACCAATAGATAATGATACTACCAGCTATAAGTTGACAAGTTGTAACACCTCAGTCATTACACAGGCCTGTCCAAAGGTATCCTTTGAGCCAATTCCCATACATTATTGTGCCCCGGCTGGTTTTGCGATTCTAAAATGTAATAATAAGACGTTCAATGGAACAGGACCATGTACAAATGTCAGCACAGTACAATGTACACATGGAATTAGGCCAGTAGTATCAACTCAACTGCTGTTAAATGGCAGTCTAGCAGAAGAAGAGGTAGTAATTAGATCTGTCAATTTCACGGACAATGCTAAAACCATAATAGTACAGCTGAACACATCTGTAGAAATTAATTGTACAAGACCCAACAACAATACAAGAAAAAGAATCCGTATCCAGAGAGGACCAGGGAGAGCATTTGTTACAATAGGAAAAATAGGAAATATGAGACAAGCACATTGTAACATTAGTAGAGCAAAATGGAATAACACTTTAAAACAGATAGCTAGCAAATTAAGAGAACAATTTGGAAATAATAAAACAATAATCTTTAAGCAATCCTCAGGAGGGGACCCAGAAATTGTAACGCACAGTTTTAATTGTGGAGGGGAATTTTTCTACTGTAATTCAACACAACTGTTTAATAGTACTTGGTTTAATAGTACTTGGAGTACTGAAGGGTCAAATAACACTGAAGGAAGTGACACAATCACCCTCCCATGCAGAATAAAACAAATTATAAACATGTGGCAGAAAGTAGGAAAAGCAATGTATGCCCCTCCCATCAGTGGACAAATTAGATGTTCATCAAATATTACAGGGCTGCTATTAACAAGAGATGGTGGTAATAGCAACAATGAGTCCGAGATCTTCAGACCTGGAG'

#qs = short for quasispecies
qs <- sim_pop(ancestors = substr(HXB2, 200, 700),
              r0 = n_rna,
              n_gen = 1,
              n_pop = Inf,
              mutator = list(fun = "mutator_uniform_fun",
                             args = list(mu = 0.05)),
              fitness_evaluator = list(fun = "fitness_evaluator_uniform_fun",
                                       args = NULL))

Generate PIDs

pids <-
apply(matrix(sample(c('A', 'C', 'G', 'T'), 
                    n_rna * pid_length, 
                    replace = TRUE),
             ncol = pid_length,
             nrow = n_rna),
      1,
      paste0, 
      collapse = '')

Attach PIDs

the_seq <- qs %>% 
  filter(gen_num == 1) %>% 
  select(the_seq)

the_seq <- the_seq[,1,drop=TRUE]

primed_seqs <- paste0(left_primer_prefix, pids, left_primer_suffix, the_seq, right_primer)

Simulate PCR

The RT step

rt_seqs <- sim_pop(ancestors = primed_seqs,
                   r0 = 1,
                   n_gen = 1,
                   n_pop = Inf,
                   mutator = list(fun = "mutator_uniform_fun",
                                  args = list(mu = 0.01)),
                   fitness_evaluator = list(fun = "fitness_evaluator_uniform_fun",
                                            args = NULL))

rt_seqs <- rt_seqs %>% filter(gen_num == 1)

Couple of rounds of PCR

pcr_seqs <- sim_pop(ancestors = rt_seqs$the_seq,
                    r0 = 5,
                    n_gen = 3,
                    n_pop = Inf,
                    mutator = list(fun = "mutator_uniform_fun",
                                   args = list(mu = 0.001)),
                    fitness_evaluator = list(fun = "fitness_evaluator_uniform_fun",
                                             args = NULL))

It is important to know for each sequence what was the rt sequence it came from. Thus a function is needed that can traverse up the genealogy to find the ancestor. However, this is likely to be excruciatingly slow due to data.frame indexing issues, so instead, make for each ancestor a list of all sequences that belong to it.

Resolve ancestry

Function to calculate mapping

make_ancestor_child_mapping <- function(genealogy){
  full_ancestry <- genealogy %>% filter(gen_num == 1) %>% select(parent_id, id)  
  names(full_ancestry) <- c('pcr_0', 'pcr_1')

  i <- 2
  while (i <= max(genealogy$gen_num)){
    curr_gen <- genealogy %>% filter(gen_num == i) %>% select(parent_id, id)
    full_ancestry <- merge(full_ancestry, 
                           curr_gen, 
                           by.x = paste0('pcr_', i-1), 
                           by.y = 'parent_id')
    names(full_ancestry)[names(full_ancestry) == 'id'] <- paste0('pcr_', i)
    i <- i + 1
  }
  return(full_ancestry)
}

Attach ancestor information to pcr seqs

full_ancestry <- make_ancestor_child_mapping(pcr_seqs)
final_pcr_cycle <- max(pcr_seqs$gen_num)
pcr_seqs <- 
  merge(pcr_seqs, full_ancestry,
        by.y = paste0('pcr_', final_pcr_cycle),
        by.x = 'id')

Check basics of simulation

The first 5 input seqs - first 100 positions only

char <- 'A'
all_dat <- NULL
for (i in 1:5){
  dat <- consensusMatrix_character((pcr_seqs %>% filter(pcr_0 == i & gen_num == final_pcr_cycle))$the_seq)[1:4,1:100]
  for (char in c('A', 'C', 'G', 'T')){  #inefficient but prevents tidyr dependency
    new_dat <- data.frame(id = i,
                          pos = 1:ncol(dat),
                          name = char,
                          value = dat[char,])
    all_dat <- rbind(all_dat, new_dat)
  }
}
print(
ggplot(all_dat, aes(x = pos, fill = name, y = value)) +
  geom_bar(position='stack', stat = 'identity') +
  facet_wrap('id', ncol = 1)
)

The whole dataset - first 100 positions only

all_dat <- NULL
dat <- consensusMatrix_character((pcr_seqs %>% filter(gen_num == final_pcr_cycle))$the_seq)[1:4,1:100]

for (char in c('A', 'C', 'G', 'T')){  #inefficient but prevents tidyr dependency
  new_dat <- data.frame(id = i,
                        pos = 1:ncol(dat),
                        name = char,
                        value = dat[char,])
  all_dat <- rbind(all_dat, new_dat)
}

print(
ggplot(all_dat, aes(x = pos, fill = name, y = value)) +
  geom_bar(position='stack', stat = 'identity') +
  facet_wrap('id', ncol = 1)
)

TODO recombination here

sample from pcr product

bin_size_distribution <- data.frame(
  pcr_0 = 1:n_rna,
  draw = ceiling(rexp(n_rna, 1/1e1)))
pcr_seqs <-
pcr_seqs %>%
  filter(gen_num == final_pcr_cycle) %>%
  group_by(pcr_0) %>%
  mutate(sampling_id = sample(1:n(), n()))
pcr_seqs <- merge(pcr_seqs, bin_size_distribution,
                  by.x = 'pcr_0',
                  by.y = 'pcr_0')

pcr_seqs <- pcr_seqs %>% filter(sampling_id <= draw)
#sanity check
x <- pcr_seqs %>% group_by(pcr_0) %>% summarize(min_draw = min(draw), max_draw = max(draw))

Split into left and right reads

Then chop the sequences so that 300 from left goes to left read and 300 from right goes to right read.

rev_string <- function(x){
  y <- x
  for (i in 1:length(x)){
    y[i] <- intToUtf8(rev(utf8ToInt(x[i])))
  }
  return(y)
}

left_seqs <- pcr_seqs[, c('pcr_0', 'id', 'the_seq')]
left_seqs$the_seq <- substr(left_seqs$the_seq, 1, rl)

right_seqs <- pcr_seqs[, c('pcr_0', 'id', 'the_seq')]
right_seqs$the_seq <- rev_string(substr(right_seqs$the_seq, nchar(right_seqs$the_seq) - rl + 1, nchar(right_seqs$the_seq)))

Add quality scores

To make FastQ files - for now just perfect qualities

# Really poor implementation. Fix it later
left_seqs$qual <- ''
for (i in 1:nrow(left_seqs)){
  left_seqs$qual <- paste(sample(c('E', 'F', 'G'), nchar(left_seqs$the_seq[i]), replace = TRUE), collapse = '')
}
right_seqs$qual <- ''
for (i in 1:nrow(right_seqs)){
  right_seqs$qual <- paste(rep('G', nchar(right_seqs$the_seq[i])), collapse = '')
}

Write to Fastq

Requires the shortreads package. That is a very major dependency - probablly better to write a minimal fastq writer myself.

TODO - add in the content into the headers that makes the nice MotifBinner2 plots

https://help.basespace.illumina.com/articles/descriptive/fastq-files/

Also basicQC.R in MotifBinner2

Build ShortReadQ objects

library(ShortRead)
left_seqs_f <-
ShortReadQ(
  sread = DNAStringSet(left_seqs$the_seq),
  quality = BStringSet(left_seqs$qual),
  id = BStringSet(paste('seq', left_seqs$pcr_0, left_seqs$id, sep = '_'))
)

right_seqs_f <-
ShortReadQ(
  sread = DNAStringSet(right_seqs$the_seq),
  quality = BStringSet(right_seqs$qual),
  id = BStringSet(paste('seq', right_seqs$pcr_0, right_seqs$id, sep = '_'))
)

Write to file

writeFastq(left_seqs_f, '/tmp/left.fastq')
writeFastq(right_seqs_f, '/tmp/right.fastq')

MotifBinner2 command Currently fails on step 16 : extractPIDs Error in $<-.data.frame(*tmp*, "total_pid_gaps", value = 0) : replacement has 1 row, data has 0 Calls: applyOperation -> op -> $<- -> $<-.data.frame

{sh, eval = FALSE} MotifBinner2.R --non_overlapping --fwd_file=/tmp/left.fastq --fwd_primer_seq=GAGGAGATATGNNNNNNNNAGGGACAATTG --fwd_primer_lens=11,8,11 --fwd_primer_min_score=25 --rev_file=/tmp/right.fastq --rev_primer_seq=AGGGACAATTGGAGAAGTGAA --rev_primer_lens=21 --rev_primer_min_score=21 --fwd_pid_in_which_fragment=2 --rev_pid_in_which_fragment=NULL --output_dir=/tmp/mb2_test/ --base_for_names=mb2_test --ncpu=2 --merged_read_length=25



philliplab/yasss documentation built on Sept. 7, 2020, 3:28 p.m.