# Given some input sequences the functions in this script will produce
# simulated reads from those sequences
#' Generates a read error profile
#' @param technique What technique should be used to generate the error
#' profile?
#' @param params A list of parameters to pass to the specific profile generator
#' @export
gen_error_profile <- function(technique = 'uniform',
params = list(read_len = 1000,
A_err_rates = list('A' = 0.997, 'C' = 0.001,
'G' = 0.001, 'T' = 0.001),
C_err_rates = list('C' = 0.997, 'A' = 0.001,
'G' = 0.001, 'T' = 0.001),
G_err_rates = list('G' = 0.997, 'C' = 0.001,
'A' = 0.001, 'T' = 0.001),
T_err_rates = list('T' = 0.997, 'C' = 0.001,
'G' = 0.001, 'A' = 0.001))){
if (technique == 'uniform'){
err_prof <- do.call(gen_error_profile_uniform, params)
}
return(err_prof)
}
#' Generates a read error profile by assigning a constant probability of an
#' error at all positions
#' @param read_len The length of the profile
#' @param A_err_rates The probabilities that an A will be read as each of the
#' four nucleotides. Must sum to 1.
#' @param C_err_rates The probabilities that a C will be read as each of the
#' four nucleotides. Must sum to 1.
#' @param G_err_rates The probabilities that a G will be read as each of the
#' four nucleotides. Must sum to 1.
#' @param T_err_rates The probabilities that a T will be read as each of the
#' four nucleotides. Must sum to 1.
#' @export
gen_error_profile_uniform <- function(read_len, A_err_rates, C_err_rates,
G_err_rates, T_err_rates){
error_rates <- data.frame(from_let = 'A',
to_let = c('A', 'C', 'G', 'T'),
prob = c(A_err_rates[['A']],
A_err_rates[['C']],
A_err_rates[['G']],
A_err_rates[['T']]))
error_rates <- rbind(error_rates,
data.frame(from_let = 'C',
to_let = c('A', 'C', 'G', 'T'),
prob = c(C_err_rates[['A']],
C_err_rates[['C']],
C_err_rates[['G']],
C_err_rates[['T']])))
error_rates <- rbind(error_rates,
data.frame(from_let = 'G',
to_let = c('A', 'C', 'G', 'T'),
prob = c(G_err_rates[['A']],
G_err_rates[['C']],
G_err_rates[['G']],
G_err_rates[['T']])))
error_rates <- rbind(error_rates,
data.frame(from_let = 'T',
to_let = c('A', 'C', 'G', 'T'),
prob = c(T_err_rates[['A']],
T_err_rates[['C']],
T_err_rates[['G']],
T_err_rates[['T']])))
pos <- rep(1:read_len, each=16)
error_rates <- data.frame(pos = pos,
from_let = error_rates$from_let,
to_let = error_rates$to_let,
prob = error_rates$prob)
return(error_rates)
}
#' Generates a specified number of reads from a given sequence based on a given
#' error profile
#' @param ref_seq The reference sequence to generate the reads from
#' @param n_reads The number of reads to generate
#' @param error_rates The error profile as generated by gen_error_profile
#' @param name_prefix The prefix to give to the sequence names
#' @export
gen_reads <- function(ref_seq,
n_reads,
error_rates,
name_prefix = 'src'){
out_seq <- DNAStringSet(rep(as.character(ref_seq), n_reads))
for (i in 1:nchar(ref_seq)){
mut_probs <- runif(n_reads)
cur_letter <- letter(ref_seq, i)
mut_rates <- error_rates[error_rates$pos == i & error_rates$from_let == cur_letter,
c('to_let', 'prob')]
cs_mut_rates <- cumsum(mut_rates$prob)
cs_mut_rates <- c(-1, cs_mut_rates)
new_letters <- .bincode(mut_probs, cs_mut_rates)
new_letters <- DNAStringSet(mut_rates$to_let[new_letters])
rep_mat <- matrix(F, nrow = n_reads, ncol = nchar(ref_seq))
rep_mat[,i] <- T
out_seq <- replaceLetterAt(out_seq, rep_mat, new_letters)
}
names(out_seq) <- paste(name_prefix, 1:n_reads, sep = '_')
return(out_seq)
}
#' Generate contaminated reads
#'
#' Basically calls gen_reads twice and mixes the results together
#'
#' @param ref_seq The true sequence from which to generate the 'src' reads
#' @param n_reads The number of reads to generate
#' @param error_rates The error profile as generated by gen_error_profile
#' @param contam_seq The sequence to generate the contaminated reads from
#' @param n_contam The number of contamination reads to include
#' @param seed The seed for the randomizer
#' @export
gen_and_contaminate_reads <- function(ref_seq,
n_reads,
error_rates,
contam_seq,
n_contam,
seed = NULL,
...){
if (!is.null(seed)){
set.seed(seed)
}
src_seq <- gen_reads(ref_seq, n_reads, error_rates, name_prefix = 'src')
if (is.null(contam_seq) | n_contam == 0){
return(list(src = src_seq,
out = DNAStringSet(NULL),
true_consensus = ref_seq))
} else {
contam_seq <- gen_reads(contam_seq, n_contam, error_rates, name_prefix = 'out')
return(list(src = src_seq,
out = contam_seq,
true_consensus = ref_seq))
}
}
#' Simulates a random sequence of a given length
#' @param n The length of the desired sequence
#' @export
gen_seq <- function(n){
DNAStringSet(paste0(sample(c('A', 'C', 'G', 'T'), n, replace=T), collapse=""))
}
#' Removes ambiguous letters from a sequence by replacing them with a randomly
#' selected letter they represent
#' @param seq_dat The sequence in which to remove the ambiguity characters
#' @export
randomize_ambig <- function(seq_dat){
if (class(seq_dat) == 'DNAString'){
seq_dat <- DNAStringSet(seq_dat)
}
ambig_char <- IUPAC_CODE_MAP[!(names(IUPAC_CODE_MAP) %in% c('A', 'C', 'G', 'T'))]
for (j in 1:length(seq_dat)){
for (i in 1:nchar(seq_dat[j])){
curr_char <- substr(seq_dat[j], i, i)
if (!(curr_char %in% c('A', 'C', 'G', 'T'))){
n_options <- nchar(ambig_char[curr_char])
sample_indx <- sample(1:n_options, 1)
new_let <- substr(ambig_char[curr_char], sample_indx, sample_indx)
seq_dat[j] <- gsub(curr_char, new_let, seq_dat[j])
}
}
}
return(seq_dat)
}
#' Mutates a single base to another base
#' @param x The input letter
#' @export
mutate_base <- function(x){
stopifnot(length(x) == 1)
stopifnot(nchar(x) == 1)
alphabet <- c('A', 'C', 'G', 'T')
alphabet <- alphabet[!(alphabet %in% x)]
return(sample(alphabet, 1))
}
#' Adds n snps to each sequence in seq_dat
#' @param seq_dat The sequences in which to introduce the snps
#' @param n The number of snps to introduce in each sequence
#' @export
add_snps <- function(seq_dat, n){
stopifnot(all(width(seq_dat) >= n))
for (j in 1:length(seq_dat)){
curr_seq <- seq_dat[j]
snp_positions <- sample(1:nchar(curr_seq), n)
for (snp_pos in snp_positions){
snp_range <- IRanges(snp_pos, snp_pos)
old_let <- as.character(subseq(curr_seq, snp_pos, snp_pos))
new_let <- mutate_base(old_let)
curr_seq <- replaceAt(curr_seq, snp_range, new_let)
}
seq_dat[j] <- curr_seq
}
return(seq_dat)
}
#' Simulate a scenario that can be used to test PID detection
#'
#' @param seq_len The length of the random sequence to put before the prefix
#' @param prefix_len The length of the prefix
#' @param pid_len The length of the pid
#' @param suffix_len The length of the suffix
#' @param prefix_snps The number of snps to introduce to the prefix
#' @param suffix_snps The number of snps to introduce to the suffix
#' @param suffix_chop The number of bases to chop from the end of the suffix
#' @export
gen_pid_search_scenario <- function(seq_len, prefix_len, pid_len,
suffix_len, prefix_snps, suffix_snps,
suffix_chop){
read_dat <- gen_seq(seq_len)
pid <- gen_seq(pid_len)
if (is.numeric(prefix_len)){
prefix <- gen_seq(prefix_len)
} else {
prefix <- DNAStringSet(prefix_len)
}
seq_prefix <- add_snps(prefix, prefix_snps)
if (is.numeric(suffix_len)){
suffix <- gen_seq(suffix_len)
} else {
suffix <- DNAStringSet(suffix_len)
}
seq_suffix <- add_snps(suffix, suffix_snps)
seq_suffix <- substr(seq_suffix, 1, (nchar(seq_suffix)-suffix_chop))
seq_dat <- paste0(read_dat, seq_prefix, pid, seq_suffix)
return(list(seq_dat = seq_dat,
prefix = prefix,
pid = pid,
suffix = suffix))
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.