R/sim_read.R

Defines functions gen_pid_search_scenario add_snps mutate_base randomize_ambig gen_seq gen_and_contaminate_reads gen_reads gen_error_profile_uniform gen_error_profile

Documented in add_snps gen_and_contaminate_reads gen_error_profile gen_error_profile_uniform gen_pid_search_scenario gen_reads gen_seq mutate_base randomize_ambig

# Given some input sequences the functions in this script will produce
# simulated reads from those sequences

#' Generates a read error profile
#' @param technique What technique should be used to generate the error
#' profile?
#' @param params A list of parameters to pass to the specific profile generator
#' @export

gen_error_profile <- function(technique = 'uniform', 
                              params = list(read_len = 1000,
                                            A_err_rates = list('A' = 0.997, 'C' = 0.001,
                                                               'G' = 0.001, 'T' = 0.001),
                                            C_err_rates = list('C' = 0.997, 'A' = 0.001,
                                                               'G' = 0.001, 'T' = 0.001),
                                            G_err_rates = list('G' = 0.997, 'C' = 0.001,
                                                               'A' = 0.001, 'T' = 0.001),
                                            T_err_rates = list('T' = 0.997, 'C' = 0.001,
                                                               'G' = 0.001, 'A' = 0.001))){
  if (technique == 'uniform'){
    err_prof <- do.call(gen_error_profile_uniform, params)
  }
  return(err_prof)
}

#' Generates a read error profile by assigning a constant probability of an
#' error at all positions
#' @param read_len The length of the profile
#' @param A_err_rates The probabilities that an A will be read as each of the
#' four nucleotides. Must sum to 1.
#' @param C_err_rates The probabilities that a C will be read as each of the
#' four nucleotides. Must sum to 1.
#' @param G_err_rates The probabilities that a G will be read as each of the
#' four nucleotides. Must sum to 1.
#' @param T_err_rates The probabilities that a T will be read as each of the
#' four nucleotides. Must sum to 1.
#' @export

gen_error_profile_uniform <- function(read_len, A_err_rates, C_err_rates, 
                                      G_err_rates, T_err_rates){
  error_rates <- data.frame(from_let = 'A',
                            to_let = c('A', 'C', 'G', 'T'),
                            prob = c(A_err_rates[['A']],
                                     A_err_rates[['C']],
                                     A_err_rates[['G']],
                                     A_err_rates[['T']]))
  error_rates <- rbind(error_rates,
                       data.frame(from_let = 'C',
                                  to_let = c('A', 'C', 'G', 'T'),
                                  prob = c(C_err_rates[['A']],
                                           C_err_rates[['C']],
                                           C_err_rates[['G']],
                                           C_err_rates[['T']])))
  error_rates <- rbind(error_rates,
                       data.frame(from_let = 'G',
                                  to_let = c('A', 'C', 'G', 'T'),
                                  prob = c(G_err_rates[['A']],
                                           G_err_rates[['C']],
                                           G_err_rates[['G']],
                                           G_err_rates[['T']])))
  error_rates <- rbind(error_rates,
                       data.frame(from_let = 'T',
                                  to_let = c('A', 'C', 'G', 'T'),
                                  prob = c(T_err_rates[['A']],
                                           T_err_rates[['C']],
                                           T_err_rates[['G']],
                                           T_err_rates[['T']])))
  pos <- rep(1:read_len, each=16)
  error_rates <- data.frame(pos = pos,
                            from_let = error_rates$from_let,
                            to_let = error_rates$to_let,
                            prob = error_rates$prob)
  return(error_rates)
}

#' Generates a specified number of reads from a given sequence based on a given
#' error profile
#' @param ref_seq The reference sequence to generate the reads from
#' @param n_reads The number of reads to generate
#' @param error_rates The error profile as generated by gen_error_profile
#' @param name_prefix The prefix to give to the sequence names
#' @export

gen_reads <- function(ref_seq, 
                      n_reads,
                      error_rates,
                      name_prefix = 'src'){
  out_seq <- DNAStringSet(rep(as.character(ref_seq), n_reads))
  for (i in 1:nchar(ref_seq)){
    mut_probs <- runif(n_reads)
    cur_letter <- letter(ref_seq, i)
    mut_rates <- error_rates[error_rates$pos == i & error_rates$from_let == cur_letter,
                             c('to_let', 'prob')]
    cs_mut_rates <- cumsum(mut_rates$prob)
    cs_mut_rates <- c(-1, cs_mut_rates)
    new_letters <- .bincode(mut_probs, cs_mut_rates)
    new_letters <- DNAStringSet(mut_rates$to_let[new_letters])
    rep_mat <- matrix(F, nrow = n_reads, ncol = nchar(ref_seq))
    rep_mat[,i] <- T
    out_seq <- replaceLetterAt(out_seq, rep_mat, new_letters)
  }
  names(out_seq) <- paste(name_prefix, 1:n_reads, sep = '_')
  return(out_seq)
}

#' Generate contaminated reads
#'
#' Basically calls gen_reads twice and mixes the results together
#'
#' @param ref_seq The true sequence from which to generate the 'src' reads
#' @param n_reads The number of reads to generate
#' @param error_rates The error profile as generated by gen_error_profile
#' @param contam_seq The sequence to generate the contaminated reads from
#' @param n_contam The number of contamination reads to include
#' @param seed The seed for the randomizer
#' @export

gen_and_contaminate_reads <- function(ref_seq, 
                                      n_reads,
                                      error_rates,
                                      contam_seq,
                                      n_contam,
                                      seed = NULL,
                                      ...){
  if (!is.null(seed)){
    set.seed(seed)
  }
  src_seq <- gen_reads(ref_seq, n_reads, error_rates, name_prefix = 'src')
  if (is.null(contam_seq) | n_contam == 0){
    return(list(src = src_seq,
                out = DNAStringSet(NULL),
                true_consensus = ref_seq))
  } else {
    contam_seq <- gen_reads(contam_seq, n_contam, error_rates, name_prefix = 'out')
    return(list(src = src_seq,
                out = contam_seq,
                true_consensus = ref_seq))
  }
}

#' Simulates a random sequence of a given length
#' @param n The length of the desired sequence
#' @export

gen_seq <- function(n){
  DNAStringSet(paste0(sample(c('A', 'C', 'G', 'T'), n, replace=T), collapse=""))
}

#' Removes ambiguous letters from a sequence by replacing them with a randomly
#' selected letter they represent
#' @param seq_dat The sequence in which to remove the ambiguity characters
#' @export

randomize_ambig <- function(seq_dat){
  if (class(seq_dat) == 'DNAString'){
    seq_dat <- DNAStringSet(seq_dat)
  }
  ambig_char <- IUPAC_CODE_MAP[!(names(IUPAC_CODE_MAP) %in% c('A', 'C', 'G', 'T'))]
  for (j in 1:length(seq_dat)){
    for (i in 1:nchar(seq_dat[j])){
      curr_char <- substr(seq_dat[j], i, i)
      if (!(curr_char %in% c('A', 'C', 'G', 'T'))){
        n_options <- nchar(ambig_char[curr_char])
        sample_indx <- sample(1:n_options, 1)
        new_let <- substr(ambig_char[curr_char], sample_indx, sample_indx)
        seq_dat[j] <- gsub(curr_char, new_let, seq_dat[j])
      }
    }
  }
  return(seq_dat)
}

#' Mutates a single base to another base
#' @param x The input letter
#' @export

mutate_base <- function(x){
  stopifnot(length(x) == 1)
  stopifnot(nchar(x) == 1)
  alphabet <- c('A', 'C', 'G', 'T')
  alphabet <- alphabet[!(alphabet %in% x)]
  return(sample(alphabet, 1))
}

#' Adds n snps to each sequence in seq_dat
#' @param seq_dat The sequences in which to introduce the snps
#' @param n The number of snps to introduce in each sequence
#' @export

add_snps <- function(seq_dat, n){
  stopifnot(all(width(seq_dat) >= n))
  for (j in 1:length(seq_dat)){
    curr_seq <- seq_dat[j]
    snp_positions <- sample(1:nchar(curr_seq), n)
    for (snp_pos in snp_positions){
      snp_range <- IRanges(snp_pos, snp_pos)
      old_let <- as.character(subseq(curr_seq, snp_pos, snp_pos))
      new_let <- mutate_base(old_let)
      curr_seq <- replaceAt(curr_seq, snp_range, new_let)
    }
    seq_dat[j] <- curr_seq
  }
  return(seq_dat)
}

#' Simulate a scenario that can be used to test PID detection
#'
#' @param seq_len The length of the random sequence to put before the prefix
#' @param prefix_len The length of the prefix
#' @param pid_len The length of the pid
#' @param suffix_len The length of the suffix
#' @param prefix_snps The number of snps to introduce to the prefix
#' @param suffix_snps The number of snps to introduce to the suffix
#' @param suffix_chop The number of bases to chop from the end of the suffix
#' @export

gen_pid_search_scenario <- function(seq_len, prefix_len, pid_len, 
                                    suffix_len, prefix_snps, suffix_snps,
                                    suffix_chop){
  read_dat <- gen_seq(seq_len)
  pid <- gen_seq(pid_len)
  if (is.numeric(prefix_len)){
    prefix <- gen_seq(prefix_len)
  } else {
    prefix <- DNAStringSet(prefix_len)
  }
  seq_prefix <- add_snps(prefix, prefix_snps)
  if (is.numeric(suffix_len)){
    suffix <- gen_seq(suffix_len)
  } else {
    suffix <- DNAStringSet(suffix_len)
  }
  seq_suffix <- add_snps(suffix, suffix_snps)
  seq_suffix <- substr(seq_suffix, 1, (nchar(seq_suffix)-suffix_chop))
  seq_dat <- paste0(read_dat, seq_prefix, pid, seq_suffix)
  return(list(seq_dat = seq_dat,
              prefix = prefix,
              pid = pid,
              suffix = suffix))
}
philliplab/MotifBinner documentation built on Sept. 2, 2020, 11:41 a.m.