In zcolburn/evolution: Genetic Algorithm For Protein Engineering

knitr::opts_chunk$set(
  collapse = TRUE,
  comment = "#>",
  fig.width=7, fig.height=5
)

Installation

The evolution package available on GitHub can be installed as follows:

devtools::install_github("zcolburn/evolution")
library(evolution)

An HTML rendered version of this vignette can be found on this RPubs page.

Overview

Using the functions included in the evolution package, you can simulate the evolution of regions of interest in a nucleotide sequence. The following code generates a nucleotide sequence of 100 codons and does not attach start or stop codons to the ends. Thus, in the following analyses, we ignore the possibility that the initial start and stop codons become mutated. The functions find_start_codon and find_stop_codon could be used to facilitate a more extensive analysis of this possibility.

Performing mutations

library(evolution)

# Set a random seed.
set.seed(10)

# Load %>% from dplyr.
library(dplyr)

# Create a nucleotide sequence.
nt_seq <- evolution::generate_sequence(
  num = 100, 
  start_codon = FALSE, stop_codon = FALSE
)

# Subject the sequence to mutation using default parameters.
mutated_seq <- nt_seq %>%
  evolution::substitution_mutator()

Simulating genetic drift

Next, I perform numerous iterations of mutation and compare the resulting protein sequence at each iteration. I make the assumptions that 1) the sequence is in-frame, 2) any alteration to protein sequence will result in a non-functional protein, causing cell death, 3) only substitution mutations take place, and 4) the number of individuals in the population is restored at the end of each time-step. The general modeling function defined below takes a minimum of 4 parameters: the initial nucleotide sequence, the number of time-steps, the population size, and the default nucleotide substitution probability.

# Get template protein sequence
prot_seq <- evolution::translate(nt_seq, check_start_and_stop = FALSE)

#' iterative_substitution
#'
#' @param nt_sequence A nucleotide sequence, as generated by generate_sequence.
#' @param num_iterations The number of time-steps.
#' @param pop_size The population size.
#' @param sub_rate The default nucleotide substitution probability.
#' @param ... Specific nucleotide substitution probabilities, i.e 
#' sub_rate_A_to_T or sub_rate_T_to_G.
iterative_substitution <- function(
  nt_sequence, num_iterations, pop_size, sub_rate, ...
){
  # Get  "..." arguments.
  params <- list(...)

  # Create a protein template for comparison to the protein sequence made
  # after substitution mutations.
  prot_template <- evolution::translate(nt_sequence, FALSE)

  # Initialize a list to store sequences for each element of the population.
  output <- rep(list(nt_sequence), pop_size)

  # Get loop start time.
  start_time <- as.numeric(Sys.time())

  # Define substitution rate setter. This is used to select the appropriate
  # substitution probability based on whether a specific subsitution 
  # probability has been defined or only the default.
  sub_rate_setter <- function(param, par = params){
    if(param %in% names(par)){
      return(par[[param]])
    }else{
      return(sub_rate)
    }
  }

  # Perform numerous rounds of mutations.
  for(i in 1:num_iterations){
    # Generate two mutated nucleotide sequences for each element in the 
    # population.
    new_seqs <- rep(lapply(
      output,
      function(current_seq){
        evolution::substitution_mutator(
          current_seq,
          sub_rate_A_to_T = sub_rate_setter("sub_rate_A_to_T", params),
          sub_rate_A_to_C = sub_rate_setter("sub_rate_A_to_C", params),
          sub_rate_A_to_G = sub_rate_setter("sub_rate_A_to_G", params),
          sub_rate_T_to_A = sub_rate_setter("sub_rate_T_to_A", params),
          sub_rate_T_to_C = sub_rate_setter("sub_rate_T_to_C", params),
          sub_rate_T_to_G = sub_rate_setter("sub_rate_T_to_G", params),
          sub_rate_C_to_T = sub_rate_setter("sub_rate_C_to_T", params),
          sub_rate_C_to_G = sub_rate_setter("sub_rate_C_to_G", params),
          sub_rate_C_to_A = sub_rate_setter("sub_rate_C_to_A", params),
          sub_rate_G_to_T = sub_rate_setter("sub_rate_G_to_T", params),
          sub_rate_G_to_C = sub_rate_setter("sub_rate_G_to_C", params),
          sub_rate_G_to_A = sub_rate_setter("sub_rate_G_to_A", params)
        )
      }
    ), 2)

    # Translate all codons of these nucleotide sequences into amino acids.
    new_prots <- lapply(
      new_seqs,
      function(current_seq){evolution::translate(current_seq, FALSE)}
    )

    # Identify protein sequences that do not match the template.
    is_dead_end <- lapply(
      new_prots,
      function(prot){
        if(any(is.na(prot)) || any(prot != prot_template)){
          return(TRUE)
        } else {
          return(FALSE)
        }
      }
    ) %>% unlist()

    # If no sequences are valid, then terminate the simulation.
    if(sum(!is_dead_end) == 0){return(paste0("All died on iteration ", i))}

    # Drop dead ends.
    new_seqs <- new_seqs[!is_dead_end]

    # Select pop_size sequences for the next iteration. This resamples with 
    # replacement such that the pop size is restored to pop_size at the end of 
    # each iteration.
    valid_indices <- sample(1:length(new_seqs), size = pop_size, replace = TRUE)
    output <- new_seqs[valid_indices]

    # Show progress.
    flush.console()
    print(
      sprintf(
        "%d/%d. Time remaining: %f s.", 
        i, 
        num_iterations, 
        (num_iterations - i)*(as.numeric(Sys.time())-start_time)/i)
    )
  }

  # Print elapsed time.
  print(sprintf("Elapsed time: %f", as.numeric(Sys.time())-start_time))

  # Output the result.
  return(output)
}

# Perform iterative substitution.
num_iterations <- 50
pop_size <- 200
result <- iterative_substitution(nt_seq, num_iterations, pop_size, 0.002)

We can compare the resulting sequences obtained above to the original sequence.

# Determine the number of single nucleotide polymorphisms (SNPs).
num_snps <- lapply(
  result,
  function(current_seq){
    sum(current_seq != nt_seq)
  }
) %>% unlist()

# Print the average number of SNPs per sequence.
sprintf("Average number of SNPs per sequence: %f", mean(num_snps))

# Check that the protein sequence remains unchanged.
prot_template <- evolution::translate(nt_seq, FALSE)
proteins_match <- lapply(
  result,
  function(current_seq){
    prot_seq <- evolution::translate(current_seq, FALSE)
    if(all(prot_seq == prot_template)){
      return(TRUE)
    } else {
      return(FALSE)
    }
  }
) %>% unlist()

# Print the number of protein sequences with amino acids that do not match the
# template.
sprintf(
  "Number of proteins that do not match the template: %d",
  sum(!proteins_match)
)

Despite the protein products all being identical, the nucleotide sequences that encode them carry an average of r round(mean(num_snps), 1) SNPs. That is due solely to genetic drift.

Introducing selective pressure

We can subject individuals to selective pressure in a couple ways. First, we could do so by limiting the substitution rate of individual nucleotides. In the example below, the rate at which adenine is substituted is reduced.

# Mutate the template nucleotide sequence as before but with reduced adenine
# substitution rates.
mut_result <- iterative_substitution(
  nt_seq, num_iterations, pop_size, 0.002,
  sub_rate_T_to_A = 0.0001,
  sub_rate_C_to_A = 0.0001,
  sub_rate_G_to_A = 0.0001
)

Repeating the analysis described above on the sequences that resulted from altered selective pressure gives:

# Compare new sequences to the original.
mut_num_snps <- lapply(
  mut_result,
  function(current_seq){
    sum(current_seq != nt_seq)
  }
) %>% unlist()

# Print the average number of single nucleotide polymorphisms (SNPs) per 
# sequence.
sprintf("Average number of SNPs per sequence: %f", mean(mut_num_snps))

# Check that the protein sequence remains unchanged.
mut_proteins_match <- lapply(
  mut_result,
  function(current_seq){
    prot_seq <- evolution::translate(current_seq, FALSE)
    if(all(prot_seq == prot_template)){
      return(TRUE)
    } else {
      return(FALSE)
    }
  }
) %>% unlist()

# Print the number of protein sequences with amino acids that do not match the
# template.
sprintf(
  "Number of proteins that do not match the template: %d",
  sum(!mut_proteins_match)
)

Compare SNP frequency distributions

# Place the simulated data in a tibble.
df <- tibble::data_frame(
  category = rep(c("Drift","Selection"), each = pop_size),
  snps = c(num_snps, mut_num_snps)
)

# Graph the results.
library(ggplot2)
ggplot(df, aes(snps, fill = category))+
  geom_histogram(position = "identity", alpha = 0.4, binwidth = 1)+
  theme_bw()+
  xlab("Number of SNPs")+
  ylab("Count")+
  scale_x_continuous(expand = c(0,0))+
  scale_y_continuous(expand = c(0,0))+
  theme(
    legend.position = c(0.1,0.9)
  )+
  labs(fill = "Condition")

Due to the difference in the adenine substitution rate, the two evaluation conditions exhibit different numbers of SNPs. These distributions are significantly different by the Kolmogorov-Smirnov test:

ks.test(num_snps, mut_num_snps, alternative = "two.sided")

Multiple correspondence analysis to compare sequence similarity

# Place all sequences in a data frame.
seqs <- lapply(c(result, mut_result), function(nt_seq){
  output <- matrix(nt_seq, nrow = 1)
  colnames(output) <- make.names(1:length(nt_seq))
  tibble::as_tibble(output)
}) %>%
  dplyr::bind_rows() %>%
  mutate_all(funs(factor)) %>% # Convert to factors, then add the condition.
  dplyr::mutate(condition = rep(c("Drift","Selection"), each = pop_size))

# Perform multiple correspondence analysis
mca_result <- FactoMineR::MCA(
  seqs, 
  ncp = 2, 
  quali.sup = which(colnames(seqs) == "condition"),
  graph = FALSE
)

# Extract MCA coordinates.
mca_coords <- mca_result$ind$coord %>%
  tibble::as_tibble() %>%
  dplyr::mutate(condition = seqs$condition)

# Plot MCA result
ggplot(mca_coords, aes(`Dim 1`, `Dim 2`, colour = condition))+
  geom_point()+
  theme_bw()+
  scale_x_continuous(expand = c(0,0))+
  scale_y_continuous(expand = c(0,0))+
  theme(
    legend.position = c(0.1,0.9)
  )+
  labs(colour = "Condition")

MCA reveals that r num_iterations generations of selection (with the rate parameters indicated above) is sufficient to cause noticeable divergence of two populations initially composed of r pop_size identical individuals.

Summary

The functions provided in this package can be used to simulate gene evolution. The methods provided allow for substitution, insertion, and deletion mutations to be made in arbitrary sequences. By modifying the iterative_substitution function defined above, more elaborate simulation schemes could be developed. For example, the probability of propagation of a particular gene sequence to the following generation could be modulated based on the disruption of sites of interest. This would be analogous to adjusting organism fitness.

zcolburn/evolution documentation built on May 19, 2019, 1:48 a.m.

rdrr.io home R language documentation Run R code online

CRAN packages Bioconductor packages R-Forge packages GitHub packages

Note that we can't provide technical support on individual packages. You should contact the package authors for that.

Tweet to @rdrrHQ

GitHub issue tracker

ian@mutexlabs.com