Packages used in this exercise.
#--- ape --- library(ape) #--- seqinr --- library(seqinr) #--- simulation --- library(phangorn) #--- msa --- library(msa)
# --- reading in the lizard fasta file lizard_seqs <- read.fasta( file = "lizard_seqs.fasta", seqtype = "DNA" ) # remove unwanted characters liz <- lapply(lizard_seqs, function(x){ x[!x %in% c(" ", "y", "r", "m", "s")] } )
There are gaps in the sequence in each of the accession sequences. Points of puRines (R), pYrimidines (Y), aMino groups bases (M) and strong interaction (S). Adenine have the highest composition percentage at each of the accession numbers. Area of strong interaction only occurs at accession number "FJ356747". Overall, these imply the sequences are incomplete. In the simulations gaps and all other non ACTG are removed.
The length of the sequnces at the first five accessions are as below.
#--- length and composition of the sequences --- lapply(lizard_seqs, length)[1:5]
First five accession base compositions:
lapply(lizard_seqs, table)[1:5]
The first five accession GC are as follows:
#--- GC content of each --- lapply(lizard_seqs, GC)[1:5]
Artificial DNA simulation; each nucleotide randomly and independently drawn from base frequencies.
bcomps <- lapply(lizard_seqs, table) #--- simulation fxn --- set.seed(12345) sims <- function(x){ x <- lapply(x, function(x){x[c("a","c","g","t")]}) # x is base compositions # return a list of sequences art_sims <- list() # simulations for (i in 1:length(x)) { art_sims[[paste("synthetic", i, sep = "_")]] <- sample( names(x[[i]]), size = sum(x[[i]]), replace = TRUE, prob = x[[i]]/sum(x[[i]]) ) } return(art_sims) } sim1_seq <- sims(bcomps)
The first five base frequencies are presented below. We observed that in most of the simulated sequence the a's had higher frequencies while in the origin sequence all the accessions had a's with highest frequency.
basecomposition_sim_seq <- lapply(sim1_seq, table) basecomposition_sim_seq[1:5]
#--- saving simulation as fasta file ape::write.dna(sim1_seq, file ="sim1_seq.fasta", format = "fasta", append =FALSE, nbcol = 6, colsep = " ", colw = 10)
#---- creating a phylogenetic tree with 33 nodes set.seed(12345) tree <- rtree(n = 33) plot(tree)
We opted to simulate sequences using a custom transition matrix Q where probabiliy of mutation is 0.1. Base frequencies of each of the accessions are used in the simulation. The DNA sequences are saved as a fasta file sim2_seq.fasta.
#simulating the sequence Q_mat <- matrix(data= c(0.9,0.1,0.1,0.1,0.1,0.9,0.1,0.1,0.1,0.1,0.9,0.1,0.1,0.1,0.1,0.9),4,4) base_f <- as.vector(bcomps[[1]][c("a","c","g","t")])/sum(bcomps[[1]][c("a","c","g","t")]) data <- simSeq(tree, l = 1000, type="DNA", bf=base_f, Q=Q_mat) counter = 1 sim2_seq = list() for (i in 1:length(data)) { sim2_seq[[paste("synthetic2", i, sep = "_")]] <- as.character(data)[counter:(counter+1000)] counter = counter + 1000 }
#saving as fasta file ape::write.dna(sim2_seq, file ="sim2_seq.fasta", format = "fasta", append =FALSE, nbcol = 6, colsep = " ", colw = 10)
A sample of the base compostion on the simulated sequences is shown below. The base frequencies are almost similar to the original sequence with a's and t's having higher frequencies.
basecomposition_sim2_seq <- lapply(sim2_seq, table) basecomposition_sim2_seq[1:5]
Basic statistics on GC content.
gc1 <- lapply(lizard_seqs, GC) gc1[1:5]
# GC content of sim1_seq gc2 <- lapply(sim1_seq, GC) gc2[1:5]
# omit na from sim2_seq sim2_seq <- lapply(sim2_seq, na.omit) gc3 <- lapply(sim2_seq, GC) gc3[1:5]
AT content for original lizard sequence first 5. Computed as 1 - GC content.
at1 <- list() for (i in 1:length(gc1)) { at1[[i]] <- 1 -gc1[[i]] } at1[1:5]
AT content for independence sequence simulation.
at2 <- list() for (i in 1:length(gc2)) { at2[[i]] <- 1 - gc2[[i]] } at2[1:5]
AT content of simulated sequence from tree
at3 <- list() for (i in 1:length(gc3)) { at3[[i]] <- 1 - gc3[[i]] } at3[1:5]
For all the sequences AT content was greater than 50%.
Base compositions:
lapply(lizard_seqs, table)[1:5]
# base composition of sim2_seq lapply(sim1_seq, table)[1:5]
# base compostion of sim1_seq lapply(sim2_seq, table)[1:6]
We used the translate function from package seqinr to obtain the protein sequences of each of the three data sets. We noticed that the simulated sequence using independence model the accessions had higher number of stop codons compared to the original sequences data sets. Some accessions had as high as 73 stop codons.
p_seq1 <- lapply(lizard_seqs, translate) p_seq2 <- lapply(sim2_seq, translate) p_seq3 <- lapply(sim1_seq, translate)
Markov chains were fitted for the lizards sequences and the artificial sequences, from the transition matrix obtained we have understood that the markov chain is of order 1. Thus, the current state only depends on the immediate previous state.
markov1 <-markovchainFit(sim1_seq) markov2 <-markovchainFit(sim2_seq) markov_initial <-markovchainFit(lizard_seqs)
#Align sequences Lizard_seq_align <- msa(readDNAStringSet("lizard_seqs.fasta")) sim1_seq_align <- msa(readDNAStringSet("sim1_seq.fasta")) sim2_seq_align <- msa(readDNAStringSet("sim2_seq.fasta")) Lizard_seq_align_c <- msaConvert(Lizard_seq_align, type="seqinr::alignment") sim1_seq_align_c <- msaConvert(sim1_seq_align, type="seqinr::alignment") sim2_seq_align_c <- msaConvert(sim2_seq_align, type="seqinr::alignment") dist_mat1 <- as.matrix(dist.alignment(Lizard_seq_align_c, matrix = "identity")) dist_mat2 <- as.matrix(dist.alignment(sim1_seq_align_c, matrix = "identity")) dist_mat3 <- as.matrix(dist.alignment(sim2_seq_align_c, matrix = "identity")) heatmap(dist_mat1) heatmap(dist_mat2) heatmap(dist_mat3)
#Lizard_seq_align_d <- as.DNAbin(Lizard_seq_align) #sim1_seq_align_d <- as.DNAbin(sim1_seq_align) #sim2_seq_align_d <- as.DNAbin(sim2_seq_align)
#fun <- function(x) as.phylo(hclust(dist.dna(x), "average")) # upgma() in phangorn #tree <- fun(Lizard_seq_align_d) #bstrees <- boot.phylo(tree, Lizard_seq_align_d, fun, trees = TRUE)$trees f <- function(x) upgma(x) tree <- f(dist_mat1) bstrees <- boot.phylo(tree, dist_mat1, f, trees = TRUE)$trees sim1_tree <- f(dist_mat2) bstrees_sim1 <- boot.phylo(sim1_tree, dist_mat2, f, trees = TRUE)$trees sim2_tree <- f(dist_mat3) bstrees_sim2 <- boot.phylo(sim2_tree, dist_mat3, f, trees = TRUE)$trees
# clads clad_l_seq <- prop.clades(tree, bstrees, rooted = TRUE) clad_sim1 <- prop.clades(sim1_tree, bstrees_sim1, rooted = TRUE) clad_sim2 <- prop.clades(sim2_tree, bstrees_sim2, rooted = TRUE) # bipartitions boot <- prop.clades(tree, bstrees) layout(1) par(mar = rep(2, 4)) boot_sim1 <- prop.clades(sim1_tree, bstrees_sim1) layout(1) par(mar = rep(2, 4)) boot_sim2 <- prop.clades(sim2_tree, bstrees_sim2) layout(1) par(mar = rep(2, 4))
#plot of main DNA plot(tree, main = "Bipartition vs. Clade Support Values", sub = "Lizard Sequence") drawSupportOnEdges(boot) nodelabels(clad_l_seq) legend("bottomleft", legend = c("Bipartitions", "Clades"), pch = 22, pt.bg = c("green", "lightblue"), pt.cex = 2.5)
#Plot synthetic sequence 1 plot(sim1_tree, main = "Bipartition vs. Clade Support Values", sub = "Simulated Sequence 1") drawSupportOnEdges(boot_sim1) nodelabels(clad_sim1) legend("bottomleft", legend = c("Bipartitions", "Clades"), pch = 22, pt.bg = c("green", "lightblue"), pt.cex = 2.5)
#plot synthetic sequence 2 plot(sim2_tree, main = "Bipartition vs. Clade Support Values", sub = "Simulated Sequence 2") drawSupportOnEdges(boot_sim2) nodelabels(clad_sim2) legend("bottomleft", legend = c("Bipartitions", "Clades"), pch = 22, pt.bg = c("green", "lightblue"), pt.cex = 2.5)
comparePhylo(tree, sim1_tree, plot = FALSE, force.rooted = FALSE, use.edge.length = FALSE) comparePhylo(tree, sim2_tree, plot = FALSE, force.rooted = FALSE, use.edge.length = FALSE) comparePhylo(sim1_tree, sim2_tree, plot = FALSE, force.rooted = FALSE, use.edge.length = FALSE)
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.