#################
## fwd.snp.sim ##
#################
########################################################################
###################
## DOCUMENTATION ##
###################
#' Short one-phrase description.
#'
#' Longer proper discription of function...
#'
#' @param snps description.
#'
#' @author Caitlin Collins \email{caitiecollins@@gmail.com}
#'
#'
#' @import adegenet
#' @rawNamespace import(ape, except = zoom)
#' @export
########################################################################
## ARGUMENTS: ##
## n.subs <- either an integer or a vector containing a distribution of n.subs-per-site
## phen.loci <- a vector containing the indices of the edges on which phen subs occurred
fwd.snp.sim <- function(n.snps=10000, n.subs=1,
n.snps.assoc=0, n.subs.assoc=15,
tree=coalescent.tree.sim(100),
heatmap=FALSE, reconstruct=FALSE,
dist.dna.model="JC69",
seed=1){
# require(adegenet)
# require(ape)
# require(phangorn)
##################################
## GET MUTATIONS' branch & loci ##
##################################
n.ind <- tree$Nnode+1
gen.size <- n.snps
if(!is.null(seed)) set.seed(seed)
## simulate genotype for root individual:
gen.root <- sample(c("a", "c", "g", "t"), gen.size, replace=TRUE)
## get the sum of all branch lengths in the tree:
time.total <- sum(tree$edge.length)
## make dummy variables in which to store the resulting n.mts variables:
L <- lambda <- n.mts <- new.nt <- NA
snps.assoc <- NULL
## if n.snps.assoc is neither NULL nor 0:
if(is.null(n.snps.assoc)) n.snps.assoc <- 0
if(n.snps.assoc != 0){
## get non.assoc gen.size
gen.size.ori <- gen.size
gen.size <- gen.size-n.snps.assoc
## assign snps.assoc to be the last n.snps.assoc snps columns
snps.assoc <- c((gen.size+1):(gen.size+n.snps.assoc))
}
###################
## Handle n.subs ##
###################
## Either an integer
## --> draw n.subs from a Poisson distribution w parameter n.subs
## OR a vector (containing a distribution)
## --> use this distribution to define n.subs-per-site
if(length(n.subs)==1){
#####################
## NO DISTRIBUTION ##
#####################
## if no distribution is inputted,
## use normal simulation procedure
## (ie. Poisson parameter 1):
## draw the number of mutations to occur at each site:
n.mts <- rpois(n=gen.size, lambda=(n.subs))
## for any n.mts==0, re-sample
for(i in 1:length(n.mts)){
while(n.mts[i]==0){
n.mts[i] <- rpois(n=1, lambda=(n.subs))
}
}
################
## SNPS.ASSOC ##
################
if(n.snps.assoc != 0){
##############
## OPTIONS: ##
##############
## NOTE: Only Poisson currently implemented..
## OTHER POSSIBILITIES:
## Use a second inputted distribution as the snps.assoc dist.
## Use the upper X% tail of the regular n.subs dist for the snps.assoc.
## Allow all possible combinations of the two n.subs & n.subs.assoc arguments.
## Re-enable the backward simulation options w phen.loci & assoc.prob w/in one fn?
#############
## POISSON ##
#############
## draw n.subs per snp.assoc site from
## Poisson dist w param n.subs.assoc:
if(length(n.subs.assoc) == 1){
## draw the number of mutations to occur at each site:
n.mts.assoc <- rpois(n=n.snps.assoc, lambda=(n.subs.assoc))
## for any n.mts==0, re-sample
for(i in 1:length(n.mts.assoc)){
while(n.mts.assoc[i]==0){
n.mts.assoc[i] <- rpois(n=1, lambda=(n.subs.assoc))
}
}
## add the n.mts assoc to the general (non.ssoc) n.mts vector:
n.mts <- c(n.mts, n.mts.assoc)
}
}
}else{
##################
## DISTRIBUTION ##
##################
## if a distribution is provided by the user,
## we use this to determine the number of substitutions
## to occur at what proportion of sites (note that
## we may not be simulating the same number of sites)
## get dist.prop, a distribution containing the counts
## of the number of SNPs to be simulated that will have
## i many substitutions
dist.sum <- sum(dist)
dist.prop <- round((dist/dist.sum)*gen.size)
## check that these counts sum to gen.size,
## else add the remainder to the largest n.subs count
## (OR should we just add these to the n.subs=1 set ??? ###
## likely to be the same thing, but could not be...)
if(sum(dist.prop) != gen.size){
m <- which.max(dist.prop)
#m <- 1
if(sum(dist.prop) < gen.size){
dist.prop[m] <- dist.prop[m] + (gen.size - sum(dist.prop))
}
if(sum(dist.prop) > gen.size){
dist.prop[m] <- dist.prop[m] - (sum(dist.prop) - gen.size)
}
}
## get rid of useless trailing 0s
while(dist.prop[length(dist.prop)] == 0){
dist.prop <- dist.prop[c(1:(length(dist.prop)-1))]
}
## make n.mts, a vector of length ncol(snps)
n.mts <- rep(1111, gen.size)
loci.available <- c(1:gen.size)
## assign dist.prop[i] elements of n.mts
## to be the same as the n.subs
## indicated by i, the given element of dist.prop
for(j in 1:length(dist.prop)){
## provided there are not 0 sites to have this number of substitutions...
if(dist.prop[j] > 0){
if(length(loci.available) > 1){
## assign dist.prop[i] elements of n.mts to be i
loci.selected <- sample(loci.available, dist.prop[j], replace = FALSE)
loci.available <- loci.available[-which(loci.available %in% loci.selected)]
}else{
## if there is only 1 (the last) loci available,
## we select this one:
loci.selected <- loci.available
}
n.mts[loci.selected] <- j
}
}
} # end dist
############################
## Assign mts to branches ##
############################
## whether n.mts is chosen by Poisson or according to a Distribution...
# if(n.snps.assoc != 0){
# ## for snps.assoc (the last n.snps.assoc snps, for now),
# ## add n.mts == n.phen.loci s.t these sites mutate at each
# ## and every phen.loci (for now, to be diluted later
# ## according to assoc.prob if !=100)
# n.mts <- c(n.mts, rep(length(phen.loci), n.snps.assoc))
# }
## for each site, draw the branches to which
## you will assign the mts for this site
## (~ branch length):
snps.loci <- sapply(c(1:length(n.mts)),
function(e)
sample(c(1:length(tree$edge.length)),
n.mts[e],
replace=FALSE,
prob=tree$edge.length))
# if(n.snps.assoc != 0){
# ## get snps.loci for the ASSOCIATED snps (ie. set to phen.loci) ##
# for(i in 1:n.snps.assoc){
# snps.loci[[snps.assoc[i]]] <- phen.loci
# #print(snps.loci[[snps.assoc[i]]])
# }
# }
## rearrange snps.loci s.t it becomes a
## list of length tree$edge.length,
## each element of which contains the
## locations of the mutations that will
## occur on that branch
snps.loci <- sapply(c(1:length(tree$edge.length)),
function(f)
seq_along(snps.loci)[sapply(snps.loci,
function(e) f %in% e)])
# we will store the output in a list called genomes:
genomes <- list()
## get the node names for all individuals (terminal and internal)
all.inds <- sort(unique(as.vector(unlist(tree$edge))))
## we start w all inds having same genotype as root:
for(i in all.inds){
genomes[[i]] <- gen.root
}
## store replacement nts in list new.nts:
new.nts <- list()
## distinguish btw list of loci and unique list
snps.loci.ori <- snps.loci
## will need to treat repeat loci differently...
snps.loci.unique <- lapply(snps.loci, unique)
## the last individual in the first column of tree$edge
## (ie. ind.length(tree$tip.label)+1 ) is our root individual:
x <- rev(c(1:nrow(tree$edge)))
#############################
## For Loop to get new nts ##
#############################
for(i in x){
## for all genomes other than root, we mutate the
## genome of the node preceding it, according to snps.loci.
## Draw new nts for each locus selected for mutation:
if(!.is.integer0(snps.loci.unique[[i]])){
new.nts[[i]] <- sapply(c(1:length(snps.loci.unique[[i]])), function(e)
selectBiallelicSNP(c("a", "c", "g", "t")[which(c("a", "c", "g", "t")
%in% genomes[[tree$edge[i,1]]]
[snps.loci.unique[[i]][e]])]))
## if any loci are selected for multiple mutations
## within their given branch length:
if(length(snps.loci.ori[[i]]) != length(snps.loci.unique[[i]])){
## identify which loci are repeaters
repeats <-table(snps.loci.ori[[i]])[which(table(snps.loci.ori[[i]])!=1)]
## how many times they repeat
n.reps <- repeats - 1
## the positions of these loci in the vector of snps loci
toRepeat <- which(snps.loci.unique[[i]] %in% names(repeats))
## run chain of re-sampling to end in our new nt for repeater loci:
foo <- list()
for(j in 1:length(toRepeat)){
foo[[j]] <- new.nts[[i]][toRepeat[j]]
for(k in 1:n.reps[j]){
if(k==1){
foo[[j]][k] <- selectBiallelicSNP(c("a", "c", "g", "t")[which(c("a", "c", "g", "t")
%in% foo[[j]][1])])
}else{
foo[[j]][k] <- selectBiallelicSNP(c("a", "c", "g", "t")[which(c("a", "c", "g", "t")
%in% foo[[j]][k-1])])
}
}
## retain only the last nt selected
out <- sapply(c(1:length(foo)),
function(e) foo[[e]][length(foo[[e]])])
}
## for the loci with repeated mts, replace these positions
## in new.nts with the corresponding elements of out, above.
new.nts[[i]][toRepeat] <- out
} # end of if statement for repeaters
## update ancestral genotype with new.nts:
temp <- genomes[[tree$edge[i,1]]]
temp[snps.loci.unique[[i]]] <- new.nts[[i]]
genomes[[tree$edge[i,2]]] <- temp
}else{
## if no mts occur on branch, set genotype of
## downstream individual to be equal to ancestor's
genomes[[tree$edge[i,2]]] <- genomes[[tree$edge[i,1]]]
}
} # end of for loop selecting new nts at mutator loci
###############################################
## MODIFY SNPS.ASSOC ACCORDING TO ASSOC.PROB ##
###############################################
# if(n.snps.assoc != 0){
# ## if we have any imperfect associations... ##
# if(any(assoc.prob != 100)){
# ## check length
# if(length(assoc.prob) != n.snps.assoc){
# ## if only 1 prob value given...
# if(length(assoc.prob) == 1){
# ## ... assume uniform assoc.prob;
# assoc.prob <- rep(assoc.prob, n.snps.assoc)
# ## no warning needed
# }else{
# ## BUT if assoc.prob of random length:
# ## repeat until of length n.snps.assoc
# assoc.prob <- rep(assoc.prob, length.out=n.snps.assoc)
# ## and print warning (only if not of length n.snps.assoc OR 1)
# warning("assoc.prob not of length n.snps.assoc;
# sequence will be repeated until correct length is reached.")
# }
# } # end checks
#
# ## for each associated SNP,
# ## we undo some associations | assoc.prob for that snp.assoc
# for(i in 1:n.snps.assoc){
# prob <- assoc.prob[i]
# ## only if the association is imperfect
# if(prob != 100){
# ## draw genomes to change at snps.assoc[i]
# n.toChange <- round(length(genomes)*(1 - (prob/100)))
# toChange <- sample(c(1:length(genomes)), n.toChange)
#
# ## change those genomes at snps.assoc[i]
# for(j in 1:length(toChange)){
# genomes[[toChange[j]]][snps.assoc[i]] <-
# selectBiallelicSNP(genomes[[toChange[j]]][snps.assoc[i]])
# } # end for loop
# }
# } # end for loop
# } # end any assoc.prob != 100
# } # end modification | assoc.prob
##############################
## PLOTS & TREECONSTRUCTION ##
##############################
if(heatmap == TRUE || reconstruct!=FALSE){
dna <- as.DNAbin(genomes)
names(dna) <- c(1:length(genomes))
}
#############
## HEATMAP ##
#############
if(heatmap==TRUE){
heatmap.DNAbin(dna=dna,
dist.dna.model=dist.dna.model)
}
##########################################
## PLOT 2: RECONSTRUCTING THE PHYLOGENY ##
##########################################
tree.reconstructed <- NULL
if(reconstruct!=FALSE){
tree.reconstructed <- tree.reconstruct(dna[1:n.ind],
method=reconstruct,
dist.dna.model=dist.dna.model,
plot=TRUE)
}
##################
## CONVERT SNPS ##
##################
## keep and return ONLY genomes for TERMINAL individuals
genomes <- genomes[1:n.ind]
x <- genomes
if(is.null(names(x))) names(x) <- paste("ind", c(1:length(x)), sep=".")
gen.size <- length(x[[1]])
## working with snps in matrix form
snps <- do.call("rbind", x)
## get snps as DNAbin
ploidy <- 1
snps.bin <- as.DNAbin(snps, ploidy=ploidy)
## get snps as genind
#source("C:/Users/caitiecollins/adegenet/R/sequences.R")
snps.gen <- DNAbin2genind(snps.bin, polyThres=0.01)
## get snps as binary matrix
snps <- snps.gen@tab
## correct genind for ploidy:
snps <- snps[,seq(1, ncol(snps), 2)]
colnames(snps) <- gsub("[:.:]|.$*", "", colnames(snps))
if(!is.null(snps.assoc)){
#############################################
## Ensure snps.assoc loci are ###############
## correctly labelled | n.columns retained ##
## in snps.gen object #######################
#############################################
## identify any columns of snps.bin that
## do NOT meet DNAbin2genind polyThres
## or are NOT SNPs
x <- snps.bin
if(is.list(x)) x <- as.matrix(x)
if(is.null(colnames(x))) colnames(x) <- 1:ncol(x)
temp <- lapply(1:ncol(x), function(i)
.getFixed(x[,i], i)) # process all loci, return a list
fixed.loci <- which(temp==TRUE) ## identify loci that are NOT SNPs
## update snps.assoc to reflect true loci
gen.size.final <- ncol(snps)
snps.assoc.loci.ori <- c((gen.size.final-(n.snps.assoc-1)):gen.size.final)
## Re-enabled snps.assoc loci "randomization" by
## just drawing indices and shuffling the columns accordingly...
## draw which SNPs will be associated to the phenotype
snps.assoc.loci <- sort(sample(c(1:gen.size.final),
n.snps.assoc,
replace=FALSE))
snps.indices <- c(1:gen.size.final)
snps.ori <- snps
snps.names.ori <- ind.names.ori <- NULL
if(!is.null(dimnames(snps))){
snps.names.ori <- dimnames(snps)[[2]]
ind.names.ori <- dimnames(snps)[[1]]
}
snps.non.assoc <- snps[,c(1:(gen.size.final-n.snps.assoc))]
snps.assoc <- snps[,snps.assoc.loci.ori]
snps.new <- matrix(99, nrow=100, ncol=gen.size.final)
snps.new[,snps.indices[-snps.assoc.loci]] <- snps.non.assoc
snps.new[,snps.assoc.loci] <- snps.assoc
snps <- snps.new
if(!is.null(snps.names.ori)) colnames(snps) <- snps.names.ori
if(!is.null(ind.names.ori)) rownames(snps) <- ind.names.ori
snps.assoc <- snps.assoc.loci
## update names of snps.assoc
gen.size <- ncol(snps)
names(snps.assoc) <- as.vector(unlist(sapply(c(1:length(snps.assoc)),
function(e)
paste("L",
paste(rep(0, (nchar(gen.size)-
nchar(snps.assoc[e]))),
sep="",
collapse=""),
snps.assoc[e], sep=""))))
}
##################
## get RESULTS: ##
##################
out <- list(snps, snps.assoc, tree.reconstructed)
names(out) <- c("snps", "snps.assoc", "tree.reconstructed")
return(out)
} # end fwd.snp.sim
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.