## WARNING:
## SOME LINES OF SNP.SIM.Q.R HAVE FALLEN OUT OF DATE w SNP.SIM.R.
## BEFORE USING SNP.SIM.Q, UPDATE (at least!):
## sapply, add n.mts > l.edge check,
## convert snps to logical, for loop (selectBiallelicSNP --> !l[[i]])
###############
## snp.sim.Q ##
###############
########################################################################
###################
## DOCUMENTATION ##
###################
#' Aternative SNPs simulation fn.
#'
#' Currently under development. Please use the regular snp.sim function to simulate genetic data.
#'
#' @param n.snps An integer specifying the number of snps columns to be simulated.
#' At present, all snps are simulated to be correlated to the
#' provided \code{phen.reconstruction} along the provided \code{tree}.
#' @param tree A \code{phylo} object containing the phylogenetic tree; or, a character string,
#' one of \code{"NJ"}, \code{"BIONJ"} (the default), or \code{"parsimony"};
#' or, if NAs are present in the distance matrix, one of: \code{"NJ*"} or \code{"BIONJ*"},
#' specifying the method of phylogenetic reconstruction.
#' @param phen.reconstruction Either a character string specifying \code{"parsimony"} (the default) or \code{"ML"} (maximum likelihood)
#' for the ancestral state reconstruction of the phenotypic variable,
#' or a vector containing this reconstruction if it has been performed elsewhere.
#' @param s If \code{set} is 3, the \code{s} parameter controls a baseline number of substitutions to be
#' experienced by the phenotype and associated loci: by default, 20.
#' @param af If \code{set} is 3, the \code{af} parameter provides an association factor,
#' controlling the preference for association over non-association at associated loci: by default, 10 (for a 10x preference).
#' @param plot A logical indicating whether to generate a plot of the phylogenetic tree (\code{TRUE}) or not (\code{FALSE}, the default).
#' @param heatmap A logical indicating whether to produce a heatmap of the genetic distance
#' between the simulated genomes of the n.ind individuals.
#' @param reconstruct Either a logical indicating whether to attempt to reconstruct
#' a phylogenetic tree using the simulated genetic data, or one of c("UPGMA", "nj", "ml")
#' to specify that tree reconstruction is desired by one of these three methods
#' (Unweighted Pair Group Method with Arithmetic Mean, Neighbour-Joining, Maximum-Likelihood).
#' @param dist.dna.model A character string specifying the type of model to use in reconstructing the phylogenetic tree for
#' calculating the genetic distance between individual genomes, only used if \code{tree} is
#' a character string (see ?dist.dna).
#' @param grp.min (Not yet (re-)implemented in this function.)
#' An optional number between 0.1 and 0.9 to control the proportional size of the smaller phenotypic group.
#' @param row.names An optional vector containing row names for the individuals to be simulated.
#' @param seed An optional integer to control the pseudo-randomisation process and allow for identical repeat runs of the function;
#' else \code{NULL}.
#'
#' @author Caitlin Collins \email{caitiecollins@@gmail.com}
#'
#'
#' @examples
#' ## Example ##
#'
#' @import adegenet
#' @rawNamespace import(ape, except = zoom)
#' @importFrom Hmisc all.is.numeric
#' @importFrom phangorn midpoint
#'
#' @export
########################################################################
# @useDynLib phangorn, .registration = TRUE
snp.sim.Q <- function(n.snps = 10,
tree = coalescent.tree.sim(100, seed=1),
phen.reconstruction, ## provide phen.rec and use this to set s=n.phen.subs...
s = NULL, ## n.subs for correlated snps.assoc (s over-rides n.phen.subs)
af = 10, ## association factor (relative odds of assoc vs. non-assoc in Q mat)
snp.root = NULL,
heatmap = FALSE,
reconstruct = FALSE,
dist.dna.model = "JC69",
grp.min = 0.25,
row.names = NULL,
seed=1){
# require(adegenet)
# require(ape)
temp <- snps <- snps.assoc <- tree.reconstructed <- sets <- phen <- phen.nodes <- NULL
n.snps.assoc <- n.snps
## HANDLE TREE: ##
## Always work with trees in "pruningwise" order:
tree <- reorder.phylo(tree, order="pruningwise")
## Trees must be rooted:
# if(!is.rooted(tree)) tree <- midpoint(tree)
####################################################################
############################
## Get Anc-Des EDGE ORDER ##
############################
## Get sequence from lowest ("root", Nterm+1) to highest ancestral node:
ix <- c(min(tree$edge[,1]):max(tree$edge[,1]))
## Get for loop index of rows in tree$edge[,1], in pairs, from lowest to highest:
x <- as.vector(unlist(sapply(c(1:length(ix)), function(e) which(tree$edge[,1] == ix[e]))))
####################################################################
##################################
## GET MUTATIONS' branch & loci ##
##################################
n.ind <- min(tree$edge[,1])-1 # tree$Nnode+1
gen.size <- n.snps
edges <- tree$edge
if(!is.null(seed)) set.seed(seed)
## Simulate genotype (& phen) for root individual: ##
## if snp.root given:
if(!is.null(snp.root)){
if(length(snp.root) == 1){
## select only root state --> different SNP sim method?
if(snp.root %in% c(0, FALSE)) gen.root <- rep(FALSE, gen.size)
if(snp.root %in% c(1, TRUE)) gen.root <- rep(TRUE, gen.size)
}else{
## if snp.root provided for all loci:
if(length(snp.root) == gen.size){
if(length(unique(snp.root[!is.na(snp.root)])) == 2){
gen.root <- as.logical(as.numeric(as.factor(snp.root))-1)
}else{
warning("snp.root must be binary; ignoring.")
}
}else{
warning("snp.root should either be of length 1 or length n.snps; ignoring.")
}
}
}
## check phen.rec:
if(!is.null(phen.reconstruction)){
if(length(unique(phen.reconstruction[!is.na(phen.reconstruction)])) == 2){
p.noms <- names(phen.reconstruction)
phen.reconstruction <- as.logical(as.numeric(phen.reconstruction))
names(phen.reconstruction) <- p.noms
## Infer n.phen.subs:
n.phen.subs <- length(which(phen.reconstruction[tree$edge[,1]] != phen.reconstruction[tree$edge[,2]]))
}else{
phen.reconstruction <- NULL
n.phen.subs <- NULL
warning("phen.reconstruction must be binary. Ignorning (sorry).")
}
}else{
phen.reconstruction <- NULL
n.phen.subs <- NULL
}
## For n.subs = n or = dist approaches:
gen.root <- sample(c(TRUE, FALSE), gen.size, replace=TRUE)
if(is.null(phen.reconstruction)){
phen.root <- sample(c(TRUE, FALSE), gen.size, replace=TRUE)
}else{
phen.root <- phen.reconstruction[min(tree$edge[,1])]
}
## get the sum of all branch lengths in the tree:
time.total <- sum(tree$edge.length)
## make dummy variables in which to store the resulting n.mts variables:
L <- lambda <- n.mts <- new.nt <- NA
snps.assoc <- NULL
## if n.snps.assoc is neither NULL nor 0:
if(is.null(n.snps.assoc)) n.snps.assoc <- 0
if(n.snps.assoc != 0){
## get non.assoc gen.size
gen.size.ori <- gen.size
gen.size <- gen.size-n.snps.assoc
## assign snps.assoc to be the last n.snps.assoc snps columns
snps.assoc <- c((gen.size+1):(gen.size+n.snps.assoc))
}
######################################################################################################################################################################
######################################################################################################################################################################
#########################
## SIM ASSOCIATED SNPS ##
#########################
## Need to treat ASSOCIATED SNPs differently:
## (non.assoc.snps do NOT need to pass the "while" check;
## they just need to match phen.loci at this point.)
snps.assoc.nodes <- phen.nodes <- NULL
if(n.snps.assoc != 0){
## (*) Now assuming phen.rec provided as input
## If s provided, s overrides n.phen.subs;
## else s=n.phen.subs as inferred from phen.reconstruction:
if(is.null(s)){
if(!is.null(n.phen.subs)){
s <- n.phen.subs
}else{
s <- 15
warning("Setting s (n.subs for snps.assoc in Q rate matrix) as 15.
User must provide phen.reconstruction (s=n.phen.subs) or set s argument to specify manually.")
}
}
## Scale s to account for total edge.length:
s <- s/sum(tree$edge.length)
## Set af = the association factor:
## *af = the relative odds of var1 changing state towards being in-phase w var2 vs.
## changing to the opposite state, out-of-step w var2, over a given branch length...
## (Q is actually an instantaneous transition rate matrix, but we will account for branch lengths later.)
if(is.null(af)){
af <- 10
warning("Setting association factor af = 10.")
}
## Get Q, the dependent transition rate/prob matrix:
Q.mat <- matrix(c(NA, 1*s, 1*s, 0,
1*af*s, NA, 0, 1*af*s,
1*af*s, 0, NA, 1*af*s,
0, 1*s, 1*s, NA),
nrow=4, byrow=T, dimnames=rep(list(c("0|0", "0|1", "1|0", "1|1")), 2))
diag(Q.mat) <- sapply(c(1:nrow(Q.mat)), function(e) -sum(Q.mat[e, c(1:ncol(Q.mat))[-e]]))
Q <- Q.mat
## get snps.loci for all ASSOCIATED snps, conditional on phen, Q: ##
snps.assoc.nodes <- list()
N.OVERLAP <- list()
Qt <- list()
## get snps.loci for FIRST ASSSOCIATED snp WITH PHEN.loci:
i <- 1
edges <- tree$edge
root.nt <- gen.root[snps.assoc[i]]
## Need to update tree$edge.length probs to mirror sampling without replacement...
## BUT only remove branches from the calculation once they have had a sub on them...?
# N.SUBS.TOTAL <- rpois(1, n.phen.subs) ############
###############
## FOR LOOP: ##
###############
set.seed(seed)
for(i in 1:n.snps.assoc){
## get nt for root at this locus:
root.nt <- gen.root[snps.assoc[i]]
snp.node <- as.list(rep(NA, length(unique(as.vector(edges)))))
names(snp.node) <- names(phen.reconstruction)
snp.node[[edges[x[1], 1]]] <- root.nt
## go from last to first edge in edges:
for(e in x){
probs <- NULL
####################################################
## get conditional probs for each edge w matexpo! ##
####################################################
Qt[[e]] <- matexpo(Q*tree$edge.length[e])
P <- Qt[[e]]
rownames(P) <- rownames(Qt[[e]]) <- rownames(Q)
colnames(P) <- colnames(Qt[[e]]) <- colnames(Q)
if(snp.node[[edges[e, 1]]] == FALSE & phen.reconstruction[[edges[e, 1]]] == FALSE) probs <- P[1,]
if(snp.node[[edges[e, 1]]] == FALSE & phen.reconstruction[[edges[e, 1]]] == TRUE) probs <- P[2,]
if(snp.node[[edges[e, 1]]] == TRUE & phen.reconstruction[[edges[e, 1]]] == FALSE) probs <- P[3,]
if(snp.node[[edges[e, 1]]] == TRUE & phen.reconstruction[[edges[e, 1]]] == TRUE) probs <- P[4,]
## Now we KNOW, a priori, the phen.node for the DESCENDANT
probs.mod <- replace(probs, which(!as.logical(as.numeric(keepLastN(colnames(Q), 1))) == phen.reconstruction[[edges[e, 2]]]), 0)
SP.dec <- sample(colnames(Q), 1, prob = probs.mod)
S.dec <- as.logical(as.numeric(keepFirstN(SP.dec, 1)))
names(S.dec) <- NULL
snp.node[[edges[e, 2]]] <- S.dec
} # end for (e) loop
## STORE SNPS.ASSOC (FOR ALL NODES):
snps.assoc.nodes[[i]] <- as.vector(unlist(snp.node))
################################################
##### ##### ##### ##### ##### ##### ##### #####
## Get proportion overlap btw phen and snps.assoc.i:
N.overlap.pos <- length(which(phen.reconstruction[1:n.ind] == snps.assoc.nodes[[i]][1:n.ind]))
# N.overlap <- max(N.overlap, (n.ind-N.overlap))
N.overlap.neg <- n.ind - N.overlap.pos
if(N.overlap.pos > N.overlap.neg){
N.overlap <- +round((N.overlap.pos/n.ind), 2)
}else{
N.overlap <- -round((N.overlap.neg/n.ind), 2)
}
N.OVERLAP[[i]] <- N.overlap
##### ##### ##### ##### ##### ##### ##### #####
################################################
} # end for loop
## Bind SNPs.ASSOC into matrix:
snps.assoc.nodes <- do.call("cbind", snps.assoc.nodes)
rownames(snps.assoc.nodes) <- names(phen.reconstruction)
###################################################
## TEMP -- COMPARE PHEN & ALL SNPS.ASSOC w PLOT: ##
###################################################
par(mfrow=c(2,6))
plot_phen(tree, phen.nodes=phen.reconstruction, main.title="phen")
if(n.snps.assoc >= 10){
## just plot first 10:
for(i in 1:5){
plot_phen(tree, phen.nodes=snps.assoc.nodes[,i], RTL = TRUE, show.axis=FALSE,
main.title=paste("snp.assoc", i, sep=" "))
title(N.OVERLAP[[i]], line=2.8, font.main=1, cex.main=0.8)
}
plot_phen(tree, phen.nodes=phen.reconstruction, main.title="phen")
for(i in 6:10){
plot_phen(tree, phen.nodes=snps.assoc.nodes[,i], RTL = TRUE, show.axis=FALSE,
main.title=paste("snp.assoc", i, sep=" "))
title(N.OVERLAP[[i]], line=2.8, font.main=1, cex.main=0.8)
}
}else{
## plot up to 5:
if(n.snps.assoc <= 5){
par(mfrow=c(1,(n.snps.assoc+1)))
for(i in 1:n.snps.assoc){
plot_phen(tree, phen.nodes=snps.assoc.nodes[,i], RTL = TRUE, show.axis=FALSE,
main.title=paste("snp.assoc", i, sep=" "))
title(N.OVERLAP[[i]], line=2.8, font.main=1)
}
}else{
## plot btw 5 and 10:
for(i in 1:5){
plot_phen(tree, phen.nodes=snps.assoc.nodes[,i], RTL = TRUE, show.axis=FALSE,
main.title=paste("snp.assoc", i, sep=" "))
title(N.OVERLAP[[i]], line=2.8, font.main=1)
}
plot_phen(tree, phen.nodes=phen.reconstruction, main.title="phen")
for(i in 6:n.snps.assoc){
plot_phen(tree, phen.nodes=snps.assoc.nodes[,i], RTL = TRUE, show.axis=FALSE,
main.title=paste("snp.assoc", i, sep=" "))
title(N.OVERLAP[[i]], line=2.8, font.main=1)
}
}
}
par(mfrow=c(1,1)) # end temp panel plot
gc()
} # end of snps.assoc generation
#### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ###
###########################################
## GET COMPLETE SNPS MATRIX ("genomes"): ##
###########################################
if(!is.null(temp)){
## Create snps matrix:
snps <- temp
rm(temp)
## Attach snps.assoc loci to last column:
if(!is.null(snps.assoc.nodes)){
snps <- cbind(snps[,c(1:(ncol(snps)-(n.snps.assoc)))], snps.assoc.nodes[1:n.ind,])
}
## keep only rows containing terminal individuals:
snps <- snps[1:n.ind, ]
}else{
## If only snps.assoc simulated (no non-assoc snps):
snps <- snps.assoc.nodes[1:n.ind,]
}
#### ### ### ### ### ### ### ### ### ### ### ### ####
##############################
## PLOTS & TREECONSTRUCTION ##
##############################
tree.reconstructed <- NULL
if(heatmap == TRUE || reconstruct!=FALSE){
## CONVERT TO CHARACTER: ##
dna <- snps
dna <- replace(dna, which(dna == TRUE), "a")
dna <- replace(dna, which(dna == "FALSE"), "t")
## Get DNAbin object:
dna <- as.DNAbin(dna)
# rownames(dna) <- c(1:nrow(snps))
#############
## HEATMAP ##
#############
if(heatmap==TRUE){
heatmap.DNAbin(dna=dna,
dist.dna.model=dist.dna.model)
}
##########################################
## PLOT 2: RECONSTRUCTING THE PHYLOGENY ##
##########################################
if(reconstruct!=FALSE){
tree.reconstructed <- tree.reconstruct(dna, # [1:n.ind,]
method=reconstruct,
dist.dna.model=dist.dna.model,
plot=TRUE)
}
## Remove unnecessary object:
rm(dna)
} # end heatmap, treeconstruction
##################
## CONVERT SNPS ##
##################
## (NO LONGER) CONVERT TO NUMERIC: ##
## Convert from logical to binary SNPs (for terminal nodes only):
# snps <- replace(snps, which(snps == TRUE), 1)
## Reassort snps.assoc to new columns:
if(!is.null(snps.assoc)){
if(!is.null(temp)){
rm(temp)
## update snps.assoc to reflect true loci
gen.size.final <- ncol(snps)
snps.assoc.loci.ori <- c((gen.size.final-(n.snps.assoc-1)):gen.size.final)
#########################################
## RANDOMIZE SNPS.ASSOC LOCI POSITIONS ##
#########################################
## Re-enabled snps.assoc loci "randomization" by
## just drawing indices and shuffling the columns accordingly...
## draw which SNPs will be associated to the phenotype
if(!is.null(seed)) set.seed(seed)
snps.assoc.loci <- sort(sample(c(1:gen.size.final),
n.snps.assoc,
replace=FALSE))
snps.indices <- c(1:gen.size.final)
snps.ori <- snps
snps.non.assoc <- snps[,c(1:(gen.size.final-n.snps.assoc))]
snps.assoc <- snps[,snps.assoc.loci.ori]
snps.new <- matrix(99, nrow=nrow(snps), ncol=gen.size.final)
snps.new[,snps.indices[-snps.assoc.loci]] <- snps.non.assoc
snps.new[,snps.assoc.loci] <- snps.assoc
snps <- snps.new
snps.assoc <- snps.assoc.loci
}
} # end snps.assoc randomization
###############################
## Assign row & column names ##
###############################
## assign/generate row.names
if(!is.null(row.names)){
## If row.names have been provided in args list, assign them:
if(length(row.names) == nrow(snps)){
## match tree$tip.label?
if(!is.null(tree$tip.label)){
if(all(row.names %in% tree$tip.label) & all(tree$tip.label %in% row.names)){
## REORDER to match tree$tip.labs if possible:
if(!identical(row.names, tree$tip.label)){
ord <- match(tree$tip.label, row.names)
row.names <- row.names[ord]
}
}
}
rownames(snps) <- row.names
}else{
if(is.null(rownames(snps))) rownames(snps) <- c(1:nrow(snps))
}
}else{
## Otherwise, try to assign rownames(snps) to match tree$tip.label:
if(!is.null(tree$tip.label)){
if(length(tree$tip.label) == nrow(snps)){
rownames(snps) <- tree$tip.label
}else{
rownames(snps) <- 1:nrow(snps)
warning("The length of tree$tip.label was not equal to nrow(snps) being simulated;
rownames(snps) have been set to 1:N and will not match tree$tip.label.")
}
}
}
## generate column names:
colnames(snps) <- 1:ncol(snps)
## Get association +/-:
snps.assoc.direction <- NULL
if(!is.null(N.OVERLAP)){
snps.assoc.direction <- as.vector(unlist(N.OVERLAP))
}
##################
## get RESULTS: ##
##################
out <- list(snps, snps.assoc, snps.assoc.nodes, snps.assoc.direction, tree.reconstructed, phen, phen.nodes)
names(out) <- c("snps", "snps.assoc", "snps.assoc.nodes", "snps.assoc.direction", "tree.reconstructed", "phen", "phen.nodes")
return(out)
} # end snp.sim.Q
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.