###############################################################################
# HELPER FUNCTIONS #
###############################################################################
## Simulate gene gain and loss
#' @importFrom phangorn Descendants
#' @importFrom stats runif rbinom setNames
#' @importFrom reshape2 melt
#' @author Ignacio Ferres
.sim_gl <- function(phy, m, brti, ne, norg, ou, ov, mrca_acc_o){
# Generate random deviate integers respect the expected number of gains using
# a poisson distribution, given a gain rate and a branch length, and
# distribute them uniformly along branches.
revbrti <- max(brti) - brti
dpt <- max(revbrti)
gain <- apply(m, 1, function(x){
ti <- (x[3] * ne) / dpt
min <- revbrti[as.character(x[1])]
max <- revbrti[as.character(x[2])]
min.ti <- round((min * ne) / dpt)
max.ti <- round((max * ne) / dpt)
# Note for me:
# unique() call should eventually be removed and below code
# should cope with these marginal cases better.
unique(round(runif(ti * ou, min=min.ti, max=max.ti)))
})
if (!length(gain)) gain <- rep(list(numeric(0)), dim(m)[1])
mgain <- melt(setNames(gain, paste(m$V1, m$V2, sep = '-')))
mgain <- mgain[order(mgain$value, decreasing = FALSE), ]
# Segments. Initialize list where to save gene loss events.
sgmt <- paste(m$V1, m$V2, sep = '-')
loss <- setNames(vector('list', length(sgmt)), nm = sgmt)
# Paths from root to tips
roottotip <- setNames(ape::nodepath(phy), nm = seq_len(norg))
# Empthy vector to fill acc genome size on each node
node_ag_size <- vector('integer', length(brti))
names(node_ag_size) <- names(brti)
# MRCA node is the first one after the tips:
node_ag_size[as.character(norg + 1)] <- mrca_acc_o
for (i in seq_len(norg)){
nds <- roottotip[[i]]
# Iterate over branches from root to tip
for (j in seq_len(length(nds) - 1L)){
sg <- c(nds[j], nds[j+1])
# Generations at which tree bifurcates (nodes)
tnode <- round(revbrti[sg] * ne / dpt)
psg <- paste(sg, collapse = '-')
# Do not repeat already computed branches
if (psg %in% sgmt){
# Initialize variable to save generations at which gl event occur
gloss <- c()
# Get gene gains for this segment (branch)
ggains <- mgain[which(mgain$L1==psg), 1, drop=TRUE]
ti <- tnode[1]
tfnode <- tnode[2]
ag_size <- node_ag_size[as.character(nds[j])]
for (tf in c(ggains, tfnode)){
# Length of sub-segment
tl <- tf - ti
# P loosing existing genes
pl <- ov * ag_size ##
# Simulate gene loss events. Number of successes in 'tl' generations.
# Here the algorithm don't mimic exactly what IMG model says:
x <- rbinom(1, size = tl, prob = pl)
# Next routine corrects behavoir to mimic IMG
while(x>0){
pos <- round(runif(x, min = ti, max = tf))
pos <- min(pos)
gloss <- c(gloss, pos)
ag_size <- ag_size - 1L
pl <- ov * ag_size
ti <- pos
x <- rbinom(1, size = tf - ti, prob = pl)
}
# Update initial time of sub-segment
ti <- tf + 1L
# Add a gene since every iteration is over computed gene gains.
ag_size <- ag_size + 1L
}
loss[[psg]] <- if (length(gloss)>0) gloss else integer()
# "Remove" segment (mark as already computed)
sgmt <- sgmt[-which(sgmt==psg)]
# Save accessory genome size of end node
node_ag_size[as.character(nds[j+1])] <- ag_size
}
}
}
if (!length(loss)) loss <- rep(list(numeric(0)), dim(m)[1])
dfg <- lapply(1:(dim(m)[1]), function(x){
if (length(gain[[x]])>0){
cbind(gain[[x]], m[x, 1], m[x, 2], 'G')
}else{
NULL
}
})
dfg <- do.call(rbind, dfg)
dfl <- lapply(1:(dim(m)[1]), function(x){
if (length(loss[[x]])>0){
cbind(loss[[x]], m[x, 1], m[x, 2], 'L')
}else{
NULL
}
})
dfl <- do.call(rbind, dfl)
df <- do.call(rbind, list(dfg, dfl))
df <- as.data.frame(df, stringsAsFactors = FALSE)
colnames(df) <- c('generation', 'from.node', 'to.node', 'type')
df$generation <- as.numeric(df$generation)
df$from.node <- as.integer(df$from.node)
df$to.node <- as.integer(df$to.node)
# df$type <- as.factor(df$type)
df <- df[order(df$generation, decreasing = FALSE), ]
pm <- matrix(data = 1L,
nrow = norg,
ncol = mrca_acc_o,
dimnames = list(paste0('genome', 1:norg),
NULL))
allDes <- Descendants(phy, type = 'tips')
allorgs <- seq_len(norg)
for (i in 1:dim(df)[1]){
# If it is a gain, then add a new column with ones and zeros according the
# descendants of the node where the gene was gained.
if (.subset2(df, c(4, i)) == 'G'){
# desc <- phy$tip.label[allDes[[.subset2(df, c(3, i))]]]
desc <- .subset2(phy, 3)[allDes[[.subset2(df, c(3, i))]]]
nwc <- rep(0, norg)
nwc[as.integer(sub('genome','',desc))] <- 1L
pm <- cbind(pm, nwc) # Future versions: modify in place with data.table
# ndm <- dim(pm)
# colnames(pm)[ndm[2]] <- paste0('gene', ndm[2])
# If it is a loss, then put zeros at the corresponding column and rows.
} else {
# desc <- phy$tip.label[allDes[[.subset2(df, c(3, i))]]]
desc <- .subset2(phy, 3)[allDes[[.subset2(df, c(3, i))]]]
# Loose existing genes from node descendants
ex <- which(colSums(pm[desc, , drop=FALSE])!=0) # Expensive
# sample
lo <- round(runif(1, min = 1, max = length(ex)))
exlo <- ex[lo]
pm[desc, exlo] <- 0L
if (sum(.subset(pm, allorgs, exlo))==0)
pm <- .subset(pm, allorgs, seq_len(dim(pm)[2])[-exlo]) #pm <- pm[, -exlo]
}
}
# colnames(pm) <- paste0('gene', seq_len(dim(pm)[2]))
return(pm)
}
#' @importFrom ape getMRCA nodepath
#' @importFrom stats setNames rpois
#' @importFrom phangorn Descendants
.sim_mut <- function(pm, phy, nsites, m, depth, brti, ne, norg, mu, smat){
revbrti <- max(brti) - brti
dpt <- max(revbrti)
if (missing(smat)){
smat <- .codon.subst.mat
}
# Paths from root to tips
roottotip <- setNames(nodepath(phy), nm = seq_len(norg))
# Create structure to store result
res_mut <- setNames(vector('list', dim(pm)[2]), dimnames(pm)[[2]])
# Iterate over genes
for (g in seq_len(dim(pm)[2])){
# ge <- genes[[i]]
# nsites <- length(ge)
mrca <- getMRCA(phy, names(which(pm[, g]==1L)))
if (!is.null(mrca)){
tips <- Descendants(phy, mrca, 'tips')[[1]]
# Segments.
ismrca <- which(sapply(roottotip, function(x) mrca %in%x))
rttp <- lapply(roottotip[ismrca], function(x){
x[which(x==mrca):length(x)]
})
sgmt <- m[which(m$V1 %in% unlist(rttp)), c(1,2)]
sgmt <- paste(sgmt$V1, sgmt$V2, sep = '-')
}else{
tips <- unname(which(pm[, g]==1))
# Segments
sgmt <- paste(rev(rev(roottotip[[which(pm[, g]==1L)]])[1:2]), collapse = '-')
}
# Create structure to save simulated mutations
muts <- setNames(vector('list', length = length(sgmt)), nm = sgmt)
#Iterate over descendant organisms
for (i in tips){
nds <- roottotip[[i]]
# Iterate over branches
for (j in seq_len(length(nds) - 1L)){
sg <- c(nds[j], nds[j+1])
psg <- paste(sg, collapse = '-')
# Do not repeat if already computed branch
if (psg %in% sgmt){
# Simulate number of substitutions for this segment (binomial)
slth <- revbrti[sg[2]] - revbrti[sg[1]]
ngen <- unname( slth * ne / dpt)
trials <- ngen * nsites[g]
# # Too big, coerces it into numeric for double presision :,(
# trials <- ngen * nsites
# # This produces NAs because expects class(trials) == 'integer' :
# rbinom(1, size = trials, prob = mu)
# # Seen on stackoverflow but also produces NaNs:
# qbinom(runif(1), trials, mu, FALSE)
# Approximation with poisson ¯\_(ツ)_/¯ :
nmut <- rpois(1, trials * mu)
if (nmut){
# Distribute substitutions with uniform probability
smut <- sample(seq_len(nsites[g]), nmut, replace = TRUE)
smut <- as.integer(ceiling(smut/3))
muts[[psg]] <- smut
}else{
muts[[psg]] <- NA_integer_
}
# "Remove" segment (mark as already computed)
sgmt <- sgmt[-which(sgmt==psg)]
}
}
}
#print(g)
res_mut[[g]] <- muts
#Finish gene[g]
}
res_mut
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.