Nothing
#' Estimate TE dynamics using mismatch data
#'
#' Given the number of mismatches and element lengths for an LTR retrotransposon family, estimate the age distribution, insertion rate, and deletion rates.
#'
#' @param mismatch A vector containing the number of mismatches.
#' @param len A vector containing the length of each element.
#' @param r Mutation rate (substitutions/(million year * site)) used in the calculation.
#' @param perturb A scalar multiple to perturb the estimated death rate from the null hypothesis estimate. Used to generate the sensitivity analysis.
#' @param rateRange A vector of death rates, an alternative to \code{perturb} for specifying the death rates.
#' @param plotFit Whether to plot the distribution fits.
#' @param plotSensitivity Whether to plot the sensitivity analysis.
#' @param pause Whether to pause after each plot.
#' @param main The title for the plot.
#'
#' @details \code{EstDynamics} estimates the TE dynamics through fitting a negative binomial fit to the mismatch data, while \code{EstDynamics2} uses a mixture model. For detailed implementation see References.
#'
#' @return \code{EstDynamics} returns a \code{TEfit} object, containing the following fields, where the unit for time is million years ago (Mya):
#' \item{pvalue}{The p-value for testing H_0: The insertion rate is uniform over time.}
#' \item{ageDist}{A list containing the estimated age distributions.}
#' \item{insRt}{A list containing the estimated insertion rates.}
#' \item{agePeakLoc}{The maximum point (in age) of the age distribution. }
#' \item{insPeakLoc}{The maximum point (in time) of the insertion rate.}
#' \item{estimates}{The parameter estimates from fitting the distributions; see References}
#' \item{sensitivity}{A list containing the results for the sensitivity analysis, with fields \code{time}: time points; \code{delRateRange}: A vector for the range of deletion rates; \code{insRange}: A matrix whose columns contain the insertion rates under different scenarios.}
#' \item{n}{The sample size.}
#' \item{meanLen}{The mean of element length.}
#' \item{meanDiv}{The mean of divergence.}
#' \item{KDE}{A list containing the kernel density estimate for the mismatch data. }
#' \item{logLik}{The log-likelihoods of the parametric fits.}
#'
#' @references
#' \cite{Dai, X., Wang, H., Dvorak, J., Bennetzen, J., Mueller, H.-G. (2018). "Birth and Death of LTR Retrotransposons in Aegilops tauschii". Genetics}
#' @export
#'
#' @examples
#' # Analyze Gypsy family 24 (Nusif)
#' data(AetLTR)
#' dat <- subset(AetLTR, GroupID == 24 & !is.na(Chr))
#' set.seed(1)
#' res1 <- EstDynamics(dat$Mismatch, dat$UngapedLen, plotFit=TRUE, plotSensitivity=FALSE, pause=FALSE)
#'
#' # p-value for testing a uniform insertion rate
#' res1$pvalue
#'
EstDynamics <- function(mismatch, len, r=1.3e-2, perturb=2, rateRange=NULL, plotFit=FALSE, plotSensitivity=FALSE, pause=plotFit && plotSensitivity, main=sprintf('n = %d', n)) {
stopifnot(length(mismatch) == length(len))
stopifnot(all(perturb > 0))
if (any(is.na(mismatch)) || any(is.na(len))) {
stop('NA values in the input are not allowed')
}
n <- length(mismatch)
perturb[perturb < 1] <- 1 / perturb[perturb < 1]
# mutation rate
meanLen <- mean(len)
## Fit distributions to the mutation data.
geom1 <- MASS::fitdistr(mismatch, 'geometric')
geom1$loglik <- sum(stats::dgeom(mismatch, geom1[['estimate']], log=TRUE))
nb1 <- suppressWarnings({
MASS::fitdistr(mismatch, 'negative binomial')
})
bw <- stats::bw.nrd0(mismatch)
bw <- max(bw, 0.5)
den1 <- stats::density(mismatch, bw, kernel='gaussian')
logLik <- c(nb1 = nb1[['loglik']], geom1 = geom1[['loglik']])
logLikDiff <- logLik['nb1'] - logLik['geom1']
pv <- stats::pchisq(2 * logLikDiff, 1, lower.tail=FALSE)
## Recover age distribution.
alpha1 <- nb1$estimate['size']
beta1 <- nb1$estimate['size'] / nb1$estimate['mu']
## Time axis
maxPts <- floor(max(mismatch) * 1.5)
pts <- seq(0, maxPts)
ptsDense <- seq(0.1, maxPts, by=0.1)
timeScale <- pts / meanLen / r / 2
timeScaleDense <- ptsDense / meanLen / r / 2
nbDensity <- stats::dnbinom(pts, nb1$estimate['size'], mu=nb1$estimate['mu'])
lineGeom <- stats::dgeom(pts, geom1$estimate['prob'])
lineGammaRec <- stats::dgamma(ptsDense, shape=alpha1, rate=beta1) # TODO: use log scale.
yMax <- max(max(den1[['y']]), max(nbDensity), max(lineGeom), max(lineGammaRec)) * 1.1
if (plotFit) {
lwd <- 2
MASS::truehist(as.integer(mismatch),
xlab='Mismatch',
main=main,
ylim=c(0, yMax),
prob=TRUE,
col='white',
nbins=min(20, length(unique(as.integer(mismatch))) + 1))
# , xlim=c(0, max(mismatch)))
ind <- den1$x >= 0
lines(den1$x[ind], den1$y[ind], lwd=lwd)
lines(pts, nbDensity, lty=2, lwd=lwd, col='blue')
# lines(pts, lineGeom, lty=3)
lines(ptsDense, lineGammaRec, lty=4, lwd=lwd, col='blue')
legend('topright',
legend=c('KDE',
'neg. bin.',
# 'geometric',
'age dist'),
lty=c(1, 2, 4),
lwd=lwd,
col=c('black', 'blue', 'blue'))
# title='Density fit')
if (pause)
readline('Press enter to continue')
}
## Recover birth rate. The death rate is estimated from H0 (to be improved).
p1 <- geom1$estimate['prob']
deathRate <- p1 / (1 - p1)
names(deathRate) <- 'rate'
lineExpRec <- stats::dexp(ptsDense, deathRate)
# plot(pts, lineExpRec, type='l', xlim=c(0, 40))
birthRateRec <- lineGammaRec / stats::pexp(ptsDense, deathRate, lower.tail=FALSE)
# Normalize to unit integral
birthRateRec <- birthRateRec / sum(birthRateRec) *
length(birthRateRec) / diff(range(timeScaleDense))
ageDistRec <- lineGammaRec / sum(lineGammaRec) *
length(lineGammaRec) / diff(range(timeScaleDense))
## Sensitivity analysis.
if (is.null(rateRange)) {
rateRange <- signif(deathRate * c(rev(1 / perturb), 1, perturb), 3)
} else {
rateRange <- signif(rateRange, 3)
}
## In mutation scale.
birthRange <- sapply(rateRange, function(rate) {
birth <- lineGammaRec / (1 - stats::pexp(ptsDense, rate))
birth[is.infinite(birth)] <- NaN
birth <- birth / sum(birth, na.rm=TRUE) / (timeScaleDense[2] - timeScaleDense[1])
birth
})
colnames(birthRange) <- rateRange
obj1 <- rainbow::fts(timeScaleDense, birthRange, xname='Mya', yname='Normalized birth rate')
ind <- max(which(birthRange > 1e-4, arr.ind=TRUE)[, 1])
if (ind == -Inf)
ind <- length(timeScaleDense)
xMaxSen <- timeScaleDense[ind]
if (plotSensitivity) {
rainbow::plot.fds(obj1, plot.type='functions', main=sprintf('Sensitivity analysis'), xlim=c(0, xMaxSen), plotlegend=TRUE)
if (pause)
readline('Press enter to continue')
}
estimates <- c(nb_n = unname(nb1$estimate['size']),
nb_p = unname(nb1$estimate['size'] /
(nb1$estimate['size'] +
nb1$estimate['mu'])),
gamma_alpha= unname(alpha1) ,
gamma_beta = unname(beta1) * 2 * r * meanLen,
geom_p = unname(geom1$estimate) ,
exp_lambda = unname(deathRate) * 2 * r * meanLen,
exp_half = log(2) / (unname(deathRate) * 2 * r * meanLen))
res <- list(
pvalue = unname(pv),
# Gamma mode, in My
ageDist = list(x =timeScaleDense, y=ageDistRec),
insRt = list(x=timeScaleDense, y=birthRateRec),
agePeakLoc = unname((alpha1 - 1) / beta1 / 2 / r / meanLen),
insPeakLoc = timeScaleDense[which.max(birthRateRec)],
estimates = estimates,
sensitivity = list(
time = timeScaleDense,
delRateRange = rateRange,
insRange = birthRange
),
n = n,
meanLen = meanLen,
meanDiv = mean(mismatch / len),
KDE = den1,
logLik = logLik)
class(res) <- 'TEfit'
res
}
#' Print a TEfit or TEfit2 object
#'
#' @param x A TEfit or TEfit2 object
#' @param ... Not used
#'
#' @export
print.TEfit <- function(x, ...) {
cat(sprintf('Fit for LTR-RTN family\n\nn=%s, mean divergence=%.2g, mean length=%.4g\n', x[['n']], x[['meanDiv']], x[['meanLen']]))
cat(sprintf('P-value for testing uniform insertion rate: %.3g\n', x[['pvalue']]))
invisible(x)
}
#' @rdname print.TEfit
#' @export
print.TEfit2 <- function(x, ...) {
cat('Mixture ')
NextMethod()
}
#' Generate sensitivity plots
#'
#' Create sensitivity plots of a few families to investigate different death rate scenarios
#'
#' @param resList A list of families returned by \code{\link{EstDynamics}}
#' @param col A vector of colors
#' @param xMax The maximum of the x-axis
#' @param markHalfPeak Whether to mark the time points with half-intensity
#' @param famLegend Whether to create legend for families
#' @param rLegend Text for the legend for families
#' @param ... Passed into \code{matplot}
#' @export
#' @examples
#' data(AetLTR)
#' copia3 <- subset(AetLTR, GroupID == 3 & !is.na(Chr))
#' copia9 <- subset(AetLTR, GroupID == 9 & !is.na(Chr))
#' res3 <- EstDynamics(copia3$Mismatch, copia3$UngapedLen)
#' res9 <- EstDynamics(copia9$Mismatch, copia9$UngapedLen)
#' SensitivityPlot(list(`Copia 3`=res3, `Copia 9`=res9))
SensitivityPlot <- function(resList, col, xMax, markHalfPeak=FALSE, famLegend=TRUE, rLegend=names(resList), ...) {
if (!is.null(names(resList)[1]) && 'pvalue' %in% names(resList)) { # not a list
resList <- list(resList)
}
# if (any(sapply(resList, function(x) 'TEfit2' %in% class(x)))) {
# warning("SensitivityPlot is only implemented for 'TEfit' objects. Using fits produced by EstDynamics")
# }
if (missing(col)) {
col <- grDevices::rainbow(length(resList))
}
# v: A vector
# len: The desired length after padding in NAs
padNA <- function(v, len) {
if (is.vector(v)) {
c(v, rep(NA, len - length(v)))
} else {
rbind(v, matrix(NA, len - nrow(v), ncol(v)))
}
}
timeList <- lapply(resList, function(res) if (inherits(res, 'TEfit2')) res[['sensitivity2']][['time']] else res[['sensitivity']][['time']])
maxl <- max(sapply(timeList, length))
timeMat <- sapply(timeList, padNA, len=maxl)
birthRangeAll <- sapply(resList, function(res) {
if (inherits(res, 'TEfit2')) {
padNA(res[['sensitivity2']][['insRange']], maxl)
} else {
padNA(res[['sensitivity']][['insRange']], maxl)
}
}, simplify='array') # Reorder to a 3D array
if (any(is.nan(birthRangeAll))) {
warning('Some scenario may produce unreasonable results')
}
if (missing(xMax)) {
xMax <- max(timeMat[which(birthRangeAll > 1e-4, arr.ind=TRUE)[, c(1, 3)]])
}
yMax <- max(birthRangeAll, na.rm=TRUE)
baselwd <- 2
matplot(timeMat, birthRangeAll[, 2, ], xlab='Mya', lty=1, col=col, type='l', xlim=c(0, xMax), ylim=c(0, yMax), lwd=baselwd, ...)
matplot(timeMat, birthRangeAll[, 1, ], xlab='Mya', lty=2, col=col, add=TRUE, type='l', lwd=baselwd, ...)
matplot(timeMat, birthRangeAll[, 3, ], xlab='Mya', lty=3, col=col, add=TRUE, type='l', lwd=baselwd + 2, ...)
legend('topright', c('low', 'median', 'high'), lty=c(2, 1, 3), title='Death rate', lwd=c(baselwd, baselwd, baselwd + 2))
# indices are (subject, scenario, before/after, x/y)
peakLoc <- array(NA, c(dim(birthRangeAll)[3], 3, 2, 2))
dimnames(peakLoc) <- list(names(resList), NULL, c('before', 'after'), c('x', 'y'))
for (k in 1:dim(birthRangeAll)[3]) {
halfInd <- apply(birthRangeAll[, , k], 2, findHalfIntensity)
for (i in 1:2) {
xx <- timeMat[halfInd[i, ], k]
yy <- halfInd[3, ]
peakLoc[k, , i, ] <- cbind(xx, yy)
if (markHalfPeak) {
matlines(matrix(c(xx - 0.09,
xx + 0.09), nrow=2,
byrow=TRUE),
rbind(yy, yy),
pch='-', lwd=c(baselwd, baselwd, baselwd + 2), col=col[k],
lty=c(2, 1, 3))
}
}
}
if (length(resList) != 1 && !is.null(col) && famLegend) {
legend('right', rLegend, col=col, lty=1, lwd=baselwd)
}
invisible(list(timeMat=timeMat,
birthRange=birthRangeAll,
peakLoc=peakLoc))
}
findHalfIntensity <- function(y) {
peakLoc <- which.max(y)
peak <- y[peakLoc]
half <- peak / 2
beforePeak <- seq_len(peakLoc)
afterPeak <- seq(peakLoc, length(y), by=1)
beforex <- beforePeak[which.min(abs(y[beforePeak] - half))]
afterx <- afterPeak[which.min(abs(y[afterPeak] - half))]
c(beforex=beforex, afterx=afterx, halfPeak=half)
}
#' Calcualte the KL divergence of a negative binomial fit to the mismatch data.
#'
#' @param res A TEfit object.
#'
#'
#' @export
#' @examples
#' # Analyze Gypsy family 24 (Nusif)
#' data(AetLTR)
#' dat <- subset(AetLTR, GroupID == 24 & !is.na(Chr))
#' set.seed(1)
#' res1 <- EstDynamics(dat$Mismatch, dat$UngapedLen, plotFit=TRUE, plotSensitivity=FALSE, pause=FALSE)
#' nbLackOfFitKL(res1)
nbLackOfFitKL <- function(res) {
pts <- seq(0, 0.2 * res$meanLen) # TODO: modify this
if ('TEfit2' %in% class(res)) {
est2 <- res[['estimates2']]
nbDensity <- dTwoNB(pts,
size1=est2[['size1']],
prob1=est2[['prob1']],
size2=est2[['size2']],
prob2=est2[['prob2']],
p=est2[['p']])
} else {
est <- res[['estimates']]
nbDensity <- stats::dnbinom(pts, est['nb_n'], est['nb_p'])
}
KDE <- res[['KDE']]
KDEval <- stats::approx(KDE[['x']], KDE[['y']], pts)[['y']]
KDEval <- KDEval / sum(KDEval, na.rm=TRUE)
kl <- sum(nbDensity * log(nbDensity / KDEval), na.rm=TRUE)
# browser()
kl
}
dTwoNB <- function(x, size1, prob1, size2, prob2, p) {
stopifnot(p >= 0 && p <= 1)
stats::dnbinom(x, size=size1, prob=prob1) * p +
stats::dnbinom(x, size=size2, prob=prob2) * (1 - p)
}
#' @param nTrial The number of starting points for searching for the MLE.
#' @param ... Pass to EstDynamics
#' @return
#' This function returns a \code{TEfit2} object, containing all the above fields for \code{TEfit} and the following:
#' \item{estimates2}{The parameter estimates from fitting the mixture distribution.}
#' \item{ageDist2}{The estimated age distribution from fitting the mixture distribution.}
#' \item{insRt2}{The estimated insertion rate from fitting the mixture distribution.}
#' \item{agePeakLoc2}{Maximum point(s) for the age distribution.}
#' \item{insPeakLoc2}{Maximum point(s) for the insertion rate.}
#'
#' @rdname EstDynamics
#' @export
#'
#' @examples
#'
#' # Use a mixture distribution to improve fit
#' res2 <- EstDynamics2(dat$Mismatch, dat$UngapedLen, plotFit=TRUE)
#'
#' # A larger number of trials is recommended to achieve the global MLE
#' \dontrun{
#' res3 <- EstDynamics2(dat$Mismatch, dat$UngapedLen, plotFit=TRUE, nTrial=1000L)
#' }
EstDynamics2 <- function(mismatch, len, r=1.3e-2, nTrial=10L, perturb=2, rateRange=NULL, plotFit=FALSE, plotSensitivity=FALSE, pause=plotFit && plotSensitivity, ...) {
ddd <- list(...)
main <- ddd[['main']]
meanLen <- mean(len)
# plotFit <- ddd[['plotFit']]
# if (is.null(plotFit)) {
# plotFit <- FALSE
# }
res_ <- EstDynamics(mismatch, len, plotFit=FALSE, plotSensitivity=FALSE, pause=FALSE, ...)
size0 <- res_[['estimates']]['nb_n']
prob0 <- res_[['estimates']]['nb_p']
if (nTrial <= 0)
stop('need at least 1 trial')
loglik <- -Inf
res <- NULL
for (i in seq_len(nTrial)) {
start <- list(size1 = size0 * stats::runif(1, 0.5, 3),
prob1 = prob0 * stats::runif(1, 0.5, 1.5),
size2 = size0 * stats::runif(1, 0.5, 3),
prob2 = prob0 * stats::runif(1, 0.5, 1.5),
p = stats::runif(1, 0.1, 0.9))
# In case the starting value does not work.
tmpRes <- suppressWarnings({
tryCatch(
MASS::fitdistr(mismatch, dTwoNB, start=start,
lower=c(0, 0, 0, 0, 0), upper=c(Inf, 1, Inf, 1, 1),
control=list(maxit=500L)),
error=function(e) NULL
)
})
if (!is.null(tmpRes) && stats::logLik(tmpRes) > loglik) {
loglik <- unname(stats::logLik(tmpRes))
res <- tmpRes
}
}
if (is.null(res))
stop('No starting point works')
est2 <- res[['estimate']]
mu1 <- est2['size1'] * (1 - est2['prob1']) / est2['prob1']
alpha1 <- est2['size1']
beta1 <- est2['size1'] / mu1
mu2 <- est2['size2'] * (1 - est2['prob2']) / est2['prob2']
alpha2 <- est2['size2']
beta2 <- est2['size2'] / mu2
lof <- nbLackOfFitKL(res_)
maxMis <- floor(max(mismatch) * 1.5)
pts <- seq(0, maxMis)
ptsDense <- seq(0.1, maxMis, by=0.1)
timeScale <- pts / meanLen / r / 2
timeScaleDense <- ptsDense / meanLen / r / 2
den1 <- res_[['KDE']]
lineNB <- dnbinom(pts,
res_[['estimates']]['nb_n'],
res_[['estimates']]['nb_p'])
lineGammaRec <- res_[['ageDist']][['y']]
lineGammaRec <- lineGammaRec * max(res_[['ageDist']][['x']]) / maxMis
lineNB2 <- dTwoNB(pts, size1=est2['size1'], prob1=est2['prob1'],
size2=est2['size2'], prob2=est2['prob2'], p=est2['p'])
lineGammaRec2 <- stats::dgamma(ptsDense, alpha1, beta1) * est2['p'] + stats::dgamma(ptsDense, alpha2, beta2) * (1 - est2['p'])
allLines <- c(den1[['y']], lineNB, lineGammaRec, lineNB2, lineGammaRec2)
yMax <- max(allLines[!is.infinite(allLines)])
if (plotFit) {
if (is.null(main)) {
main <- sprintf('n = %d, LoF = %.3f',
length(mismatch), lof)
}
hist(mismatch, xlab='Mismatch',
main=main,
ylim=c(0, yMax), probability=TRUE, col='white')
lwd <- 2
ind <- den1$x >= 0
lines(den1$x[ind], den1$y[ind], lwd=lwd)
lines(pts, lineNB, lty=2, lwd=lwd, col='blue')
lg <- c('KDE', 'neg. bin.', 'age dist.', '2 neg. bin.', '2 age dist.')
lty <- c(1, 2, 4, 2, 4)
col <- c('black', rep('blue', 2), rep('red', 2))
legend('topright', legend=lg, col=col, lty=lty, lwd=lwd)
lines(ptsDense, lineGammaRec, lty=4, lwd=lwd, col='blue')
lines(pts, lineNB2, lty=2, col='red', lwd=lwd)
lines(ptsDense, lineGammaRec2, lty=4, col='red', lwd=lwd)
}
## Sensitivity analysis.
p1 <- res_$estimate['geom_p']
deathRate <- p1 / (1 - p1)
if (is.null(rateRange)) {
rateRange <- signif(deathRate * c(rev(1 / perturb), 1, perturb), 3)
} else {
rateRange <- signif(rateRange, 3)
}
## In mutation scale.
birthRange2 <- sapply(rateRange, function(rate) {
birth <- lineGammaRec2 / (1 - stats::pexp(ptsDense, rate))
birth[is.infinite(birth)] <- NaN
birth <- birth / sum(birth, na.rm=TRUE) / (timeScaleDense[2] - timeScaleDense[1])
birth
})
colnames(birthRange2) <- rateRange
obj1 <- rainbow::fts(timeScaleDense, birthRange2, xname='Mya', yname='Normalized birth rate')
ind <- max(which(birthRange2 > 1e-4, arr.ind=TRUE)[, 1])
if (ind == -Inf) {
ind <- length(timeScaleDense)
}
xMaxSen <- timeScaleDense[ind]
if (plotSensitivity) {
rainbow::plot.fds(obj1, plot.type='functions', main=sprintf('Sensitivity analysis'), xlim=c(0, xMaxSen), plotlegend=TRUE)
if (pause) {
readline('Press enter to continue')
}
}
estimates2 <- c(
alpha1 = unname(alpha1),
beta1 = unname(beta1 * 2 * r * res_[['meanLen']]),
size1 = unname(est2['size1']),
prob1 = unname(est2['prob1']),
alpha2 = unname(alpha2),
beta2 = unname(beta2 * 2 * r * res_[['meanLen']]),
size2 = unname(est2['size2']),
prob2 = unname(est2['prob2']),
p = unname(est2['p']),
loglik = loglik)
timeScaleDense <- ptsDense / res_[['meanLen']] / r / 2
birthRateRec2 <- lineGammaRec2 / stats::pexp(timeScaleDense, res_[['estimates']]['exp_lambda'], lower.tail=FALSE)
# Normalize to unit integral
birthRateRec2 <- birthRateRec2 / sum(birthRateRec2) *
length(birthRateRec2) / diff(range(timeScaleDense))
ageDist2Rec <- lineGammaRec2 / sum(lineGammaRec2) *
length(lineGammaRec2) / diff(range(timeScaleDense))
res <- c(
list(
estimates2 = estimates2,
ageDist2 = list(x=timeScaleDense, y=ageDist2Rec),
insRt2 = list(x=timeScaleDense, y=birthRateRec2),
agePeakLoc2 = timeScaleDense[findPeaks(lineGammaRec2)],
insPeakLoc2 = timeScaleDense[findPeaks(birthRateRec2)],
sensitivity2 = list(
time = timeScaleDense,
delRateRange = rateRange,
insRange = birthRange2
)
),
res_)
class(res) <- c('TEfit2', class(res_))
res
}
# # Get the recovered (gamma) age distribution, possibly with mixtures.
# getGammaCurve <- function(est, pts=seq(0, 10, length.out=1000L)) {
# if (all(c('alpha1', 'alpha2', 'beta1', 'beta2', 'p') %in%
# names(est))) {
# # Two mixture case
# alpha1 <- est['alpha1']
# beta1 <- est['beta1']
# alpha2 <- est['alpha2']
# beta2 <- est['beta2']
# p <- est['p']
# den <- stats::dgamma(pts, alpha1, beta1) * p +
# stats::dgamma(pts, alpha2, beta2) * (1 - p)
# peaks <- c((alpha1 - 1) / beta1, (alpha2 - 1) / beta2)
# } else if (all(c('gamma_alpha', 'gamma_beta') %in% names(est))) {
# # One mixture case; use Mya scale
# alpha <- est['gamma_alpha']
# beta <- est['gamma_beta']
# den <- stats::dgamma(pts, alpha, beta)
# peaks <- (alpha - 1) / beta
# } else
# stop('`est` does not have the required fields')
# list(x=pts, y=den, peaks=peaks)
# }
# #' Find the indices of peaks (local maximas) for y.
# #'
# #' Note that each function may have multiple peaks/local maximas, representing multiple bursts of insertions.
# #'
# #' @param y A vector containing function values at a set of increasing time points
# #' @export
findPeaks <- function(y) {
yl <- c(-Inf, y[-length(y)])
yr <- c(y[-1], -Inf)
which(y > yl + 1e-12 & y > yr + 1e-12)
}
#' Plot the age distributions or insertion rates for multiple families.
#'
#' @param resList A list of TEfit/TEfit2 objects, which can be mixed
#' @param type Whether to plot the insertion rates ('insRt') or the age distributions ('ageDist').
#' @param ... Passed into plotting functions.
#'
#' @return A list of line data (plotDat) and peak locations (peakDat).
#' @export
#' @examples
#' data(AetLTR)
#' copia3 <- subset(AetLTR, GroupID == 3 & !is.na(Chr))
#' gypsy24 <- subset(AetLTR, GroupID == 24 & !is.na(Chr))
#' res3 <- EstDynamics(copia3$Mismatch, copia3$UngapedLen)
#' res24 <- EstDynamics2(gypsy24$Mismatch, gypsy24$UngapedLen)
#'
#' # Plot insertion rates
#' PlotFamilies(list(`Copia 3`=res3, `Gypsy 24`=res24))
#'
#' # Plot age distributions
#' PlotFamilies(list(`Copia 3`=res3, `Gypsy 24`=res24), type='ageDist')
PlotFamilies <- function(resList, type=c('insRt', 'ageDist'), ...) {
if (is.null(names(resList))) {
names(resList) <- paste('Group', seq_along(resList))
}
type <- match.arg(type)
titleText <- ifelse(type == 'insRt', 'Insertion Rate', 'Age Distribution')
ylab <- ifelse(type == 'insRt', 'Normalized Intensity', 'Density')
pars <- list(lty = 1, pch=17, xlab = 'Mya', ylab = ylab, main = titleText)
ddd <- list(...)
pars[names(ddd)] <- ddd
name1 <- type
name2 <- paste0(type, '2')
plotDat <- lapply(seq_along(resList), function(i) {
res <- resList[[i]]
mixture <- 'TEfit2' %in% class(res)
if (mixture) {
dat <- as.data.frame(res[[name2]])
} else {
dat <- as.data.frame(res[[name1]])
}
# cbind(dat,
# GroupID=names(resList)[i],
# mix=mixture,
# stringsAsFactors=FALSE)
dat
})
pname1 <- sprintf('%sPeakLoc', substr(type, 1, 3))
pname2 <- paste0(pname1, '2')
peakDat <- unlist(sapply(seq_along(resList), function(i) {
res <- resList[[i]]
mixture <- 'TEfit2' %in% class(res)
if (mixture) {
# dat <- data.frame(x=res[[pname2]])
res[[pname2]]
} else {
# dat <- data.frame(x=res[[pname1]])
res[[pname1]]
}
# cbind(dat,
# GroupID=names(resList)[i],
# mix=mixture,
# stringsAsFactors=FALSE)
}))
n <- length(resList)
maxLen <- max(sapply(plotDat, nrow))
xMat <- yMat <- matrix(NA, maxLen, n)
for (i in seq_len(n)) {
xMat[seq_len(nrow(plotDat[[i]])), i] <- plotDat[[i]][['x']]
yMat[seq_len(nrow(plotDat[[i]])), i] <- plotDat[[i]][['y']]
}
do.call(matplot, c(list(xMat, yMat, type='l'), pars))
do.call(points, c(list(peakDat, rep(0, length(peakDat))), pars))
# firstLast <- peakDat[peakDat[['x']] %in% range(peakDat[['x']]), ]
# p <- ggplot2::ggplot(plotDat, aes(x=x, y=y, color=GroupID)) + settings + ggplot2::geom_point(aes(x=x, y=y), data=cbind(peakDat, y=0), size=1, shape=17, color='grey40')
invisible(list(plotDat = plotDat, peakDat = peakDat))
}
#' Implements the matrix model in Promislow et al (1999)
#'
#' @param mismatch A vector containing the number of mismatches.
#' @param len A vector containing the length of each element.
#' @param nsolo An integer giving the number of solo elements.
#' @param r Mutation rate (substitutions/(million year * site)) used in the calculation.
#' @param plotFit Whether to plot the distribution fits.
#' @param main The title for the plot.
#'
#' @details For the method implemented see References.
#'
#' @return This function returns various parameter estimates described in Promislow et al. (1999), containing the following fields. The unit for time is million years ago (Mya):
#' \item{B}{The constant insertion rate}
#' \item{q}{The constant excision rate}
#' \item{lam}{The population growth rate}
#' \item{R}{The ratio of the number of elements in class j over class j+1, which is constant by assumption}
#' \item{age1}{The age of the system under model 1 (lambda > 1)}
#' \item{age2}{The age of the system under model 2 (an initial burst followed by stasis lambda = 1)}
#'
#' @references
#' \cite{Promislow, D., Jordan, K. and McDonald, J. "Genomic demography: a life-history analysis of transposable element evolution." Proceedings of the Royal Society of London B: Biological Sciences 266, no. 1428 (1999): 1555-1560.}
#' @export
#'
#' @examples
#' # Analyze Gypsy family 24 (Nusif)
#' data(AetLTR)
#' dat <- subset(AetLTR, GroupID == 24 & !is.na(Chr))
#' res1 <- MatrixModel(dat$Mismatch, dat$UngapedLen, nsolo=450, plotFit=TRUE)
MatrixModel <- function(mismatch, len, nsolo, r=1.3e-2, plotFit=FALSE, main=sprintf('n = %d', n)) {
# dat <- filter(allDat, GroupID == 1)
# if (dat$GroupID[1] == 31) browser()
meanLen <- mean(len)
theta <- nsolo
n <- length(mismatch)
EN <- mean(mismatch)
mu <- r * 2 * mean(len)
B <- mu / EN
q <- theta / (theta + n) / EN * mu
lam <- 1 + B - q
gamma_a <- log(n) / log(lam)
# Model 2, burst + stasis. Estimate R using all the complete elements. Not reliable
R <- EN / (EN + 1)
gamma_b <- theta * R / (n * mu * (1 - R))
if (plotFit) {
maxPts <- floor(max(mismatch) * 1.5)
pts <- seq(0, maxPts)
ptsDense <- seq(0.1, maxPts, by=0.1)
timeScale <- pts / meanLen / r / 2
timeScaleDense <- ptsDense / meanLen / r / 2
MASS::truehist(mismatch / len / r / 2,
xlab='Age (Mya)',
main=main,
# ylim=c(0, yMax),
prob=TRUE,
col='white',
nbins=min(20, length(unique(as.integer(mismatch))) + 1))
lines(timeScaleDense, stats::dexp(timeScaleDense, B), lwd=2)
}
c(B=B, q=q, lam=lam, R=R, age1=gamma_a, age2=gamma_b)
}
#' Implements the master gene model in Marchani et al (2009)
#'
#' @param mismatch A vector containing the number of mismatches.
#' @param len A vector containing the length of each element.
#' @param r Mutation rate (substitutions/(million year * site)) used in the calculation.
#' @param plotFit Whether to plot the distribution fits.
#' @param main The title for the plot.
#'
#' @details For the method implemented see References.
#'
#' @return This function returns various parameter estimates described in Marchani et al (2009), containing the following fields. The unit for time is million years ago (mya):
#' \item{B}{The constant insertion rate}
#' \item{q}{The constant excision rate}
#' \item{lam}{The population growth rate}
#' \item{R}{The ratio of the number of elements in class j over class j+1, which is constant by assumption}
#' \item{age1}{The age of the system under model 1 (lambda > 1)}
#' \item{age2}{The age of the system under model 2 (an initial burst followed by stasis lambda = 1)}
#'
#' @references
#' \cite{Marchani, Elizabeth E., Jinchuan Xing, David J. Witherspoon, Lynn B. Jorde, and Alan R. Rogers. "Estimating the age of retrotransposon subfamilies using maximum likelihood." Genomics 94, no. 1 (2009): 78-82.}
#' @export
#'
#' @examples
#' # Analyze Gypsy family 24 (Nusif)
#' data(AetLTR)
#' dat <- subset(AetLTR, GroupID == 24 & !is.na(Chr))
#' res2 <- MasterGene(dat$Mismatch, dat$UngapedLen, plotFit=TRUE)
MasterGene <- function(mismatch, len, r=1.3e-2, plotFit=FALSE, main=sprintf('n = %d', n)) {
# ad hoc method: mean age * 2
meanLen <- mean(len)
n <- length(mismatch)
x <- mismatch
EN <- mean(x)
mu <- 2 * r * meanLen
adhoc <- EN / mu * 2
# MLE which follows formula (3)
ff <- function(x, T) {
exp(-meanLen * T) * (meanLen * T)^x
}
dLdT <- function(T) {
- n / T + sum(vapply(seq_len(n), function(i) {
ff(x[i], T) / stats::integrate(ff, 0, T, x=x[i])$value
}, 0))
}
root <- stats::uniroot(dLdT, c(1e-4, 4 * adhoc * r), tol=.Machine$double.eps^0.5)
famAge <- root$root / (2 * r)
if (plotFit) {
maxPts <- floor(max(mismatch) * 1.5)
pts <- seq(0, maxPts)
ptsDense <- seq(0.1, maxPts, by=0.1)
timeScale <- pts / meanLen / r / 2
timeScaleDense <- ptsDense / meanLen / r / 2
MASS::truehist(mismatch / len / r / 2,
xlab='Age (Mya)',
main=main,
# ylim=c(0, yMax),
prob=TRUE,
col='white',
nbins=min(20, length(unique(as.integer(mismatch))) + 1))
dens <- stats::dunif(timeScaleDense, 0, famAge)
is.na(dens) <- 0
lines(timeScaleDense, dens, lwd=2)
}
c(adhoc=adhoc, MLE=famAge)
}
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.