.taj_d <- function(afs) # tajima's d
{
n <- length(afs) + 1
Sn <- sum(afs)
theta_pi <- 2/(n*(n-1)) * sum( (1:(n-1)) * (n - 1:(n-1)) * afs)
theta_w <- sum(afs)/sum(1/(1:(n-1)))
a1 <- sum(1/(1:(n-1)))
a2 <- sum(1/(1:(n-1))^2)
b1 <- (n + 1)/(3*(n - 1))
b2 <- 2 * (n^2 + n + 3)/(9*n*(n-1))
c1 <- (b1 - 1/a1)
c2 <- (b2 + a2/a1^2 - (n+2)/(a1*n))
e1 <- c1/a1
e2 <- c2/(a1^2 + a2)
c(theta_pi - theta_w, e1*Sn + e2*Sn*(Sn - 1))# num, denom^2
}
.get_stats <- function(geno)
{
# stats needed for VCFtools method of calculating D
cnt <- apply(geno, 2, function(x) sum(x[x>0]))
nonmiss <- apply(geno, 2, function(x) 2*sum(x!=-1))
p <- ifelse(nonmiss > 0, cnt/nonmiss, 0)
He <- sum(p*(1-p))
N <- nrow(geno)*2
Sn <- sum(p > 0)
return (list(He=He, N=N, Sn=Sn))
}
.taj_d_vcftools <- function(stats) # tajima's D, ala vcftools
{
He <- stats$He
N <- stats$N
Sn <- stats$Sn
a1 <- sum(1/(1:(N-1)))
a2 <- sum(1/(1:(N-1))^2)
b1 <- (N + 1)/(3*(N - 1))
b2 <- 2 * (N^2 + N + 3)/(9*N*(N-1))
c1 <- (b1 - 1/a1)
c2 <- (b2 + a2/a1^2 - (N+2)/(a1*N))
e1 <- c1/a1
e2 <- c2/(a1^2 + a2)
theta_pi <- 2 * He * N/(N-1)
theta_w <- Sn/a1
(theta_pi - theta_w)/sqrt(e1*Sn + e2*Sn*(Sn - 1))
}
.sim_stats <- function(geno, maf=0, ploidy=2)
{
# simulate given null model, stuff needed for VCFtools method of calculating D
N <- nrow(geno)*ploidy
# get probability of allele count given maf filter, sample size
pr <- c(0, 1/(1:(N-1)), 0)
maf <- ceiling(N*maf)
if (maf > 0)
pr <- c(rep(0, maf-1), 1/(maf:(N-maf)), rep(0, maf-1))
else
pr <- c(1/(1:(N-1)))
# simulate data respecting missing data pattern
nonmiss <- apply(geno, 2, function(x) 2*sum(x!=-1))
cnt <- rep(0, ncol(geno))
for (i in unique(nonmiss))
cnt[nonmiss == i] <- sample(1:(i-1), sum(nonmiss == i), prob=pr[1:(i-1)], replace=TRUE)
p <- ifelse(nonmiss > 0, cnt/nonmiss, 0)
He <- sum(p*(1-p))
Sn <- sum(p > 0)
return (list(He=He, N=N, Sn=Sn))
}
.sim_afs <- function(N, Sn, maf=0, ploidy=2)
{
# simulate afs under neutral model with missing data and a MAF cutoff -- *at least* MAF chromosomes must have minor allele
pr <- c(0, 1/(1:(N-1)), 0)
maf <- ceiling(N*maf)
if (maf > 0)
pr <- c(rep(0, maf-1), 1/(maf:(N-maf)), rep(0, maf-1))
else
pr <- c(1/(1:(N-1)))
rmultinom(1, size=Sn, prob=pr)
}
.TajimaD <- function(geno, ploidy=2)
{
# my way of calculating D, just an aggregate estimator across missing data patterns
mis_pat <- apply(geno,2,function(x) ploidy*sum(x!=-1))
sums <- apply(geno,2,function(x) sum(x[x!=-1]))
afs <- list()
for (i in unique(mis_pat))
{
af <- table(sums[mis_pat==i])
afs[[as.character(i)]] <- rep(0, i-1)
afs[[as.character(i)]][as.numeric(names(af))] <- af[]
}
tmp <- sapply(afs, .taj_d)
tmp <- apply(tmp, 1, mean)
tmp[1]/sqrt(tmp[2])
}
#' Tajima's D
#'
#' @description
#' Calculate a genome-wide estimate of Tajima's D, and optionally use simulation from the neutral model to correct for bias due to a minor-allele-frequency filter and perform a significance test.
#'
#' @param object an object of class "vcfLink"
#' @param maf the minimum allele frequency allowed in the null model simulations, as a proportion; SNPs with frequency < MAF are excluded from the simulations. The data is also thinned using this criterion prior to calculating D.
#' @param nboot the number of bootstrap replicates used to generate a confidence interval around D; if 0, no bootstrap is performed
#' @param nsim the number of simulations from the null model, used to correct bias due to the MAF and to test the significance of D; if 0, no correction is performed
#' @param ploidy the ploidy of the organism
#' @param use_vcftools_D if TRUE, use the method for calculating D implemented in vcftools, which is biased when there is missing data. If FALSE (default), use an unbiased (and faster) method
#' @rdname TajimaD
#' @export
TajimaD <- function(object, maf=0, nboot=0, nsim=1000, ploidy=2, use_vcftools_D = FALSE)
{
if (maf>0)
object <- Filter(object, filterOptions(maf=maf), verbose=FALSE)
geno <- GenotypeMatrix(object, verbose=FALSE)
min_count <- min(apply(geno, 2, function(x) sum(x[x > -1])))
min_maf <- min(apply(geno, 2, function(x) ceiling(maf*ploidy*sum(x > -1))))
if (min_count < min_maf) stop("'maf' does not agree with data: minimum count in data was ", min_count, " while minimum count allowed by maf filter was", min_maf)
if (min_count > min_maf) warning("'maf' may not agree with data: minimum count in data was ", min_count, " while minimum count allowed by maf filter was", min_maf)
if (!use_vcftools_D)
{
mis_pat <- table(apply(geno,2,function(x) ploidy*sum(x!=-1)))
sims <- rep(NA, nsim)
if (nsim > 0)
for (i in 1:nsim)
{
if (i %% 100 == 0) cat("Simulation: iter", i, "\n")
sim_num <- rep(0, length(mis_pat)) # numerator of D
sim_den <- rep(0, length(mis_pat)) # squared denominator of D
for (j in 1:length(mis_pat))
{
tmp <- .taj_d(.sim_afs(as.numeric(names(mis_pat)[j]), mis_pat[j], maf=maf))
sim_num[j] <- tmp[1]
sim_den[j] <- tmp[2]
}
# for each missing data pattern, we've calculated
# \theta_\pi and \theta_w and the variance thereof.
# To get an aggregate estimator, we take the mean.
sims[i] <- mean(sim_num)/sqrt(mean(sim_den))
}
boot <- rep(NA, nboot)
if (nboot > 0)
for (i in 1:nboot)
{
if (i %% 100 == 0) cat("Bootstrap: iter", i, "\n")
boot[i] <- .TajimaD(geno[,sample(1:ncol(geno), ncol(geno), replace=T)])
}
obs <- .TajimaD(geno)
}
else
{
sims <- rep(NA, nsim)
if (nsim > 0)
for (i in 1:nsim)
{
if (i %% 100 == 0) cat("Simulation: iter", i, "\n")
sims[i] <- .taj_d_vcftools(.sim_stats(geno, maf))
}
boot <- rep(NA, nboot)
if (nboot > 0)
for (i in 1:nboot)
{
if (i %% 100 == 0) cat("Bootstrap: iter", i, "\n")
boot[i] <- .taj_d_vcftools(.get_stats(geno[,sample(1:ncol(geno), ncol(geno), replace=T)]))
}
obs <- .taj_d_vcftools(.get_stats(geno))
}
bias <- if(nsim > 0) mean(sims) else 0
sims_corr <- (sims - bias)
boot_corr <- (boot - bias)
obs_corr <- (obs - bias)
p <- sum(abs(sims_corr) >= abs(obs_corr - bias))/nsim
list (results = list(
"Tajima D, raw" = obs,
"Tajima D, bias-corrected" = obs_corr,
"Tajima D, bias-corrected 95CI" = quantile(boot_corr, c(0.025, 0.975)),
"Pr(abs(D) > abs(D_observed)|Null)" = ifelse(p, p, paste("<", 1/nsim)),
"Bias" = bias),
simulations = list(
"Null" = sims,
"Null, bias-corrected" = sims_corr,
"Bootstrap" = boot,
"Bootstrap, bias-corrected" = boot_corr))
}
#TajimaD_test_vcftools <- function(object, maf=0, nboot=1000, nsim=1000, null_quantiles=seq(0,1,0.1))
#{
# geno <- GenotypeMatrix(object)
#
# sims <- rep(NA, nsim)
# if (nsim > 0)
# for (i in 1:nsim)
# {
# if (i %% 100 == 0) cat("Simulation: iter", i, "\n")
# sims[i] <- taj_d_vcftools(sim_stats(geno, maf))
# }
#
# boot <- rep(NA, nboot)
# if (nboot > 0)
# for (i in 1:nboot)
# {
# if (i %% 100 == 0) cat("Bootstrap: iter", i, "\n")
# boot[i] <- taj_d_vcftools(get_stats(geno[,sample(1:ncol(geno), ncol(geno), replace=T)]))
# }
#
# obs <- taj_d_vcftools(get_stats(geno))
# bias <- mean(sims)
# sims <- (sims - bias)
# boot <- (boot - bias)
# obs_corr <- (obs - bias)
#
# p <- sum(abs(sims) >= abs(obs_corr))/nsim
#
# list ("Tajima D, raw" = obs,
# "Bias" = bias,
# "Tajima D, bias-corrected" = obs_corr,
# "Tajima D, bias-corrected 95CI" = quantile(boot, c(0.025, 0.975)),
# "Null model quantiles" = quantile(sims, null_quantiles),
# "Pr(abs(D) > abs(D_observed)|Null)" = ifelse(p, p, paste("<", 1/nsim)))
#}
#
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.