##########################################################
##### HACSim: Haplotype Accumulation Curve Simulator #####
##########################################################
#####
# HACSim is a nonparametric stochastic iterative local search optimization algorithm
# for the extrapolation of species' haplotype accumulation curves to assess likely
# required specimen sample sizes for genetic diversity assessment
#####
##########
# Author: Jarrett D. Phillips
# Last modified: December 22, 2020
## Best run in RStudio ##
## DO NOT change order of code (can throw errors)! ##
##########
### Input parameters ###
## Required ##
# N = Number of specimens (DNA sequences)
# Hstar = Number of observed unique haplotypes
# probs = Probability frequency distribution of haplotypes
## Optional ##
# p = Proportion (in (0, 1]) of unique haplotypes to recover - 0.95 (95%) by default
# perms = Number of permutations (replications) (in (1, inf)) - 10000 by default
# subsample = Should a subsample of DNA sequences or haplotype be taken? (TRUE or FALSE)
# prop = Proportion (in (0, 1]) of DNA sequences or haplotypes to sample (not used if subsample = FALSE)
# conf.level = Confidence level for accumulation curve and confidence intervals
# conf.type = Typpe of CI to compute and plot ("quantile" or "asymptotic")
# num.iters = Number of iterations to perform (1 or NULL (i.e., all))
# progress = Print results to console (TRUE or FALSE)
# filename = Name of temporary file in which to save results (NULL by default)
#####
HAC.sim <- function(N,
Hstar,
probs,
perms = 10000,
K = 1, # DO NOT CHANGE
p = 0.95,
subset.haps = NULL,
prop.haps = NULL,
input.seqs = FALSE,
subset.seqs = FALSE,
prop.seqs = NULL,
conf.level = 0.95,
ci.type = "quantile",
df = NULL, # dataframe
num.iters = NULL,
progress = TRUE) {
## Number of iterations to run ##
if ((is.null(num.iters)) || (num.iters == 1)) {
cat("\n \n")
## Display progress bar ##
if (progress == TRUE) {
pb <- utils::txtProgressBar(min = 0, max = 1, style = 3)
}
## Load DNA sequence data and set N, Hstar and probs ##
if (input.seqs == TRUE) {
seqs <- read.dna(file = file.choose(), format = "fasta")
bf <- base.freq(seqs, all = TRUE)[5:17] # frequencies of IUPAC codes, Ns, gaps and missing bases
if (any(bf > 0)) {
warning("Inputted DNA sequences contain missing and/or ambiguous
nucleotides, which may lead to overestimation of the number of
observed unique haplotypes. Consider excluding sequences or alignment
sites containing these data. If missing and/or ambiguous bases occur
at the ends of sequences, further alignment trimming is an option.")
}
assign("ptm", proc.time(), envir = envr) # set timer
# take random subset of sequences (e.g., prop.seqs = 0.10 (10%))
if (subset.seqs == TRUE) {
seqs <- seqs[sample(nrow(seqs), size = ceiling(prop.seqs * nrow(seqs)), replace = FALSE), ]
seqsfile <- tempfile(fileext = ".fas")
write.dna(seqs, file = seqsfile)
}
assign("N", dim(seqs)[[1]], envir = envr)
h <- sort(haplotype(seqs), decreasing = TRUE, what = "frequencies")
rownames(h) <- 1:nrow(h)
assign("Hstar", dim(h)[[1]], envir = envr)
assign("probs", lengths(attr(h, "index")) / N, envir = envr)
}
## Error messages ##
if (N < Hstar) {
stop("N must be greater than or equal to Hstar")
}
if (N == 1) {
stop("N must be greater than 1")
}
if (Hstar == 1) {
stop("H* must be greater than 1")
}
if (!isTRUE(all.equal(1, sum(probs), tolerance = .Machine$double.eps^0.25))) {
stop("probs must sum to 1")
}
if (length(probs) != Hstar) {
stop("probs must have Hstar elements")
}
if ((perms == 0) || (perms == 1)) {
stop("perms must be greater than 1")
}
if ((p <= 0) || (p > 1)) {
stop("p must be greater than 0 and less than or equal to 1")
}
## Set up container to hold the identity of each individual from each permutation ##
num.specs <- N
## Create an ID for each haplotype ##
if (is.null(subset.haps)) {
haps <- 1:Hstar
} else {
subset.haps <- subset.haps
}
## Assign individuals (N) ##
specs <- 1:num.specs
## Generate permutations. Assume each permutation has N individuals, and sample those
## individuals' haplotypes from the probabilities ##
gen.perms <- function() {
if (is.null(subset.haps)) {
sample(haps, size = num.specs, replace = TRUE, prob = probs)
} else {
resample <- function(x, ...) x[sample.int(length(x), ...)]
resample(subset.haps, size = num.specs, replace = TRUE, prob = probs[subset.haps])
}
}
pop <- array(dim = c(perms, num.specs, K)) # preallocate array
for (i in 1:K) {
pop[, , i] <- replicate(perms, gen.perms()) # fill array
}
## Perform haplotype accumulation ##
# Custom C++ function selects one permutation (row) in pop array,
# then selects individuals in a random order and samples haplotypes for that permutation
HAC.mat <- accumulate(pop, specs, perms, K)
HAC.mat <- drop(HAC.mat) # drop array dimension
if (progress == TRUE) {
utils::setTxtProgressBar(pb, i)
}
## Calculate the mean, sd and CI (based on sample quantiles) for number of haplotypes recovered over all permutations
# means <- apply(HAC.mat, MARGIN = 2, mean)
means <- colMeans2(HAC.mat)
# means <- colmeans(HAC.mat)
# sds <- apply(HAC.mat, MARGIN = 2, sd)
sds <- colSds(HAC.mat)
# sds <- colVars(HAC.mat, std = TRUE)
# lower <- apply(HAC.mat, MARGIN = 2, function(x) quantile(x, (1 - conf.level) / 2))
lower <- colQuantiles(HAC.mat, probs = (1 - conf.level) / 2)
# lower <- colQuantile(HAC.mat, probs = (1 - conf.level) / 2)
# upper <- apply(HAC.mat, MARGIN = 2, function(x) quantile(x, (1 + conf.level) / 2))
upper <- colQuantiles(HAC.mat, probs = (1 + conf.level) / 2)
# upper <- colQuantile(HAC.mat, probs = (1 + conf.level) / 2)
## Make data accessible to user ##
assign("d", data.frame(specs, means, sds), envir = envr)
# Compute CIs for fraction of haplotypes sampled
assign("R.low", (envr$d$means - qnorm((1 + envr$conf.level) / 2) * (envr$d$sds / sqrt(length(envr$d$specs)))) / envr$Hstar, envir = envr)
assign("R.high", (envr$d$means + qnorm((1 + envr$conf.level) / 2) * (envr$d$sds / sqrt(length(envr$d$specs)))) / envr$Hstar, envir = envr)
## Compute simple summary statistics and display output ##
# tail() is used here instead of max() because curves will not be monotonic if perms is not set high enough.
# When perms is large (say 10000), tail() is sufficiently close to max().
# Cache values for easy referencing
P <- tail(means, n = 1)
num1 <- N * Hstar
num.haps <- length(subset.haps)
num2 <- N * num.haps
# Measures of Sampling Closeness #
if (is.null(subset.haps)) {
Q <- Hstar - P
assign("R", P / Hstar, envir = envr)
S <- Q / Hstar
assign("Nstar", num1 / P, envir = envr)
assign("X", (num1 / P) - N, envir = envr)
} else {
Q <- num.haps - P
assign("R", P / num.haps, envir = envr)
S <- Q / num.haps
assign("Nstar", num2 / P, envir = envr)
assign("X", (num2 / P) - N, envir = envr)
}
if (envr$X < 0) {
envr$X <- 0 # to ensure non-negative result
}
## Output results to R console and CSV file ##
if (progress == TRUE) {
cat(
"\n \n --- Measures of Sampling Closeness --- \n \n",
"Mean number of haplotypes sampled: ", P,
"\n Mean number of haplotypes not sampled: ", Q,
"\n Proportion of haplotypes sampled: ", envr$R,
"\n Proportion of haplotypes not sampled: ", S,
"\n \n Mean value of N*: ", envr$Nstar,
"\n Mean number of specimens not sampled: ", envr$X
)
## Compute asymptotic confidence interval endpoints and margin of error (MOE) for N* (based on delta method)
MOE <- (qnorm((1 + envr$conf.level) / 2) * (tail(envr$d$sds, n = 1) / tail(envr$d$means, n = 1)) * sqrt(N))
if (envr$iters == 1) {
assign("Nstar.low", N, envir = envr)
assign("Nstar.high", N, envir = envr)
} else {
assign("Nstar.low", signif(N - MOE), envir = envr)
assign("Nstar.high", signif(N + MOE), envir = envr)
}
## Plot the mean haplotype accumulation curve (averaged over perms number of curves) and haplotype frequency barplot ##
par(mfrow = c(1, 2))
if (is.null(subset.haps)) {
plot(specs, means, type = "n", xlab = "Specimens sampled", ylab = "Unique haplotypes", ylim = c(1, Hstar), main = "Haplotype accumulation curve")
} else {
plot(specs, means, type = "n", xlab = "Specimens sampled", ylab = "Unique haplotypes", ylim = c(1, length(subset.haps)), main = "Haplotype accumulation curve")
}
if (ci.type == "quantile") {
polygon(x = c(specs, rev(specs)), y = c(lower, rev(upper)), col = "gray")
}
if (ci.type == "asymptotic") {
polygon(x = c(specs, rev(specs)), y = c(envr$R.low * envr$Hstar, rev(envr$R.high * envr$Hstar)), col = "gray")
}
lines(specs, means, lwd = 2)
if (is.null(subset.haps)) {
abline(h = envr$R * Hstar, v = max(envr$d$specs), lty = 2) # dashed line
abline(h = p * Hstar, lty = 3) # dotted line
barplot(num.specs * probs, xlab = "Unique haplotypes", ylab = "Specimens sampled", names.arg = haps, main = "Haplotype frequency distribution")
} else {
abline(h = envr$R * length(subset.haps), v = max(envr$d$specs), lty = 2) # dashed line
abline(h = p * length(subset.haps), lty = 3) # dotted line
barplot(num.specs * (probs[subset.haps] / sum(probs[subset.haps])), xlab = "Unique haplotypes", ylab = "Specimens sampled", names.arg = subset.haps, main = "Haplotype frequency distribution")
}
}
# Output summary statistics to dataframe (df)
df[nrow(df) + 1, ] <- c(P, Q, envr$R, S, envr$Nstar, envr$X)
df
}
} # end HAC.sim
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.