R/HAC.sim.R
In HACSim: Iterative Extrapolation of Species' Haplotype Accumulation Curves for Genetic Diversity Assessment

Documented in HAC.sim

##########################################################
##### HACSim: Haplotype Accumulation Curve Simulator #####
##########################################################

##### 

# HACSim is a nonparametric stochastic iterative local search optimization algorithm
# for the extrapolation of species' haplotype accumulation curves to assess likely 
# required specimen sample sizes for genetic diversity assessment                    

#####

##########

# Author: Jarrett D. Phillips
# Last modified: December 22, 2020

## Best run in RStudio ##
## DO NOT change order of code (can throw errors)! ##

##########

### Input parameters ###

## Required ##

# N = Number of specimens (DNA sequences)
# Hstar = Number of observed unique haplotypes
# probs = Probability frequency distribution of haplotypes

## Optional ##

# p = Proportion (in (0, 1]) of unique haplotypes to recover - 0.95 (95%) by default
# perms = Number of permutations (replications) (in (1, inf)) - 10000 by default
# subsample = Should a subsample of DNA sequences or haplotype be taken? (TRUE or FALSE)
# prop = Proportion (in (0, 1]) of DNA sequences or haplotypes to sample (not used if subsample = FALSE)
# conf.level = Confidence level for accumulation curve and confidence intervals
# conf.type = Typpe of CI to compute and plot ("quantile" or "asymptotic")
# num.iters = Number of iterations to perform (1 or NULL (i.e., all))
# progress = Print results to console (TRUE or FALSE)
# filename = Name of temporary file in which to save results (NULL by default)

#####

HAC.sim <- function(N,
                    Hstar,
                    probs,
                    perms = 10000,
                    K = 1, # DO NOT CHANGE
                    p = 0.95,
                    subset.haps = NULL,
                    prop.haps = NULL,
                    input.seqs = FALSE,
                    subset.seqs = FALSE,
                    prop.seqs = NULL,
                    conf.level = 0.95,
                    ci.type = "quantile",
                    df = NULL, # dataframe
                    num.iters = NULL,
                    progress = TRUE) {
  
  ## Number of iterations to run ##
  
  if ((is.null(num.iters)) || (num.iters == 1)) {
    cat("\n \n")

    ## Display progress bar ##

    if (progress == TRUE) {
      pb <- utils::txtProgressBar(min = 0, max = 1, style = 3)
    }

    ## Load DNA sequence data and set N, Hstar and probs ##
    
    if (input.seqs == TRUE) {
      
      seqs <- read.dna(file = file.choose(), format = "fasta")
    
        
      bf <- base.freq(seqs, all = TRUE)[5:17] # frequencies of IUPAC codes, Ns, gaps and missing bases

      if (any(bf > 0)) {
        warning("Inputted DNA sequences contain missing and/or ambiguous 
	    nucleotides, which may lead to overestimation of the number of 
	    observed unique haplotypes. Consider excluding sequences or alignment 
	    sites containing these data. If missing and/or ambiguous bases occur 
	    at the ends of sequences, further alignment trimming is an option.")
      }

      assign("ptm", proc.time(), envir = envr) # set timer
      
      # take random subset of sequences (e.g., prop.seqs = 0.10 (10%))

      if (subset.seqs == TRUE) { 
        seqs <- seqs[sample(nrow(seqs), size = ceiling(prop.seqs * nrow(seqs)), replace = FALSE), ]
        seqsfile <- tempfile(fileext = ".fas")
        write.dna(seqs, file = seqsfile)
      }

      assign("N", dim(seqs)[[1]], envir = envr)
      h <- sort(haplotype(seqs), decreasing = TRUE, what = "frequencies")
      rownames(h) <- 1:nrow(h)
      assign("Hstar", dim(h)[[1]], envir = envr)
      assign("probs", lengths(attr(h, "index")) / N, envir = envr)
    } 

    ## Error messages ##

    if (N < Hstar) {
      stop("N must be greater than or equal to Hstar")
    }

    if (N == 1) {
      stop("N must be greater than 1")
    }

    if (Hstar == 1) {
      stop("H* must be greater than 1")
    }

    if (!isTRUE(all.equal(1, sum(probs), tolerance = .Machine$double.eps^0.25))) {
      stop("probs must sum to 1")
    }
    
    if (length(probs) != Hstar) {
      stop("probs must have Hstar elements")
    }

    if ((perms == 0) || (perms == 1)) {
      stop("perms must be greater than 1")
    }

    if ((p <= 0) || (p > 1)) {
      stop("p must be greater than 0 and less than or equal to 1")
    }

    ## Set up container to hold the identity of each individual from each permutation ##

    num.specs <- N

    ## Create an ID for each haplotype ##

    if (is.null(subset.haps)) {
      haps <- 1:Hstar
    } else {
      subset.haps <- subset.haps
    }

    ## Assign individuals (N) ##

    specs <- 1:num.specs

    ## Generate permutations. Assume each permutation has N individuals, and sample those
    ## individuals' haplotypes from the probabilities ##

    gen.perms <- function() {
      if (is.null(subset.haps)) {
        sample(haps, size = num.specs, replace = TRUE, prob = probs)
      } else {
        resample <- function(x, ...) x[sample.int(length(x), ...)]
        resample(subset.haps, size = num.specs, replace = TRUE, prob = probs[subset.haps])
      }
    }

    pop <- array(dim = c(perms, num.specs, K)) # preallocate array
    
    for (i in 1:K) {
      pop[, , i] <- replicate(perms, gen.perms()) # fill array
    }
    
    ## Perform haplotype accumulation ##
    
    # Custom C++ function selects one permutation (row) in pop array, 
    # then selects individuals in a random order and samples haplotypes for that permutation 

    HAC.mat <- accumulate(pop, specs, perms, K) 
    HAC.mat <- drop(HAC.mat) # drop array dimension
    
    if (progress == TRUE) {
      utils::setTxtProgressBar(pb, i)
    }

    ## Calculate the mean, sd and CI (based on sample quantiles) for number of haplotypes recovered over all permutations

    # means <- apply(HAC.mat, MARGIN = 2, mean)
    means <- colMeans2(HAC.mat)
    # means <- colmeans(HAC.mat)
    # sds <- apply(HAC.mat, MARGIN = 2, sd)
    sds <- colSds(HAC.mat)
    # sds <- colVars(HAC.mat, std = TRUE)
    
    # lower <- apply(HAC.mat, MARGIN = 2, function(x) quantile(x, (1 - conf.level) / 2))
    lower <- colQuantiles(HAC.mat, probs = (1 - conf.level) / 2)
    # lower <- colQuantile(HAC.mat, probs = (1 - conf.level) / 2)
    # upper <- apply(HAC.mat, MARGIN = 2, function(x) quantile(x, (1 + conf.level) / 2))
    upper <- colQuantiles(HAC.mat, probs = (1 + conf.level) / 2)
    # upper <- colQuantile(HAC.mat, probs = (1 + conf.level) / 2)
    
    ## Make data accessible to user ##
    
    assign("d", data.frame(specs, means, sds), envir = envr)
    
    # Compute CIs for fraction of haplotypes sampled
    
    assign("R.low", (envr$d$means - qnorm((1 + envr$conf.level) / 2) * (envr$d$sds / sqrt(length(envr$d$specs)))) / envr$Hstar, envir = envr)
    assign("R.high", (envr$d$means + qnorm((1 + envr$conf.level) / 2) * (envr$d$sds / sqrt(length(envr$d$specs)))) / envr$Hstar, envir = envr)
    
    ## Compute simple summary statistics and display output ##
    
    # tail() is used here instead of max() because curves will not be monotonic if perms is not set high enough. 
    # When perms is large (say 10000), tail() is sufficiently close to max().
    
    # Cache values for easy referencing 
    
    P <- tail(means, n = 1)
    num1 <- N * Hstar
    num.haps <- length(subset.haps)
    num2 <- N * num.haps
    
    # Measures of Sampling Closeness #

    if (is.null(subset.haps)) {
      Q <- Hstar - P
      assign("R", P / Hstar, envir = envr)
      S <- Q / Hstar
      assign("Nstar", num1 / P, envir = envr)
      assign("X", (num1 / P) - N, envir = envr)
    } else {
      Q <- num.haps - P
      assign("R", P / num.haps, envir = envr)
      S <- Q / num.haps
      assign("Nstar", num2 / P, envir = envr)
      assign("X", (num2 / P) - N, envir = envr)
    }

    if (envr$X < 0) {
      envr$X <- 0 # to ensure non-negative result
    }

    ## Output results to R console and CSV file ##
    
    if (progress == TRUE) {
      cat(
        "\n \n --- Measures of Sampling Closeness --- \n \n",
        "Mean number of haplotypes sampled: ", P,
        "\n Mean number of haplotypes not sampled: ", Q,
        "\n Proportion of haplotypes sampled: ", envr$R,
        "\n Proportion of haplotypes not sampled: ", S,
        "\n \n Mean value of N*: ", envr$Nstar,
        "\n Mean number of specimens not sampled: ", envr$X
      )
      
      ## Compute asymptotic confidence interval endpoints and margin of error (MOE) for N* (based on delta method)
      
      MOE <- (qnorm((1 + envr$conf.level) / 2) * (tail(envr$d$sds, n = 1) / tail(envr$d$means, n = 1)) * sqrt(N))
      
      if (envr$iters == 1) {
        assign("Nstar.low", N, envir = envr)
        assign("Nstar.high", N, envir = envr)
      } else {
        assign("Nstar.low", signif(N - MOE), envir = envr)
        assign("Nstar.high", signif(N + MOE), envir = envr)
      }

      ## Plot the mean haplotype accumulation curve (averaged over perms number of curves) and haplotype frequency barplot ##
      
      par(mfrow = c(1, 2))
      
      if (is.null(subset.haps)) {
        plot(specs, means, type = "n", xlab = "Specimens sampled", ylab = "Unique haplotypes", ylim = c(1, Hstar), main = "Haplotype accumulation curve")
      } else {
        plot(specs, means, type = "n", xlab = "Specimens sampled", ylab = "Unique haplotypes", ylim = c(1, length(subset.haps)), main = "Haplotype accumulation curve")
      }
      
      if (ci.type == "quantile") {
        polygon(x = c(specs, rev(specs)), y = c(lower, rev(upper)), col = "gray")
      } 
      
      if (ci.type == "asymptotic") {
        polygon(x = c(specs, rev(specs)), y = c(envr$R.low * envr$Hstar, rev(envr$R.high * envr$Hstar)), col = "gray")
      }
      
      lines(specs, means, lwd = 2)
      
      if (is.null(subset.haps)) {
        abline(h = envr$R * Hstar, v = max(envr$d$specs), lty = 2) # dashed line
        abline(h = p * Hstar, lty = 3) # dotted line
        barplot(num.specs * probs, xlab = "Unique haplotypes", ylab = "Specimens sampled", names.arg = haps, main = "Haplotype frequency distribution")
      } else {
        abline(h = envr$R * length(subset.haps), v = max(envr$d$specs), lty = 2) # dashed line
        abline(h = p * length(subset.haps), lty = 3) # dotted line
        barplot(num.specs * (probs[subset.haps] / sum(probs[subset.haps])), xlab = "Unique haplotypes", ylab = "Specimens sampled", names.arg = subset.haps, main = "Haplotype frequency distribution")
      }
    }

    # Output summary statistics to dataframe (df)
    
    df[nrow(df) + 1, ] <- c(P, Q, envr$R, S, envr$Nstar, envr$X)
    df
  }

} # end HAC.sim

Any scripts or data that you put into this service are public.

HACSim documentation built on Nov. 5, 2025, 5:45 p.m.

rdrr.io home R language documentation Run R code online

CRAN packages Bioconductor packages R-Forge packages GitHub packages

Note that we can't provide technical support on individual packages. You should contact the package authors for that.

HACSim
Iterative Extrapolation of Species' Haplotype Accumulation Curves for Genetic Diversity Assessment

R/HAC.sim.R
In HACSim: Iterative Extrapolation of Species' Haplotype Accumulation Curves for Genetic Diversity Assessment

Defines functions HAC.sim

Documented in HAC.sim

Try the HACSim package in your browser

R Package Documentation

Browse R Packages

We want your feedback!

HACSim Iterative Extrapolation of Species' Haplotype Accumulation Curves for Genetic Diversity Assessment

R/HAC.sim.R In HACSim: Iterative Extrapolation of Species' Haplotype Accumulation Curves for Genetic Diversity Assessment

Defines functions HAC.sim

Documented in HAC.sim

Try the HACSim package in your browser

R Package Documentation

Browse R Packages

We want your feedback!

HACSim
Iterative Extrapolation of Species' Haplotype Accumulation Curves for Genetic Diversity Assessment

R/HAC.sim.R
In HACSim: Iterative Extrapolation of Species' Haplotype Accumulation Curves for Genetic Diversity Assessment