#' @name run_populations
#' @title Run STACKS Version >= 2.0 populations module
#' @description Run \href{http://catchenlab.life.illinois.edu/stacks/}{STACKS}
#' \href{http://catchenlab.life.illinois.edu/stacks/comp/populations.php}{populations}
#' module inside R!
#' @param P (path, character) Path to the directory containing all the STACKS files.
#' Default: \code{P = "06_ustacks_2_gstacks"}.
#' @param V (character) Path to an input VCF file. When this module is used to
#' filter an existing vcf file.
#' Default: \code{V = NULL}.
#' @param O (character) Path to a directory where to write the output files.
#' With default: \code{O = "07_populations"}, the function creates a folder inside
#' \code{07_populations}, with date and time appended to \code{populations_}.
#' @param M path to a population map file. The format is a tab-separated file,
#' with first column containing sample name and second column population id.
#' No heather (column name).
#' e.g. \code{M = "02_project_info/population.map.catalog.tsv"}
#' @param parallel.core (integer) enable parallel execution with num_threads threads.
#' Default: \code{parallel.core = parallel::detectCores() - 1}
#' @param batch.size (integer) The number of loci (de novo mode) or
#' chromosome (reference mode), to process in a batch.
#' Increase to speed analysis, uses more memory, decrease to save memory).
#' Default in de novo mode (loci/batch): \code{batch.size = 10000}.
#' Default in reference mode (chromosome/batch): \code{batch.size = 1}.
#' @param p (integer) Minimum number of populations a locus must be present in to process a locus.
#' Default: \code{p = 1}.
#' @param r (double) Minimum percentage of individuals in a population required to process a locus for that population.
#' Default: \code{r = 0.3}.
#' @param R (double) Minimum percentage of individuals across populations required to process a locus.
#' Default: \code{r = 0.3}.
#' @param H (logical) Apply the above filters haplotype wise
#' (unshared SNPs will be pruned to reduce haplotype-wise missing data).
#' Default: \code{H = TRUE}.
#' @param min.maf (double) Specify a minimum minor allele frequency required to process a nucleotide site at a locus (0 < min.maf < 0.5).
#' Default: \code{min.maf = NULL}. Recommendation: use my other R package RADIATOR to filter data based on MAF.
#' @param min.mac (integer) Specify a minimum minor allele count required to process a nucleotide site at a locus.
#' Default: \code{min.mac = NULL}.
#' @param max.obs.het (double) Specify a maximum observed heterozygosity required to process a nucleotide site at a locus.
#' Default: \code{max.obs.het = NULL}. Recommendation: use my other R package RADIATOR to filter data based on heterozygosity.
#' @param write.single.snp (logical) Restrict data analysis to only the first SNP per locus (implies --no-haps).
#' Default: \code{write.single.snp = FALSE}.
#' @param write.random.snp (logical) Restrict data analysis to one random SNP per locus (implies --no-haps).
#' Default: \code{write.random.snp = FALSE}.
#' @param B (character) Path to a file containing Blacklisted markers to be excluded from the export.
#' Default: \code{B = NULL}.
#' @param W (character) Path to a file containing Whitelisted markers to include in the export.
#' Default: \code{W = NULL}.
#' @param e (character) Restriction enzyme name.
#' Default: \code{e = NULL}.
#' @param merge.sites (logical) Merge loci that were produced from the same restriction enzyme cutsite (requires reference-aligned data).
#' Default: \code{merge.sites = FALSE}.
#' @param merge.prune.lim (integer) When merging adjacent loci, if at least X% samples posses both loci prune the remaining samples out of the analysis.
#' Default: \code{merge.prune.lim = NULL}.
#' @param hwe (logical) Calculate divergence from Hardy-Weinberg equilibrium
#' using the exact test at the SNP level and Guo and Thompson MCMC algorithm at
#' the haplotype level.
#' Default: \code{hwe = FALSE}.
#' @param fstats (logical) Enable SNP and haplotype-based F statistics.
#' Default: \code{fstats = FALSE}.
#' @param fst.correction (character) Specify a correction to be applied to Fst values: 'p_value', 'bonferroni_win', or 'bonferroni_gen'.
#' Default: \code{fst.correction = NULL}.
#' @param p.value.cutoff (double) maximum p-value to keep an Fst measurement.
#' Also used as base for Bonferroni correction.
#' Default: \code{p.value.cutoff = 0.05}.
#' @param k (logical) Enable kernel-smoothed Pi, Fis, Fst, Fst', and Phi_st calculations.
#' Default: \code{k = FALSE}.
#' @param smooth.fstats (logical) Enable kernel-smoothed Fst, Fst', and Phi_st calculations.
#' Default: \code{smooth.fstats = FALSE}.
#' @param smooth.popstats (logical) Enable kernel-smoothed Pi and Fis calculations.
#' Default: \code{smooth.popstats = FALSE}.
#' @param sigma (integer) Standard deviation of the kernel smoothing weight distribution.
#' Default: \code{sigma = 150000} (150kb).
#' @param bootstrap (logical) Turn on boostrap resampling for all smoothed statistics.
#' Default: \code{bootstrap = FALSE}.
#' @param N (integer) Number of bootstrap resamplings to calculate.
#' Default: \code{N = 100}.
#' @param bootstrap.pifis (logical) Turn on boostrap resampling for smoothed SNP-based Pi and Fis calculations.
#' Default: \code{bootstrap.pifis = FALSE}.
#' @param bootstrap.fst (logical) Turn on boostrap resampling for smoothed Fst calculations based on pairwise population comparison of SNPs.
#' Default: \code{bootstrap.fst = FALSE}.
#' @param bootstrap.div (logical) Turn on boostrap resampling for smoothed haplotype diveristy and gene diversity calculations based on haplotypes.
#' Default: \code{bootstrap.div = FALSE}.
#' @param bootstrap.phist (logical) Turn on boostrap resampling for smoothed Phi_st calculations based on haplotypes.
#' Default: \code{bootstrap.phist = FALSE}.
#' @param bootstrap.wl (character) Path to a whitelist file. Only use bootstrap loci contained in this whitelist.
#' Default: \code{bootstrap.wl = NULL}.
#' @param ordered.export (logical) If data is reference aligned, exports will be ordered; only a single representative of each overlapping site.
#' Default: \code{ordered.export = FALSE}.
#' @param fasta.loci (logical) Output consensus sequences of all loci, in FASTA format.
#' stackr default is different than stacks default because I think having this info helps to identify problems.
#' Default: \code{fasta.loci = TRUE}.
#' @param fasta.samples (logical) Output the sequences of the two haplotypes of each (diploid) sample, for each locus, in FASTA format.
#' stackr default is different than stacks default because I think having this info helps to identify problems.
#' Default: \code{fasta.samples = TRUE}.
#' @param fasta.samples_raw (logical) Output all haplotypes observed in each sample, for each locus, in FASTA format.
#' Default: \code{fasta.samples_raw = FALSE}.
#' @param vcf (logical) Output SNPs in Variant Call Format (VCF).
#' Default: \code{vcf = TRUE}.
#' @param genepop (logical) Output results in GenePop format.
#' Default: \code{genepop = FALSE}.
#' @param structure (logical) Output results in Structure format.
#' Default: \code{structure = FALSE}.
#' @param radpainter (logical) Output results results in fineRADstructure/RADpainter format.
#' Default: \code{radpainter = FALSE}.
#' @param phase (logical) Output genotypes in PHASE format.
#' Default: \code{phase = FALSE}.
#' @param fastphase (logical) Output genotypes in fastPHASE format.
#' Default: \code{fastphase = FALSE}.
# @param beagle (logical) Output genotypes in Beagle format.
# Default: \code{beagle = FALSE}.
# @param beagle.phased (logical) Output haplotypes in Beagle format.
# Default: \code{beagle.phased = FALSE}.
#' @param plink (logical) Output genotypes in PLINK format.
#' Default: \code{plink = FALSE}.
#' @param hzar (logical) Output genotypes in Hybrid Zone Analysis using R (HZAR) format.
#' Default: \code{hzar = FALSE}.
#' @param phylip (logical) Output nucleotides that are fixed-within, and variant among populations in Phylip format for phylogenetic tree construction.
#' Default: \code{phylip = FALSE}.
#' @param phylip.var (logical) Include variable sites in the phylip output encoded using IUPAC notation.
#' Default: \code{phylip.var = FALSE}.
#' @param phylip.var.all (logical) Include all sequence as well as variable sites in the phylip output encoded using IUPAC notation.
#' Default: \code{phylip.var.all = FALSE}.
#' @param treemix (logical) Output SNPs in a format useable for the TreeMix program (Pickrell and Pritchard).
#' Default: \code{treemix = FALSE}.
#' @param no.hap.exports (logical) Omit haplotype outputs.
#' Default: \code{no.hap.exports = FALSE}.
#' @param h Display this help messsage.
#' Default: \code{h = FALSE}
#' @param verbose turn on additional logging.
#' Default: \code{verbose = FALSE}
#' @param v print program version.
#' Default: \code{v = FALSE}
#' @param log.fst.comp (logical) Log components of Fst/Phi_st calculations to a file.
#' Default: \code{log.fst.comp = FALSE}
#' @rdname run_populations
#' @export
#' @return \href{http://catchenlab.life.illinois.edu/stacks/comp/populations.php}{populations}
#' returns different output depending on arguments selected.
#' @examples
#' \dontrun{
#' pop <- stackr::run_populations(M = "population.map.tsv")
#' }
#' @seealso
#' \href{http://catchenlab.life.illinois.edu/stacks/comp/populations.php}{populations}
#' \href{http://catchenlab.life.illinois.edu/stacks/}{STACKS Version 2.0b}
#' @references Catchen JM, Amores A, Hohenlohe PA et al. (2011)
#' Stacks: Building and Genotyping Loci De Novo From Short-Read Sequences.
#' G3, 1, 171-182.
#' @references Catchen JM, Hohenlohe PA, Bassham S, Amores A, Cresko WA (2013)
#' Stacks: an analysis tool set for population genomics.
#' Molecular Ecology, 22, 3124-3140.
#' @references Guo SW, Thompson EA (1992)
#' Performing the exact test of Hardy-Weinberg proportion for multiple alleles.
#' Biometrics, 48, 361-372.
# populations ------------------------------------------------------------------
run_populations <- function(
P = "06_ustacks_2_gstacks",
V = NULL,
O = "07_populations",
M,
parallel.core = parallel::detectCores() - 1,
batch.size = 10000,
p = 1,
r = 0.3,
R = 0.3,
H = TRUE,
min.maf = NULL,
min.mac = NULL,
max.obs.het = NULL,
write.single.snp = FALSE,
write.random.snp = FALSE,
B = NULL,
W = NULL,
e = NULL,
merge.sites = FALSE,
merge.prune.lim = NULL,
hwe = FALSE,
fstats = FALSE,
fst.correction = NULL,
p.value.cutoff = 0.05,
k = FALSE,
smooth.fstats = FALSE,
smooth.popstats = FALSE,
sigma = 150000,
bootstrap = FALSE,
N = 100,
bootstrap.pifis = FALSE,
bootstrap.fst = FALSE,
bootstrap.div = FALSE,
bootstrap.phist = FALSE,
bootstrap.wl = NULL,
ordered.export = FALSE,
fasta.samples = TRUE,
fasta.samples_raw = FALSE,
fasta.loci = TRUE,
vcf = TRUE,
genepop = FALSE,
structure = FALSE,
radpainter = FALSE,
phase = FALSE,
fastphase = FALSE,
#beagle = FALSE,
#beagle.phased = FALSE,
plink = FALSE,
hzar = FALSE,
phylip = FALSE,
phylip.var = FALSE,
phylip.var.all = FALSE,
treemix = FALSE,
no.hap.exports = FALSE,
h = FALSE,
verbose = FALSE,
v = FALSE,
log.fst.comp = FALSE
) {
cat("#######################################################################\n")
cat("####################### stackr::run_populations #######################\n")
cat("#######################################################################\n")
timing <- proc.time()
# Check directory ------------------------------------------------------------
if (!dir.exists("09_log_files")) dir.create("09_log_files")
if (!dir.exists(O)) dir.create(O)
if (missing(M)) stop("Population map required")
# file data and time ---------------------------------------------------------
file.date.time <- format(Sys.time(), "%Y%m%d@%H%M")
# logs file ------------------------------------------------------------------
populations.log.file <- stringi::stri_join("09_log_files/populations_", file.date.time,".log")
message("For progress, look in the log file:\n", populations.log.file)
# common options -------------------------------------------------------------
input.folder <- P
P <- stringi::stri_join("-P ", shQuote(P))
# output.folder <- O
output.folder <- stringi::stri_join("populations_", file.date.time)
output.folder <- file.path(O, output.folder)
dir.create(output.folder)
message("\nOutput files written in:\n", output.folder)
O <- stringi::stri_join("-O ", shQuote(output.folder))
if (is.null(V)) {
V <- ""
} else {
V <- stringi::stri_join("-V ", V)
}
# pop map
M <- stringi::stri_join("-M ", M)
# parallel.core <- t
t <- stringi::stri_join("-t ", parallel.core)
batch.size <- stringi::stri_join("--batch-size ", batch.size)
# Data Filtering -------------------------------------------------------------
p <- stringi::stri_join("-p ", p)
r <- stringi::stri_join("-r ", r)
R <- stringi::stri_join("-R ", R)
if (H) {
H <- "-H "
} else {
H <- ""
}
if (is.null(min.maf)) {
min.maf <- ""
} else {
min.maf <- stringi::stri_join("--min.maf ", min.maf)
}
if (is.null(min.mac)) {
min.mac <- ""
} else {
min.mac <- stringi::stri_join("--min.mac ", min.mac)
}
if (is.null(max.obs.het)) {
max.obs.het <- ""
} else {
max.obs.het <- stringi::stri_join("--max.obs.het ", max.obs.het)
}
if (write.single.snp) {
write.single.snp <- "--write-single-snp "
} else {
write.single.snp <- ""
}
if (write.random.snp) {
write.random.snp <- "--write-random-snp "
} else {
write.random.snp <- ""
}
if (is.null(B)) {
B <- ""
} else {
B <- stringi::stri_join("-B ", B)
}
if (is.null(W)) {
W <- ""
} else {
W <- stringi::stri_join("-W ", W)
}
# Merging and Phasing --------------------------------------------------------
if (is.null(e)) {
e <- ""
} else {
e <- stringi::stri_join("-e ", e)
}
if (merge.sites) {
merge.sites <- "--merge-sites "
} else {
merge.sites <- ""
}
if (is.null(merge.prune.lim)) {
merge.prune.lim <- ""
} else {
merge.prune.lim <- stringi::stri_join("--merge-prune-lim ", merge.prune.lim)
}
# Locus stats ----------------------------------------------------------------
if (hwe) {
hwe <- "--hwe "
} else {
hwe <- ""
}
# Fstats ---------------------------------------------------------------------
if (fstats) {
fstats <- "--fstats "
if (is.null(fst.correction)) {
fst.correction <- ""
} else {
fst.correction <- stringi::stri_join("--fst-correction ", fst.correction)
}
p.value.cutoff <- stringi::stri_join("-p-value-cutoff ", p.value.cutoff)
} else {
fstats <- ""
fst.correction <- ""
p.value.cutoff <- ""
}
# Kernel-smoothing algorithm -------------------------------------------------
if (k) {
kernel.smoothed <- TRUE
k <- "-k "
} else {
kernel.smoothed <- FALSE
k <- ""
}
if (smooth.fstats) {
smooth.fstats <- TRUE
smooth.fstats <- "--smooth-fstats "
} else {
smooth.fstats <- FALSE
smooth.fstats <- ""
}
if (smooth.popstats) {
smooth.popstats <- TRUE
smooth.popstats <- "--smooth-popstats "
} else {
smooth.popstats <- FALSE
smooth.popstats <- ""
}
if (kernel.smoothed) {
sigma <- stringi::stri_join("--sigma ", sigma)
if (bootstrap) {
bootstrap <- "--bootstrap "
} else {
bootstrap <- ""
}
N <- stringi::stri_join("-N ", N)
if (bootstrap.pifis) {
bootstrap.pifis <- "--bootstrap-pifis "
} else {
bootstrap.pifis <- ""
}
if (bootstrap.fst) {
bootstrap.fst <- "--bootstrap-fst "
} else {
bootstrap.fst <- ""
}
if (bootstrap.div) {
bootstrap.div <- "--bootstrap-div "
} else {
bootstrap.div <- ""
}
if (bootstrap.phist) {
bootstrap.phist <- "--bootstrap-phist "
} else {
bootstrap.phist <- ""
}
if (is.null(bootstrap.wl)) {
bootstrap.wl <- ""
} else {
bootstrap.wl <- stringi::stri_join("--bootstrap-wl ", bootstrap.wl)
}
} else {
sigma <- ""
bootstrap <- ""
N <- ""
bootstrap.pifis <- ""
bootstrap.fst <- ""
bootstrap.div <- ""
bootstrap.phist <- ""
bootstrap.wl <- ""
}
# File output options --------------------------------------------------------
if (ordered.export) {
ordered.export <- "--ordered-export "
} else {
ordered.export <- ""
}
if (fasta.samples) {
fasta.samples <- "--fasta-samples "
} else {
fasta.samples <- ""
}
if (fasta.samples_raw) {
fasta.samples_raw <- "--fasta-samples-raw "
} else {
fasta.samples_raw <- ""
}
if (fasta.loci) {
fasta.loci <- "--fasta-loci "
} else {
fasta.loci <- ""
}
if (vcf) {
vcf <- "--vcf "
} else {
vcf <- ""
}
if (genepop) {
genepop <- "--genepop "
} else {
genepop <- ""
}
if (structure) {
structure <- "--structure "
} else {
structure <- ""
}
if (radpainter) {
radpainter <- "--radpainter "
} else {
radpainter <- ""
}
if (phase) {
phase <- "--phase "
} else {
phase <- ""
}
if (fastphase) {
fastphase <- "--fastphase "
} else {
fastphase <- ""
}
# if (beagle) {
# beagle <- "--beagle "
# } else {
# beagle <- ""
# }
#
# if (beagle.phased) {
# beagle.phased <- "--beagle-phased "
# } else {
# beagle.phased <- ""
# }
if (plink) {
plink <- "--plink "
} else {
plink <- ""
}
if (hzar) {
hzar <- "--hzar "
} else {
hzar <- ""
}
if (phylip) {
phylip <- "--phylip "
} else {
phylip <- ""
}
if (phylip.var) {
phylip.var <- "--phylip-var "
} else {
phylip.var <- ""
}
if (phylip.var.all) {
phylip.var.all <- "--phylip-var-all "
} else {
phylip.var.all <- ""
}
if (treemix) {
treemix <- "--treemix "
} else {
treemix <- ""
}
if (no.hap.exports) {
no.hap.exports <- "--no-hap-exports "
} else {
no.hap.exports <- ""
}
# Additional options --------------------------------------------------------
if (h) {
h <- "-h "
} else {
h <- ""
}
if (verbose) {
verbose <- "--verbose "
} else {
verbose <- ""
}
if (v) {
v <- "-v "
} else {
v <- ""
}
if (log.fst.comp) {
log.fst.comp <- "--log-fst-comp "
} else {
log.fst.comp <- ""
}
# command args ---------------------------------------------------------------
command.arguments <- paste(
P, V, O, M, t,
batch.size,
p, r, min.maf, min.mac, max.obs.het, write.single.snp,
write.random.snp, B, W, e, merge.sites, merge.prune.lim, hwe, fstats, fst.correction,
p.value.cutoff, k, smooth.fstats, smooth.popstats, sigma, bootstrap, N, bootstrap.pifis, bootstrap.fst,
bootstrap.div, bootstrap.phist, bootstrap.wl, ordered.export,
fasta.samples, fasta.samples_raw, fasta.loci, vcf, genepop, structure,
radpainter, phase,
fastphase,
# beagle, beagle.phased,
plink, hzar, phylip, phylip.var,
phylip.var.all, treemix,
no.hap.exports,
h, verbose, v, log.fst.comp
)
# run command ----------------------------------------------------------------
system2(command = "populations", args = command.arguments, stdout = populations.log.file)
timing <- proc.time() - timing
message("\nComputation time: ", round(timing[[3]]), " sec")
cat("######################## populations completed ########################\n")
# return(res)
}# end run_populations
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.