## Nucleotide diversity
#' @title Nucleotide diversity
#' @description Calculates the nucleotide diversity (Nei & Li, 1979).
#'
#' To get an estimate with the consensus reads, use the
#' function \emph{summary_haplotypes} found in the package
#' [stackr](https://github.com/thierrygosselin/stackr). The estimate in
#' \emph{summary_haplotypes} integrates the consensus markers found in
#' [STACKS](http://catchenlab.life.illinois.edu/stacks/)
#' populations.haplotypes.tsv file.
#' Both radiator and stackr functions requires \code{stringdist} package.
#'
#' The \code{read.length} argument below is used directly in the calculations.
#' To be correctly estimated, the reads obviously need to be of identical size...
#' @inheritParams radiator_common_arguments
#' @param data (4 options) A file or object generated by radiator:
#' \itemize{
#' \item tidy data
#' \item Genomic Data Structure (GDS)
#' }
#'
#' \emph{How to get GDS and tidy data ?}
#' Look into \code{\link{tidy_genomic_data}},
#' \code{\link{read_vcf}} or
#' \code{\link{tidy_vcf}}.
#' @param read.length (integer, optional) The length in nucleotide of your reads.
#' By default it is estimated from the data using the column \code{COL}.
#' Default: \code{read.length = NULL}.
#' @param path.folder (path, optional) By default will print results in the working directory.
#' Default: \code{path.folder = NULL}.
#' @return The function returns a list with the function call and:
#' \enumerate{
#' \item $pi.individuals: the pi estimated for each individual
#' \item $pi.populations: the pi statistics estimated per populations and overall.
#' \item $boxplot.pi: showing the boxplot of Pi for each populations and overall.
#' }
#' use $ to access each #' objects in the list.
#' @examples
#' \dontrun{
#' require(stringdist)
#' # The simplest way to run the function:
#' pi.sum <- radiator::pi(data = "brook.charr.gds")
#' }
#' @rdname pi
#' @export
#' @references Nei M, Li WH (1979)
#' Mathematical model for studying genetic variation in terms of
#' restriction endonucleases.
#' Proceedings of the National Academy of Sciences of
#' the United States of America, 76, 5269–5273.
#' @note Thanks to Anne-Laure Ferchaud for very useful comments on previous version
#' of this function.
#' @author Thierry Gosselin \email{thierrygosselin@@icloud.com}
pi <- function(
data,
read.length = NULL,
parallel.core = parallel::detectCores() - 1,
path.folder = NULL,
verbose = TRUE
) {
radiator_packages_dep(package = "stringdist")
# Cleanup-------------------------------------------------------------------
radiator_function_header(f.name = "pi", verbose = verbose)
file.date <- format(Sys.time(), "%Y%m%d@%H%M")
if (verbose) message("Execution date@time: ", file.date)
old.dir <- getwd()
opt.change <- getOption("width")
options(width = 70)
timing <- radiator_tic()
#back to the original directory and options
on.exit(setwd(old.dir), add = TRUE)
on.exit(options(width = opt.change), add = TRUE)
on.exit(radiator_toc(timing), add = TRUE)
on.exit(radiator_function_header(f.name = "pi", start = FALSE, verbose = verbose), add = TRUE)
res <- list()
# manage missing arguments -----------------------------------------------------
if (missing(data)) rlang::abort("Input file missing")
# Folders---------------------------------------------------------------------
path.folder <- generate_folder(
f = path.folder,
rad.folder = "pi",
prefix_int = FALSE,
internal = FALSE,
file.date = file.date,
verbose = verbose)
# Detect format --------------------------------------------------------------
data.type <- radiator::detect_genomic_format(data)
if (!data.type %in% c("tbl_df", "fst.file", "SeqVarGDSClass", "gds.file")) {
rlang::abort("Input not supported for this function: read function documentation")
}
if (data.type %in% c("SeqVarGDSClass", "gds.file")) {
radiator_packages_dep(package = "SeqArray", cran = FALSE, bioc = TRUE)
message("Importing gds file...")
if (data.type == "gds.file") {
data <- radiator::read_rad(data, verbose = verbose)
data.type <- "SeqVarGDSClass"
}
tidy.data <- extract_genotypes_metadata(
gds = data,
genotypes.meta.select = c("MARKERS", "COL", "INDIVIDUALS", "POP_ID", "GT_BIN"))
if (length(tidy.data) == 0) {
tidy.data <- gds2tidy(gds = data)
}
SeqArray::seqClose(data)
data <- tidy.data
tidy.data <- NULL
} else {#tidy.data
if (is.vector(data)) data %<>% radiator::tidy_wide(data = ., import.metadata = TRUE)
}
# Nei & Li 1979 Nucleotide Diversity -----------------------------------------
message("Nucleotide diversity (Pi):")
if (is.null(read.length) && !rlang::has_name(data, "COL")) {
rlang::abort("Unable to estimate size of reads... read.length argument is required")
}
if (is.null(read.length)) read.length <- max(data$COL)
message(" Read length used: ", read.length)
# Pi: by individuals----------------------------------------------------------
message(" Pi calculations: individuals...")
data %<>%
dplyr::filter(!is.na(GT_BIN)) %>%
dplyr::mutate(
ALLELE1 = dplyr::case_when(
GT_BIN == 2L ~ "2",
GT_BIN != 2L ~ "1"
),
ALLELE2 = dplyr::case_when(
GT_BIN == 0L ~ "1",
GT_BIN != 0L ~ "2"
)
)
res$pi.individuals <- data %>%
dplyr::mutate(
PI = (
stringdist::stringdist(
a = ALLELE1,
b = ALLELE2,
method = "hamming"
)
) / read.length
) %>%
dplyr::group_by(POP_ID, INDIVIDUALS) %>%
dplyr::summarise(PI = mean(PI))
write_rad(
data = res$pi.individuals,
path = path.folder,
filename = "pi.individuals.tsv",
tsv = TRUE, verbose = verbose)
# Pi: by pop------------------------------------------------------------------
message(" Pi calculations: populations...")
data %<>%
dplyr::select(POP_ID, INDIVIDUALS, MARKERS, ALLELE1, ALLELE2) %>%
tidyr::pivot_longer(
data = .,
cols = -c("POP_ID", "INDIVIDUALS", "MARKERS"),
names_to = "ALLELE_GROUP",
values_to = "ALLELES"
)
res$pi.populations <- data %>%
split(x = ., f = .$POP_ID) %>%
purrr::map_df(
.x = ., .f = pi_pop,
read.length = read.length, parallel.core = parallel.core
)
# Pi: overall ---------------------------------------------------------------
message(" Pi calculations: overall")
res$pi.populations <- tibble::add_row(
.data = res$pi.populations,
POP_ID = "OVERALL",
PI_NEI = radiator_future(
.x = data,
.f = pi_rad,
flat.future = "dfr",
split.vec = FALSE,
split.with = "MARKERS",
parallel.core = parallel.core,
read.length = read.length
) %>%
dplyr::summarise(PI_NEI = mean(PI), .groups = "drop") %>%
purrr::flatten_dbl(.)
)
write_rad(
data = res$pi.populations,
path = path.folder,
filename = "pi.populations.tsv",
tsv = TRUE, verbose = verbose)
# figure ---------------------------------------------------------------------
if (verbose) message("Generating violin plot of pi")
n.pop <- length(unique(res$pi.individuals$POP_ID))
res$boxplot.pi <- dplyr::filter(res$pi.individuals, POP_ID != "OVERALL") %>%
ggplot2::ggplot(data = ., ggplot2::aes(x = factor(POP_ID), y = PI, na.rm = TRUE)) +
ggplot2::geom_violin(trim = F) +
ggplot2::geom_boxplot(width = 0.1, fill = "black", outlier.colour = NA) +
ggplot2::stat_summary(fun = "mean", geom = "point", shape = 21, size = 2.5, fill = "white") +
ggplot2::labs(
x = "Sampling sites",
y = "Individual nucleotide diversity (Pi)"
) +
ggplot2::theme(
legend.position = "none",
axis.title.x = ggplot2::element_text(size = 10, family = "Helvetica", face = "bold"),
axis.title.y = ggplot2::element_text(size = 10, family = "Helvetica", face = "bold"),
legend.title = ggplot2::element_text(size = 10, family = "Helvetica", face = "bold"),
legend.text = ggplot2::element_text(size = 10, family = "Helvetica", face = "bold"),
strip.text.x = ggplot2::element_text(size = 10, family = "Helvetica", face = "bold")
)
ggplot2::ggsave(
limitsize = FALSE,
plot = res$boxplot.pi,
filename = file.path(path.folder, "pi.violin.plot.pdf"),
width = max(10, n.pop * 2), height = 10,
dpi = 300, units = "cm", useDingbats = FALSE)
# results --------------------------------------------------------------------
return(res)
}#End pi
# Internal nested functions ----------------------------------------------------
# pi_rad ---------------------------------------------------------------------------
#' @title pi_rad
#' @description main pi function
#' @rdname pi_rad
#' @keywords internal
#' @export
#' @author Thierry Gosselin \email{thierrygosselin@@icloud.com}
pi_rad <- carrier::crate(function(data, read.length) {
`%>%` <- magrittr::`%>%`
y <- dplyr::select(data, ALLELES) %>%
purrr::flatten_chr(.)
if (length(unique(y)) <= 1) {
pi <- tibble::tibble(PI = as.numeric(0))
} else {
#1 Get all pairwise comparison
allele.pairwise <- utils::combn(unique(y), 2)
#2 Calculate pairwise nucleotide mismatches
pairwise.mismatches <- apply(allele.pairwise, 2, function(z) {
stringdist::stringdist(a = z[1], b = z[2], method = "hamming")
})
#3 Calculate allele frequency
allele.freq <- table(y)/length(y)
#4 Calculate nucleotide diversity from pairwise mismatches and allele frequency
pi <- apply(allele.pairwise, 2, function(y) allele.freq[y[1]] * allele.freq[y[2]])
pi <- tibble::tibble(PI = sum(pi * pairwise.mismatches) / read.length)
}
return(pi)
})#End pi
# Pi pop ---------------------------------------------------------------------------
#' @title pi_pop
#' @description pi_pop function
#' @rdname pi_pop
#' @keywords internal
#' @export
#' @author Thierry Gosselin \email{thierrygosselin@@icloud.com}
pi_pop <- function(data, read.length, parallel.core = parallel::detectCores() - 1) {
pop <- unique(data$POP_ID)
message(" Pi calculations for pop: ", pop)
# data <- df.split.pop[["DD"]]
# data <- df.split.pop[["SKY"]]
pi.pop <- radiator_future(
.x = data,
.f = pi_rad,
flat.future = "dfr",
split.vec = FALSE,
split.with = "MARKERS",
parallel.core = parallel.core,
read.length = read.length
) %>%
dplyr::summarise(PI_NEI = mean(PI)) %>%
tibble::add_column(.data = ., POP_ID = pop, .before = "PI_NEI")
return(pi.pop)
}#End pi_pop
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.