#' @name ibdg_fh
#' @title FH measure of IBDg
#' @description FH is a proxy mesure of IBDg based on the excess in the observed
#' number of homozygous genotypes within an individual,
#' relative to the mean number of homozygous genotypes expected under random mating
#' (Keller et al., 2011; Kardos et al., 2015; Hedrick & Garcia-Dorado, 2016).
#'
#' \strong{IBDg} is the realized proportion of the individual genome
#' that is identical by descent by reference to the current population
#' under hypothetical random mating
#' (Keller et al., 2011; Kardos et al., 2015; Hedrick & Garcia-Dorado, 2016).
#'
#' This function is using a modified version of the FH measure
#' (constructed using \href{https://www.cog-genomics.org/plink2}{PLINK}
#' \code{-het} option) described in (Keller et al., 2011; Kardos et al., 2015).
#'
#' The novelties are:
#'
#' \itemize{
#' \item \strong{population-wise:} the individual's observed homozygosity is
#' contrasted against the expected homozygosity.
#' Two estimates of the expected homozygosity are provided based
#' on the population and/or the overall expected homozygosity
#' averaged across markers.
#' \item \strong{tailored for RADseq:} instead of using the overall number
#' of markers, the population and the overall expected homozygosity
#' are averaged with the same markers the individual's are genotyped for.
#' This reduces the bias potentially introduced by comparing the individual's
#' observed homozygosity (computed from non-missing genotypes) with
#' an estimate computed with more markers found at the population or at the
#' overall level.
#' }
#'
#' The FH measure is also computed in
#' \href{https://github.com/thierrygosselin/stackr}{stackr} \emph{summary_haplotypes}
#' function and \href{https://github.com/thierrygosselin/grur}{grur}
#' \emph{missing_visualization} functions.
#' See theory below for the equations.
#' @inheritParams radiator_common_arguments
#' @param data (4 options) A file or object generated by radiator:
#' \itemize{
#' \item tidy data
#' \item Genomic Data Structure (GDS)
#' }
#'
#' \emph{How to get GDS and tidy data ?}
#' Look into \code{\link{tidy_genomic_data}},
#' \code{\link{read_vcf}} or
#' \code{\link{tidy_vcf}}.
#' @param path.folder (path, optional) By default will print results in the working directory.
#' Default: \code{path.folder = NULL}.
#' @param ... (optional) To pass further arguments for fine-tuning the function.
#' @return A list is created with 4 objects:
#' \itemize{
#' \item \code{$fh}: the individual's FH values
#' \item \code{$fh.stats}: the population and overall FH values. These values are
#' calculated by averaging individual FH across samples and populations.
#' \item \code{$fh.box.plot}: the boxplot.
#' \item \code{$fh.distribution.plot}: the histogram.
#'}
#'
#' FH measure is on average negative when the parents are less related than
#' expected by random mating. The distribution \code{fh.distribution.plot}
#' should be centered around 0 in samples of non-inbred individuals.
#' @section Theory:
#'
#' \strong{Modified FH:}
#' \deqn{F_{h_i} = \frac{\overline{Het}_{obs_{ij}} - \overline{Het}_{exp_j}}{\sum_{i}snp_{ij} - \overline{Het}_{exp_j}}}
#'
#' \strong{Individual Observed Heterozygosity averaged across markers:}
#' \deqn{\overline{Het}_{obs_i} = \frac{\sum_iHet_{obs_i}}{\sum_i{snp_i}}}
#'
#' \strong{Population expected Heterozygosity (under Hardy-Weinberg) and
#' tailored by averaging for each individual using his genotyped markers:}
#' \deqn{\overline{Het}_{exp_j} = \frac{\sum_jHet_{exp_j}}{\sum_j{snp_j}}}
#' @section Advance mode:
#'
#' \emph{dots-dots-dots ...} allows to pass several arguments for fine-tuning the function.
#' These arguments are described in \code{\link{tidy_genomic_data}}.
#' \itemize{
#' \item filter.monomorphic = TRUE (default)
#' \item filter.common.markers = FALSE. The argument for common markers
#' between populations is set by default to maximize genome coverage of
#' individuals and populations.
#' }
#' @examples
#' \dontrun{
#' # Using a VCF file, the simplest for of the function:
#' fh <- ibdg_fh(data = "sturgeon.gds")
#'
#' # To see what's inside the list
#' names(fh)
#'
#' # To view the boxplot:
#' fh$fh.boxplot
#'
#' # To view the distribution of FH values:
#' fh$fh.distribution.plot
#' }
#' @references Keller MC, Visscher PM, Goddard ME (2011)
#' Quantification of inbreeding due to distant ancestors and its detection
#' using dense single nucleotide polymorphism data. Genetics, 189, 237–249.
#' @references Kardos M, Luikart G, Allendorf FW (2015)
#' Measuring individual inbreeding in the age of genomics: marker-based
#' measures are better than pedigrees. Heredity, 115, 63–72.
#' @references Hedrick PW, Garcia-Dorado A. (2016)
#' Understanding Inbreeding Depression, Purging, and Genetic Rescue.
#' Trends in Ecology and Evolution. 2016; 31: 940-952.
#' @export
#' @rdname ibdg_fh
#' @author Thierry Gosselin \email{thierrygosselin@@icloud.com}
ibdg_fh <- function(
data,
path.folder = NULL,
verbose = TRUE,
...
) {
# Cleanup-------------------------------------------------------------------
radiator_function_header(f.name = "ibdg_fh", verbose = verbose)
file.date <- format(Sys.time(), "%Y%m%d@%H%M")
if (verbose) message("Execution date@time: ", file.date)
old.dir <- getwd()
opt.change <- getOption("width")
options(width = 70)
timing <- radiator_tic()
#back to the original directory and options
on.exit(setwd(old.dir), add = TRUE)
on.exit(options(width = opt.change), add = TRUE)
on.exit(radiator_toc(timing), add = TRUE)
on.exit(radiator_function_header(f.name = "ibdg_fh", start = FALSE, verbose = verbose), add = TRUE)
if (verbose) cat("################################################################################\n")
if (verbose) cat("############################### radiator::ibdg_fh ##############################\n")
if (verbose) cat("################################################################################\n")
# manage missing arguments ---------------------------------------------------
if (missing(data)) rlang::abort("Input file missing")
# Folders---------------------------------------------------------------------
path.folder <- generate_folder(
f = path.folder,
rad.folder = "ibdg_fh",
prefix_int = FALSE,
internal = FALSE,
file.date = file.date,
verbose = verbose)
# Detect format --------------------------------------------------------------
data.type <- radiator::detect_genomic_format(data)
if (!data.type %in% c("tbl_df", "fst.file", "SeqVarGDSClass", "gds.file")) {
rlang::abort("Input not supported for this function: read function documentation")
}
if (data.type %in% c("SeqVarGDSClass", "gds.file")) {
radiator_packages_dep(package = "SeqArray", cran = FALSE, bioc = TRUE)
message("Importing gds file...")
if (data.type == "gds.file") {
data <- radiator::read_rad(data, verbose = verbose)
data.type <- "SeqVarGDSClass"
}
tidy.data <- extract_genotypes_metadata(
gds = data,
genotypes.meta.select = c("MARKERS", "COL", "INDIVIDUALS", "POP_ID", "GT_BIN"))
if (length(tidy.data) == 0) {
tidy.data <- gds2tidy(gds = data)
}
SeqArray::seqClose(data)
data <- tidy.data
tidy.data <- NULL
} else { #tidy.data
if (is.vector(data)) data <- radiator::tidy_wide(data = data, import.metadata = TRUE)
}
# Detect if biallelic --------------------------------------------------------
biallelic <- radiator::detect_biallelic_markers(data)
# IBDg computations ----------------------------------------------------------
message("Genome-Wide Identity-By-Descent calculations using FH...")
if (rlang::has_name(data, "GT_BIN") && biallelic) {
# Remove missing
data %<>% dplyr::filter(!is.na(GT_BIN))
freq <- data %>%
dplyr::group_by(MARKERS, POP_ID) %>%
dplyr::summarise(
N = n(),
HOM_ALT = length(GT_BIN[GT_BIN == 2]),
HET = length(GT_BIN[GT_BIN == 1])
) %>%
dplyr::mutate(
FREQ_ALT = ((HOM_ALT * 2) + HET) / (2 * N),
FREQ_REF = 1 - FREQ_ALT,
HOM_E = (FREQ_REF^2) + (FREQ_ALT^2)
)
hom.e <- dplyr::full_join(
data,
dplyr::select(.data = freq, MARKERS, POP_ID, HOM_E)
, by = c("MARKERS", "POP_ID")
) %>%
dplyr::select(MARKERS, POP_ID, INDIVIDUALS, HOM_E) %>%
dplyr::group_by(POP_ID, INDIVIDUALS) %>%
dplyr::summarise(HOM_E = mean(HOM_E, na.rm = TRUE)) %>%
dplyr::ungroup(.)
fh <- data %>%
dplyr::group_by(POP_ID, INDIVIDUALS) %>%
dplyr::summarise(
N = n(),
HOM_REF = length(GT_BIN[GT_BIN == 0]),
HOM_ALT = length(GT_BIN[GT_BIN == 2]),
HOM = HOM_REF + HOM_ALT
) %>%
dplyr::mutate(HOM_O = HOM / N) %>%
dplyr::full_join(
dplyr::select(
.data = hom.e,
INDIVIDUALS, POP_ID, HOM_E
), by = c("POP_ID", "INDIVIDUALS")
) %>%
dplyr::mutate(FH = ((HOM_O - HOM_E)/(N - HOM_E))) %>%
dplyr::ungroup(.) %>%
dplyr::arrange(POP_ID, INDIVIDUALS)
ind.levels <- fh$INDIVIDUALS
fh %<>%
dplyr::mutate(
INDIVIDUALS = factor(INDIVIDUALS, levels = ind.levels, ordered = TRUE)
)
} else {
if (!rlang::has_name(data, "GT")) {
rlang::abort("GT genotype coding required for multi-allelic dataset")
}
# not biallelic
input.alleles <- dplyr::select(.data = data, MARKERS, POP_ID, INDIVIDUALS, GT) %>%
dplyr::filter(GT != "000000") %>%
dplyr::mutate(
A1 = stringi::stri_sub(GT, 1, 3),
A2 = stringi::stri_sub(GT, 4,6)
) %>%
dplyr::select(-GT)
freq <- input.alleles %>%
tidyr::pivot_longer(
data = .,
cols = -c("MARKERS", "INDIVIDUALS", "POP_ID"),
names_to = "ALLELE_GROUP",
values_to = "ALLELES"
) %>%
dplyr::group_by(MARKERS, ALLELES, POP_ID) %>%
dplyr::tally(.) %>%
dplyr::ungroup(.) %>%
tidyr::complete(data = ., POP_ID, tidyr::nesting(MARKERS, ALLELES), fill = list(n = 0)) %>%
dplyr::group_by(MARKERS, POP_ID) %>%
dplyr::mutate(
FREQ = n/sum(n),
HOM_E = FREQ^2
) %>%
dplyr::summarise(HOM_E = sum(HOM_E)) %>%
dplyr::ungroup(.)
hom.e <- dplyr::full_join(
dplyr::filter(.data = data, GT != "000000"),
dplyr::select(.data = freq, MARKERS, POP_ID, HOM_E)
, by = c("MARKERS", "POP_ID")
) %>%
dplyr::select(MARKERS, POP_ID, INDIVIDUALS, HOM_E) %>%
dplyr::group_by(POP_ID, INDIVIDUALS) %>%
dplyr::summarise(HOM_E = mean(HOM_E, na.rm = TRUE)) %>%
dplyr::ungroup(.)
fh <- input.alleles %>%
dplyr::group_by(POP_ID, INDIVIDUALS) %>%
dplyr::summarise(
N = n(),
HOM = length(INDIVIDUALS[A1 == A2])
) %>%
dplyr::mutate(HOM_O = HOM / N) %>%
dplyr::full_join(dplyr::select(.data = hom.e, INDIVIDUALS, POP_ID, HOM_E), by = c("POP_ID", "INDIVIDUALS")) %>%
dplyr::mutate(FH = ((HOM_O - HOM_E)/(N - HOM_E))) %>%
dplyr::ungroup(.) %>%
dplyr::arrange(POP_ID, INDIVIDUALS)
ind.levels <- fh$INDIVIDUALS
fh <- dplyr::mutate(.data = fh, INDIVIDUALS = factor(INDIVIDUALS, levels = ind.levels, ordered = TRUE))
}
# Write FH individuals
write_rad(
data = fh,
path = path.folder,
filename = "fh.individuals.tsv",
tsv = TRUE, verbose = verbose)
# FH statistics per pop
fh.stats <- fh %>%
dplyr::group_by(POP_ID) %>%
dplyr::summarise(FH = mean(FH)) %>%
dplyr::ungroup(.)
# per pop and overall combined
fh.stats <- tibble::add_row(
.data = fh.stats,
POP_ID = "OVERALL",
FH = unlist(dplyr::summarise(.data = fh.stats, FH = mean(FH)))
)
# Write FH individuals
write_rad(
data = fh.stats,
path = path.folder,
filename = "fh.populations.tsv",
tsv = TRUE, verbose = verbose)
# plots ----------------------------------------------------------------------
message("Generating plots")
# fh.boxplot
n.pop <- length(unique(fh$POP_ID))
fh.box.plot <- ggplot2::ggplot(
data = fh, ggplot2::aes(x = POP_ID, y = FH, colour = POP_ID)) +
ggplot2::geom_jitter(alpha = 0.5) +
ggplot2::geom_violin(trim = TRUE, fill = NA) +
ggplot2::geom_boxplot(width = 0.1, fill = NA, outlier.colour = NA, outlier.fill = NA) +
ggplot2::labs(
y = "Individual IBDg (FH)",
x = "Populations") +#colour = "Populations") +
ggplot2::theme_bw() +
ggplot2::theme(
legend.position = "none",
panel.grid.minor.x = ggplot2::element_blank(),
panel.grid.major.y = ggplot2::element_blank(),
axis.title.x = ggplot2::element_text(size = 10, family = "Helvetica", face = "bold"),
axis.text.x = ggplot2::element_text(size = 10, family = "Helvetica"),
axis.title.y = ggplot2::element_text(size = 10, family = "Helvetica", face = "bold"),
axis.text.y = ggplot2::element_text(size = 10, family = "Helvetica")) +
ggplot2::coord_flip()
ggplot2::ggsave(
limitsize = FALSE,
plot = fh.box.plot,
filename = file.path(path.folder, "fh.violin.plot.pdf"),
width = max(10, n.pop * 2), height = 10,
dpi = 300, units = "cm", useDingbats = FALSE)
# Histogram
fh.distribution.plot <- suppressWarnings(
ggplot2::ggplot(data = fh, ggplot2::aes(x = FH)) +
ggplot2::geom_histogram() +
ggplot2::labs(x = "Individual IBDg (FH)") +
ggplot2::labs(y = "Markers (number)") +
ggplot2::theme(
legend.position = "none",
axis.title.y = ggplot2::element_text(size = 10, family = "Helvetica", face = "bold"),
axis.text.y = ggplot2::element_text(size = 10, family = "Helvetica"),
axis.title.x = ggplot2::element_text(size = 10, family = "Helvetica", face = "bold"),
axis.text.x = ggplot2::element_text(size = 10, family = "Helvetica"),
strip.text.x = ggplot2::element_text(size = 10, family = "Helvetica", face = "bold")
) +
ggplot2::theme_bw()
)
# fh.distribution.plot
ggplot2::ggsave(
limitsize = FALSE,
plot = fh.distribution.plot,
filename = file.path(path.folder, "fh.distribution.plot.overall.pdf"),
width = 15, height = 15,
dpi = 300, units = "cm", useDingbats = FALSE)
# Results --------------------------------------------------------------------
res = list(
fh = fh,
fh.stats = fh.stats,
fh.box.plot = fh.box.plot,
fh.distribution.plot = fh.distribution.plot)
return(res)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.