#' @name betas_estimator
#' @title Estimate \eqn{\beta}s per population
#' @description Estimate \eqn{\beta}s per population.
#' @inheritParams tidy_genomic_data
#' @return A list is created with 3 objects:
#' betaiovl: Average \eqn{\beta_i} over loci,
#' Hw: Within population gene diversities
#' Hb: Between populations gene diversities
#' @references
#' Weir, B., Goudet, J. (2017).
#' A Unified Characterization of Population Structure and Relatedness
#' Genetics 206(4), 2085-2103.
#' https://dx.doi.org/10.1534/genetics.116.198424
#'
#' Goudet, J., Kay, T., Weir, B. (2018).
#' How to estimate kinship
#' Molecular Ecology 27(20), 4121-4135.
#' https://dx.doi.org/10.1111/mec.14833
#' @examples
#' \dontrun{
#' # Using a VCF file, the simplest for of the function:
#' fh <- ibdg_fh(
#' data = "batch_1.vcf",
#' strata = "strata.panda.tsv"
#' )
#' # To see what's inside the list
#' names(fh)
#' # To view the manhattan plot:
#' fh$fh.manhattan.plot
#' # To view the distribution of FH values:
#' fh$fh.distribution.plot
#' }
#' @export
#' @rdname betas_estimator
#' @author Thierry Gosselin \email{thierrygosselin@@icloud.com}
betas_estimator <- function(
data,
strata = NULL,
filename = NULL,
verbose = FALSE
) {
# Cleanup-------------------------------------------------------------------
radiator_function_header(f.name = "betas", verbose = verbose)
file.date <- format(Sys.time(), "%Y%m%d@%H%M")
if (verbose) message("Execution date@time: ", file.date)
old.dir <- getwd()
opt.change <- getOption("width")
options(width = 70)
timing <- radiator_tic()
#back to the original directory and options
on.exit(setwd(old.dir), add = TRUE)
on.exit(options(width = opt.change), add = TRUE)
on.exit(radiator_toc(timing), add = TRUE)
on.exit(radiator_function_header(f.name = "betas", start = FALSE, verbose = verbose), add = TRUE)
# manage missing arguments -----------------------------------------------------
if (missing(data)) rlang::abort("Input file missing")
# import data ----------------------------------------------------------------
if (is.vector(data)) data <- radiator::tidy_wide(data = data, import.metadata = TRUE)
# BETAS computations ----------------------------------------------------------
message("Beta computation ...")
if (tibble::has_name(data, "GT_VCF")) {
message("Warning: implementation is not working yet for haplotype vcf file")
# option 2 using list-columns
betas.prep <- dplyr::filter(.data = data, GT_VCF != "./.") %>%
dplyr::group_by(MARKERS, POP_ID) %>%
dplyr::summarise(
N = n(),
HOM_ALT = length(GT_VCF[GT_VCF == "1/1"]),
HET = length(GT_VCF[GT_VCF == "1/0" | GT_VCF == "0/1"])
) %>%
dplyr::mutate(
NN = N * 2,
FREQ_ALT = ((HOM_ALT * 2) + HET) / NN,
FREQ_REF = 1 - FREQ_ALT,
HW = (NN / (NN - 1)) * (1 - ((FREQ_REF^2) + (FREQ_ALT^2)))
) %>%
dplyr::group_by(MARKERS) %>%
dplyr::mutate(
N_POP_C = n(), # number of pop per markers
N_POP_C = 1/(N_POP_C * (N_POP_C - 1)) # corrected number of pop per markers
) %>%
dplyr::ungroup(.)
gene_diversity_between <- function(x) {
new_sum <- purrr::lift(sum, na.rm = TRUE)
res <- utils::combn(x, m = 2, FUN = function(y) y[1]*y[2], simplify = FALSE)
res <- new_sum(res)
return(res)
}
betas <- dplyr::select(.data = betas.prep, MARKERS, POP_ID, FREQ_ALT, FREQ_REF) %>%
dplyr::group_by(MARKERS) %>%
dplyr::summarise(
.data = .,
dplyr::across(
.cols = c(FREQ_ALT, FREQ_REF),
.fns = list
),
.groups = "keep"
) %>%
dplyr::mutate(
FREQ_ALT = purrr::map(.x = FREQ_ALT, .f = gene_diversity_between),
FREQ_REF = purrr::map(.x = FREQ_REF, .f = gene_diversity_between)
) %>%
dplyr::mutate(HB = (unlist(FREQ_ALT) + unlist(FREQ_REF)) * 2) %>%
dplyr::select(MARKERS, HB) %>%
dplyr::full_join(
dplyr::select(.data = betas.prep, MARKERS, POP_ID, HW, N_POP_C), by = "MARKERS"
) %>%
dplyr::mutate(HB = 1 - N_POP_C * HB) %>%
dplyr::group_by(POP_ID) %>%
dplyr::mutate(BETAI = 1 - (sum(HW, na.rm = TRUE)/sum(HB, na.rm = TRUE))) %>%
dplyr::select(POP_ID, MARKERS, HW, HB, BETAI) %>%
dplyr::ungroup(.)
betas.prep <- NULL
} else {
betas.prep <- dplyr::select(.data = data, MARKERS, POP_ID, INDIVIDUALS, GT) %>%
dplyr::filter(GT != "000000") %>%
dplyr::mutate(
A1 = stringi::stri_sub(GT, 1, 3),
A2 = stringi::stri_sub(GT, 4,6)
) %>%
dplyr::select(MARKERS, POP_ID, INDIVIDUALS, A1, A2) %>%
tidyr::pivot_longer(
data = .,
cols = -c("POP_ID", "INDIVIDUALS", "MARKERS"),
names_to = "ALLELES",
values_to = "GT"
) %>%
dplyr::group_by(MARKERS, GT, POP_ID) %>%
dplyr::tally(.) %>%
dplyr::ungroup() %>%
tidyr::complete(data = ., POP_ID, tidyr::nesting(MARKERS, GT), fill = list(n = 0)) %>%
dplyr::group_by(MARKERS, POP_ID) %>%
dplyr::mutate(
NN = sum(n),
HOM_O = n / NN,
HW = (NN / (NN - 1)) * (1 - sum(HOM_O^2))
) %>%
dplyr::group_by(MARKERS) %>%
dplyr::mutate(
N_POP = length(unique(POP_ID)), # number of pop per markers
N_POP_C = 1/(N_POP * (N_POP - 1)) # corrected number of pop per markers
) %>%
dplyr::ungroup(.)
gene_diversity_between <- function(x) {
res <- utils::combn(x, m = 2, FUN = function(y) y[1]*y[2], simplify = FALSE)
new_sum <- purrr::lift(sum, na.rm = TRUE)
res <- new_sum(res)
return(res)
}
betas <- dplyr::select(.data = betas.prep, MARKERS, POP_ID, GT, HOM_O) %>%
dplyr::group_by(MARKERS, GT) %>%
dplyr::summarise(
.data = .,
dplyr::across(
.cols = "HOM_O",
.fns = list
),
.groups = "keep"
) %>%
dplyr::mutate(
FREQ = unlist(purrr::map(.x = HOM_O, .f = gene_diversity_between))
) %>%
dplyr::group_by(MARKERS) %>%
dplyr::summarise(HB = sum(FREQ) * 2) %>%
dplyr::full_join(
dplyr::distinct(.data = betas.prep, MARKERS, POP_ID, HW, N_POP_C), by = "MARKERS"
) %>%
dplyr::mutate(HB = 1 - N_POP_C * HB) %>%
dplyr::group_by(POP_ID) %>%
dplyr::mutate(BETAI = 1 - (sum(HW, na.rm = TRUE) / sum(HB, na.rm = TRUE))) %>%
dplyr::select(POP_ID, MARKERS, HW, HB, BETAI) %>%
dplyr::ungroup(.)
}
# plots ----------------------------------------------------------------------
# message("Generating plots")
# # manhattan
# fh.manhattan.plot <- ggplot(data = fh, aes(x = INDIVIDUALS, y = FH, colour = POP_ID)) +
# geom_jitter() +
# labs(y = "Individual IBDg (FH)") +
# labs(x = "Individuals") +
# labs(colour = "Populations") +
# # theme_minimal() +
# theme_classic() +
# # theme_dark() +
# theme(
# panel.grid.minor.x = element_blank(),
# panel.grid.major.y = element_blank(),
# axis.title.x = element_text(size = 10, family = "Helvetica", face = "bold"),
# axis.text.x = element_blank(),
# axis.title.y = element_text(size = 10, family = "Helvetica", face = "bold"),
# axis.text.y = element_text(size = 8, family = "Helvetica")
# )
# # fh.manhattan.plot
# # Histogram
# fh.distribution.plot <- ggplot(data = fh, aes(x = FH)) +
# geom_histogram() +
# labs(x = "Individual IBDg (FH)") +
# labs(y = "Markers (number)") +
# theme(
# legend.position = "none",
# axis.title.x = element_text(size = 10, family = "Helvetica", face = "bold"),
# axis.title.y = element_text(size = 10, family = "Helvetica", face = "bold"),
# axis.text.x = element_text(size = 8, family = "Helvetica", angle = 90, hjust = 1, vjust = 0.5),
# strip.text.x = element_text(size = 10, family = "Helvetica", face = "bold")
# )
# # fh.distribution.plot
# Results --------------------------------------------------------------------
res <- list(
betaiovl = dplyr::distinct(.data = betas, POP_ID, BETAI),
Hw = dplyr::distinct(.data = betas, MARKERS, POP_ID, HW),
Hb = dplyr::distinct(.data = betas, MARKERS, HB)
)
if (verbose) {
message("\nBETA per pop (averaged over locus):")
message(stringi::stri_join(res$betaiovl$POP_ID, " = ", round(res$betaiovl$BETAI, 4), "\n"))
}
return(res)
}
##option 1 tested that was very straighforward using tidyr::nest
# Function to compute gene diversity between populations (Hb)
# gene_diversity_between <- function(x) {
# mult <- function(y) y[1]*y[2]
# res <- (sum(unlist(utils::combn(x = x$FREQ_ALT, m = 2, FUN = mult, simplify = FALSE))) + sum(unlist(utils::combn(x = x$FREQ_REF, m = 2, FUN = mult, simplify = FALSE))))*2
# return(res)
# }
#
# #
# betas <- data %>%
# dplyr::filter(GT_VCF != "./.") %>%
# dplyr::group_by(MARKERS, POP_ID) %>%
# dplyr::summarise(
# N = n(),
# HOM_ALT = length(GT_VCF[GT_VCF == "1/1"]),
# HET = length(GT_VCF[GT_VCF == "1/0" | GT_VCF == "0/1"])
# ) %>%
# dplyr::mutate(
# NN = N * 2,
# NN_C = NN / (NN - 1),
# FREQ_ALT = ((HOM_ALT * 2) + HET) / NN,
# FREQ_REF = 1 - FREQ_ALT,
# HOM_E = (FREQ_REF^2) + (FREQ_ALT^2),# MP2 hierfstat
# HET_E = 1 - HOM_E,
# HW = NN_C * HET_E
# ) %>%
# dplyr::group_by(MARKERS) %>%
# dplyr::mutate(
# N_POP = n(), # number of pop per markers
# N_POP_C = 1/(N_POP * (N_POP - 1)) # corrected number of pop per markers
# ) %>%
# dplyr::ungroup(.) %>%
# dplyr::select(MARKERS, POP_ID, HW, FREQ_ALT, FREQ_REF, N_POP_C) %>%
# dplyr::group_by(MARKERS, N_POP_C) %>%
# tidyr::nest(.key = FREQ) %>%
# dplyr::mutate(
# HB = purrr::map(.x = .$FREQ, .f = gene_diversity_between),
# HB = 1 - N_POP_C * unlist(HB)
# ) %>%
# tidyr::unnest(.) %>%
# dplyr::select(POP_ID, MARKERS, HW, HB) %>%
# dplyr::group_by(POP_ID) %>%
# dplyr::mutate(BETAI = 1 - (sum(HW, na.rm = TRUE)/sum(HB, na.rm = TRUE))) %>%
# dplyr::ungroup(.)
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.