R/ibdg_fh.R

Defines functions ibdg_fh

Documented in ibdg_fh

#' @name ibdg_fh
#' @title FH measure of IBDg
#' @description FH is a proxy mesure of IBDg based on the excess in the observed
#' number of homozygous genotypes within an individual,
#' relative to the mean number of homozygous genotypes expected under random mating
#' (Keller et al., 2011; Kardos et al., 2015; Hedrick & Garcia-Dorado, 2016).
#'
#' \strong{IBDg} is the realized proportion of the individual genome
#' that is identical by descent by reference to the current population
#' under hypothetical random mating
#' (Keller et al., 2011; Kardos et al., 2015; Hedrick & Garcia-Dorado, 2016).
#'
#' This function is using a modified version of the FH measure
#' (constructed using \href{https://www.cog-genomics.org/plink2}{PLINK}
#' \code{-het} option) described in (Keller et al., 2011; Kardos et al., 2015).
#'
#' The novelties are:
#'
#' \itemize{
#' \item \strong{population-wise:} the individual's observed homozygosity is
#' contrasted against the expected homozygosity.
#' Two estimates of the expected homozygosity are provided based
#' on the population and/or the overall expected homozygosity
#' averaged across markers.
#' \item \strong{tailored for RADseq:} instead of using the overall number
#' of markers, the population and the overall expected homozygosity
#' are averaged with the same markers the individual's are genotyped for.
#' This reduces the bias potentially introduced by comparing the individual's
#' observed homozygosity (computed from non-missing genotypes) with
#' an estimate computed with more markers found at the population or at the
#' overall level.
#' }
#'
#' The FH measure is also computed in
#' \href{https://github.com/thierrygosselin/stackr}{stackr} \emph{summary_haplotypes}
#' function and \href{https://github.com/thierrygosselin/grur}{grur}
#' \emph{missing_visualization} functions.
#' See theory below for the equations.

#' @inheritParams radiator_common_arguments
#' @param data (4 options) A file or object generated by radiator:
#' \itemize{
#' \item tidy data
#' \item Genomic Data Structure (GDS)
#' }
#'
#' \emph{How to get GDS and tidy data ?}
#' Look into \code{\link{tidy_genomic_data}},
#' \code{\link{read_vcf}} or
#' \code{\link{tidy_vcf}}.

#' @param path.folder (path, optional) By default will print results in the working directory.
#' Default: \code{path.folder = NULL}.

#' @param ... (optional) To pass further arguments for fine-tuning the function.

#' @return A list is created with 4 objects:
#' \itemize{
#' \item \code{$fh}: the individual's FH values
#' \item \code{$fh.stats}: the population and overall FH values. These values are
#' calculated by averaging individual FH across samples and populations.
#' \item \code{$fh.box.plot}: the boxplot.
#' \item \code{$fh.distribution.plot}: the histogram.
#'}
#'
#' FH measure is on average negative when the parents are less related than
#' expected by random mating. The distribution \code{fh.distribution.plot}
#' should be centered around 0 in samples of non-inbred individuals.

#' @section Theory:
#'
#' \strong{Modified FH:}
#' \deqn{F_{h_i} = \frac{\overline{Het}_{obs_{ij}} - \overline{Het}_{exp_j}}{\sum_{i}snp_{ij} - \overline{Het}_{exp_j}}}
#'
#' \strong{Individual Observed Heterozygosity averaged across markers:}
#' \deqn{\overline{Het}_{obs_i} = \frac{\sum_iHet_{obs_i}}{\sum_i{snp_i}}}
#'
#' \strong{Population expected Heterozygosity (under Hardy-Weinberg) and
#' tailored by averaging for each individual using his genotyped markers:}
#' \deqn{\overline{Het}_{exp_j} = \frac{\sum_jHet_{exp_j}}{\sum_j{snp_j}}}

#' @section Advance mode:
#'
#' \emph{dots-dots-dots ...} allows to pass several arguments for fine-tuning the function.
#' These arguments are described in \code{\link{tidy_genomic_data}}.
#' \itemize{
#' \item filter.monomorphic = TRUE (default)
#' \item filter.common.markers = FALSE. The argument for common markers
#' between populations is set by default to maximize genome coverage of
#' individuals and populations.
#' }

#' @examples
#' \dontrun{
#' # Using a  VCF file, the simplest for of the function:
#' fh <- ibdg_fh(data = "sturgeon.gds")
#'
#' # To see what's inside the list
#' names(fh)
#'
#' # To view the boxplot:
#' fh$fh.boxplot
#'
#' # To view the distribution of FH values:
#' fh$fh.distribution.plot
#' }

#' @references Keller MC, Visscher PM, Goddard ME (2011)
#' Quantification of inbreeding due to distant ancestors and its detection
#'  using dense single nucleotide polymorphism data. Genetics, 189, 237–249.
#' @references Kardos M, Luikart G, Allendorf FW (2015)
#' Measuring individual inbreeding in the age of genomics: marker-based
#' measures are better than pedigrees. Heredity, 115, 63–72.
#' @references Hedrick PW, Garcia-Dorado A. (2016)
#' Understanding Inbreeding Depression, Purging, and Genetic Rescue.
#' Trends in Ecology and Evolution. 2016; 31: 940-952.

#' @export
#' @rdname ibdg_fh
#' @author Thierry Gosselin \email{thierrygosselin@@icloud.com}

ibdg_fh <- function(
  data,
  path.folder = NULL,
  verbose = TRUE,
  ...
) {
  # Cleanup-------------------------------------------------------------------
  radiator_function_header(f.name = "ibdg_fh", verbose = verbose)
  file.date <- format(Sys.time(), "%Y%m%d@%H%M")
  if (verbose) message("Execution date@time: ", file.date)
  old.dir <- getwd()
  opt.change <- getOption("width")
  options(width = 70)
  timing <- radiator_tic()
  #back to the original directory and options
  on.exit(setwd(old.dir), add = TRUE)
  on.exit(options(width = opt.change), add = TRUE)
  on.exit(radiator_toc(timing), add = TRUE)
  on.exit(radiator_function_header(f.name = "ibdg_fh", start = FALSE, verbose = verbose), add = TRUE)

  if (verbose) cat("################################################################################\n")
  if (verbose) cat("############################### radiator::ibdg_fh ##############################\n")
  if (verbose) cat("################################################################################\n")

  # manage missing arguments ---------------------------------------------------
  if (missing(data)) rlang::abort("Input file missing")

  # Folders---------------------------------------------------------------------
  path.folder <- generate_folder(
    f = path.folder,
    rad.folder = "ibdg_fh",
    prefix_int = FALSE,
    internal = FALSE,
    file.date = file.date,
    verbose = verbose)

  # Detect format --------------------------------------------------------------
  data.type <- radiator::detect_genomic_format(data)
  if (!data.type %in% c("tbl_df", "fst.file", "SeqVarGDSClass", "gds.file")) {
    rlang::abort("Input not supported for this function: read function documentation")
  }

  if (data.type %in% c("SeqVarGDSClass", "gds.file")) {
    radiator_packages_dep(package = "SeqArray", cran = FALSE, bioc = TRUE)


    message("Importing gds file...")
    if (data.type == "gds.file") {
      data <- radiator::read_rad(data, verbose = verbose)
      data.type <- "SeqVarGDSClass"
    }

    tidy.data <- extract_genotypes_metadata(
      gds = data,
      genotypes.meta.select = c("MARKERS", "COL", "INDIVIDUALS", "POP_ID", "GT_BIN"))

    if (length(tidy.data) == 0) {
      tidy.data <- gds2tidy(gds = data)
    }
    SeqArray::seqClose(data)
    data <- tidy.data
    tidy.data <- NULL
  } else { #tidy.data
    if (is.vector(data)) data <- radiator::tidy_wide(data = data, import.metadata = TRUE)
  }


  # Detect if biallelic --------------------------------------------------------
  biallelic <- radiator::detect_biallelic_markers(data)

  # IBDg computations ----------------------------------------------------------
  message("Genome-Wide Identity-By-Descent calculations using FH...")
  if (rlang::has_name(data, "GT_BIN") && biallelic) {
    # Remove missing
    data %<>% dplyr::filter(!is.na(GT_BIN))

    freq <- data %>%
      dplyr::group_by(MARKERS, POP_ID) %>%
      dplyr::summarise(
        N = n(),
        HOM_ALT = length(GT_BIN[GT_BIN == 2]),
        HET = length(GT_BIN[GT_BIN == 1])
      ) %>%
      dplyr::mutate(
        FREQ_ALT = ((HOM_ALT * 2) + HET) / (2 * N),
        FREQ_REF = 1 - FREQ_ALT,
        HOM_E = (FREQ_REF^2) + (FREQ_ALT^2)
      )

    hom.e <- dplyr::full_join(
      data,
      dplyr::select(.data = freq, MARKERS, POP_ID, HOM_E)
      , by = c("MARKERS", "POP_ID")
    ) %>%
      dplyr::select(MARKERS, POP_ID, INDIVIDUALS, HOM_E) %>%
      dplyr::group_by(POP_ID, INDIVIDUALS) %>%
      dplyr::summarise(HOM_E = mean(HOM_E, na.rm = TRUE)) %>%
      dplyr::ungroup(.)

    fh <- data %>%
      dplyr::group_by(POP_ID, INDIVIDUALS) %>%
      dplyr::summarise(
        N = n(),
        HOM_REF = length(GT_BIN[GT_BIN == 0]),
        HOM_ALT = length(GT_BIN[GT_BIN == 2]),
        HOM = HOM_REF + HOM_ALT
      ) %>%
      dplyr::mutate(HOM_O = HOM / N) %>%
      dplyr::full_join(
        dplyr::select(
          .data = hom.e,
          INDIVIDUALS, POP_ID, HOM_E
        ), by = c("POP_ID", "INDIVIDUALS")
      ) %>%
      dplyr::mutate(FH = ((HOM_O - HOM_E)/(N - HOM_E))) %>%
      dplyr::ungroup(.) %>%
      dplyr::arrange(POP_ID, INDIVIDUALS)

    ind.levels <- fh$INDIVIDUALS
    fh %<>%
      dplyr::mutate(
        INDIVIDUALS = factor(INDIVIDUALS, levels = ind.levels, ordered = TRUE)
      )
  } else {
    if (!rlang::has_name(data, "GT")) {
      rlang::abort("GT genotype coding required for multi-allelic dataset")
    }
    # not biallelic
    input.alleles <- dplyr::select(.data = data, MARKERS, POP_ID, INDIVIDUALS, GT) %>%
      dplyr::filter(GT != "000000") %>%
      dplyr::mutate(
        A1 = stringi::stri_sub(GT, 1, 3),
        A2 = stringi::stri_sub(GT, 4,6)
      ) %>%
      dplyr::select(-GT)

    freq <- input.alleles %>%
      tidyr::pivot_longer(
        data = .,
        cols = -c("MARKERS", "INDIVIDUALS", "POP_ID"),
        names_to = "ALLELE_GROUP",
        values_to = "ALLELES"
      ) %>%
      dplyr::group_by(MARKERS, ALLELES, POP_ID) %>%
      dplyr::tally(.) %>%
      dplyr::ungroup(.) %>%
      tidyr::complete(data = ., POP_ID, tidyr::nesting(MARKERS, ALLELES), fill = list(n = 0)) %>%
      dplyr::group_by(MARKERS, POP_ID) %>%
      dplyr::mutate(
        FREQ = n/sum(n),
        HOM_E = FREQ^2
      ) %>%
      dplyr::summarise(HOM_E = sum(HOM_E)) %>%
      dplyr::ungroup(.)

    hom.e <- dplyr::full_join(
      dplyr::filter(.data = data, GT != "000000"),
      dplyr::select(.data = freq, MARKERS, POP_ID, HOM_E)
      , by = c("MARKERS", "POP_ID")
    ) %>%
      dplyr::select(MARKERS, POP_ID, INDIVIDUALS, HOM_E) %>%
      dplyr::group_by(POP_ID, INDIVIDUALS) %>%
      dplyr::summarise(HOM_E = mean(HOM_E, na.rm = TRUE)) %>%
      dplyr::ungroup(.)

    fh <- input.alleles %>%
      dplyr::group_by(POP_ID, INDIVIDUALS) %>%
      dplyr::summarise(
        N = n(),
        HOM = length(INDIVIDUALS[A1 == A2])
      ) %>%
      dplyr::mutate(HOM_O = HOM / N) %>%
      dplyr::full_join(dplyr::select(.data = hom.e, INDIVIDUALS, POP_ID, HOM_E), by = c("POP_ID", "INDIVIDUALS")) %>%
      dplyr::mutate(FH = ((HOM_O - HOM_E)/(N - HOM_E))) %>%
      dplyr::ungroup(.) %>%
      dplyr::arrange(POP_ID, INDIVIDUALS)

    ind.levels <- fh$INDIVIDUALS
    fh <- dplyr::mutate(.data = fh, INDIVIDUALS = factor(INDIVIDUALS, levels = ind.levels, ordered = TRUE))
  }


  # Write FH individuals
  write_rad(
    data = fh,
    path = path.folder,
    filename = "fh.individuals.tsv",
    tsv = TRUE, verbose = verbose)


  # FH statistics per pop
  fh.stats <- fh %>%
    dplyr::group_by(POP_ID) %>%
    dplyr::summarise(FH = mean(FH)) %>%
    dplyr::ungroup(.)

  # per pop and overall combined
  fh.stats <- tibble::add_row(
    .data = fh.stats,
    POP_ID = "OVERALL",
    FH = unlist(dplyr::summarise(.data = fh.stats, FH = mean(FH)))
  )
  # Write FH individuals
  write_rad(
    data = fh.stats,
    path = path.folder,
    filename = "fh.populations.tsv",
    tsv = TRUE, verbose = verbose)

  # plots ----------------------------------------------------------------------
  message("Generating plots")
  # fh.boxplot
  n.pop <- length(unique(fh$POP_ID))
  fh.box.plot <- ggplot2::ggplot(
    data = fh, ggplot2::aes(x = POP_ID, y = FH, colour = POP_ID)) +
    ggplot2::geom_jitter(alpha = 0.5) +
    ggplot2::geom_violin(trim = TRUE, fill = NA) +
    ggplot2::geom_boxplot(width = 0.1, fill = NA, outlier.colour = NA, outlier.fill = NA) +
    ggplot2::labs(
      y = "Individual IBDg (FH)",
      x = "Populations") +#colour = "Populations") +
    ggplot2::theme_bw() +
    ggplot2::theme(
      legend.position = "none",
      panel.grid.minor.x = ggplot2::element_blank(),
      panel.grid.major.y = ggplot2::element_blank(),
      axis.title.x = ggplot2::element_text(size = 10, family = "Helvetica", face = "bold"),
      axis.text.x = ggplot2::element_text(size = 10, family = "Helvetica"),
      axis.title.y = ggplot2::element_text(size = 10, family = "Helvetica", face = "bold"),
      axis.text.y = ggplot2::element_text(size = 10, family = "Helvetica")) +
    ggplot2::coord_flip()

  ggplot2::ggsave(
    limitsize = FALSE,
    plot = fh.box.plot,
    filename = file.path(path.folder, "fh.violin.plot.pdf"),
    width = max(10, n.pop * 2), height = 10,
    dpi = 300, units = "cm", useDingbats = FALSE)

  # Histogram
  fh.distribution.plot <- suppressWarnings(
    ggplot2::ggplot(data = fh, ggplot2::aes(x = FH)) +
      ggplot2::geom_histogram() +
      ggplot2::labs(x = "Individual IBDg (FH)") +
      ggplot2::labs(y = "Markers (number)") +
      ggplot2::theme(
        legend.position = "none",
        axis.title.y = ggplot2::element_text(size = 10, family = "Helvetica", face = "bold"),
        axis.text.y = ggplot2::element_text(size = 10, family = "Helvetica"),
        axis.title.x = ggplot2::element_text(size = 10, family = "Helvetica", face = "bold"),
        axis.text.x = ggplot2::element_text(size = 10, family = "Helvetica"),
        strip.text.x = ggplot2::element_text(size = 10, family = "Helvetica", face = "bold")
      ) +
      ggplot2::theme_bw()
  )
    # fh.distribution.plot
  ggplot2::ggsave(
    limitsize = FALSE,
    plot = fh.distribution.plot,
    filename = file.path(path.folder, "fh.distribution.plot.overall.pdf"),
    width = 15, height = 15,
    dpi = 300, units = "cm", useDingbats = FALSE)

  # Results --------------------------------------------------------------------
  res = list(
    fh = fh,
    fh.stats = fh.stats,
    fh.box.plot = fh.box.plot,
    fh.distribution.plot = fh.distribution.plot)
  return(res)
}
thierrygosselin/radiator documentation built on May 5, 2024, 5:12 a.m.