R/detect_het_outliers.R

Defines functions summarise_genotypes detect_het_outliers

Documented in detect_het_outliers summarise_genotypes

#' @title Detect heterozygotes outliers and estimate miscall rate
#' @description Explore departure from H-W equilibrium in bi-allelic RADseq data.
#' Highlight excess of homozygotes present in numeros RADseq studies.
#' The function estimate the genotyping error rate and heterozygote miscall rate.
#' The model focus on heterozygotes being
#' incorrectly called as homozygotes. See details below for more info.
#'
#' @param nreps (integer, optional) The number of MCMC sweeps to do.
#' Default: \code{nreps = 2000}.

#' @param burn.in (integer, optional) The number of MCMC burn-in reps.
#' With default, during execution, you will be asked to enter the nuber of burn-in.
#' For this, a plot showing the heterozygote miscall rate for all
#' the MCMC sweeps will be printed. This plot will help pinpoint the
#' number of burn-in. The remaining MCMC sweeps will be used
#' to average the heterozygote miscall rate.
#' e.g. of common value \code{burn.in = 500}.
#' With default: \code{burn.in = NULL}.

#' @inheritParams read_strata
#' @inheritParams tidy_genomic_data

# @param blacklist.markers (optional) Path to a file with markers to blacklist
# before generating the miscall rate. Usefull to test the impact of different
# HWE thresholds rapidly...
# Default: \code{blacklist.markers = NULL}.

#' @details
#' \strong{Before using the function:}
#' \enumerate{
#' \item Don't use raw RADseq data, this function will work best with filtered data
#' \item Remove duplicate \code{\link[radiator]{detect_duplicate_genomes}}.
#' \item Remove mixed samples \code{\link[radiator]{detect_mixed_genomes}}.
#' \item Look at other filters in radiator package...
#' }
#'
#' \strong{During import:}
#'
#' By default the function will keep only polymorphic markers and markers common
#' between all populations. If you supply a tidy data frame or a \code{.rad} file,
#' the function skip all the filters, pop selection, etc. It will however scan and
#' remove monomorphic markers automatically.
#'
#' \strong{Keep track of the data:}
#'
#' Use the argument filename to write the imported (and maybe further filtered)
#' tidy genomic data set inside the folder. The filename will be automatically
#' appended \code{.rad} to it. This file can be used again directly inside this
#' function and other radiator functions. See \code{\link[radiator]{read_rad}}.


#' @return A folder generated automatically with date and time,
#' the file \code{het.summary.tsv} contains the summary statistics. The file
#' \code{markers.genotypes.boundaries.pdf} is the plot with boundaries.
#' The overall genotyping and heterozygotes miscall rate is writen in the file
#' \code{overall_error_rate.tsv}.
#' The function also returns a list inside the global environment with
#' 8 objects:
#'
#' \enumerate{
#' \item input the input data, cleaned if filters were used during import.
#' \item outlier.summary a list with a tibble and plot of genotypes frequencies
#' and boundaries (also written in the folder).
#' \item summary.alt.allele a tibble summarizing the number of markers with:
#' \itemize{
#' \item no homozygote for the alternate allele (NO_HOM_ALT)
#' \item no heterozygote genotype (NO_HET)
#' \item one homozygote for the alternate allele(ONE_HOM_ALT)
#' \item one heterozygote genotype (ONE_HET)
#' \item one homozygote for the alternate allele only (ONE_HOM_ALT_ONLY)
#' \item one heterozygote genotype only (ONE_HET_ONLY)
#' \item one homozygote for the alternate allele and one heterozygote genotype only (ONE_HOM_ALT_ONE_HET_ONLY)
#' }
#' \item m.nreps A tibble with the heterozygote miscall rate for each MCMC replicate
#' \item overall.genotyping.error.rate The overall genotyping error rate
#' \item overall.m The overall heterozygote miscall rate
#' \item simmed_genos The simulated genotypes
#' }
#'
#' The statistics are summarized per population and overall,
#' the grouping is found in the last column called \code{POP_ID}.
#'

#' @examples
#' \dontrun{
#' het.prob <- radiator::detect_het_outliers(
#' data = "tuna.vcf", strata = "tuna.strata.tsv", nreps = 2000)
#' }


#' @rdname detect_het_outliers
#' @export


#' @author Eric Anderson \email{eric.anderson@noaa.gov} and
#' Thierry Gosselin \email{thierrygosselin@@icloud.com}

detect_het_outliers <- function(
  data,
  nreps = 2000,
  burn.in = NULL,
  strata = NULL,
  filename = NULL,
  parallel.core = parallel::detectCores() - 1,
  ...
) {
  # TEST
  # nreps = 2000
  # burn.in = NULL
  # strata = NULL
  # filename = NULL
  # parallel.core = parallel::detectCores() - 1

  # Cleanup-------------------------------------------------------------------
  verbose <- TRUE
  radiator_function_header(f.name = "detect_het_outliers", verbose = verbose)
  file.date <- format(Sys.time(), "%Y%m%d@%H%M")
  if (verbose) message("Execution date@time: ", file.date)
  old.dir <- getwd()
  opt.change <- getOption("width")
  options(width = 70)
  timing <- radiator_tic()
  #back to the original directory and options
  on.exit(setwd(old.dir), add = TRUE)
  on.exit(options(width = opt.change), add = TRUE)
  on.exit(radiator_toc(timing), add = TRUE)
  on.exit(radiator_function_header(f.name = "detect_het_outliers", start = FALSE, verbose = verbose), add = TRUE)
  res <- list() # to store results



  # manage missing arguments
  if (missing(data)) stop("missing data argument")

  # folder ---------------------------------------------------------------------
  # Get date and time to have unique filenaming

  folder.extension <- stringi::stri_join("detect_het_outliers_", file.date, sep = "")
  path.folder <- file.path(getwd(), folder.extension)
  dir.create(path.folder)
  message(stringi::stri_join("Folder created: \n", folder.extension))
  file.date <- NULL #unused object

  # import data ----------------------------------------------------------------
  message("Importing data ...")

  # if filename argument present, add path.folder to it
  if (!is.null(filename)) filename <- file.path(path.folder, filename)

  data.type <- radiator::detect_genomic_format(data)

  if (data.type %in% c("tbl_df", "fst.file")) {
    want <- c("MARKERS", "CHROM", "LOCUS", "POS", "INDIVIDUALS", "POP_ID", "REF", "ALT", "GT", "GT_BIN")
    message("    using tidy data frame of genotypes as input")
    message("    skipping all filters except removal of monomorphic markers")
    res$input <- radiator::tidy_wide(data = data, import.metadata = TRUE) %>%
        dplyr::select(tidyselect::any_of(want))

  # Keeping common markers -----------------------------------------------------
  res$input <- radiator::filter_common_markers(data = res$input, verbose = TRUE)


  # Removing monomorphic markers -----------------------------------------------
  res$input <- radiator::filter_monomorphic(data = res$input, verbose = TRUE)


} else {
  res$input <- radiator::tidy_genomic_data(
    data = data,
    vcf.metadata = FALSE,
    filter.monomorphic = TRUE,
    filter.common.markers = TRUE,
    strata = strata,
    # pop.select = pop.select,
    filename = filename,
    parallel.core = parallel.core,
    verbose = FALSE
  )
}
data <- NULL # no need to store this extra data

# Quick check that dataset is biallelic --------------------------------------
biallelic <- detect_biallelic_markers(res$input)
if (!biallelic) stop("Analysis requires a biallelic dataset")

# Generate the summary statistics and plot -----------------------------------
message("\nGenerating genotypes summary statistics and plot with boundaries...")
res$outlier.summary <- plot_het_outliers(data = res$input, path.folder = path.folder)

res$summary.alt.allele <-  dplyr::ungroup(res$outlier.summary$het.summary) %>%
  dplyr::filter(POP_ID == "OVERALL") %>%
  dplyr::summarise(
    TOTAL = n(),
    NO_HOM_ALT = length(MARKERS[HOM_ALT == 0]),
    NO_HET = length(MARKERS[HET == 0]),
    ONE_HOM_ALT = length(MARKERS[HOM_ALT == 1]),
    ONE_HET = length(MARKERS[HET == 1]),
    ONE_HOM_ALT_ONLY = length(MARKERS[HOM_ALT == 1 & HET == 0]),
    ONE_HET_ONLY = length(MARKERS[HOM_ALT == 0 & HET == 1]),
    ONE_HOM_ALT_ONE_HET_ONLY = length(MARKERS[HOM_ALT == 1 & HET == 1])
  ) %>%
  tidyr::pivot_longer(
    data = .,
    cols = tidyselect::everything(),
    names_to = "MARKERS",
    values_to = "NUMBERS"
  ) %>%
  dplyr::mutate(PROPORTION = NUMBERS / (NUMBERS[MARKERS == "TOTAL"]))

# Estimate heterozygotes miscall rate -------------------------------------------
message("\nCalculating heterozygotes miscall rate...")

mest <- estimate_m(data = res$input, nreps = nreps, m_init = 0.1)
res$m.nreps <- tibble::tibble(iter = 1:nreps, m = mest$m)
res$simmed_genos <- mest$simmed_genos
res$overall.genotyping.error.rate <- tibble::tibble(overall.genotyping.error.rate = mest$overall_geno_err_est)
mest <- NULL


res$trace.mcmc.plot <- ggplot2::ggplot(res$m.nreps, ggplot2::aes(x = iter, y = m)) +
  ggplot2::geom_line() +
  ggplot2::labs(y= "Heterozygote miscall rate", x = "Number of MCMC sweeps") +
  ggplot2::theme(
    legend.position = "none",
    axis.title.x = ggplot2::element_text(size = 10, face = "bold"),
    axis.text.x = ggplot2::element_text(size = 10),
    axis.title.y = ggplot2::element_text(size = 10, face = "bold"),
    axis.text.y = ggplot2::element_text(size = 10)
  ) +
  ggplot2::theme_bw()
if (!is.null(path.folder)) {
  ggplot2::ggsave(
    filename = file.path(path.folder, "trace.mcmc.plot.png"),
    plot = res$trace.mcmc.plot,
    width = 20,
    height = 10,
    dpi = 200,
    units = "cm",
    # useDingbats = FALSE,
    limitsize = FALSE)
}

print(res$trace.mcmc.plot)

if (is.null(burn.in)) {
  message("    The plot shows the heterozygote miscall rate for all the MCMC sweeps")
  message("    after the burn-in, the remaining MCMC sweeps will be used
    to average the heterozygote miscall rate")
  message("\n    enter the max number of burn-in reps:")

  burn.in <- as.integer(readLines(n = 1))
}

res$m.post.means <- dplyr::filter(res$m.nreps, iter > burn.in) %>%
  dplyr::summarise(POSTERIOR_MEAN = mean(m))

tibble::tibble(
  OVERALL_GENOTYPING_ERROR_RATE = res$overall.genotyping.error.rate$overall.genotyping.error.rate,
  OVERALL_HETEROZYGOTES_MISCALL_RATE = res$m.post.means$POSTERIOR_MEAN) %>%
  readr::write_tsv(x = ., file = file.path(path.folder, "overall_error_rate.tsv"))

message("\n    Overall genotyping error rate = ", round(res$overall.genotyping.error.rate, digits = 4))
message("    Overall heterozygotes miscall rate = ", round(res$m.post.means, digits = 4))
return(res)
}#End detect_het_outliers


# Internal nested functions ----------------------------------------------------
#' @title summarise genotypes
#' @description Function that summarise genotypes info
#' @rdname summarise_genotypes
#' @keywords internal
#' @export
#' @author Thierry Gosselin \email{thierrygosselin@@icloud.com}

summarise_genotypes <- function(data, path.folder = NULL) {
  if (is.null(path.folder)) path.folder <- getwd()

  # data.bk <- data
  # data <- data.bk
  data %<>% dplyr::rename(POP_ID = tidyselect::any_of("STRATA"))
  want <- c("MARKERS", "POP_ID", "INDIVIDUALS", "GT_BIN", "READ_DEPTH")
  data %<>% dplyr::select(tidyselect::any_of(want))

  n.pop <- length(unique(data$POP_ID))
  if (is.factor(data$POP_ID)) {
    pop.levels <- c(levels(data$POP_ID), "OVERALL")
  } else {
    pop.levels <- c(sort(unique(data$POP_ID)), "OVERALL")
  }
  data$POP_ID <- as.character(data$POP_ID)
  replace_zero <- function(x) replace(x = x, list = which(is.na(x)), 0)

  # sum of read depth per pop and overall
  # scaled separately for pop and overall before merging
  if (rlang::has_name(data, "READ_DEPTH")) {
    rd.pop <- dplyr::select(data, MARKERS, POP_ID, READ_DEPTH) %>%
      dplyr::group_by(MARKERS, POP_ID) %>%
      dplyr::summarise(READ_DEPTH = sum(READ_DEPTH, na.rm = TRUE)) %>%
      dplyr::ungroup(.) %>%
      dplyr::mutate(READ_DEPTH_SCALED = READ_DEPTH / max(READ_DEPTH, na.rm = TRUE))

    rd <- dplyr::bind_rows(
      dplyr::select(rd.pop, MARKERS, POP_ID, READ_DEPTH = READ_DEPTH_SCALED),
      dplyr::mutate(rd.pop, POP_ID = "OVERALL") %>%
        dplyr::group_by(MARKERS, POP_ID) %>%
        dplyr::summarise(READ_DEPTH = sum(READ_DEPTH, na.rm = TRUE)) %>%
        dplyr::ungroup(.) %>%
        dplyr::mutate(READ_DEPTH = READ_DEPTH / max(READ_DEPTH, na.rm = TRUE))
    ) %>%
      dplyr::arrange(MARKERS, POP_ID) %>%
      dplyr::select(-MARKERS, -POP_ID)
    rd.pop <- NULL
  } else {
    rd <- NULL
  }

  pop <- data %>%
    dplyr::mutate(
      GT_BIN = dplyr::case_when(
        GT_BIN == 0 ~ "HOM_REF",
        GT_BIN == 1 ~ "HET",
        GT_BIN == 2 ~ "HOM_ALT",
        is.na(GT_BIN) ~ "MISSING")
    ) %>%
    dplyr::group_by(MARKERS, POP_ID, GT_BIN) %>%
    dplyr::tally(.) %>%
    data.table::as.data.table(.) %>%
    data.table::dcast.data.table(
      data = .,
      formula = MARKERS + POP_ID ~ GT_BIN,
      value.var = "n"
    ) %>%
    tibble::as_tibble(.) %>%
    dplyr::mutate(dplyr::across(where(is.integer), .fns = replace_zero))

  if (!rlang::has_name(pop, "HET")) {
    pop %<>% dplyr::mutate(HET = 0)
  }
  if (!rlang::has_name(pop, "HOM_REF")) {
    pop %<>% dplyr::mutate(HOM_REF = 0)
  }
  if (!rlang::has_name(pop, "HOM_ALT")) {
    pop %<>% dplyr::mutate(HOM_ALT = 0)
  }
  pop %<>% dplyr::mutate(N = HOM_REF + HET + HOM_ALT)

  if (!rlang::has_name(pop, "MISSING")) {
    pop <- dplyr::mutate(pop, MISSING = as.integer("0"))
  }

  data <- dplyr::bind_rows(
    pop,
    dplyr::mutate(pop, POP_ID = "OVERALL") %>%
      dplyr::group_by(MARKERS, POP_ID) %>%
      dplyr::summarise(dplyr::across(.cols = tidyselect::everything(), .fns = sum))
    ) %>%
    dplyr::arrange(MARKERS, POP_ID)
  pop <- NULL

  data  %<>%
    dplyr::mutate(
      FREQ_ALT = ((HOM_ALT * 2) + HET) / (2 * N),
      FREQ_REF = 1 - FREQ_ALT,
      FREQ_HET = HET / (2 * N),
      FREQ_HOM_REF_O = HOM_REF / N,
      FREQ_HET_O = HET / N,
      FREQ_HOM_ALT_O = HOM_ALT / N,
      FREQ_HOM_REF_E = FREQ_REF^2,
      FREQ_HET_E = 2 * FREQ_REF * FREQ_ALT,
      FREQ_HOM_ALT_E = FREQ_ALT^2,
      N_HOM_REF_EXP = N * FREQ_HOM_REF_E,
      N_HET_EXP = N * FREQ_HET_E,
      N_HOM_ALT_EXP = N * FREQ_HOM_ALT_E,
      HOM_REF_Z_SCORE = (HOM_REF - N_HOM_REF_EXP) / sqrt(N * FREQ_HOM_REF_E * (1 - FREQ_HOM_REF_E)),
      HOM_HET_Z_SCORE = (HET - N_HET_EXP) / sqrt(N * FREQ_HET_E * (1 - FREQ_HET_E)),
      HOM_ALT_Z_SCORE = (HOM_ALT - N_HOM_ALT_EXP) / sqrt(N * FREQ_HOM_ALT_E * (1 - FREQ_HOM_ALT_E))
    ) %>%
    dplyr::bind_cols(rd) %>%
    dplyr::mutate(POP_ID = factor(POP_ID, levels = pop.levels, ordered = TRUE))

  rd <- NULL

  readr::write_tsv(x = data, file = file.path(path.folder, "genotypes.summary.tsv"))
  return(data)
}#End summarise_genotypes

# Generate genotypes frequencies boundaries

#' @title plot_het_outliers
#' @description Function that calculates genotypes observed and expected frequencies
#' returns a tibble with all the summary info and a plot with boundaries.
#' @rdname plot_het_outliers
#' @keywords internal
#' @export
#' @author Eric Anderson \email{eric.anderson@noaa.gov} and
#' Thierry Gosselin \email{thierrygosselin@@icloud.com}

plot_het_outliers <- function(data, path.folder = NULL) {
  res <- list() # to store results
  n.pop <- dplyr::n_distinct(data$POP_ID)
  res$het.summary <- summarise_genotypes(data, path.folder = path.folder)

  # prepare data for figure
  freq.summary <- dplyr::bind_cols(
    res$het.summary %>%
      dplyr::select(MARKERS, POP_ID, HOM_REF = FREQ_HOM_REF_O, HET = FREQ_HET_O, HOM_ALT = FREQ_HOM_ALT_O) %>%
      tidyr::pivot_longer(
        data = .,
        cols = -c("POP_ID", "MARKERS"),
        names_to = "GENOTYPES",
        values_to = "OBSERVED"
      ) %>%
      dplyr::arrange(MARKERS, POP_ID),
    res$het.summary %>%
      dplyr::select(MARKERS, POP_ID, HOM_REF = FREQ_HOM_REF_E, HET = FREQ_HET_E, HOM_ALT = FREQ_HOM_ALT_E) %>%
      tidyr::pivot_longer(
        data = .,
        cols = -c("POP_ID", "MARKERS"),
        names_to = "GENOTYPES",
        values_to = "EXPECTED"
      ) %>%
      dplyr::arrange(MARKERS, POP_ID) %>%
      dplyr::select(EXPECTED)
  ) %>%
    dplyr::mutate(GENOTYPES = factor(
      GENOTYPES,
      levels = c("HOM_REF", "HET", "HOM_ALT"),
      labels = c("Homozygote REF allele", "Heterozygote", "Homozygote ALT allele"),
      ordered = TRUE))

  # generate boundaries
  boundaries <- generate_geno_freq_boundaries() %>%
    dplyr::mutate(
      GENOTYPES = factor(
        GENOTYPES,
        levels = c("Homozygote REF allele", "Heterozygote", "Homozygote ALT allele")))

  res$gt.boundaries.plot <- ggplot2::ggplot(freq.summary , ggplot2::aes(x = EXPECTED, y = OBSERVED, colour = GENOTYPES)) +
    ggplot2::geom_jitter(alpha = 0.1, width = 0.01, height = 0.01) +
    ggplot2::geom_polygon(data = boundaries, fill = NA, linetype = "dashed", colour = "black") +
    ggplot2::geom_abline(slope = 1, intercept = 0, linetype = "solid") +
    ggplot2::labs(x = "Genotypes (expected frequency) ", y = "Genotypes (observed frequency)") +
    ggplot2::theme(
      legend.position = "none",
      axis.title.x = ggplot2::element_text(size = 10, face = "bold"),
      axis.text.x = ggplot2::element_text(size = 10),
      axis.title.y = ggplot2::element_text(size = 10, face = "bold"),
      axis.text.y = ggplot2::element_text(size = 10)
    ) +
    ggplot2::theme_bw() +
    ggplot2::facet_grid(POP_ID ~ GENOTYPES)

  if (!is.null(path.folder)) {
    ggplot2::ggsave(
      filename = file.path(path.folder, "markers.genotypes.boundaries.png"),
      plot = res$gt.boundaries.plot,
      width = 20,
      height = (n.pop + 1) * 5,# + 1 for overall always present
      dpi = 200, units = "cm",
      # useDingbats = FALSE,
      limitsize = FALSE)
  }
  return(res)
}#End plot_het_outliers


#' @title generate_geno_freq_boundaries
#' @description Function that returns a tibble with the min/max values of
#' genotype freqs possible.
#'
#' These mins and maxes occur because the genotypes are used to estimate
#' the allele frequencie.
#' @rdname generate_geno_freq_boundaries
#' @keywords internal
#' @export
#' @author Eric Anderson \email{eric.anderson@noaa.gov}

generate_geno_freq_boundaries <- function() {
  # first do it for the homozygote category
  Poe <- seq(0,1, by = 0.005)
  phat <- 1 - sqrt(Poe)
  minPo <- pmax(0, 1 - 2 * phat)
  maxPo <- 1 - phat

  # these are the values for the two homozygote categories
  homo_tib <- tibble::tibble(
    EXPECTED = rep(Poe, 4),
    OBSERVED = rep(c(minPo, maxPo), 2),
    GENOTYPES = as.character(rep(c("Homozygote REF allele", "Homozygote ALT allele"), each = length(Poe) * 2)))

  # now, it should be easy to get the max/min values for heterozygotes.
  # They will occur where one of the homozygotes is min or max.
  P1e <- 2 * phat * (1 - phat)
  maxP1 <- 2 * (1 - phat - minPo)
  minP1 <- 2 * (1 - phat - maxPo)

  het_tib <- tibble::tibble(
    EXPECTED = rep(P1e, 2),
    OBSERVED = c(minP1, maxP1),
    GENOTYPES = "Heterozygote")

  dplyr::bind_rows(homo_tib, het_tib)
}#End generate_geno_freq_boundaries

#' @title heterozygotes miscall rate
#' @description estimate the heterozygotes miscall rate from a simple model
#' @param data A tidy data frame (biallelic only).
#' @param nreps number of MCMC sweeps to do.
#' Default: \code{nreps = 200}.
#' @param m_init initial starting value for m must be between 0 and 1
#' @param a0 beta parameter for reference alleles
#' @param a1 beta parameter for alternate alleles
#' @param sm standard devation of proposal distribution for m
#' @rdname estimate_m
#' @keywords internal
#' @export
#' @author Eric Anderson \email{eric.anderson@noaa.gov}

estimate_m <- function(
  data,
  nreps = 200,
  m_init = stats::runif(1),
  a0 = 0.5,
  a1 = 0.5,
  sm = 0.005
) {
  D <- dplyr::select(data, INDIVIDUALS, MARKERS, GT_BIN) %>%
    dplyr::group_by(INDIVIDUALS) %>%
    tidyr::pivot_wider(data = ., names_from = "MARKERS", values_from = "GT_BIN") %>%
    dplyr::ungroup(.) %>%
    dplyr::select(-INDIVIDUALS) %>%
    as.matrix(.)

  stopifnot(m_init > 0 & m_init < 1)

  D[is.na(D)] <- -1

  # get the N variables
  N0 <- colSums(D == 0)
  N1 <- colSums(D == 1)
  N2 <- colSums(D == 2)

  # initialize the Zs to the Ns
  Z0 <- N0
  Z1 <- N1
  Z2 <- N2

  # make some place to return the m values visited
  m <- rep(NA, nreps)
  m[1] <- m_init

  # then do the sweeps
  for (r in 2:nreps) {

    # new estimate of frequency of the "1" allele from Gibbs sampling
    p <- stats::rbeta(n = length(Z0),
                      shape1 = a1 + 2 * Z2 + Z1,
                      shape2 = a0 + 2 * Z0 + Z1)

    # propose then accept or reject a new value for m
    mprop <- m[r - 1] + stats::rnorm(1, 0, sm)
    reject <- TRUE  # reject it unless we don't
    if (mprop > 0 & mprop < 1) {
      numer <- sum(N0 * log((1 - p)^2 + mprop * p * (1 - p)) +
                     N1 * log((1 - mprop) * 2 * p * (1 - p)) +
                     N2 * log(p ^ 2 + mprop * p * (1 - p)))
      denom <- sum(N0 * log((1 - p)^2 + m[r - 1] * p * (1 - p)) +
                     N1 * log((1 - m[r - 1]) * 2 * p * (1 - p)) +
                     N2 * log(p ^ 2 + m[r - 1] * p * (1 - p)))
      if (log(stats::runif(1)) < numer - denom) {
        reject <- FALSE
      }
    }
    if (reject == FALSE) {
      m[r] <- mprop
    } else {
      m[r] <- m[r - 1]
    }

    # new values for Z from Gibbs sampling
    A0 <- stats::rbinom(n = length(N0), size = N0, prob = (m[r] * p) / (1 - p + m[r] * p))
    A2 <- stats::rbinom(n = length(N2), size = N2, prob = (m[r] * (1 - p)) / (p + m[r] * (1 - p)))

    Z0 <- N0 - A0
    Z1 <- N1 + A0 + A2
    Z2 <- N2 - A2

  }
  # return m, and eventually I need to also return the final Zs and the Ns
  # and I may as well return a new 012 file with "corrected" genotypes, which
  # I can make by broadcasting the Zs around, for example...

  # inferring/realizing/simulating genotypes. I can simulate these from their posterior
  # given the estimated allele freq and the observed genotype.  To do this I will cycle
  # over the columns (the snps) in D, and for each one, I will compute the posterior of the
  # the genotype given the observed genotype (only have to for 0's and 2's) and then I will
  # sample from those posteriors.  We have a separate function that does this
  ret <- list()
  ret$simmed_genos <- simulate_genos_from_posterior(D, p, m[nreps])

  # compute an overall genotyping error rate
  diff <- ret$simmed_genos != D
  diff[D == -1] <- NA
  ret$overall_geno_err_est <- mean(diff, na.rm = TRUE)

  # return the trace of m values
  ret$m <- m

  ret
}#End estimate_m

#' @title simulate_genos_from_posterior
#' @description simulate values for the genotypes given the observed genotype,
#' then estimate allele frequencies, and the genotyping error rate.
#'
#' This is a helper function for the estimate_m function
#' @param D an 012,-1 matrix of observed genotypes
#' @param p the estimated allele freqs
#' @param m the genotyping error rate (must be a scalar)
#' @rdname simulate_genos_from_posterior
#' @keywords internal
#' @export
#' @author Eric Anderson \email{eric.anderson@noaa.gov}

simulate_genos_from_posterior <- function(D, p, m) {
  stopifnot(length(m) == 1)

  glist <- lapply(1:ncol(D), function(i) {
    obs <- D[, i] # the observed genotypes
    pl <- p[i]  # the alle freq at locus i
    post0 <- c(
      (1 - pl) / (1 - pl + m * pl),  # posterior that observed 0 is truly a 0
      (m * pl) / (1 - pl + m * pl)   # posterior that observed 0 is truly a 1
    )
    post2 <- c(
      (m * (1 - pl)) / (pl + m * (1 - pl)),  # posterior that observed 2 is truly a 1
      pl / (pl + m * (1 - pl))               # posterior that observed 2 is truly a 2
    )
    obs[obs == 0] <- sample(x = c(0, 1), size = sum(obs == 0), replace = TRUE, prob = post0)
    obs[obs == 2] <- sample(x = c(1, 2), size = sum(obs == 2), replace = TRUE, prob = post2)
    obs
  })

  # then turn it into a matrix with the same dimensions and dimnames as D
  ret <- matrix(unlist(glist), nrow = nrow(D))
  dimnames(ret) <- dimnames(D)
  ret
}#End simulate_genos_from_posterior
thierrygosselin/radiator documentation built on May 5, 2024, 5:12 a.m.