R/detect_paralogs.R

Defines functions detect_paralogs

Documented in detect_paralogs

# detect paralogs
#' @title Detect paralogs
#' @description The method uses the proportion of heterozygotes
#' and deviations in read ratios described in McKinney et al. 2017 to highlight
#' potential paralogous markers. Additionally, the function produces the HD
#' figure with marker's missingness accounted for.
#'
#' \emph{Note:} Violating Assumptions/Prerequisites (see below) can lead to
#' false positive or the absence of detection of paralogous markers.

#' @section Assumptions/Prerequisites:
#' \enumerate{
#' \item \strong{read the paper:}
#' Read \href{https://onlinelibrary.wiley.com/doi/abs/10.1111/1755-0998.12613}{Garrett McKinney}
#' paper, twice. It's good.
#' Reference is below.
#' \item \strong{Depth/Coverage information:} The function requires read depth
#' information for each alleles (REF/ALT). This info will not be found in genepop
#' or structure file format. You usually need a VCF file for this.
#' \emph{stacks, ANGSD, GATK, platypus, ipyrad, DArT, VCFTools, PLINK and freebayes}
#' are software producing the required output.
#' \item \strong{Batch effect:} You don't have any batch effect, e.g. with
#' sequencer id, library, sampling sites, tissue preservation,
#' person behind the wet-lab bench conducting the DNA extraction
#' (operating the pipette, etc.).
#' \item \strong{Normalization:} Any form of normalization, before or after
#' sequencing, as the potential to bias paralogs discovery.
#' This include: combining lanes or chips to increase individual coverage,
#' normalizing individuals FASTQ files, and of course normalizing read depth.
#' \item \strong{Identity-by-Missingness:} Absence of artifactual pattern
#' of missingness (\href{https://github.com/thierrygosselin/grur}{see missing visualization})
#' \item \strong{Low genotyping error rate:} see \code{\link{detect_het_outliers}}
#' and \href{https://github.com/eriqande/whoa}{whoa}. You can also use:
#' \code{\link{filter_hwe}} with very low threshold to discard markers with
#' obvious genotyping errors.
#' \item \strong{Low heterozygosity miscall rate:} see \code{\link{detect_het_outliers}} and
#' \href{https://github.com/eriqande/whoa}{whoa}.
#' \item \strong{Absence of pattern of heterozygosity driven by missingness:}
#' see \code{\link{detect_mixed_genomes}}. Don't use this function if the
#' individual heterozygosity is not under control. If your data is not uniform
#' inside this function, don't expect mush of \code{detect_paralogs}.
#' \item Use unfiltered dataset for fun only.
#' \item Use this function preferably on filtered data, check
#' \code{\link[radiator]{filter_rad}}.
#' \item \strong{Sex-biased sampling and sex markers} will generate
#' false-positive with this function.
#' Use \code{\link{sexy_markers}} function before!
#' }

#' @param data (2 options) A Genomic Data Structure (GDS) file or object
#' generated by radiator.
#'
#' \emph{How to get GDS data ?}
#' Look into \code{\link{tidy_genomic_data}},
#' \code{\link{read_vcf}} or
#' \code{\link{tidy_vcf}}.


#' @inheritParams radiator_common_arguments

#' @return The function returns the GDS data inside the global environment.
#' In the directory generated with \code{detect_paralogs} in it, the function
#' writes a tibble with the summary and the HD plot.
#'
#' The tibble as 1 line per SNPs/MARKERS and columns are:
#' \itemize{
#' \item VARIANT_ID: the GDS SNP id, to have all the markers metadata use
#' \code{\link{extract_markers_metadata}}.
#' \item NUMBER_HET: the number of heterozygote genotype.
#' \item NUMBER_MISSING: the number of missing genotype.
#' \item NUMBER_GENOTYPED: the number of individual genotyped for the marker.
#' \item TOTAL: the total number of sample for this marker.
#' \item DEPTH_REF: the sum of the REF allele depth/coverage.
#' \item DEPTH_ALT: the sum of the ALT allele depth/coverage.
#' \item MISSING_PROP: Missing genotype proportion.
#' \item HET_PROP: the proportion if heterozygote sample.
#' \item TOTAL_COUNTS: Total coverage/counts for the marker.
#' \item RATIO: the allele ratio.
#' \item Z_SCORE: the z-score (deviation in read ratio).
#' }
#'
#'
#' @rdname detect_paralogs
#' @export
#' @examples
#' \dontrun{
#' # require(httr)
#' # using the Chinook VCF dataset in dryad, from McKinney et al. 2017 paper:
#' writeBin(object = httr::content(x = httr::GET(
#' url = "https://datadryad.org/bitstream/handle/10255/dryad.123464/
#' Chinook_sequenceReads.vcf?sequence=1"),
#' as = "raw"), con = "chinook.vcf")
#'
#' # Import VCF and run the function
#' chinook.paralogs <- radiator::read_vcf(data = "chinook.vcf", vcf.stats = FALSE) %>%
#'     radiator::detect_paralogs(data = .)
#' }

#' @references McKinney GJ, Waples RK, Seeb LW, Seeb JE (2017)
#' Paralogs are revealed by proportion of heterozygotes and deviations in read
#' ratios in genotyping-by-sequencing data from natural populations.
#' Molecular Ecology Resources, 17, 656-669.

#' @author Thierry Gosselin \email{thierrygosselin@@icloud.com}

detect_paralogs <- function(
  data,
  parallel.core = parallel::detectCores() - 1,
  verbose = TRUE,
  ...
) {

  ## Test
  # verbose = TRUE
  # path.folder <- NULL
  # parameters = NULL
  # parallel.core = parallel::detectCores() - 1

  # Cleanup-------------------------------------------------------------------
  obj.keeper <- c(ls(envir = globalenv()), "data")
  radiator_function_header(f.name = "detect_paralogs", verbose = verbose)
  file.date <- format(Sys.time(), "%Y%m%d@%H%M")
  if (verbose) message("Execution date@time: ", file.date)
  old.dir <- getwd()
  opt.change <- getOption("width")
  options(width = 70)
  timing <- radiator_tic()
  #back to the original directory and options
  on.exit(setwd(old.dir), add = TRUE)
  on.exit(options(width = opt.change), add = TRUE)
  on.exit(radiator_toc(timing), add = TRUE)
  on.exit(radiator_function_header(f.name = "detect_paralogs", start = FALSE, verbose = verbose), add = TRUE)
  on.exit(rm(list = setdiff(ls(envir = sys.frame(-1L)), obj.keeper), envir = sys.frame(-1L)))

  # required package -----------------------------------------------------------
  radiator_packages_dep(package = "SeqArray", cran = FALSE, bioc = TRUE)

  # Function call and dotslist -------------------------------------------------
  rad.dots <- radiator_dots(
    func.name = as.list(sys.call())[[1]],
    fd = rlang::fn_fmls_names(),
    args.list = as.list(environment()),
    dotslist = rlang::dots_list(..., .homonyms = "error", .check_assign = TRUE),
    keepers = c("path.folder", "parameters"),
    verbose = FALSE
  )

  # Checking for missing and/or default arguments ------------------------------
  if (missing(data)) rlang::abort("data is missing")

  # Folders---------------------------------------------------------------------
  path.folder <- generate_folder(
    f = path.folder,
    rad.folder = "detect_paralogs",
    internal = FALSE,
    file.date = file.date,
    verbose = verbose)

  # write the dots file
  write_rad(
    data = rad.dots,
    path = path.folder,
    filename = stringi::stri_join("radiator_detect_paralogs_args_", file.date, ".tsv"),
    tsv = TRUE,
    internal = FALSE,
    verbose = verbose
  )

  # Detect format --------------------------------------------------------------
  data.type <- radiator::detect_genomic_format(data)

  if (!data.type %in% c("SeqVarGDSClass", "gds.file")) {
    rlang::abort("For now, only GDS file or object works with this function")
  }
  if (data.type == "gds.file") {
    data <- radiator::read_rad(data, verbose = verbose)
    data.type <- "SeqVarGDSClass"
  }

  # Filter parameter file: generate and initiate -------------------------------
  filters.parameters <- radiator_parameters(
    generate = TRUE,
    initiate = TRUE,
    update = FALSE,
    parameter.obj = parameters,
    data = data,
    path.folder = path.folder,
    file.date = file.date,
    internal = FALSE,
    verbose = FALSE)

  # For PC change parallel.core ------------------------------------------------
  if (Sys.info()[['sysname']] == "Windows") parallel.core <- 1

  # Get the variant id
  variants <- SeqArray::seqGetData(gdsfile = data, var.name = "variant.id")

  # get the number of heterozygote samples per marker --------------------------
  if (verbose) message("Counting the number of heterozygote")
  het <- SeqArray::seqApply(
    gdsfile = data,
    var.name = "$dosage_alt",
    FUN = function(g) length(g[g == 1]),
    margin = "by.variant",
    as.is = "integer",
    parallel = parallel.core)

  # get the number of missing samples per marker -------------------------------
  if (verbose) message("Calculating missingness")
  missing <- SeqArray::seqApply(
    gdsfile = data,
    var.name = "$dosage_alt",
    FUN = function(g) length(g[is.na(g)]),
    margin = "by.variant",
    as.is = "integer",
    parallel = parallel.core)

  # get the number of genotyped samples per marker -----------------------------
  genotyped <- SeqArray::seqApply(
    gdsfile = data,
    var.name = "$dosage_alt",
    FUN = function(g) length(g[!is.na(g)]),
    margin = "by.variant",
    as.is = "integer",
    parallel = parallel.core)

  # tibble summary -------------------------------------------------------------
  markers.het.summary <- tibble::tibble(
    VARIANT_ID = variants,
    NUMBER_HET = het,
    NUMBER_MISSING = missing,
    NUMBER_GENOTYPED = genotyped) %>%
    dplyr::mutate(TOTAL = NUMBER_MISSING + NUMBER_GENOTYPED)

  het <- missing <- genotyped <- variants <- NULL

  # Extract the depth/coverage info from GDS -----------------------------------
  if (verbose) message("Extracting coverage...")
  depth.info <- extract_coverage(gds = data, individuals = FALSE, coverage.stats = "sum") %>%
    dplyr::mutate(dplyr::across(where(is.factor), .fns = as.character)) %>%
    dplyr::left_join(
      gds2tidy(gds = data, parallel.core = parallel.core) %>%
        dplyr::select(-POP_ID) %>%
        dplyr::mutate(dplyr::across(where(is.factor), .fns = as.character))
      , by = c("ID_SEQ", "M_SEQ")
      # , by = c("INDIVIDUALS", "MARKERS")
    )

  # check for stacks specific coverage problem ----------------------------------
  stacks.ad.problem <- depth.info %>%
    dplyr::select(GT_BIN, READ_DEPTH, ALLELE_REF_DEPTH, ALLELE_ALT_DEPTH) %>%
    dplyr::filter(!is.na(READ_DEPTH)) %>%
    dplyr::filter(is.na(ALLELE_REF_DEPTH) & is.na(ALLELE_ALT_DEPTH))
  stacks.ad.problem.n <- nrow(stacks.ad.problem)

  if (stacks.ad.problem.n > 0) {
    non.missing.gt <- length(depth.info$GT_BIN[!is.na(depth.info$GT_BIN)])
    stacks.ad.problem.prop <- round(stacks.ad.problem.n / non.missing.gt, 4)

    message("\n\nStacks problem detected")
    message("    missing allele depth info")
    message("    number of genotypes with problem: ", stacks.ad.problem.n, " (prop: ", stacks.ad.problem.prop,")")
    message("    correcting problem by adding the read depth info into AD fields...\n\n")

    stacks.ad.problem <- dplyr::select(depth.info, GT_BIN) %>%
      dplyr::bind_cols(dplyr::select(depth.info, READ_DEPTH, ALLELE_REF_DEPTH, ALLELE_ALT_DEPTH)) %>%
      dplyr::mutate(COL_ID = seq(1, n())) %>%
      dplyr::filter(!is.na(READ_DEPTH)) %>%
      dplyr::filter(is.na(ALLELE_REF_DEPTH) & is.na(ALLELE_ALT_DEPTH)) %>%
      dplyr::mutate(
        ALLELE_REF_DEPTH = dplyr::if_else(GT_BIN == 0, READ_DEPTH, ALLELE_REF_DEPTH),
        ALLELE_ALT_DEPTH = dplyr::if_else(GT_BIN == 2, READ_DEPTH, ALLELE_ALT_DEPTH)
      )

    depth.info$ALLELE_REF_DEPTH[stacks.ad.problem$COL_ID] <- stacks.ad.problem$ALLELE_REF_DEPTH
    depth.info$ALLELE_ALT_DEPTH[stacks.ad.problem$COL_ID] <- stacks.ad.problem$ALLELE_ALT_DEPTH
  }
  stacks.ad.problem.n <- stacks.ad.problem <- NULL


  # summarizing the read depth info --------------------------------------------

  # recalibration of REF/ALT allele
  calibration <- depth.info %>%
    dplyr::group_by(MARKERS) %>%
    dplyr::summarise(
      REF = sum(ALLELE_REF_DEPTH, na.rm = TRUE),
      ALT = sum(ALLELE_ALT_DEPTH, na.rm = TRUE)
    ) %>%
    dplyr::mutate(
      CALIBRATE = dplyr::if_else(ALT > REF, TRUE, FALSE)
    ) %>%
    dplyr::ungroup(.) %>%
    dplyr::select(MARKERS, CALIBRATE)

  message("Number of REF/ALT re-calibration based on allele read depth: ",
          nrow(dplyr::filter(calibration, CALIBRATE)))

  if (verbose) message("Generating summary...")

  depth.info %<>%
    dplyr::filter(GT_BIN == 1) %>%
    dplyr::group_by(MARKERS) %>%
    dplyr::summarise(
      REF = sum(ALLELE_REF_DEPTH, na.rm = TRUE),
      ALT = sum(ALLELE_ALT_DEPTH, na.rm = TRUE)
    ) %>%
    dplyr::ungroup(.) %>%
    dplyr::left_join(calibration, by = "MARKERS") %>%
    dplyr::mutate(
      DEPTH_REF = dplyr::if_else(CALIBRATE, ALT, REF),
      DEPTH_ALT = dplyr::if_else(CALIBRATE, REF, ALT),
      REF = NULL, ALT = NULL, CALIBRATE = NULL
    ) %>%
    dplyr::left_join(
      extract_markers_metadata(
        gds = data,
        markers.meta.select = c("VARIANT_ID", "MARKERS"),
        whitelist = TRUE
      ), by = "MARKERS"
    ) %>%
    dplyr::select(-MARKERS)
  calibration <- NULL

  if (verbose) message("Looking for paralogous markers...")
  markers.het.summary %<>%
    dplyr::left_join(depth.info, by = "VARIANT_ID") %>%
    dplyr::mutate(
      MISSING_PROP = NUMBER_MISSING / TOTAL,
      #calculate proportion of heterozygotes (H)
      HET_PROP = NUMBER_HET / TOTAL,
      #calculate allele ratio
      TOTAL_COUNTS = DEPTH_REF + DEPTH_ALT,
      RATIO = DEPTH_REF / TOTAL_COUNTS,
      #calculate read-ratio deviation (D)
      #calculate z-score for each SNPs
      Z_SCORE = -(TOTAL_COUNTS / 2 - DEPTH_REF) / sqrt(TOTAL_COUNTS * 0.5 * 0.5)
    )
  depth.info <- NULL
  readr::write_tsv(x = markers.het.summary, file = file.path(path.folder, "markers.het.summary.tsv"))

  # prepare the summary for the figure
  markers.het.summary %<>%
    dplyr::select(VARIANT_ID, MISSING_PROP, HET_PROP, Z_SCORE, RATIO) %>%
    radiator::rad_long(
      x = .,
      cols = c("VARIANT_ID", "HET_PROP", "MISSING_PROP"),
      measure_vars = c("Z_SCORE", "RATIO"),
      names_to = "GROUP",
      values_to = "VALUE"
    )

  facet_names <- ggplot2::as_labeller(
    c(`Z_SCORE` = "Read ratio deviation z-score (D)",
      `RATIO` = "Allelic ratio")
  )
  if (verbose) message("Generating HDplot...")
  hd.plot <- ggplot2::ggplot(
    data = markers.het.summary,
    ggplot2::aes(x = HET_PROP, y = VALUE, size = MISSING_PROP)) +
    ggplot2::geom_rect(
      ggplot2::aes(xmin = -Inf, xmax = Inf, ymin = -Inf, ymax = 0.5),
      fill = "grey", alpha = 0.1) +
    ggplot2::geom_point(alpha = 0.3) +
    ggplot2::scale_size_area(name = "Marker's missing proportion", max_size = 4) +
    ggplot2::scale_x_continuous(name = "Proportion of heterozygotes (H)", breaks = seq(0, 1, by = 0.1)) +
    ggplot2::scale_y_continuous(name = "Value") +
    ggplot2::theme_bw() +
    ggplot2::theme(
      axis.title.x = ggplot2::element_text(size = 10, face = "bold"),
      axis.title.y = ggplot2::element_text(size = 10, face = "bold"),
      axis.text.x = ggplot2::element_text(size = 10)
    ) +
    ggplot2::facet_grid(
      GROUP ~ .,
      scales = "free_y",
      labeller = ggplot2::labeller(GROUP = facet_names)
    )

  print(hd.plot)

  # save
  ggplot2::ggsave(
    filename = file.path(path.folder, "hd_plot.pdf"),
    plot = hd.plot,
    width = 30,
    height = 15,
    dpi = 300,
    units = "cm",
    useDingbats = FALSE)

  return(data)
}#End detect_paralogs
thierrygosselin/radiator documentation built on May 5, 2024, 5:12 a.m.