# detect paralogs
#' @title Detect paralogs
#' @description The method uses the proportion of heterozygotes
#' and deviations in read ratios described in McKinney et al. 2017 to highlight
#' potential paralogous markers. Additionally, the function produces the HD
#' figure with marker's missingness accounted for.
#'
#' \emph{Note:} Violating Assumptions/Prerequisites (see below) can lead to
#' false positive or the absence of detection of paralogous markers.
#' @section Assumptions/Prerequisites:
#' \enumerate{
#' \item \strong{read the paper:}
#' Read \href{https://onlinelibrary.wiley.com/doi/abs/10.1111/1755-0998.12613}{Garrett McKinney}
#' paper, twice. It's good.
#' Reference is below.
#' \item \strong{Depth/Coverage information:} The function requires read depth
#' information for each alleles (REF/ALT). This info will not be found in genepop
#' or structure file format. You usually need a VCF file for this.
#' \emph{stacks, ANGSD, GATK, platypus, ipyrad, DArT, VCFTools, PLINK and freebayes}
#' are software producing the required output.
#' \item \strong{Batch effect:} You don't have any batch effect, e.g. with
#' sequencer id, library, sampling sites, tissue preservation,
#' person behind the wet-lab bench conducting the DNA extraction
#' (operating the pipette, etc.).
#' \item \strong{Normalization:} Any form of normalization, before or after
#' sequencing, as the potential to bias paralogs discovery.
#' This include: combining lanes or chips to increase individual coverage,
#' normalizing individuals FASTQ files, and of course normalizing read depth.
#' \item \strong{Identity-by-Missingness:} Absence of artifactual pattern
#' of missingness (\href{https://github.com/thierrygosselin/grur}{see missing visualization})
#' \item \strong{Low genotyping error rate:} see \code{\link{detect_het_outliers}}
#' and \href{https://github.com/eriqande/whoa}{whoa}. You can also use:
#' \code{\link{filter_hwe}} with very low threshold to discard markers with
#' obvious genotyping errors.
#' \item \strong{Low heterozygosity miscall rate:} see \code{\link{detect_het_outliers}} and
#' \href{https://github.com/eriqande/whoa}{whoa}.
#' \item \strong{Absence of pattern of heterozygosity driven by missingness:}
#' see \code{\link{detect_mixed_genomes}}. Don't use this function if the
#' individual heterozygosity is not under control. If your data is not uniform
#' inside this function, don't expect mush of \code{detect_paralogs}.
#' \item Use unfiltered dataset for fun only.
#' \item Use this function preferably on filtered data, check
#' \code{\link[radiator]{filter_rad}}.
#' \item \strong{Sex-biased sampling and sex markers} will generate
#' false-positive with this function.
#' Use \code{\link{sexy_markers}} function before!
#' }
#' @param data (2 options) A Genomic Data Structure (GDS) file or object
#' generated by radiator.
#'
#' \emph{How to get GDS data ?}
#' Look into \code{\link{tidy_genomic_data}},
#' \code{\link{read_vcf}} or
#' \code{\link{tidy_vcf}}.
#' @inheritParams radiator_common_arguments
#' @return The function returns the GDS data inside the global environment.
#' In the directory generated with \code{detect_paralogs} in it, the function
#' writes a tibble with the summary and the HD plot.
#'
#' The tibble as 1 line per SNPs/MARKERS and columns are:
#' \itemize{
#' \item VARIANT_ID: the GDS SNP id, to have all the markers metadata use
#' \code{\link{extract_markers_metadata}}.
#' \item NUMBER_HET: the number of heterozygote genotype.
#' \item NUMBER_MISSING: the number of missing genotype.
#' \item NUMBER_GENOTYPED: the number of individual genotyped for the marker.
#' \item TOTAL: the total number of sample for this marker.
#' \item DEPTH_REF: the sum of the REF allele depth/coverage.
#' \item DEPTH_ALT: the sum of the ALT allele depth/coverage.
#' \item MISSING_PROP: Missing genotype proportion.
#' \item HET_PROP: the proportion if heterozygote sample.
#' \item TOTAL_COUNTS: Total coverage/counts for the marker.
#' \item RATIO: the allele ratio.
#' \item Z_SCORE: the z-score (deviation in read ratio).
#' }
#'
#'
#' @rdname detect_paralogs
#' @export
#' @examples
#' \dontrun{
#' # require(httr)
#' # using the Chinook VCF dataset in dryad, from McKinney et al. 2017 paper:
#' writeBin(object = httr::content(x = httr::GET(
#' url = "https://datadryad.org/bitstream/handle/10255/dryad.123464/
#' Chinook_sequenceReads.vcf?sequence=1"),
#' as = "raw"), con = "chinook.vcf")
#'
#' # Import VCF and run the function
#' chinook.paralogs <- radiator::read_vcf(data = "chinook.vcf", vcf.stats = FALSE) %>%
#' radiator::detect_paralogs(data = .)
#' }
#' @references McKinney GJ, Waples RK, Seeb LW, Seeb JE (2017)
#' Paralogs are revealed by proportion of heterozygotes and deviations in read
#' ratios in genotyping-by-sequencing data from natural populations.
#' Molecular Ecology Resources, 17, 656-669.
#' @author Thierry Gosselin \email{thierrygosselin@@icloud.com}
detect_paralogs <- function(
data,
parallel.core = parallel::detectCores() - 1,
verbose = TRUE,
...
) {
## Test
# verbose = TRUE
# path.folder <- NULL
# parameters = NULL
# parallel.core = parallel::detectCores() - 1
# Cleanup-------------------------------------------------------------------
obj.keeper <- c(ls(envir = globalenv()), "data")
radiator_function_header(f.name = "detect_paralogs", verbose = verbose)
file.date <- format(Sys.time(), "%Y%m%d@%H%M")
if (verbose) message("Execution date@time: ", file.date)
old.dir <- getwd()
opt.change <- getOption("width")
options(width = 70)
timing <- radiator_tic()
#back to the original directory and options
on.exit(setwd(old.dir), add = TRUE)
on.exit(options(width = opt.change), add = TRUE)
on.exit(radiator_toc(timing), add = TRUE)
on.exit(radiator_function_header(f.name = "detect_paralogs", start = FALSE, verbose = verbose), add = TRUE)
on.exit(rm(list = setdiff(ls(envir = sys.frame(-1L)), obj.keeper), envir = sys.frame(-1L)))
# required package -----------------------------------------------------------
radiator_packages_dep(package = "SeqArray", cran = FALSE, bioc = TRUE)
# Function call and dotslist -------------------------------------------------
rad.dots <- radiator_dots(
func.name = as.list(sys.call())[[1]],
fd = rlang::fn_fmls_names(),
args.list = as.list(environment()),
dotslist = rlang::dots_list(..., .homonyms = "error", .check_assign = TRUE),
keepers = c("path.folder", "parameters"),
verbose = FALSE
)
# Checking for missing and/or default arguments ------------------------------
if (missing(data)) rlang::abort("data is missing")
# Folders---------------------------------------------------------------------
path.folder <- generate_folder(
f = path.folder,
rad.folder = "detect_paralogs",
internal = FALSE,
file.date = file.date,
verbose = verbose)
# write the dots file
write_rad(
data = rad.dots,
path = path.folder,
filename = stringi::stri_join("radiator_detect_paralogs_args_", file.date, ".tsv"),
tsv = TRUE,
internal = FALSE,
verbose = verbose
)
# Detect format --------------------------------------------------------------
data.type <- radiator::detect_genomic_format(data)
if (!data.type %in% c("SeqVarGDSClass", "gds.file")) {
rlang::abort("For now, only GDS file or object works with this function")
}
if (data.type == "gds.file") {
data <- radiator::read_rad(data, verbose = verbose)
data.type <- "SeqVarGDSClass"
}
# Filter parameter file: generate and initiate -------------------------------
filters.parameters <- radiator_parameters(
generate = TRUE,
initiate = TRUE,
update = FALSE,
parameter.obj = parameters,
data = data,
path.folder = path.folder,
file.date = file.date,
internal = FALSE,
verbose = FALSE)
# For PC change parallel.core ------------------------------------------------
if (Sys.info()[['sysname']] == "Windows") parallel.core <- 1
# Get the variant id
variants <- SeqArray::seqGetData(gdsfile = data, var.name = "variant.id")
# get the number of heterozygote samples per marker --------------------------
if (verbose) message("Counting the number of heterozygote")
het <- SeqArray::seqApply(
gdsfile = data,
var.name = "$dosage_alt",
FUN = function(g) length(g[g == 1]),
margin = "by.variant",
as.is = "integer",
parallel = parallel.core)
# get the number of missing samples per marker -------------------------------
if (verbose) message("Calculating missingness")
missing <- SeqArray::seqApply(
gdsfile = data,
var.name = "$dosage_alt",
FUN = function(g) length(g[is.na(g)]),
margin = "by.variant",
as.is = "integer",
parallel = parallel.core)
# get the number of genotyped samples per marker -----------------------------
genotyped <- SeqArray::seqApply(
gdsfile = data,
var.name = "$dosage_alt",
FUN = function(g) length(g[!is.na(g)]),
margin = "by.variant",
as.is = "integer",
parallel = parallel.core)
# tibble summary -------------------------------------------------------------
markers.het.summary <- tibble::tibble(
VARIANT_ID = variants,
NUMBER_HET = het,
NUMBER_MISSING = missing,
NUMBER_GENOTYPED = genotyped) %>%
dplyr::mutate(TOTAL = NUMBER_MISSING + NUMBER_GENOTYPED)
het <- missing <- genotyped <- variants <- NULL
# Extract the depth/coverage info from GDS -----------------------------------
if (verbose) message("Extracting coverage...")
depth.info <- extract_coverage(gds = data, individuals = FALSE, coverage.stats = "sum", parallel.core = parallel.core) %>%
dplyr::mutate(dplyr::across(where(is.factor), .fns = as.character)) %>%
dplyr::left_join(
gds2tidy(gds = data, parallel.core = parallel.core) %>%
dplyr::select(-POP_ID) %>%
dplyr::mutate(dplyr::across(where(is.factor), .fns = as.character))
, by = c("ID_SEQ", "M_SEQ")
# , by = c("INDIVIDUALS", "MARKERS")
)
# check for stacks specific coverage problem ----------------------------------
stacks.ad.problem <- depth.info %>%
dplyr::select(GT_BIN, READ_DEPTH, ALLELE_REF_DEPTH, ALLELE_ALT_DEPTH) %>%
dplyr::filter(!is.na(READ_DEPTH)) %>%
dplyr::filter(is.na(ALLELE_REF_DEPTH) & is.na(ALLELE_ALT_DEPTH))
stacks.ad.problem.n <- nrow(stacks.ad.problem)
if (stacks.ad.problem.n > 0) {
non.missing.gt <- length(depth.info$GT_BIN[!is.na(depth.info$GT_BIN)])
stacks.ad.problem.prop <- round(stacks.ad.problem.n / non.missing.gt, 4)
message("\n\nStacks problem detected")
message(" missing allele depth info")
message(" number of genotypes with problem: ", stacks.ad.problem.n, " (prop: ", stacks.ad.problem.prop,")")
message(" correcting problem by adding the read depth info into AD fields...\n\n")
stacks.ad.problem <- dplyr::select(depth.info, GT_BIN) %>%
dplyr::bind_cols(dplyr::select(depth.info, READ_DEPTH, ALLELE_REF_DEPTH, ALLELE_ALT_DEPTH)) %>%
dplyr::mutate(COL_ID = seq(1, n())) %>%
dplyr::filter(!is.na(READ_DEPTH)) %>%
dplyr::filter(is.na(ALLELE_REF_DEPTH) & is.na(ALLELE_ALT_DEPTH)) %>%
dplyr::mutate(
ALLELE_REF_DEPTH = dplyr::if_else(GT_BIN == 0, READ_DEPTH, ALLELE_REF_DEPTH),
ALLELE_ALT_DEPTH = dplyr::if_else(GT_BIN == 2, READ_DEPTH, ALLELE_ALT_DEPTH)
)
depth.info$ALLELE_REF_DEPTH[stacks.ad.problem$COL_ID] <- stacks.ad.problem$ALLELE_REF_DEPTH
depth.info$ALLELE_ALT_DEPTH[stacks.ad.problem$COL_ID] <- stacks.ad.problem$ALLELE_ALT_DEPTH
}
stacks.ad.problem.n <- stacks.ad.problem <- NULL
# summarizing the read depth info --------------------------------------------
# recalibration of REF/ALT allele
calibration <- depth.info %>%
dplyr::group_by(MARKERS) %>%
dplyr::summarise(
REF = sum(ALLELE_REF_DEPTH, na.rm = TRUE),
ALT = sum(ALLELE_ALT_DEPTH, na.rm = TRUE)
) %>%
dplyr::mutate(
CALIBRATE = dplyr::if_else(ALT > REF, TRUE, FALSE)
) %>%
dplyr::ungroup(.) %>%
dplyr::select(MARKERS, CALIBRATE)
message("Number of REF/ALT re-calibration based on allele read depth: ",
nrow(dplyr::filter(calibration, CALIBRATE)))
if (verbose) message("Generating summary...")
depth.info %<>%
dplyr::filter(GT_BIN == 1) %>%
dplyr::group_by(MARKERS) %>%
dplyr::summarise(
REF = sum(ALLELE_REF_DEPTH, na.rm = TRUE),
ALT = sum(ALLELE_ALT_DEPTH, na.rm = TRUE)
) %>%
dplyr::ungroup(.) %>%
dplyr::left_join(calibration, by = "MARKERS") %>%
dplyr::mutate(
DEPTH_REF = dplyr::if_else(CALIBRATE, ALT, REF),
DEPTH_ALT = dplyr::if_else(CALIBRATE, REF, ALT),
REF = NULL, ALT = NULL, CALIBRATE = NULL
) %>%
dplyr::left_join(
extract_markers_metadata(
gds = data,
markers.meta.select = c("VARIANT_ID", "MARKERS"),
whitelist = TRUE
), by = "MARKERS"
) %>%
dplyr::select(-MARKERS)
calibration <- NULL
if (verbose) message("Looking for paralogous markers...")
markers.het.summary %<>%
dplyr::left_join(depth.info, by = "VARIANT_ID") %>%
dplyr::mutate(
MISSING_PROP = NUMBER_MISSING / TOTAL,
#calculate proportion of heterozygotes (H)
HET_PROP = NUMBER_HET / TOTAL,
#calculate allele ratio
TOTAL_COUNTS = DEPTH_REF + DEPTH_ALT,
RATIO = DEPTH_REF / TOTAL_COUNTS,
#calculate read-ratio deviation (D)
#calculate z-score for each SNPs
Z_SCORE = -(TOTAL_COUNTS / 2 - DEPTH_REF) / sqrt(TOTAL_COUNTS * 0.5 * 0.5)
)
depth.info <- NULL
readr::write_tsv(x = markers.het.summary, file = file.path(path.folder, "markers.het.summary.tsv"))
# prepare the summary for the figure
markers.het.summary %<>%
dplyr::select(VARIANT_ID, MISSING_PROP, HET_PROP, Z_SCORE, RATIO) %>%
radiator::rad_long(
x = .,
cols = c("VARIANT_ID", "HET_PROP", "MISSING_PROP"),
measure_vars = c("Z_SCORE", "RATIO"),
names_to = "GROUP",
values_to = "VALUE"
)
facet_names <- ggplot2::as_labeller(
c(`Z_SCORE` = "Read ratio deviation z-score (D)",
`RATIO` = "Allelic ratio")
)
if (verbose) message("Generating HDplot...")
hd.plot <- ggplot2::ggplot(
data = markers.het.summary,
ggplot2::aes(x = HET_PROP, y = VALUE, size = MISSING_PROP)) +
ggplot2::geom_rect(
ggplot2::aes(xmin = -Inf, xmax = Inf, ymin = -Inf, ymax = 0.5),
fill = "grey", alpha = 0.1) +
ggplot2::geom_point(alpha = 0.3) +
ggplot2::scale_size_area(name = "Marker's missing proportion", max_size = 4) +
ggplot2::scale_x_continuous(name = "Proportion of heterozygotes (H)", breaks = seq(0, 1, by = 0.1)) +
ggplot2::scale_y_continuous(name = "Value") +
ggplot2::theme_bw() +
ggplot2::theme(
axis.title.x = ggplot2::element_text(size = 10, face = "bold"),
axis.title.y = ggplot2::element_text(size = 10, face = "bold"),
axis.text.x = ggplot2::element_text(size = 10)
) +
ggplot2::facet_grid(
GROUP ~ .,
scales = "free_y",
labeller = ggplot2::labeller(GROUP = facet_names)
)
print(hd.plot)
# save
ggplot2::ggsave(
filename = file.path(path.folder, "hd_plot.pdf"),
plot = hd.plot,
width = 30,
height = 15,
dpi = 300,
units = "cm",
useDingbats = FALSE)
return(data)
}#End detect_paralogs
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.