# Detect the number of alleles/nucleotides per bi-allelic markers
#' @name detect_biallelic_problems
#' @title Detect biallelic problems
#' @description Detect the number of alleles/nucleotides per markers.
#' Sometimes, a dataset might have 3 alleles at a SNP, is this biological or artifactual ?
#' This function helps to resolve this, by highlighting markers with this potential
#' problem, so that user can further look at the origin of the phenomenon.
#' The function can also split datasets in biallelic/multiallelic datasets.
#' @param data A tidy data frame object in the global environment or
#' a tidy data frame in wide or long format in the working directory.
#' The tidy dataset needs a column with nucleotide information \code{GT_VCF_NUC},
#' usually this is automatically generated by \pkg{radiator}.
#' \emph{How to get a tidy data frame ?}
#' Look into \pkg{radiator} \code{\link{tidy_genomic_data}}.
#' @param verbose (optional, logical) \code{verbose = TRUE} to be chatty
#' during execution.
#' Default: \code{verbose = TRUE}.
#' @param parallel.core (optional) The number of core used for parallel
#' execution.
#' Default: \code{parallel.core = parallel::detectCores() - 1}.
#' @return Several info
#' @export
#' @rdname detect_biallelic_problems
#' @author Thierry Gosselin \email{thierrygosselin@@icloud.com}
detect_biallelic_problems <- function(
data,
verbose = TRUE,
parallel.core = parallel::detectCores() - 1
) {
# Cleanup-------------------------------------------------------------------
radiator_function_header(f.name = "detect_biallelic_problems", verbose = verbose)
file.date <- format(Sys.time(), "%Y%m%d@%H%M")
if (verbose) message("Execution date@time: ", file.date)
old.dir <- getwd()
opt.change <- getOption("width")
options(width = 70)
timing <- radiator_tic()
#back to the original directory and options
on.exit(setwd(old.dir), add = TRUE)
on.exit(options(width = opt.change), add = TRUE)
on.exit(radiator_toc(timing), add = TRUE)
on.exit(radiator_function_header(f.name = "detect_biallelic_problems", start = FALSE, verbose = verbose), add = TRUE)
res <- list()
# Checking for missing and/or default arguments ------------------------------
if (missing(data)) rlang::abort("Input file missing")
# Get data and time and generate a filename
# in case problematic markers are found and printed
# filename <- stringi::stri_join("radiator_biallelic_problem_", format(Sys.time(), "%Y%m%d@%H%M"), ".tsv")
# Import data ---------------------------------------------------------------
if (is.vector(data)) {
data <- radiator::tidy_wide(data = data, import.metadata = TRUE)
}
if (!tibble::has_name(data, "GT_VCF_NUC")) {
message("Tidy dataset requires genotypes with nuclotides: GT_VCF_NUC")
data <- radiator::calibrate_alleles(
data = data,
biallelic = TRUE,
verbose = TRUE,
gt.vcf.nuc = TRUE
) %$%
input
}
if (!tibble::has_name(data, "GT_VCF_NUC")) {
message("Wrong genotype format")
message("Tidy dataset requires genotypes with nuclotides: GT_VCF_NUC")
rlang::abort("One way to get it is to use radiator::calibrate_alleles")
}
# Check data ----------------------------------------------------------
message("Generating statistics...")
markers.metadata <- purrr::keep(
.x = colnames(data),
.p = colnames(data) %in% c("MARKERS", "CHROM", "LOCUS", "POS"))
want <- c("MARKERS", "CHROM", "LOCUS", "POS", "GT_VCF_NUC")
blacklist.markers <- dplyr::select(data, tidyselect::any_of(want)) %>%
dplyr::filter(GT_VCF_NUC != "./.") %>%
dplyr::distinct(MARKERS, GT_VCF_NUC, .keep_all = TRUE) %>%
separate_gt(x = ., gt = "GT_VCF_NUC", exclude = markers.metadata, split.chunks = 3L, haplotypes = TRUE) %>%
dplyr::distinct(MARKERS, HAPLOTYPES, .keep_all = TRUE) %>%
dplyr::group_by(dplyr::across(markers.metadata)) %>%
dplyr::tally(.) %>%
dplyr::filter(n > 2) %>%
dplyr::rename(N_ALLELES = n)
if (nrow(blacklist.markers) == 0) {
message("No markers to blacklist")
res$blacklist.markers <- "No markers blacklisted"
res$blacklist.info <- "No markers blacklisted"
res$biallelic.data <- data
res$multiallelic.data <- NULL
} else {
# write the blacklist
blacklist.filename <- stringi::stri_join("blacklist.markers.not.biallelic_", file.date, ".tsv")
readr::write_tsv(x = blacklist.markers, file = blacklist.filename)
res$blacklist.markers <- blacklist.markers
exclude.vars <- purrr::keep(
.x = colnames(data),
.p = !colnames(data) %in% "GT_VCF_NUC")
group.vars <- purrr::keep(
.x = colnames(data),
.p = colnames(data) %in% c("MARKERS", "CHROM", "LOCUS", "POS", "POP_ID"))
# Generating biallelic/multiallelic datasets -------------------------------
res$multiallelic.data <- dplyr::filter(data, MARKERS %in% blacklist.markers$MARKERS)
res$biallelic.data <- dplyr::filter(data, !MARKERS %in% blacklist.markers$MARKERS)
data <- NULL # no longer required
# Blacklist additionnal info -----------------------------------------------
blacklist.info <- dplyr::filter(res$multiallelic.data, GT_VCF_NUC != "./.") %>%
separate_gt(x = ., gt = "GT_VCF_NUC", exclude = exclude.vars, split.chunks = 3L) %>%
dplyr::select(-ALLELES_GROUP) %>%
dplyr::group_by(dplyr::across(c(group.vars, "ALLELES"))) %>%
dplyr::summarise(N = n()) %>%
dplyr::ungroup(.) %>%
dplyr::group_by_at(dplyr::vars(c(markers.metadata, "ALLELES"))) %>%
dplyr::mutate(REF = sum(N)) %>%
dplyr::ungroup(.) %>%
dplyr::arrange(MARKERS, ALLELES)
markers.low.mac <- blacklist.info %>%
dplyr::group_by(dplyr::across(markers.metadata)) %>%
dplyr::filter(REF == min(REF)) %>%
dplyr::distinct(MARKERS, REF, .keep_all = TRUE) %>%
dplyr::filter(REF <=1) %>%
dplyr::ungroup(.) %>%
nrow
blacklist.info.filename <- stringi::stri_join("blacklist.markers.not.biallelic.info_", file.date, ".tsv")
blacklist.info <- blacklist.info %>%
dplyr::group_by(dplyr::across(markers.metadata)) %>%
dplyr::mutate(REF = dplyr::if_else(REF == max(REF), "REF", "ALT")) %>%
dplyr::ungroup(.) %>%
dplyr::group_by(dplyr::across(c(markers.metadata, "ALLELES", "REF"))) %>%
tidyr::pivot_wider(data = ., names_from = "POP_ID", values_from = "N") %>%
readr::write_tsv(x = ., file = blacklist.info.filename, na = "-")
res$blacklist.info <- blacklist.info
dodgy.markers <- dplyr::n_distinct(blacklist.info$MARKERS)
message("\nNumber of suspicious markers (> 2 alleles): ", dodgy.markers)
message("Number of suspicious markers with 1 count for the ALT allele: ", markers.low.mac)
message("Number of suspicious markers after low count removed: ", dodgy.markers - markers.low.mac)
message("\nblacklist of markers with > 2 alleles:")
message(" ", blacklist.filename)
message("\nblacklist with more info:")
message(" ", blacklist.info.filename)
message("\nBiallelic and multiallelic tidy datasets objects generated")
# if (tibble::has_name(data, "AVG_COUNT_REF")) {
# data3 <- data %>%
# dplyr::filter(MARKERS %in% blacklist.markers$MARKERS) %>%
# dplyr::group_by(MARKERS, GT_VCF_NUC) %>%
# dplyr::summarise(
# NUMBER_SAMPLES = n(),
# AVG_COUNT_REF = mean(AVG_COUNT_REF),
# AVG_COUNT_SNP = mean(AVG_COUNT_SNP)) %>%
# dplyr::arrange(MARKERS, dplyr::desc(NUMBER_SAMPLES))
# } else if (tibble::has_name(data, "ALLELE_REF_DEPTH")) {
# data3 <- data %>%
# dplyr::filter(MARKERS %in% blacklist.markers$MARKERS) %>%
# dplyr::group_by(MARKERS, GT_VCF_NUC) %>%
# dplyr::summarise(NUMBER_SAMPLES = n()) %>%
# dplyr::arrange(MARKERS, dplyr::desc(NUMBER_SAMPLES))
# } else {
# data3 <- data %>%
# dplyr::filter(MARKERS %in% blacklist.markers$MARKERS) %>%
# dplyr::group_by(MARKERS, GT_VCF_NUC) %>%
# dplyr::summarise(NUMBER_SAMPLES = n()) %>%
# dplyr::arrange(MARKERS, dplyr::desc(NUMBER_SAMPLES))
# }
}
return(res)
}#End detect_biallelic_problems
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.