# Detect if markers are biallelic
#' @name detect_biallelic_markers
#' @title Detect biallelic data
#' @description Detect if markers in tidy dataset or radiator gds are biallelic.
#' Used internally in \href{https://github.com/thierrygosselin/radiator}{radiator}
#' and might be of interest for users.
#' @param data (4 options) A file or object generated by radiator:
#' \itemize{
#' \item tidy data
#' \item Genomic Data Structure (GDS)
#' }
#'
#' \emph{How to get GDS and tidy data ?}
#' Look into \code{\link{tidy_genomic_data}},
#' \code{\link{read_vcf}} or
#' \code{\link{tidy_vcf}}.
#' @param verbose (optional, logical) \code{verbose = TRUE} to be chatty
#' during execution.
#' Default: \code{verbose = FALSE}.
#' @param parallel.core (optional) The number of core used for parallel
#' execution.
#' Default: \code{parallel.core = parallel::detectCores() - 1}.
#' @return A logical character string (TRUE/FALSE). That answer the question if
#' the data set is biallelic or not.
#' @export
#' @rdname detect_biallelic_markers
#' @author Thierry Gosselin \email{thierrygosselin@@icloud.com}
detect_biallelic_markers <- function(data, verbose = FALSE, parallel.core = parallel::detectCores() - 1) {
# Checking for missing and/or default arguments ------------------------------
if (missing(data)) rlang::abort("Input file missing")
# Detect format --------------------------------------------------------------
data.type <- radiator::detect_genomic_format(data)
if (!data.type %in% c("tbl_df", "fst.file", "SeqVarGDSClass", "gds.file")) {
rlang::abort("Input not supported for this function: read function documentation")
}
if (data.type %in% c("SeqVarGDSClass", "gds.file")) {
radiator_packages_dep(package = "SeqArray", cran = FALSE, bioc = TRUE)
if (data.type == "gds.file") {
data <- radiator::read_rad(data, verbose = verbose)
data.type <- "SeqVarGDSClass"
}
# Faster for RADIATOR generated gds with node...----------------------------
radiator.gds <- gdsfmt::index.gdsn(
node = data, path = "radiator", silent = TRUE)
biallelic <- gdsfmt::index.gdsn(
node = radiator.gds, path = "biallelic", silent = TRUE)
biallelic.attr <- gdsfmt::get.attr.gdsn(biallelic)$R.class[1]
if (!is.null(biallelic.attr)) {
biallelic <- gdsfmt::read.gdsn(biallelic)
if (!is.logical(biallelic)) biallelic.attr <- NULL
}
if (is.null(biallelic.attr)) {
ref <- SeqArray::seqGetData(gdsfile = data, var.name = "$ref")
sample.size <- min(length(unique(ref)), 100)
biallelic <- max(unique(stringi::stri_count_regex(
str = sample(
x = unique(ref),
size = sample.size), pattern = "[A-Z]"))) == 1
sample.size <- ref <- NULL
if (!is.null(radiator.gds)) {
# add biallelic info to GDS
gdsfmt::add.gdsn(
node = radiator.gds,
name = "biallelic",
val = biallelic,
replace = TRUE,
compress = "ZIP_RA",
closezip = TRUE)
}
}
if (biallelic) {
if (verbose) message("Data is bi-allelic")
} else {
if (verbose) message("Data is multi-allelic")
}
data <- biallelic
} else {
# Import data ---------------------------------------------------------------
if (is.vector(data)) data %<>% radiator::tidy_wide(data = ., import.metadata = TRUE)
if (rlang::has_name(data, "GT_BIN")) {
data <- TRUE
if (verbose) message("Data is bi-allelic")
} else {
# necessary steps to make sure we work with unique markers and not duplicated LOCUS
if (rlang::has_name(data, "LOCUS") && !rlang::has_name(data, "MARKERS")) {
data %<>% dplyr::rename(MARKERS = LOCUS)
}
# markers with all missing... yes I've seen it... breaks code...
# data <- detect_all_missing(data = data)
marker.problem <- radiator::detect_all_missing(data = data)
if (marker.problem$marker.problem) data <- marker.problem$data
marker.problem <- NULL
# Detecting biallelic markers-------------------------------------------------
if (verbose) message("Scanning for number of alleles per marker...")
if (rlang::has_name(data, "ALT")) {
alt.num <- max(unique(
stringi::stri_count_fixed(str = unique(data$ALT), pattern = ","))) + 1
if (alt.num > 1) {
data <- FALSE
if (verbose) message("Data is multi-allelic")
} else {
data <- TRUE
if (verbose) message("Data is bi-allelic")
}
alt.num <- NULL
} else {
# If there are less than 100 markers, sample all of them
sampled.markers <- unique(data$MARKERS)
n.markers <- length(sampled.markers)
if (!n.markers < 100) {
# otherwise 30% of the markers are randomly sampled
# small.panel <- FALSE
sampled.markers <- sample(
x = sampled.markers,
size = length(sampled.markers) * 0.30,
replace = FALSE)
n.markers <- length(sampled.markers)
}
# Allows to have either GT, GT_VCF_NUC, GT_VCF or GT_BIN
# If more than 1 is discovered in data, keep 1 randomly.
detect.gt <- detect_gt(x = data) #utils
want <- c("MARKERS", detect.gt)
data %<>% dplyr::select(dplyr::any_of(want))
if (rlang::has_name(data, "GT")) {
data %<>%
dplyr::select(MARKERS, GT) %>%
dplyr::filter(GT != "000000") %>%
dplyr::filter(MARKERS %in% sampled.markers) %>%
dplyr::distinct(MARKERS, GT) %>%
separate_gt(x = ., gt = "GT", exclude = "MARKERS", split.chunks = 1L) %>%
dplyr::distinct(MARKERS, ALLELES) %>%
dplyr::count(x = ., MARKERS)
}
if (rlang::has_name(data, "GT_VCF")) {
data %<>%
dplyr::select(MARKERS, GT_VCF) %>%
dplyr::filter(MARKERS %in% sampled.markers) %>%
dplyr::filter(GT_VCF != "./.") %>%
dplyr::distinct(MARKERS, GT_VCF) %>%
separate_gt(x = ., gt = "GT_VCF", exclude = "MARKERS", split.chunks = 1L) %>%
dplyr::distinct(MARKERS, ALLELES) %>% # Here read alleles, not haplotypes
dplyr::count(x = ., MARKERS)
}
if (rlang::has_name(data, "GT_VCF_NUC")) {
data %<>%
dplyr::select(MARKERS, GT_VCF_NUC) %>%
dplyr::filter(GT_VCF_NUC != "./.") %>%
dplyr::filter(MARKERS %in% sampled.markers) %>%
dplyr::distinct(MARKERS, GT_VCF_NUC) %>%
separate_gt(x = ., gt = "GT_VCF_NUC", exclude = "MARKERS", split.chunks = 1L) %>%
dplyr::distinct(MARKERS, ALLELES) %>%
dplyr::count(x = ., MARKERS)
}
# if (small.panel) {
n.allele <- dplyr::filter(data, n > 2)
if (nrow(n.allele) == n.markers) {
data <- FALSE
if (verbose) message(" Data is multi-allelic")
} else {
if (verbose) message(" Data is bi-allelic")
if (max(data$n) > 2) {
message("\nNote: more than 2 types of alleles/nucleotides detected")
message("artifact/biological ? run radiator::detect_biallelic_problem for more details")
}
data <- TRUE
}
}
}
}
return(data)
} # End detect_biallelic_markers
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.