# read_blacklist_genotypes ---------------------------------------------------------------
#' @name read_blacklist_genotypes
#' @title read blacklist of genotypes
#' @description Read a blacklist object or file.
#'
#'
#' Used internally in \href{https://github.com/thierrygosselin/radiator}{radiator}
#' and might be of interest for users.
#'
#' @param blacklist.genotypes (path or object)
#' The blacklist is an object in your
#' global environment or a file in the working directory (e.g. "blacklist.geno.tsv").
#' The dataframe contains at least these 2 columns: \code{MARKERS, INDIVIDUALS}.
#' Additional columns are allowed: \code{CHROM, LOCUS, POS}.
#'
#' Useful to erase genotypes with bad QC, e.g. genotype with more than 2 alleles
#' in diploid likely
#' sequencing errors or genotypes with poor genotype likelihood or coverage.
#'
#' Columns are cleaned of separators that interfere with some packages or codes, detailed in
#' \code{\link{clean_markers_names}} and \code{\link{clean_ind_names}}
#' Default \code{blacklist.genotypes = NULL}.
#'
#' @inheritParams radiator_common_arguments
#' @examples
#' \dontrun{
#' bl <- radiator::read_blacklist_genotypes(data = data,
#' blacklist.genotypes = "blacklist.geno.iguana.tsv")
#' }
#' @section Life cycle:
#'
#' This function arguments will be subject to changes. Currently the function uses
#' erase.genotypes, but using the \code{dots-dots-dots ...} arguments allows to
#' pass \code{erase.genotypes and masked.genotypes}. These arguments do exactly
#' the same thing and only one can be used.
#' @export
#' @rdname read_blacklist_genotypes
#' @author Thierry Gosselin \email{thierrygosselin@@icloud.com}
read_blacklist_genotypes <- function(
blacklist.genotypes = NULL,
verbose = FALSE,
...
) {
# dotslist -------------------------------------------------------------------
dotslist <- rlang::dots_list(..., .homonyms = "error", .check_assign = TRUE)
want <- c("erase.genotypes", "masked.genotypes")
unknowned_param <- setdiff(names(dotslist), want)
if (length(unknowned_param) > 0) {
rlang::abort("Unknowned \"...\" parameters ",
stringi::stri_join(unknowned_param, collapse = " "))
}
radiator.dots <- dotslist[names(dotslist) %in% want]
erase.genotypes <- radiator.dots[["erase.genotypes"]]
masked.genotypes <- radiator.dots[["masked.genotypes"]]
if (!is.null(erase.genotypes) && !is.null(masked.genotypes) && !is.null(blacklist.genotypes)) {
rlang::abort("Only one of: erase.genotypes, masked.genotypes or blacklist.genotypes should be used")
}
if (!is.null(erase.genotypes) && is.null(blacklist.genotypes)) {
blacklist.genotypes <- erase.genotypes
}
if (!is.null(masked.genotypes) && is.null(blacklist.genotypes)) {
blacklist.genotypes <- masked.genotypes
}
if (!is.null(blacklist.genotypes)) {
# Import whitelist of markers
if (is.vector(blacklist.genotypes)) {
blacklist.genotypes <- suppressMessages(
readr::read_tsv(blacklist.genotypes, col_names = TRUE) %>%
dplyr::mutate(dplyr::across(tidyselect::everything(), .fns = as.character))
)
}
nrow.before <- nrow(blacklist.genotypes)
blacklist.genotypes <- dplyr::distinct(blacklist.genotypes)
nrow.after <- nrow(blacklist.genotypes)
duplicate.blacklist.genotypes <- nrow.before - nrow.after
if (duplicate.blacklist.genotypes > 0) {
if (verbose) message("Duplicated rows in blacklist genotypes: ", duplicate.blacklist.genotypes)
if (verbose) message(" Creating unique blacklist")
if (verbose) message(" Warning: downstream results might be impacted by this, check how you made your blacklist")
}
nrow.before <- duplicate.blacklist.genotypes <- nrow.after <- NULL
# cleaning names of markers
need.cleaning <- purrr::discard(
.x = colnames(blacklist.genotypes),
.p = colnames(blacklist.genotypes) %in% "INDIVIDUALS")
blacklist.genotypes <- dplyr::mutate(
.data = blacklist.genotypes,
dplyr::across(
.cols = need.cleaning,
.fns = clean_markers_names
)
)
blacklist.genotypes <- dplyr::mutate(
.data = blacklist.genotypes,
dplyr::across(
.cols = "INDIVIDUALS",
.fns = clean_ind_names
)
)
} else {
blacklist.genotypes <- NULL
}
return(blacklist.genotypes)
}#End
# filter_blacklist_genotypes ---------------------------------------------------------------
#' @name filter_blacklist_genotypes
#' @title Filter dataset with blacklist of genotypes
#' @description Filter dataset with blacklist of genotypes.
#'
#' This function allows to blacklist/erase/mask genotypes.
#'
#' Used internally in \href{https://github.com/thierrygosselin/radiator}{radiator}
#' and might be of interest for users.
#'
#' @param data (4 options) A file or object generated by radiator:
#' \itemize{
#' \item tidy data
#' \item Genomic Data Structure (GDS)
#' }
#'
#' \emph{How to get GDS and tidy data ?}
#' Look into \code{\link{tidy_genomic_data}},
#' \code{\link{read_vcf}} or
#' \code{\link{tidy_vcf}}.
#'
#'
#' @inheritParams radiator_common_arguments
#' @inheritParams read_blacklist_genotypes
#' @examples
#' \dontrun{
#' data <- radiator::filter_blacklist_genotypes(
#' data = data, blacklist.geno = "blacklist.geno.tsv"
#' )
#' }
#' @section Life cycle:
#'
#' This function arguments will be subject to changes. Currently the function uses
#' erase.genotypes, but using the \code{dots-dots-dots ...} arguments allows to
#' pass \code{erase.genotypes and masked.genotypes}. These arguments do exactly
#' the same thing and only one can be used.
#' @export
#' @rdname filter_blacklist_genotypes
#' @author Thierry Gosselin \email{thierrygosselin@@icloud.com}
filter_blacklist_genotypes <- function(
data,
blacklist.genotypes,
verbose = TRUE,
...
) {
# Checking for missing and/or default arguments ------------------------------
if (missing(data)) rlang::abort("Input file missing")
file.date <- format(Sys.time(), "%Y%m%d@%H%M")# Date and time
# dotslist -------------------------------------------------------------------
dotslist <- rlang::dots_list(..., .homonyms = "error", .check_assign = TRUE)
want <- c("path.folder", "parameters", "erase.genotypes", "masked.genotypes")
unknowned_param <- setdiff(names(dotslist), want)
if (length(unknowned_param) > 0) {
rlang::abort("Unknowned \"...\" parameters ",
stringi::stri_join(unknowned_param, collapse = " "))
}
radiator.dots <- dotslist[names(dotslist) %in% want]
parameters <- radiator.dots[["parameters"]]
erase.genotypes <- radiator.dots[["erase.genotypes"]]
masked.genotypes <- radiator.dots[["masked.genotypes"]]
path.folder <- radiator.dots[["path.folder"]]
path.folder <- generate_folder(f = path.folder, file.date = file.date, verbose = verbose)
if (!is.null(erase.genotypes) && !is.null(masked.genotypes) && !is.null(blacklist.genotypes)) {
rlang::abort("Only one of: erase.genotypes, masked.genotypes or blacklist.genotypes should be used")
}
if (!is.null(erase.genotypes) && is.null(blacklist.genotypes)) {
blacklist.genotypes <- erase.genotypes
}
if (!is.null(masked.genotypes) && is.null(blacklist.genotypes)) {
blacklist.genotypes <- masked.genotypes
}
# read whitelist
blacklist.genotypes <- radiator::read_blacklist_genotypes(
blacklist.genotypes = blacklist.genotypes,
verbose = verbose)
if (!is.null(blacklist.genotypes)) {
# Import data ---------------------------------------------------------------
data.type <- radiator::detect_genomic_format(data)
if (!data.type %in% c("tbl_df", "fst.file", "SeqVarGDSClass", "gds.file")) {
rlang::abort("Input not supported for this function: read function documentation")
}
if (data.type %in% c("SeqVarGDSClass", "gds.file")) {
radiator_packages_dep(package = "SeqArray", cran = FALSE, bioc = TRUE)
if (data.type == "gds.file") {
data <- radiator::read_rad(data, verbose = verbose)
data.type <- "SeqVarGDSClass"
}
} else {
if (is.vector(data)) data <- radiator::tidy_wide(data = data, import.metadata = TRUE)
data.type <- "tbl_df"
}
# Filter parameter file: generate and initiate -----------------------------
# TODO: modify the function to accomodate erasing genotypes...
# filters.parameters <- radiator_parameters(
# generate = TRUE,
# initiate = TRUE,
# update = FALSE,
# parameter.obj = parameters,
# data = data,
# path.folder = path.folder,
# file.date = file.date,
# verbose = verbose)
# checks -------------------------------------------------------------------
# check to keep only individuals present in the dataset and not already blacklisted
# check that markers bl are in dataset (not already blacklisted elsewhere...)
if (verbose) message("Checking matching individuals and markers between blacklist and data")
if (data.type == "tbl_df") {
id <- unique(data$INDIVIDUALS)
wl <- unique(data$MARKERS)
} else {
radiator.gds <- gdsfmt::index.gdsn(
node = data, path = "radiator", silent = TRUE)
id <- gdsfmt::index.gdsn(
node = radiator.gds, path = "individuals", silent = TRUE)
if (!is.null(id)) {
id <- gdsfmt::read.gdsn(id) %$% INDIVIDUALS
} else {
id <- SeqArray::seqGetData(gdsfile = data, var.name = "sample.id")
}
wl <- radiator::extract_markers_metadata(gds = data, whitelist = TRUE) %$% MARKERS
}
blacklist.genotypes %<>% dplyr::filter(INDIVIDUALS %in% id) %>%
dplyr::filter(MARKERS %in% wl)
n.markers.bl <- nrow(blacklist.genotypes)
if (n.markers.bl > 0) {
if (verbose) message("Blacklisted genotypes: ", n.markers.bl)
if (data.type == "tbl_df") {
blacklist.genotypes <- dplyr::semi_join(
data,
blacklist.genotypes,
by = intersect(colnames(data), colnames(blacklist.genotypes)))
if (tibble::has_name(blacklist.genotypes, "GT")) {
blacklist.genotypes %<>% dplyr::mutate(GT = rep("000000", n()))
}
if (tibble::has_name(blacklist.genotypes, "GT_VCF")) {
blacklist.genotypes %<>% dplyr::mutate(GT_VCF = rep("./.", n()))
}
if (tibble::has_name(blacklist.genotypes, "GT_VCF_NUC")) {
blacklist.genotypes %<>% dplyr::mutate(GT_VCF_NUC = rep("./.", n()))
}
if (tibble::has_name(blacklist.genotypes, "GT_BIN")) {
blacklist.genotypes %<>% dplyr::mutate(GT_BIN = rep(as.numeric(NA_character_), n()))
}
data %<>% dplyr::anti_join(
blacklist.genotypes,
by = intersect(colnames(data), colnames(blacklist.genotypes)))
data %<>% dplyr::bind_rows(blacklist.genotypes)
# TODO and optimize...
# required because REF/ALT might change after deleting genotypes...
data %<>% radiator::calibrate_alleles(data = .) %$% input
# GDS
if (data.type == "SeqVarGDSClass") {
message("Under construction...")
} # End GDS
} else {
message("There are no genotype left in the blacklist")
}
# Filter parameter file: update --------------------------------------------
# filters.parameters <- radiator_parameters(
# generate = FALSE,
# initiate = FALSE,
# update = TRUE,
# parameter.obj = filters.parameters,
# data = data,
# filter.name = "whitelist markers",
# param.name = "whitelist.markers",
# values = n.markers.w,
# path.folder = path.folder,
# file.date = file.date,
# verbose = verbose)
# if (verbose) {
# message("Number of individuals / strata / chrom / locus / SNP:")
# message(" Before: ", filters.parameters$filters.parameters$BEFORE)
# message(" Blacklisted: ", filters.parameters$filters.parameters$BLACKLIST)
# message(" After: ", filters.parameters$filters.parameters$AFTER)
# }
}# End !is.null
}
return(data)
}#End filter_blacklist_genotypes
# The section that was in tidy_genomic_data
# if (is.null(blacklist.genotype)) { # no Whitelist
# if (verbose) message("Erasing genotype: no")
# } else {
# if (verbose) message("Erasing genotype: yes")
# want <- c("MARKERS", "CHROM", "LOCUS", "POS", "INDIVIDUALS")
# if (is.vector(blacklist.genotype)) {
# suppressWarnings(suppressMessages(
# blacklist.genotype <- readr::read_tsv(blacklist.genotype, col_names = TRUE)))
# }
# suppressWarnings(suppressMessages(
# blacklist.genotype <- blacklist.genotype %>%
# dplyr::mutate(dplyr::across(.cols = "INDIVIDUALS", .fns = clean_ind_names)) %>%dplyr::mutate(dplyr::across(everything(), .fns = as.character))
# dplyr::select(tidyselect::any_of(want)) %>%
# dplyr::mutate(dplyr::across(everything(), .fns = as.character, exclude = NA))
# columns.names.blacklist.genotype <- colnames(blacklist.genotype)
#
# if (data.type == "haplo.file") {
# blacklist.genotype <- dplyr::select(.data = blacklist.genotype, INDIVIDUALS, LOCUS)
# columns.names.blacklist.genotype <- colnames(blacklist.genotype)
# }
#
# # control check to keep only individuals in the strata.df
# blacklist.genotype <- suppressWarnings(
# blacklist.genotype %>%
# dplyr::filter(INDIVIDUALS %in% strata.df$INDIVIDUALS)
# )
#
# # control check to keep only whitelisted markers from the blacklist of genotypes
# if (!is.null(whitelist.markers)) {
# if (verbose) message("Control check to keep only whitelisted markers present in the blacklist of genotypes to erase.")
# # updating the whitelist of markers to have all columns that id markers
# if (data.type == "vcf.file") {
# whitelist.markers.ind <- input %>% dplyr::distinct(CHROM, LOCUS, POS, INDIVIDUALS)
# } else {
# whitelist.markers.ind <- input %>% dplyr::distinct(LOCUS, INDIVIDUALS)
# }
#
# # updating the blacklist.genotype
# blacklist.genotype <- suppressWarnings(
# dplyr::semi_join(whitelist.markers.ind, blacklist.genotype,
# by = columns.names.blacklist.genotype))
# columns.names.blacklist.genotype <- colnames(blacklist.genotype)
# }
#
# # Update column names
# columns.names.blacklist.genotype <- colnames(blacklist.genotype)
#
# blacklisted.gen.number <- nrow(blacklist.genotype)
# if (blacklisted.gen.number > 0) {
# message(" Number of genotype(s) to erase: ", blacklisted.gen.number)
# input.erase <- dplyr::semi_join(
# input, blacklist.genotype, by = columns.names.blacklist.genotype) %>%
# dplyr::mutate(GT = rep("000000", n()))
# input <- dplyr::anti_join(
# input, blacklist.genotype, by = columns.names.blacklist.genotype)
# if (rlang::has_name(input.erase, "GT_VCF")) {
# input.erase <- dplyr::mutate(input.erase, GT_VCF = rep("./.", n()))
# }
#
# if (rlang::has_name(input.erase, "GT_VCF_NUC")) {
# input.erase <- dplyr::mutate(input.erase, GT_VCF_NUC = rep("./.", n()))
# }
#
# if (rlang::has_name(input.erase, "GT_BIN")) {
# input.erase <- dplyr::mutate(input.erase, GT_BIN = rep(as.numeric(NA_character_), n()))
# }
# input <- dplyr::bind_rows(input, input.erase)
# } else {
# message("There are no genotype left in the blacklist: input file left intact")
# }
#
# # required because REF/ALT might change after deleting genotypes...
# input <- radiator::calibrate_alleles(data = input)$input
# } # End erase genotypes
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.