# read_whitelist ---------------------------------------------------------------
#' @name read_whitelist
#' @title read whitelist of markers
#' @description Read a whitelist object or file.
#'
#'
#' Used internally in \href{https://github.com/thierrygosselin/radiator}{radiator}
#' and might be of interest for users.
#'
#' @param whitelist.markers (path or object)
#' The whitelist is an object in your
#' global environment or a file in the working directory (e.g. "whitelist.txt").
#' The dataframe contains one, a combination
#' or all of these columns: \code{MARKERS, CHROM, LOCUS, POS}.
#' Columns are cleaned of separators that interfere with some packages or codes, detailed in
#' \code{\link{clean_markers_names}}.
#' Default \code{whitelist.markers = NULL}.
#'
#' @inheritParams radiator_common_arguments
#' @details
#'
#' \strong{multi allelic datasets:}
#' Example: VCF with haplotypes containing \code{CHROM, LOCUS, POS} columns:
#' If the whitelist was not created from the same dataset,
#' the filtering could result in losing all the markers.
#' The POS column is different in biallelic and multiallelic file...
#' @examples
#' \dontrun{
#' wl <- radiator::read_whitelist(data = data, whitelist.markers = "mywhitelist.tsv")
#' }
#' @seealso \code{\link{filter_whitelist}}.
#' @export
#' @rdname read_whitelist
#' @author Thierry Gosselin \email{thierrygosselin@@icloud.com}
read_whitelist <- function(whitelist.markers = NULL, verbose = FALSE) {
if (!is.null(whitelist.markers)) {
# Import whitelist of markers
if (is.vector(whitelist.markers)) {
whitelist.markers <- suppressMessages(
readr::read_tsv(whitelist.markers, col_names = TRUE) %>%
dplyr::mutate(dplyr::across(tidyselect::everything(), .fns = as.character))
)
}
nrow.before <- nrow(whitelist.markers)
whitelist.markers <- dplyr::distinct(whitelist.markers)
nrow.after <- nrow(whitelist.markers)
duplicate.whitelist.markers <- nrow.before - nrow.after
if (duplicate.whitelist.markers > 0) {
if (verbose) message("Duplicated rows in whitelist of markers: ", duplicate.whitelist.markers)
if (verbose) message(" Creating unique whitelist")
if (verbose) message(" Warning: downstream results might be impacted by this, check how you made your whitelist...")
}
nrow.before <- duplicate.whitelist.markers <- nrow.after <- NULL
# cleaning names of markers
whitelist.markers <- dplyr::mutate(
.data = whitelist.markers,
dplyr::across(tidyselect::everything(), .fns = clean_markers_names)
)
if (tibble::has_name(whitelist.markers, "VARIANT_ID")) {
whitelist.markers$VARIANT_ID <- as.integer(whitelist.markers$VARIANT_ID)
}
} else {
whitelist.markers <- NULL
}
return(whitelist.markers)
}#End
# filter_whitelist ---------------------------------------------------------------
#' @name filter_whitelist
#' @title Filter dataset with whitelist of markers
#' @description Filter dataset with whitelist of markers
#' Used internally in \href{https://github.com/thierrygosselin/radiator}{radiator}
#' and might be of interest for users.
#'
#' @param data (4 options) A file or object generated by radiator:
#' \itemize{
#' \item tidy data
#' \item Genomic Data Structure (GDS)
#' }
#'
#' \emph{How to get GDS and tidy data ?}
#' Look into \code{\link{tidy_genomic_data}},
#' \code{\link{read_vcf}} or
#' \code{\link{tidy_vcf}}.
#'
#'
#' @inheritParams radiator_common_arguments
#' @inheritParams read_whitelist
#' @examples
#' \dontrun{
#' data <- radiator::filter_whitelist(data = data, whitelist.markers = "mywhitelist.tsv")
#' }
#' @export
#' @rdname filter_whitelist
#' @seealso \code{\link{read_whitelist}}.
#' @author Thierry Gosselin \email{thierrygosselin@@icloud.com}
filter_whitelist <- function(
data,
whitelist.markers = NULL,
verbose = TRUE,
...) {
# obj.keeper <- c(ls(envir = globalenv()), "data")
if (is.null(whitelist.markers)) return(data)
radiator_function_header(f.name = "filter_whitelist", verbose = verbose)
# Cleanup---------------------------------------------------------------------
file.date <- format(Sys.time(), "%Y%m%d@%H%M")
if (verbose) message("Execution date@time: ", file.date)
old.dir <- getwd()
opt.change <- getOption("width")
options(width = 70)
timing <- radiator_tic()
#back to the original directory and options
on.exit(setwd(old.dir), add = TRUE)
on.exit(options(width = opt.change), add = TRUE)
on.exit(radiator_toc(timing), add = TRUE)
on.exit(radiator_function_header(f.name = "filter_whitelist", start = FALSE, verbose = verbose), add = TRUE)
# on.exit(rm(list = setdiff(ls(envir = sys.frame(-1L)), obj.keeper), envir = sys.frame(-1L)))
# Checking for missing and/or default arguments ------------------------------
if (missing(data)) rlang::abort("Input file missing")
# Function call and dotslist -------------------------------------------------
rad.dots <- radiator_dots(
func.name = as.list(sys.call())[[1]],
fd = rlang::fn_fmls_names(),
args.list = as.list(environment()),
dotslist = rlang::dots_list(..., .homonyms = "error", .check_assign = TRUE),
keepers = c("path.folder", "parameters", "biallelic", "markers.meta", "internal"),
verbose = verbose
)
# Folders---------------------------------------------------------------------
path.folder <- generate_folder(
f = path.folder,
rad.folder = "filter_whitelist",
internal = internal,
file.date = file.date,
verbose = verbose)
# write the dots file
write_rad(
data = rad.dots,
path = path.folder,
filename = stringi::stri_join(
"radiator_filter_whitelist_args_", file.date, ".tsv"),
tsv = TRUE,
internal = internal,
verbose = verbose
)
# read whitelist
whitelist.markers <- radiator::read_whitelist(
whitelist.markers = whitelist.markers,
verbose = verbose)
n.markers.w <- nrow(whitelist.markers)
if (verbose) message("Whitelist of markers: ", n.markers.w)
# Import data ---------------------------------------------------------------
data.type <- radiator::detect_genomic_format(data)
if (!data.type %in% c("tbl_df", "fst.file", "SeqVarGDSClass", "gds.file")) {
rlang::abort("Input not supported for this function: read function documentation")
}
if (data.type %in% c("SeqVarGDSClass", "gds.file")) {
radiator_packages_dep(package = "SeqArray", cran = FALSE, bioc = TRUE)
if (data.type == "gds.file") {
data <- radiator::read_rad(data, verbose = verbose)
data.type <- "SeqVarGDSClass"
}
} else {
if (is.vector(data)) data <- radiator::tidy_wide(data = data, import.metadata = TRUE)
data.type <- "tbl_df"
}
# Check biallelic ----------------------------------------------------------
if (is.null(biallelic)) biallelic <- detect_biallelic_markers(data = data)
if (!biallelic) {
if (ncol(whitelist.markers) >= 3) {
if (verbose) message("Note: whitelist with CHROM LOCUS POS columns and VCF haplotype:
If the whitelist was not created from this VCF,
the filtering could result in losing all the markers.
The POS column is different in biallelic and multiallelic file...\n")
if (verbose) message("Discarding the POS column in the whitelist if present")
if (tibble::has_name(whitelist.markers, "POS")) {
whitelist.markers %<>% dplyr::select(-POS)
}
}
}
# Filter parameter file: generate and initiate -----------------------------
filters.parameters <- radiator_parameters(
generate = TRUE,
initiate = TRUE,
update = FALSE,
parameter.obj = parameters,
data = data,
path.folder = path.folder,
file.date = file.date,
internal = internal,
verbose = verbose)
# filtering ----------------------------------------------------------------
# tidy data ----------------------------------------------------------------
if (data.type == "tbl_df") {
data <- suppressWarnings(dplyr::semi_join(
data,
whitelist.markers,
by = intersect(colnames(data), colnames(whitelist.markers))))
if (nrow(data) == 0) {
rlang::abort("No markers left in the dataset, check whitelist...")
}
}
# GDS
if (data.type == "SeqVarGDSClass") {
# extract all the markers
markers.meta <- extract_markers_metadata(
gds = data,
whitelist = FALSE
)
if (rlang::has_name(whitelist.markers, "VARIANT_ID") && rlang::has_name(markers.meta, "VARIANT_ID")) {
use.variant <- TRUE
use.markers <- use.others <- FALSE
} else if (rlang::has_name(whitelist.markers, "MARKERS") && rlang::has_name(markers.meta, "MARKERS")) {
use.variant <- use.others <- FALSE
use.markers <- TRUE
FALSE
} else {
use.variant <- use.markers <- FALSE
use.others <- TRUE
}
if (use.variant) {
markers.meta %<>%
dplyr::mutate(
FILTERS = dplyr::if_else(
!VARIANT_ID %in% whitelist.markers$VARIANT_ID &
FILTERS == "whitelist",
"filter.whitelist",
FILTERS
)
)
}
if (use.markers) {
markers.meta %<>%
dplyr::mutate(
FILTERS = dplyr::if_else(
!MARKERS %in% whitelist.markers$MARKERS &
FILTERS == "whitelist",
"filter.whitelist",
FILTERS
)
)
}
if (use.others) {
common.meta <- rlang::syms((intersect(colnames(markers.meta), colnames(whitelist.markers))))
whitelist.markers %<>%
dplyr::mutate(
.data = .,
COMMON_META = stringi::stri_join(!!!common.meta, sep = "__")
)
test <- markers.meta %>%
dplyr::mutate(
.data = .,
COMMON_META = stringi::stri_join(!!!common.meta, sep = "__")
) %>%
dplyr::filter(COMMON_META %in% whitelist.markers$COMMON_META)
# FILTERS = dplyr::if_else(
# !COMMON_META %in% whitelist.markers$COMMON_META &
# FILTERS == "whitelist",
# "filter.whitelist",
# FILTERS
# )
# # COMMON_META = NULL
# )
common.meta <- NULL
}
if (nrow(dplyr::filter(markers.meta, FILTERS == "whitelist")) == 0) {
rlang::abort("No markers left in the dataset, check whitelist...")
}
update_radiator_gds(
gds = data,
node.name = "markers.meta",
value = markers.meta,
replace = TRUE,
sync = TRUE,
verbose = TRUE
)
write_rad(
data = markers.meta %>% dplyr::filter(FILTERS == "filter.whitelist"),
path = path.folder,
filename = stringi::stri_join("blacklist.markers_", file.date, ".tsv"),
tsv = TRUE, internal = internal, verbose = verbose)
write_rad(
data = markers.meta %>% dplyr::filter(FILTERS == "whitelist"),
path = path.folder,
filename = stringi::stri_join("whitelist.markers_", file.date, ".tsv"),
tsv = TRUE, internal = internal, verbose = verbose)
} # End GDS
# Filter parameter file: update --------------------------------------------
filters.parameters <- radiator_parameters(
generate = FALSE,
initiate = FALSE,
update = TRUE,
parameter.obj = filters.parameters,
data = data,
filter.name = "whitelist markers",
param.name = "whitelist.markers",
values = n.markers.w,
path.folder = path.folder,
file.date = file.date,
internal = internal,
verbose = verbose)
# results ------------------------------------------------------------------
radiator_results_message(
rad.message = stringi::stri_join("\nFilter whitelist : ", n.markers.w, "SNPs"),
filters.parameters,
internal,
verbose
)
return(data)
}#End filter_whitelist
#' @title generate_whitelist
#' @description Generate Whitelist of markers automatically for different threshold
#' @rdname generate_whitelist
#' @keywords internal
#' @export
#' @author Thierry Gosselin \email{thierrygosselin@@icloud.com}
generate_whitelist <- function(x, t, path.folder = NULL) {
if (is.null(path.folder)) path.folder <- getwd()
x <- dplyr::ungroup(x) %>%
dplyr::filter(N_POP_HWD == hw.pop.threshold)
bl_map <- function(x, unfiltered.data, hw.pop.threshold,
path.folder, filters.parameters.path) {
if (nrow(x) > 0) {
significance.group <- unique(x$SIGNIFICANCE)
significance.group <- dplyr::case_when(
significance.group == "*****" ~ 0.00001,
significance.group == "****" ~ 0.0001,
significance.group == "***" ~ 0.001,
significance.group == "**" ~ 0.01,
significance.group == "*" ~ 0.05) %>%
format(., scientific = FALSE)
blacklist.filename <- file.path(
path.folder,
stringi::stri_join(
"blacklist.markers.hwd", significance.group, "mid.p.value",
hw.pop.threshold, "hw.pop.threshold", "tsv", sep = "."))
whitelist.filename <- file.path(
path.folder,
stringi::stri_join(
"whitelist.markers.hwe", significance.group, "mid.p.value",
hw.pop.threshold, "hw.pop.threshold", "tsv", sep = "."))
rad.filename <- file.path(
path.folder,
stringi::stri_join(
"tidy.filtered.hwe", significance.group, "mid.p.value",
hw.pop.threshold, "hw.pop.threshold", "rad", sep = "."))
# Generate the blacklist
want <- c("MARKERS", "CHROM", "LOCUS", "POS")
blacklist <- dplyr::distinct(x, MARKERS) %>%
dplyr::left_join(
dplyr::select(unfiltered.data, tidyselect::any_of(want)) %>%
dplyr::distinct(MARKERS, .keep_all = TRUE)
, by = "MARKERS") %>%
readr::write_tsv(
x = .,
file = blacklist.filename)
# Generate the rad data + the whitelist
if (!is.null(data.temp)) {
whitelist <- unfiltered.data %>%
dplyr::bind_rows(data.temp) %>%
dplyr::mutate(POP_ID = factor(POP_ID, levels = pop.id.levels)) %>%
dplyr::filter(!MARKERS %in% blacklist$MARKERS) %>%
radiator::write_rad(data = ., path = rad.filename) %>%
dplyr::select(tidyselect::any_of(want)) %>%
dplyr::distinct(MARKERS, .keep_all = TRUE) %>%
readr::write_tsv(x = ., file = whitelist.filename)
} else {
whitelist <- unfiltered.data %>%
dplyr::filter(!MARKERS %in% blacklist$MARKERS) %>%
radiator::write_rad(data = ., path = rad.filename) %>%
dplyr::select(tidyselect::any_of(want)) %>%
dplyr::distinct(MARKERS, .keep_all = TRUE) %>%
readr::write_tsv(x = ., file = whitelist.filename)
}
fil.param <- update_filter_parameter(
filter = "HWE",
unfiltered = unfiltered.data,
filtered = whitelist,
parameter = "hw.pop.threshold/mid.p.value",
threshold = stringi::stri_join(hw.pop.threshold, significance.group, sep = "/"),
param.path = filters.parameters.path)
}
}#End bl_map
split(x = x, f = x$SIGNIFICANCE) %>%
purrr::walk(
.x = ., .f = bl_map,
unfiltered.data = unfiltered.data,
hw.pop.threshold = hw.pop.threshold,
path.folder = path.folder,
filters.parameters.path = filters.parameters.path)
fil.param <- readr::read_tsv(file = filters.parameters.path, col_types = "cccccccc") %>%
dplyr::filter(FILTERS == "HWE")
return(fil.param)
}#End generate_whitelist
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.