# Filter/blacklist individuals
#' @name filter_individuals
#' @title Filter individuals based on genotyping/missingness rate,
#' heterozygosity and total coverage
#' @description Remove individuals with bad QC based on:
#' \itemize{
#' \item missingness (genotyping rate)
#' \item heterozygosity
#' \item coverage (total, median, iqr)
#' }
#'
#' \strong{Filter targets}: Individuals
#'
#' \strong{Statistics}: Missingness, heterozygosity and coverage
#'
#' Used internally in \href{https://github.com/thierrygosselin/radiator}{radiator}
#' and might be of interest for users who wants to blacklist individuals.
#' @param data (2 options) A Genomic Data Structure (GDS)
#' file or object generated by radiator.
#'
#' \emph{How to get GDS?}
#' Look into:
#' \code{\link{read_vcf}} or
#' \code{\link{tidy_vcf}}.
#' @param filter.individuals.missing (optional, double) A proportion above which the individuals are
#' blacklisted and removed from the dataset.
#' Default: \code{filter.individuals.missing = NULL}.
#' @param filter.individuals.heterozygosity (optional, string of doubles) A proportion below and
#' above which the individuals are blacklisted and removed from the dataset.
#' Default: \code{filter.individuals.heterozygosity = NULL}.
#' @param filter.individuals.coverage.total (optional, string of doubles)
#' Target the total coverage per samples.
#' A proportion below and
#' above which the individuals are blacklisted and removed from the dataset.
#' Default: \code{filter.individuals.coverage.total = NULL}.
#' @param filter.individuals.coverage.median (optional, string of integers)
#' Target the median coverage per samples.
#' Integers, below and above, that blacklist individuals (removed from the dataset)
#' Default: \code{filter.individuals.coverage.median = NULL}.
#' @param filter.individuals.coverage.iqr (optional, string of integers)
#' Target the IQR (Interquartile Range) coverage per samples.
#' Integers, below and above, that blacklist individuals (removed from the dataset)
#' Default: \code{filter.individuals.coverage.iqr = NULL}.
#' @inheritParams radiator_common_arguments
#' @return A list with the filtered input and blacklist of individuals.
#' @export
#' @rdname filter_individuals
#' @seealso
#' \code{\link{filter_rad}}
#' \code{\link{tidy_genomic_data}}, \code{\link{read_vcf}},
#' \code{\link{tidy_vcf}}.
#' @examples
#' \dontrun{
#' require(SeqArray)
#'
#' # blacklisting outliers individuals:
#' id.qc <- radiator::filter_individuals(
#' data = "my.radiator.gds.rad",
#' filter.individuals.missing = "outliers",
#' filter.individuals.heterozygosity = "outliers",
#' filter.individuals.coverage.total = "outliers")
#'
#' # using values to blacklist individuals:
#' id.qc <- radiator::filter_individuals(
#' data = "my.radiator.gds.rad",
#' filter.individuals.missing = 0.5,
#' filter.individuals.heterozygosity = c(0.02, 0.03),
#' filter.individuals.coverage.total = c(900000, 5000000))
#'
#' }
#' @author Thierry Gosselin \email{thierrygosselin@@icloud.com}
filter_individuals <- function(
data,
interactive.filter = TRUE,
filter.individuals.missing = NULL,
filter.individuals.heterozygosity = NULL,
filter.individuals.coverage.total = NULL,
filter.individuals.coverage.median = NULL,
filter.individuals.coverage.iqr = NULL,
parallel.core = parallel::detectCores() - 1,
verbose = TRUE,
...
) {
##TESTS
# path.folder <- NULL
# parameters <- NULL
# id.stats <- NULL
# internal <- FALSE
# subsample <- NULL
# subsample.markers.stats <- NULL
# interactive.filter = TRUE
# filter.individuals.missing = NULL
# filter.individuals.heterozygosity = NULL
# filter.individuals.coverage.total = NULL
# filter.individuals.coverage.median = NULL
# filter.individuals.coverage.iqr = NULL
# parallel.core = parallel::detectCores() - 1
# verbose = TRUE
if (interactive.filter ||
!is.null(filter.individuals.missing) ||
!is.null(filter.individuals.heterozygosity) ||
!is.null(filter.individuals.coverage.total) ||
!is.null(filter.individuals.coverage.median) ||
!is.null(filter.individuals.coverage.iqr)
) {
if (interactive.filter) verbose <- TRUE
radiator_function_header(f.name = "filter_individuals", verbose = verbose)
if (Sys.info()[['sysname']] == "Windows") parallel.core <- 1
# Cleanup---------------------------------------------------------------------
file.date <- format(Sys.time(), "%Y%m%d@%H%M")
if (verbose) message("Execution date@time: ", file.date)
old.dir <- getwd()
opt.change <- getOption("width")
options(width = 70)
timing <- radiator_tic()
#back to the original directory and options
on.exit(setwd(old.dir), add = TRUE)
on.exit(options(width = opt.change), add = TRUE)
on.exit(radiator_toc(timing, verbose = verbose), add = TRUE)
on.exit(radiator_function_header(f.name = "filter_individuals", start = FALSE, verbose = verbose), add = TRUE)
# on.exit(rm(list = setdiff(ls(envir = sys.frame(-1L)), obj.keeper), envir = sys.frame(-1L)))
# Function call and dotslist -------------------------------------------------
rad.dots <- radiator_dots(
func.name = as.list(sys.call())[[1]],
fd = rlang::fn_fmls_names(),
args.list = as.list(environment()),
dotslist = rlang::dots_list(..., .homonyms = "error", .check_assign = TRUE),
keepers = c("path.folder", "parameters", "internal", "id.stats",
"subsample", "subsample.markers.stats"),
verbose = FALSE
)
# Checking for missing and/or default arguments ----------------------------
if (missing(data)) rlang::abort("data is missing")
# Folders---------------------------------------------------------------------
path.folder <- generate_folder(
f = path.folder,
rad.folder = "filter_individuals",
internal = internal,
file.date = file.date,
verbose = verbose)
# write the dots file
write_rad(
data = rad.dots,
path = path.folder,
filename = stringi::stri_join("radiator_filter_individuals_args_", file.date, ".tsv"),
tsv = TRUE,
internal = internal,
write.message = "Function call and arguments stored in: ",
verbose = verbose
)
# Message about steps taken during the process -----------------------------
if (interactive.filter) {
message("Interactive mode: on\n")
message("Step 1. Visualization")
message("Step 2. Missingness")
message("Step 3. Heterozygosity")
message("Step 4. Coverage (if available)\n\n")
}
# Detect format ------------------------------------------------------------
data.type <- radiator::detect_genomic_format(data)
# Import data --------------------------------------------------------------
if (!data.type %in% c("SeqVarGDSClass", "gds.file")) {
rlang::abort("Input not supported for this function: read function documentation")
}
if (data.type == "gds.file") {
data <- radiator::read_rad(data, verbose = verbose)
data.type <- "SeqVarGDSClass"
}
radiator_packages_dep(package = "SeqArray", cran = FALSE, bioc = TRUE)
# Filter parameter file: generate and initiate -----------------------------
filters.parameters <- radiator_parameters(
generate = TRUE,
initiate = TRUE,
update = FALSE,
parameter.obj = parameters,
data = data,
path.folder = path.folder,
file.date = file.date,
internal = internal,
verbose = verbose)
# stats -------------------------------------------------------------------
filter.monomorphic <- FALSE
# Step 1. Visuals ----------------------------------------------------------
if (interactive.filter) {
message("\nStep 1. Visualization of samples QC\n")
}
# Note to myself: for now it's better not to subsample, because of filtering
variant.select <- subsample <- NULL
subsample.markers.stats <- 1
id.stats <- generate_stats(
gds = data,
individuals = TRUE,
markers = FALSE,
exhaustive = FALSE,
subsample = variant.select,
path.folder = path.folder,
file.date = file.date,
parallel.core = parallel.core,
verbose = TRUE
)
print(id.stats$i.fig)
# Step 2. Missingness-----------------------------------------------------------------
if (interactive.filter) {
message("\nStep 2. Filtering markers based individual missingness/genotyping\n")
filter.individuals.missing <- radiator_question(
x = "Do you want to blacklist samples based on missingness ? (y/n):",
answer.opt = c("y", "n"))
if (filter.individuals.missing == "y") {
outlier.stats <- radiator_question(
x = "2 options to blacklist samples:\n1. based on the outlier statistics\n2. enter your own threshold", minmax = c(1, 2))
if (outlier.stats == 1) {
filter.individuals.missing <- "outliers"
} else {
filter.individuals.missing <- radiator_question(
x = "Enter the proportion threshold (0-1)
The maximum amount of missingness you tolerate for a sample (e.g. 0.3): ", minmax = c(0, 1))
}
outlier.stats <- NULL
} else {
filter.individuals.missing <- NULL
}
}
if (!is.null(filter.individuals.missing)) {
if (!purrr::is_double(filter.individuals.missing)) {
# if max and outliers high are the same leave the value
# if max and outliers high are not the same do gymnastic
if (id.stats$i.stats$OUTLIERS_HIGH[1] == id.stats$i.stats$MAX[1]) {
higher.eq <- FALSE
} else {
higher.eq <- TRUE
}
outlier.id.missing <- id.stats$i.stats$OUTLIERS_HIGH[1]
if (verbose) message("\nRemoving outliers individuals based on genotyping statistics: ", outlier.id.missing)
filter.individuals.missing <- outlier.id.missing
} else {
message("\nRemoving individuals based on genotyping statistics: ", filter.individuals.missing)
higher.eq <- FALSE
}
bl <- dplyr::ungroup(id.stats$i.info)
if (higher.eq) {
bl %<>% dplyr::filter(MISSING_PROP >= filter.individuals.missing)
} else {
bl %<>% dplyr::filter(MISSING_PROP > filter.individuals.missing)
}
bl %<>%
dplyr::distinct(INDIVIDUALS, .keep_all = TRUE) %>%
dplyr::mutate(FILTER = "filter.individuals.missing")
n.bl <- nrow(bl)
individuals <- extract_individuals_metadata(gds = data, whitelist = FALSE) %>%
dplyr::mutate(
FILTERS = dplyr::if_else(
INDIVIDUALS %in% bl$INDIVIDUALS,
"filter.individuals.missing", FILTERS
)
)
update_radiator_gds(
gds = data, node.name = "individuals.meta", value = individuals, sync = TRUE)
if (n.bl > 0) {
filter.monomorphic <- TRUE
bl.filename <- stringi::stri_join("blacklist.individuals.missing_", file.date, ".tsv")
readr::write_tsv(x = bl, file = file.path(path.folder, bl.filename))
}
# Filter parameter file: update
filters.parameters <- radiator_parameters(
generate = FALSE,
initiate = FALSE,
update = TRUE,
parameter.obj = filters.parameters,
data = data,
filter.name = "Filter individuals based on missingness (with outlier stats or values)",
param.name = "filter.individuals.missing",
values = filter.individuals.missing,
path.folder = path.folder,
file.date = file.date,
internal = internal,
verbose = verbose)
# results ------------------------------------------------------------------
radiator_results_message(
rad.message = stringi::stri_join("\nFilter individuals based on missingness: ",
filter.individuals.missing),
filters.parameters,
internal,
verbose
)
}#End filter.individuals.missing
# Step 3. Heterozygosity------------------------------------------------------------
if (interactive.filter) {
message("\nStep 3. Filtering markers based on individual heterozygosity\n")
filter.individuals.heterozygosity <- radiator_question(
x = "Do you want to blacklist samples based on heterozygosity ? (y/n):",
answer.opt = c("y", "n"))
if (filter.individuals.heterozygosity == "y") {
outlier.stats <- radiator_question(
x = "2 options to blacklist samples:\n1. based on the outlier statistics\n2. enter your own threshold", minmax = c(1, 2))
if (outlier.stats == 1) {
filter.individuals.heterozygosity <- "outliers"
} else {
filter.individuals.heterozygosity <- c(0,1)
filter.individuals.heterozygosity[1] <- radiator_question(
x = "Enter the min proportion threshold (0-1)
The minimum amount of heterozygosity you tolerate for a sample:", minmax = c(0, 1))
filter.individuals.heterozygosity[2] <- radiator_question(
x = "Enter the max proportion threshold (0-1)
The maximum amount of heterozygosity you tolerate for a sample:", minmax = c(0, 1))
}
outlier.stats <- NULL
} else {
filter.individuals.heterozygosity <- NULL
}
}
if (!is.null(filter.individuals.heterozygosity)) {
if (length(filter.individuals.heterozygosity) > 1) {
het.low <- filter.individuals.heterozygosity[1]
het.high <- filter.individuals.heterozygosity[2]
if (verbose) message("\nRemoving individuals based on heterozygosity statistics: ", het.low, " / ", het.high)
higher.eq <- FALSE
lower.eq <- FALSE
} else {
if (is.character(filter.individuals.heterozygosity)) {
if (id.stats$i.stats$OUTLIERS_LOW[2] == id.stats$i.stats$MIN[2]) {
lower.eq <- FALSE
} else {
lower.eq <- TRUE
}
if (id.stats$i.stats$OUTLIERS_HIGH[2] == id.stats$i.stats$MAX[2]) {
higher.eq <- FALSE
} else {
higher.eq <- TRUE
}
het.low <- id.stats$i.stats$OUTLIERS_LOW[2]
het.high <- id.stats$i.stats$OUTLIERS_HIGH[2]
if (verbose) message("\nRemoving outliers individuals based on heterozygosity statistics: ", het.low, " / ", het.high)
} else {
rlang::abort("Unknown filter.individuals.heterozygosity thresholds used")
}
}
bl <- dplyr::ungroup(id.stats$i.info)
# fl & fh for filter high and low
if (lower.eq) {
fl <- "HETEROZYGOSITY <= het.low"
} else {
fl <- "HETEROZYGOSITY < het.low"
}
if (higher.eq) {
fh <- "HETEROZYGOSITY >= het.high"
} else {
fh <- "HETEROZYGOSITY > het.high"
}
filter.het <- stringi::stri_join(fh, fl, sep = " | ")
bl %<>%
dplyr::filter(!!rlang::parse_expr(filter.het)) %>%
dplyr::distinct(INDIVIDUALS) %>%
dplyr::mutate(FILTER = "filter.individuals.heterozygosity")
n.bl <- nrow(bl)
individuals <- extract_individuals_metadata(gds = data, whitelist = FALSE) %>%
dplyr::mutate(
FILTERS = dplyr::if_else(
INDIVIDUALS %in% bl$INDIVIDUALS,
"filter.individuals.heterozygosity", FILTERS
)
)
update_radiator_gds(gds = data, node.name = "individuals.meta", value = individuals, sync = TRUE)
if (n.bl > 0) {
filter.monomorphic <- TRUE
# if (verbose) message(" number of individuals blacklisted based on heterozygosity: ", n.bl)
bl.filename <- stringi::stri_join("blacklist.individuals.heterozygosity_", file.date, ".tsv")
readr::write_tsv(x = bl, file = file.path(path.folder, bl.filename))
# bl.i <- update_bl_individuals(gds = data, update = bl)
id.stats$i.info %<>%
dplyr::filter(!INDIVIDUALS %in% bl$INDIVIDUALS)
# update_radiator_gds(gds = data, node.name = "individuals.meta", value = id.stats$info, sync = TRUE)
}
# Filter parameter file: update
filters.parameters <- radiator_parameters(
generate = FALSE,
initiate = FALSE,
update = TRUE,
parameter.obj = filters.parameters,
data = data,
filter.name = "Filter individuals based on heterozygosity (with outlier stats or values)",
param.name = "filter.individuals.heterozygosity",
values = paste(het.low, het.high, collapse = " / "),
path.folder = path.folder,
file.date = file.date,
internal = internal,
verbose = verbose)
# results ------------------------------------------------------------------
radiator_results_message(
rad.message = stringi::stri_join("\nFilter individuals based on heterozygosity: ", paste(het.low, het.high, collapse = " / ")),
filters.parameters,
internal,
verbose
)
}#End filter.individuals.heterozygosity
# Step 4. Coverage ---------------------------------------------------------
depth.info <- "total coverage" %in% id.stats$i.stats$GROUP
if (depth.info) {
if (interactive.filter) {
message("\nStep 4. Filtering markers based on individual's coverage\n")
# TOTAL COVERAGE -------------------------------------------------------
filter.individuals.coverage.total <- radiator_question(
x = "Do you want to blacklist samples based on TOTAL coverage ? (y/n):",
answer.opt = c("y", "n"))
if (filter.individuals.coverage.total == "y") {
outlier.stats <- radiator_question(
x = "2 options to blacklist samples:\n1. based on the outlier statistics\n2. enter your own threshold", minmax = c(1, 2))
if (outlier.stats == 1) {
filter.individuals.coverage.total <- "outliers"
} else {
filter.individuals.coverage.total <- c(0,10000000000000000000000)
filter.individuals.coverage.total[1] <- radiator_question(
x = "Enter the minimum TOTAL coverage threshold you tolerate for a sample:",
minmax = c(0, 10000000000000000000000)
)
filter.individuals.coverage.total[2] <- radiator_question(
x = "Enter the maximum TOTAL coverage threshold you tolerate for a sample:",
minmax = c(0, 10000000000000000000000)
)
}
outlier.stats <- NULL
} else {# if no...
filter.individuals.coverage.total <- NULL
}
# MEDIAN COVERAGE -----------------------------------------------------
filter.individuals.coverage.median <- radiator_question(
x = "Do you want to blacklist samples based on MEDIAN coverage ? (y/n):",
answer.opt = c("y", "n")
)
if (filter.individuals.coverage.median == "y") {
outlier.stats <- radiator_question(
x = "2 options to blacklist samples:\n1. based on the outlier statistics\n2. enter your own threshold", minmax = c(1, 2))
if (outlier.stats == 1) {
filter.individuals.coverage.median <- "outliers"
} else {
filter.individuals.coverage.median <- c(0,10000000000000000000000)
filter.individuals.coverage.median[1] <- radiator_question(
x = "Enter the minimum MEDIAN coverage threshold you tolerate for a sample:",
minmax = c(0, 10000000000000000000000)
)
filter.individuals.coverage.median[2] <- radiator_question(
x = "Enter the maximum MEDIAN coverage threshold you tolerate for a sample:",
minmax = c(0, 10000000000000000000000)
)
}
outlier.stats <- NULL
} else {# if no...
filter.individuals.coverage.median <- NULL
}
# IQR COVERAGE ---------------------------------------------------------
filter.individuals.coverage.iqr <- radiator_question(
x = "Do you want to blacklist samples based on Interquartile Range (IQR) coverage ? (y/n):",
answer.opt = c("y", "n")
)
if (filter.individuals.coverage.iqr == "y") {
outlier.stats <- radiator_question(
x = "2 options to blacklist samples:\n1. based on the outlier statistics\n2. enter your own threshold", minmax = c(1, 2))
if (outlier.stats == 1) {
filter.individuals.coverage.iqr <- "outliers"
} else {
filter.individuals.coverage.iqr <- c(0,10000000000000000000000)
filter.individuals.coverage.iqr[1] <- radiator_question(
x = "Enter the minimum IQR coverage threshold you tolerate for a sample:",
minmax = c(0, 10000000000000000000000)
)
filter.individuals.coverage.iqr[2] <- radiator_question(
x = "Enter the maximum IQR coverage threshold you tolerate for a sample:",
minmax = c(0, 10000000000000000000000)
)
}
outlier.stats <- NULL
} else {# if no...
filter.individuals.coverage.iqr <- NULL
}
}#End interactive mode
# filtering done here...
# TOTAL COVERAGE -------------------------------------------------------
if (!is.null(filter.individuals.coverage.total)) {
if (length(filter.individuals.coverage.total) > 1) {
cov.low <- filter.individuals.coverage.total[1]
cov.high <- filter.individuals.coverage.total[2]
if (verbose) message("\nRemoving individuals based on total coverage statistics: ", cov.low, " / ", cov.high)
lower.eq <- higher.eq <- FALSE
} else {
if (is.character(filter.individuals.coverage.total)) {
if (id.stats$i.stats$OUTLIERS_LOW[3] == id.stats$i.stats$MIN[3]) {
lower.eq <- FALSE
} else {
lower.eq <- TRUE
}
if (id.stats$i.stats$OUTLIERS_HIGH[3] == id.stats$i.stats$MAX[3]) {
higher.eq <- FALSE
} else {
higher.eq <- TRUE
}
cov.low <- id.stats$i.stats$OUTLIERS_LOW[3]
cov.high <- id.stats$i.stats$OUTLIERS_HIGH[3]
if (verbose) message("\nRemoving outliers individuals based on total coverage statistics: ", cov.low, " / ", cov.high)
} else {
rlang::abort("Unknown TOTAL coverage thresholds used")
}
}
bl <- dplyr::ungroup(id.stats$i.info)
# fl & fh for filter high and low
if (lower.eq) {
fl <- "COVERAGE_TOTAL <= cov.low"
} else {
fl <- "COVERAGE_TOTAL < cov.low"
}
if (higher.eq) {
fh <- "COVERAGE_TOTAL >= cov.high"
} else {
fh <- "COVERAGE_TOTAL > cov.high"
}
filter.cov <- stringi::stri_join(fh, fl, sep = " | ")
bl %<>%
dplyr::filter(!!rlang::parse_expr(filter.cov)) %>%
dplyr::distinct(INDIVIDUALS) %>%
dplyr::mutate(FILTER = "filter.individuals.coverage.total")
n.bl <- nrow(bl)
individuals <- extract_individuals_metadata(gds = data, whitelist = FALSE) %>%
dplyr::mutate(
FILTERS = dplyr::if_else(
INDIVIDUALS %in% bl$INDIVIDUALS,
"filter.individuals.coverage.total", FILTERS
)
)
update_radiator_gds(gds = data, node.name = "individuals.meta", value = individuals, sync = TRUE)
if (n.bl > 0) {
filter.monomorphic <- TRUE
bl.filename <- stringi::stri_join("blacklist.individuals.coverate.total_", file.date, ".tsv")
readr::write_tsv(x = bl, file = file.path(path.folder, bl.filename))
}
# Filter parameter file: update
filters.parameters <- radiator_parameters(
generate = FALSE,
initiate = FALSE,
update = TRUE,
parameter.obj = filters.parameters,
data = data,
filter.name = "Filter individuals based on total coverage (with outlier stats or values)",
param.name = "filter.individuals.coverage.total",
values = paste(cov.low, cov.high, collapse = " / "),
path.folder = path.folder,
file.date = file.date,
internal = internal,
verbose = verbose)
# results ------------------------------------------------------------------
radiator_results_message(
rad.message = stringi::stri_join("\nFilter individuals based on total coverage: ", paste(cov.low, cov.high, collapse = " / ")),
filters.parameters,
internal,
verbose
)
}#End coverage total
# MEDIAN COVERAGE -----------------------------------------------------
if (!is.null(filter.individuals.coverage.median)) {
if (length(filter.individuals.coverage.median) > 1) {
cov.low <- filter.individuals.coverage.median[1]
cov.high <- filter.individuals.coverage.median[2]
if (verbose) message("\nRemoving individuals based on MEDIAN coverage statistics: ", cov.low, " / ", cov.high)
lower.eq <- higher.eq <- FALSE
} else {
if (is.character(filter.individuals.coverage.median)) {
if (id.stats$i.stats$OUTLIERS_LOW[5] == id.stats$i.stats$MIN[5]) {
lower.eq <- FALSE
} else {
lower.eq <- TRUE
}
if (id.stats$i.stats$OUTLIERS_HIGH[5] == id.stats$i.stats$MAX[5]) {
higher.eq <- FALSE
} else {
higher.eq <- TRUE
}
cov.low <- id.stats$i.stats$OUTLIERS_LOW[5]
cov.high <- id.stats$i.stats$OUTLIERS_HIGH[5]
if (verbose) message("\nRemoving outliers individuals based on MEDIAN coverage statistics: ", cov.low, " / ", cov.high)
} else {
rlang::abort("Unknown MEDIAN coverage thresholds used")
}
}
bl <- dplyr::ungroup(id.stats$i.info)
# fl & fh for filter high and low
if (lower.eq) {
fl <- "COVERAGE_MEDIAN <= cov.low"
} else {
fl <- "COVERAGE_MEDIAN < cov.low"
}
if (higher.eq) {
fh <- "COVERAGE_MEDIAN >= cov.high"
} else {
fh <- "COVERAGE_MEDIAN > cov.high"
}
filter.cov <- stringi::stri_join(fh, fl, sep = " | ")
bl %<>%
dplyr::filter(!!rlang::parse_expr(filter.cov)) %>%
dplyr::distinct(INDIVIDUALS) %>%
dplyr::mutate(FILTER = "filter.individuals.coverage.median")
n.bl <- nrow(bl)
individuals <- extract_individuals_metadata(gds = data, whitelist = FALSE) %>%
dplyr::mutate(
FILTERS = dplyr::if_else(
INDIVIDUALS %in% bl$INDIVIDUALS,
"filter.individuals.coverage.median", FILTERS
)
)
update_radiator_gds(gds = data, node.name = "individuals.meta", value = individuals, sync = TRUE)
if (n.bl > 0) {
filter.monomorphic <- TRUE
bl.filename <- stringi::stri_join("blacklist.individuals.coverate.median_", file.date, ".tsv")
readr::write_tsv(x = bl, file = file.path(path.folder, bl.filename))
}
# Filter parameter file: update
filters.parameters <- radiator_parameters(
generate = FALSE,
initiate = FALSE,
update = TRUE,
parameter.obj = filters.parameters,
data = data,
filter.name = "Filter individuals based on median coverage (with outlier stats or values)",
param.name = "filter.individuals.coverage.median",
values = paste(cov.low, cov.high, collapse = " / "),
path.folder = path.folder,
file.date = file.date,
internal = internal,
verbose = verbose)
# results ------------------------------------------------------------------
radiator_results_message(
rad.message = stringi::stri_join("\nFilter individuals based on median coverage: ", paste(cov.low, cov.high, collapse = " / ")),
filters.parameters,
internal,
verbose
)
}#End coverage median
# IQR COVERAGE ---------------------------------------------------------
if (!is.null(filter.individuals.coverage.iqr)) {
if (length(filter.individuals.coverage.iqr) > 1) {
cov.low <- filter.individuals.coverage.iqr[1]
cov.high <- filter.individuals.coverage.iqr[2]
if (verbose) message("\nRemoving individuals based on IQR coverage statistics: ", cov.low, " / ", cov.high)
lower.eq <- higher.eq <- FALSE
} else {
if (is.character(filter.individuals.coverage.iqr)) {
if (id.stats$i.stats$OUTLIERS_LOW[6] == id.stats$i.stats$MIN[6]) {
lower.eq <- FALSE
} else {
lower.eq <- TRUE
}
if (id.stats$i.stats$OUTLIERS_HIGH[6] == id.stats$i.stats$MAX[6]) {
higher.eq <- FALSE
} else {
higher.eq <- TRUE
}
cov.low <- id.stats$i.stats$OUTLIERS_LOW[6]
cov.high <- id.stats$i.stats$OUTLIERS_HIGH[6]
if (verbose) message("\nRemoving outliers individuals based on IQR coverage statistics: ", cov.low, " / ", cov.high)
} else {
rlang::abort("Unknown IQR coverage thresholds used")
}
}
bl <- dplyr::ungroup(id.stats$i.info)
# fl & fh for filter high and low
if (lower.eq) {
fl <- "COVERAGE_IQR <= cov.low"
} else {
fl <- "COVERAGE_IQR < cov.low"
}
if (higher.eq) {
fh <- "COVERAGE_IQR >= cov.high"
} else {
fh <- "COVERAGE_IQR > cov.high"
}
filter.cov <- stringi::stri_join(fh, fl, sep = " | ")
bl %<>%
dplyr::filter(!!rlang::parse_expr(filter.cov)) %>%
dplyr::distinct(INDIVIDUALS) %>%
dplyr::mutate(FILTER = "filter.individuals.coverage.iqr")
n.bl <- nrow(bl)
individuals <- extract_individuals_metadata(gds = data, whitelist = FALSE) %>%
dplyr::mutate(
FILTERS = dplyr::if_else(
INDIVIDUALS %in% bl$INDIVIDUALS,
"filter.individuals.coverage.iqr", FILTERS
)
)
update_radiator_gds(gds = data, node.name = "individuals.meta", value = individuals, sync = TRUE)
if (n.bl > 0) {
filter.monomorphic <- TRUE
bl.filename <- stringi::stri_join("blacklist.individuals.coverate.iqr_", file.date, ".tsv")
readr::write_tsv(x = bl, file = file.path(path.folder, bl.filename))
}
# Filter parameter file: update
filters.parameters <- radiator_parameters(
generate = FALSE,
initiate = FALSE,
update = TRUE,
parameter.obj = filters.parameters,
data = data,
filter.name = "Filter individuals based on iqr coverage (with outlier stats or values)",
param.name = "filter.individuals.coverage.iqr",
values = paste(cov.low, cov.high, collapse = " / "),
path.folder = path.folder,
file.date = file.date,
internal = internal,
verbose = verbose)
# results ------------------------------------------------------------------
radiator_results_message(
rad.message = stringi::stri_join("\nFilter individuals based on IQR coverage: ", paste(cov.low, cov.high, collapse = " / ")),
filters.parameters,
internal,
verbose
)
}#End coverage iqr
}#End DP
# MONOMORPHIC MARKERS --------------------------------------------------
data <- filter_monomorphic(
data = data,
filter.monomorphic = filter.monomorphic,
parallel.core = parallel.core,
verbose = FALSE,
parameters = filters.parameters,
path.folder = path.folder,
internal = FALSE)
}#before this one
return(data)
}#End filter_individuals
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.