# Keep markers in common between all populations
#' @name filter_common_markers
#' @title Filter common markers between strata
#' @description The function will filter the markers by keeping only those
#' in common between all strata
#' (population or any groupings defined in \code{STRATA} column).
#'
#' \strong{Filter targets}: SNPs
#'
#' \strong{Statistics}: strata genotyping rate per SNPs
#'
#'
#' Used internally in \href{https://github.com/thierrygosselin/radiator}{radiator}
#' and might be of interest for users who wants to keep only markers in common.
#' @param data (4 options) A file or object generated by radiator:
#' \itemize{
#' \item tidy data
#' \item Genomic Data Structure (GDS)
#' }
#'
#' \emph{How to get GDS and tidy data ?}
#' Look into \code{\link{tidy_genomic_data}},
#' \code{\link{read_vcf}} or
#' \code{\link{tidy_vcf}}.
#'
#' @param filter.common.markers (optional, logical)
#' Default: \code{filter.common.markers = TRUE}.
#'
#' @param fig (optional, logical) \code{fig = TRUE} will produce an
#' \href{https://github.com/hms-dbmi/UpSetR}{UpSet fig} to visualize the number
#' of markers between populations. The package is required for this to work...
#' Default: \code{fig = FALSE}.
#' @param verbose (optional, logical) \code{verbose = TRUE} to be chatty
#' during execution.
#' Default: \code{verbose = FALSE}.
#' @param ... (optional) To pass further arguments for fine-tuning the function
#' and legacy arguments.
#' @inheritParams tidy_genomic_data
#' @return A list with the filtered input, whitelist and blacklist of markers..
#' @examples
#' \dontrun{
#' require(SeqArray) # when using gds
#' common <- radiator::filter_common_markers(data = "my.radiator.gds.rad", verbose = TRUE)
#' }
#' @export
#' @rdname filter_common_markers
#' @author Thierry Gosselin \email{thierrygosselin@@icloud.com}
filter_common_markers <- function(
data,
filter.common.markers = TRUE,
fig = FALSE,
parallel.core = parallel::detectCores() - 1,
verbose = FALSE,
...
) {
# Test
# filter.common.markers = TRUE
# fig = TRUE
# parallel.core = parallel::detectCores() - 1
# verbose = TRUE
# path.folder <- NULL
# parameters <- NULL
# internal <- FALSE
if (!filter.common.markers) {
return(data)
} else {
# Cleanup-------------------------------------------------------------------
radiator_function_header(f.name = "filter_common_markers", verbose = verbose)
file.date <- format(Sys.time(), "%Y%m%d@%H%M")
if (verbose) message("Execution date@time: ", file.date)
old.dir <- getwd()
opt.change <- getOption("width")
options(width = 70)
timing <- radiator_tic()
#back to the original directory and options
on.exit(setwd(old.dir), add = TRUE)
on.exit(options(width = opt.change), add = TRUE)
on.exit(radiator_toc(timing, verbose = verbose), add = TRUE)
on.exit(radiator_function_header(f.name = "filter_common_markers", start = FALSE, verbose = verbose), add = TRUE)
# Checking for missing and/or default arguments ------------------------------
if (missing(data)) rlang::abort("Input file missing")
# Function call and dotslist -------------------------------------------------
rad.dots <- radiator_dots(
func.name = as.list(sys.call())[[1]],
fd = rlang::fn_fmls_names(),
args.list = as.list(environment()),
dotslist = rlang::dots_list(..., .homonyms = "error", .check_assign = TRUE),
keepers = c("path.folder", "parameters", "internal"),
verbose = FALSE
)
if (internal) fig <- FALSE
if (fig) {
if (!requireNamespace("UpSetR", quietly = TRUE)) {
rlang::abort("UpSetR needed for this function to work
Install with install.packages('UpSetR')")
}
}
# Folders---------------------------------------------------------------------
path.folder <- generate_folder(
f = path.folder,
rad.folder = "filter_common_markers",
internal = internal,
file.date = file.date,
verbose = verbose)
# write the dots file
write_rad(
data = rad.dots,
path = path.folder,
filename = stringi::stri_join(
"radiator_filter_common_markers_args_", file.date, ".tsv"),
tsv = TRUE,
internal = internal,
write.message = "Function call and arguments stored in: ",
verbose = verbose
)
# Detect format --------------------------------------------------------------
data.type <- radiator::detect_genomic_format(data)
if (!data.type %in% c("tbl_df", "fst.file", "SeqVarGDSClass", "gds.file")) {
rlang::abort("Input not supported for this function: read function documentation")
}
# GDS
if (data.type %in% c("SeqVarGDSClass", "gds.file")) {
radiator_packages_dep(package = "SeqArray", cran = FALSE, bioc = TRUE)
if (data.type == "gds.file") {
data <- radiator::read_rad(data, verbose = verbose)
data.type <- "SeqVarGDSClass"
}
# Filter parameter file: generate and initiate -----------------------------
filters.parameters <- radiator_parameters(
generate = TRUE,
initiate = TRUE,
update = FALSE,
parameter.obj = parameters,
data = data,
path.folder = path.folder,
file.date = file.date,
internal = internal,
verbose = verbose)
# Filtering common markers -----------------------------------------------
if (verbose) message("Scanning for common markers...")
n.markers.before <- filters.parameters$info$n.snp
strata <- extract_individuals_metadata(
gds = data,
ind.field.select = c("STRATA", "INDIVIDUALS"),
whitelist = TRUE)
n.pop <- length(unique(strata$STRATA))
if (n.pop == 1) {
message("Filter common markers: only 1 strata, returning data")
return(data)
}
check.strata <- strata %>% dplyr::count(STRATA) %>% dplyr::filter(n <= 1)
if (nrow(check.strata) > 0) {
message("\nStrata with low sample size detected: fig <- FALSE\n")
fig <- FALSE
}
# plot_upset--------------------------------------------------------------
if (fig) {
plot.filename <- stringi::stri_join(
"common.markers.upsetrplot_", file.date)
plot.filename <- file.path(path.folder, plot.filename)
plot_upset(x = data,
data.type = data.type,
plot.filename = plot.filename,
parallel.core = parallel.core,
verbose = verbose)
}
# while SeqArray bug is fixed, PLAN B below uses tidy data
markers.meta <- extract_markers_metadata(gds = data, whitelist = FALSE)
bl <- not_common_markers(
x = data, strata = strata,
parallel.core = 1 #parallel.core
)
n.markers.removed <- length(bl)
want <- c("VARIANT_ID", "MARKERS", "CHROM", "LOCUS", "POS")
if (n.markers.removed > 0) {
n.markers.after <- n.markers.before - n.markers.removed
markers.meta %<>%
dplyr::mutate(
FILTERS = dplyr::if_else(
VARIANT_ID %in% bl, "filter.common.markers", FILTERS
)
)
write_rad(
data = markers.meta %>% dplyr::filter(FILTERS == "filter.common.markers"),
path = path.folder,
filename = stringi::stri_join("blacklist.not.common.markers_", file.date, ".tsv"),
tsv = TRUE, internal = internal, verbose = verbose)
# Update GDS
update_radiator_gds(
gds = data,
node.name = "markers.meta",
value = markers.meta,
sync = TRUE
)
}
write_rad(
data = markers.meta %>% dplyr::filter(FILTERS == "whitelist"),
path = path.folder,
filename = stringi::stri_join("whitelist.common.markers_", file.date, ".tsv"),
tsv = TRUE,
internal = internal,
verbose = verbose)
} else {#Tidy data
# Import data ---------------------------------------------------------------
if (is.vector(data)) data <- radiator::tidy_wide(data = data, import.metadata = TRUE)
# Keep whitelist and blacklist (same = same space used)
wl <- radiator::separate_markers(data = data, sep = "__", markers.meta.lists.only = TRUE)
data %<>% dplyr::left_join(wl, by = intersect(colnames(data), colnames(wl)))
# Filter parameter file: generate and initiate ------------------------------------------
filters.parameters <- radiator_parameters(
generate = TRUE,
initiate = TRUE,
update = FALSE,
parameter.obj = parameters,
data = data,
path.folder = path.folder,
file.date = file.date,
internal = internal,
verbose = verbose)
n.pop <- filters.parameters$info$n.pop
if (fig && n.pop == 1) {
message("\n\nNOTE: the plot argument requires more than 1 strata\n\n")
fig <- FALSE
}
if (fig) {
plot.filename <- stringi::stri_join(
"common.markers.upsetrplot_", file.date)
plot.filename <- file.path(path.folder, plot.filename)
plot_upset(x = data,
data.type = data.type,
plot.filename = plot.filename,
parallel.core = parallel.core,
verbose = verbose)
}
if (verbose) message("Scanning for common markers...")
data %<>% dplyr::rename(STRATA = tidyselect::any_of("POP_ID"))
if (tibble::has_name(data, "GT_BIN")) {
bl <- dplyr::select(.data = data, MARKERS, STRATA, GT_BIN) %>%
dplyr::filter(!is.na(GT_BIN))
} else {
bl <- dplyr::select(.data = data, MARKERS, STRATA, GT) %>%
dplyr::filter(GT != "000000")
}
bl <- dplyr::distinct(bl, MARKERS, STRATA) %>%
dplyr::count(x = ., MARKERS) %>%
dplyr::filter(n != length(unique(data$STRATA))) %>%
dplyr::distinct(MARKERS) %>%
dplyr::arrange(MARKERS)
# Remove the markers from the dataset
n.markers.removed <- nrow(bl)
if (n.markers.removed > 0) {
data <- dplyr::filter(data, !MARKERS %in% bl$MARKERS)
bl <- wl %>% dplyr::filter(MARKERS %in% bl$MARKERS)
write_rad(
data = bl,
path = path.folder,
filename = stringi::stri_join("blacklist.not.common.markers_", file.date, ".tsv"),
tsv = TRUE, internal = internal, verbose = verbose)
wl %<>% dplyr::filter(!MARKERS %in% bl$MARKERS)
write_rad(
data = wl,
path = path.folder,
filename = stringi::stri_join("whitelist.common.markers_", file.date, ".tsv"),
tsv = TRUE, internal = internal, verbose = verbose)
} else {
bl <- wl[0,]
}
}#End tidy
# Filter parameter file: update --------------------------------------------
filters.parameters <- radiator_parameters(
generate = FALSE,
initiate = FALSE,
update = TRUE,
parameter.obj = filters.parameters,
data = data,
filter.name = "Filter markers in common",
param.name = "filter.common.markers",
values = "",
path.folder = path.folder,
file.date = file.date,
internal = internal,
verbose = verbose)
# Return -----------------------------------------------------------------------
radiator_results_message(
rad.message = "\nFilter common markers:",
filters.parameters,
internal,
verbose
)
return(data)
}
}#End filter_common_markers
# Internal functions -----------------------------------------------------------
# Generate a blacklist of markers not in common
#' @title not_common_markers
#' @description Generate a blacklist of markers not in common
#' @rdname not_common_markers
#' @keywords internal
#' @export
not_common_markers <- function(
x,
strata,
parallel.core = parallel::detectCores() - 2
) {
# PLAN B
# n.pop <- length(unique(strata$STRATA))
# bl <- extract_genotypes_metadata(
# gds = x,
# genotypes.meta.select = c("INDIVIDUALS", "VARIANT_ID", "GT_BIN")
# ) %>%
# dplyr::filter(!is.na(GT_BIN)) %>%
# join_strata(data = ., strata = strata, verbose = FALSE) %>%
# dplyr::distinct(VARIANT_ID, STRATA) %>%
# dplyr::count(x = ., VARIANT_ID) %>%
# dplyr::filter(n != n.pop) %>%
# dplyr::distinct(VARIANT_ID) %>%
# dplyr::arrange(VARIANT_ID) %$% VARIANT_ID
# PLAN A using seqarray
# Get the sample from radiator node or gds -----------------------------------
# Note to myself : you could get the info below from the strata
sample.bk <- extract_individuals_metadata(
gds = x,
ind.field.select = "INDIVIDUALS",
whitelist = TRUE
) %$% INDIVIDUALS
not_common <- function(
split.data = NULL,
x = NULL,
parallel.core = parallel::detectCores() - 2
) {
parallel.core.opt <- parallel_core_opt(parallel.core)
SeqArray::seqSetFilter(
object = x,
sample.id = split.data,
action = "set",
verbose = FALSE)
bl <- SeqArray::seqGetData(
gdsfile = x,
var.name = "variant.id")[SeqArray::seqMissing(
gdsfile = x,
per.variant = TRUE,
parallel = parallel.core.opt
) == 1]
return(bl)
}#End not_common
bl <- dplyr::group_split(strata, STRATA, .keep = FALSE) %>%
purrr::flatten(.) %>%
purrr::map(.x = .,
.f = not_common,
x = x,
parallel.core = parallel.core
) %>% unlist %>% unique %>% sort
# reset
# summary_gds(x)
# SeqArray::seqSetFilter(x, action = "pop", verbose = TRUE)
SeqArray::seqSetFilter(
object = x,
sample.id = sample.bk,
action = "set",
verbose = FALSE)
# summary_gds(x)
return(bl)
}#End not_common_markers
# Generate UPSETR plot----------------------------------------------------------
#' @title plot_upset
#' @description Generate UpSetR plot
#' @rdname plot_upset
#' @keywords internal
#' @export
plot_upset <- function(
x,
data.type,
plot.filename = NULL,
parallel.core = parallel::detectCores() - 2,
verbose = FALSE
) {
# For pc set the number of core to 1 -----------------------------------------
parallel.core <- 1
# if (Sys.info()[['sysname']] == "Windows") parallel.core <- 1
if (verbose) message("Generating UpSet plot to visualize markers in common")
if (data.type == "SeqVarGDSClass") {
# Get the sample from radiator node or gds ---------------------------------
strata <- extract_individuals_metadata(
gds = x,
ind.field.select = c("STRATA", "INDIVIDUALS"),
whitelist = TRUE
)
n.pop = length(unique(strata$STRATA))
sample.bk <- strata$INDIVIDUALS
missing_markers_pop <- function(
strata.split,
x,
parallel.core = parallel::detectCores() - 2
) {
# strata.split <- dplyr::group_split(.tbl = strata, STRATA)[[4]]
parallel.core.opt <- parallel_core_opt(parallel.core)
SeqArray::seqSetFilter(
object = x,
sample.id = strata.split$INDIVIDUALS,
action = "set",
verbose = FALSE
)
res <- tibble::tibble(
STRATA = SeqArray::seqMissing(
gdsfile = x,
per.variant = TRUE,
parallel = parallel.core.opt
) %>%
magrittr::inset(. == 1L, 9L) %>%
magrittr::inset(. < 1L, 1L) %>%
magrittr::inset(. == 9L, 0L)
) %>%
magrittr::set_colnames(., unique(strata.split$STRATA))
return(res)
}#End not_common
plot.data <- dplyr::group_split(.tbl = strata, STRATA) %>%
purrr::map_dfc(
.x = .,
.f = missing_markers_pop,
x = x,
parallel.core = parallel.core
) %>%
data.frame(.)#UpSetR requires data.frame
readr::write_tsv(x = plot.data, file = stringi::stri_join(plot.filename, ".tsv"))
SeqArray::seqSetFilter(
object = x,
sample.id = sample.bk,
action = "set",
verbose = FALSE)
} else {
n.pop = length(unique(x$STRATA))
if (tibble::has_name(x, "GT_BIN")) {
plot.data <- dplyr::filter(x, !is.na(GT_BIN))
} else {
plot.data <- dplyr::filter(x, GT != "000000")
}
plot.data <- dplyr::distinct(plot.data, MARKERS, STRATA) %>%
dplyr::mutate(
n = rep(1, n()),
STRATA = stringi::stri_join("POP_", STRATA)
) %>%
tidyr::pivot_wider(data = ., names_from = "STRATA", values_from = "n", values_fill = 0) %>%
data.frame(.)#UpSetR requires data.frame
}
# generate the plot
# print(dev.list())
# pdf(
# file = stringi::stri_join(plot.filename, ".pdf"),
# onefile = FALSE
# )
grDevices::png(filename = stringi::stri_join(plot.filename, ".png"))
print(
UpSetR::upset(
data = plot.data,
nsets = n.pop,
order.by = "freq",
empty.intersections = NULL)
)
dev.off()
# print(dev.list())
}#End plot_upset
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.