# SNP number per haplotype
#' @name filter_snp_position_read
#' @title Filter markers/SNP based on their position on the read
#' @description This filter removes markers/SNPs based on their position on the read.
#' The data requires snp, locus and col information (e.g. from a VCF file).
#'
#' The impact of assembly artifacts can be tested in downstream analysis with
#' the whitelist and blacklist generated by this function.
#'
#'
#' \strong{Filter targets}: Markers
#'
#' \strong{Statistics}: The position of the SNPs on the read.
#'
#' @inheritParams read_strata
#' @inheritParams radiator_common_arguments
#' @param filter.snp.position.read (character)
#' Options are: \code{"outliers", "q75", "iqr", c(min value,max value)}.
#' For a safe and conservative
#' value, use \code{"outliers"}, this will remove SNPs with outlier position on
#' the reads.
#' Default: \code{filter.snp.read.position = NULL}.
#' @param filename (optional) Name of the filtered tidy data frame file
#' written to the working directory (ending with \code{.tsv})
#' Default: \code{filename = NULL}.
#' @rdname filter_snp_position_read
#' @export
#' @details
#' \strong{Interactive version}
#'
#' There are 2 steps in the interactive version to visualize and filter
#' the data based on the number of SNP on the read/locus:
#'
#' Step 1. SNP number per read/locus visualization
#'
#' Step 2. Choose the filtering thresholds
#'
#'
#' @return A list in the global environment with 6 objects:
#' \enumerate{
#' \item $snp.number.markers
#' \item $number.snp.reads.plot
#' \item $whitelist.markers
#' \item $tidy.filtered.snp.number
#' \item $blacklist.markers
#' \item $filters.parameters
#' }
#'
#' The object can be isolated in separate object outside the list by
#' following the example below.
#' @examples
#' \dontrun{
#' turtle <- radiator::filter_snp_position_read(
#' data = "turtle.vcf",
#' strata = "turtle.strata.tsv",
#' filter.snp.position.read = "outliers",
#' filename = "tidy.data.turtle.tsv"
#' )
#' }
filter_snp_position_read <- function(
data,
strata = NULL,
interactive.filter = TRUE,
filter.snp.position.read = NULL,
filename = NULL,
parallel.core = parallel::detectCores() - 1,
verbose = TRUE,
...
) {
# TEST
# data <- gds
# interactive.filter <- TRUE
# path.folder <- "testing_snp_position"
# force.stats <- TRUE
# parameters <- NULL
# filename <- NULL
# filter.snp.position.read <- "outliers"
# parallel.core <- parallel::detectCores() - 1
# verbose = TRUE
# obj.keeper <- c(ls(envir = globalenv()), "data")
if (!is.null(filter.snp.position.read) || interactive.filter) {
if (interactive.filter) verbose <- TRUE
radiator_function_header(f.name = "filter_snp_position_read", verbose = verbose)
# Cleanup-------------------------------------------------------------------
file.date <- format(Sys.time(), "%Y%m%d@%H%M")
timing <- radiator_tic()
if (verbose) message("Execution date@time: ", file.date)
old.dir <- getwd()
opt.change <- getOption("width")
options(width = 70)
on.exit(setwd(old.dir), add = TRUE)
on.exit(options(width = opt.change), add = TRUE)
on.exit(radiator_toc(timing), add = TRUE)
on.exit(radiator_function_header(f.name = "filter_snp_position_read", start = FALSE, verbose = verbose), add = TRUE)
# on.exit(rm(list = setdiff(ls(envir = sys.frame(-1L)), obj.keeper), envir = sys.frame(-1L)))
# Function call and dotslist -------------------------------------------------
rad.dots <- radiator_dots(
func.name = as.list(sys.call())[[1]],
fd = rlang::fn_fmls_names(),
args.list = as.list(environment()),
dotslist = rlang::dots_list(..., .homonyms = "error", .check_assign = TRUE),
keepers = c("path.folder", "parameters", "internal"),
verbose = FALSE
)
# Checking for missing and/or default arguments ------------------------------
if (missing(data)) rlang::abort("data is missing")
# Folders---------------------------------------------------------------------
path.folder <- generate_folder(
f = path.folder,
rad.folder = "filter_snp_position_read",
internal = internal,
file.date = file.date,
verbose = verbose)
# write the dots file
write_rad(
data = rad.dots,
path = path.folder,
filename = stringi::stri_join("radiator_filter_snp_position_read_args_", file.date, ".tsv"),
tsv = TRUE,
internal = internal,
write.message = "Function call and arguments stored in: ",
verbose = verbose
)
# Message about steps taken during the process ---------------------------------
if (interactive.filter) {
message("2 steps to visualize and filter the data based on the number of SNP on the read/locus:")
message("Step 1. Visualization (boxplot, distribution")
message("Step 2. Threshold selection")
}
# File type detection----------------------------------------------------------
data.type <- radiator::detect_genomic_format(data)
if (!data.type %in% c("tbl_df", "fst.file", "SeqVarGDSClass", "gds.file")) {
rlang::abort("Input not supported for this function: read function documentation")
}
if (data.type %in% c("SeqVarGDSClass", "gds.file")) {
radiator_packages_dep(package = "SeqArray", cran = FALSE, bioc = TRUE)
if (data.type == "gds.file") {
data <- radiator::read_rad(data, verbose = verbose)
data.type <- "SeqVarGDSClass"
}
} else {
if (is.vector(data)) data <- radiator::tidy_wide(data = data, import.metadata = TRUE)
data.type <- "tbl_df"
}
# Filter parameter file: initiate ------------------------------------------
filters.parameters <- radiator_parameters(
generate = TRUE,
initiate = TRUE,
update = FALSE,
parameter.obj = parameters,
data = data,
path.folder = path.folder,
file.date = file.date,
internal = internal,
verbose = verbose)
# Whitelist and blacklist --------------------------------------------------
want <- c("MARKERS", "CHROM", "LOCUS", "POS", "COL")
if (data.type == "SeqVarGDSClass") {
wl <- bl <- extract_markers_metadata(gds = data, whitelist = TRUE)
# check for presence of COL info...
if (!rlang::has_name(wl, "COL")) {
message("COL info required, returning data")
return(data)
}
if (!is.numeric(wl$COL)) wl$COL <- as.numeric(wl$COL)
} else {
wl <- bl <- dplyr::select(data, tidyselect::any_of(want))
}
# Check that required info is present in data: snp and locus----------------
locus.check <- tibble::has_name(wl, "LOCUS")
snp.check <- tibble::has_name(wl, "POS")
col.check <- tibble::has_name(wl, "COL")
if (!locus.check || !snp.check || !col.check) {
problem.data <- "This filter requires a dataset with SNP (POS), LOCUS and COL informations"
message("\n\n", problem.data)
readr::write_lines(
x = problem.data,
file = file.path(path.folder, "README"))
return(data)
}
# Generate snp per locus stats----------------------------------------------
if (verbose) message("Generating SNP position on read stats")
stats <- tibble_stats(
x = wl %>%
dplyr::distinct(MARKERS,COL) %$% COL,# %>% as.numeric(.)
group = "snp position on read")
snp.col.iqr.threshold <- c(stats$Q25, stats$Q75)
# Generate box plot --------------------------------------------------------
read.length <- max(wl$COL)
fig <- boxplot_stats(
data = stats,
title = "SNP position on the read",
subtitle = if (!is.null(read.length)) {
stringi::stri_join("Read length (max): ", read.length, " bp", "\nOutlier: ", ceiling(stats[[9]]))
} else {
stringi::stri_join("\nOutlier: ", ceiling(stats[[9]]))
},
x.axis.title = NULL,
y.axis.title = "SNP position on the read (bp)",
bp.filename = stringi::stri_join("snp.position.read.boxplot_", file.date, ".pdf"),
path.folder = path.folder)
# Distribution -------------------------------------------------------------
d.plot <- wl %>%
dplyr::distinct(MARKERS, COL)
col.levels <- sort(unique(wl$COL))
d.plot %<>% dplyr::mutate(COL = factor(x = COL, levels = col.levels))
d.plot <- ggplot2::ggplot(
data = d.plot, ggplot2::aes(factor(sort(as.integer(COL))))) +
ggplot2::geom_bar() +
ggplot2::labs(y = "Number of SNPs", x = "SNP position on the read (bp)") +
ggplot2::theme_bw()+
ggplot2::theme(
axis.title.x = ggplot2::element_text(size = 12, face = "bold"),
axis.title.y = ggplot2::element_text(size = 12, face = "bold"),
legend.title = ggplot2::element_text(size = 12, face = "bold"),
legend.text = ggplot2::element_text(size = 12, face = "bold"),
strip.text.x = ggplot2::element_text(size = 12, face = "bold")
# axis.text.x = ggplot2::element_text(size = 12, angle = 90, hjust = 1, vjust = 0.5)
)
# print(d.plot)
# save
d.plot.filename <- stringi::stri_join("snp.position.read.distribution_", file.date, ".pdf")
ggplot2::ggsave(
filename = file.path(path.folder, d.plot.filename),
plot = d.plot,
width = read.length, height = 10, dpi = 300, units = "cm",
useDingbats = FALSE,
limitsize = FALSE)
# Helper table -------------------------------------------------------------
if (verbose) message("Generating helper table...")
n.markers <- nrow(wl)
helper.table <- tibble::tibble(
STATS = c("outliers", "q75", "iqr"),
WHITELISTED_MARKERS = c(
nrow(dplyr::filter(wl, COL <= stats[[9]])),
nrow(dplyr::filter(wl, COL <= stats[[5]])),
nrow(dplyr::filter(wl, COL >= stats[[3]] & COL <= stats[[5]])))
) %>%
dplyr::mutate(
BLACKLISTED_MARKERS = n.markers - WHITELISTED_MARKERS,
STATS = factor(x = STATS, levels = c("outliers", "q75", "iqr"), ordered = FALSE)) %>%
dplyr::arrange(STATS)
readr::write_tsv(
x = helper.table,
file = file.path(path.folder, "snp.position.read.helper.table.tsv"))
# figures
markers.plot <- ggplot2::ggplot(
data = tidyr::pivot_longer(
data = helper.table,
cols = -STATS,
names_to = "LIST",
values_to = "MARKERS"
),
ggplot2::aes(x = STATS, y = MARKERS)) +
# ggplot2::geom_line() +
ggplot2::geom_point(size = 2, shape = 21, fill = "white") +
ggplot2::scale_x_discrete(name = "SNP position on the read") +
ggplot2::scale_y_continuous(name = "Number of markers") +
ggplot2::theme_bw()+
ggplot2::theme(
axis.title.x = ggplot2::element_text(size = 10, face = "bold"),
axis.title.y = ggplot2::element_text(size = 10, face = "bold"),
axis.text.x = ggplot2::element_text(size = 8)#, angle = 90, hjust = 1, vjust = 0.5)
) +
ggplot2::facet_grid(LIST ~. , scales = "free", space = "free")
# print(markers.plot) # not that useful...
print(d.plot) # more useful to leave at the end
# save
ggplot2::ggsave(
filename = file.path(path.folder, "snp.position.read.helper.plot.pdf"),
plot = markers.plot,
width = 20,
height = 15,
dpi = 300,
units = "cm",
useDingbats = FALSE,
limitsize = FALSE)
helper.table <- markers.plot <- NULL
if (verbose) message("Files written: helper tables and plots")
# Step 2. Thresholds selection ---------------------------------------------
if (interactive.filter) {
message("\nStep 2. Filtering markers based on the SNPs position on the read\n")
filter.snp.position.read <- radiator_question(
x = "Choice of stats are: \n1: all (filter off)\n2: outliers\n3: q75\n4: iqr\n5: choose your own min and max values",
answer.opt = c("1", "2", "3", "4", "5"))
filter.snp.position.read <- stringi::stri_replace_all_fixed(
str = filter.snp.position.read,
pattern = c("1", "2", "3", "4", "5"),
replacement = c("all", "outliers", "q75", "iqr", "thresholds"),
vectorize_all = FALSE)
if (filter.snp.position.read == "thresholds") {
filter.snp.position.read[1] <- radiator_question(
x = "Enter the min position of SNP on the read:",
minmax = c(1,10000))
filter.snp.position.read[2] <- radiator_question(
x = "Enter the max position of SNP on the read:",
minmax = c(1,10000))
}
}
# filter.snp.position.read <- match.arg(
# filter.snp.position.read,
# choices = c("outliers", "q75", "iqr", "all"),
# several.ok = FALSE)
# readr::write_tsv(x = stats, file = "testing.stats.tsv")
# Filtering ----------------------------------------------------------------
# if (filter.snp.position.read == "all") not necessary wl already exists...
if (length(filter.snp.position.read) == 2) {
wl %<>% dplyr::filter(COL >= as.numeric(filter.snp.position.read[1]) & COL <= as.numeric(filter.snp.position.read[2]))
} else {
if (filter.snp.position.read == "outliers") {
wl %<>% dplyr::filter(COL <= stats[[9]])
}
if (filter.snp.position.read == "q75") {
wl %<>% dplyr::filter(COL <= stats[[5]])
}
if (filter.snp.position.read == "iqr") {
wl %<>% dplyr::filter(COL >= stats[[3]] & COL <= stats[[5]])
}
}
# Whitelist and Blacklist of markers
readr::write_tsv(
x = wl,
file = file.path(path.folder, "whitelist.markers.snp.position.read.tsv"),
append = FALSE, col_names = TRUE)
bl %<>% dplyr::filter(!MARKERS %in% wl$MARKERS) %>%
dplyr::mutate(FILTERS = "filter.snp.position.read")
# dplyr::setdiff(wl) %>% # crash RStudio...
if (nrow(bl) > 0) {
readr::write_tsv(
x = bl,
file = file.path(path.folder, "blacklist.markers.snp.position.read.tsv"),
append = FALSE, col_names = TRUE)
}
# saving whitelist and blacklist
if (verbose) message("File written: whitelist.markers.snp.position.read.tsv")
if (verbose) message("File written: blacklist.markers.snp.position.read.tsv")
if (data.type == "SeqVarGDSClass") {
markers.meta <- extract_markers_metadata(gds = data, whitelist = FALSE) %>%
dplyr::mutate(
FILTERS = dplyr::if_else(
VARIANT_ID %in% bl$VARIANT_ID, "filter.snp.position.read", FILTERS
)
)
update_radiator_gds(
gds = data,
node.name = "markers.meta",
value = markers.meta,
sync = TRUE
)
# update blacklist.markers
# if (nrow(bl) > 0) {
# bl %<>% dplyr::select(MARKERS) %>%
# dplyr::mutate(FILTER = "filter.snp.position.read")
# bl.gds <- update_bl_markers(gds = data, update = bl)
# }
} else {
# Apply the filter to the tidy data
data %<>% dplyr::filter(MARKERS %in% wl$MARKERS)
}
# radiator_parameters------------------------------------------------------
filters.parameters <- radiator_parameters(
generate = FALSE,
initiate = FALSE,
update = TRUE,
parameter.obj = filters.parameters,
data = data,
filter.name = "Filter SNPs position on the read",
param.name = "filter.snp.position.read",
values = filter.snp.position.read,
path.folder = path.folder,
file.date = file.date,
internal = internal,
verbose = verbose)
# results ------------------------------------------------------------------
radiator_results_message(
rad.message = stringi::stri_join("\nFilter SNP position on the read : ",
filter.snp.position.read),
filters.parameters,
internal,
verbose
)
}
return(data)
} #End filter_snp_position_read
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.